算子将抽取与落盘固定到流程中 (#134)

* feature: 将抽取动作移到每一个算子中

* feature: 落盘算子改为默认执行

* feature: 优化前端展示

* feature: 使用pyproject管理依赖
This commit is contained in:
hhhhsc701
2025-12-05 17:26:29 +08:00
committed by GitHub
parent 744d15ba24
commit d59c167da4
70 changed files with 289 additions and 539 deletions

View File

@@ -15,12 +15,7 @@ _configure_importer()
def _import_operators():
from . import text_formatter
from . import word_formatter
from . import img_formatter
from . import file_exporter
from . import slide_formatter
from . import unstructured_formatter
from . import mineru_formatter

View File

@@ -1,6 +0,0 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
# Register the operator name 'FileExporter' in the OPERATORS registry together
# with the dotted path of the module that implements it.
OPERATORS.register_module(module_name='FileExporter',
                          module_path="ops.formatter.file_exporter.process")

View File

@@ -1,16 +0,0 @@
name: '落盘算子'
name_en: 'save file operator'
description: '将文件内容保存为文件。'
description_en: 'Save the file data as a file.'
language: 'Python'
vendor: 'Huawei'
raw_id: 'FileExporter'
version: '1.0.0'
types:
- 'collect'
modal: 'others'
effect:
before: ''
after: ''
inputs: 'all'
outputs: 'all'

View File

@@ -1,145 +0,0 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: File export operator (saves the sample's text or binary content to a file)
Create: 2024/06/06 15:43
"""
import time
import os
import uuid
from typing import Tuple, Dict, Any
from loguru import logger
from datamate.core.constant import Fields
from datamate.core.base_op import Mapper
from datamate.common.utils import check_valid_path
class FileExporter(Mapper):
    """Write a sample's text or binary payload into the export directory.

    The handler is selected from the sample's file type. After a successful
    write the payload fields are cleared and the file name, type, path and
    size fields are rewritten to describe the exported file.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the operator; each extension list can be overridden by keyword."""
        super().__init__(*args, **kwargs)
        # NOTE(review): presumably marks this operator as the terminal one in
        # the pipeline -- confirm against Mapper.
        self.last_ops = True
        self.text_support_ext = kwargs.get(
            "text_support_ext",
            ['txt', 'html', 'md', 'markdown',
             'xlsx', 'xls', 'csv', 'pptx', 'ppt',
             'xml', 'json', 'doc', 'docx', 'pdf'])
        self.data_support_ext = kwargs.get("data_support_ext", ['jpg', 'jpeg', 'png', 'bmp'])
        self.medical_support_ext = kwargs.get("medical_support_ext", ['svs', 'tif', 'tiff'])

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Export the sample to disk and update its file metadata.

        Args:
            sample: Pipeline record; read and mutated in place.

        Returns:
            The same sample. When both payload fields are empty, only the
            size field is set (to "0") and nothing is written.

        Raises:
            TypeError: The file type is in none of the supported extension lists.
            UnicodeDecodeError: Logged, then re-raised.
        """
        file_name = sample[self.filename_key]
        file_type = sample[self.filetype_key]
        try:
            start = time.time()
            # First extension list containing the file type decides the handler.
            dispatch = (
                (self.text_support_ext, self.get_textfile_handler),
                (self.data_support_ext, self.get_datafile_handler),
                (self.medical_support_ext, self.get_medicalfile_handler),
            )
            for extensions, handler in dispatch:
                if file_type in extensions:
                    sample, save_path = handler(sample)
                    break
            else:
                raise TypeError(f"{file_type} is unsupported! please check support_ext in FileExporter Ops")

            nothing_to_write = sample[self.text_key] == '' and sample[self.data_key] == b''
            if nothing_to_write:
                sample[self.filesize_key] = "0"
                return sample

            if save_path:
                self.save_file(sample, save_path)
                # The payload is on disk now; drop it from the record.
                sample[self.text_key] = ''
                sample[self.data_key] = b''
                sample[Fields.result] = True

                exported_type = save_path.split('.')[-1]
                sample[self.filetype_key] = exported_type
                sample[self.filename_key] = os.path.splitext(file_name)[0] + '.' + exported_type
                # save_file() writes to save_path with its extension stripped,
                # so that is the path recorded and measured here.
                written_path = os.path.splitext(save_path)[0]
                sample[self.filepath_key] = written_path
                sample[self.filesize_key] = f"{os.path.getsize(written_path)}"
                logger.info(f"origin file named {file_name} has been save to {save_path}")
            logger.info(f"fileName: {sample[self.filename_key]}, "
                        f"method: FileExporter costs {time.time() - start:.6f} s")
        except UnicodeDecodeError as err:
            logger.error(f"fileName: {sample[self.filename_key]}, "
                         f"method: FileExporter causes decode error: {err}")
            raise
        return sample

    def get_save_path(self, sample: Dict[str, Any], target_type) -> str:
        """Build ``<export dir>/<file stem>.<target_type>``, creating the directory if needed."""
        export_dir = os.path.abspath(sample[self.export_path_key])
        stem = os.path.splitext(sample[self.filename_key])[0]
        target_name = stem + '.' + target_type
        if not check_valid_path(export_dir):
            os.makedirs(export_dir, exist_ok=True)
        return os.path.join(export_dir, target_name)

    def get_textfile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
        """Prepare a text-type sample and return it with its save path."""
        target_type = sample.get("target_type", None)
        if not target_type:
            # No target_type: regular text cleaning, save as a txt file.
            sample = self._get_from_text(sample)
            return sample, self.get_save_path(sample, 'txt')
        # target_type present: save as a scanned document (docx format).
        sample = self._get_from_data(sample)
        return sample, self.get_save_path(sample, target_type)

    def get_datafile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
        """Prepare an image-type sample and return it with its save path."""
        target_type = sample.get("target_type", None)
        if not target_type:
            # No target_type: regular image cleaning, keep the original image format.
            sample = self._get_from_data(sample)
            return sample, self.get_save_path(sample, sample[self.filetype_key])
        # target_type present: image-to-text result, saved as target_type (markdown format).
        sample = self._get_from_text(sample)
        return sample, self.get_save_path(sample, target_type)

    def get_medicalfile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
        """Prepare a medical-image sample; the save path always uses the 'png' type."""
        sample = self._get_from_data(sample)
        return sample, self.get_save_path(sample, 'png')

    def save_file(self, sample, save_path):
        """Write the payload in binary mode to ``save_path`` minus its extension.

        Text is preferred (encoded as UTF-8); the binary field is used when
        the text field is empty.
        """
        target = os.path.splitext(save_path)[0]
        text = sample[self.text_key]
        payload = text.encode('utf-8') if text else sample[self.data_key]
        # Save the file in binary mode.
        with open(target, 'wb') as out:
            out.write(payload)
        # Restrict permissions on the parent directory, then on the file itself.
        os.chmod(os.path.dirname(target), 0o770)
        os.chmod(target, 0o640)

    def _get_from_data(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Keep only the binary payload (coerced to bytes); blank the text field."""
        sample[self.data_key] = bytes(sample[self.data_key])
        sample[self.text_key] = ''
        return sample

    def _get_from_text(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Keep only the text payload (coerced to str); blank the binary field."""
        sample[self.data_key] = b''
        sample[self.text_key] = str(sample[self.text_key])
        return sample

    def _get_uuid(self):
        """Return a freshly generated random UUID as a string."""
        return str(uuid.uuid4())

View File

@@ -1,6 +0,0 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
# Register the operator name 'ImgFormatter' in the OPERATORS registry together
# with the dotted path of the module that implements it.
OPERATORS.register_module(module_name='ImgFormatter',
                          module_path="ops.formatter.img_formatter.process")

View File

@@ -1,16 +0,0 @@
name: '读取图片文件'
name_en: 'Image File Reader'
description: '读取图片文件。'
description_en: 'Reads image files.'
language: 'Python'
vendor: 'Huawei'
raw_id: 'ImgFormatter'
version: '1.0.0'
types:
- 'collect'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -1,35 +0,0 @@
# # -- encoding: utf-8 --
#
# Description:
# Create: 2024/1/30 15:24
# """
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import numpy_to_bytes
from datamate.core.base_op import Mapper
class ImgFormatter(Mapper):
    """Load an image file from disk and store its encoded bytes on the sample."""

    def __init__(self, *args, **kwargs):
        super(ImgFormatter, self).__init__(*args, **kwargs)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Decode the file at the sample's path and write the result to the data field.

        The sample's file type, prefixed with a dot, is passed to
        ``numpy_to_bytes`` alongside the decoded image.
        """
        started_at = time.time()
        name = sample[self.filename_key]
        suffix = "." + sample[self.filetype_key]
        decoded = _img_extract(sample[self.filepath_key])
        sample[self.data_key] = numpy_to_bytes(decoded, suffix)
        logger.info(f"fileName: {name}, method: ImgExtract costs {(time.time() - started_at):6f} s")
        return sample
def _img_extract(file_path):
    """Read ``file_path`` as raw bytes and decode it with OpenCV, keeping the image as stored."""
    raw = np.fromfile(file_path, dtype=np.uint8)
    # IMREAD_UNCHANGED is the named form of the flag value -1.
    return cv2.imdecode(raw, cv2.IMREAD_UNCHANGED)

View File

@@ -1,6 +0,0 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
# Register the operator name 'TextFormatter' in the OPERATORS registry together
# with the dotted path of the module that implements it.
OPERATORS.register_module(module_name='TextFormatter',
                          module_path="ops.formatter.text_formatter.process")

View File

@@ -1,16 +0,0 @@
name: 'TXT文本抽取'
name_en: 'TXT Text Extraction'
description: '抽取TXT中的文本'
description_en: 'Extracts text from TXT files.'
language: 'python'
vendor: 'huawei'
raw_id: 'TxtFormatter'
version: '1.0.0'
types:
- 'collect'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -1,44 +0,0 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: TXT text extraction
Create: 2024/06/06 15:43
"""
import time
from loguru import logger
from typing import Dict, Any
from datamate.core.base_op import Mapper
class TextFormatter(Mapper):
    """Read a file from disk and expose its UTF-8 decoded content as the sample text."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def _extract_json(byte_io):
        """Decode UTF-8 bytes to text and normalise CRLF line endings to LF."""
        # "utf-8-sig" strips a leading BOM, so BOM-encoded files do not leave
        # a hidden character at the start of the extracted text.
        decoded = byte_io.decode("utf-8-sig")
        return decoded.replace("\r\n", "\n")

    def byte_read(self, sample: Dict[str, Any]):
        """Load the raw bytes of the sample's file into its data field."""
        with open(sample[self.filepath_key], "rb") as source:
            raw = source.read()
        sample[self.data_key] = raw

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Fill the text field from the file content and clear the data field.

        Raises:
            UnicodeDecodeError: The file is not valid UTF-8; logged, then re-raised.
        """
        start = time.time()
        try:
            self.byte_read(sample)
            sample[self.text_key] = self._extract_json(sample[self.data_key])
            # The raw bytes are no longer needed once the text is extracted.
            sample[self.data_key] = b""
            elapsed_msg = (
                f"fileName: {sample[self.filename_key]}, method: TextFormatter costs {(time.time() - start):6f} s")
            logger.info(elapsed_msg)
        except UnicodeDecodeError as err:
            logger.exception(f"fileName: {sample[self.filename_key]}, method: TextFormatter causes decode error: {err}")
            raise
        return sample

View File

@@ -1,6 +0,0 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
# Register the operator name 'UnstructuredFormatter' in the OPERATORS registry
# together with the dotted path of the module that implements it.
OPERATORS.register_module(module_name='UnstructuredFormatter',
                          module_path="ops.formatter.unstructured_formatter.process")

View File

@@ -1,16 +0,0 @@
name: 'Unstructured文本抽取'
name_en: 'Unstructured Text Extraction'
description: '抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。'
description_en: 'Extracts text from unstructured files; currently supports PowerPoint presentations, Word documents and Excel spreadsheets.'
language: 'python'
vendor: 'huawei'
raw_id: 'UnstructuredFormatter'
version: '1.0.0'
types:
- 'collect'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -1,37 +0,0 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 非结构化文本抽取
Create: 2025/10/22 15:15
"""
import time
from typing import Dict, Any
from loguru import logger
from unstructured.partition.auto import partition
from datamate.core.base_op import Mapper
class UnstructuredFormatter(Mapper):
    """Extract plain text from unstructured office files via ``unstructured.partition``."""

    def __init__(self, *args, **kwargs):
        super(UnstructuredFormatter, self).__init__(*args, **kwargs)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Partition the sample's file and store the joined element text.

        Files whose name does not end in one of the supported extensions are
        returned untouched.

        Args:
            sample: Pipeline record; the text field is overwritten on success.

        Returns:
            The same sample.

        Raises:
            UnicodeDecodeError: Logged, then re-raised.
        """
        start = time.time()
        filepath = sample.get(self.filepath_key)
        filename = sample.get(self.filename_key)
        # Every suffix carries its leading dot. The original listed "docx" and
        # "xlsx" bare, which also matched names that merely end in those
        # letters (e.g. "notes_docx").
        if not filename.lower().endswith((".ppt", ".pptx", ".docx", ".xlsx", ".csv")):
            return sample
        try:
            elements = partition(filename=filepath)
            # One blank line between the text of consecutive elements.
            sample[self.text_key] = "\n\n".join(str(el) for el in elements)
            logger.info(f"fileName: {filename}, method: UnstructuredFormatter costs {(time.time() - start):6f} s")
        except UnicodeDecodeError as err:
            logger.exception(f"fileName: {filename}, method: UnstructuredFormatter causes decode error: {err}")
            raise
        return sample

View File

@@ -1,6 +0,0 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
# Register the operator name 'WordFormatter' in the OPERATORS registry together
# with the dotted path of the module that implements it.
OPERATORS.register_module(module_name='WordFormatter',
                          module_path="ops.formatter.word_formatter.process")

View File

@@ -1,16 +0,0 @@
name: 'Word文本抽取'
name_en: 'Word Text Extraction'
description: '抽取Word中的文本'
description_en: 'Extracts text from Word files.'
language: 'java'
vendor: 'huawei'
raw_id: 'WordFormatter'
version: '1.0.0'
types:
- 'collect'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -1,68 +0,0 @@
# # -- encoding: utf-8 --
#
# Description:
# Create: 2024/1/30 15:24
# """
from loguru import logger
import os
import subprocess
import time
from typing import Dict, Any
from datamate.common.utils import check_valid_path
from datamate.core.base_op import Mapper
class WordFormatter(Mapper):
    """Extract the text of a Word file by running the bundled Java converter jar."""

    SEPERATOR = ' | '

    def __init__(self, *args, **kwargs):
        super(WordFormatter, self).__init__(*args, **kwargs)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Convert the sample's file and store the resulting text in the text field.

        Args:
            sample: Pipeline record; the text field is overwritten.

        Returns:
            The same sample.

        Raises:
            RuntimeError: The converter produced no output file.
        """
        start = time.time()
        file_name = sample[self.filename_key]
        file_path = sample[self.filepath_key]
        file_type = sample[self.filetype_key]
        txt_content = self.word2html(file_path, file_type)
        sample[self.text_key] = txt_content
        logger.info(f"fileName: {file_name}, method: WordFormatter costs {(time.time() - start):6f} s")
        return sample

    @staticmethod
    def word2html(file_path, file_type):
        """Run ``WordFormatter-1.0.jar`` on ``file_path`` and return the text it wrote.

        The converter writes to a temporary ``<file_path>.txt`` next to the
        input; that file is read and then removed.

        Args:
            file_path: Path of the Word file to convert.
            file_type: File type string handed to the converter as its last argument.

        Returns:
            The content of the converter's output file.

        Raises:
            RuntimeError: No output file exists after the conversion attempt.
                Previously this case surfaced as ``UnboundLocalError``.
        """
        check_valid_path(file_path)
        # Same location the original built by splitting on '/', but also
        # correct for a bare file name or a file directly under '/'.
        html_file_path = f"{file_path}.txt"
        current_file_path = os.path.dirname(os.path.abspath(__file__))
        jar_path = f'{current_file_path}/../../../java_operator/WordFormatter-1.0.jar'
        try:
            # subprocess.run() kills the child when the timeout expires;
            # Popen.communicate(timeout=...) left it running.
            completed = subprocess.run(
                ['java', '-jar', jar_path, file_path, html_file_path, file_type],
                shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                timeout=24 * 60 * 60)
            if completed.returncode == 0:
                logger.info(f"Convert {file_path} successfully to DOCX")
            else:
                # stderr is merged into stdout above, so the converter's
                # diagnostics are in completed.stdout (stderr is None).
                output = (completed.stdout or b'').strip().decode('utf-8', errors='replace')
                logger.error(f"Convert {file_path} failed, error: {output}.")
        except FileNotFoundError:
            logger.error("java command not found, please make sure it is available in PATH")
        except Exception as e:
            # Best effort, as before: log and fall through. A missing output
            # file is turned into an explicit error below.
            logger.error(f"An unexpected error occurred, convert failed: {e}", )

        try:
            with open(html_file_path, 'r', encoding='utf-8') as file:
                txt_content = file.read()
        except FileNotFoundError as err:
            logger.error(f"Tmp file {html_file_path} does not exist")
            raise RuntimeError(
                f"Convert {file_path} failed: output file {html_file_path} was not produced") from err

        # Cleanup failures must not discard text that was read successfully.
        try:
            os.remove(html_file_path)
            logger.info("Tmp docx file removed")
        except FileNotFoundError:
            logger.error(f"Tmp file {html_file_path} does not exist")
        except PermissionError:
            logger.error(f"You are not allowed to delete tmp file {html_file_path}")
        logger.info(f"Convert {html_file_path} to html success")
        return txt_content