You've already forked DataMate
算子将抽取与落盘固定到流程中 (#134)
* feature: 将抽取动作移到每一个算子中 * feature: 落盘算子改为默认执行 * feature: 优化前端展示 * feature: 使用pyproject管理依赖
This commit is contained in:
@@ -15,12 +15,7 @@ _configure_importer()
|
||||
|
||||
|
||||
def _import_operators():
    """Import every formatter operator subpackage.

    The imports are performed purely for their import-time effects; the
    imported names are not used afterwards.
    NOTE(review): presumably each subpackage's ``__init__`` registers its
    operator with the framework when imported - confirm.
    """
    from . import (
        text_formatter,
        word_formatter,
        img_formatter,
        file_exporter,
        slide_formatter,
        unstructured_formatter,
        mineru_formatter,
    )
|
||||
|
||||
|
||||
|
||||
@@ -1,6 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
# Register the operator under the name 'FileExporter', pointing the registry
# at the module that contains its implementation.
OPERATORS.register_module(
    module_name='FileExporter',
    module_path="ops.formatter.file_exporter.process",
)
|
||||
@@ -1,16 +0,0 @@
|
||||
name: '落盘算子'
|
||||
name_en: 'save file operator'
|
||||
description: '将文件内容保存为文件。'
|
||||
description_en: 'Save the file data as a file.'
|
||||
language: 'Python'
|
||||
vendor: 'Huawei'
|
||||
raw_id: 'FileExporter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'collect'
|
||||
modal: 'others'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'all'
|
||||
outputs: 'all'
|
||||
@@ -1,145 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description: File export operator - saves the sample's file content to disk
|
||||
Create: 2024/06/06 15:43
|
||||
"""
|
||||
import time
|
||||
import os
|
||||
import uuid
|
||||
from typing import Tuple, Dict, Any
|
||||
from loguru import logger
|
||||
|
||||
from datamate.core.constant import Fields
|
||||
from datamate.core.base_op import Mapper
|
||||
from datamate.common.utils import check_valid_path
|
||||
|
||||
|
||||
class FileExporter(Mapper):
    """Persist a sample's text or binary payload into its export directory.

    The handler is chosen from the sample's file type: text-like types,
    image types and medical image types each have their own extension list,
    overridable through the ``text_support_ext``, ``data_support_ext`` and
    ``medical_support_ext`` keyword arguments.

    NOTE(review): the ``*_key`` attributes used below (``filename_key``,
    ``text_key``, ...) are not defined in this class; presumably they come
    from ``Mapper`` - confirm.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # NOTE(review): presumably flags this operator as the final step of a
        # pipeline - confirm against the base class.
        self.last_ops = True

        default_text_ext = ['txt', 'html', 'md', 'markdown',
                            'xlsx', 'xls', 'csv', 'pptx', 'ppt',
                            'xml', 'json', 'doc', 'docx', 'pdf']
        self.text_support_ext = kwargs.get("text_support_ext", default_text_ext)

        default_data_ext = ['jpg', 'jpeg', 'png', 'bmp']
        self.data_support_ext = kwargs.get("data_support_ext", default_data_ext)

        default_medical_ext = ['svs', 'tif', 'tiff']
        self.medical_support_ext = kwargs.get("medical_support_ext", default_medical_ext)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Write the sample's payload to disk and update its file metadata.

        Args:
            sample: The sample being processed; it is modified in place.

        Returns:
            The same sample. When both the text and the binary payload are
            empty, only the file size is set (to ``"0"``) and nothing is
            written. Otherwise the payload fields are cleared and the file
            type, name, path and size fields describe the written file.

        Raises:
            TypeError: If the sample's file type is in none of the supported
                extension lists.
            UnicodeDecodeError: Logged and re-raised unchanged.
        """
        original_name = sample[self.filename_key]
        source_type = sample[self.filetype_key]

        try:
            began = time.time()

            # Pick the handler matching the source file type.
            if source_type in self.text_support_ext:
                handler = self.get_textfile_handler
            elif source_type in self.data_support_ext:
                handler = self.get_datafile_handler
            elif source_type in self.medical_support_ext:
                handler = self.get_medicalfile_handler
            else:
                raise TypeError(f"{source_type} is unsupported! please check support_ext in FileExporter Ops")
            sample, save_path = handler(sample)

            # Nothing to persist: report a zero size and stop here.
            payload_is_empty = sample[self.text_key] == '' and sample[self.data_key] == b''
            if payload_is_empty:
                sample[self.filesize_key] = "0"
                return sample

            if save_path:
                self.save_file(sample, save_path)
                # The payload now lives on disk, so drop it from the sample.
                sample[self.text_key] = ''
                sample[self.data_key] = b''
                sample[Fields.result] = True

                # The exported type is whatever follows the last dot of the
                # save path.
                exported_type = save_path.split('.')[-1]
                sample[self.filetype_key] = exported_type

                name_stem = os.path.splitext(original_name)[0]
                sample[self.filename_key] = name_stem + '.' + exported_type

                # save_file() writes to the save path minus its extension, so
                # that extension-less path is what gets recorded and measured.
                written_path = os.path.splitext(save_path)[0]
                sample[self.filepath_key] = written_path
                sample[self.filesize_key] = f"{os.path.getsize(written_path)}"

            logger.info(f"origin file named {original_name} has been save to {save_path}")
            logger.info(f"fileName: {sample[self.filename_key]}, "
                        f"method: FileExporter costs {time.time() - began:.6f} s")
        except UnicodeDecodeError as err:
            logger.error(f"fileName: {sample[self.filename_key]}, "
                         f"method: FileExporter causes decode error: {err}")
            raise
        return sample

    def get_save_path(self, sample: Dict[str, Any], target_type) -> str:
        """Build the destination path ``<export dir>/<file stem>.<target_type>``.

        The export directory is created when ``check_valid_path`` reports a
        falsy result for it.
        """
        export_dir = os.path.abspath(sample[self.export_path_key])
        name_stem = os.path.splitext(sample[self.filename_key])[0]
        exported_name = name_stem + '.' + target_type

        if not check_valid_path(export_dir):
            os.makedirs(export_dir, exist_ok=True)
        return os.path.join(export_dir, exported_name)

    def get_textfile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
        """Prepare a text-type sample for export and return it with its save path."""
        requested_type = sample.get("target_type")

        if not requested_type:
            # No explicit target: regular text cleaning, saved as a txt file.
            sample = self._get_from_text(sample)
            return sample, self.get_save_path(sample, 'txt')

        # An explicit target type means the binary payload is what gets saved
        # (scanned documents, docx format).
        sample = self._get_from_data(sample)
        return sample, self.get_save_path(sample, requested_type)

    def get_datafile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
        """Prepare an image-type sample for export and return it with its save path."""
        requested_type = sample.get("target_type")

        if not requested_type:
            # No explicit target: regular image cleaning, keep the original
            # image format.
            sample = self._get_from_data(sample)
            return sample, self.get_save_path(sample, sample[self.filetype_key])

        # An explicit target type means image-to-text output (markdown
        # format), so the text payload is what gets saved.
        sample = self._get_from_text(sample)
        return sample, self.get_save_path(sample, requested_type)

    def get_medicalfile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
        """Prepare a medical-image sample; its binary payload is always saved as png."""
        sample = self._get_from_data(sample)
        return sample, self.get_save_path(sample, 'png')

    def save_file(self, sample, save_path):
        """Write the sample's payload to ``save_path`` stripped of its extension.

        A non-empty text payload is encoded as UTF-8 and takes precedence;
        otherwise the binary payload is written. Afterwards the parent
        directory is set to mode 0o770 and the file to mode 0o640.
        """
        destination = os.path.splitext(save_path)[0]
        text = sample[self.text_key]
        payload = text.encode('utf-8') if text else sample[self.data_key]

        # Always write in binary mode.
        with open(destination, 'wb') as out:
            out.write(payload)

        os.chmod(os.path.dirname(destination), 0o770)
        os.chmod(destination, 0o640)

    def _get_from_data(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Keep the binary payload (coerced to ``bytes``) and blank the text payload."""
        raw = bytes(sample[self.data_key])
        sample[self.data_key] = raw
        sample[self.text_key] = ''
        return sample

    def _get_from_text(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Blank the binary payload and keep the text payload (coerced to ``str``)."""
        sample[self.data_key] = b''
        sample[self.text_key] = str(sample[self.text_key])
        return sample

    def _get_uuid(self):
        """Return a freshly generated random UUID as a string."""
        return str(uuid.uuid4())
|
||||
@@ -1,6 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
# Register the operator under the name 'ImgFormatter', pointing the registry
# at the module that contains its implementation.
OPERATORS.register_module(
    module_name='ImgFormatter',
    module_path="ops.formatter.img_formatter.process",
)
|
||||
@@ -1,16 +0,0 @@
|
||||
name: '读取图片文件'
|
||||
name_en: 'Image File Reader'
|
||||
description: '读取图片文件。'
|
||||
description_en: 'Reads image files.'
|
||||
language: 'Python'
|
||||
vendor: 'Huawei'
|
||||
raw_id: 'ImgFormatter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'collect'
|
||||
modal: 'image'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'image'
|
||||
outputs: 'image'
|
||||
@@ -1,35 +0,0 @@
|
||||
# # -- encoding: utf-8 --
|
||||
|
||||
#
|
||||
# Description:
|
||||
# Create: 2024/1/30 15:24
|
||||
# """
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from loguru import logger
|
||||
|
||||
from datamate.common.utils import numpy_to_bytes
|
||||
from datamate.core.base_op import Mapper
|
||||
|
||||
|
||||
class ImgFormatter(Mapper):
    """Load an image file from disk into the sample's binary data field."""

    def __init__(self, *args, **kwargs):
        super(ImgFormatter, self).__init__(*args, **kwargs)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Decode the image at the sample's file path and store it as bytes.

        Args:
            sample: The sample being processed; it is modified in place.

        Returns:
            The same sample, with its data field holding the result of
            ``numpy_to_bytes`` for the decoded image.
        """
        began = time.time()
        name = sample[self.filename_key]
        # numpy_to_bytes receives the extension with a leading dot.
        # NOTE(review): presumably it selects the output encoding from it - confirm.
        dotted_ext = "." + sample[self.filetype_key]
        pixels = _img_extract(sample[self.filepath_key])
        sample[self.data_key] = numpy_to_bytes(pixels, dotted_ext)
        logger.info(f"fileName: {name}, method: ImgExtract costs {(time.time() - began):6f} s")
        return sample
|
||||
|
||||
|
||||
def _img_extract(file_path):
    """Read the file at ``file_path`` and decode it as an image with OpenCV.

    The raw bytes are loaded with ``np.fromfile`` and handed to
    ``cv2.imdecode`` rather than passing the path to OpenCV directly.
    """
    raw_bytes = np.fromfile(file_path, dtype=np.uint8)
    # IMREAD_UNCHANGED is the named form of the flag value -1.
    return cv2.imdecode(raw_bytes, cv2.IMREAD_UNCHANGED)
|
||||
@@ -1,6 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
# Register the operator under the name 'TextFormatter', pointing the registry
# at the module that contains its implementation.
OPERATORS.register_module(
    module_name='TextFormatter',
    module_path="ops.formatter.text_formatter.process",
)
|
||||
@@ -1,16 +0,0 @@
|
||||
name: 'TXT文本抽取'
|
||||
name_en: 'TXT Text Extraction'
|
||||
description: '抽取TXT中的文本'
|
||||
description_en: 'Extracts text from TXT files.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'TxtFormatter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'collect'
|
||||
modal: 'text'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'text'
|
||||
outputs: 'text'
|
||||
@@ -1,44 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description: TXT text extraction
|
||||
Create: 2024/06/06 15:43
|
||||
"""
|
||||
import time
|
||||
from loguru import logger
|
||||
from typing import Dict, Any
|
||||
|
||||
from datamate.core.base_op import Mapper
|
||||
|
||||
|
||||
class TextFormatter(Mapper):
    """Read a file from disk and store its UTF-8 decoded text in the sample."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @staticmethod
    def _extract_json(byte_io):
        """Decode UTF-8 bytes to text and turn CRLF line endings into LF."""
        # Decoding with "utf-8-sig" drops a leading BOM, so files saved as
        # UTF-8 with BOM do not end up with a hidden prefix character.
        decoded = byte_io.decode("utf-8-sig")
        return decoded.replace("\r\n", "\n")

    def byte_read(self, sample: Dict[str, Any]):
        """Load the raw bytes of the sample's file into its data field."""
        with open(sample[self.filepath_key], "rb") as stream:
            sample[self.data_key] = stream.read()

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Fill the sample's text field from the file at its path.

        Args:
            sample: The sample being processed; it is modified in place.

        Returns:
            The same sample, with the decoded text stored and the data field
            reset to empty bytes.

        Raises:
            UnicodeDecodeError: If the file is not valid UTF-8; the error is
                logged and re-raised.
        """
        began = time.time()
        try:
            self.byte_read(sample)
            raw = sample[self.data_key]
            sample[self.text_key] = self._extract_json(raw)
            # The raw bytes are no longer needed once the text is extracted.
            sample[self.data_key] = b""
            logger.info(
                f"fileName: {sample[self.filename_key]}, method: TextFormatter costs {(time.time() - began):6f} s")
        except UnicodeDecodeError as err:
            logger.exception(f"fileName: {sample[self.filename_key]}, method: TextFormatter causes decode error: {err}")
            raise
        return sample
|
||||
@@ -1,6 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
# Register the operator under the name 'UnstructuredFormatter', pointing the
# registry at the module that contains its implementation.
OPERATORS.register_module(
    module_name='UnstructuredFormatter',
    module_path="ops.formatter.unstructured_formatter.process",
)
|
||||
@@ -1,16 +0,0 @@
|
||||
name: 'Unstructured文本抽取'
|
||||
name_en: 'Unstructured Text Extraction'
|
||||
description: '抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。'
|
||||
description_en: 'Extracts text from Unstructured files, currently supporting PowerPoint presentations, Word documents and Excel spreadsheets files.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'UnstructuredFormatter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'collect'
|
||||
modal: 'text'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'text'
|
||||
outputs: 'text'
|
||||
@@ -1,37 +0,0 @@
|
||||
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description: 非结构化文本抽取
|
||||
Create: 2025/10/22 15:15
|
||||
"""
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
|
||||
from loguru import logger
|
||||
from unstructured.partition.auto import partition
|
||||
|
||||
from datamate.core.base_op import Mapper
|
||||
|
||||
|
||||
class UnstructuredFormatter(Mapper):
    """Extract plain text from office-style documents via ``unstructured``."""

    # File-name suffixes this operator handles. Every entry carries its
    # leading dot so that only a real extension matches; without the dot,
    # "docx"/"xlsx" would also match names that merely end in those letters.
    SUPPORTED_SUFFIXES = (".ppt", ".pptx", ".docx", ".xlsx", ".csv")

    def __init__(self, *args, **kwargs):
        super(UnstructuredFormatter, self).__init__(*args, **kwargs)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Fill the sample's text field with the text of a supported document.

        Args:
            sample: The sample being processed; it is modified in place.

        Returns:
            The same sample. It is returned untouched when the file name does
            not end in one of ``SUPPORTED_SUFFIXES`` (case-insensitive);
            otherwise its text field holds the string form of every element
            returned by ``partition``, joined by blank lines.

        Raises:
            UnicodeDecodeError: Logged and re-raised unchanged.
        """
        start = time.time()
        filepath = sample.get(self.filepath_key)
        filename = sample.get(self.filename_key)
        if not filename.lower().endswith(self.SUPPORTED_SUFFIXES):
            return sample
        try:
            elements = partition(filename=filepath)
            sample[self.text_key] = "\n\n".join([str(el) for el in elements])
            logger.info(f"fileName: {filename}, method: UnstructuredFormatter costs {(time.time() - start):6f} s")
        except UnicodeDecodeError as err:
            logger.exception(f"fileName: {filename}, method: UnstructuredFormatter causes decode error: {err}")
            raise
        return sample
|
||||
@@ -1,6 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
# Register the operator under the name 'WordFormatter', pointing the registry
# at the module that contains its implementation.
OPERATORS.register_module(
    module_name='WordFormatter',
    module_path="ops.formatter.word_formatter.process",
)
|
||||
@@ -1,16 +0,0 @@
|
||||
name: 'Word文本抽取'
|
||||
name_en: 'Word Text Extraction'
|
||||
description: '抽取Word中的文本'
|
||||
description_en: 'Extracts text from Word files.'
|
||||
language: 'java'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'WordFormatter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'collect'
|
||||
modal: 'text'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'text'
|
||||
outputs: 'text'
|
||||
@@ -1,68 +0,0 @@
|
||||
# # -- encoding: utf-8 --
|
||||
|
||||
#
|
||||
# Description:
|
||||
# Create: 2024/1/30 15:24
|
||||
# """
|
||||
from loguru import logger
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
|
||||
from datamate.common.utils import check_valid_path
|
||||
from datamate.core.base_op import Mapper
|
||||
|
||||
|
||||
class WordFormatter(Mapper):
    """Extract the text of a Word document by running the bundled Java converter."""

    SEPERATOR = ' | '

    def __init__(self, *args, **kwargs):
        super(WordFormatter, self).__init__(*args, **kwargs)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Convert the sample's Word file and store the result in its text field.

        Args:
            sample: The sample being processed; it is modified in place.

        Returns:
            The same sample, with its text field set to the converted content.

        Raises:
            RuntimeError: If the conversion fails (see ``word2html``).
        """
        start = time.time()
        file_name = sample[self.filename_key]
        file_path = sample[self.filepath_key]
        file_type = sample[self.filetype_key]
        txt_content = self.word2html(file_path, file_type)
        sample[self.text_key] = txt_content
        logger.info(f"fileName: {file_name}, method: WordFormatter costs {(time.time() - start):6f} s")
        return sample

    @staticmethod
    def word2html(file_path, file_type):
        """Run the Java converter on ``file_path`` and return the text it produced.

        The converter writes to a temporary ``<file_path>.txt`` file, which is
        read back and then removed.

        Args:
            file_path: Path of the Word document to convert.
            file_type: File type string forwarded to the converter.

        Returns:
            The content of the converter's output file.

        Raises:
            RuntimeError: If ``java`` cannot be started, the conversion times
                out, the converter exits with a non-zero code, or it leaves no
                output file behind.
        """
        # The result is not inspected here.
        # NOTE(review): presumably this is relied on to raise for bad paths - confirm.
        check_valid_path(file_path)
        # Appending to the full path also works when the path contains no "/".
        html_file_path = f"{file_path}.txt"

        current_file_path = os.path.dirname(os.path.abspath(__file__))
        command = ['java', '-jar', f'{current_file_path}/../../../java_operator/WordFormatter-1.0.jar',
                   file_path, html_file_path, file_type]
        try:
            # subprocess.run kills the child process itself when the timeout
            # expires. stderr is merged into stdout, so all converter output
            # arrives in ``completed.stdout``.
            completed = subprocess.run(command, shell=False, stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT, timeout=24 * 60 * 60)
        except FileNotFoundError as err:
            logger.error("java command not found, please make sure it is available in PATH")
            raise RuntimeError(f"Convert {file_path} failed: java command not found") from err
        except subprocess.TimeoutExpired as err:
            logger.error(f"Convert {file_path} timed out after {err.timeout} s")
            raise RuntimeError(f"Convert {file_path} timed out") from err

        if completed.returncode != 0:
            detail = completed.stdout.decode('utf-8', errors='replace').strip()
            logger.error(f"Convert {file_path} failed, error: {detail}.")
            raise RuntimeError(f"Convert {file_path} failed with return code {completed.returncode}")
        logger.info(f"Convert {file_path} successfully to txt")

        try:
            with open(html_file_path, 'r', encoding='utf-8') as file:
                txt_content = file.read()
        except FileNotFoundError as err:
            logger.error(f"Tmp file {html_file_path} does not exist")
            raise RuntimeError(f"Convert {file_path} produced no output file") from err

        # Cleanup is best effort: the text has already been read, so a failed
        # removal is only logged.
        try:
            os.remove(html_file_path)
            logger.info("Tmp txt file removed")
        except PermissionError:
            logger.error(f"You are not allowed to delete tmp file {html_file_path}")
        return txt_content
|
||||
Reference in New Issue
Block a user