算子将抽取与落盘固定到流程中 (#134)

* feature: 将抽取动作移到每一个算子中
* feature: 落盘算子改为默认执行
* feature: 优化前端展示
* feature: 使用pyproject管理依赖
2025-12-05 17:26:29 +08:00
parent 744d15ba24
commit d59c167da4
70 changed files with 289 additions and 539 deletions
--- a/runtime/ops/formatter/__init__.py
+++ b/runtime/ops/formatter/__init__.py
@@ -15,12 +15,7 @@ _configure_importer()


 def _import_operators():
-    from . import text_formatter
-    from . import word_formatter
-    from . import img_formatter
-    from . import file_exporter
     from . import slide_formatter
-    from . import unstructured_formatter
     from . import mineru_formatter


--- a/runtime/ops/formatter/file_exporter/__init__.py
+++ b/runtime/ops/formatter/file_exporter/__init__.py
@@ -1,6 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from datamate.core.base_op import OPERATORS
-
-OPERATORS.register_module(module_name='FileExporter',
-                          module_path="ops.formatter.file_exporter.process")
--- a/runtime/ops/formatter/file_exporter/metadata.yml
+++ b/runtime/ops/formatter/file_exporter/metadata.yml
@@ -1,16 +0,0 @@
-name: '落盘算子'
-name_en: 'save file operator'
-description: '将文件内容保存为文件。'
-description_en: 'Save the file data as a file.'
-language: 'Python'
-vendor: 'Huawei'
-raw_id: 'FileExporter'
-version: '1.0.0'
-types:
-  - 'collect'
-modal: 'others'
-effect:
-  before: ''
-  after: ''
-inputs: 'all'
-outputs: 'all'
--- a/runtime/ops/formatter/file_exporter/process.py
+++ b/runtime/ops/formatter/file_exporter/process.py
@@ -1,145 +0,0 @@
-#!/user/bin/python
-# -*- coding: utf-8 -*-
-
-"""
-Description: Json文本抽取
-Create: 2024/06/06 15:43
-"""
-import time
-import os
-import uuid
-from typing import Tuple, Dict, Any
-from loguru import logger
-
-from datamate.core.constant import Fields
-from datamate.core.base_op import Mapper
-from datamate.common.utils import check_valid_path
-
-
-class FileExporter(Mapper):
-    """把输入的json文件流抽取为txt"""
-
-    def __init__(self, *args, **kwargs):
-        super(FileExporter, self).__init__(*args, **kwargs)
-        self.last_ops = True
-        self.text_support_ext = kwargs.get("text_support_ext", ['txt', 'html', 'md', 'markdown',
-                                                                'xlsx', 'xls', 'csv', 'pptx', 'ppt',
-                                                                'xml', 'json', 'doc', 'docx', 'pdf'])
-        self.data_support_ext = kwargs.get("data_support_ext", ['jpg', 'jpeg', 'png', 'bmp'])
-        self.medical_support_ext = kwargs.get("medical_support_ext", ['svs', 'tif', 'tiff'])
-
-    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
-        file_name = sample[self.filename_key]
-        file_type = sample[self.filetype_key]
-
-        try:
-            start = time.time()
-            if file_type in self.text_support_ext:
-                sample, save_path = self.get_textfile_handler(sample)
-            elif file_type in self.data_support_ext:
-                sample, save_path = self.get_datafile_handler(sample)
-            elif file_type in self.medical_support_ext:
-                sample, save_path = self.get_medicalfile_handler(sample)
-            else:
-                raise TypeError(f"{file_type} is unsupported! please check support_ext in FileExporter Ops")
-
-            if sample[self.text_key] == '' and sample[self.data_key] == b'':
-                sample[self.filesize_key] = "0"
-                return sample
-
-            if save_path:
-                self.save_file(sample, save_path)
-                sample[self.text_key] = ''
-                sample[self.data_key] = b''
-                sample[Fields.result] = True
-
-                file_type = save_path.split('.')[-1]
-                sample[self.filetype_key] = file_type
-
-                base_name, _ = os.path.splitext(file_name)
-                new_file_name = base_name + '.' + file_type
-                sample[self.filename_key] = new_file_name
-
-                base_name, _ = os.path.splitext(save_path)
-                sample[self.filepath_key] = base_name
-                file_size = os.path.getsize(base_name)
-                sample[self.filesize_key] = f"{file_size}"
-
-            logger.info(f"origin file named {file_name} has been save to {save_path}")
-            logger.info(f"fileName: {sample[self.filename_key]}, "
-                        f"method: FileExporter costs {time.time() - start:.6f} s")
-        except UnicodeDecodeError as err:
-            logger.error(f"fileName: {sample[self.filename_key]}, "
-                         f"method: FileExporter causes decode error: {err}")
-            raise
-        return sample
-
-    def get_save_path(self, sample: Dict[str, Any], target_type) -> str:
-        export_path = os.path.abspath(sample[self.export_path_key])
-        file_name = sample[self.filename_key]
-        new_file_name = os.path.splitext(file_name)[0] + '.' + target_type
-
-        if not check_valid_path(export_path):
-            os.makedirs(export_path, exist_ok=True)
-        res = os.path.join(export_path, new_file_name)
-        return res
-
-    def get_textfile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
-        target_type = sample.get("target_type", None)
-
-        # target_type存在则保存为扫描件, docx格式
-        if target_type:
-            sample = self._get_from_data(sample)
-            save_path = self.get_save_path(sample, target_type)
-        # 不存在则保存为txt文件，正常文本清洗
-        else:
-            sample = self._get_from_text(sample)
-            save_path = self.get_save_path(sample, 'txt')
-        return sample, save_path
-
-    def get_datafile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
-        target_type = sample.get("target_type", None)
-
-        # target_type存在, 图转文保存为target_type，markdown格式
-        if target_type:
-            sample = self._get_from_text(sample)
-            save_path = self.get_save_path(sample, target_type)
-        # 不存在则保存为原本图片文件格式，正常图片清洗
-        else:
-            sample = self._get_from_data(sample)
-            save_path = self.get_save_path(sample, sample[self.filetype_key])
-        return sample, save_path
-
-    def get_medicalfile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
-        target_type = 'png'
-
-        sample = self._get_from_data(sample)
-        save_path = self.get_save_path(sample, target_type)
-
-        return sample, save_path
-
-    def save_file(self, sample, save_path):
-        file_name, _ = os.path.splitext(save_path)
-        # 以二进制格式保存文件
-        file_sample = sample[self.text_key].encode('utf-8') if sample[self.text_key] else sample[self.data_key]
-        with open(file_name, 'wb') as f:
-            f.write(file_sample)
-            # 获取父目录路径
-
-        parent_dir = os.path.dirname(file_name)
-        os.chmod(parent_dir, 0o770)
-        os.chmod(file_name, 0o640)
-
-    def _get_from_data(self, sample: Dict[str, Any]) -> Dict[str, Any]:
-        sample[self.data_key] = bytes(sample[self.data_key])
-        sample[self.text_key] = ''
-        return sample
-
-    def _get_from_text(self, sample: Dict[str, Any]) -> Dict[str, Any]:
-        sample[self.data_key] = b''
-        sample[self.text_key] = str(sample[self.text_key])
-        return sample
-
-    def _get_uuid(self):
-        res = str(uuid.uuid4())
-        return res
--- a/runtime/ops/formatter/img_formatter/__init__.py
+++ b/runtime/ops/formatter/img_formatter/__init__.py
@@ -1,6 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from datamate.core.base_op import OPERATORS
-
-OPERATORS.register_module(module_name='ImgFormatter',
-                          module_path="ops.formatter.img_formatter.process")
--- a/runtime/ops/formatter/img_formatter/metadata.yml
+++ b/runtime/ops/formatter/img_formatter/metadata.yml
@@ -1,16 +0,0 @@
-name: '读取图片文件'
-name_en: 'Image File Reader'
-description: '读取图片文件。'
-description_en: 'Reads image files.'
-language: 'Python'
-vendor: 'Huawei'
-raw_id: 'ImgFormatter'
-version: '1.0.0'
-types:
-  - 'collect'
-modal: 'image'
-effect:
-  before: ''
-  after: ''
-inputs: 'image'
-outputs: 'image'
--- a/runtime/ops/formatter/img_formatter/process.py
+++ b/runtime/ops/formatter/img_formatter/process.py
@@ -1,35 +0,0 @@
-# # -- encoding: utf-8 --
-
-#
-# Description:
-# Create: 2024/1/30 15:24
-# """
-import time
-from typing import Dict, Any
-
-import cv2
-import numpy as np
-from loguru import logger
-
-from datamate.common.utils import numpy_to_bytes
-from datamate.core.base_op import Mapper
-
-
-class ImgFormatter(Mapper):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
-        start = time.time()
-        file_name = sample[self.filename_key]
-        file_type = "." + sample[self.filetype_key]
-        file_path = sample[self.filepath_key]
-        img_data = _img_extract(file_path)
-        sample[self.data_key] = numpy_to_bytes(img_data, file_type)
-        logger.info(f"fileName: {file_name}, method: ImgExtract costs {(time.time() - start):6f} s")
-        return sample
-
-
-def _img_extract(file_path):
-    return cv2.imdecode(np.fromfile(file_path, dtype=np.uint8), -1)
--- a/runtime/ops/formatter/text_formatter/__init__.py
+++ b/runtime/ops/formatter/text_formatter/__init__.py
@@ -1,6 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from datamate.core.base_op import OPERATORS
-
-OPERATORS.register_module(module_name='TextFormatter',
-                          module_path="ops.formatter.text_formatter.process")
--- a/runtime/ops/formatter/text_formatter/metadata.yml
+++ b/runtime/ops/formatter/text_formatter/metadata.yml
@@ -1,16 +0,0 @@
-name: 'TXT文本抽取'
-name_en: 'TXT Text Extraction'
-description: '抽取TXT中的文本'
-description_en: 'Extracts text from TXT files.'
-language: 'python'
-vendor: 'huawei'
-raw_id: 'TxtFormatter'
-version: '1.0.0'
-types:
-  - 'collect'
-modal: 'text'
-effect:
-  before: ''
-  after: ''
-inputs: 'text'
-outputs: 'text'
--- a/runtime/ops/formatter/text_formatter/process.py
+++ b/runtime/ops/formatter/text_formatter/process.py
@@ -1,44 +0,0 @@
-#!/user/bin/python
-# -*- coding: utf-8 -*-
-
-"""
-Description: Json文本抽取
-Create: 2024/06/06 15:43
-"""
-import time
-from loguru import logger
-from typing import Dict, Any
-
-from datamate.core.base_op import Mapper
-
-
-class TextFormatter(Mapper):
-    """把输入的json文件流抽取为txt"""
-
-    def __init__(self, *args, **kwargs):
-        super(TextFormatter, self).__init__(*args, **kwargs)
-
-    @staticmethod
-    def _extract_json(byte_io):
-        """将默认使用utf-8编码的Json文件流解码，抽取为txt"""
-        # 用utf-8-sig的格式进行抽取，可以避免uft-8 BOM编码格式的文件在抽取后产生隐藏字符作为前缀。
-        return byte_io.decode("utf-8-sig").replace("\r\n", "\n")
-
-    def byte_read(self, sample: Dict[str, Any]):
-        filepath = sample[self.filepath_key]
-        with open(filepath, "rb") as file:
-            byte_data = file.read()
-        sample[self.data_key] = byte_data
-
-    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
-        start = time.time()
-        try:
-            self.byte_read(sample)
-            sample[self.text_key] = self._extract_json(sample[self.data_key])
-            sample[self.data_key] = b""  # 将sample[self.data_key]置空
-            logger.info(
-                f"fileName: {sample[self.filename_key]}, method: TextFormatter costs {(time.time() - start):6f} s")
-        except UnicodeDecodeError as err:
-            logger.exception(f"fileName: {sample[self.filename_key]}, method: TextFormatter causes decode error: {err}")
-            raise
-        return sample
--- a/runtime/ops/formatter/unstructured_formatter/__init__.py
+++ b/runtime/ops/formatter/unstructured_formatter/__init__.py
@@ -1,6 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from datamate.core.base_op import OPERATORS
-
-OPERATORS.register_module(module_name='UnstructuredFormatter',
-                          module_path="ops.formatter.unstructured_formatter.process")
--- a/runtime/ops/formatter/unstructured_formatter/metadata.yml
+++ b/runtime/ops/formatter/unstructured_formatter/metadata.yml
@@ -1,16 +0,0 @@
-name: 'Unstructured文本抽取'
-name_en: 'Unstructured Text Extraction'
-description: '抽取非结构化文件的文本，目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。'
-description_en: 'Extracts text from Unstructured files, currently supporting PowerPoint presentations, Word documents and Excel spreadsheets files.'
-language: 'python'
-vendor: 'huawei'
-raw_id: 'UnstructuredFormatter'
-version: '1.0.0'
-types:
-  - 'collect'
-modal: 'text'
-effect:
-  before: ''
-  after: ''
-inputs: 'text'
-outputs: 'text'
--- a/runtime/ops/formatter/unstructured_formatter/process.py
+++ b/runtime/ops/formatter/unstructured_formatter/process.py
@@ -1,37 +0,0 @@
-
-#!/user/bin/python
-# -*- coding: utf-8 -*-
-
-"""
-Description: 非结构化文本抽取
-Create: 2025/10/22 15:15
-"""
-import time
-from typing import Dict, Any
-
-from loguru import logger
-from unstructured.partition.auto import partition
-
-from datamate.core.base_op import Mapper
-
-
-class UnstructuredFormatter(Mapper):
-    """把输入的非结构化文本抽取为txt"""
-
-    def __init__(self, *args, **kwargs):
-        super(UnstructuredFormatter, self).__init__(*args, **kwargs)
-
-    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
-        start = time.time()
-        filepath = sample.get(self.filepath_key)
-        filename = sample.get(self.filename_key)
-        if not filename.lower().endswith((".ppt", ".pptx", "docx", "xlsx", ".csv")):
-            return sample
-        try:
-            elements = partition(filename=filepath)
-            sample[self.text_key] = "\n\n".join([str(el) for el in elements])
-            logger.info(f"fileName: {filename}, method: UnstructuredFormatter costs {(time.time() - start):6f} s")
-        except UnicodeDecodeError as err:
-            logger.exception(f"fileName: {filename}, method: UnstructuredFormatter causes decode error: {err}")
-            raise
-        return sample
--- a/runtime/ops/formatter/word_formatter/__init__.py
+++ b/runtime/ops/formatter/word_formatter/__init__.py
@@ -1,6 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from datamate.core.base_op import OPERATORS
-
-OPERATORS.register_module(module_name='WordFormatter',
-                          module_path="ops.formatter.word_formatter.process")
--- a/runtime/ops/formatter/word_formatter/metadata.yml
+++ b/runtime/ops/formatter/word_formatter/metadata.yml
@@ -1,16 +0,0 @@
-name: 'Word文本抽取'
-name_en: 'Word Text Extraction'
-description: '抽取Word中的文本'
-description_en: 'Extracts text from Word files.'
-language: 'java'
-vendor: 'huawei'
-raw_id: 'WordFormatter'
-version: '1.0.0'
-types:
-  - 'collect'
-modal: 'text'
-effect:
-  before: ''
-  after: ''
-inputs: 'text'
-outputs: 'text'
--- a/runtime/ops/formatter/word_formatter/process.py
+++ b/runtime/ops/formatter/word_formatter/process.py
@@ -1,68 +0,0 @@
-# # -- encoding: utf-8 --
-
-#
-# Description:
-# Create: 2024/1/30 15:24
-# """
-from loguru import logger
-import os
-import subprocess
-import time
-from typing import Dict, Any
-
-from datamate.common.utils import check_valid_path
-from datamate.core.base_op import Mapper
-
-
-class WordFormatter(Mapper):
-    SEPERATOR = ' | '
-
-    def __init__(self, *args, **kwargs):
-        super(WordFormatter, self).__init__(*args, **kwargs)
-
-    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
-        start = time.time()
-        file_name = sample[self.filename_key]
-        file_path = sample[self.filepath_key]
-        file_type = sample[self.filetype_key]
-        txt_content = self.word2html(file_path, file_type)
-        sample[self.text_key] = txt_content
-        logger.info(f"fileName: {file_name}, method: WordFormatter costs {(time.time() - start):6f} s")
-        return sample
-
-    @staticmethod
-    def word2html(file_path, file_type):
-        check_valid_path(file_path)
-        file_dir = file_path.rsplit('/', 1)[0]
-        file_name = file_path.rsplit('/', 1)[1]
-        html_file_path = os.path.join(file_dir, f"{file_name}.txt")
-
-        current_file_path = os.path.dirname(os.path.abspath(__file__))
-        try:
-            process = subprocess.Popen(
-                ['java', '-jar', f'{current_file_path}/../../../java_operator/WordFormatter-1.0.jar', file_path,
-                 html_file_path, file_type], shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-            stdout, stderr = process.communicate(timeout=24 * 60 * 60)
-            if process.returncode == 0:
-                logger.info(f"Convert {file_path} successfully to DOCX")
-            else:
-                logger.info(f"Convert {file_path} failed, error: {stderr.strip().decode('utf-8')}.")
-                raise RuntimeError()
-        except subprocess.CalledProcessError as e:
-            logger.error(f"Convert failed: ｛e｝, return code: ｛e.returncode｝")
-        except FileNotFoundError:
-            logger.error("LibreOffice command not found, please make sure it is available in PATH")
-        except Exception as e:
-            logger.error(f"An unexpected error occurred, convert failed: ｛e｝", )
-
-        try:
-            with open(html_file_path, 'r', encoding='utf-8') as file:
-                txt_content = file.read()
-            os.remove(html_file_path)
-            logger.info("Tmp docx file removed")
-        except FileNotFoundError:
-            logger.error(f"Tmp file ｛html_file_path｝ does not exist")
-        except PermissionError:
-            logger.error(f"You are not allowed to delete tmp file {html_file_path}")
-        logger.info(f"Convert {html_file_path} to html success")
-        return txt_content