Merge pull request #1 from ModelEngine-Group/main

merge
2025-10-23 17:13:24 +08:00
parent 4e53cc813b d66925410e
commit 8e9a94971e
7 changed files with 63 additions and 3 deletions
--- a/runtime/ops/formatter/init.py
+++ b/runtime/ops/formatter/init.py
@@ -20,6 +20,7 @@ def _import_operators():
    from . import img_formatter
    from . import file_exporter
    from . import slide_formatter
    from . import unstructured_formatter
 _import_operators()
--- a/runtime/ops/formatter/unstructured_formatter/init.py
+++ b/runtime/ops/formatter/unstructured_formatter/init.py
@@ -0,0 +1,6 @@
 # -*- coding: utf-8 -*-
 from datamate.core.base_op import OPERATORS
 # Register this operator under the name 'UnstructuredFormatter', pointing at
 # the module path that holds its implementation.
 OPERATORS.register_module(
     module_name="UnstructuredFormatter",
     module_path="ops.formatter.unstructured_formatter.process",
 )
--- a/runtime/ops/formatter/unstructured_formatter/metadata.yml
+++ b/runtime/ops/formatter/unstructured_formatter/metadata.yml
@@ -0,0 +1,16 @@
 name: '非结构化文本抽取'
 name_en: 'Unstructured Text Extraction'
 description: '抽取非结构化文件的文本，目前支持word文档'
 description_en: 'Extracts text from unstructured files, currently supporting Word documents.'
 language: 'python'
 vendor: 'huawei'
 raw_id: 'UnstructuredFormatter'
 version: '1.0.0'
 types:
  - 'collect'
 modal: 'text'
 effect:
  before: ''
  after: ''
 inputs: 'text'
 outputs: 'text'
--- a/runtime/ops/formatter/unstructured_formatter/process.py
+++ b/runtime/ops/formatter/unstructured_formatter/process.py
@@ -0,0 +1,35 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 """
 Description: 非结构化文本抽取
 Create: 2025/10/22 15:15
 """
 import time
 from typing import Dict, Any
 from loguru import logger
 from unstructured.partition.auto import partition
 from datamate.core.base_op import Mapper
 class UnstructuredFormatter(Mapper):
    """Mapper that extracts plain text from an unstructured input file.

    The file is parsed with ``partition`` and the string form of every
    returned element is joined with blank lines between elements.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Parse the sample's file and store the extracted text in the sample.

        Args:
            sample: Record from which the file path and file name are read
                via ``self.filepath_key`` and ``self.filename_key``.

        Returns:
            The same ``sample`` dict, with the extracted text written under
            ``self.text_key``.

        Raises:
            UnicodeDecodeError: Logged together with the file name, then
                re-raised unchanged.
        """
        began_at = time.time()
        source_path = sample.get(self.filepath_key)
        display_name = sample.get(self.filename_key)
        try:
            parsed = partition(filename=source_path)
            # Elements are stringified one by one and separated by a blank line.
            sample[self.text_key] = "\n\n".join(map(str, parsed))
            elapsed = time.time() - began_at
            logger.info(f"fileName: {display_name}, method: UnstructuredFormatter costs {elapsed:6f} s")
        except UnicodeDecodeError as err:
            logger.exception(f"fileName: {display_name}, method: UnstructuredFormatter causes decode error: {err}")
            raise
        return sample
--- a/runtime/ops/requirements.txt
+++ b/runtime/ops/requirements.txt
@@ -19,3 +19,4 @@ xmltodict==1.0.2
 zhconv==1.4.3
 sqlalchemy==2.0.40
 pymysql==1.1.1
 unstructured[docx]==0.18.15
--- a/scripts/db/data-operator-init.sql
+++ b/scripts/db/data-operator-init.sql
@@ -68,6 +68,7 @@ VALUES (1, '模态', 'predefined', 0),
 INSERT IGNORE INTO t_operator
 (id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
 VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
       ('UnstructuredFormatter', '非结构化文本抽取', '抽取非结构化文件的文本，目前支持word文档。', '1.0.0', 'text', 'text', null, null, '', false),
       ('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', null, null, '', false),
       ('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值，该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时，选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
       ('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值，该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
@@ -121,7 +122,7 @@ AND o.id IN ('TextFormatter', 'FileWithShortOrLongLengthFilter', 'FileWithHighRe
            'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
            'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
            'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
-            'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner');
+            'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'UnstructuredFormatter');
 INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
 SELECT c.id, o.id
@@ -137,4 +138,4 @@ SELECT c.id, o.id
 FROM t_operator_category c
       CROSS JOIN t_operator o
 WHERE c.id IN (7, 8, 11)
-  AND o.id IN ('FileExporter');
+  AND o.id IN ('FileExporter', 'UnstructuredFormatter');
--- a/scripts/images/runtime/Dockerfile
+++ b/scripts/images/runtime/Dockerfile
@@ -7,7 +7,7 @@ ENV PYTHONPATH=/opt/runtime/datamate/
 RUN sed -i 's/deb.debian.org/mirrors.huaweicloud.com/g' /etc/apt/sources.list.d/debian.sources \
    && apt update \
-    && apt install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64 \
+    && apt install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64  libreoffice\
    && apt clean \
    && rm -rf /var/lib/apt/lists/*