diff --git a/runtime/ops/formatter/__init__.py b/runtime/ops/formatter/__init__.py
index c08289b..bc02387 100644
--- a/runtime/ops/formatter/__init__.py
+++ b/runtime/ops/formatter/__init__.py
@@ -20,6 +20,7 @@ def _import_operators():
     from . import img_formatter
     from . import file_exporter
     from . import slide_formatter
+    from . import unstructured_formatter
 
 
 _import_operators()
diff --git a/runtime/ops/formatter/unstructured_formatter/__init__.py b/runtime/ops/formatter/unstructured_formatter/__init__.py
new file mode 100644
index 0000000..ab5ad41
--- /dev/null
+++ b/runtime/ops/formatter/unstructured_formatter/__init__.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+OPERATORS.register_module(module_name='UnstructuredFormatter',
+                          module_path="ops.formatter.unstructured_formatter.process")
diff --git a/runtime/ops/formatter/unstructured_formatter/metadata.yml b/runtime/ops/formatter/unstructured_formatter/metadata.yml
new file mode 100644
index 0000000..fc2956c
--- /dev/null
+++ b/runtime/ops/formatter/unstructured_formatter/metadata.yml
@@ -0,0 +1,16 @@
+name: '非结构化文本抽取'
+name_en: 'Unstructured Text Extraction'
+description: '抽取非结构化文件的文本,目前支持word文档'
+description_en: 'Extracts text from unstructured files, currently supporting Word documents.'
+language: 'python'
+vendor: 'huawei'
+raw_id: 'UnstructuredFormatter'
+version: '1.0.0'
+types:
+  - 'collect'
+modal: 'text'
+effect:
+  before: ''
+  after: ''
+inputs: 'text'
+outputs: 'text'
diff --git a/runtime/ops/formatter/unstructured_formatter/process.py b/runtime/ops/formatter/unstructured_formatter/process.py
new file mode 100644
index 0000000..3107f8b
--- /dev/null
+++ b/runtime/ops/formatter/unstructured_formatter/process.py
@@ -0,0 +1,46 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""
+Description: Unstructured text extraction
+Create: 2025/10/22 15:15
+"""
+import time
+from typing import Dict, Any
+
+from loguru import logger
+from unstructured.partition.auto import partition
+
+from datamate.core.base_op import Mapper
+
+
+class UnstructuredFormatter(Mapper):
+    """Extract the text of an unstructured input file (e.g. Word) as plain text."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
+        """Partition the sample's file and store the joined element text.
+
+        The file path and name are read from the sample via ``self.filepath_key``
+        and ``self.filename_key``; the extracted text is written back under
+        ``self.text_key`` with elements separated by a blank line.
+
+        Any failure is logged together with the file name and re-raised.
+        """
+        start = time.time()
+        filepath = sample.get(self.filepath_key)
+        filename = sample.get(self.filename_key)
+        try:
+            elements = partition(filename=filepath)
+            sample[self.text_key] = "\n\n".join(str(el) for el in elements)
+        except UnicodeDecodeError as err:
+            logger.exception(f"fileName: {filename}, method: UnstructuredFormatter causes decode error: {err}")
+            raise
+        except Exception as err:
+            # Failures other than decode errors are logged with the file name too, then propagated unchanged.
+            logger.exception(f"fileName: {filename}, method: UnstructuredFormatter failed: {err}")
+            raise
+        logger.info(f"fileName: {filename}, method: UnstructuredFormatter costs {(time.time() - start):.6f} s")
+        return sample
diff --git a/runtime/ops/requirements.txt b/runtime/ops/requirements.txt
index 92dcfc6..8c842b0 100644
--- a/runtime/ops/requirements.txt
+++ b/runtime/ops/requirements.txt
@@ -19,3 +19,4 @@ xmltodict==1.0.2
 zhconv==1.4.3
 sqlalchemy==2.0.40
 pymysql==1.1.1
+unstructured[docx]==0.18.15
diff --git a/scripts/db/data-operator-init.sql b/scripts/db/data-operator-init.sql
index 9ee826a..4a1b8a6 100644
--- a/scripts/db/data-operator-init.sql
+++ b/scripts/db/data-operator-init.sql
@@ -68,6 +68,7 @@ VALUES (1, '模态', 'predefined', 0),
 INSERT IGNORE INTO t_operator
 (id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
 VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
+       ('UnstructuredFormatter', '非结构化文本抽取', '抽取非结构化文件的文本,目前支持word文档。', '1.0.0', 'text', 'text', null, null, '', false),
        ('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', null, null, '', false),
        ('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
        ('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
@@ -121,7 +122,7 @@ AND o.id IN ('TextFormatter', 'FileWithShortOrLongLengthFilter', 'FileWithHighRe
                'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
                'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
                'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
-               'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner');
+               'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'UnstructuredFormatter');
 
 INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
 SELECT c.id, o.id
@@ -137,4 +138,4 @@ SELECT c.id, o.id
 FROM t_operator_category c
          CROSS JOIN t_operator o
 WHERE c.id IN (7, 8, 11)
-  AND o.id IN ('FileExporter');
+  AND o.id IN ('FileExporter', 'UnstructuredFormatter');
diff --git a/scripts/images/runtime/Dockerfile b/scripts/images/runtime/Dockerfile
index 147fe5f..6d3f8ef 100644
--- a/scripts/images/runtime/Dockerfile
+++ b/scripts/images/runtime/Dockerfile
@@ -7,7 +7,7 @@ ENV PYTHONPATH=/opt/runtime/datamate/
 
 RUN sed -i 's/deb.debian.org/mirrors.huaweicloud.com/g' /etc/apt/sources.list.d/debian.sources \
     && apt update \
-    && apt install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64 \
+    && apt install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64 libreoffice \
     && apt clean \
     && rm -rf /var/lib/apt/lists/*
 