You've already forked DataMate
@@ -20,6 +20,7 @@ def _import_operators():
|
||||
from . import img_formatter
|
||||
from . import file_exporter
|
||||
from . import slide_formatter
|
||||
from . import unstructured_formatter
|
||||
|
||||
|
||||
_import_operators()
|
||||
|
||||
6
runtime/ops/formatter/unstructured_formatter/__init__.py
Normal file
6
runtime/ops/formatter/unstructured_formatter/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from datamate.core.base_op import OPERATORS
|
||||
|
||||
# Register the operator under its public name, pointing at the module
# (process.py in this package) that defines the UnstructuredFormatter class.
OPERATORS.register_module(
    module_name="UnstructuredFormatter",
    module_path="ops.formatter.unstructured_formatter.process",
)
|
||||
16
runtime/ops/formatter/unstructured_formatter/metadata.yml
Normal file
16
runtime/ops/formatter/unstructured_formatter/metadata.yml
Normal file
@@ -0,0 +1,16 @@
|
||||
name: '非结构化文本抽取'
|
||||
name_en: 'Unstructured Text Extraction'
|
||||
description: '抽取非结构化文件的文本,目前支持word文档'
|
||||
description_en: 'Extracts text from unstructured files, currently supporting Word documents.'
|
||||
language: 'python'
|
||||
vendor: 'huawei'
|
||||
raw_id: 'UnstructuredFormatter'
|
||||
version: '1.0.0'
|
||||
types:
|
||||
- 'collect'
|
||||
modal: 'text'
|
||||
effect:
|
||||
before: ''
|
||||
after: ''
|
||||
inputs: 'text'
|
||||
outputs: 'text'
|
||||
35
runtime/ops/formatter/unstructured_formatter/process.py
Normal file
35
runtime/ops/formatter/unstructured_formatter/process.py
Normal file
@@ -0,0 +1,35 @@
|
||||
|
||||
#!/usr/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description: 非结构化文本抽取
|
||||
Create: 2025/10/22 15:15
|
||||
"""
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
|
||||
from loguru import logger
|
||||
from unstructured.partition.auto import partition
|
||||
|
||||
from datamate.core.base_op import Mapper
|
||||
|
||||
|
||||
class UnstructuredFormatter(Mapper):
    """Extract the text content of an unstructured input file as plain text."""

    def __init__(self, *args, **kwargs):
        # No operator-specific state; all arguments are forwarded to Mapper.
        super().__init__(*args, **kwargs)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Partition the sample's file and store the joined text on the sample.

        The file path and file name are read from ``sample`` using
        ``self.filepath_key`` and ``self.filename_key`` (presumably provided
        by ``Mapper`` -- confirm against the base class). The file is parsed
        with ``unstructured``'s ``partition``; the string form of every
        returned element is joined with a blank line and written to
        ``sample[self.text_key]``.

        Args:
            sample: The record being processed; it is updated in place.

        Returns:
            The same ``sample`` dict, with the extracted text set.

        Raises:
            UnicodeDecodeError: Logged with the file name, then re-raised.
        """
        began = time.time()
        source_path = sample.get(self.filepath_key)
        source_name = sample.get(self.filename_key)
        try:
            pieces = partition(filename=source_path)
            # Elements are separated by an empty line in the resulting text.
            sample[self.text_key] = "\n\n".join(str(piece) for piece in pieces)
            logger.info(f"fileName: {source_name}, method: UnstructuredFormatter costs {(time.time() - began):6f} s")
        except UnicodeDecodeError as err:
            logger.exception(f"fileName: {source_name}, method: UnstructuredFormatter causes decode error: {err}")
            raise
        return sample
|
||||
@@ -19,3 +19,4 @@ xmltodict==1.0.2
|
||||
zhconv==1.4.3
|
||||
sqlalchemy==2.0.40
|
||||
pymysql==1.1.1
|
||||
unstructured[docx]==0.18.15
|
||||
@@ -68,6 +68,7 @@ VALUES (1, '模态', 'predefined', 0),
|
||||
INSERT IGNORE INTO t_operator
|
||||
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
|
||||
VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
|
||||
('UnstructuredFormatter', '非结构化文本抽取', '抽取非结构化文件的文本,目前支持word文档。', '1.0.0', 'text', 'text', null, null, '', false),
|
||||
('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', null, null, '', false),
|
||||
('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
|
||||
('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
|
||||
@@ -121,7 +122,7 @@ AND o.id IN ('TextFormatter', 'FileWithShortOrLongLengthFilter', 'FileWithHighRe
|
||||
'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
|
||||
'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
|
||||
'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
|
||||
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner');
|
||||
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'UnstructuredFormatter');
|
||||
|
||||
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
|
||||
SELECT c.id, o.id
|
||||
@@ -137,4 +138,4 @@ SELECT c.id, o.id
|
||||
FROM t_operator_category c
|
||||
CROSS JOIN t_operator o
|
||||
WHERE c.id IN (7, 8, 11)
|
||||
AND o.id IN ('FileExporter');
|
||||
AND o.id IN ('FileExporter', 'UnstructuredFormatter');
|
||||
|
||||
@@ -7,7 +7,7 @@ ENV PYTHONPATH=/opt/runtime/datamate/
|
||||
|
||||
RUN sed -i 's/deb.debian.org/mirrors.huaweicloud.com/g' /etc/apt/sources.list.d/debian.sources \
|
||||
&& apt update \
|
||||
&& apt install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64 \
|
||||
&& apt install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64 libreoffice\
|
||||
&& apt clean \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
Reference in New Issue
Block a user