You've already forked DataMate
@@ -20,6 +20,7 @@ def _import_operators():
|
|||||||
from . import img_formatter
|
from . import img_formatter
|
||||||
from . import file_exporter
|
from . import file_exporter
|
||||||
from . import slide_formatter
|
from . import slide_formatter
|
||||||
|
from . import unstructured_formatter
|
||||||
|
|
||||||
|
|
||||||
_import_operators()
|
_import_operators()
|
||||||
|
|||||||
6
runtime/ops/formatter/unstructured_formatter/__init__.py
Normal file
6
runtime/ops/formatter/unstructured_formatter/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from datamate.core.base_op import OPERATORS
|
||||||
|
|
||||||
|
# Register the UnstructuredFormatter operator with the shared registry,
# pointing it at the module that contains the implementation class.
OPERATORS.register_module(
    module_name='UnstructuredFormatter',
    module_path="ops.formatter.unstructured_formatter.process",
)
|
||||||
16
runtime/ops/formatter/unstructured_formatter/metadata.yml
Normal file
16
runtime/ops/formatter/unstructured_formatter/metadata.yml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
name: '非结构化文本抽取'
|
||||||
|
name_en: 'Unstructured Text Extraction'
|
||||||
|
description: '抽取非结构化文件的文本,目前支持word文档'
|
||||||
|
description_en: 'Extracts text from unstructured files; currently supports Word documents.'
|
||||||
|
language: 'python'
|
||||||
|
vendor: 'huawei'
|
||||||
|
raw_id: 'UnstructuredFormatter'
|
||||||
|
version: '1.0.0'
|
||||||
|
types:
|
||||||
|
- 'collect'
|
||||||
|
modal: 'text'
|
||||||
|
effect:
|
||||||
|
before: ''
|
||||||
|
after: ''
|
||||||
|
inputs: 'text'
|
||||||
|
outputs: 'text'
|
||||||
35
runtime/ops/formatter/unstructured_formatter/process.py
Normal file
35
runtime/ops/formatter/unstructured_formatter/process.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
|
||||||
|
#!/usr/bin/python
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
"""
|
||||||
|
Description: Unstructured text extraction
|
||||||
|
Create: 2025/10/22 15:15
|
||||||
|
"""
|
||||||
|
import time
|
||||||
|
from typing import Dict, Any
|
||||||
|
|
||||||
|
from loguru import logger
|
||||||
|
from unstructured.partition.auto import partition
|
||||||
|
|
||||||
|
from datamate.core.base_op import Mapper
|
||||||
|
|
||||||
|
|
||||||
|
class UnstructuredFormatter(Mapper):
    """Mapper that extracts the text of an unstructured input file as plain text."""

    def __init__(self, *args, **kwargs):
        """Forward all positional and keyword arguments to the ``Mapper`` initializer."""
        super().__init__(*args, **kwargs)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Partition the sample's file and store the extracted text on the sample.

        The file path and file name are looked up in ``sample`` via
        ``self.filepath_key`` and ``self.filename_key`` (presumably provided by
        the ``Mapper`` base class -- confirm there). The file is handed to
        ``unstructured``'s ``partition``; the string form of every returned
        element is joined with a blank line and written to
        ``sample[self.text_key]``.

        Args:
            sample: The record being processed; it is modified in place.

        Returns:
            The same ``sample`` dict, with the extracted text stored under
            ``self.text_key``.

        Raises:
            UnicodeDecodeError: Logged together with the file name, then
                re-raised. Any other exception from ``partition`` propagates
                without being logged here.
        """
        start = time.time()
        source_path = sample.get(self.filepath_key)
        filename = sample.get(self.filename_key)
        try:
            parsed = partition(filename=source_path)
            # One blank line between elements keeps paragraph boundaries visible.
            sample[self.text_key] = "\n\n".join(map(str, parsed))
            logger.info(f"fileName: {filename}, method: UnstructuredFormatter costs {(time.time() - start):6f} s")
        except UnicodeDecodeError as err:
            logger.exception(f"fileName: {filename}, method: UnstructuredFormatter causes decode error: {err}")
            raise
        return sample
|
||||||
@@ -19,3 +19,4 @@ xmltodict==1.0.2
|
|||||||
zhconv==1.4.3
|
zhconv==1.4.3
|
||||||
sqlalchemy==2.0.40
|
sqlalchemy==2.0.40
|
||||||
pymysql==1.1.1
|
pymysql==1.1.1
|
||||||
|
unstructured[docx]==0.18.15
|
||||||
@@ -68,6 +68,7 @@ VALUES (1, '模态', 'predefined', 0),
|
|||||||
INSERT IGNORE INTO t_operator
|
INSERT IGNORE INTO t_operator
|
||||||
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
|
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
|
||||||
VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
|
VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
|
||||||
|
('UnstructuredFormatter', '非结构化文本抽取', '抽取非结构化文件的文本,目前支持word文档。', '1.0.0', 'text', 'text', null, null, '', false),
|
||||||
('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', null, null, '', false),
|
('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', null, null, '', false),
|
||||||
('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
|
('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
|
||||||
('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
|
('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
|
||||||
@@ -121,7 +122,7 @@ AND o.id IN ('TextFormatter', 'FileWithShortOrLongLengthFilter', 'FileWithHighRe
|
|||||||
'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
|
'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
|
||||||
'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
|
'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
|
||||||
'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
|
'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
|
||||||
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner');
|
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'UnstructuredFormatter');
|
||||||
|
|
||||||
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
|
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
|
||||||
SELECT c.id, o.id
|
SELECT c.id, o.id
|
||||||
@@ -137,4 +138,4 @@ SELECT c.id, o.id
|
|||||||
FROM t_operator_category c
|
FROM t_operator_category c
|
||||||
CROSS JOIN t_operator o
|
CROSS JOIN t_operator o
|
||||||
WHERE c.id IN (7, 8, 11)
|
WHERE c.id IN (7, 8, 11)
|
||||||
AND o.id IN ('FileExporter');
|
AND o.id IN ('FileExporter', 'UnstructuredFormatter');
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ ENV PYTHONPATH=/opt/runtime/datamate/
|
|||||||
|
|
||||||
RUN sed -i 's/deb.debian.org/mirrors.huaweicloud.com/g' /etc/apt/sources.list.d/debian.sources \
|
RUN sed -i 's/deb.debian.org/mirrors.huaweicloud.com/g' /etc/apt/sources.list.d/debian.sources \
|
||||||
&& apt update \
|
&& apt update \
|
||||||
&& apt install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64 \
|
&& apt install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64 libreoffice\
|
||||||
&& apt clean \
|
&& apt clean \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user