feature: modify UnstructuredFormatter and ExternalPDFFormatter description (#44)

* feature: add UnstructuredFormatter

* feature: add UnstructuredFormatter in db

* feature: add unstructured[docx]==0.18.15

* feature: support doc

* feature: add mineru

* feature: add external pdf extract operator by using mineru

* feature: mineru docker install bugfix

* feature: add unstructured xlsx/xls/csv/pptx/ppt

* feature: modify UnstructuredFormatter and ExternalPDFFormatter description

---------

Co-authored-by: Startalker <438747480@qq.com>
This commit is contained in:
Startalker
2025-10-31 10:32:14 +08:00
committed by GitHub
parent c6958d1511
commit a600c1d793
4 changed files with 8 additions and 8 deletions

View File

@@ -1,7 +1,7 @@
name: '外部PDF文本抽取' name: 'MinerU PDF文本抽取'
name_en: 'External PDF Text Extraction' name_en: 'External PDF Text Extraction'
description: '基于外部API,抽取PDF中的文本。' description: '基于MinerU API,抽取PDF中的文本。'
description_en: 'Extracts text from PDF files based on external APIs.' description_en: 'Extracts text from PDF files based on MinerU API.'
language: 'python' language: 'python'
vendor: 'huawei' vendor: 'huawei'
raw_id: 'ExternalPDFFormatter' raw_id: 'ExternalPDFFormatter'

View File

@@ -2,7 +2,7 @@
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
""" """
Description: 外部PDF文本抽取 Description: MinerU PDF文本抽取
Create: 2025/10/29 17:24 Create: 2025/10/29 17:24
""" """
import json import json

View File

@@ -1,7 +1,7 @@
name: '非结构化文本抽取' name: '非结构化文本抽取'
name_en: 'Unstructured Text Extraction' name_en: 'Unstructured Text Extraction'
description: '抽取非结构化文件的文本,目前支持word文档' description: '抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。'
description_en: 'Extracts text from Unstructured files, currently supporting Word documents.' description_en: 'Extracts text from Unstructured files, currently supporting PowerPoint presentations, Word documents and Excel spreadsheets.'
language: 'python' language: 'python'
vendor: 'huawei' vendor: 'huawei'
raw_id: 'UnstructuredFormatter' raw_id: 'UnstructuredFormatter'

View File

@@ -70,8 +70,8 @@ VALUES ('64465bec-b46b-11f0-8291-00155d0e4808', '模态', 'modal', 'predefined'
INSERT IGNORE INTO t_operator INSERT IGNORE INTO t_operator
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star) (id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false), VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
('UnstructuredFormatter', '非结构化文本抽取', '抽取非结构化文件的文本,目前支持word文档', '1.0.0', 'text', 'text', null, null, '', false), ('UnstructuredFormatter', '非结构化文本抽取', '抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿', '1.0.0', 'text', 'text', null, null, '', false),
('ExternalPDFFormatter', '外部PDF文本抽取', '基于外部API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false), ('ExternalPDFFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', null, null, '', false), ('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', null, null, '', false),
('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'), ('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'), ('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),