Make extraction and file export fixed steps of the operator pipeline (#134)

* feature: move the extraction step into every operator (a sketch of the pattern follows the commit header below)

* feature: run the file export (save-to-disk) operator by default

* feature: polish the frontend display

* feature: manage dependencies with pyproject
This commit is contained in:
hhhhsc701
2025-12-05 17:26:29 +08:00
committed by GitHub
parent 744d15ba24
commit d59c167da4
70 changed files with 289 additions and 539 deletions
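The pattern behind the first two bullets recurs in nearly every operator hunk below: each operator's execute() now opens with self.read_file_first(sample), and BaseOp gains an is_first_op flag set by the scheduler, so the file is read from disk exactly once, by whichever operator happens to run first. A minimal standalone sketch of that gating, assuming simplified key names (the real read_file in base_op also routes office documents and images):

from typing import Any, Dict

class BaseOp:
    def __init__(self, **kwargs):
        # Only the first operator in the configured pipeline gets this flag
        # (see the RayDataset hunk near the end: index == 0 sets is_first_op).
        self.is_first_op = kwargs.get("is_first_op", False)
        self.text_key = kwargs.get("text_key", "text")
        self.filepath_key = kwargs.get("filepath_key", "filePath")

    def read_file(self, sample: Dict[str, Any]) -> None:
        # Text-only branch shown; utf-8-sig strips a possible BOM.
        with open(sample[self.filepath_key], "rb") as f:
            sample[self.text_key] = f.read().decode("utf-8-sig").replace("\r\n", "\n")

    def read_file_first(self, sample: Dict[str, Any]) -> None:
        # Called at the top of every operator's execute(), but a no-op except
        # in the first operator, so extraction happens exactly once.
        if self.is_first_op:
            self.read_file(sample)

This is what lets the dedicated formatter operators (TextFormatter, ImgFormatter, WordFormatter, UnstructuredFormatter) and the pipeline-configured FileExporter be deleted in the hunks below.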

View File

@@ -150,6 +150,7 @@ const OperatorFlow: React.FC<OperatorFlowProps> = ({
max={selectedOperators.length}
defaultValue={index + 1}
className="w-10 h-6 text-xs text-center"
style={{ width: 60 }}
autoFocus
onBlur={(e) => handleIndexChange(operator.id, e.target.value)}
onKeyDown={(e) => {

View File

@@ -227,9 +227,8 @@ export default function FileTable({result, fetchTaskResult}) {
dataIndex: "status",
key: "status",
filters: [
{ text: "已完成", value: "已完成" },
{ text: "失败", value: "失败" },
{ text: "处理中", value: "处理中" },
{ text: "已完成", value: "COMPLETED" },
{ text: "失败", value: "FAILED" },
],
onFilter: (value: string, record: any) => record.status === value,
render: (status: string) => (
@@ -237,9 +236,7 @@ export default function FileTable({result, fetchTaskResult}) {
status={
status === "COMPLETED"
? "success"
: status === "FAILED"
? "error"
: "processing"
: "error"
}
text={TaskStatusMap[status as TaskStatus].label}
/>
@@ -248,6 +245,7 @@ export default function FileTable({result, fetchTaskResult}) {
{
title: "操作",
key: "action",
width: 200,
render: (_text: string, record: any) => (
<div className="flex">
{record.status === "COMPLETED" ? (

View File

@@ -33,6 +33,7 @@ class FileWithHighRepeatPhraseRateFilter(Filter):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._file_with_high_repeat_phrase_rate_filter(sample[self.text_key],
sample[self.filename_key])
logger.info(f"fileName: {sample[self.filename_key]}, "

View File

@@ -30,6 +30,7 @@ class FileWithHighRepeatWordRateFilter(Filter):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._file_with_high_repeat_word_rate_filter(sample[self.text_key],
sample[self.filename_key])
logger.info(f"fileName: {sample[self.filename_key]}, "

View File

@@ -26,6 +26,7 @@ class FileWithHighSpecialCharRateFilter(Filter):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._file_with_high_special_char_rate_filter(sample[self.text_key],
sample[self.filename_key])
logger.info(f"fileName: {sample[self.filename_key]}, "

View File

@@ -105,6 +105,7 @@ class ImgAdvertisementImagesCleaner(Filter):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
img_bytes = sample[self.data_key]

View File

@@ -27,6 +27,7 @@ class ImgBlurredImagesCleaner(Filter):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]

View File

@@ -61,6 +61,7 @@ class ImgDuplicatedImagesCleaner(Filter):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
"""重复图片去重算子执行入口"""
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
img_data = self._duplicate_images_filter(file_name, sample[self.data_key])

View File

@@ -227,6 +227,7 @@ class ImgSimilarImagesCleaner(Filter):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
"""去除相似图片算子执行入口"""
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
img_bytes = sample[self.data_key]
data = bytes_to_numpy(img_bytes) if img_bytes else np.array([])

View File

@@ -150,6 +150,7 @@ class DuplicateFilesFilter(Filter):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
sample[self.text_key] = self.deduplicate_files(sample, file_name)

View File

@@ -90,6 +90,7 @@ class FileWithManySensitiveWordsFilter(Filter):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._file_with_many_sensitive_words_filter(sample[self.text_key],
sample[self.filename_key])
logger.info(f"fileName: {sample[self.filename_key]}, "

View File

@@ -31,6 +31,7 @@ class FileWithShortOrLongLengthFilter(Filter):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._file_with_short_or_long_length_filter(sample[self.text_key],
sample[self.filename_key])
logger.info(f"fileName: {sample[self.filename_key]}, "

View File

@@ -15,12 +15,7 @@ _configure_importer()
def _import_operators():
from . import text_formatter
from . import word_formatter
from . import img_formatter
from . import file_exporter
from . import slide_formatter
from . import unstructured_formatter
from . import mineru_formatter

View File

@@ -1,6 +0,0 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='FileExporter',
module_path="ops.formatter.file_exporter.process")

View File

@@ -1,16 +0,0 @@
name: '落盘算子'
name_en: 'save file operator'
description: '将文件内容保存为文件。'
description_en: 'Save the file data as a file.'
language: 'Python'
vendor: 'Huawei'
raw_id: 'FileExporter'
version: '1.0.0'
types:
- 'collect'
modal: 'others'
effect:
before: ''
after: ''
inputs: 'all'
outputs: 'all'

View File

@@ -1,145 +0,0 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: JSON text extraction
Create: 2024/06/06 15:43
"""
import time
import os
import uuid
from typing import Tuple, Dict, Any
from loguru import logger
from datamate.core.constant import Fields
from datamate.core.base_op import Mapper
from datamate.common.utils import check_valid_path
class FileExporter(Mapper):
"""把输入的json文件流抽取为txt"""
def __init__(self, *args, **kwargs):
super(FileExporter, self).__init__(*args, **kwargs)
self.last_ops = True
self.text_support_ext = kwargs.get("text_support_ext", ['txt', 'html', 'md', 'markdown',
'xlsx', 'xls', 'csv', 'pptx', 'ppt',
'xml', 'json', 'doc', 'docx', 'pdf'])
self.data_support_ext = kwargs.get("data_support_ext", ['jpg', 'jpeg', 'png', 'bmp'])
self.medical_support_ext = kwargs.get("medical_support_ext", ['svs', 'tif', 'tiff'])
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
file_name = sample[self.filename_key]
file_type = sample[self.filetype_key]
try:
start = time.time()
if file_type in self.text_support_ext:
sample, save_path = self.get_textfile_handler(sample)
elif file_type in self.data_support_ext:
sample, save_path = self.get_datafile_handler(sample)
elif file_type in self.medical_support_ext:
sample, save_path = self.get_medicalfile_handler(sample)
else:
raise TypeError(f"{file_type} is unsupported! please check support_ext in FileExporter Ops")
if sample[self.text_key] == '' and sample[self.data_key] == b'':
sample[self.filesize_key] = "0"
return sample
if save_path:
self.save_file(sample, save_path)
sample[self.text_key] = ''
sample[self.data_key] = b''
sample[Fields.result] = True
file_type = save_path.split('.')[-1]
sample[self.filetype_key] = file_type
base_name, _ = os.path.splitext(file_name)
new_file_name = base_name + '.' + file_type
sample[self.filename_key] = new_file_name
base_name, _ = os.path.splitext(save_path)
sample[self.filepath_key] = base_name
file_size = os.path.getsize(base_name)
sample[self.filesize_key] = f"{file_size}"
logger.info(f"origin file named {file_name} has been save to {save_path}")
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: FileExporter costs {time.time() - start:.6f} s")
except UnicodeDecodeError as err:
logger.error(f"fileName: {sample[self.filename_key]}, "
f"method: FileExporter causes decode error: {err}")
raise
return sample
def get_save_path(self, sample: Dict[str, Any], target_type) -> str:
export_path = os.path.abspath(sample[self.export_path_key])
file_name = sample[self.filename_key]
new_file_name = os.path.splitext(file_name)[0] + '.' + target_type
if not check_valid_path(export_path):
os.makedirs(export_path, exist_ok=True)
res = os.path.join(export_path, new_file_name)
return res
def get_textfile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
target_type = sample.get("target_type", None)
# If target_type is set, save as a scanned document (docx format)
if target_type:
sample = self._get_from_data(sample)
save_path = self.get_save_path(sample, target_type)
# Otherwise save as a txt file (normal text cleaning)
else:
sample = self._get_from_text(sample)
save_path = self.get_save_path(sample, 'txt')
return sample, save_path
def get_datafile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
target_type = sample.get("target_type", None)
# If target_type is set, image-to-text output is saved as target_type (markdown format)
if target_type:
sample = self._get_from_text(sample)
save_path = self.get_save_path(sample, target_type)
# Otherwise keep the original image file format (normal image cleaning)
else:
sample = self._get_from_data(sample)
save_path = self.get_save_path(sample, sample[self.filetype_key])
return sample, save_path
def get_medicalfile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
target_type = 'png'
sample = self._get_from_data(sample)
save_path = self.get_save_path(sample, target_type)
return sample, save_path
def save_file(self, sample, save_path):
file_name, _ = os.path.splitext(save_path)
# Save the file in binary mode
file_sample = sample[self.text_key].encode('utf-8') if sample[self.text_key] else sample[self.data_key]
with open(file_name, 'wb') as f:
f.write(file_sample)
# Get the parent directory path
parent_dir = os.path.dirname(file_name)
os.chmod(parent_dir, 0o770)
os.chmod(file_name, 0o640)
def _get_from_data(self, sample: Dict[str, Any]) -> Dict[str, Any]:
sample[self.data_key] = bytes(sample[self.data_key])
sample[self.text_key] = ''
return sample
def _get_from_text(self, sample: Dict[str, Any]) -> Dict[str, Any]:
sample[self.data_key] = b''
sample[self.text_key] = str(sample[self.text_key])
return sample
def _get_uuid(self):
res = str(uuid.uuid4())
return res

View File

@@ -1,6 +0,0 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgFormatter',
module_path="ops.formatter.img_formatter.process")

View File

@@ -1,16 +0,0 @@
name: '读取图片文件'
name_en: 'Image File Reader'
description: '读取图片文件。'
description_en: 'Reads image files.'
language: 'Python'
vendor: 'Huawei'
raw_id: 'ImgFormatter'
version: '1.0.0'
types:
- 'collect'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -1,35 +0,0 @@
# -*- coding: utf-8 -*-
"""
Description:
Create: 2024/1/30 15:24
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import numpy_to_bytes
from datamate.core.base_op import Mapper
class ImgFormatter(Mapper):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
file_path = sample[self.filepath_key]
img_data = _img_extract(file_path)
sample[self.data_key] = numpy_to_bytes(img_data, file_type)
logger.info(f"fileName: {file_name}, method: ImgExtract costs {(time.time() - start):6f} s")
return sample
def _img_extract(file_path):
return cv2.imdecode(np.fromfile(file_path, dtype=np.uint8), -1)

View File

@@ -1,6 +0,0 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='TextFormatter',
module_path="ops.formatter.text_formatter.process")

View File

@@ -1,16 +0,0 @@
name: 'TXT文本抽取'
name_en: 'TXT Text Extraction'
description: '抽取TXT中的文本'
description_en: 'Extracts text from TXT files.'
language: 'python'
vendor: 'huawei'
raw_id: 'TxtFormatter'
version: '1.0.0'
types:
- 'collect'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -1,44 +0,0 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: JSON text extraction
Create: 2024/06/06 15:43
"""
import time
from loguru import logger
from typing import Dict, Any
from datamate.core.base_op import Mapper
class TextFormatter(Mapper):
"""把输入的json文件流抽取为txt"""
def __init__(self, *args, **kwargs):
super(TextFormatter, self).__init__(*args, **kwargs)
@staticmethod
def _extract_json(byte_io):
"""将默认使用utf-8编码的Json文件流解码,抽取为txt"""
# 用utf-8-sig的格式进行抽取,可以避免uft-8 BOM编码格式的文件在抽取后产生隐藏字符作为前缀。
return byte_io.decode("utf-8-sig").replace("\r\n", "\n")
def byte_read(self, sample: Dict[str, Any]):
filepath = sample[self.filepath_key]
with open(filepath, "rb") as file:
byte_data = file.read()
sample[self.data_key] = byte_data
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
try:
self.byte_read(sample)
sample[self.text_key] = self._extract_json(sample[self.data_key])
sample[self.data_key] = b"" # 将sample[self.data_key]置空
logger.info(
f"fileName: {sample[self.filename_key]}, method: TextFormatter costs {(time.time() - start):6f} s")
except UnicodeDecodeError as err:
logger.exception(f"fileName: {sample[self.filename_key]}, method: TextFormatter causes decode error: {err}")
raise
return sample
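The utf-8-sig choice explained in the comment above (and reused by the new read_file in base_op later in this commit) is easy to see in isolation; this tiny standalone snippet shows a BOM-prefixed byte string decoded both ways:

raw = b"\xef\xbb\xbfhello"            # UTF-8 BOM followed by "hello"
print(repr(raw.decode("utf-8")))      # '\ufeffhello': the BOM survives as a hidden prefix
print(repr(raw.decode("utf-8-sig")))  # 'hello': the BOM is stripped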

View File

@@ -1,6 +0,0 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='UnstructuredFormatter',
module_path="ops.formatter.unstructured_formatter.process")

View File

@@ -1,16 +0,0 @@
name: 'Unstructured文本抽取'
name_en: 'Unstructured Text Extraction'
description: '抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。'
description_en: 'Extracts text from Unstructured files, currently supporting PowerPoint presentations, Word documents and Excel spreadsheets files.'
language: 'python'
vendor: 'huawei'
raw_id: 'UnstructuredFormatter'
version: '1.0.0'
types:
- 'collect'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -1,37 +0,0 @@
#!/user/bin/python
# -*- coding: utf-8 -*-
"""
Description: Unstructured text extraction
Create: 2025/10/22 15:15
"""
import time
from typing import Dict, Any
from loguru import logger
from unstructured.partition.auto import partition
from datamate.core.base_op import Mapper
class UnstructuredFormatter(Mapper):
"""把输入的非结构化文本抽取为txt"""
def __init__(self, *args, **kwargs):
super(UnstructuredFormatter, self).__init__(*args, **kwargs)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
filepath = sample.get(self.filepath_key)
filename = sample.get(self.filename_key)
if not filename.lower().endswith((".ppt", ".pptx", ".docx", ".xlsx", ".csv")):
return sample
try:
elements = partition(filename=filepath)
sample[self.text_key] = "\n\n".join([str(el) for el in elements])
logger.info(f"fileName: {filename}, method: UnstructuredFormatter costs {(time.time() - start):6f} s")
except UnicodeDecodeError as err:
logger.exception(f"fileName: {filename}, method: UnstructuredFormatter causes decode error: {err}")
raise
return sample

View File

@@ -1,6 +0,0 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='WordFormatter',
module_path="ops.formatter.word_formatter.process")

View File

@@ -1,16 +0,0 @@
name: 'Word文本抽取'
name_en: 'Word Text Extraction'
description: '抽取Word中的文本'
description_en: 'Extracts text from Word files.'
language: 'java'
vendor: 'huawei'
raw_id: 'WordFormatter'
version: '1.0.0'
types:
- 'collect'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -1,68 +0,0 @@
# -*- coding: utf-8 -*-
"""
Description:
Create: 2024/1/30 15:24
"""
from loguru import logger
import os
import subprocess
import time
from typing import Dict, Any
from datamate.common.utils import check_valid_path
from datamate.core.base_op import Mapper
class WordFormatter(Mapper):
SEPERATOR = ' | '
def __init__(self, *args, **kwargs):
super(WordFormatter, self).__init__(*args, **kwargs)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
file_name = sample[self.filename_key]
file_path = sample[self.filepath_key]
file_type = sample[self.filetype_key]
txt_content = self.word2html(file_path, file_type)
sample[self.text_key] = txt_content
logger.info(f"fileName: {file_name}, method: WordFormatter costs {(time.time() - start):6f} s")
return sample
@staticmethod
def word2html(file_path, file_type):
check_valid_path(file_path)
file_dir = file_path.rsplit('/', 1)[0]
file_name = file_path.rsplit('/', 1)[1]
html_file_path = os.path.join(file_dir, f"{file_name}.txt")
current_file_path = os.path.dirname(os.path.abspath(__file__))
try:
process = subprocess.Popen(
['java', '-jar', f'{current_file_path}/../../../java_operator/WordFormatter-1.0.jar', file_path,
html_file_path, file_type], shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
stdout, stderr = process.communicate(timeout=24 * 60 * 60)
if process.returncode == 0:
logger.info(f"Converted {file_path} successfully")
else:
# stderr is merged into stdout above, so the error text comes from stdout
logger.info(f"Convert {file_path} failed, error: {stdout.strip().decode('utf-8')}.")
raise RuntimeError()
except subprocess.CalledProcessError as e:
logger.error(f"Convert failed: {e}, return code: {e.returncode}")
except FileNotFoundError:
logger.error("LibreOffice command not found, please make sure it is available in PATH")
except Exception as e:
logger.error(f"An unexpected error occurred, convert failed: {e}", )
try:
with open(html_file_path, 'r', encoding='utf-8') as file:
txt_content = file.read()
os.remove(html_file_path)
logger.info("Tmp docx file removed")
except FileNotFoundError:
logger.error(f"Tmp file {html_file_path} does not exist")
except PermissionError:
logger.error(f"You are not allowed to delete tmp file {html_file_path}")
logger.info(f"Convert {html_file_path} to html success")
return txt_content

View File

@@ -30,6 +30,7 @@ class ContentCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._content_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: ContentCleaner costs {time.time() - start:6f} s")
return sample

View File

@@ -64,6 +64,7 @@ class AnonymizedCreditCardNumber(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._credit_card_number_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: CreditCardNumberCleaner costs {time.time() - start:6f} s")

View File

@@ -25,6 +25,7 @@ class EmailNumberCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._email_number_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: EmailCleaner costs {time.time() - start:6f} s")
return sample

View File

@@ -22,6 +22,7 @@ class EmojiCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._emoji_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: EmojiCleaner costs {time.time() - start:6f} s")
return sample

View File

@@ -41,6 +41,7 @@ class ExtraSpaceCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._clean_extra_space(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: ExtraSpaceCleaner costs {time.time() - start:6f} s")

View File

@@ -34,6 +34,7 @@ class FullWidthCharacterCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._full_width_character_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: FullWidthCharactersCleaner costs {time.time() - start:6f} s")

View File

@@ -44,6 +44,7 @@ class GrableCharactersCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._grable_characters_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: GrableCharactersCleaner costs {time.time() - start:6f} s")

View File

@@ -64,6 +64,7 @@ class HtmlTagCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
if sample[self.filetype_key] != "xml":
sample[self.text_key] = self._remove_html_tags(sample[self.text_key])
logger.info(

View File

@@ -71,6 +71,7 @@ class AnonymizedIdNumber(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._id_number_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: IDNumberCleaner costs {time.time() - start:6f} s")
return sample

View File

@@ -28,6 +28,7 @@ class ImgDenoise(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
img_bytes = sample[self.data_key]

View File

@@ -97,6 +97,7 @@ class ImgDirectionCorrect(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
img_bytes = sample[self.data_key]

View File

@@ -88,6 +88,7 @@ class ImgBrightness(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]

View File

@@ -59,6 +59,7 @@ class ImgContrast(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]

View File

@@ -69,6 +69,7 @@ class ImgSaturation(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]

View File

@@ -57,6 +57,7 @@ class ImgSharpness(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]

View File

@@ -25,6 +25,7 @@ class ImgPerspectiveTransformation(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]

View File

@@ -29,6 +29,7 @@ class ImgResize(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
img_bytes = sample[self.data_key]

View File

@@ -60,6 +60,7 @@ class ImgShadowRemove(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]

View File

@@ -21,6 +21,7 @@ class ImgTypeUnify(Mapper):
def execute(self, sample):
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
origin_file_type = sample[self.filetype_key]
if origin_file_type == self._setting_type:

View File

@@ -80,6 +80,7 @@ class ImgWatermarkRemove(Mapper):
def execute(self, sample: Dict[str, Any]):
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
img_bytes = sample[self.data_key]

View File

@@ -24,6 +24,7 @@ class InvisibleCharactersCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._invisible_characters_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: InvisibleCharactersCleaner costs {time.time() - start:6f} s")

View File

@@ -37,6 +37,7 @@ class AnonymizedIpAddress(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._ip_address_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: IPAddressCleaner costs {time.time() - start:6f} s")
return sample

View File

@@ -35,6 +35,7 @@ class KnowledgeRelationSlice(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start_time = time.time()
self.read_file_first(sample)
chunk_item = get_json_list(sample[self.text_key], chunk_size=self.chunk_size, overlap_size=self.overlap_size)
chunk_item_json = json.dumps(chunk_item, ensure_ascii=False)

View File

@@ -36,6 +36,7 @@ class LegendCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._clean_html_tag(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: LegendCleaner costs {time.time() - start:6f} s")
return sample

View File

@@ -37,6 +37,7 @@ class AnonymizedPhoneNumber(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._phone_number_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: PhoneNumberCleaner costs {time.time() - start:6f} s")

View File

@@ -53,6 +53,7 @@ class PoliticalWordCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._political_word_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: PoliticalWordCleaner costs {time.time() - start:6f} s")

View File

@@ -47,8 +47,8 @@ def duplicate_sentences_filter(input_data: str, file_name: str, duplicate_th: in
paragraph_counts[paragraph_strip] = -1
except Exception as err:
logger.exception(f"fileName: file_name, method: RemoveDuplicateSentencess. An error occurred when using "
f"filtering duplicate sentences. The error is: {err}")
logger.exception(f"fileName: {file_name}, method: RemoveDuplicateSentencess. An error occurred when using "
f"filtering duplicate sentences. The error is: {err}")
return input_data
# Reassemble the deduplicated paragraphs into text
@@ -63,6 +63,7 @@ class DuplicateSentencesFilter(Filter):
duplicate_th = 5  # threshold for how many times a paragraph may repeat
file_name = sample[self.filename_key]
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = duplicate_sentences_filter(sample[self.text_key], file_name, duplicate_th)
logger.info(f"fileName: {file_name}, RemoveDuplicateSentencess costs {time.time() - start:6f} s")
return sample

View File

@@ -56,6 +56,7 @@ class SexualAndViolentWordCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._sexual_and_violent_word_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: SexualAndViolentWordCleaner costs {time.time() - start:6f} s")

View File

@@ -61,6 +61,7 @@ class TextToWord(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
"""将文本信息转换为docx文件流"""
start = time.time()
self.read_file_first(sample)
sample[self.data_key] = self._txt_to_docx(sample[self.text_key])  # convert the text into a Word byte stream
sample[self.text_key] = ""
sample["target_type"] = "docx"

View File

@@ -27,6 +27,7 @@ class TraditionalChineseCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._traditional_chinese_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: TraditionalChinese costs {time.time() - start:6f} s")

View File

@@ -23,6 +23,7 @@ class UnicodeSpaceCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._clean_unicode_space(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: UnicodeSpaceCleaner costs {time.time() - start:6f} s")

View File

@@ -26,6 +26,7 @@ class AnonymizedUrlCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
sample[self.text_key] = self._url_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: UrlCleaner costs {time.time() - start:6f} s")
return sample

View File

@@ -52,6 +52,7 @@ class XMLTagCleaner(Mapper):
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
self.read_file_first(sample)
file_name = sample[self.filename_key]
if sample[self.filetype_key] == "xml":
try:

View File

@@ -0,0 +1,28 @@
[project]
name = "ops"
version = "0.0.1"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"beautifulsoup4>=4.14.3",
"datasketch>=1.8.0",
"email-validator>=2.3.0",
"emoji>=2.15.0",
"jieba>=0.42.1",
"loguru>=0.7.3",
"numpy>=2.2.0,<=2.2.6",
"opencv-contrib-python-headless>=4.12.0.88",
"opencv-python-headless>=4.12.0.88",
"openslide-python>=1.4.3",
"paddleocr>=3.3.2",
"pandas>=2.2.0,<=2.2.3",
"pycryptodome>=3.23.0",
"pymysql>=1.1.2",
"python-docx>=1.2.0",
"pytz>=2025.2",
"six>=1.17.0",
"sqlalchemy>=2.0.44",
"xmltodict>=1.0.2",
"zhconv>=1.4.3",
]

View File

@@ -1,22 +0,0 @@
beautifulsoup4==4.14.2
datamate==0.0.1
datasketch==1.6.5
email_validator==2.3.0
emoji==2.2.0
jieba==0.42.1
loguru==0.7.3
numpy==2.2.6
opencv_contrib_python-headless==4.10.0.84
opencv_python-headless==4.12.0.88
openslide_python==1.4.2
paddleocr==3.2.0
pandas==2.2.3
pycryptodome==3.23.0
python_docx==1.2.0
pytz==2025.2
six==1.17.0
xmltodict==1.0.2
zhconv==1.4.3
sqlalchemy==2.0.40
pymysql==1.1.1
unstructured[docx,csv,xlsx,pptx]==0.18.15

View File

@@ -2,10 +2,15 @@
import json
import os
import time
import traceback
import uuid
from typing import List, Dict, Any, Tuple
import cv2
import numpy as np
from loguru import logger
from unstructured.partition.auto import partition
from datamate.common.error_code import ERROR_CODE_TABLE, UNKNOWN_ERROR_CODE
from datamate.common.utils.llm_request import LlmReq
@@ -52,6 +57,7 @@ class BaseOp:
def __init__(self, *args, **kwargs):
self.accelerator = kwargs.get('accelerator', "cpu")
self.is_last_op = kwargs.get('is_last_op', False)
self.is_first_op = kwargs.get('is_first_op', False)
self._name = kwargs.get('op_name', None)
self.infer_model = None
self.text_key = kwargs.get('text_key', "text")
@@ -122,10 +128,10 @@ class BaseOp:
raise NotImplementedError("This is in BaseOp, please re-define this method in Sub-classes")
def fill_sample_params(self, sample: Dict[str, Any], **kwargs):
if not sample.get("text", None):
if not sample.get(self.text_key, None):
sample[self.text_key] = ""
if not sample.get("data", None):
if not sample.get(self.data_key, None):
sample[self.data_key] = b""
if not sample[self.data_key] and not sample[self.text_key]:
@@ -137,6 +143,27 @@ class BaseOp:
failed_reason = {"op_name": op_name, "error_code": error_code, "reason": exc_info}
sample["failed_reason"] = failed_reason
def read_file(self, sample):
filepath = sample[self.filepath_key]
filetype = sample[self.filetype_key]
# Office-style documents go through unstructured's partition()
if filetype in ["ppt", "pptx", "docx", "doc", "xlsx"]:
elements = partition(filename=filepath)
sample[self.text_key] = "\n\n".join([str(el) for el in elements])
# Plain-text formats: utf-8-sig strips a possible BOM, newlines are normalized
elif filetype in ["txt", "md", "markdown", "xml", "html", "csv", "json", "jsonl"]:
with open(filepath, 'rb') as f:
content = f.read()
sample[self.text_key] = content.decode("utf-8-sig").replace("\r\n", "\n")
# Images are decoded via OpenCV and stored back into the sample as encoded bytes
elif filetype in ['jpg', 'jpeg', 'png', 'bmp']:
image_np = cv2.imdecode(np.fromfile(filepath, dtype=np.uint8), -1)
if image_np.size:
data = cv2.imencode(f".{filetype}", image_np)[1]
image_bytes = data.tobytes()
sample[self.data_key] = image_bytes
def read_file_first(self, sample):
if self.is_first_op:
self.read_file(sample)
class Mapper(BaseOp):
def __init__(self, *args, **kwargs):
@@ -158,15 +185,16 @@ class Mapper(BaseOp):
logger.error(f"Ops named {self.name} map failed, Error Info: \n"
f"{str(get_exception_info(e))}")
sample["execute_status"] = execute_status
task_info = TaskInfoPersistence()
task_info.persistence_task_info(sample)
sample[self.filesize_key] = "0"
sample[self.filetype_key] = ""
TaskInfoPersistence().update_task_result(sample)
raise e
sample["execute_status"] = execute_status
# Persist the file's successful-execution info to the database
if self.is_last_op:
task_info = TaskInfoPersistence()
task_info.persistence_task_info(sample)
if FileExporter().execute(sample):
TaskInfoPersistence().persistence_task_info(sample)
return sample
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
@@ -197,8 +225,9 @@ class Slicer(BaseOp):
logger.error(f"Ops named {self.name} map failed, Error Info: \n"
f"{str(get_exception_info(e))}")
sample["execute_status"] = execute_status
task_info = TaskInfoPersistence()
task_info.persistence_task_info(sample)
sample[self.filesize_key] = "0"
sample[self.filetype_key] = ""
TaskInfoPersistence().update_task_result(sample)
return [sample]
self.load_sample_to_sample(sample, sample_list)
@@ -206,8 +235,8 @@ class Slicer(BaseOp):
# Persist the file's successful-execution info to the database
if self.is_last_op:
task_info = TaskInfoPersistence()
task_info.persistence_task_info(sample)
if FileExporter().execute(sample):
TaskInfoPersistence().persistence_task_info(sample)
return [sample]
@@ -286,22 +315,24 @@ class Filter(BaseOp):
sample["execute_status"] = execute_status
logger.error(f"Ops named {self.name} map failed, Error Info: \n"
f"{str(get_exception_info(e))}")
task_info = TaskInfoPersistence()
task_info.persistence_task_info(sample)
sample[self.filesize_key] = "0"
sample[self.filetype_key] = ""
TaskInfoPersistence().update_task_result(sample)
raise e
sample["execute_status"] = execute_status
# A file left with no content is filtered out
if sample[self.text_key] == "" and sample[self.data_key] == b"":
task_info = TaskInfoPersistence()
sample["fileSize"] = "0"
task_info.persistence_task_info(sample)
sample[self.filesize_key] = "0"
sample[self.filetype_key] = ""
task_info.update_task_result(sample)
return False
# Persist the file's successful-execution info to the database
if self.is_last_op:
task_info = TaskInfoPersistence()
task_info.persistence_task_info(sample)
if FileExporter().execute(sample):
TaskInfoPersistence().persistence_task_info(sample)
return True
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
@@ -379,3 +410,131 @@ class LLM(Mapper):
raise RuntimeError(f"Save jsonl file Failed!, save_path: {save_path}.") from e
logger.info(f"LLM output has been save to {save_path}.")
class FileExporter(BaseOp):
"""把输入的json文件流抽取为txt"""
def __init__(self, *args, **kwargs):
super(FileExporter, self).__init__(*args, **kwargs)
self.last_ops = True
self.text_support_ext = kwargs.get("text_support_ext", ['txt', 'html', 'md', 'markdown',
'xlsx', 'xls', 'csv', 'pptx', 'ppt',
'xml', 'json', 'doc', 'docx', 'pdf'])
self.data_support_ext = kwargs.get("data_support_ext", ['jpg', 'jpeg', 'png', 'bmp'])
self.medical_support_ext = kwargs.get("medical_support_ext", ['svs', 'tif', 'tiff'])
def execute(self, sample: Dict[str, Any]):
file_name = sample[self.filename_key]
file_type = sample[self.filetype_key]
try:
start = time.time()
if file_type in self.text_support_ext:
sample, save_path = self.get_textfile_handler(sample)
elif file_type in self.data_support_ext:
sample, save_path = self.get_datafile_handler(sample)
elif file_type in self.medical_support_ext:
sample, save_path = self.get_medicalfile_handler(sample)
else:
raise TypeError(f"{file_type} is unsupported! please check support_ext in FileExporter Ops")
if sample[self.text_key] == '' and sample[self.data_key] == b'':
sample[self.filesize_key] = "0"
return False
if save_path:
self.save_file(sample, save_path)
sample[self.text_key] = ''
sample[self.data_key] = b''
sample[Fields.result] = True
file_type = save_path.split('.')[-1]
sample[self.filetype_key] = file_type
base_name, _ = os.path.splitext(file_name)
new_file_name = base_name + '.' + file_type
sample[self.filename_key] = new_file_name
base_name, _ = os.path.splitext(save_path)
sample[self.filepath_key] = base_name
file_size = os.path.getsize(base_name)
sample[self.filesize_key] = f"{file_size}"
logger.info(f"origin file named {file_name} has been save to {save_path}")
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: FileExporter costs {time.time() - start:.6f} s")
except UnicodeDecodeError as err:
logger.error(f"fileName: {sample[self.filename_key]}, "
f"method: FileExporter causes decode error: {err}")
raise
return True
def get_save_path(self, sample: Dict[str, Any], target_type):
export_path = os.path.abspath(sample[self.export_path_key])
file_name = sample[self.filename_key]
new_file_name = os.path.splitext(file_name)[0] + '.' + target_type
if not check_valid_path(export_path):
os.makedirs(export_path, exist_ok=True)
return os.path.join(export_path, new_file_name)
def get_textfile_handler(self, sample: Dict[str, Any]):
target_type = sample.get("target_type", None)
# If target_type is set, save as a scanned document (docx format)
if target_type:
sample = self._get_from_data(sample)
save_path = self.get_save_path(sample, target_type)
# Otherwise save as a txt file (normal text cleaning)
else:
sample = self._get_from_text(sample)
save_path = self.get_save_path(sample, 'txt')
return sample, save_path
def get_datafile_handler(self, sample: Dict[str, Any]):
target_type = sample.get("target_type", None)
# If target_type is set, image-to-text output is saved as target_type (markdown format)
if target_type:
sample = self._get_from_text(sample)
save_path = self.get_save_path(sample, target_type)
# Otherwise keep the original image file format (normal image cleaning)
else:
sample = self._get_from_data(sample)
save_path = self.get_save_path(sample, sample[self.filetype_key])
return sample, save_path
def get_medicalfile_handler(self, sample: Dict[str, Any]):
target_type = 'png'
sample = self._get_from_data(sample)
save_path = self.get_save_path(sample, target_type)
return sample, save_path
def save_file(self, sample, save_path):
file_name, _ = os.path.splitext(save_path)
# Save the file in binary mode
file_sample = sample[self.text_key].encode('utf-8') if sample[self.text_key] else sample[self.data_key]
with open(file_name, 'wb') as f:
f.write(file_sample)
# Get the parent directory path
parent_dir = os.path.dirname(file_name)
os.chmod(parent_dir, 0o770)
os.chmod(file_name, 0o640)
def _get_from_data(self, sample: Dict[str, Any]) -> Dict[str, Any]:
sample[self.data_key] = bytes(sample[self.data_key])
sample[self.text_key] = ''
return sample
def _get_from_text(self, sample: Dict[str, Any]) -> Dict[str, Any]:
sample[self.data_key] = b''
sample[self.text_key] = str(sample[self.text_key])
return sample
@staticmethod
def _get_uuid():
return str(uuid.uuid4())
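Taken together, the Mapper, Slicer and Filter hunks above converge on one last-op flow, which is what "run the file export operator by default" means in practice: FileExporter is no longer a configurable pipeline step but an implicit final one. A condensed sketch of the success path, assuming the classes above are in scope (the failure branches instead zero the sample's size and type and call update_task_result alone):

def finish(op, sample):
    # Runs after an operator's own work has succeeded.
    sample["execute_status"] = True
    if op.is_last_op:
        # FileExporter.execute returns False for empty samples, so only
        # samples that actually produced an output file are persisted.
        if FileExporter().execute(sample):
            TaskInfoPersistence().persistence_task_info(sample)
    return sample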

View File

@@ -119,6 +119,8 @@ class RayDataset(BasicDataset):
# Load the Ops module
temp_ops = self.load_ops_module(op_name)
if index == 0:
init_kwargs["is_first_op"] = True
if index == len(cfg_process) - 1:
init_kwargs["is_last_op"] = True
@@ -182,7 +184,8 @@ class RayDataset(BasicDataset):
fn_kwargs=kwargs,
resources=resources,
num_cpus=0.05,
concurrency=(1, 1 if operators_cls.use_model else int(max_actor_nums)))
compute=rd.ActorPoolStrategy(min_size=1,
max_size=int(max_actor_nums)))
elif issubclass(operators_cls, (Slicer, RELATIVE_Slicer)):
self.data = self.data.flat_map(operators_cls,
@@ -190,7 +193,8 @@ class RayDataset(BasicDataset):
fn_kwargs=kwargs,
resources=resources,
num_cpus=0.05,
concurrency=(1, int(max_actor_nums)))
compute=rd.ActorPoolStrategy(min_size=1,
max_size=int(max_actor_nums)))
elif issubclass(operators_cls, (Filter, RELATIVE_Filter)):
self.data = self.data.filter(operators_cls,
@@ -198,7 +202,8 @@ class RayDataset(BasicDataset):
fn_kwargs=kwargs,
resources=resources,
num_cpus=0.05,
concurrency=(1, int(max_actor_nums)))
compute=rd.ActorPoolStrategy(min_size=1,
max_size=int(max_actor_nums)))
else:
logger.error(
'Ray executor only supports Filter, Mapper and Slicer OPs for now')
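For reference, a toy standalone Ray Data example of the actor-pool form these hunks standardize on; the dataset, transform and pool sizes here are illustrative, and on Ray builds that have deprecated the compute argument the equivalent spelling is the removed concurrency=(1, 4) tuple:

import ray
import ray.data as rd

class Upper:
    # A class-based transform runs in a pool of reusable actors, letting
    # per-operator state (models, DB connections) survive across rows.
    def __call__(self, row):
        row["text"] = row["text"].upper()
        return row

ray.init()
ds = rd.from_items([{"text": "a"}, {"text": "b"}])
# Autoscaling pool of 1..4 actors, mirroring ActorPoolStrategy(min_size=1, max_size=max_actor_nums)
out = ds.map(Upper, compute=rd.ActorPoolStrategy(min_size=1, max_size=4))
print(out.take_all())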

View File

@@ -25,13 +25,13 @@ class TaskInfoPersistence:
with open(sql_config_path, 'r', encoding='utf-8') as f:
return json.load(f)
def persistence_task_info(self, sample: Dict[str, Any]):
def update_task_result(self, sample, file_id=None):
# Generate a fresh id per call; a def-time default like file_id=str(uuid.uuid4()) would be evaluated only once.
file_id = file_id if file_id else str(uuid.uuid4())
instance_id = str(sample.get("instance_id"))
src_file_name = str(sample.get("sourceFileName"))
src_file_type = str(sample.get("sourceFileType"))
src_file_id = str(sample.get("sourceFileId"))
src_file_size = int(sample.get("sourceFileSize"))
file_id = str(uuid.uuid4())
file_size = str(sample.get("fileSize"))
file_type = str(sample.get("fileType"))
file_name = str(sample.get("fileName"))
@@ -53,6 +53,10 @@ class TaskInfoPersistence:
}
self.insert_result(result_data, str(self.sql_dict.get("insert_clean_result_sql")))
def update_file_result(self, sample, file_id):
file_size = str(sample.get("fileSize"))
file_type = str(sample.get("fileType"))
file_name = str(sample.get("fileName"))
dataset_id = str(sample.get("dataset_id"))
file_path = str(sample.get("filePath"))
create_time = datetime.now()
@@ -72,6 +76,11 @@ class TaskInfoPersistence:
}
self.insert_result(file_data, str(self.sql_dict.get("insert_dataset_file_sql")))
def persistence_task_info(self, sample: Dict[str, Any]):
file_id = str(uuid.uuid4())
self.update_task_result(sample, file_id)
self.update_file_result(sample, file_id)
@staticmethod
def insert_result(data, sql):
retries = 0
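Reduced to a sketch, the persistence split works like this (method bodies elided; the real SQL templates come from sql_dict): one generated file_id links the cleaning-result row to the dataset-file row, while the failure and filtered-out branches in base_op write only the task-result row.

import uuid
from typing import Any, Dict

class TaskInfoPersistence:
    def update_task_result(self, sample: Dict[str, Any], file_id: str = None) -> None:
        ...  # one row in the clean-result table

    def update_file_result(self, sample: Dict[str, Any], file_id: str) -> None:
        ...  # one row in the dataset-file table

    def persistence_task_info(self, sample: Dict[str, Any]) -> None:
        file_id = str(uuid.uuid4())  # shared id keeps the two rows linked
        self.update_task_result(sample, file_id)
        self.update_file_result(sample, file_id)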

View File

@@ -16,27 +16,13 @@ classifiers = [
# Core dependencies
dependencies = [
"uvicorn[standard]",
"fastapi",
"loguru",
"jsonargparse",
"ray[default, data]==2.46.0",
"opencv-python"
]
[project.optional-dependencies]
dj = [
"py-data-juicer~=1.4.0"
]
op = [
"python-docx==1.1.0"
]
# All dependencies
all = [
"datamate[dj]",
"datamate[op]"
"fastapi>=0.123.9",
"jsonargparse>=4.44.0",
"loguru>=0.7.3",
"opencv-python-headless>=4.12.0.88",
"ray[data,default]==2.52.1",
"unstructured[csv,docx,pptx,xlsx]==0.18.15",
"uvicorn[standard]>=0.38.0",
]
[build-system]

View File

@@ -59,8 +59,7 @@ VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', '文本清洗模板', '文本清
('4421504e-c6c9-4760-b55a-509d17429597', '图片清洗模板', '图片清洗模板');
INSERT IGNORE INTO t_operator_instance(instance_id, operator_id, op_index, settings_override)
VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'TextFormatter', 1, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 2, null),
VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 2, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatWordRateFilter', 3, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatPhraseRateFilter', 4, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighSpecialCharRateFilter', 5, null),
@@ -85,10 +84,10 @@ VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'TextFormatter', 1, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmailNumberCleaner', 24, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIpAddress', 25, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIdNumber', 26, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 27, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileExporter', 28, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgFormatter', 1, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner', 2, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 27, null);
INSERT IGNORE INTO t_operator_instance(instance_id, operator_id, op_index, settings_override)
VALUES ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner', 2, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDuplicatedImagesCleaner', 3, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSimilarImagesCleaner', 4, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBrightness', 5, null),
@@ -99,5 +98,4 @@ VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'TextFormatter', 1, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgShadowRemove', 10, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgPerspectiveTransformation', 11, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgResize', 12, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgTypeUnify', 13, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'FileExporter', 14, null);
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgTypeUnify', 13, null);

View File

@@ -67,10 +67,7 @@ VALUES ('64465bec-b46b-11f0-8291-00155d0e4808', '模态', 'modal', 'predefined'
INSERT IGNORE INTO t_operator
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
('UnstructuredFormatter', 'Unstructured文本抽取', '基于Unstructured抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。', '1.0.0', 'text', 'text', null, null, '', false),
('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'multimodal', 'multimodal', null, null, '', false),
VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
('FileWithHighSpecialCharRateFilter', '文档特殊字符率检查', '去除特殊字符过多的文档。', '1.0.0', 'text', 'text', null, '{"specialCharRatio": {"name": "文档特殊字符率", "description": "特殊字符的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.3, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
@@ -97,7 +94,6 @@ VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0',
('UnicodeSpaceCleaner', '空格标准化', '将文档中不同的 unicode 空格,如 u2008,转换为正常空格\\u0020。', '1.0.0', 'text', 'text', null, null, '', 'false'),
('AnonymizedUrlCleaner', 'URL网址匿名化', '将文档中的url网址匿名化。', '1.0.0', 'text', 'text', null, null, '', 'false'),
('XMLTagCleaner', 'XML标签去除', '去除XML中的标签。', '1.0.0', 'text', 'text', null, null, '', 'false'),
('ImgFormatter', '读取图片文件', '读取图片文件。', '1.0.0', 'image', 'image', null, null, '', 'false'),
('ImgBlurredImagesCleaner', '模糊图片过滤', '去除模糊的图片。', '1.0.0', 'image', 'image', null, '{"blurredThreshold": {"name": "梯度函数值", "description": "梯度函数值取值越小,图片模糊度越高。", "type": "slider", "defaultVal": 1000, "min": 1, "max": 10000, "step": 1}}', '', 'false'),
('ImgBrightness', '图片亮度增强', '自适应调节图片的亮度。', '1.0.0', 'image', 'image', null, null, '', 'false'),
('ImgContrast', '图片对比度增强', '自适应调节图片的对比度。', '1.0.0', 'image', 'image', null, null, '', 'false'),
@@ -117,7 +113,7 @@ SELECT c.id, o.id
FROM t_operator_category c
CROSS JOIN t_operator o
WHERE c.id IN ('d8a5df7a-52a9-42c2-83c4-01062e60f597', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3')
AND o.id IN ('TextFormatter', 'FileWithShortOrLongLengthFilter', 'FileWithHighRepeatPhraseRateFilter',
AND o.id IN ('FileWithShortOrLongLengthFilter', 'FileWithHighRepeatPhraseRateFilter',
'FileWithHighRepeatWordRateFilter', 'FileWithHighSpecialCharRateFilter', 'FileWithManySensitiveWordsFilter',
'DuplicateFilesFilter', 'DuplicateSentencesFilter', 'AnonymizedCreditCardNumber', 'AnonymizedIdNumber',
'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
@@ -130,13 +126,6 @@ SELECT c.id, o.id
FROM t_operator_category c
CROSS JOIN t_operator o
WHERE c.id IN ('de36b61c-9e8a-4422-8c31-d30585c7100f', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3')
AND o.id IN ('ImgFormatter', 'ImgBlurredImagesCleaner', 'ImgBrightness', 'ImgContrast', 'ImgDenoise',
AND o.id IN ('ImgBlurredImagesCleaner', 'ImgBrightness', 'ImgContrast', 'ImgDenoise',
'ImgDuplicatedImagesCleaner', 'ImgPerspectiveTransformation', 'ImgResize', 'ImgSaturation',
'ImgShadowRemove', 'ImgSharpness', 'ImgSimilarImagesCleaner', 'ImgTypeUnify');
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
SELECT c.id, o.id
FROM t_operator_category c
CROSS JOIN t_operator o
WHERE c.id IN ('4d7dbd77-0a92-44f3-9056-2cd62d4a71e4', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3')
AND o.id IN ('FileExporter', 'UnstructuredFormatter');

View File

@@ -16,7 +16,7 @@ WORKDIR /opt/runtime
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e . --system \
&& uv pip install -r /opt/runtime/datamate/ops/requirements.txt --system
&& uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
&& chmod +x /opt/runtime/start.sh \