Operators: make file extraction and export fixed steps in the pipeline (#134)

* feature: move the extraction step into each operator
* feature: run the file-export operator by default
* feature: improve the frontend display
* feature: manage dependencies with pyproject
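In short: the dedicated formatter/exporter operators are deleted; instead, every operator calls a new BaseOp.read_file_first() hook at the top of execute(), and the Mapper/Slicer/Filter wrappers run a built-in FileExporter on the last operator before persisting task info. A minimal self-contained sketch of that contract (all names and keys below are illustrative, not the repo's exact API):

    from typing import Any, Dict

    class Op:
        """Sketch of the first-op/last-op contract introduced by this commit."""

        def __init__(self, is_first_op: bool = False, is_last_op: bool = False):
            self.is_first_op = is_first_op
            self.is_last_op = is_last_op

        def read_file_first(self, sample: Dict[str, Any]) -> None:
            # Only the first operator in the chain loads the raw file,
            # so every operator can start its execute() with this call.
            if self.is_first_op:
                with open(sample["filepath"], "rb") as f:
                    sample["text"] = f.read().decode("utf-8-sig")

        def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
            self.read_file_first(sample)
            # ... operator-specific cleaning of sample["text"] ...
            return sample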
@@ -150,6 +150,7 @@ const OperatorFlow: React.FC<OperatorFlowProps> = ({
   max={selectedOperators.length}
   defaultValue={index + 1}
   className="w-10 h-6 text-xs text-center"
+  style={{ width: 60 }}
   autoFocus
   onBlur={(e) => handleIndexChange(operator.id, e.target.value)}
   onKeyDown={(e) => {
@@ -227,9 +227,8 @@ export default function FileTable({result, fetchTaskResult}) {
   dataIndex: "status",
   key: "status",
   filters: [
-    { text: "已完成", value: "已完成" },
-    { text: "失败", value: "失败" },
-    { text: "处理中", value: "处理中" },
+    { text: "已完成", value: "COMPLETED" },
+    { text: "失败", value: "FAILED" },
   ],
   onFilter: (value: string, record: any) => record.status === value,
   render: (status: string) => (
@@ -237,9 +236,7 @@ export default function FileTable({result, fetchTaskResult}) {
   status={
     status === "COMPLETED"
       ? "success"
-      : status === "FAILED"
-        ? "error"
-        : "processing"
+      : "error"
   }
   text={TaskStatusMap[status as TaskStatus].label}
   />
@@ -248,6 +245,7 @@ export default function FileTable({result, fetchTaskResult}) {
   {
     title: "操作",
     key: "action",
+    width: 200,
     render: (_text: string, record: any) => (
       <div className="flex">
         {record.status === "COMPLETED" ? (
@@ -33,6 +33,7 @@ class FileWithHighRepeatPhraseRateFilter(Filter):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._file_with_high_repeat_phrase_rate_filter(sample[self.text_key],
                                                                                sample[self.filename_key])
         logger.info(f"fileName: {sample[self.filename_key]}, "
@@ -30,6 +30,7 @@ class FileWithHighRepeatWordRateFilter(Filter):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._file_with_high_repeat_word_rate_filter(sample[self.text_key],
                                                                              sample[self.filename_key])
         logger.info(f"fileName: {sample[self.filename_key]}, "
@@ -26,6 +26,7 @@ class FileWithHighSpecialCharRateFilter(Filter):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._file_with_high_special_char_rate_filter(sample[self.text_key],
                                                                               sample[self.filename_key])
         logger.info(f"fileName: {sample[self.filename_key]}, "
@@ -105,6 +105,7 @@ class ImgAdvertisementImagesCleaner(Filter):
 
     def execute(self, sample: Dict[str, Any]):
         start = time.time()
+        self.read_file_first(sample)
         file_name = sample[self.filename_key]
         file_type = "." + sample[self.filetype_key]
         img_bytes = sample[self.data_key]
@@ -27,6 +27,7 @@ class ImgBlurredImagesCleaner(Filter):
 
     def execute(self, sample: Dict[str, Any]):
         start = time.time()
+        self.read_file_first(sample)
         img_bytes = sample[self.data_key]
         file_name = sample[self.filename_key]
         file_type = "." + sample[self.filetype_key]
@@ -61,6 +61,7 @@ class ImgDuplicatedImagesCleaner(Filter):
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         """重复图片去重算子执行入口"""
         start = time.time()
+        self.read_file_first(sample)
         file_name = sample[self.filename_key]
         self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
         img_data = self._duplicate_images_filter(file_name, sample[self.data_key])
@@ -227,6 +227,7 @@ class ImgSimilarImagesCleaner(Filter):
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         """去除相似图片算子执行入口"""
         start = time.time()
+        self.read_file_first(sample)
         file_name = sample[self.filename_key]
         img_bytes = sample[self.data_key]
         data = bytes_to_numpy(img_bytes) if img_bytes else np.array([])
@@ -150,6 +150,7 @@ class DuplicateFilesFilter(Filter):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         file_name = sample[self.filename_key]
         self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
         sample[self.text_key] = self.deduplicate_files(sample, file_name)
@@ -90,6 +90,7 @@ class FileWithManySensitiveWordsFilter(Filter):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._file_with_many_sensitive_words_filter(sample[self.text_key],
                                                                             sample[self.filename_key])
         logger.info(f"fileName: {sample[self.filename_key]}, "
@@ -31,6 +31,7 @@ class FileWithShortOrLongLengthFilter(Filter):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._file_with_short_or_long_length_filter(sample[self.text_key],
                                                                             sample[self.filename_key])
         logger.info(f"fileName: {sample[self.filename_key]}, "
@@ -15,12 +15,7 @@ _configure_importer()
 
 
 def _import_operators():
-    from . import text_formatter
-    from . import word_formatter
-    from . import img_formatter
-    from . import file_exporter
     from . import slide_formatter
-    from . import unstructured_formatter
     from . import mineru_formatter
 
 
@@ -1,6 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from datamate.core.base_op import OPERATORS
-
-OPERATORS.register_module(module_name='FileExporter',
-                          module_path="ops.formatter.file_exporter.process")
@@ -1,16 +0,0 @@
-name: '落盘算子'
-name_en: 'save file operator'
-description: '将文件内容保存为文件。'
-description_en: 'Save the file data as a file.'
-language: 'Python'
-vendor: 'Huawei'
-raw_id: 'FileExporter'
-version: '1.0.0'
-types:
-  - 'collect'
-modal: 'others'
-effect:
-  before: ''
-  after: ''
-inputs: 'all'
-outputs: 'all'
@@ -1,145 +0,0 @@
-#!/user/bin/python
-# -*- coding: utf-8 -*-
-
-"""
-Description: Json文本抽取
-Create: 2024/06/06 15:43
-"""
-import time
-import os
-import uuid
-from typing import Tuple, Dict, Any
-from loguru import logger
-
-from datamate.core.constant import Fields
-from datamate.core.base_op import Mapper
-from datamate.common.utils import check_valid_path
-
-
-class FileExporter(Mapper):
-    """把输入的json文件流抽取为txt"""
-
-    def __init__(self, *args, **kwargs):
-        super(FileExporter, self).__init__(*args, **kwargs)
-        self.last_ops = True
-        self.text_support_ext = kwargs.get("text_support_ext", ['txt', 'html', 'md', 'markdown',
-                                                                'xlsx', 'xls', 'csv', 'pptx', 'ppt',
-                                                                'xml', 'json', 'doc', 'docx', 'pdf'])
-        self.data_support_ext = kwargs.get("data_support_ext", ['jpg', 'jpeg', 'png', 'bmp'])
-        self.medical_support_ext = kwargs.get("medical_support_ext", ['svs', 'tif', 'tiff'])
-
-    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
-        file_name = sample[self.filename_key]
-        file_type = sample[self.filetype_key]
-
-        try:
-            start = time.time()
-            if file_type in self.text_support_ext:
-                sample, save_path = self.get_textfile_handler(sample)
-            elif file_type in self.data_support_ext:
-                sample, save_path = self.get_datafile_handler(sample)
-            elif file_type in self.medical_support_ext:
-                sample, save_path = self.get_medicalfile_handler(sample)
-            else:
-                raise TypeError(f"{file_type} is unsupported! please check support_ext in FileExporter Ops")
-
-            if sample[self.text_key] == '' and sample[self.data_key] == b'':
-                sample[self.filesize_key] = "0"
-                return sample
-
-            if save_path:
-                self.save_file(sample, save_path)
-                sample[self.text_key] = ''
-                sample[self.data_key] = b''
-                sample[Fields.result] = True
-
-                file_type = save_path.split('.')[-1]
-                sample[self.filetype_key] = file_type
-
-                base_name, _ = os.path.splitext(file_name)
-                new_file_name = base_name + '.' + file_type
-                sample[self.filename_key] = new_file_name
-
-                base_name, _ = os.path.splitext(save_path)
-                sample[self.filepath_key] = base_name
-                file_size = os.path.getsize(base_name)
-                sample[self.filesize_key] = f"{file_size}"
-
-            logger.info(f"origin file named {file_name} has been save to {save_path}")
-            logger.info(f"fileName: {sample[self.filename_key]}, "
-                        f"method: FileExporter costs {time.time() - start:.6f} s")
-        except UnicodeDecodeError as err:
-            logger.error(f"fileName: {sample[self.filename_key]}, "
-                         f"method: FileExporter causes decode error: {err}")
-            raise
-        return sample
-
-    def get_save_path(self, sample: Dict[str, Any], target_type) -> str:
-        export_path = os.path.abspath(sample[self.export_path_key])
-        file_name = sample[self.filename_key]
-        new_file_name = os.path.splitext(file_name)[0] + '.' + target_type
-
-        if not check_valid_path(export_path):
-            os.makedirs(export_path, exist_ok=True)
-        res = os.path.join(export_path, new_file_name)
-        return res
-
-    def get_textfile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
-        target_type = sample.get("target_type", None)
-
-        # target_type存在则保存为扫描件, docx格式
-        if target_type:
-            sample = self._get_from_data(sample)
-            save_path = self.get_save_path(sample, target_type)
-        # 不存在则保存为txt文件,正常文本清洗
-        else:
-            sample = self._get_from_text(sample)
-            save_path = self.get_save_path(sample, 'txt')
-        return sample, save_path
-
-    def get_datafile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
-        target_type = sample.get("target_type", None)
-
-        # target_type存在, 图转文保存为target_type,markdown格式
-        if target_type:
-            sample = self._get_from_text(sample)
-            save_path = self.get_save_path(sample, target_type)
-        # 不存在则保存为原本图片文件格式,正常图片清洗
-        else:
-            sample = self._get_from_data(sample)
-            save_path = self.get_save_path(sample, sample[self.filetype_key])
-        return sample, save_path
-
-    def get_medicalfile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
-        target_type = 'png'
-
-        sample = self._get_from_data(sample)
-        save_path = self.get_save_path(sample, target_type)
-
-        return sample, save_path
-
-    def save_file(self, sample, save_path):
-        file_name, _ = os.path.splitext(save_path)
-        # 以二进制格式保存文件
-        file_sample = sample[self.text_key].encode('utf-8') if sample[self.text_key] else sample[self.data_key]
-        with open(file_name, 'wb') as f:
-            f.write(file_sample)
-        # 获取父目录路径
-
-        parent_dir = os.path.dirname(file_name)
-        os.chmod(parent_dir, 0o770)
-        os.chmod(file_name, 0o640)
-
-    def _get_from_data(self, sample: Dict[str, Any]) -> Dict[str, Any]:
-        sample[self.data_key] = bytes(sample[self.data_key])
-        sample[self.text_key] = ''
-        return sample
-
-    def _get_from_text(self, sample: Dict[str, Any]) -> Dict[str, Any]:
-        sample[self.data_key] = b''
-        sample[self.text_key] = str(sample[self.text_key])
-        return sample
-
-    def _get_uuid(self):
-        res = str(uuid.uuid4())
-        return res
@@ -1,6 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from datamate.core.base_op import OPERATORS
-
-OPERATORS.register_module(module_name='ImgFormatter',
-                          module_path="ops.formatter.img_formatter.process")
@@ -1,16 +0,0 @@
-name: '读取图片文件'
-name_en: 'Image File Reader'
-description: '读取图片文件。'
-description_en: 'Reads image files.'
-language: 'Python'
-vendor: 'Huawei'
-raw_id: 'ImgFormatter'
-version: '1.0.0'
-types:
-  - 'collect'
-modal: 'image'
-effect:
-  before: ''
-  after: ''
-inputs: 'image'
-outputs: 'image'
@@ -1,35 +0,0 @@
-# # -- encoding: utf-8 --
-
-#
-# Description:
-# Create: 2024/1/30 15:24
-# """
-import time
-from typing import Dict, Any
-
-import cv2
-import numpy as np
-from loguru import logger
-
-from datamate.common.utils import numpy_to_bytes
-from datamate.core.base_op import Mapper
-
-
-class ImgFormatter(Mapper):
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
-        start = time.time()
-        file_name = sample[self.filename_key]
-        file_type = "." + sample[self.filetype_key]
-        file_path = sample[self.filepath_key]
-        img_data = _img_extract(file_path)
-        sample[self.data_key] = numpy_to_bytes(img_data, file_type)
-        logger.info(f"fileName: {file_name}, method: ImgExtract costs {(time.time() - start):6f} s")
-        return sample
-
-
-def _img_extract(file_path):
-    return cv2.imdecode(np.fromfile(file_path, dtype=np.uint8), -1)
@@ -1,6 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from datamate.core.base_op import OPERATORS
-
-OPERATORS.register_module(module_name='TextFormatter',
-                          module_path="ops.formatter.text_formatter.process")
@@ -1,16 +0,0 @@
-name: 'TXT文本抽取'
-name_en: 'TXT Text Extraction'
-description: '抽取TXT中的文本'
-description_en: 'Extracts text from TXT files.'
-language: 'python'
-vendor: 'huawei'
-raw_id: 'TxtFormatter'
-version: '1.0.0'
-types:
-  - 'collect'
-modal: 'text'
-effect:
-  before: ''
-  after: ''
-inputs: 'text'
-outputs: 'text'
@@ -1,44 +0,0 @@
-#!/user/bin/python
-# -*- coding: utf-8 -*-
-
-"""
-Description: Json文本抽取
-Create: 2024/06/06 15:43
-"""
-import time
-from loguru import logger
-from typing import Dict, Any
-
-from datamate.core.base_op import Mapper
-
-
-class TextFormatter(Mapper):
-    """把输入的json文件流抽取为txt"""
-
-    def __init__(self, *args, **kwargs):
-        super(TextFormatter, self).__init__(*args, **kwargs)
-
-    @staticmethod
-    def _extract_json(byte_io):
-        """将默认使用utf-8编码的Json文件流解码,抽取为txt"""
-        # 用utf-8-sig的格式进行抽取,可以避免uft-8 BOM编码格式的文件在抽取后产生隐藏字符作为前缀。
-        return byte_io.decode("utf-8-sig").replace("\r\n", "\n")
-
-    def byte_read(self, sample: Dict[str, Any]):
-        filepath = sample[self.filepath_key]
-        with open(filepath, "rb") as file:
-            byte_data = file.read()
-        sample[self.data_key] = byte_data
-
-    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
-        start = time.time()
-        try:
-            self.byte_read(sample)
-            sample[self.text_key] = self._extract_json(sample[self.data_key])
-            sample[self.data_key] = b""  # 将sample[self.data_key]置空
-            logger.info(
-                f"fileName: {sample[self.filename_key]}, method: TextFormatter costs {(time.time() - start):6f} s")
-        except UnicodeDecodeError as err:
-            logger.exception(f"fileName: {sample[self.filename_key]}, method: TextFormatter causes decode error: {err}")
-            raise
-        return sample
@@ -1,6 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from datamate.core.base_op import OPERATORS
-
-OPERATORS.register_module(module_name='UnstructuredFormatter',
-                          module_path="ops.formatter.unstructured_formatter.process")
@@ -1,16 +0,0 @@
-name: 'Unstructured文本抽取'
-name_en: 'Unstructured Text Extraction'
-description: '抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。'
-description_en: 'Extracts text from Unstructured files, currently supporting PowerPoint presentations, Word documents and Excel spreadsheets files.'
-language: 'python'
-vendor: 'huawei'
-raw_id: 'UnstructuredFormatter'
-version: '1.0.0'
-types:
-  - 'collect'
-modal: 'text'
-effect:
-  before: ''
-  after: ''
-inputs: 'text'
-outputs: 'text'
@@ -1,37 +0,0 @@
-
-#!/user/bin/python
-# -*- coding: utf-8 -*-
-
-"""
-Description: 非结构化文本抽取
-Create: 2025/10/22 15:15
-"""
-import time
-from typing import Dict, Any
-
-from loguru import logger
-from unstructured.partition.auto import partition
-
-from datamate.core.base_op import Mapper
-
-
-class UnstructuredFormatter(Mapper):
-    """把输入的非结构化文本抽取为txt"""
-
-    def __init__(self, *args, **kwargs):
-        super(UnstructuredFormatter, self).__init__(*args, **kwargs)
-
-    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
-        start = time.time()
-        filepath = sample.get(self.filepath_key)
-        filename = sample.get(self.filename_key)
-        if not filename.lower().endswith((".ppt", ".pptx", "docx", "xlsx", ".csv")):
-            return sample
-        try:
-            elements = partition(filename=filepath)
-            sample[self.text_key] = "\n\n".join([str(el) for el in elements])
-            logger.info(f"fileName: {filename}, method: UnstructuredFormatter costs {(time.time() - start):6f} s")
-        except UnicodeDecodeError as err:
-            logger.exception(f"fileName: {filename}, method: UnstructuredFormatter causes decode error: {err}")
-            raise
-        return sample
@@ -1,6 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from datamate.core.base_op import OPERATORS
-
-OPERATORS.register_module(module_name='WordFormatter',
-                          module_path="ops.formatter.word_formatter.process")
@@ -1,16 +0,0 @@
-name: 'Word文本抽取'
-name_en: 'Word Text Extraction'
-description: '抽取Word中的文本'
-description_en: 'Extracts text from Word files.'
-language: 'java'
-vendor: 'huawei'
-raw_id: 'WordFormatter'
-version: '1.0.0'
-types:
-  - 'collect'
-modal: 'text'
-effect:
-  before: ''
-  after: ''
-inputs: 'text'
-outputs: 'text'
@@ -1,68 +0,0 @@
-# # -- encoding: utf-8 --
-
-#
-# Description:
-# Create: 2024/1/30 15:24
-# """
-from loguru import logger
-import os
-import subprocess
-import time
-from typing import Dict, Any
-
-from datamate.common.utils import check_valid_path
-from datamate.core.base_op import Mapper
-
-
-class WordFormatter(Mapper):
-    SEPERATOR = ' | '
-
-    def __init__(self, *args, **kwargs):
-        super(WordFormatter, self).__init__(*args, **kwargs)
-
-    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
-        start = time.time()
-        file_name = sample[self.filename_key]
-        file_path = sample[self.filepath_key]
-        file_type = sample[self.filetype_key]
-        txt_content = self.word2html(file_path, file_type)
-        sample[self.text_key] = txt_content
-        logger.info(f"fileName: {file_name}, method: WordFormatter costs {(time.time() - start):6f} s")
-        return sample
-
-    @staticmethod
-    def word2html(file_path, file_type):
-        check_valid_path(file_path)
-        file_dir = file_path.rsplit('/', 1)[0]
-        file_name = file_path.rsplit('/', 1)[1]
-        html_file_path = os.path.join(file_dir, f"{file_name}.txt")
-
-        current_file_path = os.path.dirname(os.path.abspath(__file__))
-        try:
-            process = subprocess.Popen(
-                ['java', '-jar', f'{current_file_path}/../../../java_operator/WordFormatter-1.0.jar', file_path,
-                 html_file_path, file_type], shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
-            stdout, stderr = process.communicate(timeout=24 * 60 * 60)
-            if process.returncode == 0:
-                logger.info(f"Convert {file_path} successfully to DOCX")
-            else:
-                logger.info(f"Convert {file_path} failed, error: {stderr.strip().decode('utf-8')}.")
-                raise RuntimeError()
-        except subprocess.CalledProcessError as e:
-            logger.error(f"Convert failed: {e}, return code: {e.returncode}")
-        except FileNotFoundError:
-            logger.error("LibreOffice command not found, please make sure it is available in PATH")
-        except Exception as e:
-            logger.error(f"An unexpected error occurred, convert failed: {e}", )
-
-        try:
-            with open(html_file_path, 'r', encoding='utf-8') as file:
-                txt_content = file.read()
-            os.remove(html_file_path)
-            logger.info("Tmp docx file removed")
-        except FileNotFoundError:
-            logger.error(f"Tmp file {html_file_path} does not exist")
-        except PermissionError:
-            logger.error(f"You are not allowed to delete tmp file {html_file_path}")
-        logger.info(f"Convert {html_file_path} to html success")
-        return txt_content
@@ -30,6 +30,7 @@ class ContentCleaner(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._content_filter(sample[self.text_key])
         logger.info(f"fileName: {sample[self.filename_key]}, method: ContentCleaner costs {time.time() - start:6f} s")
         return sample
@@ -64,6 +64,7 @@ class AnonymizedCreditCardNumber(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._credit_card_number_filter(sample[self.text_key])
         logger.info(
             f"fileName: {sample[self.filename_key]}, method: CreditCardNumberCleaner costs {time.time() - start:6f} s")
@@ -25,6 +25,7 @@ class EmailNumberCleaner(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._email_number_filter(sample[self.text_key])
         logger.info(f"fileName: {sample[self.filename_key]}, method: EmailCleaner costs {time.time() - start:6f} s")
         return sample
@@ -22,6 +22,7 @@ class EmojiCleaner(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._emoji_filter(sample[self.text_key])
         logger.info(f"fileName: {sample[self.filename_key]}, method: EmojiCleaner costs {time.time() - start:6f} s")
         return sample
@@ -41,6 +41,7 @@ class ExtraSpaceCleaner(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._clean_extra_space(sample[self.text_key])
         logger.info(
             f"fileName: {sample[self.filename_key]}, method: ExtraSpaceCleaner costs {time.time() - start:6f} s")
@@ -34,6 +34,7 @@ class FullWidthCharacterCleaner(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._full_width_character_filter(sample[self.text_key])
         logger.info(f"fileName: {sample[self.filename_key]}, "
                     f"method: FullWidthCharactersCleaner costs {time.time() - start:6f} s")
@@ -44,6 +44,7 @@ class GrableCharactersCleaner(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._grable_characters_filter(sample[self.text_key])
         logger.info(
             f"fileName: {sample[self.filename_key]}, method: GrableCharactersCleaner costs {time.time() - start:6f} s")
@@ -64,6 +64,7 @@ class HtmlTagCleaner(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         if sample[self.filetype_key] != "xml":
             sample[self.text_key] = self._remove_html_tags(sample[self.text_key])
         logger.info(
@@ -71,6 +71,7 @@ class AnonymizedIdNumber(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._id_number_filter(sample[self.text_key])
         logger.info(f"fileName: {sample[self.filename_key]}, method: IDNumberCleaner costs {time.time() - start:6f} s")
         return sample
@@ -28,6 +28,7 @@ class ImgDenoise(Mapper):
 
     def execute(self, sample: Dict[str, Any]):
         start = time.time()
+        self.read_file_first(sample)
 
         img_bytes = sample[self.data_key]
 
@@ -97,6 +97,7 @@ class ImgDirectionCorrect(Mapper):
 
     def execute(self, sample: Dict[str, Any]):
         start = time.time()
+        self.read_file_first(sample)
         file_name = sample[self.filename_key]
         file_type = "." + sample[self.filetype_key]
         img_bytes = sample[self.data_key]
@@ -88,6 +88,7 @@ class ImgBrightness(Mapper):
 
     def execute(self, sample: Dict[str, Any]):
         start = time.time()
+        self.read_file_first(sample)
         img_bytes = sample[self.data_key]
         file_name = sample[self.filename_key]
         file_type = "." + sample[self.filetype_key]
@@ -59,6 +59,7 @@ class ImgContrast(Mapper):
 
     def execute(self, sample: Dict[str, Any]):
         start = time.time()
+        self.read_file_first(sample)
         img_bytes = sample[self.data_key]
         file_name = sample[self.filename_key]
         file_type = "." + sample[self.filetype_key]
@@ -69,6 +69,7 @@ class ImgSaturation(Mapper):
 
     def execute(self, sample: Dict[str, Any]):
         start = time.time()
+        self.read_file_first(sample)
         img_bytes = sample[self.data_key]
         file_name = sample[self.filename_key]
         file_type = "." + sample[self.filetype_key]
@@ -57,6 +57,7 @@ class ImgSharpness(Mapper):
 
     def execute(self, sample: Dict[str, Any]):
         start = time.time()
+        self.read_file_first(sample)
         img_bytes = sample[self.data_key]
         file_name = sample[self.filename_key]
         file_type = "." + sample[self.filetype_key]
@@ -25,6 +25,7 @@ class ImgPerspectiveTransformation(Mapper):
 
     def execute(self, sample: Dict[str, Any]):
         start = time.time()
+        self.read_file_first(sample)
         img_bytes = sample[self.data_key]
         file_name = sample[self.filename_key]
         file_type = "." + sample[self.filetype_key]
@@ -29,6 +29,7 @@ class ImgResize(Mapper):
 
     def execute(self, sample: Dict[str, Any]):
         start = time.time()
+        self.read_file_first(sample)
         file_name = sample[self.filename_key]
         file_type = "." + sample[self.filetype_key]
         img_bytes = sample[self.data_key]
@@ -60,6 +60,7 @@ class ImgShadowRemove(Mapper):
 
     def execute(self, sample: Dict[str, Any]):
         start = time.time()
+        self.read_file_first(sample)
         img_bytes = sample[self.data_key]
         file_name = sample[self.filename_key]
         file_type = "." + sample[self.filetype_key]
@@ -21,6 +21,7 @@ class ImgTypeUnify(Mapper):
 
     def execute(self, sample):
         start = time.time()
+        self.read_file_first(sample)
         file_name = sample[self.filename_key]
         origin_file_type = sample[self.filetype_key]
         if origin_file_type == self._setting_type:
@@ -80,6 +80,7 @@ class ImgWatermarkRemove(Mapper):
 
     def execute(self, sample: Dict[str, Any]):
         start = time.time()
+        self.read_file_first(sample)
         file_name = sample[self.filename_key]
         file_type = "." + sample[self.filetype_key]
         img_bytes = sample[self.data_key]
@@ -24,6 +24,7 @@ class InvisibleCharactersCleaner(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._invisible_characters_filter(sample[self.text_key])
         logger.info(f"fileName: {sample[self.filename_key]}, "
                     f"method: InvisibleCharactersCleaner costs {time.time() - start:6f} s")
@@ -37,6 +37,7 @@ class AnonymizedIpAddress(Mapper):
 
    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._ip_address_filter(sample[self.text_key])
         logger.info(f"fileName: {sample[self.filename_key]}, method: IPAddressCleaner costs {time.time() - start:6f} s")
         return sample
@@ -35,6 +35,7 @@ class KnowledgeRelationSlice(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start_time = time.time()
+        self.read_file_first(sample)
 
         chunk_item = get_json_list(sample[self.text_key], chunk_size=self.chunk_size, overlap_size=self.overlap_size)
         chunk_item_json = json.dumps(chunk_item, ensure_ascii=False)
@@ -36,6 +36,7 @@ class LegendCleaner(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._clean_html_tag(sample[self.text_key])
         logger.info(f"fileName: {sample[self.filename_key]}, method: LegendCleaner costs {time.time() - start:6f} s")
         return sample
@@ -37,6 +37,7 @@ class AnonymizedPhoneNumber(Mapper):
 
    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._phone_number_filter(sample[self.text_key])
         logger.info(
             f"fileName: {sample[self.filename_key]}, method: PhoneNumberCleaner costs {time.time() - start:6f} s")
@@ -53,6 +53,7 @@ class PoliticalWordCleaner(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._political_word_filter(sample[self.text_key])
         logger.info(
             f"fileName: {sample[self.filename_key]}, method: PoliticalWordCleaner costs {time.time() - start:6f} s")
@@ -47,8 +47,8 @@ def duplicate_sentences_filter(input_data: str, file_name: str, duplicate_th: in
             paragraph_counts[paragraph_strip] = -1
 
     except Exception as err:
        logger.exception(f"fileName: {file_name}, method: RemoveDuplicateSentencess. An error occurred when using "
                         f"filtering duplicate sentences. The error is: {err}")
        return input_data
 
    # 将去重后的段落重新组合成文本
@@ -63,6 +63,7 @@ class DuplicateSentencesFilter(Filter):
         duplicate_th = 5  # 段落重复次数阈值
         file_name = sample[self.filename_key]
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = duplicate_sentences_filter(sample[self.text_key], file_name, duplicate_th)
         logger.info(f"fileName: {file_name}, RemoveDuplicateSentencess costs {time.time() - start:6f} s")
         return sample
@@ -56,6 +56,7 @@ class SexualAndViolentWordCleaner(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._sexual_and_violent_word_filter(sample[self.text_key])
         logger.info(f"fileName: {sample[self.filename_key]}, "
                     f"method: SexualAndViolentWordCleaner costs {time.time() - start:6f} s")
@@ -61,6 +61,7 @@ class TextToWord(Mapper):
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         """将文本信息转换为docx文件流"""
         start = time.time()
+        self.read_file_first(sample)
         sample[self.data_key] = self._txt_to_docx(sample[self.text_key])  # 将文字转换为word字符串流
         sample[self.text_key] = ""
         sample["target_type"] = "docx"
@@ -27,6 +27,7 @@ class TraditionalChineseCleaner(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._traditional_chinese_filter(sample[self.text_key])
         logger.info(
             f"fileName: {sample[self.filename_key]}, method: TraditionalChinese costs {time.time() - start:6f} s")
@@ -23,6 +23,7 @@ class UnicodeSpaceCleaner(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._clean_unicode_space(sample[self.text_key])
         logger.info(
             f"fileName: {sample[self.filename_key]}, method: UnicodeSpaceCleaner costs {time.time() - start:6f} s")
@@ -26,6 +26,7 @@ class AnonymizedUrlCleaner(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         sample[self.text_key] = self._url_filter(sample[self.text_key])
         logger.info(f"fileName: {sample[self.filename_key]}, method: UrlCleaner costs {time.time() - start:6f} s")
         return sample
@@ -52,6 +52,7 @@ class XMLTagCleaner(Mapper):
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
+        self.read_file_first(sample)
         file_name = sample[self.filename_key]
         if sample[self.filetype_key] == "xml":
             try:
runtime/ops/pyproject.toml (new file)
@@ -0,0 +1,28 @@
+[project]
+name = "ops"
+version = "0.0.1"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "beautifulsoup4>=4.14.3",
+    "datasketch>=1.8.0",
+    "email-validator>=2.3.0",
+    "emoji>=2.15.0",
+    "jieba>=0.42.1",
+    "loguru>=0.7.3",
+    "numpy>=2.2.0,<=2.2.6",
+    "opencv-contrib-python-headless>=4.12.0.88",
+    "opencv-python-headless>=4.12.0.88",
+    "openslide-python>=1.4.3",
+    "paddleocr>=3.3.2",
+    "pandas>=2.2.0,<=2.2.3",
+    "pycryptodome>=3.23.0",
+    "pymysql>=1.1.2",
+    "python-docx>=1.2.0",
+    "pytz>=2025.2",
+    "six>=1.17.0",
+    "sqlalchemy>=2.0.44",
+    "xmltodict>=1.0.2",
+    "zhconv>=1.4.3",
+]
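With dependencies now declared in runtime/ops/pyproject.toml rather than a pinned requirements.txt (deleted below), the environment can be set up with any PEP 621-aware installer; for example, run from the project directory:

    pip install .
    # or, with uv:
    uv sync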
@@ -1,22 +0,0 @@
-beautifulsoup4==4.14.2
-datamate==0.0.1
-datasketch==1.6.5
-email_validator==2.3.0
-emoji==2.2.0
-jieba==0.42.1
-loguru==0.7.3
-numpy==2.2.6
-opencv_contrib_python-headless==4.10.0.84
-opencv_python-headless==4.12.0.88
-openslide_python==1.4.2
-paddleocr==3.2.0
-pandas==2.2.3
-pycryptodome==3.23.0
-python_docx==1.2.0
-pytz==2025.2
-six==1.17.0
-xmltodict==1.0.2
-zhconv==1.4.3
-sqlalchemy==2.0.40
-pymysql==1.1.1
-unstructured[docx,csv,xlsx,pptx]==0.18.15
@@ -2,10 +2,15 @@
 
 import json
 import os
+import time
 import traceback
+import uuid
 from typing import List, Dict, Any, Tuple
 
+import cv2
+import numpy as np
 from loguru import logger
+from unstructured.partition.auto import partition
 
 from datamate.common.error_code import ERROR_CODE_TABLE, UNKNOWN_ERROR_CODE
 from datamate.common.utils.llm_request import LlmReq
@@ -52,6 +57,7 @@ class BaseOp:
     def __init__(self, *args, **kwargs):
         self.accelerator = kwargs.get('accelerator', "cpu")
         self.is_last_op = kwargs.get('is_last_op', False)
+        self.is_first_op = kwargs.get('is_first_op', False)
         self._name = kwargs.get('op_name', None)
         self.infer_model = None
         self.text_key = kwargs.get('text_key', "text")
@@ -122,10 +128,10 @@ class BaseOp:
         raise NotImplementedError("This is in BaseOp, plese re-define this method in Sub-classes")
 
     def fill_sample_params(self, sample: Dict[str, Any], **kwargs):
-        if not sample.get("text", None):
+        if not sample.get(self.text_key, None):
             sample[self.text_key] = ""
 
-        if not sample.get("data", None):
+        if not sample.get(self.data_key, None):
             sample[self.data_key] = b""
 
         if not sample[self.data_key] and not sample[self.text_key]:
@@ -137,6 +143,27 @@ class BaseOp:
         failed_reason = {"op_name": op_name, "error_code": error_code, "reason": exc_info}
         sample["failed_reason"] = failed_reason
 
+    def read_file(self, sample):
+        filepath = sample[self.filepath_key]
+        filetype = sample[self.filetype_key]
+        if filetype in ["ppt", "pptx", "docx", "doc", "xlsx"]:
+            elements = partition(filename=filepath)
+            sample[self.text_key] = "\n\n".join([str(el) for el in elements])
+        elif filetype in ["txt", "md", "markdown", "xml", "html", "csv", "json", "jsonl"]:
+            with open(filepath, 'rb') as f:
+                content = f.read()
+            sample[self.text_key] = content.decode("utf-8-sig").replace("\r\n", "\n")
+        elif filetype in ['jpg', 'jpeg', 'png', 'bmp']:
+            image_np = cv2.imdecode(np.fromfile(filepath, dtype=np.uint8), -1)
+            if image_np.size:
+                data = cv2.imencode(f".{filetype}", image_np)[1]
+                image_bytes = data.tobytes()
+                sample[self.data_key] = image_bytes
+
+    def read_file_first(self, sample):
+        if self.is_first_op:
+            self.read_file(sample)
+
 
 class Mapper(BaseOp):
     def __init__(self, *args, **kwargs):
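read_file() dispatches on the sample's file type: office documents (ppt/pptx/doc/docx/xlsx) are extracted with unstructured's partition(), text-like files are decoded as utf-8-sig (which also strips a UTF-8 BOM), and images are decoded and re-encoded to bytes with OpenCV. Operators never call it directly; each begins execute() with the guard, as in the hunks above:

    start = time.time()
    self.read_file_first(sample)   # no-op unless this op was built with is_first_op=True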
@@ -158,15 +185,16 @@ class Mapper(BaseOp):
             logger.error(f"Ops named {self.name} map failed, Error Info: \n"
                          f"{str(get_exception_info(e))}")
             sample["execute_status"] = execute_status
-            task_info = TaskInfoPersistence()
-            task_info.persistence_task_info(sample)
+            sample[self.filesize_key] = "0"
+            sample[self.filetype_key] = ""
+            TaskInfoPersistence().update_task_result(sample)
             raise e
 
         sample["execute_status"] = execute_status
         # 加载文件成功执行信息到数据库
         if self.is_last_op:
-            task_info = TaskInfoPersistence()
-            task_info.persistence_task_info(sample)
+            if FileExporter().execute(sample):
+                TaskInfoPersistence().persistence_task_info(sample)
         return sample
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
@@ -197,8 +225,9 @@ class Slicer(BaseOp):
             logger.error(f"Ops named {self.name} map failed, Error Info: \n"
                          f"{str(get_exception_info(e))}")
             sample["execute_status"] = execute_status
-            task_info = TaskInfoPersistence()
-            task_info.persistence_task_info(sample)
+            sample[self.filesize_key] = "0"
+            sample[self.filetype_key] = ""
+            TaskInfoPersistence().update_task_result(sample)
             return [sample]
 
         self.load_sample_to_sample(sample, sample_list)
@@ -206,8 +235,8 @@ class Slicer(BaseOp):
 
         # 加载文件成功执行信息到数据库
         if self.is_last_op:
-            task_info = TaskInfoPersistence()
-            task_info.persistence_task_info(sample)
+            if FileExporter().execute(sample):
+                TaskInfoPersistence().persistence_task_info(sample)
 
         return [sample]
 
@@ -286,22 +315,24 @@ class Filter(BaseOp):
             sample["execute_status"] = execute_status
             logger.error(f"Ops named {self.name} map failed, Error Info: \n"
                          f"{str(get_exception_info(e))}")
-            task_info = TaskInfoPersistence()
-            task_info.persistence_task_info(sample)
+            sample[self.filesize_key] = "0"
+            sample[self.filetype_key] = ""
+            TaskInfoPersistence().update_task_result(sample)
             raise e

         sample["execute_status"] = execute_status
         # Files with no content are filtered out
         if sample[self.text_key] == "" and sample[self.data_key] == b"":
             task_info = TaskInfoPersistence()
-            sample["fileSize"] = "0"
-            task_info.persistence_task_info(sample)
+            sample[self.filesize_key] = "0"
+            sample[self.filetype_key] = ""
+            task_info.update_task_result(sample)
             return False

         # Record the file's successful execution in the database
         if self.is_last_op:
-            task_info = TaskInfoPersistence()
-            task_info.persistence_task_info(sample)
+            if FileExporter().execute(sample):
+                TaskInfoPersistence().persistence_task_info(sample)
         return True

     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
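The empty-content branch above is what keeps blank files out of both the export directory and the success table. A small sketch of the same check, with literal key names standing in for the configurable `*_key` attributes of the real Filter class:

```python
# Sketch of the empty-content check; "text", "data", "fileSize" and "fileType"
# are simplified stand-ins for the configurable key attributes.
def keep_sample(sample):
    if sample["text"] == "" and sample["data"] == b"":
        sample["fileSize"] = "0"
        sample["fileType"] = ""
        return False  # Ray Data's filter() drops rows whose predicate is False
    return True
```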
@@ -379,3 +410,131 @@ class LLM(Mapper):
             raise RuntimeError(f"Save jsonl file Failed!, save_path: {save_path}.") from e

         logger.info(f"LLM output has been saved to {save_path}.")
+
+
+class FileExporter(BaseOp):
+    """Extract the incoming JSON file stream and write it out as txt."""
+
+    def __init__(self, *args, **kwargs):
+        super(FileExporter, self).__init__(*args, **kwargs)
+        self.last_ops = True
+        self.text_support_ext = kwargs.get("text_support_ext", ['txt', 'html', 'md', 'markdown',
+                                                                'xlsx', 'xls', 'csv', 'pptx', 'ppt',
+                                                                'xml', 'json', 'doc', 'docx', 'pdf'])
+        self.data_support_ext = kwargs.get("data_support_ext", ['jpg', 'jpeg', 'png', 'bmp'])
+        self.medical_support_ext = kwargs.get("medical_support_ext", ['svs', 'tif', 'tiff'])
+
+    def execute(self, sample: Dict[str, Any]):
+        file_name = sample[self.filename_key]
+        file_type = sample[self.filetype_key]
+
+        try:
+            start = time.time()
+            if file_type in self.text_support_ext:
+                sample, save_path = self.get_textfile_handler(sample)
+            elif file_type in self.data_support_ext:
+                sample, save_path = self.get_datafile_handler(sample)
+            elif file_type in self.medical_support_ext:
+                sample, save_path = self.get_medicalfile_handler(sample)
+            else:
+                raise TypeError(f"{file_type} is unsupported! please check support_ext in FileExporter Ops")
+
+            if sample[self.text_key] == '' and sample[self.data_key] == b'':
+                sample[self.filesize_key] = "0"
+                return False
+
+            if save_path:
+                self.save_file(sample, save_path)
+                sample[self.text_key] = ''
+                sample[self.data_key] = b''
+                sample[Fields.result] = True
+
+                file_type = save_path.split('.')[-1]
+                sample[self.filetype_key] = file_type
+
+                base_name, _ = os.path.splitext(file_name)
+                new_file_name = base_name + '.' + file_type
+                sample[self.filename_key] = new_file_name
+
+                base_name, _ = os.path.splitext(save_path)
+                sample[self.filepath_key] = base_name
+                file_size = os.path.getsize(base_name)
+                sample[self.filesize_key] = f"{file_size}"
+
+                logger.info(f"origin file named {file_name} has been saved to {save_path}")
+                logger.info(f"fileName: {sample[self.filename_key]}, "
+                            f"method: FileExporter costs {time.time() - start:.6f} s")
+        except UnicodeDecodeError as err:
+            logger.error(f"fileName: {sample[self.filename_key]}, "
+                         f"method: FileExporter causes decode error: {err}")
+            raise
+        return True
+
+    def get_save_path(self, sample: Dict[str, Any], target_type):
+        export_path = os.path.abspath(sample[self.export_path_key])
+        file_name = sample[self.filename_key]
+        new_file_name = os.path.splitext(file_name)[0] + '.' + target_type
+
+        if not check_valid_path(export_path):
+            os.makedirs(export_path, exist_ok=True)
+        return os.path.join(export_path, new_file_name)
+
+    def get_textfile_handler(self, sample: Dict[str, Any]):
+        target_type = sample.get("target_type", None)

+        # If target_type is set, save as a scanned document (docx format)
+        if target_type:
+            sample = self._get_from_data(sample)
+            save_path = self.get_save_path(sample, target_type)
+        # Otherwise save as a txt file (normal text cleaning path)
+        else:
+            sample = self._get_from_text(sample)
+            save_path = self.get_save_path(sample, 'txt')
+        return sample, save_path
+
+    def get_datafile_handler(self, sample: Dict[str, Any]):
+        target_type = sample.get("target_type", None)
+
+        # If target_type is set, image-to-text output is saved as target_type (markdown format)
+        if target_type:
+            sample = self._get_from_text(sample)
+            save_path = self.get_save_path(sample, target_type)
+        # Otherwise keep the original image file format (normal image cleaning path)
+        else:
+            sample = self._get_from_data(sample)
+            save_path = self.get_save_path(sample, sample[self.filetype_key])
+        return sample, save_path
+
+    def get_medicalfile_handler(self, sample: Dict[str, Any]):
+        target_type = 'png'
+
+        sample = self._get_from_data(sample)
+        save_path = self.get_save_path(sample, target_type)
+
+        return sample, save_path
+
+    def save_file(self, sample, save_path):
+        file_name, _ = os.path.splitext(save_path)
+        # Save the file in binary mode
+        file_sample = sample[self.text_key].encode('utf-8') if sample[self.text_key] else sample[self.data_key]
+        with open(file_name, 'wb') as f:
+            f.write(file_sample)
+
+        # Get the parent directory path and restrict permissions
+        parent_dir = os.path.dirname(file_name)
+        os.chmod(parent_dir, 0o770)
+        os.chmod(file_name, 0o640)
+
+    def _get_from_data(self, sample: Dict[str, Any]) -> Dict[str, Any]:
+        sample[self.data_key] = bytes(sample[self.data_key])
+        sample[self.text_key] = ''
+        return sample
+
+    def _get_from_text(self, sample: Dict[str, Any]) -> Dict[str, Any]:
+        sample[self.data_key] = b''
+        sample[self.text_key] = str(sample[self.text_key])
+        return sample
+
+    @staticmethod
+    def _get_uuid():
+        return str(uuid.uuid4())
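A hypothetical standalone use of the new `FileExporter`; the sample keys shown here are assumptions inferred from the `*_key` attributes above, not a documented contract:

```python
# Assumed key names ("fileName", "fileType", "text", "data", "exportPath") --
# the real values come from the BaseOp key configuration, and the exporter is
# normally driven by the pipeline rather than constructed by hand.
exporter = FileExporter()
sample = {
    "fileName": "report.pdf",        # 'pdf' is in text_support_ext -> text handler
    "fileType": "pdf",
    "text": "cleaned text ...",      # non-empty text payload, exported as report.txt
    "data": b"",
    "exportPath": "/tmp/datamate-out",
}
if exporter.execute(sample):
    # Note: save_file strips the extension, so filePath has no suffix on disk.
    print(sample["filePath"], sample["fileSize"])
```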
@@ -119,6 +119,8 @@ class RayDataset(BasicDataset):

             # Load the Ops module
             temp_ops = self.load_ops_module(op_name)
+            if index == 0:
+                init_kwargs["is_first_op"] = True
+
             if index == len(cfg_process) - 1:
                 init_kwargs["is_last_op"] = True
@@ -182,7 +184,8 @@ class RayDataset(BasicDataset):
                                           fn_kwargs=kwargs,
                                           resources=resources,
                                           num_cpus=0.05,
-                                          concurrency=(1, 1 if operators_cls.use_model else int(max_actor_nums)))
+                                          compute=rd.ActorPoolStrategy(min_size=1,
+                                                                       max_size=int(max_actor_nums)))

         elif issubclass(operators_cls, (Slicer, RELATIVE_Slicer)):
             self.data = self.data.flat_map(operators_cls,
@@ -190,7 +193,8 @@ class RayDataset(BasicDataset):
                                            fn_kwargs=kwargs,
                                            resources=resources,
                                            num_cpus=0.05,
-                                           concurrency=(1, int(max_actor_nums)))
+                                           compute=rd.ActorPoolStrategy(min_size=1,
+                                                                        max_size=int(max_actor_nums)))

         elif issubclass(operators_cls, (Filter, RELATIVE_Filter)):
             self.data = self.data.filter(operators_cls,
@@ -198,7 +202,8 @@ class RayDataset(BasicDataset):
                                          fn_kwargs=kwargs,
                                          resources=resources,
                                          num_cpus=0.05,
-                                         concurrency=(1, int(max_actor_nums)))
+                                         compute=rd.ActorPoolStrategy(min_size=1,
+                                                                      max_size=int(max_actor_nums)))
         else:
             logger.error(
                 'Ray executor only supports Filter, Mapper and Slicer OPs for now')
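The three scheduling hunks above all make the same swap, so one hedged sketch covers them. Depending on the Ray Data release, an autoscaling actor pool for a class-based UDF is spelled either as `concurrency=(min, max)` or as `compute=ray.data.ActorPoolStrategy(...)`; this commit standardizes on the latter:

```python
import ray

class UpperOp:
    # Class-based UDFs run inside long-lived actors, which is what lets Ray
    # keep a pool of between min_size and max_size workers busy.
    def __call__(self, row):
        row["text"] = row["text"].upper()
        return row

ds = ray.data.from_items([{"text": "a"}, {"text": "b"}])
# The spelling adopted by the hunks above: the pool autoscales from 1 to 4 actors.
out = ds.map(UpperOp, compute=ray.data.ActorPoolStrategy(min_size=1, max_size=4))
print(out.take_all())
```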
@@ -25,13 +25,13 @@ class TaskInfoPersistence:
         with open(sql_config_path, 'r', encoding='utf-8') as f:
             return json.load(f)

-    def persistence_task_info(self, sample: Dict[str, Any]):
+    def update_task_result(self, sample, file_id = str(uuid.uuid4())):
         instance_id = str(sample.get("instance_id"))
         src_file_name = str(sample.get("sourceFileName"))
         src_file_type = str(sample.get("sourceFileType"))
         src_file_id = str(sample.get("sourceFileId"))
         src_file_size = int(sample.get("sourceFileSize"))
-        file_id = str(uuid.uuid4())
         file_size = str(sample.get("fileSize"))
         file_type = str(sample.get("fileType"))
         file_name = str(sample.get("fileName"))
@@ -53,6 +53,10 @@ class TaskInfoPersistence:
         }
         self.insert_result(result_data, str(self.sql_dict.get("insert_clean_result_sql")))

+    def update_file_result(self, sample, file_id):
+        file_size = str(sample.get("fileSize"))
+        file_type = str(sample.get("fileType"))
+        file_name = str(sample.get("fileName"))
         dataset_id = str(sample.get("dataset_id"))
         file_path = str(sample.get("filePath"))
         create_time = datetime.now()
@@ -72,6 +76,11 @@ class TaskInfoPersistence:
         }
         self.insert_result(file_data, str(self.sql_dict.get("insert_dataset_file_sql")))

+    def persistence_task_info(self, sample: Dict[str, Any]):
+        file_id = str(uuid.uuid4())
+        self.update_task_result(sample, file_id)
+        self.update_file_result(sample, file_id)
+
     @staticmethod
     def insert_result(data, sql):
         retries = 0
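One thing worth flagging in the new `update_task_result` signature: `file_id=str(uuid.uuid4())` is a default argument, and Python evaluates defaults once at definition time, so every call that omits `file_id` reuses the same id. `persistence_task_info` sidesteps this by always passing a fresh id explicitly; the sketch below demonstrates the semantics:

```python
import uuid

def make_id(file_id=str(uuid.uuid4())):  # default is frozen when the def runs
    return file_id

assert make_id() == make_id()            # omitted -> the SAME id on every call
assert make_id(str(uuid.uuid4())) != make_id(str(uuid.uuid4()))  # explicit -> fresh
```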
@@ -16,27 +16,13 @@ classifiers = [

 # Core dependencies
 dependencies = [
-    "uvicorn[standard]",
-    "fastapi",
-    "loguru",
-    "jsonargparse",
-    "ray[default, data]==2.46.0",
-    "opencv-python"
-]
-
-[project.optional-dependencies]
-dj = [
-    "py-data-juicer~=1.4.0"
-]
-
-op = [
-    "python-docx==1.1.0"
-]
-
-# All dependencies
-all = [
-    "datamate[dj]",
-    "datamate[op]"
+    "fastapi>=0.123.9",
+    "jsonargparse>=4.44.0",
+    "loguru>=0.7.3",
+    "opencv-python-headless>=4.12.0.88",
+    "ray[data,default]==2.52.1",
+    "unstructured[csv,docx,pptx,xlsx]==0.18.15",
+    "uvicorn[standard]>=0.38.0",
 ]

 [build-system]
@@ -59,8 +59,7 @@ VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', '文本清洗模板', '文本清
 ('4421504e-c6c9-4760-b55a-509d17429597', '图片清洗模板', '图片清洗模板');

 INSERT IGNORE INTO t_operator_instance(instance_id, operator_id, op_index, settings_override)
-VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'TextFormatter', 1, null),
-('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 2, null),
+VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 2, null),
 ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatWordRateFilter', 3, null),
 ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatPhraseRateFilter', 4, null),
 ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighSpecialCharRateFilter', 5, null),
@@ -85,10 +84,10 @@ VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'TextFormatter', 1, null),
 ('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmailNumberCleaner', 24, null),
 ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIpAddress', 25, null),
 ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIdNumber', 26, null),
-('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 27, null),
-('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileExporter', 28, null),
-('4421504e-c6c9-4760-b55a-509d17429597', 'ImgFormatter', 1, null),
-('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner', 2, null),
+('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 27, null);
+
+INSERT IGNORE INTO t_operator_instance(instance_id, operator_id, op_index, settings_override)
+VALUES ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner', 2, null),
 ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDuplicatedImagesCleaner', 3, null),
 ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSimilarImagesCleaner', 4, null),
 ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBrightness', 5, null),
@@ -99,5 +98,4 @@ VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'TextFormatter', 1, null),
 ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgShadowRemove', 10, null),
 ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgPerspectiveTransformation', 11, null),
 ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgResize', 12, null),
-('4421504e-c6c9-4760-b55a-509d17429597', 'ImgTypeUnify', 13, null),
-('4421504e-c6c9-4760-b55a-509d17429597', 'FileExporter', 14, null);
+('4421504e-c6c9-4760-b55a-509d17429597', 'ImgTypeUnify', 13, null);
@@ -67,10 +67,7 @@ VALUES ('64465bec-b46b-11f0-8291-00155d0e4808', '模态', 'modal', 'predefined'

 INSERT IGNORE INTO t_operator
 (id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
-VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
-('UnstructuredFormatter', 'Unstructured文本抽取', '基于Unstructured抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。', '1.0.0', 'text', 'text', null, null, '', false),
-('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
-('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'multimodal', 'multimodal', null, null, '', false),
+VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
 ('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
 ('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
 ('FileWithHighSpecialCharRateFilter', '文档特殊字符率检查', '去除特殊字符过多的文档。', '1.0.0', 'text', 'text', null, '{"specialCharRatio": {"name": "文档特殊字符率", "description": "特殊字符的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.3, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
@@ -97,7 +94,6 @@ VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0',
 ('UnicodeSpaceCleaner', '空格标准化', '将文档中不同的 unicode 空格,如 u2008,转换为正常空格\\u0020。', '1.0.0', 'text', 'text', null, null, '', 'false'),
 ('AnonymizedUrlCleaner', 'URL网址匿名化', '将文档中的url网址匿名化。', '1.0.0', 'text', 'text', null, null, '', 'false'),
 ('XMLTagCleaner', 'XML标签去除', '去除XML中的标签。', '1.0.0', 'text', 'text', null, null, '', 'false'),
-('ImgFormatter', '读取图片文件', '读取图片文件。', '1.0.0', 'image', 'image', null, null, '', 'false'),
 ('ImgBlurredImagesCleaner', '模糊图片过滤', '去除模糊的图片。', '1.0.0', 'image', 'image', null, '{"blurredThreshold": {"name": "梯度函数值", "description": "梯度函数值取值越小,图片模糊度越高。", "type": "slider", "defaultVal": 1000, "min": 1, "max": 10000, "step": 1}}', '', 'false'),
 ('ImgBrightness', '图片亮度增强', '自适应调节图片的亮度。', '1.0.0', 'image', 'image', null, null, '', 'false'),
 ('ImgContrast', '图片对比度增强', '自适应调节图片的对比度。', '1.0.0', 'image', 'image', null, null, '', 'false'),
@@ -117,7 +113,7 @@ SELECT c.id, o.id
 FROM t_operator_category c
 CROSS JOIN t_operator o
 WHERE c.id IN ('d8a5df7a-52a9-42c2-83c4-01062e60f597', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3')
-AND o.id IN ('TextFormatter', 'FileWithShortOrLongLengthFilter', 'FileWithHighRepeatPhraseRateFilter',
+AND o.id IN ('FileWithShortOrLongLengthFilter', 'FileWithHighRepeatPhraseRateFilter',
 'FileWithHighRepeatWordRateFilter', 'FileWithHighSpecialCharRateFilter', 'FileWithManySensitiveWordsFilter',
 'DuplicateFilesFilter', 'DuplicateSentencesFilter', 'AnonymizedCreditCardNumber', 'AnonymizedIdNumber',
 'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
@@ -130,13 +126,6 @@ SELECT c.id, o.id
 FROM t_operator_category c
 CROSS JOIN t_operator o
 WHERE c.id IN ('de36b61c-9e8a-4422-8c31-d30585c7100f', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3')
-AND o.id IN ('ImgFormatter', 'ImgBlurredImagesCleaner', 'ImgBrightness', 'ImgContrast', 'ImgDenoise',
+AND o.id IN ('ImgBlurredImagesCleaner', 'ImgBrightness', 'ImgContrast', 'ImgDenoise',
 'ImgDuplicatedImagesCleaner', 'ImgPerspectiveTransformation', 'ImgResize', 'ImgSaturation',
 'ImgShadowRemove', 'ImgSharpness', 'ImgSimilarImagesCleaner', 'ImgTypeUnify');

-INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
-SELECT c.id, o.id
-FROM t_operator_category c
-CROSS JOIN t_operator o
-WHERE c.id IN ('4d7dbd77-0a92-44f3-9056-2cd62d4a71e4', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3')
-AND o.id IN ('FileExporter', 'UnstructuredFormatter');
@@ -16,7 +16,7 @@ WORKDIR /opt/runtime

 RUN --mount=type=cache,target=/root/.cache/uv \
     uv pip install -e . --system \
-    && uv pip install -r /opt/runtime/datamate/ops/requirements.txt --system
+    && uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system

 RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
     && chmod +x /opt/runtime/start.sh \