Operators: make extraction and export (save-to-disk) fixed steps of the pipeline (#134)
* feature: move the extraction step into every operator
* feature: run the save-to-disk (export) operator by default
* feature: improve the frontend display
* feature: manage dependencies with pyproject
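The recurring change in the operator hunks below is a single added line per execute(): a call to the new BaseOp.read_file_first(), which loads the file from disk only when the operator is the first in the pipeline. A minimal sketch of the resulting pattern (ExampleCleaner and _clean are hypothetical; key names follow the BaseOp defaults in this diff):

import time
from typing import Any, Dict

from loguru import logger

from datamate.core.base_op import Mapper


class ExampleCleaner(Mapper):
    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        self.read_file_first(sample)  # no-op unless this op is the pipeline's first
        sample[self.text_key] = self._clean(sample[self.text_key])  # _clean is hypothetical
        logger.info(f"fileName: {sample[self.filename_key]}, "
                    f"method: ExampleCleaner costs {time.time() - start:6f} s")
        return sample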
@@ -150,6 +150,7 @@ const OperatorFlow: React.FC<OperatorFlowProps> = ({
                max={selectedOperators.length}
                defaultValue={index + 1}
                className="w-10 h-6 text-xs text-center"
                style={{ width: 60 }}
                autoFocus
                onBlur={(e) => handleIndexChange(operator.id, e.target.value)}
                onKeyDown={(e) => {
@@ -227,9 +227,8 @@ export default function FileTable({result, fetchTaskResult}) {
      dataIndex: "status",
      key: "status",
      filters: [
-       { text: "已完成", value: "已完成" },
-       { text: "失败", value: "失败" },
-       { text: "处理中", value: "处理中" },
+       { text: "已完成", value: "COMPLETED" },
+       { text: "失败", value: "FAILED" },
      ],
      onFilter: (value: string, record: any) => record.status === value,
      render: (status: string) => (
@@ -237,9 +236,7 @@ export default function FileTable({result, fetchTaskResult}) {
          status={
            status === "COMPLETED"
              ? "success"
+             : status === "FAILED"
+               ? "error"
+               : "processing"
-             : "error"
          }
          text={TaskStatusMap[status as TaskStatus].label}
        />
@@ -248,6 +245,7 @@ export default function FileTable({result, fetchTaskResult}) {
    {
      title: "操作",
      key: "action",
      width: 200,
      render: (_text: string, record: any) => (
        <div className="flex">
          {record.status === "COMPLETED" ? (
@@ -33,6 +33,7 @@ class FileWithHighRepeatPhraseRateFilter(Filter):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._file_with_high_repeat_phrase_rate_filter(sample[self.text_key],
                                                                               sample[self.filename_key])
        logger.info(f"fileName: {sample[self.filename_key]}, "
@@ -30,6 +30,7 @@ class FileWithHighRepeatWordRateFilter(Filter):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._file_with_high_repeat_word_rate_filter(sample[self.text_key],
                                                                             sample[self.filename_key])
        logger.info(f"fileName: {sample[self.filename_key]}, "
@@ -26,6 +26,7 @@ class FileWithHighSpecialCharRateFilter(Filter):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._file_with_high_special_char_rate_filter(sample[self.text_key],
                                                                              sample[self.filename_key])
        logger.info(f"fileName: {sample[self.filename_key]}, "
@@ -105,6 +105,7 @@ class ImgAdvertisementImagesCleaner(Filter):

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
+       self.read_file_first(sample)
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
        img_bytes = sample[self.data_key]
@@ -27,6 +27,7 @@ class ImgBlurredImagesCleaner(Filter):

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
+       self.read_file_first(sample)
        img_bytes = sample[self.data_key]
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
@@ -61,6 +61,7 @@ class ImgDuplicatedImagesCleaner(Filter):
    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """重复图片去重算子执行入口"""
        start = time.time()
+       self.read_file_first(sample)
        file_name = sample[self.filename_key]
        self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
        img_data = self._duplicate_images_filter(file_name, sample[self.data_key])
@@ -227,6 +227,7 @@ class ImgSimilarImagesCleaner(Filter):
    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """去除相似图片算子执行入口"""
        start = time.time()
+       self.read_file_first(sample)
        file_name = sample[self.filename_key]
        img_bytes = sample[self.data_key]
        data = bytes_to_numpy(img_bytes) if img_bytes else np.array([])
@@ -150,6 +150,7 @@ class DuplicateFilesFilter(Filter):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        file_name = sample[self.filename_key]
        self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
        sample[self.text_key] = self.deduplicate_files(sample, file_name)
@@ -90,6 +90,7 @@ class FileWithManySensitiveWordsFilter(Filter):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._file_with_many_sensitive_words_filter(sample[self.text_key],
                                                                            sample[self.filename_key])
        logger.info(f"fileName: {sample[self.filename_key]}, "
@@ -31,6 +31,7 @@ class FileWithShortOrLongLengthFilter(Filter):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._file_with_short_or_long_length_filter(sample[self.text_key],
                                                                            sample[self.filename_key])
        logger.info(f"fileName: {sample[self.filename_key]}, "
@@ -15,12 +15,7 @@ _configure_importer()


def _import_operators():
-   from . import text_formatter
-   from . import word_formatter
-   from . import img_formatter
-   from . import file_exporter
    from . import slide_formatter
-   from . import unstructured_formatter
    from . import mineru_formatter

@@ -1,6 +0,0 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='FileExporter',
                          module_path="ops.formatter.file_exporter.process")
@@ -1,16 +0,0 @@
name: '落盘算子'
name_en: 'save file operator'
description: '将文件内容保存为文件。'
description_en: 'Save the file data as a file.'
language: 'Python'
vendor: 'Huawei'
raw_id: 'FileExporter'
version: '1.0.0'
types:
  - 'collect'
modal: 'others'
effect:
  before: ''
  after: ''
inputs: 'all'
outputs: 'all'
@@ -1,145 +0,0 @@
#!/user/bin/python
# -*- coding: utf-8 -*-

"""
Description: Json文本抽取
Create: 2024/06/06 15:43
"""
import time
import os
import uuid
from typing import Tuple, Dict, Any
from loguru import logger

from datamate.core.constant import Fields
from datamate.core.base_op import Mapper
from datamate.common.utils import check_valid_path


class FileExporter(Mapper):
    """把输入的json文件流抽取为txt"""

    def __init__(self, *args, **kwargs):
        super(FileExporter, self).__init__(*args, **kwargs)
        self.last_ops = True
        self.text_support_ext = kwargs.get("text_support_ext", ['txt', 'html', 'md', 'markdown',
                                                                'xlsx', 'xls', 'csv', 'pptx', 'ppt',
                                                                'xml', 'json', 'doc', 'docx', 'pdf'])
        self.data_support_ext = kwargs.get("data_support_ext", ['jpg', 'jpeg', 'png', 'bmp'])
        self.medical_support_ext = kwargs.get("medical_support_ext", ['svs', 'tif', 'tiff'])

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        file_name = sample[self.filename_key]
        file_type = sample[self.filetype_key]

        try:
            start = time.time()
            if file_type in self.text_support_ext:
                sample, save_path = self.get_textfile_handler(sample)
            elif file_type in self.data_support_ext:
                sample, save_path = self.get_datafile_handler(sample)
            elif file_type in self.medical_support_ext:
                sample, save_path = self.get_medicalfile_handler(sample)
            else:
                raise TypeError(f"{file_type} is unsupported! please check support_ext in FileExporter Ops")

            if sample[self.text_key] == '' and sample[self.data_key] == b'':
                sample[self.filesize_key] = "0"
                return sample

            if save_path:
                self.save_file(sample, save_path)
                sample[self.text_key] = ''
                sample[self.data_key] = b''
                sample[Fields.result] = True

                file_type = save_path.split('.')[-1]
                sample[self.filetype_key] = file_type

                base_name, _ = os.path.splitext(file_name)
                new_file_name = base_name + '.' + file_type
                sample[self.filename_key] = new_file_name

                base_name, _ = os.path.splitext(save_path)
                sample[self.filepath_key] = base_name
                file_size = os.path.getsize(base_name)
                sample[self.filesize_key] = f"{file_size}"

                logger.info(f"origin file named {file_name} has been save to {save_path}")
            logger.info(f"fileName: {sample[self.filename_key]}, "
                        f"method: FileExporter costs {time.time() - start:.6f} s")
        except UnicodeDecodeError as err:
            logger.error(f"fileName: {sample[self.filename_key]}, "
                         f"method: FileExporter causes decode error: {err}")
            raise
        return sample

    def get_save_path(self, sample: Dict[str, Any], target_type) -> str:
        export_path = os.path.abspath(sample[self.export_path_key])
        file_name = sample[self.filename_key]
        new_file_name = os.path.splitext(file_name)[0] + '.' + target_type

        if not check_valid_path(export_path):
            os.makedirs(export_path, exist_ok=True)
        res = os.path.join(export_path, new_file_name)
        return res

    def get_textfile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
        target_type = sample.get("target_type", None)

        # target_type存在则保存为扫描件, docx格式
        if target_type:
            sample = self._get_from_data(sample)
            save_path = self.get_save_path(sample, target_type)
        # 不存在则保存为txt文件,正常文本清洗
        else:
            sample = self._get_from_text(sample)
            save_path = self.get_save_path(sample, 'txt')
        return sample, save_path

    def get_datafile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
        target_type = sample.get("target_type", None)

        # target_type存在, 图转文保存为target_type,markdown格式
        if target_type:
            sample = self._get_from_text(sample)
            save_path = self.get_save_path(sample, target_type)
        # 不存在则保存为原本图片文件格式,正常图片清洗
        else:
            sample = self._get_from_data(sample)
            save_path = self.get_save_path(sample, sample[self.filetype_key])
        return sample, save_path

    def get_medicalfile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
        target_type = 'png'

        sample = self._get_from_data(sample)
        save_path = self.get_save_path(sample, target_type)

        return sample, save_path

    def save_file(self, sample, save_path):
        file_name, _ = os.path.splitext(save_path)
        # 以二进制格式保存文件
        file_sample = sample[self.text_key].encode('utf-8') if sample[self.text_key] else sample[self.data_key]
        with open(file_name, 'wb') as f:
            f.write(file_sample)
        # 获取父目录路径
        parent_dir = os.path.dirname(file_name)
        os.chmod(parent_dir, 0o770)
        os.chmod(file_name, 0o640)

    def _get_from_data(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        sample[self.data_key] = bytes(sample[self.data_key])
        sample[self.text_key] = ''
        return sample

    def _get_from_text(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        sample[self.data_key] = b''
        sample[self.text_key] = str(sample[self.text_key])
        return sample

    def _get_uuid(self):
        res = str(uuid.uuid4())
        return res
@@ -1,6 +0,0 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ImgFormatter',
                          module_path="ops.formatter.img_formatter.process")
@@ -1,16 +0,0 @@
name: '读取图片文件'
name_en: 'Image File Reader'
description: '读取图片文件。'
description_en: 'Reads image files.'
language: 'Python'
vendor: 'Huawei'
raw_id: 'ImgFormatter'
version: '1.0.0'
types:
  - 'collect'
modal: 'image'
effect:
  before: ''
  after: ''
inputs: 'image'
outputs: 'image'
@@ -1,35 +0,0 @@
# # -- encoding: utf-8 --

#
# Description:
# Create: 2024/1/30 15:24
# """
import time
from typing import Dict, Any

import cv2
import numpy as np
from loguru import logger

from datamate.common.utils import numpy_to_bytes
from datamate.core.base_op import Mapper


class ImgFormatter(Mapper):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
        file_path = sample[self.filepath_key]
        img_data = _img_extract(file_path)
        sample[self.data_key] = numpy_to_bytes(img_data, file_type)
        logger.info(f"fileName: {file_name}, method: ImgExtract costs {(time.time() - start):6f} s")
        return sample


def _img_extract(file_path):
    return cv2.imdecode(np.fromfile(file_path, dtype=np.uint8), -1)
@@ -1,6 +0,0 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='TextFormatter',
                          module_path="ops.formatter.text_formatter.process")
@@ -1,16 +0,0 @@
name: 'TXT文本抽取'
name_en: 'TXT Text Extraction'
description: '抽取TXT中的文本'
description_en: 'Extracts text from TXT files.'
language: 'python'
vendor: 'huawei'
raw_id: 'TxtFormatter'
version: '1.0.0'
types:
  - 'collect'
modal: 'text'
effect:
  before: ''
  after: ''
inputs: 'text'
outputs: 'text'
@@ -1,44 +0,0 @@
#!/user/bin/python
# -*- coding: utf-8 -*-

"""
Description: Json文本抽取
Create: 2024/06/06 15:43
"""
import time
from loguru import logger
from typing import Dict, Any

from datamate.core.base_op import Mapper


class TextFormatter(Mapper):
    """把输入的json文件流抽取为txt"""

    def __init__(self, *args, **kwargs):
        super(TextFormatter, self).__init__(*args, **kwargs)

    @staticmethod
    def _extract_json(byte_io):
        """将默认使用utf-8编码的Json文件流解码,抽取为txt"""
        # 用utf-8-sig的格式进行抽取,可以避免uft-8 BOM编码格式的文件在抽取后产生隐藏字符作为前缀。
        return byte_io.decode("utf-8-sig").replace("\r\n", "\n")

    def byte_read(self, sample: Dict[str, Any]):
        filepath = sample[self.filepath_key]
        with open(filepath, "rb") as file:
            byte_data = file.read()
        sample[self.data_key] = byte_data

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        try:
            self.byte_read(sample)
            sample[self.text_key] = self._extract_json(sample[self.data_key])
            sample[self.data_key] = b""  # 将sample[self.data_key]置空
            logger.info(
                f"fileName: {sample[self.filename_key]}, method: TextFormatter costs {(time.time() - start):6f} s")
        except UnicodeDecodeError as err:
            logger.exception(f"fileName: {sample[self.filename_key]}, method: TextFormatter causes decode error: {err}")
            raise
        return sample
@@ -1,6 +0,0 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='UnstructuredFormatter',
                          module_path="ops.formatter.unstructured_formatter.process")
@@ -1,16 +0,0 @@
name: 'Unstructured文本抽取'
name_en: 'Unstructured Text Extraction'
description: '抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。'
description_en: 'Extracts text from Unstructured files, currently supporting PowerPoint presentations, Word documents and Excel spreadsheets files.'
language: 'python'
vendor: 'huawei'
raw_id: 'UnstructuredFormatter'
version: '1.0.0'
types:
  - 'collect'
modal: 'text'
effect:
  before: ''
  after: ''
inputs: 'text'
outputs: 'text'
@@ -1,37 +0,0 @@

#!/user/bin/python
# -*- coding: utf-8 -*-

"""
Description: 非结构化文本抽取
Create: 2025/10/22 15:15
"""
import time
from typing import Dict, Any

from loguru import logger
from unstructured.partition.auto import partition

from datamate.core.base_op import Mapper


class UnstructuredFormatter(Mapper):
    """把输入的非结构化文本抽取为txt"""

    def __init__(self, *args, **kwargs):
        super(UnstructuredFormatter, self).__init__(*args, **kwargs)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        filepath = sample.get(self.filepath_key)
        filename = sample.get(self.filename_key)
        if not filename.lower().endswith((".ppt", ".pptx", "docx", "xlsx", ".csv")):
            return sample
        try:
            elements = partition(filename=filepath)
            sample[self.text_key] = "\n\n".join([str(el) for el in elements])
            logger.info(f"fileName: {filename}, method: UnstructuredFormatter costs {(time.time() - start):6f} s")
        except UnicodeDecodeError as err:
            logger.exception(f"fileName: {filename}, method: UnstructuredFormatter causes decode error: {err}")
            raise
        return sample
@@ -1,6 +0,0 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='WordFormatter',
                          module_path="ops.formatter.word_formatter.process")
@@ -1,16 +0,0 @@
name: 'Word文本抽取'
name_en: 'Word Text Extraction'
description: '抽取Word中的文本'
description_en: 'Extracts text from Word files.'
language: 'java'
vendor: 'huawei'
raw_id: 'WordFormatter'
version: '1.0.0'
types:
  - 'collect'
modal: 'text'
effect:
  before: ''
  after: ''
inputs: 'text'
outputs: 'text'
@@ -1,68 +0,0 @@
# # -- encoding: utf-8 --

#
# Description:
# Create: 2024/1/30 15:24
# """
from loguru import logger
import os
import subprocess
import time
from typing import Dict, Any

from datamate.common.utils import check_valid_path
from datamate.core.base_op import Mapper


class WordFormatter(Mapper):
    SEPERATOR = ' | '

    def __init__(self, *args, **kwargs):
        super(WordFormatter, self).__init__(*args, **kwargs)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        file_name = sample[self.filename_key]
        file_path = sample[self.filepath_key]
        file_type = sample[self.filetype_key]
        txt_content = self.word2html(file_path, file_type)
        sample[self.text_key] = txt_content
        logger.info(f"fileName: {file_name}, method: WordFormatter costs {(time.time() - start):6f} s")
        return sample

    @staticmethod
    def word2html(file_path, file_type):
        check_valid_path(file_path)
        file_dir = file_path.rsplit('/', 1)[0]
        file_name = file_path.rsplit('/', 1)[1]
        html_file_path = os.path.join(file_dir, f"{file_name}.txt")

        current_file_path = os.path.dirname(os.path.abspath(__file__))
        try:
            process = subprocess.Popen(
                ['java', '-jar', f'{current_file_path}/../../../java_operator/WordFormatter-1.0.jar', file_path,
                 html_file_path, file_type], shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
            stdout, stderr = process.communicate(timeout=24 * 60 * 60)
            if process.returncode == 0:
                logger.info(f"Convert {file_path} successfully to DOCX")
            else:
                logger.info(f"Convert {file_path} failed, error: {stderr.strip().decode('utf-8')}.")
                raise RuntimeError()
        except subprocess.CalledProcessError as e:
            logger.error(f"Convert failed: {e}, return code: {e.returncode}")
        except FileNotFoundError:
            logger.error("LibreOffice command not found, please make sure it is available in PATH")
        except Exception as e:
            logger.error(f"An unexpected error occurred, convert failed: {e}", )

        try:
            with open(html_file_path, 'r', encoding='utf-8') as file:
                txt_content = file.read()
            os.remove(html_file_path)
            logger.info("Tmp docx file removed")
        except FileNotFoundError:
            logger.error(f"Tmp file {html_file_path} does not exist")
        except PermissionError:
            logger.error(f"You are not allowed to delete tmp file {html_file_path}")
        logger.info(f"Convert {html_file_path} to html success")
        return txt_content
@@ -30,6 +30,7 @@ class ContentCleaner(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._content_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, method: ContentCleaner costs {time.time() - start:6f} s")
        return sample
@@ -64,6 +64,7 @@ class AnonymizedCreditCardNumber(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._credit_card_number_filter(sample[self.text_key])
        logger.info(
            f"fileName: {sample[self.filename_key]}, method: CreditCardNumberCleaner costs {time.time() - start:6f} s")
@@ -25,6 +25,7 @@ class EmailNumberCleaner(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._email_number_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, method: EmailCleaner costs {time.time() - start:6f} s")
        return sample
@@ -22,6 +22,7 @@ class EmojiCleaner(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._emoji_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, method: EmojiCleaner costs {time.time() - start:6f} s")
        return sample
@@ -41,6 +41,7 @@ class ExtraSpaceCleaner(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._clean_extra_space(sample[self.text_key])
        logger.info(
            f"fileName: {sample[self.filename_key]}, method: ExtraSpaceCleaner costs {time.time() - start:6f} s")
@@ -34,6 +34,7 @@ class FullWidthCharacterCleaner(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._full_width_character_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, "
                    f"method: FullWidthCharactersCleaner costs {time.time() - start:6f} s")
@@ -44,6 +44,7 @@ class GrableCharactersCleaner(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._grable_characters_filter(sample[self.text_key])
        logger.info(
            f"fileName: {sample[self.filename_key]}, method: GrableCharactersCleaner costs {time.time() - start:6f} s")
@@ -64,6 +64,7 @@ class HtmlTagCleaner(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        if sample[self.filetype_key] != "xml":
            sample[self.text_key] = self._remove_html_tags(sample[self.text_key])
        logger.info(
@@ -71,6 +71,7 @@ class AnonymizedIdNumber(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._id_number_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, method: IDNumberCleaner costs {time.time() - start:6f} s")
        return sample
@@ -28,6 +28,7 @@ class ImgDenoise(Mapper):

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
+       self.read_file_first(sample)

        img_bytes = sample[self.data_key]
@@ -97,6 +97,7 @@ class ImgDirectionCorrect(Mapper):

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
+       self.read_file_first(sample)
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
        img_bytes = sample[self.data_key]
@@ -88,6 +88,7 @@ class ImgBrightness(Mapper):

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
+       self.read_file_first(sample)
        img_bytes = sample[self.data_key]
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
@@ -59,6 +59,7 @@ class ImgContrast(Mapper):

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
+       self.read_file_first(sample)
        img_bytes = sample[self.data_key]
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
@@ -69,6 +69,7 @@ class ImgSaturation(Mapper):

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
+       self.read_file_first(sample)
        img_bytes = sample[self.data_key]
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
@@ -57,6 +57,7 @@ class ImgSharpness(Mapper):

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
+       self.read_file_first(sample)
        img_bytes = sample[self.data_key]
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
@@ -25,6 +25,7 @@ class ImgPerspectiveTransformation(Mapper):

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
+       self.read_file_first(sample)
        img_bytes = sample[self.data_key]
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
@@ -29,6 +29,7 @@ class ImgResize(Mapper):

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
+       self.read_file_first(sample)
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
        img_bytes = sample[self.data_key]
@@ -60,6 +60,7 @@ class ImgShadowRemove(Mapper):

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
+       self.read_file_first(sample)
        img_bytes = sample[self.data_key]
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
@@ -21,6 +21,7 @@ class ImgTypeUnify(Mapper):

    def execute(self, sample):
        start = time.time()
+       self.read_file_first(sample)
        file_name = sample[self.filename_key]
        origin_file_type = sample[self.filetype_key]
        if origin_file_type == self._setting_type:
@@ -80,6 +80,7 @@ class ImgWatermarkRemove(Mapper):

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
+       self.read_file_first(sample)
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
        img_bytes = sample[self.data_key]
@@ -24,6 +24,7 @@ class InvisibleCharactersCleaner(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._invisible_characters_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, "
                    f"method: InvisibleCharactersCleaner costs {time.time() - start:6f} s")
@@ -37,6 +37,7 @@ class AnonymizedIpAddress(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._ip_address_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, method: IPAddressCleaner costs {time.time() - start:6f} s")
        return sample
@@ -35,6 +35,7 @@ class KnowledgeRelationSlice(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start_time = time.time()
+       self.read_file_first(sample)

        chunk_item = get_json_list(sample[self.text_key], chunk_size=self.chunk_size, overlap_size=self.overlap_size)
        chunk_item_json = json.dumps(chunk_item, ensure_ascii=False)
@@ -36,6 +36,7 @@ class LegendCleaner(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._clean_html_tag(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, method: LegendCleaner costs {time.time() - start:6f} s")
        return sample
@@ -37,6 +37,7 @@ class AnonymizedPhoneNumber(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._phone_number_filter(sample[self.text_key])
        logger.info(
            f"fileName: {sample[self.filename_key]}, method: PhoneNumberCleaner costs {time.time() - start:6f} s")
@@ -53,6 +53,7 @@ class PoliticalWordCleaner(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._political_word_filter(sample[self.text_key])
        logger.info(
            f"fileName: {sample[self.filename_key]}, method: PoliticalWordCleaner costs {time.time() - start:6f} s")
@@ -47,8 +47,8 @@ def duplicate_sentences_filter(input_data: str, file_name: str, duplicate_th: int
            paragraph_counts[paragraph_strip] = -1

    except Exception as err:
-       logger.exception(f"fileName: {file_name}, method: RemoveDuplicateSentencess. An error occurred when using "
-                        f"filtering duplicate sentences. The error is: {err}")
+       logger.exception(f"fileName: {file_name}, method: RemoveDuplicateSentencess. An error occurred when using "
+                        f"filtering duplicate sentences. The error is: {err}")
        return input_data

    # 将去重后的段落重新组合成文本
@@ -63,6 +63,7 @@ class DuplicateSentencesFilter(Filter):
        duplicate_th = 5  # 段落重复次数阈值
        file_name = sample[self.filename_key]
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = duplicate_sentences_filter(sample[self.text_key], file_name, duplicate_th)
        logger.info(f"fileName: {file_name}, RemoveDuplicateSentencess costs {time.time() - start:6f} s")
        return sample
@@ -56,6 +56,7 @@ class SexualAndViolentWordCleaner(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._sexual_and_violent_word_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, "
                    f"method: SexualAndViolentWordCleaner costs {time.time() - start:6f} s")
@@ -61,6 +61,7 @@ class TextToWord(Mapper):
    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """将文本信息转换为docx文件流"""
        start = time.time()
+       self.read_file_first(sample)
        sample[self.data_key] = self._txt_to_docx(sample[self.text_key])  # 将文字转换为word字符串流
        sample[self.text_key] = ""
        sample["target_type"] = "docx"
@@ -27,6 +27,7 @@ class TraditionalChineseCleaner(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._traditional_chinese_filter(sample[self.text_key])
        logger.info(
            f"fileName: {sample[self.filename_key]}, method: TraditionalChinese costs {time.time() - start:6f} s")
@@ -23,6 +23,7 @@ class UnicodeSpaceCleaner(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._clean_unicode_space(sample[self.text_key])
        logger.info(
            f"fileName: {sample[self.filename_key]}, method: UnicodeSpaceCleaner costs {time.time() - start:6f} s")
@@ -26,6 +26,7 @@ class AnonymizedUrlCleaner(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        sample[self.text_key] = self._url_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, method: UrlCleaner costs {time.time() - start:6f} s")
        return sample
@@ -52,6 +52,7 @@ class XMLTagCleaner(Mapper):

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
+       self.read_file_first(sample)
        file_name = sample[self.filename_key]
        if sample[self.filetype_key] == "xml":
            try:
runtime/ops/pyproject.toml (new file, 28 lines)
@@ -0,0 +1,28 @@
[project]
name = "ops"
version = "0.0.1"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "beautifulsoup4>=4.14.3",
    "datasketch>=1.8.0",
    "email-validator>=2.3.0",
    "emoji>=2.15.0",
    "jieba>=0.42.1",
    "loguru>=0.7.3",
    "numpy>=2.2.0,<=2.2.6",
    "opencv-contrib-python-headless>=4.12.0.88",
    "opencv-python-headless>=4.12.0.88",
    "openslide-python>=1.4.3",
    "paddleocr>=3.3.2",
    "pandas>=2.2.0,<=2.2.3",
    "pycryptodome>=3.23.0",
    "pymysql>=1.1.2",
    "python-docx>=1.2.0",
    "pytz>=2025.2",
    "six>=1.17.0",
    "sqlalchemy>=2.0.44",
    "xmltodict>=1.0.2",
    "zhconv>=1.4.3",
]
@@ -1,22 +0,0 @@
beautifulsoup4==4.14.2
datamate==0.0.1
datasketch==1.6.5
email_validator==2.3.0
emoji==2.2.0
jieba==0.42.1
loguru==0.7.3
numpy==2.2.6
opencv_contrib_python-headless==4.10.0.84
opencv_python-headless==4.12.0.88
openslide_python==1.4.2
paddleocr==3.2.0
pandas==2.2.3
pycryptodome==3.23.0
python_docx==1.2.0
pytz==2025.2
six==1.17.0
xmltodict==1.0.2
zhconv==1.4.3
sqlalchemy==2.0.40
pymysql==1.1.1
unstructured[docx,csv,xlsx,pptx]==0.18.15
@@ -2,10 +2,15 @@

import json
+import os
import time
import traceback
+import uuid
from typing import List, Dict, Any, Tuple

+import cv2
+import numpy as np
from loguru import logger
+from unstructured.partition.auto import partition

from datamate.common.error_code import ERROR_CODE_TABLE, UNKNOWN_ERROR_CODE
from datamate.common.utils.llm_request import LlmReq
@@ -52,6 +57,7 @@ class BaseOp:
    def __init__(self, *args, **kwargs):
        self.accelerator = kwargs.get('accelerator', "cpu")
        self.is_last_op = kwargs.get('is_last_op', False)
+       self.is_first_op = kwargs.get('is_first_op', False)
        self._name = kwargs.get('op_name', None)
        self.infer_model = None
        self.text_key = kwargs.get('text_key', "text")
@@ -122,10 +128,10 @@ class BaseOp:
        raise NotImplementedError("This is in BaseOp, plese re-define this method in Sub-classes")

    def fill_sample_params(self, sample: Dict[str, Any], **kwargs):
-       if not sample.get("text", None):
+       if not sample.get(self.text_key, None):
            sample[self.text_key] = ""

-       if not sample.get("data", None):
+       if not sample.get(self.data_key, None):
            sample[self.data_key] = b""

        if not sample[self.data_key] and not sample[self.text_key]:
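The fill_sample_params change matters once an operator is constructed with non-default keys: the old guard checked the literal "text"/"data" keys, so with text_key set to, say, "content", a sample that lacked a "text" key had its real content clobbered with the empty default. A toy illustration (hypothetical key names, not the project's API):

# Why the hard-coded lookup was wrong, assuming text_key = "content".
sample = {"content": "already extracted text"}
text_key = "content"

if not sample.get("text", None):    # old guard: "text" is absent, so it fires
    sample[text_key] = ""           # and wipes sample["content"]

sample = {"content": "already extracted text"}
if not sample.get(text_key, None):  # new guard: checks the configured key
    sample[text_key] = ""           # not taken; content preserved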
@@ -137,6 +143,27 @@ class BaseOp:
        failed_reason = {"op_name": op_name, "error_code": error_code, "reason": exc_info}
        sample["failed_reason"] = failed_reason

+   def read_file(self, sample):
+       filepath = sample[self.filepath_key]
+       filetype = sample[self.filetype_key]
+       if filetype in ["ppt", "pptx", "docx", "doc", "xlsx"]:
+           elements = partition(filename=filepath)
+           sample[self.text_key] = "\n\n".join([str(el) for el in elements])
+       elif filetype in ["txt", "md", "markdown", "xml", "html", "csv", "json", "jsonl"]:
+           with open(filepath, 'rb') as f:
+               content = f.read()
+           sample[self.text_key] = content.decode("utf-8-sig").replace("\r\n", "\n")
+       elif filetype in ['jpg', 'jpeg', 'png', 'bmp']:
+           image_np = cv2.imdecode(np.fromfile(filepath, dtype=np.uint8), -1)
+           if image_np.size:
+               data = cv2.imencode(f".{filetype}", image_np)[1]
+               image_bytes = data.tobytes()
+               sample[self.data_key] = image_bytes
+
+   def read_file_first(self, sample):
+       if self.is_first_op:
+           self.read_file(sample)


class Mapper(BaseOp):
    def __init__(self, *args, **kwargs):
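A self-contained sketch of the contract read_file_first() introduces: only the first operator in a pipeline touches the disk, and every later operator receives the already-extracted text in the sample dict (DemoOp and its keys are illustrative, not the project's API):

from typing import Any, Dict

class DemoOp:
    def __init__(self, is_first_op: bool = False):
        self.is_first_op = is_first_op
        self.text_key, self.filepath_key = "text", "filePath"

    def read_file(self, sample: Dict[str, Any]) -> None:
        # Mirrors the text branch above: utf-8-sig tolerates a BOM prefix.
        with open(sample[self.filepath_key], "rb") as f:
            sample[self.text_key] = f.read().decode("utf-8-sig").replace("\r\n", "\n")

    def read_file_first(self, sample: Dict[str, Any]) -> None:
        if self.is_first_op:
            self.read_file(sample)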
@@ -158,15 +185,16 @@ class Mapper(BaseOp):
            logger.error(f"Ops named {self.name} map failed, Error Info: \n"
                         f"{str(get_exception_info(e))}")
            sample["execute_status"] = execute_status
-           task_info = TaskInfoPersistence()
-           task_info.persistence_task_info(sample)
+           sample[self.filesize_key] = "0"
+           sample[self.filetype_key] = ""
+           TaskInfoPersistence().update_task_result(sample)
            raise e

        sample["execute_status"] = execute_status
        # 加载文件成功执行信息到数据库
        if self.is_last_op:
-           task_info = TaskInfoPersistence()
-           task_info.persistence_task_info(sample)
+           if FileExporter().execute(sample):
+               TaskInfoPersistence().persistence_task_info(sample)
        return sample

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
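With the export folded into Mapper.process, the last operator now writes the sample to disk before recording it, and a False return from FileExporter.execute() (text and data both empty, i.e. the file was filtered out) suppresses the result row. A condensed sketch of that gate, with exporter and persistence standing in for FileExporter and TaskInfoPersistence:

def finish_sample(sample, is_last_op, exporter, persistence):
    # Export first; only persist a result row if something was written.
    if is_last_op and exporter.execute(sample):
        persistence.persistence_task_info(sample)
    return sample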
@@ -197,8 +225,9 @@ class Slicer(BaseOp):
            logger.error(f"Ops named {self.name} map failed, Error Info: \n"
                         f"{str(get_exception_info(e))}")
            sample["execute_status"] = execute_status
-           task_info = TaskInfoPersistence()
-           task_info.persistence_task_info(sample)
+           sample[self.filesize_key] = "0"
+           sample[self.filetype_key] = ""
+           TaskInfoPersistence().update_task_result(sample)
            return [sample]

        self.load_sample_to_sample(sample, sample_list)
@@ -206,8 +235,8 @@ class Slicer(BaseOp):

        # 加载文件成功执行信息到数据库
        if self.is_last_op:
-           task_info = TaskInfoPersistence()
-           task_info.persistence_task_info(sample)
+           if FileExporter().execute(sample):
+               TaskInfoPersistence().persistence_task_info(sample)

        return [sample]
@@ -286,22 +315,24 @@ class Filter(BaseOp):
            sample["execute_status"] = execute_status
            logger.error(f"Ops named {self.name} map failed, Error Info: \n"
                         f"{str(get_exception_info(e))}")
-           task_info = TaskInfoPersistence()
-           task_info.persistence_task_info(sample)
+           sample[self.filesize_key] = "0"
+           sample[self.filetype_key] = ""
+           TaskInfoPersistence().update_task_result(sample)
            raise e

        sample["execute_status"] = execute_status
        # 文件无内容会被过滤
        if sample[self.text_key] == "" and sample[self.data_key] == b"":
            task_info = TaskInfoPersistence()
-           sample["fileSize"] = "0"
-           task_info.persistence_task_info(sample)
+           sample[self.filesize_key] = "0"
+           sample[self.filetype_key] = ""
+           task_info.update_task_result(sample)
            return False

        # 加载文件成功执行信息到数据库
        if self.is_last_op:
-           task_info = TaskInfoPersistence()
-           task_info.persistence_task_info(sample)
+           if FileExporter().execute(sample):
+               TaskInfoPersistence().persistence_task_info(sample)
        return True

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
@@ -379,3 +410,131 @@ class LLM(Mapper):
            raise RuntimeError(f"Save jsonl file Failed!, save_path: {save_path}.") from e

        logger.info(f"LLM output has been save to {save_path}.")


class FileExporter(BaseOp):
    """把输入的json文件流抽取为txt"""

    def __init__(self, *args, **kwargs):
        super(FileExporter, self).__init__(*args, **kwargs)
        self.last_ops = True
        self.text_support_ext = kwargs.get("text_support_ext", ['txt', 'html', 'md', 'markdown',
                                                                'xlsx', 'xls', 'csv', 'pptx', 'ppt',
                                                                'xml', 'json', 'doc', 'docx', 'pdf'])
        self.data_support_ext = kwargs.get("data_support_ext", ['jpg', 'jpeg', 'png', 'bmp'])
        self.medical_support_ext = kwargs.get("medical_support_ext", ['svs', 'tif', 'tiff'])

    def execute(self, sample: Dict[str, Any]):
        file_name = sample[self.filename_key]
        file_type = sample[self.filetype_key]

        try:
            start = time.time()
            if file_type in self.text_support_ext:
                sample, save_path = self.get_textfile_handler(sample)
            elif file_type in self.data_support_ext:
                sample, save_path = self.get_datafile_handler(sample)
            elif file_type in self.medical_support_ext:
                sample, save_path = self.get_medicalfile_handler(sample)
            else:
                raise TypeError(f"{file_type} is unsupported! please check support_ext in FileExporter Ops")

            if sample[self.text_key] == '' and sample[self.data_key] == b'':
                sample[self.filesize_key] = "0"
                return False

            if save_path:
                self.save_file(sample, save_path)
                sample[self.text_key] = ''
                sample[self.data_key] = b''
                sample[Fields.result] = True

                file_type = save_path.split('.')[-1]
                sample[self.filetype_key] = file_type

                base_name, _ = os.path.splitext(file_name)
                new_file_name = base_name + '.' + file_type
                sample[self.filename_key] = new_file_name

                base_name, _ = os.path.splitext(save_path)
                sample[self.filepath_key] = base_name
                file_size = os.path.getsize(base_name)
                sample[self.filesize_key] = f"{file_size}"

                logger.info(f"origin file named {file_name} has been save to {save_path}")
            logger.info(f"fileName: {sample[self.filename_key]}, "
                        f"method: FileExporter costs {time.time() - start:.6f} s")
        except UnicodeDecodeError as err:
            logger.error(f"fileName: {sample[self.filename_key]}, "
                         f"method: FileExporter causes decode error: {err}")
            raise
        return True

    def get_save_path(self, sample: Dict[str, Any], target_type):
        export_path = os.path.abspath(sample[self.export_path_key])
        file_name = sample[self.filename_key]
        new_file_name = os.path.splitext(file_name)[0] + '.' + target_type

        if not check_valid_path(export_path):
            os.makedirs(export_path, exist_ok=True)
        return os.path.join(export_path, new_file_name)

    def get_textfile_handler(self, sample: Dict[str, Any]):
        target_type = sample.get("target_type", None)

        # target_type存在则保存为扫描件, docx格式
        if target_type:
            sample = self._get_from_data(sample)
            save_path = self.get_save_path(sample, target_type)
        # 不存在则保存为txt文件,正常文本清洗
        else:
            sample = self._get_from_text(sample)
            save_path = self.get_save_path(sample, 'txt')
        return sample, save_path

    def get_datafile_handler(self, sample: Dict[str, Any]):
        target_type = sample.get("target_type", None)

        # target_type存在, 图转文保存为target_type,markdown格式
        if target_type:
            sample = self._get_from_text(sample)
            save_path = self.get_save_path(sample, target_type)
        # 不存在则保存为原本图片文件格式,正常图片清洗
        else:
            sample = self._get_from_data(sample)
            save_path = self.get_save_path(sample, sample[self.filetype_key])
        return sample, save_path

    def get_medicalfile_handler(self, sample: Dict[str, Any]):
        target_type = 'png'

        sample = self._get_from_data(sample)
        save_path = self.get_save_path(sample, target_type)

        return sample, save_path

    def save_file(self, sample, save_path):
        file_name, _ = os.path.splitext(save_path)
        # 以二进制格式保存文件
        file_sample = sample[self.text_key].encode('utf-8') if sample[self.text_key] else sample[self.data_key]
        with open(file_name, 'wb') as f:
            f.write(file_sample)
        # 获取父目录路径
        parent_dir = os.path.dirname(file_name)
        os.chmod(parent_dir, 0o770)
        os.chmod(file_name, 0o640)

    def _get_from_data(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        sample[self.data_key] = bytes(sample[self.data_key])
        sample[self.text_key] = ''
        return sample

    def _get_from_text(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        sample[self.data_key] = b''
        sample[self.text_key] = str(sample[self.text_key])
        return sample

    @staticmethod
    def _get_uuid():
        return str(uuid.uuid4())
@@ -119,6 +119,8 @@ class RayDataset(BasicDataset):

            # 加载Ops module
            temp_ops = self.load_ops_module(op_name)
+           if index == 0:
+               init_kwargs["is_first_op"] = True

            if index == len(cfg_process) - 1:
                init_kwargs["is_last_op"] = True
@@ -182,7 +184,8 @@ class RayDataset(BasicDataset):
                                      fn_kwargs=kwargs,
                                      resources=resources,
-                                     compute=rd.ActorPoolStrategy(min_size=1,
-                                                                  max_size=int(max_actor_nums)))
+                                     num_cpus=0.05,
+                                     concurrency=(1, 1 if operators_cls.use_model else int(max_actor_nums)))

        elif issubclass(operators_cls, (Slicer, RELATIVE_Slicer)):
            self.data = self.data.flat_map(operators_cls,
@@ -190,7 +193,8 @@ class RayDataset(BasicDataset):
                                           fn_kwargs=kwargs,
                                           resources=resources,
-                                          compute=rd.ActorPoolStrategy(min_size=1,
-                                                                       max_size=int(max_actor_nums)))
+                                          num_cpus=0.05,
+                                          concurrency=(1, int(max_actor_nums)))

        elif issubclass(operators_cls, (Filter, RELATIVE_Filter)):
            self.data = self.data.filter(operators_cls,
@@ -198,7 +202,8 @@ class RayDataset(BasicDataset):
                                         fn_kwargs=kwargs,
                                         resources=resources,
-                                        compute=rd.ActorPoolStrategy(min_size=1,
-                                                                     max_size=int(max_actor_nums)))
+                                        num_cpus=0.05,
+                                        concurrency=(1, int(max_actor_nums)))
        else:
            logger.error(
                'Ray executor only support Filter, Mapper and Slicer OPs for now')
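The scheduling change in the three hunks above swaps Ray Data's ActorPoolStrategy for the concurrency tuple that map/flat_map/filter accept in recent Ray releases (the pin moves to ray==2.52.1); (1, n) autoscales the actor pool between one and n actors. A toy example of the call shape (not the project's classes):

import ray

class AddOne:
    def __call__(self, row):
        row["id"] += 1
        return row

# A class-based UDF runs in an autoscaling actor pool of 1..4 actors,
# each reserving 0.05 CPU, matching the signature used in the hunks above.
ds = ray.data.range(8).map(AddOne, concurrency=(1, 4), num_cpus=0.05)
print(ds.take_all())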
@@ -25,13 +25,13 @@ class TaskInfoPersistence:
        with open(sql_config_path, 'r', encoding='utf-8') as f:
            return json.load(f)

-   def persistence_task_info(self, sample: Dict[str, Any]):
+   def update_task_result(self, sample, file_id = str(uuid.uuid4())):
        instance_id = str(sample.get("instance_id"))
        src_file_name = str(sample.get("sourceFileName"))
        src_file_type = str(sample.get("sourceFileType"))
        src_file_id = str(sample.get("sourceFileId"))
        src_file_size = int(sample.get("sourceFileSize"))
-       file_id = str(uuid.uuid4())

        file_size = str(sample.get("fileSize"))
        file_type = str(sample.get("fileType"))
        file_name = str(sample.get("fileName"))
@@ -53,6 +53,10 @@ class TaskInfoPersistence:
        }
        self.insert_result(result_data, str(self.sql_dict.get("insert_clean_result_sql")))

+   def update_file_result(self, sample, file_id):
+       file_size = str(sample.get("fileSize"))
+       file_type = str(sample.get("fileType"))
+       file_name = str(sample.get("fileName"))
        dataset_id = str(sample.get("dataset_id"))
        file_path = str(sample.get("filePath"))
        create_time = datetime.now()
@@ -72,6 +76,11 @@ class TaskInfoPersistence:
        }
        self.insert_result(file_data, str(self.sql_dict.get("insert_dataset_file_sql")))

+   def persistence_task_info(self, sample: Dict[str, Any]):
+       file_id = str(uuid.uuid4())
+       self.update_task_result(sample, file_id)
+       self.update_file_result(sample, file_id)
+
    @staticmethod
    def insert_result(data, sql):
        retries = 0
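One detail worth flagging in the new signature above: file_id = str(uuid.uuid4()) as a default is evaluated once, when the function is defined, so every call that omits file_id reuses the same UUID; persistence_task_info() passes an explicit id, but the error paths in base_op call update_task_result(sample) with the default. The usual fix looks like this (a sketch, not what the commit does):

import uuid
from typing import Optional

def update_task_result(sample: dict, file_id: Optional[str] = None) -> str:
    # Generate a fresh id per call; a str(uuid.uuid4()) default would be
    # frozen at definition time and shared across all defaulted calls.
    file_id = file_id or str(uuid.uuid4())
    return file_id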
@@ -16,27 +16,13 @@ classifiers = [

# Core dependencies
dependencies = [
-   "uvicorn[standard]",
-   "fastapi",
-   "loguru",
-   "jsonargparse",
-   "ray[default, data]==2.46.0",
-   "opencv-python"
-]
-
-[project.optional-dependencies]
-dj = [
-   "py-data-juicer~=1.4.0"
-]
-
-op = [
-   "python-docx==1.1.0"
-]
-
-# All dependencies
-all = [
-   "datamate[dj]",
-   "datamate[op]"
+   "fastapi>=0.123.9",
+   "jsonargparse>=4.44.0",
+   "loguru>=0.7.3",
+   "opencv-python-headless>=4.12.0.88",
+   "ray[data,default]==2.52.1",
+   "unstructured[csv,docx,pptx,xlsx]==0.18.15",
+   "uvicorn[standard]>=0.38.0",
]

[build-system]
@@ -59,8 +59,7 @@ VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', '文本清洗模板', '文本清洗模板'),
       ('4421504e-c6c9-4760-b55a-509d17429597', '图片清洗模板', '图片清洗模板');

INSERT IGNORE INTO t_operator_instance(instance_id, operator_id, op_index, settings_override)
-VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'TextFormatter', 1, null),
-       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 2, null),
+VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 2, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatWordRateFilter', 3, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatPhraseRateFilter', 4, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighSpecialCharRateFilter', 5, null),
@@ -85,10 +84,10 @@ VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'TextFormatter', 1, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmailNumberCleaner', 24, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIpAddress', 25, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIdNumber', 26, null),
-      ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 27, null),
-      ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileExporter', 28, null),
-      ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgFormatter', 1, null),
-      ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner', 2, null),
+      ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 27, null);
+
+INSERT IGNORE INTO t_operator_instance(instance_id, operator_id, op_index, settings_override)
+VALUES ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner', 2, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDuplicatedImagesCleaner', 3, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSimilarImagesCleaner', 4, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBrightness', 5, null),
@@ -99,5 +98,4 @@ VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'TextFormatter', 1, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgShadowRemove', 10, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgPerspectiveTransformation', 11, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgResize', 12, null),
-      ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgTypeUnify', 13, null),
-      ('4421504e-c6c9-4760-b55a-509d17429597', 'FileExporter', 14, null);
+      ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgTypeUnify', 13, null);
@@ -67,10 +67,7 @@ VALUES ('64465bec-b46b-11f0-8291-00155d0e4808', '模态', 'modal', 'predefined'

INSERT IGNORE INTO t_operator
    (id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
-VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
-      ('UnstructuredFormatter', 'Unstructured文本抽取', '基于Unstructured抽取非结构化文件的文本,目前支持PowerPoint演示文稿、Word文档以及Excel工作簿。', '1.0.0', 'text', 'text', null, null, '', false),
-      ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
-      ('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'multimodal', 'multimodal', null, null, '', false),
+VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取PDF中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
       ('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', 'false'),
       ('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
       ('FileWithHighSpecialCharRateFilter', '文档特殊字符率检查', '去除特殊字符过多的文档。', '1.0.0', 'text', 'text', null, '{"specialCharRatio": {"name": "文档特殊字符率", "description": "特殊字符的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.3, "min": 0, "max": 1, "step": 0.1}}', '', 'false'),
@@ -97,7 +94,6 @@ VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0',
       ('UnicodeSpaceCleaner', '空格标准化', '将文档中不同的 unicode 空格,如 u2008,转换为正常空格\\u0020。', '1.0.0', 'text', 'text', null, null, '', 'false'),
       ('AnonymizedUrlCleaner', 'URL网址匿名化', '将文档中的url网址匿名化。', '1.0.0', 'text', 'text', null, null, '', 'false'),
       ('XMLTagCleaner', 'XML标签去除', '去除XML中的标签。', '1.0.0', 'text', 'text', null, null, '', 'false'),
-      ('ImgFormatter', '读取图片文件', '读取图片文件。', '1.0.0', 'image', 'image', null, null, '', 'false'),
       ('ImgBlurredImagesCleaner', '模糊图片过滤', '去除模糊的图片。', '1.0.0', 'image', 'image', null, '{"blurredThreshold": {"name": "梯度函数值", "description": "梯度函数值取值越小,图片模糊度越高。", "type": "slider", "defaultVal": 1000, "min": 1, "max": 10000, "step": 1}}', '', 'false'),
       ('ImgBrightness', '图片亮度增强', '自适应调节图片的亮度。', '1.0.0', 'image', 'image', null, null, '', 'false'),
       ('ImgContrast', '图片对比度增强', '自适应调节图片的对比度。', '1.0.0', 'image', 'image', null, null, '', 'false'),
@@ -117,7 +113,7 @@ SELECT c.id, o.id
FROM t_operator_category c
CROSS JOIN t_operator o
WHERE c.id IN ('d8a5df7a-52a9-42c2-83c4-01062e60f597', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3')
-  AND o.id IN ('TextFormatter', 'FileWithShortOrLongLengthFilter', 'FileWithHighRepeatPhraseRateFilter',
+  AND o.id IN ('FileWithShortOrLongLengthFilter', 'FileWithHighRepeatPhraseRateFilter',
               'FileWithHighRepeatWordRateFilter', 'FileWithHighSpecialCharRateFilter', 'FileWithManySensitiveWordsFilter',
               'DuplicateFilesFilter', 'DuplicateSentencesFilter', 'AnonymizedCreditCardNumber', 'AnonymizedIdNumber',
               'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
@@ -130,13 +126,6 @@ SELECT c.id, o.id
FROM t_operator_category c
CROSS JOIN t_operator o
WHERE c.id IN ('de36b61c-9e8a-4422-8c31-d30585c7100f', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3')
-  AND o.id IN ('ImgFormatter', 'ImgBlurredImagesCleaner', 'ImgBrightness', 'ImgContrast', 'ImgDenoise',
+  AND o.id IN ('ImgBlurredImagesCleaner', 'ImgBrightness', 'ImgContrast', 'ImgDenoise',
               'ImgDuplicatedImagesCleaner', 'ImgPerspectiveTransformation', 'ImgResize', 'ImgSaturation',
               'ImgShadowRemove', 'ImgSharpness', 'ImgSimilarImagesCleaner', 'ImgTypeUnify');
-
-INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
-SELECT c.id, o.id
-FROM t_operator_category c
-CROSS JOIN t_operator o
-WHERE c.id IN ('4d7dbd77-0a92-44f3-9056-2cd62d4a71e4', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3')
-  AND o.id IN ('FileExporter', 'UnstructuredFormatter');
@@ -16,7 +16,7 @@ WORKDIR /opt/runtime

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install -e . --system \
-   && uv pip install -r /opt/runtime/datamate/ops/requirements.txt --system
+   && uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system

RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && chmod +x /opt/runtime/start.sh \