diff --git a/README.md b/README.md index e209af3..120b347 100644 --- a/README.md +++ b/README.md @@ -73,15 +73,15 @@ make install-mineru ``` ### Deploy the DeerFlow service -1. Modify `runtime/deer-flow/.env.example` and add configurations for SEARCH_API_KEY and the EMBEDDING model. -2. Modify `runtime/deer-flow/.conf.yaml.example` and add basic model service configurations. -3. Execute `make install-deer-flow` +```bash +make install-deer-flow +``` ### Local Development and Deployment After modifying the local code, please execute the following commands to build the image and deploy using the local image. ```bash make build -make install REGISTRY="" +make install dev=true ``` ## 🤝 Contribution Guidelines diff --git a/runtime/ops/mapper/__init__.py b/runtime/ops/mapper/__init__.py index 558b0d8..4b97019 100644 --- a/runtime/ops/mapper/__init__.py +++ b/runtime/ops/mapper/__init__.py @@ -23,7 +23,6 @@ def _import_operators(): from . import garble_characters_cleaner from . import html_tag_cleaner from . import id_number_cleaner - from . import img_watermark_remove from . import invisible_characters_cleaner from . import ip_address_cleaner from . import legend_cleaner @@ -47,6 +46,7 @@ def _import_operators(): from . import img_resize from . import remove_duplicate_sentences from . import knowledge_relation_slice + from . import pii_ner_detection _import_operators() diff --git a/runtime/ops/mapper/img_direction_correct/base_model.py b/runtime/ops/mapper/img_direction_correct/base_model.py index 59e1270..ab65ebb 100644 --- a/runtime/ops/mapper/img_direction_correct/base_model.py +++ b/runtime/ops/mapper/img_direction_correct/base_model.py @@ -11,7 +11,6 @@ class BaseModel: def __init__(self, model_type='vertical'): models_path = os.getenv("MODELS_PATH", "/home/models") - self.resources_path = str(Path(models_path, 'img_direction_correct', 'resources')) args = Namespace() args.cls_image_shape = '3, 224, 224' args.cls_batch_num = 6 @@ -20,13 +19,14 @@ class BaseModel: args.use_gpu = False args.use_npu = False args.use_xpu = False + args.use_mlu = False args.enable_mkldnn = False if model_type == 'vertical': - args.cls_model_dir = str(Path(self.resources_path, 'vertical_model')) + args.cls_model_dir = str(Path(models_path, 'ch_ppocr_mobile_v2.0_cls_infer')) self.model_name = 'standard model to detect image 0 or 90 rotated' args.label_list = ['0', '90'] else: - args.cls_model_dir = str(Path(self.resources_path, 'standard_model')) + args.cls_model_dir = str(Path(models_path, 'ch_ppocr_mobile_v2.0_cls_infer')) self.model_name = 'standard model to detect image 0 or 180 rotated' args.label_list = ['0', '180'] diff --git a/runtime/ops/mapper/img_watermark_remove/__init__.py b/runtime/ops/mapper/img_watermark_remove/__init__.py deleted file mode 100644 index 7441684..0000000 --- a/runtime/ops/mapper/img_watermark_remove/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# -*- coding: utf-8 -*- - -from datamate.core.base_op import OPERATORS - -OPERATORS.register_module(module_name='ImgWatermarkRemove', - module_path="ops.mapper.img_watermark_remove.process") diff --git a/runtime/ops/mapper/img_watermark_remove/metadata.yml b/runtime/ops/mapper/img_watermark_remove/metadata.yml deleted file mode 100644 index 8d2ac8c..0000000 --- a/runtime/ops/mapper/img_watermark_remove/metadata.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: '图片水印去除' -name_en: 'Image Watermark Removal' -description: '去除图片中的“知乎”和“抖音”水印。' -description_en: 'Removes the 知乎 and 抖音 watermarks from images.' -language: 'python' -vendor: 'huawei' -raw_id: 'ImgWatermarkRemove' -version: '1.0.0' -types: - - 'cleanse' -modal: 'image' -effect: - before: '' - after: '' -inputs: 'image' -outputs: 'image' -settings: - watermarkStr: - name: 需要去除的水印文字信息 - type: checkbox - defaultVal: '知乎,抖音' - options: - - label: 知乎 - value: 知乎 - - label: 抖音 - value: 抖音 \ No newline at end of file diff --git a/runtime/ops/mapper/img_watermark_remove/process.py b/runtime/ops/mapper/img_watermark_remove/process.py deleted file mode 100644 index 86ea557..0000000 --- a/runtime/ops/mapper/img_watermark_remove/process.py +++ /dev/null @@ -1,161 +0,0 @@ -# # -- encoding: utf-8 -- - -# -# Description: -# Create: 2025/01/06 -# """ -import time -from typing import Dict, Any - -import cv2 -import numpy as np -from loguru import logger - -from datamate.common.utils import bytes_to_numpy -from datamate.common.utils import numpy_to_bytes -from datamate.core.base_op import Mapper -from .watermark_ocr_model import WatermarkOcrModel - -DEFAULT_MAX_CHARACTERS = 10 -DEFAULT_BINARY_THRESHOLD_LOW = 200 - - -class ImgWatermarkRemove(Mapper): - use_model = True - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.remove_str = kwargs.get("watermarkStr", "知乎,抖音") - self.ocr_model = self.get_model(*args, **kwargs) - - @staticmethod - def _has_kw(result_list, kw_list): - """ - 图片是否包含目标水印,返回匹配到的文字列表 - """ - result_str_list = [] - for line in result_list: - for kw in kw_list: - if kw in line[1][0]: - result_str_list.append(line[1][0]) - break - return result_str_list - - @staticmethod - def _overlay_mask(background_img, prospect_img, img_over_x, img_over_y): - back_r, back_c, _ = background_img.shape # 背景图像行数、列数 - is_x_direction_failed = img_over_x > back_c or img_over_x < 0 - is_y_direction_failed = img_over_y > back_r or img_over_y < 0 - if is_x_direction_failed or is_y_direction_failed: - # 前景图不在背景图范围内, 直接返回原图 - return background_img - pro_r, pro_c, _ = prospect_img.shape # 前景图像行数、列数 - if img_over_x + pro_c > back_c: # 如果水平方向展示不全 - pro_c = back_c - img_over_x # 截取前景图的列数 - prospect_img = prospect_img[:, 0:pro_c, :] # 截取前景图 - if img_over_y + pro_r > back_r: # 如果垂直方向展示不全 - pro_r = back_r - img_over_y # 截取前景图的行数 - prospect_img = prospect_img[0:pro_r, :, :] # 截取前景图 - - prospect_img = cv2.cvtColor(prospect_img, cv2.COLOR_BGR2BGRA) # 前景图转为4通道图像 - prospect_tmp = np.zeros((back_r, back_c, 4), np.uint8) # 与背景图像等大的临时前景图层 - - # 前景图像放到前景图层里 - prospect_tmp[img_over_y:img_over_y + pro_r, img_over_x: img_over_x + pro_c, :] = prospect_img - - _, binary = cv2.threshold(prospect_img, 254, 255, cv2.THRESH_BINARY) # 前景图阈值处理 - prospect_mask = np.zeros((pro_r, pro_c, 1), np.uint8) # 单通道前景图像掩模 - prospect_mask[:, :, 0] = binary[:, :, 3] # 不透明像素的值作为掩模的值 - - mask = np.zeros((back_r, back_c, 1), np.uint8) - mask[img_over_y:img_over_y + prospect_mask.shape[0], - img_over_x: img_over_x + prospect_mask.shape[1]] = prospect_mask - - mask_not = cv2.bitwise_not(mask) - - prospect_tmp = cv2.bitwise_and(prospect_tmp, prospect_tmp, mask=mask) - background_img = cv2.bitwise_and(background_img, background_img, mask=mask_not) - prospect_tmp = cv2.cvtColor(prospect_tmp, cv2.COLOR_BGRA2BGR) # 前景图层转为三通道图像 - return prospect_tmp + background_img # 前景图层与背景图像相加合并 - - def execute(self, sample: Dict[str, Any]): - start = time.time() - self.read_file_first(sample) - file_name = sample[self.filename_key] - file_type = "." + sample[self.filetype_key] - img_bytes = sample[self.data_key] - if img_bytes: - data = bytes_to_numpy(img_bytes) - correct_data = self._watermark_remove(data, file_name, self.ocr_model) - sample[self.data_key] = numpy_to_bytes(correct_data, file_type) - logger.info(f"fileName: {file_name}, method: ImgWatermarkRemove costs {time.time() - start:6f} s") - return sample - - def delete_watermark(self, result_list, kw_list, data): - """ - 将符合目标的水印,模糊化处理 - """ - # 获取所有符合目标的文本框位置 - text_axes_list = [] - for line in result_list: - for kw in kw_list: - if kw in line[1][0]: - min_width = int(min(line[0][0][0], line[0][3][0])) - max_width = int(max(line[0][1][0], line[0][2][0])) - min_hight = int(min(line[0][0][1], line[0][1][1])) - max_hight = int(max(line[0][2][1], line[0][3][1])) - text_axes_list.append([min_width, min_hight, max_width, max_hight]) - break - # 去除水印 - delt = DEFAULT_MAX_CHARACTERS # 文本框范围扩大 - img = data - for text_axes in text_axes_list: - hight, width = img.shape[0:2] - # 截取图片 - min_width = text_axes[0] - delt if text_axes[0] - delt >= 0 else 0 - min_hight = text_axes[1] - delt if text_axes[1] - delt >= 0 else 0 - max_width = text_axes[2] + delt if text_axes[2] + delt <= width else width - max_hight = text_axes[3] + delt if text_axes[3] + delt <= hight else hight - cropped = img[min_hight:max_hight, min_width:max_width] # 裁剪坐标为[y0:y1, x0:x1] - # 图片二值化处理,把[200,200,200]-[250,250,250]以外的颜色变成0 - start_rgb = DEFAULT_BINARY_THRESHOLD_LOW - thresh = cv2.inRange(cropped, np.array([start_rgb, start_rgb, start_rgb]), np.array([250, 250, 250])) - # 创建形状和尺寸的结构元素 - kernel = np.ones((3, 3), np.uint8) # 设置卷积核3*3全是1;将当前的数组作为图像类型来进⾏各种操作,就要转换到uint8类型 - # 扩展待修复区域 - hi_mask = cv2.dilate(thresh, kernel, iterations=10) # 膨胀操作,白色区域增大,iterations迭代次数 - specular = cv2.inpaint(cropped, hi_mask, 5, flags=cv2.INPAINT_TELEA) - # imgSY:输入8位1通道或3通道图像。 - # hi_mask:修复掩码,8位1通道图像。非零像素表示需要修复的区域。 - # specular:输出与imgSY具有相同大小和类型的图像。 - # 5:算法考虑的每个点的圆形邻域的半径。 - # flags:NPAINT_NS基于Navier-Stokes的方法、Alexandru Telea的INPAINT_TELEA方法 - result = self._overlay_mask(img, specular, min_width, min_hight) - img = result - return img - - def init_model(self, *args, **kwargs): - return WatermarkOcrModel(*args, **kwargs).ocr_model - - def _watermark_remove(self, data, file_name, model): - """ - 去除水印的方法 - """ - remove_str = self.remove_str - # 勾选去水印的信息为空,则直接返回原图 - if remove_str == "": - return data - kw_list = remove_str.split(',') - # 加载模型 - ocr_model = model - try: - result = ocr_model.ocr(data, cls=True) - except RuntimeError as e: - logger.error(f"fileName: {file_name}, method: ocr predict error {e}") - return data - if result and result[0]: - logger.info(f"fileName: {file_name}, method: ocrModel detect watermark info {str(result)}") - return self.delete_watermark(result[0], kw_list, data) - else: - logger.info(f"fileName: {file_name}, method: ImgWatermarkRemove not need remove target ocr") - return data diff --git a/runtime/ops/mapper/img_watermark_remove/watermark_ocr_model.py b/runtime/ops/mapper/img_watermark_remove/watermark_ocr_model.py deleted file mode 100644 index 242c02c..0000000 --- a/runtime/ops/mapper/img_watermark_remove/watermark_ocr_model.py +++ /dev/null @@ -1,25 +0,0 @@ -# -- encoding: utf-8 -- - -import gc -import os -from pathlib import Path - - -class WatermarkOcrModel: - - def __init__(self, *args, **kwargs): - models_path = os.getenv("MODELS_PATH", "/home/models") - self.resources_path = str(Path(models_path, 'img_watermark_remove', 'resources')) - self.det_model_dir = str(Path(self.resources_path, 'ch_PP-OCRv4_det_infer')) - self.rec_model_dir = str(Path(self.resources_path, 'ch_PP-OCRv4_rec_infer')) - self.cls_model_dir = str(Path(self.resources_path, 'ch_ppocr_mobild_v2_cls_infer')) - - from paddleocr import PaddleOCR - self.ocr_model = PaddleOCR(det_model_dir=self.det_model_dir, cls_model_dir=self.cls_model_dir, - rec_model_dir=self.rec_model_dir, - use_angle_cls=True, - lang='ch') - - def __del__(self): - del self.ocr_model - gc.collect() diff --git a/runtime/ops/mapper/pii_ner_detection/__init__.py b/runtime/ops/mapper/pii_ner_detection/__init__.py new file mode 100644 index 0000000..b377a4b --- /dev/null +++ b/runtime/ops/mapper/pii_ner_detection/__init__.py @@ -0,0 +1,4 @@ +from datamate.core.base_op import OPERATORS + +OPERATORS.register_module(module_name='PiiDetector', + module_path='ops.mapper.pii_ner_detection.process') \ No newline at end of file diff --git a/runtime/ops/mapper/pii_ner_detection/custom_entities.py b/runtime/ops/mapper/pii_ner_detection/custom_entities.py new file mode 100644 index 0000000..698d57f --- /dev/null +++ b/runtime/ops/mapper/pii_ner_detection/custom_entities.py @@ -0,0 +1,62 @@ +import presidio_analyzer as analyzer + +# 中国身份证号识别器 +id_recognizer = analyzer.PatternRecognizer( + supported_entity="ID_CHINA", + supported_language="zh", + patterns=[ + analyzer.Pattern( + name="china_id_pattern", + regex=r"\b[1-9]\d{5}(19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\d{3}[\dXx]\b|\b[1-9]\d{7}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\d{3}\b", + score=0.9 + ) + ], + context=["身份证", "身份证明", "身份证号", "证件号码"] +) + +# 中国电话号码识别器 +phone_recognizer = analyzer.PatternRecognizer( + supported_entity="Phone_CHINA", + supported_language="zh", + patterns=[ + analyzer.Pattern( + name="china_mobile_pattern", + regex=r"\b(1[3-9]\d{9})\b", + score=0.85 + ), + analyzer.Pattern( + name="china_landline_pattern", + regex=r"\b(0\d{2,3}-?\d{7,8})\b", + score=0.8 + ) + ], + context=["电话", "手机", "联系方式", "联系电话"] +) + +# 中国邮编识别器 +zipcode_recognizer = analyzer.PatternRecognizer( + supported_entity="ZIPCODE_CHINA", + supported_language="zh", + patterns=[ + analyzer.Pattern( + name="china_zipcode_pattern", + regex=r"\b[1-9]\d{5}\b", + score=0.7 + ) + ], + context=["邮编", "邮政编码", "邮编号码"] +) + +# 兼容中文域名的URL识别器 +url_recognizer = analyzer.PatternRecognizer( + supported_entity="URL", + supported_language="zh", + patterns=[ + analyzer.Pattern( + name="url_pattern", + regex=r"\b((?:https?://|www\.)[\w-]+\.[\w-]+\S*|(?:https?://|www\.)[\u4e00-\u9fa5]+\.[\u4e00-\u9fa5]+\S*)\b", + score=0.9 + ) + ], + context=["网址", "链接", "网站", "网页"] +) \ No newline at end of file diff --git a/runtime/ops/mapper/pii_ner_detection/metadata.yml b/runtime/ops/mapper/pii_ner_detection/metadata.yml new file mode 100644 index 0000000..eae179f --- /dev/null +++ b/runtime/ops/mapper/pii_ner_detection/metadata.yml @@ -0,0 +1,9 @@ +name: '高级匿名化' +language: 'Python' +vendor: 'others' +raw_id: 'PiiDetector' +version: '1.0.0' +description: '高级匿名化算子,检测命名实体并匿名化。' +modal: 'text' +inputs: 'text' +outputs: 'text' \ No newline at end of file diff --git a/runtime/ops/mapper/pii_ner_detection/process.py b/runtime/ops/mapper/pii_ner_detection/process.py new file mode 100644 index 0000000..0079c79 --- /dev/null +++ b/runtime/ops/mapper/pii_ner_detection/process.py @@ -0,0 +1,52 @@ +import presidio_analyzer as analyzer +import presidio_anonymizer as anonymizer +import spacy + +from datamate.core.base_op import Mapper + +from .custom_entities import id_recognizer, phone_recognizer, zipcode_recognizer, url_recognizer + + +class PiiDetector(Mapper): + custom_ops = True + + def __init__(self, *args, **kwargs): + super(PiiDetector, self).__init__(*args, **kwargs) + self.support_language = kwargs.get("support_language", "zh") + + self.nlp_engine = None + self.text_analyzer = None + self.anom = None + + self.init_model(*args, **kwargs) + + def init_model(self, *args, **kwargs): + spacy.load("zh_core_web_sm") + provider = analyzer.nlp_engine.NlpEngineProvider( + nlp_configuration={ + "nlp_engine_name": "spacy", + "models": [ + {"lang_code": "zh", "model_name": "zh_core_web_sm"} + ] + } + ) + + # 创建NLP Engine + self.nlp_engine = provider.create_engine() + + # 初始化AnalyzerEngine + self.text_analyzer = analyzer.AnalyzerEngine(nlp_engine=self.nlp_engine, supported_languages=["zh"]) + self.text_analyzer.registry.load_predefined_recognizers() + for recognizer in [id_recognizer, phone_recognizer, zipcode_recognizer, url_recognizer]: + self.text_analyzer.registry.add_recognizer(recognizer) + + # 初始化AnonymizerEngine + self.anom = anonymizer.AnonymizerEngine() + + def execute(self, sample): + self.read_file_first(sample) + text = sample.get('text') + analyzer_results = self.text_analyzer.analyze(text=text, language=self.support_language) + res = self.anom.anonymize(text=text, analyzer_results=analyzer_results) + sample['text'] = res.text + return sample diff --git a/runtime/ops/pyproject.toml b/runtime/ops/pyproject.toml index b5b0eb9..07fc092 100644 --- a/runtime/ops/pyproject.toml +++ b/runtime/ops/pyproject.toml @@ -3,7 +3,7 @@ name = "ops" version = "0.0.1" description = "Add your description here" readme = "README.md" -requires-python = ">=3.12" +requires-python = ">=3.11" dependencies = [ "beautifulsoup4>=4.14.3", "datasketch>=1.8.0", @@ -11,17 +11,21 @@ dependencies = [ "emoji>=2.15.0", "jieba>=0.42.1", "loguru>=0.7.3", - "numpy>=2.2.0,<=2.2.6", - "opencv-contrib-python-headless>=4.12.0.88", - "opencv-python-headless>=4.12.0.88", + "numpy==1.23.3", + "opencv-contrib-python-headless==4.7.0.72", + "opencv-python-headless==4.7.0.72", "openslide-python>=1.4.3", - "paddleocr>=3.3.2", - "pandas>=2.2.0,<=2.2.3", + "paddleocr==2.8.1", + "paddlepaddle==2.6.2", + "pandas==1.5.3", + "presidio-analyzer==2.2.25", + "presidio-anonymizer==2.2.25", "pycryptodome>=3.23.0", "pymysql>=1.1.2", "python-docx>=1.2.0", "pytz>=2025.2", "six>=1.17.0", + "spacy==3.7.0", "sqlalchemy>=2.0.44", "xmltodict>=1.0.2", "zhconv>=1.4.3", diff --git a/scripts/db/data-cleaning-init.sql b/scripts/db/data-cleaning-init.sql index 69bebd6..ee88e7c 100644 --- a/scripts/db/data-cleaning-init.sql +++ b/scripts/db/data-cleaning-init.sql @@ -59,43 +59,45 @@ VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', '文本清洗模板', '文本清 ('4421504e-c6c9-4760-b55a-509d17429597', '图片清洗模板', '图片清洗模板'); INSERT IGNORE INTO t_operator_instance(instance_id, operator_id, op_index, settings_override) -VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 2, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatWordRateFilter', 3, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatPhraseRateFilter', 4, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighSpecialCharRateFilter', 5, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithManySensitiveWordsFilter', 6, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'UnicodeSpaceCleaner', 7, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'ExtraSpaceCleaner', 8, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FullWidthCharacterCleaner', 9, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'InvisibleCharactersCleaner', 10, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'ContentCleaner', 11, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'LegendCleaner', 12, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmojiCleaner', 13, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'HtmlTagCleaner', 14, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'TraditionalChineseCleaner', 15, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'GrableCharactersCleaner', 16, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'XMLTagCleaner', 17, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateSentencesFilter', 18, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateFilesFilter', 19, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'SexualAndViolentWordCleaner', 20, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'PoliticalWordCleaner', 21, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedPhoneNumber', 22, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedCreditCardNumber', 23, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmailNumberCleaner', 24, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIpAddress', 25, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIdNumber', 26, null), - ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 27, null); +VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 1, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatWordRateFilter', 2, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatPhraseRateFilter', 3, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighSpecialCharRateFilter', 4, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithManySensitiveWordsFilter', 5, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'UnicodeSpaceCleaner', 6, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'ExtraSpaceCleaner', 7, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FullWidthCharacterCleaner', 8, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'InvisibleCharactersCleaner', 9, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'ContentCleaner', 10, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'LegendCleaner', 11, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmojiCleaner', 12, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'HtmlTagCleaner', 13, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'TraditionalChineseCleaner', 14, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'GrableCharactersCleaner', 15, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'XMLTagCleaner', 16, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateSentencesFilter', 17, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateFilesFilter', 18, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'SexualAndViolentWordCleaner', 19, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'PoliticalWordCleaner', 20, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedPhoneNumber', 21, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedCreditCardNumber', 22, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmailNumberCleaner', 23, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIpAddress', 24, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIdNumber', 25, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 26, null), + ('26ae585c-8310-4679-adc0-e53215e6e69b', 'PiiDetector', 27, null); INSERT IGNORE INTO t_operator_instance(instance_id, operator_id, op_index, settings_override) -VALUES ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner', 2, null), - ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDuplicatedImagesCleaner', 3, null), - ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSimilarImagesCleaner', 4, null), - ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBrightness', 5, null), - ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgContrast', 6, null), - ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSaturation', 7, null), - ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSharpness', 8, null), - ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDenoise', 9, null), - ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgShadowRemove', 10, null), - ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgPerspectiveTransformation', 11, null), +VALUES ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner', 1, null), + ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDuplicatedImagesCleaner', 2, null), + ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSimilarImagesCleaner', 3, null), + ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBrightness', 4, null), + ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgContrast', 5, null), + ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSaturation', 6, null), + ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSharpness', 7, null), + ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDenoise', 8, null), + ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgShadowRemove', 9, null), + ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgPerspectiveTransformation', 10, null), + ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDirectionCorrect', 11, null), ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgResize', 12, null), ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgTypeUnify', 13, null); \ No newline at end of file diff --git a/scripts/db/data-operator-init.sql b/scripts/db/data-operator-init.sql index ddb26a1..33b4926 100644 --- a/scripts/db/data-operator-init.sql +++ b/scripts/db/data-operator-init.sql @@ -105,8 +105,9 @@ VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取P ('ImgShadowRemove', '图片阴影去除', '去除图片中的阴影,主要适用于文档场景。', '1.0.0', 'image', 'image', null, null, '', 'false'), ('ImgSharpness', '图片锐度增强', '自适应调节图片的锐度,主要适用于自然场景图片。', '1.0.0', 'image', 'image', null, null, '', 'false'), ('ImgSimilarImagesCleaner', '相似图片去除', '去除相似的图片。', '1.0.0', 'image', 'image', null, '{"similarThreshold": {"name": "相似度", "description": "相似度取值越大,图片相似度越高。", "type": "slider", "defaultVal": 0.8, "min": 0, "max": 1, "step": 0.01}}', '', 'false'), - ('ImgTypeUnify', '图片格式转换', '将图片编码格式统一为jpg、jpeg、png、bmp格式。', '1.0.0', 'image', 'image', null, '{"imgType": {"name": "图片编码格式", "type": "select", "defaultVal": "jpg", "options": [{"label": "jpg", "value": "jpg"}, {"label": "png", "value": "png"}, {"label": "jpeg", "value": "jpeg"}, {"label": "bmp", "value": "bmp"}]}}', '', 'false'); - + ('ImgTypeUnify', '图片格式转换', '将图片编码格式统一为jpg、jpeg、png、bmp格式。', '1.0.0', 'image', 'image', null, '{"imgType": {"name": "图片编码格式", "type": "select", "defaultVal": "jpg", "options": [{"label": "jpg", "value": "jpg"}, {"label": "png", "value": "png"}, {"label": "jpeg", "value": "jpeg"}, {"label": "bmp", "value": "bmp"}]}}', '', 'false'), + ('ImgDirectionCorrect', '图片方向校正', '将含有文字的图片校正到文字水平方向,主要适用于文档场景。', '1.0.0', 'image', 'image', null, null, '', 'false'), + ('PiiDetector', '高级匿名化', '高级匿名化算子,检测命名实体并匿名化。', '1.0.0', 'text', 'text', null, null, '', 'false'); INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id) SELECT c.id, o.id @@ -119,7 +120,8 @@ AND o.id IN ('FileWithShortOrLongLengthFilter', 'FileWithHighRepeatPhraseRateFil 'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner', 'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner', 'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner', - 'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'MineruFormatter'); + 'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'MineruFormatter', + 'PiiDetector'); INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id) SELECT c.id, o.id @@ -128,4 +130,4 @@ FROM t_operator_category c WHERE c.id IN ('de36b61c-9e8a-4422-8c31-d30585c7100f', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3') AND o.id IN ('ImgBlurredImagesCleaner', 'ImgBrightness', 'ImgContrast', 'ImgDenoise', 'ImgDuplicatedImagesCleaner', 'ImgPerspectiveTransformation', 'ImgResize', 'ImgSaturation', - 'ImgShadowRemove', 'ImgSharpness', 'ImgSimilarImagesCleaner', 'ImgTypeUnify'); + 'ImgShadowRemove', 'ImgSharpness', 'ImgSimilarImagesCleaner', 'ImgTypeUnify', 'ImgDirectionCorrect'); diff --git a/scripts/images/runtime/Dockerfile b/scripts/images/runtime/Dockerfile index e65b08c..653d016 100644 --- a/scripts/images/runtime/Dockerfile +++ b/scripts/images/runtime/Dockerfile @@ -3,7 +3,12 @@ FROM ghcr.io/astral-sh/uv:python3.11-bookworm RUN --mount=type=cache,target=/var/cache/apt \ --mount=type=cache,target=/var/lib/apt \ apt update \ - && apt install -y libgl1 libglib2.0-0 vim libmagic1 libreoffice dos2unix + && apt install -y libgl1 libglib2.0-0 vim libmagic1 libreoffice dos2unix swig + +RUN mkdir -p /home/models \ + && wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar \ + && tar -xf ch_ppocr_mobile_v2.0_cls_infer.tar -C /home/models \ + && rm -f ch_*.tar COPY runtime/python-executor /opt/runtime COPY runtime/ops /opt/runtime/datamate/ops @@ -16,7 +21,8 @@ WORKDIR /opt/runtime RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install -e . --system \ - && uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system + && uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \ + && python -m spacy download zh_core_web_sm RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \ && chmod +x /opt/runtime/start.sh \