feature: Add watermark removal / advanced anonymization operators (#151)
* feature: add the watermark removal operator
* feature: clean code
* feature: clean code
* feature: add the advanced anonymization operator
@@ -73,15 +73,15 @@ make install-mineru
```

### Deploy the DeerFlow service

1. Modify `runtime/deer-flow/.env.example` and add configurations for SEARCH_API_KEY and the EMBEDDING model.
2. Modify `runtime/deer-flow/.conf.yaml.example` and add basic model service configurations.
3. Execute `make install-deer-flow`

```bash
make install-deer-flow
```

### Local Development and Deployment

After modifying the local code, run the following commands to build the image and deploy using the locally built image.

```bash
make build
make install REGISTRY=""
make install dev=true
```

## 🤝 Contribution Guidelines
@@ -23,7 +23,6 @@ def _import_operators():
    from . import garble_characters_cleaner
    from . import html_tag_cleaner
    from . import id_number_cleaner
    from . import img_watermark_remove
    from . import invisible_characters_cleaner
    from . import ip_address_cleaner
    from . import legend_cleaner
@@ -47,6 +46,7 @@ def _import_operators():
    from . import img_resize
    from . import remove_duplicate_sentences
    from . import knowledge_relation_slice
    from . import pii_ner_detection


_import_operators()
@@ -11,7 +11,6 @@ class BaseModel:

    def __init__(self, model_type='vertical'):
        models_path = os.getenv("MODELS_PATH", "/home/models")
        self.resources_path = str(Path(models_path, 'img_direction_correct', 'resources'))
        args = Namespace()
        args.cls_image_shape = '3, 224, 224'
        args.cls_batch_num = 6
@@ -20,13 +19,14 @@ class BaseModel:
        args.use_gpu = False
        args.use_npu = False
        args.use_xpu = False
        args.use_mlu = False
        args.enable_mkldnn = False
        if model_type == 'vertical':
            args.cls_model_dir = str(Path(self.resources_path, 'vertical_model'))
            args.cls_model_dir = str(Path(models_path, 'ch_ppocr_mobile_v2.0_cls_infer'))
            self.model_name = 'standard model to detect image 0 or 90 rotated'
            args.label_list = ['0', '90']
        else:
            args.cls_model_dir = str(Path(self.resources_path, 'standard_model'))
            args.cls_model_dir = str(Path(models_path, 'ch_ppocr_mobile_v2.0_cls_infer'))
            self.model_name = 'standard model to detect image 0 or 180 rotated'
            args.label_list = ['0', '180']
@@ -1,6 +0,0 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ImgWatermarkRemove',
                          module_path="ops.mapper.img_watermark_remove.process")
@@ -1,26 +0,0 @@
name: '图片水印去除'
name_en: 'Image Watermark Removal'
description: '去除图片中的“知乎”和“抖音”水印。'
description_en: 'Removes the 知乎 and 抖音 watermarks from images.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgWatermarkRemove'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'image'
effect:
  before: ''
  after: ''
inputs: 'image'
outputs: 'image'
settings:
  watermarkStr:
    name: 需要去除的水印文字信息
    type: checkbox
    defaultVal: '知乎,抖音'
    options:
      - label: 知乎
        value: 知乎
      - label: 抖音
        value: 抖音
@@ -1,161 +0,0 @@
# # -- encoding: utf-8 --
#
# Description:
# Create: 2025/01/06
# """
import time
from typing import Dict, Any

import cv2
import numpy as np
from loguru import logger

from datamate.common.utils import bytes_to_numpy
from datamate.common.utils import numpy_to_bytes
from datamate.core.base_op import Mapper
from .watermark_ocr_model import WatermarkOcrModel

DEFAULT_MAX_CHARACTERS = 10
DEFAULT_BINARY_THRESHOLD_LOW = 200


class ImgWatermarkRemove(Mapper):
    use_model = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.remove_str = kwargs.get("watermarkStr", "知乎,抖音")
        self.ocr_model = self.get_model(*args, **kwargs)

    @staticmethod
    def _has_kw(result_list, kw_list):
        """
        Check whether the image contains the target watermark; return the list of matched text.
        """
        result_str_list = []
        for line in result_list:
            for kw in kw_list:
                if kw in line[1][0]:
                    result_str_list.append(line[1][0])
                    break
        return result_str_list

    @staticmethod
    def _overlay_mask(background_img, prospect_img, img_over_x, img_over_y):
        back_r, back_c, _ = background_img.shape  # rows and columns of the background image
        is_x_direction_failed = img_over_x > back_c or img_over_x < 0
        is_y_direction_failed = img_over_y > back_r or img_over_y < 0
        if is_x_direction_failed or is_y_direction_failed:
            # The foreground image lies outside the background image; return the original image
            return background_img
        pro_r, pro_c, _ = prospect_img.shape  # rows and columns of the foreground image
        if img_over_x + pro_c > back_c:  # foreground does not fit horizontally
            pro_c = back_c - img_over_x  # number of foreground columns to keep
            prospect_img = prospect_img[:, 0:pro_c, :]  # crop the foreground image
        if img_over_y + pro_r > back_r:  # foreground does not fit vertically
            pro_r = back_r - img_over_y  # number of foreground rows to keep
            prospect_img = prospect_img[0:pro_r, :, :]  # crop the foreground image

        prospect_img = cv2.cvtColor(prospect_img, cv2.COLOR_BGR2BGRA)  # convert the foreground to a 4-channel image
        prospect_tmp = np.zeros((back_r, back_c, 4), np.uint8)  # temporary foreground layer with the same size as the background

        # Place the foreground image into the foreground layer
        prospect_tmp[img_over_y:img_over_y + pro_r, img_over_x: img_over_x + pro_c, :] = prospect_img

        _, binary = cv2.threshold(prospect_img, 254, 255, cv2.THRESH_BINARY)  # threshold the foreground image
        prospect_mask = np.zeros((pro_r, pro_c, 1), np.uint8)  # single-channel foreground mask
        prospect_mask[:, :, 0] = binary[:, :, 3]  # use the opaque pixel values as the mask

        mask = np.zeros((back_r, back_c, 1), np.uint8)
        mask[img_over_y:img_over_y + prospect_mask.shape[0],
             img_over_x: img_over_x + prospect_mask.shape[1]] = prospect_mask

        mask_not = cv2.bitwise_not(mask)

        prospect_tmp = cv2.bitwise_and(prospect_tmp, prospect_tmp, mask=mask)
        background_img = cv2.bitwise_and(background_img, background_img, mask=mask_not)
        prospect_tmp = cv2.cvtColor(prospect_tmp, cv2.COLOR_BGRA2BGR)  # convert the foreground layer back to 3 channels
        return prospect_tmp + background_img  # merge the foreground layer with the background image

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
        self.read_file_first(sample)
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
        img_bytes = sample[self.data_key]
        if img_bytes:
            data = bytes_to_numpy(img_bytes)
            correct_data = self._watermark_remove(data, file_name, self.ocr_model)
            sample[self.data_key] = numpy_to_bytes(correct_data, file_type)
        logger.info(f"fileName: {file_name}, method: ImgWatermarkRemove costs {time.time() - start:6f} s")
        return sample

    def delete_watermark(self, result_list, kw_list, data):
        """
        Blur out the watermarks that match the targets.
        """
        # Collect the positions of all matching text boxes
        text_axes_list = []
        for line in result_list:
            for kw in kw_list:
                if kw in line[1][0]:
                    min_width = int(min(line[0][0][0], line[0][3][0]))
                    max_width = int(max(line[0][1][0], line[0][2][0]))
                    min_hight = int(min(line[0][0][1], line[0][1][1]))
                    max_hight = int(max(line[0][2][1], line[0][3][1]))
                    text_axes_list.append([min_width, min_hight, max_width, max_hight])
                    break
        # Remove the watermarks
        delt = DEFAULT_MAX_CHARACTERS  # margin by which the text box is enlarged
        img = data
        for text_axes in text_axes_list:
            hight, width = img.shape[0:2]
            # Crop the image
            min_width = text_axes[0] - delt if text_axes[0] - delt >= 0 else 0
            min_hight = text_axes[1] - delt if text_axes[1] - delt >= 0 else 0
            max_width = text_axes[2] + delt if text_axes[2] + delt <= width else width
            max_hight = text_axes[3] + delt if text_axes[3] + delt <= hight else hight
            cropped = img[min_hight:max_hight, min_width:max_width]  # crop coordinates are [y0:y1, x0:x1]
            # Binarize the crop: set every color outside [200, 200, 200]-[250, 250, 250] to 0
            start_rgb = DEFAULT_BINARY_THRESHOLD_LOW
            thresh = cv2.inRange(cropped, np.array([start_rgb, start_rgb, start_rgb]), np.array([250, 250, 250]))
            # Create a structuring element of the desired shape and size
            kernel = np.ones((3, 3), np.uint8)  # 3x3 kernel of ones; uint8 so the array can be treated as an image
            # Expand the region to be repaired
            hi_mask = cv2.dilate(thresh, kernel, iterations=10)  # dilation enlarges the white region; iterations is the number of passes
            specular = cv2.inpaint(cropped, hi_mask, 5, flags=cv2.INPAINT_TELEA)
            # cropped: input 8-bit 1-channel or 3-channel image.
            # hi_mask: inpainting mask, 8-bit 1-channel image; non-zero pixels mark the region to repair.
            # specular: output image with the same size and type as the input.
            # 5: radius of the circular neighborhood considered around each point.
            # flags: INPAINT_NS (Navier-Stokes based) or INPAINT_TELEA (Alexandru Telea's method)
            result = self._overlay_mask(img, specular, min_width, min_hight)
            img = result
        return img

    def init_model(self, *args, **kwargs):
        return WatermarkOcrModel(*args, **kwargs).ocr_model

    def _watermark_remove(self, data, file_name, model):
        """
        Remove the watermark.
        """
        remove_str = self.remove_str
        # If no watermark text was selected, return the original image
        if remove_str == "":
            return data
        kw_list = remove_str.split(',')
        # Load the model
        ocr_model = model
        try:
            result = ocr_model.ocr(data, cls=True)
        except RuntimeError as e:
            logger.error(f"fileName: {file_name}, method: ocr predict error {e}")
            return data
        if result and result[0]:
            logger.info(f"fileName: {file_name}, method: ocrModel detect watermark info {str(result)}")
            return self.delete_watermark(result[0], kw_list, data)
        else:
            logger.info(f"fileName: {file_name}, method: ImgWatermarkRemove not need remove target ocr")
            return data
@@ -1,25 +0,0 @@
# -- encoding: utf-8 --

import gc
import os
from pathlib import Path


class WatermarkOcrModel:

    def __init__(self, *args, **kwargs):
        models_path = os.getenv("MODELS_PATH", "/home/models")
        self.resources_path = str(Path(models_path, 'img_watermark_remove', 'resources'))
        self.det_model_dir = str(Path(self.resources_path, 'ch_PP-OCRv4_det_infer'))
        self.rec_model_dir = str(Path(self.resources_path, 'ch_PP-OCRv4_rec_infer'))
        self.cls_model_dir = str(Path(self.resources_path, 'ch_ppocr_mobild_v2_cls_infer'))

        from paddleocr import PaddleOCR
        self.ocr_model = PaddleOCR(det_model_dir=self.det_model_dir, cls_model_dir=self.cls_model_dir,
                                   rec_model_dir=self.rec_model_dir,
                                   use_angle_cls=True,
                                   lang='ch')

    def __del__(self):
        del self.ocr_model
        gc.collect()
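For readers following the `line[1][0]` and `line[0][...]` indexing in the `ImgWatermarkRemove` code above, the sketch below spells out the layout of the result returned by PaddleOCR's `ocr(..., cls=True)`, which is what `WatermarkOcrModel` wraps. It is a minimal illustration, assuming paddleocr 2.x with its default downloadable models; the image path is illustrative only.

```python
import cv2
from paddleocr import PaddleOCR

# Same options WatermarkOcrModel passes, minus the explicit model directories
ocr_model = PaddleOCR(use_angle_cls=True, lang='ch')
img = cv2.imread("sample.jpg")  # illustrative input image
result = ocr_model.ocr(img, cls=True)
# result[0] holds the text lines detected in the single input image; for each line:
#   line[0]    -> the four corner points of the text box: [[x0, y0], [x1, y1], [x2, y2], [x3, y3]]
#   line[1][0] -> the recognized text, e.g. "知乎"
#   line[1][1] -> the recognition confidence
for line in (result[0] or []):
    print(line[1][0], line[0])
```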
runtime/ops/mapper/pii_ner_detection/__init__.py
@@ -0,0 +1,4 @@
from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='PiiDetector',
                          module_path='ops.mapper.pii_ner_detection.process')
runtime/ops/mapper/pii_ner_detection/custom_entities.py
@@ -0,0 +1,62 @@
import presidio_analyzer as analyzer

# Recognizer for Chinese national ID numbers
id_recognizer = analyzer.PatternRecognizer(
    supported_entity="ID_CHINA",
    supported_language="zh",
    patterns=[
        analyzer.Pattern(
            name="china_id_pattern",
            regex=r"\b[1-9]\d{5}(19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\d{3}[\dXx]\b|\b[1-9]\d{7}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\d{3}\b",
            score=0.9
        )
    ],
    context=["身份证", "身份证明", "身份证号", "证件号码"]
)

# Recognizer for Chinese phone numbers
phone_recognizer = analyzer.PatternRecognizer(
    supported_entity="Phone_CHINA",
    supported_language="zh",
    patterns=[
        analyzer.Pattern(
            name="china_mobile_pattern",
            regex=r"\b(1[3-9]\d{9})\b",
            score=0.85
        ),
        analyzer.Pattern(
            name="china_landline_pattern",
            regex=r"\b(0\d{2,3}-?\d{7,8})\b",
            score=0.8
        )
    ],
    context=["电话", "手机", "联系方式", "联系电话"]
)

# Recognizer for Chinese postal codes
zipcode_recognizer = analyzer.PatternRecognizer(
    supported_entity="ZIPCODE_CHINA",
    supported_language="zh",
    patterns=[
        analyzer.Pattern(
            name="china_zipcode_pattern",
            regex=r"\b[1-9]\d{5}\b",
            score=0.7
        )
    ],
    context=["邮编", "邮政编码", "邮编号码"]
)

# URL recognizer that also handles Chinese domain names
url_recognizer = analyzer.PatternRecognizer(
    supported_entity="URL",
    supported_language="zh",
    patterns=[
        analyzer.Pattern(
            name="url_pattern",
            regex=r"\b((?:https?://|www\.)[\w-]+\.[\w-]+\S*|(?:https?://|www\.)[\u4e00-\u9fa5]+\.[\u4e00-\u9fa5]+\S*)\b",
            score=0.9
        )
    ],
    context=["网址", "链接", "网站", "网页"]
)
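Because these are plain `PatternRecognizer` instances, each one can be sanity-checked on its own, without the spaCy engine that the operator below wires up. A minimal sketch, with an illustrative sample string and import path:

```python
# Standalone check of the mobile-phone pattern defined above; the sample text is illustrative.
from custom_entities import phone_recognizer  # illustrative import; inside the package it is a relative import

text = "联系电话 13812345678"
for match in phone_recognizer.analyze(text, entities=["Phone_CHINA"]):
    # entity type, character offsets and pattern score of each hit
    print(match.entity_type, match.start, match.end, match.score)
```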
runtime/ops/mapper/pii_ner_detection/metadata.yml
@@ -0,0 +1,9 @@
name: '高级匿名化'
language: 'Python'
vendor: 'others'
raw_id: 'PiiDetector'
version: '1.0.0'
description: '高级匿名化算子,检测命名实体并匿名化。'
modal: 'text'
inputs: 'text'
outputs: 'text'
runtime/ops/mapper/pii_ner_detection/process.py
@@ -0,0 +1,52 @@
import presidio_analyzer as analyzer
import presidio_anonymizer as anonymizer
import spacy

from datamate.core.base_op import Mapper

from .custom_entities import id_recognizer, phone_recognizer, zipcode_recognizer, url_recognizer


class PiiDetector(Mapper):
    custom_ops = True

    def __init__(self, *args, **kwargs):
        super(PiiDetector, self).__init__(*args, **kwargs)
        self.support_language = kwargs.get("support_language", "zh")

        self.nlp_engine = None
        self.text_analyzer = None
        self.anom = None

        self.init_model(*args, **kwargs)

    def init_model(self, *args, **kwargs):
        spacy.load("zh_core_web_sm")
        provider = analyzer.nlp_engine.NlpEngineProvider(
            nlp_configuration={
                "nlp_engine_name": "spacy",
                "models": [
                    {"lang_code": "zh", "model_name": "zh_core_web_sm"}
                ]
            }
        )

        # Create the NLP engine
        self.nlp_engine = provider.create_engine()

        # Initialize the AnalyzerEngine
        self.text_analyzer = analyzer.AnalyzerEngine(nlp_engine=self.nlp_engine, supported_languages=["zh"])
        self.text_analyzer.registry.load_predefined_recognizers()
        for recognizer in [id_recognizer, phone_recognizer, zipcode_recognizer, url_recognizer]:
            self.text_analyzer.registry.add_recognizer(recognizer)

        # Initialize the AnonymizerEngine
        self.anom = anonymizer.AnonymizerEngine()

    def execute(self, sample):
        self.read_file_first(sample)
        text = sample.get('text')
        analyzer_results = self.text_analyzer.analyze(text=text, language=self.support_language)
        res = self.anom.anonymize(text=text, analyzer_results=analyzer_results)
        sample['text'] = res.text
        return sample
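A minimal standalone sketch of the analyze-then-anonymize flow that `PiiDetector.execute` runs, useful for trying the recognizers outside the DataMate runtime. It assumes presidio-analyzer / presidio-anonymizer 2.2.x and the `zh_core_web_sm` spaCy model are installed; the sample text and import path are illustrative, and by default presidio replaces each detected span with an `<ENTITY_TYPE>` placeholder.

```python
import presidio_analyzer as analyzer
import presidio_anonymizer as anonymizer

from custom_entities import phone_recognizer, id_recognizer  # illustrative import path

# Build a Chinese spaCy NLP engine, mirroring PiiDetector.init_model above
provider = analyzer.nlp_engine.NlpEngineProvider(nlp_configuration={
    "nlp_engine_name": "spacy",
    "models": [{"lang_code": "zh", "model_name": "zh_core_web_sm"}],
})
engine = analyzer.AnalyzerEngine(nlp_engine=provider.create_engine(), supported_languages=["zh"])
for recognizer in (phone_recognizer, id_recognizer):
    engine.registry.add_recognizer(recognizer)

text = "请拨打 13812345678 联系王先生"  # illustrative sample
results = engine.analyze(text=text, language="zh")
anonymized = anonymizer.AnonymizerEngine().anonymize(text=text, analyzer_results=results)
print(anonymized.text)  # detected spans come back as placeholders such as <Phone_CHINA>
```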
@@ -3,7 +3,7 @@ name = "ops"
version = "0.0.1"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.12"
requires-python = ">=3.11"
dependencies = [
    "beautifulsoup4>=4.14.3",
    "datasketch>=1.8.0",
@@ -11,17 +11,21 @@ dependencies = [
    "emoji>=2.15.0",
    "jieba>=0.42.1",
    "loguru>=0.7.3",
    "numpy>=2.2.0,<=2.2.6",
    "opencv-contrib-python-headless>=4.12.0.88",
    "opencv-python-headless>=4.12.0.88",
    "numpy==1.23.3",
    "opencv-contrib-python-headless==4.7.0.72",
    "opencv-python-headless==4.7.0.72",
    "openslide-python>=1.4.3",
    "paddleocr>=3.3.2",
    "pandas>=2.2.0,<=2.2.3",
    "paddleocr==2.8.1",
    "paddlepaddle==2.6.2",
    "pandas==1.5.3",
    "presidio-analyzer==2.2.25",
    "presidio-anonymizer==2.2.25",
    "pycryptodome>=3.23.0",
    "pymysql>=1.1.2",
    "python-docx>=1.2.0",
    "pytz>=2025.2",
    "six>=1.17.0",
    "spacy==3.7.0",
    "sqlalchemy>=2.0.44",
    "xmltodict>=1.0.2",
    "zhconv>=1.4.3",
@@ -59,43 +59,45 @@ VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', '文本清洗模板', '文本清
       ('4421504e-c6c9-4760-b55a-509d17429597', '图片清洗模板', '图片清洗模板');

INSERT IGNORE INTO t_operator_instance(instance_id, operator_id, op_index, settings_override)
VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 2, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatWordRateFilter', 3, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatPhraseRateFilter', 4, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighSpecialCharRateFilter', 5, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithManySensitiveWordsFilter', 6, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'UnicodeSpaceCleaner', 7, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'ExtraSpaceCleaner', 8, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FullWidthCharacterCleaner', 9, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'InvisibleCharactersCleaner', 10, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'ContentCleaner', 11, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'LegendCleaner', 12, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmojiCleaner', 13, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'HtmlTagCleaner', 14, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'TraditionalChineseCleaner', 15, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'GrableCharactersCleaner', 16, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'XMLTagCleaner', 17, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateSentencesFilter', 18, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateFilesFilter', 19, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'SexualAndViolentWordCleaner', 20, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'PoliticalWordCleaner', 21, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedPhoneNumber', 22, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedCreditCardNumber', 23, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmailNumberCleaner', 24, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIpAddress', 25, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIdNumber', 26, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 27, null);
VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 1, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatWordRateFilter', 2, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatPhraseRateFilter', 3, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighSpecialCharRateFilter', 4, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithManySensitiveWordsFilter', 5, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'UnicodeSpaceCleaner', 6, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'ExtraSpaceCleaner', 7, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FullWidthCharacterCleaner', 8, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'InvisibleCharactersCleaner', 9, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'ContentCleaner', 10, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'LegendCleaner', 11, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmojiCleaner', 12, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'HtmlTagCleaner', 13, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'TraditionalChineseCleaner', 14, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'GrableCharactersCleaner', 15, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'XMLTagCleaner', 16, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateSentencesFilter', 17, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateFilesFilter', 18, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'SexualAndViolentWordCleaner', 19, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'PoliticalWordCleaner', 20, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedPhoneNumber', 21, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedCreditCardNumber', 22, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmailNumberCleaner', 23, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIpAddress', 24, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIdNumber', 25, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 26, null),
       ('26ae585c-8310-4679-adc0-e53215e6e69b', 'PiiDetector', 27, null);

INSERT IGNORE INTO t_operator_instance(instance_id, operator_id, op_index, settings_override)
VALUES ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner', 2, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDuplicatedImagesCleaner', 3, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSimilarImagesCleaner', 4, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBrightness', 5, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgContrast', 6, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSaturation', 7, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSharpness', 8, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDenoise', 9, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgShadowRemove', 10, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgPerspectiveTransformation', 11, null),
VALUES ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner', 1, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDuplicatedImagesCleaner', 2, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSimilarImagesCleaner', 3, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBrightness', 4, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgContrast', 5, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSaturation', 6, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSharpness', 7, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDenoise', 8, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgShadowRemove', 9, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgPerspectiveTransformation', 10, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDirectionCorrect', 11, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgResize', 12, null),
       ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgTypeUnify', 13, null);
@@ -105,8 +105,9 @@ VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取P
('ImgShadowRemove', '图片阴影去除', '去除图片中的阴影,主要适用于文档场景。', '1.0.0', 'image', 'image', null, null, '', 'false'),
('ImgSharpness', '图片锐度增强', '自适应调节图片的锐度,主要适用于自然场景图片。', '1.0.0', 'image', 'image', null, null, '', 'false'),
('ImgSimilarImagesCleaner', '相似图片去除', '去除相似的图片。', '1.0.0', 'image', 'image', null, '{"similarThreshold": {"name": "相似度", "description": "相似度取值越大,图片相似度越高。", "type": "slider", "defaultVal": 0.8, "min": 0, "max": 1, "step": 0.01}}', '', 'false'),
('ImgTypeUnify', '图片格式转换', '将图片编码格式统一为jpg、jpeg、png、bmp格式。', '1.0.0', 'image', 'image', null, '{"imgType": {"name": "图片编码格式", "type": "select", "defaultVal": "jpg", "options": [{"label": "jpg", "value": "jpg"}, {"label": "png", "value": "png"}, {"label": "jpeg", "value": "jpeg"}, {"label": "bmp", "value": "bmp"}]}}', '', 'false');

('ImgTypeUnify', '图片格式转换', '将图片编码格式统一为jpg、jpeg、png、bmp格式。', '1.0.0', 'image', 'image', null, '{"imgType": {"name": "图片编码格式", "type": "select", "defaultVal": "jpg", "options": [{"label": "jpg", "value": "jpg"}, {"label": "png", "value": "png"}, {"label": "jpeg", "value": "jpeg"}, {"label": "bmp", "value": "bmp"}]}}', '', 'false'),
('ImgDirectionCorrect', '图片方向校正', '将含有文字的图片校正到文字水平方向,主要适用于文档场景。', '1.0.0', 'image', 'image', null, null, '', 'false'),
('PiiDetector', '高级匿名化', '高级匿名化算子,检测命名实体并匿名化。', '1.0.0', 'text', 'text', null, null, '', 'false');

INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
SELECT c.id, o.id
@@ -119,7 +120,8 @@ AND o.id IN ('FileWithShortOrLongLengthFilter', 'FileWithHighRepeatPhraseRateFil
    'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
    'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
    'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
    'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'MineruFormatter');
    'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'MineruFormatter',
    'PiiDetector');

INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
SELECT c.id, o.id
@@ -128,4 +130,4 @@ FROM t_operator_category c
WHERE c.id IN ('de36b61c-9e8a-4422-8c31-d30585c7100f', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3')
AND o.id IN ('ImgBlurredImagesCleaner', 'ImgBrightness', 'ImgContrast', 'ImgDenoise',
    'ImgDuplicatedImagesCleaner', 'ImgPerspectiveTransformation', 'ImgResize', 'ImgSaturation',
    'ImgShadowRemove', 'ImgSharpness', 'ImgSimilarImagesCleaner', 'ImgTypeUnify');
    'ImgShadowRemove', 'ImgSharpness', 'ImgSimilarImagesCleaner', 'ImgTypeUnify', 'ImgDirectionCorrect');
@@ -3,7 +3,12 @@ FROM ghcr.io/astral-sh/uv:python3.11-bookworm
RUN --mount=type=cache,target=/var/cache/apt \
    --mount=type=cache,target=/var/lib/apt \
    apt update \
    && apt install -y libgl1 libglib2.0-0 vim libmagic1 libreoffice dos2unix
    && apt install -y libgl1 libglib2.0-0 vim libmagic1 libreoffice dos2unix swig

RUN mkdir -p /home/models \
    && wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar \
    && tar -xf ch_ppocr_mobile_v2.0_cls_infer.tar -C /home/models \
    && rm -f ch_*.tar

COPY runtime/python-executor /opt/runtime
COPY runtime/ops /opt/runtime/datamate/ops
@@ -16,7 +21,8 @@ WORKDIR /opt/runtime

RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install -e . --system \
    && uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system
    && uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \
    && python -m spacy download zh_core_web_sm

RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && chmod +x /opt/runtime/start.sh \