feature: 增加水印去除/高级匿名化算子 (#151)

* feature: 增加水印去除算子
* feature: clean code
* feature: clean code
* feature: 增加高级匿名化算子
2025-12-10 18:12:47 +08:00
parent cbb146d3d7
commit 19a04df276
15 changed files with 197 additions and 274 deletions
--- a/runtime/ops/mapper/pii_ner_detection/__init__.py
+++ b/runtime/ops/mapper/pii_ner_detection/__init__.py
@@ -0,0 +1,4 @@
+from datamate.core.base_op import OPERATORS
+
+OPERATORS.register_module(module_name='PiiDetector',
+                          module_path='ops.mapper.pii_ner_detection.process')
--- a/runtime/ops/mapper/pii_ner_detection/custom_entities.py
+++ b/runtime/ops/mapper/pii_ner_detection/custom_entities.py
@@ -0,0 +1,62 @@
+import presidio_analyzer as analyzer
+
+# 中国身份证号识别器
+id_recognizer = analyzer.PatternRecognizer(
+    supported_entity="ID_CHINA",
+    supported_language="zh",
+    patterns=[
+        analyzer.Pattern(
+            name="china_id_pattern",
+            regex=r"\b[1-9]\d{5}(19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\d{3}[\dXx]\b|\b[1-9]\d{7}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\d{3}\b",
+            score=0.9
+        )
+    ],
+    context=["身份证", "身份证明", "身份证号", "证件号码"]
+)
+
+# 中国电话号码识别器
+phone_recognizer = analyzer.PatternRecognizer(
+    supported_entity="Phone_CHINA",
+    supported_language="zh",
+    patterns=[
+        analyzer.Pattern(
+            name="china_mobile_pattern",
+            regex=r"\b(1[3-9]\d{9})\b",
+            score=0.85
+        ),
+        analyzer.Pattern(
+            name="china_landline_pattern",
+            regex=r"\b(0\d{2,3}-?\d{7,8})\b",
+            score=0.8
+        )
+    ],
+    context=["电话", "手机", "联系方式", "联系电话"]
+)
+
+# 中国邮编识别器
+zipcode_recognizer = analyzer.PatternRecognizer(
+    supported_entity="ZIPCODE_CHINA",
+    supported_language="zh",
+    patterns=[
+        analyzer.Pattern(
+            name="china_zipcode_pattern",
+            regex=r"\b[1-9]\d{5}\b",
+            score=0.7
+        )
+    ],
+    context=["邮编", "邮政编码", "邮编号码"]
+)
+
+# 兼容中文域名的URL识别器
+url_recognizer = analyzer.PatternRecognizer(
+    supported_entity="URL",
+    supported_language="zh",
+    patterns=[
+        analyzer.Pattern(
+            name="url_pattern",
+            regex=r"\b((?:https?://|www\.)[\w-]+\.[\w-]+\S*|(?:https?://|www\.)[\u4e00-\u9fa5]+\.[\u4e00-\u9fa5]+\S*)\b",
+            score=0.9
+        )
+    ],
+    context=["网址", "链接", "网站", "网页"]
+)
--- a/runtime/ops/mapper/pii_ner_detection/metadata.yml
+++ b/runtime/ops/mapper/pii_ner_detection/metadata.yml
@@ -0,0 +1,9 @@
+# Operator metadata. 'raw_id' carries the same name ('PiiDetector') that the
+# package registers with OPERATORS; the operator reads and writes text.
+name: '高级匿名化'
+language: 'Python'
+vendor: 'others'
+raw_id: 'PiiDetector'
+version: '1.0.0'
+description: '高级匿名化算子，检测命名实体并匿名化。'
+modal: 'text'
+inputs: 'text'
+outputs: 'text'
--- a/runtime/ops/mapper/pii_ner_detection/process.py
+++ b/runtime/ops/mapper/pii_ner_detection/process.py
@@ -0,0 +1,52 @@
+import presidio_analyzer as analyzer
+import presidio_anonymizer as anonymizer
+import spacy
+
+from datamate.core.base_op import Mapper
+
+from .custom_entities import id_recognizer, phone_recognizer, zipcode_recognizer, url_recognizer
+
+
+class PiiDetector(Mapper):
+    custom_ops = True
+
+    def __init__(self, *args, **kwargs):
+        super(PiiDetector, self).__init__(*args, **kwargs)
+        self.support_language = kwargs.get("support_language", "zh")
+
+        self.nlp_engine = None
+        self.text_analyzer = None
+        self.anom = None
+
+        self.init_model(*args, **kwargs)
+
+    def init_model(self, *args, **kwargs):
+        spacy.load("zh_core_web_sm")
+        provider = analyzer.nlp_engine.NlpEngineProvider(
+            nlp_configuration={
+                "nlp_engine_name": "spacy",
+                "models": [
+                    {"lang_code": "zh", "model_name": "zh_core_web_sm"}
+                ]
+            }
+        )
+
+        # 创建NLP Engine
+        self.nlp_engine = provider.create_engine()
+
+        #  初始化AnalyzerEngine
+        self.text_analyzer = analyzer.AnalyzerEngine(nlp_engine=self.nlp_engine, supported_languages=["zh"])
+        self.text_analyzer.registry.load_predefined_recognizers()
+        for recognizer in [id_recognizer, phone_recognizer, zipcode_recognizer, url_recognizer]:
+            self.text_analyzer.registry.add_recognizer(recognizer)
+
+        # 初始化AnonymizerEngine
+        self.anom = anonymizer.AnonymizerEngine()
+
+    def execute(self, sample):
+        self.read_file_first(sample)
+        text = sample.get('text')
+        analyzer_results = self.text_analyzer.analyze(text=text, language=self.support_language)
+        res = self.anom.anonymize(text=text, analyzer_results=analyzer_results)
+        sample['text'] = res.text
+        return sample