You've already forked DataMate
53 lines
1.8 KiB
Python
53 lines
1.8 KiB
Python
import presidio_analyzer as analyzer
|
|
import presidio_anonymizer as anonymizer
|
|
import spacy
|
|
|
|
from datamate.core.base_op import Mapper
|
|
|
|
from .custom_entities import id_recognizer, phone_recognizer, zipcode_recognizer, url_recognizer
|
|
|
|
|
|
class PiiDetector(Mapper):
|
|
custom_ops = True
|
|
|
|
def __init__(self, *args, **kwargs):
|
|
super(PiiDetector, self).__init__(*args, **kwargs)
|
|
self.support_language = kwargs.get("support_language", "zh")
|
|
|
|
self.nlp_engine = None
|
|
self.text_analyzer = None
|
|
self.anom = None
|
|
|
|
self.init_model(*args, **kwargs)
|
|
|
|
def init_model(self, *args, **kwargs):
|
|
spacy.load("zh_core_web_sm")
|
|
provider = analyzer.nlp_engine.NlpEngineProvider(
|
|
nlp_configuration={
|
|
"nlp_engine_name": "spacy",
|
|
"models": [
|
|
{"lang_code": "zh", "model_name": "zh_core_web_sm"}
|
|
]
|
|
}
|
|
)
|
|
|
|
# 创建NLP Engine
|
|
self.nlp_engine = provider.create_engine()
|
|
|
|
# 初始化AnalyzerEngine
|
|
self.text_analyzer = analyzer.AnalyzerEngine(nlp_engine=self.nlp_engine, supported_languages=["zh"])
|
|
self.text_analyzer.registry.load_predefined_recognizers()
|
|
for recognizer in [id_recognizer, phone_recognizer, zipcode_recognizer, url_recognizer]:
|
|
self.text_analyzer.registry.add_recognizer(recognizer)
|
|
|
|
# 初始化AnonymizerEngine
|
|
self.anom = anonymizer.AnonymizerEngine()
|
|
|
|
def execute(self, sample):
|
|
self.read_file_first(sample)
|
|
text = sample.get('text')
|
|
analyzer_results = self.text_analyzer.analyze(text=text, language=self.support_language)
|
|
res = self.anom.anonymize(text=text, analyzer_results=analyzer_results)
|
|
sample['text'] = res.text
|
|
return sample
|