You've already forked DataMate
62 lines
1.8 KiB
Python
62 lines
1.8 KiB
Python
import presidio_analyzer as analyzer
|
|
|
|
# Recognizer for Chinese resident identity card numbers.
#
# Two formats are accepted:
#   * 18 characters: 6-digit region code, 8-digit birth date (YYYYMMDD, years
#     19xx/20xx), 3-digit sequence, and a check character (digit or X/x).
#   * 15 digits (legacy): 6-digit region code, 6-digit birth date (YYMMDD),
#     3-digit sequence.
#
# Boundaries are expressed with explicit lookarounds instead of ``\b``.
# ``\b`` is Unicode-aware for str patterns, so a Chinese character next to a
# digit is NOT a word boundary; an ID written directly after "身份证号" with no
# separator would never match. The lookarounds only reject an adjacent Latin
# letter, digit or underscore, which keeps the ID from matching inside a
# longer alphanumeric token.
id_recognizer = analyzer.PatternRecognizer(
    supported_entity="ID_CHINA",
    supported_language="zh",
    patterns=[
        analyzer.Pattern(
            name="china_id_pattern",
            regex=(
                r"(?<![A-Za-z\d_])"
                r"(?:"
                # 18-character form
                r"[1-9]\d{5}(19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\d{3}[\dXx]"
                r"|"
                # 15-digit legacy form
                r"[1-9]\d{7}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01])\d{3}"
                r")"
                r"(?![A-Za-z\d_])"
            ),
            score=0.9,
        ),
    ],
    # Context words passed to the recognizer alongside the pattern.
    context=["身份证", "身份证明", "身份证号", "证件号码"],
)
|
|
|
|
# Recognizer for Chinese phone numbers (mobile and landline).
#
#   * Mobile: 11 digits starting with 1 followed by 3-9.
#   * Landline: area code starting with 0 (3-4 digits total), an optional
#     hyphen, then a 7-8 digit subscriber number.
#
# Boundaries are expressed with explicit lookarounds instead of ``\b``.
# ``\b`` is Unicode-aware for str patterns, so a Chinese character next to a
# digit is NOT a word boundary; a number written directly after "电话" or
# "手机" with no separator would never match. The lookarounds only reject an
# adjacent Latin letter, digit or underscore.
#
# NOTE(review): the entity name "Phone_CHINA" is cased differently from the
# other entities defined in this file (e.g. "ID_CHINA"). It is left unchanged
# because callers may reference this exact string.
phone_recognizer = analyzer.PatternRecognizer(
    supported_entity="Phone_CHINA",
    supported_language="zh",
    patterns=[
        analyzer.Pattern(
            name="china_mobile_pattern",
            regex=r"(?<![A-Za-z\d_])(1[3-9]\d{9})(?![A-Za-z\d_])",
            score=0.85,
        ),
        analyzer.Pattern(
            name="china_landline_pattern",
            regex=r"(?<![A-Za-z\d_])(0\d{2,3}-?\d{7,8})(?![A-Za-z\d_])",
            score=0.8,
        ),
    ],
    # Context words passed to the recognizer alongside the patterns.
    context=["电话", "手机", "联系方式", "联系电话"],
)
|
|
|
|
# Recognizer for Chinese postal codes (exactly six digits).
#
# The first digit may be 0: codes such as 010000 (Hohhot), 030000 (Taiyuan)
# and 050000 (Shijiazhuang) are valid, so the pattern accepts any six digits
# rather than requiring a leading 1-9.
#
# Boundaries are expressed with explicit lookarounds instead of ``\b``.
# ``\b`` is Unicode-aware for str patterns, so a Chinese character next to a
# digit is NOT a word boundary; a code written directly after "邮编" with no
# separator would never match. The lookarounds only reject an adjacent Latin
# letter, digit or underscore, so the code is not matched inside a longer
# number or alphanumeric token.
zipcode_recognizer = analyzer.PatternRecognizer(
    supported_entity="ZIPCODE_CHINA",
    supported_language="zh",
    patterns=[
        analyzer.Pattern(
            name="china_zipcode_pattern",
            regex=r"(?<![A-Za-z\d_])\d{6}(?![A-Za-z\d_])",
            score=0.7,
        ),
    ],
    # Context words passed to the recognizer alongside the pattern.
    context=["邮编", "邮政编码", "邮编号码"],
)
|
|
|
|
# URL recognizer that also accepts Chinese-character domain names.
#
# A URL must start with "http://", "https://" or "www.", followed by at least
# two dot-separated labels and any trailing run of non-whitespace characters.
#
# The leading boundary is an explicit lookbehind instead of ``\b``. ``\b`` is
# Unicode-aware for str patterns, so a Chinese character directly before
# "http"/"www" is NOT a word boundary; in text such as "网址www.example.com"
# the URL would be missed (or only partially matched from a later "www.").
# The lookbehind only rejects a preceding Latin letter, digit or underscore.
#
# NOTE(review): the trailing ``\S*`` is greedy, so Chinese text that follows a
# URL without whitespace is included in the match. That behavior is unchanged
# here; tighten it separately if it matters for the callers.
url_recognizer = analyzer.PatternRecognizer(
    supported_entity="URL",
    supported_language="zh",
    patterns=[
        analyzer.Pattern(
            name="url_pattern",
            regex=r"(?<![A-Za-z\d_])((?:https?://|www\.)[\w-]+\.[\w-]+\S*|(?:https?://|www\.)[\u4e00-\u9fa5]+\.[\u4e00-\u9fa5]+\S*)\b",
            score=0.9,
        ),
    ],
    # Context words passed to the recognizer alongside the pattern.
    context=["网址", "链接", "网站", "网页"],
)