init datamate
runtime/ops/mapper/__init__.py (new file, 52 lines)
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-

import sys
from pathlib import Path

from datamate.common.utils.custom_importer import CustomImporter


def _configure_importer():
    base_path = Path(__file__).resolve().parent
    sys.meta_path.append(CustomImporter(base_path))


_configure_importer()


def _import_operators():
    # Importing each operator package triggers its OPERATORS.register_module call.
    from . import content_cleaner
    from . import credit_card_number_cleaner
    from . import email_cleaner
    from . import emoji_cleaner
    from . import extra_space_cleaner
    from . import full_width_characters_cleaner
    from . import garble_characters_cleaner
    from . import html_tag_cleaner
    from . import id_number_cleaner
    from . import img_watermark_remove
    from . import invisible_characters_cleaner
    from . import ip_address_cleaner
    from . import legend_cleaner
    from . import phone_number_cleaner
    from . import political_word_cleaner
    from . import sexual_and_violent_word_cleaner
    from . import text_to_word
    from . import traditional_chinese
    from . import unicode_space_cleaner
    from . import url_cleaner
    from . import xml_tag_cleaner
    from . import img_enhanced_brightness
    from . import img_enhanced_contrast
    from . import img_enhanced_saturation
    from . import img_enhanced_sharpness
    from . import img_perspective_transformation
    from . import img_direction_correct
    from . import img_denoise
    from . import img_shadow_remove
    from . import img_type_unify
    from . import img_resize
    from . import remove_duplicate_sentences
    from . import knowledge_relation_slice


_import_operators()
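CustomImporter itself is project-internal and not part of this commit. As a rough illustration of the sys.meta_path hook it plugs into, a minimal stand-in finder could look like the sketch below; the class name, the "ops." prefix, and the file-resolution behavior are assumptions, not the actual DataMate implementation.

# Hypothetical sketch only: CustomImporter's real behavior is not shown in this commit.
import importlib.util
import sys
from importlib.abc import MetaPathFinder
from pathlib import Path


class CustomImporter(MetaPathFinder):
    """Resolve 'ops.*' module paths against a base directory (assumed behavior)."""

    def __init__(self, base_path: Path):
        self.base_path = base_path

    def find_spec(self, fullname, path=None, target=None):
        if not fullname.startswith("ops."):
            return None  # let the default finders handle everything else
        candidate = self.base_path.joinpath(*fullname.split(".")[1:]).with_suffix(".py")
        if candidate.is_file():
            return importlib.util.spec_from_file_location(fullname, candidate)
        return None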
runtime/ops/mapper/content_cleaner/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ContentCleaner',
                          module_path="ops.mapper.content_cleaner.process")
runtime/ops/mapper/content_cleaner/metadata.yml (new file, 16 lines)
@@ -0,0 +1,16 @@
name: '文档目录去除'
name_en: 'Document Contents Removal'
description: '去除文档中的目录。'
description_en: 'Removes tables of contents from documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'ContentCleaner'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: ''
  after: ''
inputs: 'text'
outputs: 'text'
runtime/ops/mapper/content_cleaner/process.py (new file, 64 lines)
@@ -0,0 +1,64 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: removes the table of contents from a document.
Create: 2025/01/13
"""
import re
import time
from typing import Dict, Any

from loguru import logger

from datamate.core.base_op import Mapper


class ContentCleaner(Mapper):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.no_content_count = 3  # consecutive non-TOC lines needed before we assume the body text has started
        # TOC heading, e.g. "目录" or "CONTENTS"
        self.content_text_pattern = r"^ *(目 *录|CONTENT(S)?) *$"
        # TOC entry: prefix format
        self.content_preface_pattern = r"^ *(前言|About This Document|\d+(\.\d+)*|[a-zA-Z]+(\.\d+)*)"
        # TOC entry: middle format (leader dots)
        self.content_middle_pattern = r"\.{7,}"
        # TOC entry: trailing format (page number, Roman numeral, or a broken bookmark marker)
        self.content_end_pattern = r"(\d|错误!未定义书签。|[IXV]+) *$"
        self.content_pattern = self.content_preface_pattern + ".*" + self.content_end_pattern

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._content_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, method: ContentCleaner costs {time.time() - start:6f} s")
        return sample

    def _content_filter(self, input_data: str):
        count = 0  # lines in a row that do not look like TOC entries; after 3 we assume we are in the body text
        # start and end indices of the TOC block
        content_start_index, content_end_index = -1, -1
        lines = input_data.split("\n")
        for i, line in enumerate(lines):
            if content_start_index >= 0 and count >= self.no_content_count:
                break
            # First, look for the TOC heading ("目录" / "CONTENTS").
            if content_start_index < 0 and re.match(self.content_text_pattern, line, re.IGNORECASE):
                content_start_index = i
                content_end_index = i
            # Then match either form of TOC entry:
            # 1. starts and ends with the expected formats; 2. the line contains a run of 7+ dots.
            elif content_start_index >= 0 and (re.match(self.content_pattern, line, re.IGNORECASE)
                                               or re.search(self.content_middle_pattern, line)):
                content_end_index = i
                count = 0
            elif content_start_index >= 0 and not (re.match(self.content_pattern, line, re.IGNORECASE)
                                                   or re.search(self.content_middle_pattern, line)):
                count += 1

        if 0 <= content_start_index < content_end_index:
            res = "\n".join(lines[:content_start_index] + lines[content_end_index + 1:])
        else:
            # Keep the heading when it is the only match; return the original text when no TOC structure is found.
            res = "\n".join(lines)
        return res
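As a quick illustration of how the three patterns cooperate, this sketch applies the same regexes outside the operator framework (the Mapper wiring and the text_key/filename_key plumbing are omitted):

import re

# Patterns copied from ContentCleaner above.
heading = re.compile(r"^ *(目 *录|CONTENT(S)?) *$", re.IGNORECASE)
entry = re.compile(r"^ *(前言|About This Document|\d+(\.\d+)*|[a-zA-Z]+(\.\d+)*).*(\d|错误!未定义书签。|[IXV]+) *$",
                   re.IGNORECASE)
leader_dots = re.compile(r"\.{7,}")

doc = "目录\n1 Introduction .......... 1\n2 Design .......... 5\nBody text starts here."
for line in doc.split("\n"):
    if heading.match(line):
        kind = "heading"
    elif entry.match(line) or leader_dots.search(line):
        kind = "entry"
    else:
        kind = "body"
    print(kind, "|", line)
# heading | 目录 ... entry | 1 Introduction ... body | Body text starts here.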
runtime/ops/mapper/credit_card_number_cleaner/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='AnonymizedCreditCardNumber',
                          module_path="ops.mapper.credit_card_number_cleaner.process")
runtime/ops/mapper/credit_card_number_cleaner/metadata.yml (new file, 16 lines)
@@ -0,0 +1,16 @@
name: '信用卡号匿名化'
name_en: 'Credit Card Number Anonymization'
description: '信用卡号匿名化'
description_en: 'Anonymizes credit card numbers.'
language: 'python'
vendor: 'huawei'
raw_id: 'AnonymizedCreditCardNumber'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: '这个是信用卡号:4111111111111111'
  after: '这个是信用卡号:<credit_card_number>'
inputs: 'text'
outputs: 'text'
runtime/ops/mapper/credit_card_number_cleaner/process.py (new file, 83 lines)
@@ -0,0 +1,83 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: credit card number anonymization
Create: 2024/12/5 15:43
"""
import re
import time
from typing import Dict, Any

from loguru import logger

from datamate.core.base_op import Mapper


class AnonymizedCreditCardNumber(Mapper):
    def __init__(self, *args, **kwargs):
        super(AnonymizedCreditCardNumber, self).__init__(*args, **kwargs)
        self.re_compile = self._get_credit_card_re_compile()

    @staticmethod
    def _verify_credit_card_num(credit_card_num: str):
        """Luhn check for a candidate credit card number."""
        # Walk the digits from right to left.
        digits = [int(x) for x in reversed(credit_card_num) if x.isdigit()]
        # Double every second digit (d * 2).
        even_digits = [d * 2 for d in digits[1::2]]
        # If doubling produced a two-digit number, add its two digits together.
        even_digits = [d // 10 + d % 10 for d in even_digits]
        # Sum the single digits from the previous step.
        even_sum = sum(even_digits)
        # Sum the digits in the odd positions (counted from the right).
        odd_sum = sum(digits[::2])
        # The number is valid when odd_sum + even_sum is divisible by 10.
        if (odd_sum + even_sum) % 10 == 0:
            return True
        return False

    @staticmethod
    def _get_credit_card_re_compile():
        separator_symbol = r"([- ]?)"
        # American Express: 15 digits starting with 34 or 37; format NNNN-NNNNNN-NNNNN or NNNN NNNNNN NNNNN
        american_express = "3[47][0-9]{2}" + separator_symbol + "[0-9]{6}" + separator_symbol + "[0-9]{5}"
        # China UnionPay: 16 digits starting with 62 or 60; format NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN
        china_union_pay = r"(6[02]\d{2})" + r"(%s\d{%d}){%d}" % (separator_symbol, 4, 3)
        # Diners Club: 14 digits starting with 300-305, 36, 38, 39, or 3095; format NNNN-NNNNNN-NNNN or NNNN NNNNNN NNNN
        diners_club = r"(30[0-5]\d|3[689]\d{2}|3095)" + separator_symbol + r"[0-9]{6}" + separator_symbol + r"[0-9]{4}"
        # Discover: 16 digits starting with 6011, 644-649, or 65; format NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN
        discover = r"(64[4-9]\d|65\d{2}|6011)" + r"(%s\d{%d}){%d}" % (separator_symbol, 4, 3)
        # JCB: 16 digits starting with 3528 through 3589; format NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNNNNNN
        # (the alternation is grouped so it stays local to the JCB pattern)
        jcb = r"(352[89]|35[3-8]\d)" + separator_symbol + r"[0-9]{4}" + (
            "(" + r"(%s\d{%d}){%d}" % (separator_symbol, 4, 2) + "|" + separator_symbol + r"[0-9]{8}" + ")")
        # Mastercard: 16 digits starting with 51-55 or 2221-2720; format NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN
        master_card = r"(5[1-5]\d{2}|222[1-9]|22[3-9]\d|2[3-6]\d{2}|27[01]\d|2720)" + r"(%s\d{%d}){%d}" \
            % (separator_symbol, 4, 3)
        # Visa: 16 digits starting with 4; format NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN
        visa = r"4\d{3}" + r"(%s\d{%d}){%d}" % (separator_symbol, 4, 3)

        credit_card_pattern = r"(?<=[^\d])(%s|%s|%s|%s|%s|%s|%s)(?=[^\d])" % (
            american_express, china_union_pay, diners_club,
            discover, jcb, master_card, visa)
        credit_card_re_compile = re.compile(credit_card_pattern)
        return credit_card_re_compile

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._credit_card_number_filter(sample[self.text_key])
        logger.info(
            f"fileName: {sample[self.filename_key]}, method: CreditCardNumberCleaner costs {time.time() - start:6f} s")
        return sample

    def _credit_card_number_filter(self, input_data: str):
        """Anonymize credit card numbers."""
        input_data = ''.join(['【', input_data, '】'])
        # Extract candidate strings that match the credit card regex.
        credit_card_nums = [item.group(1) for item in self.re_compile.finditer(input_data)]
        # Check whether each candidate is a real credit card number.
        for credit_card_num in credit_card_nums:
            if self._verify_credit_card_num(credit_card_num):
                # Replace each valid credit card number with <credit_card_number>.
                credit_card_num_pattern = r"(?<=[^\d]){}(?=[^\d])".format(credit_card_num)
                input_data = re.compile(credit_card_num_pattern).sub("<credit_card_number>", input_data)
        return input_data[1:-1]
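_verify_credit_card_num is the standard Luhn algorithm; a standalone sanity check against the metadata example (4111111111111111, a classic valid test number) might look like:

def luhn_valid(number: str) -> bool:
    digits = [int(c) for c in reversed(number) if c.isdigit()]
    doubled = [sum(divmod(d * 2, 10)) for d in digits[1::2]]  # double, then add the two resulting digits
    return (sum(digits[::2]) + sum(doubled)) % 10 == 0

print(luhn_valid("4111111111111111"))  # True
print(luhn_valid("4111111111111112"))  # False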
runtime/ops/mapper/email_cleaner/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='EmailNumberCleaner',
                          module_path="ops.mapper.email_cleaner.process")
runtime/ops/mapper/email_cleaner/metadata.yml (new file, 16 lines)
@@ -0,0 +1,16 @@
name: '邮件地址匿名化'
name_en: 'Email Address Anonymization'
description: '邮件地址匿名化'
description_en: 'Anonymizes email addresses.'
language: 'python'
vendor: 'huawei'
raw_id: 'EmailNumberCleaner'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: '这个是邮箱号:test_email@gmail.com'
  after: '这个是邮箱号:<email>'
inputs: 'text'
outputs: 'text'
runtime/ops/mapper/email_cleaner/process.py (new file, 47 lines)
@@ -0,0 +1,47 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: email address anonymization
Create: 2025/01/15
"""
import re
import time
from typing import Dict, Any

from email_validator import validate_email, EmailNotValidError
from loguru import logger

from datamate.core.base_op import Mapper


class EmailNumberCleaner(Mapper):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.front_email_pattern = r'(?<=[^0-9a-zA-Z\!\#\$\%\&\'\*\+\-\/\=\?\^\_\`\{\|\}\~\-])'
        self.back_email_pattern = r'(?=[^0-9a-zA-Z\!\#\$\%\&\'\*\+\-\/\=\?\^\_\`\{\|\}\~\-])'
        self.email_pattern = r'([a-zA-Z\d.\-+_]+\s?@\s?[a-zA-Z\d.\-+_]+\.[a-zA-Z0-9]{2,6})'

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._email_number_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, method: EmailCleaner costs {time.time() - start:6f} s")
        return sample

    def _email_number_filter(self, input_data: str):
        """Anonymize email addresses."""
        # Pad with a sentinel character so the look-around assertions also match at the text boundaries.
        mixed_data = ''.join(['龥', input_data, '龥'])
        paired_emails = re.compile(self.front_email_pattern + self.email_pattern + self.back_email_pattern).findall(
            mixed_data)
        if paired_emails:
            for email in paired_emails:
                try:
                    # Validate the email address (syntax only, no deliverability check).
                    validate_email(email, check_deliverability=False)
                    mixed_data = re.compile(self.front_email_pattern + re.escape(email) + self.back_email_pattern).sub(
                        "<email>", mixed_data, count=1)
                except EmailNotValidError as err:
                    # Log that the address is invalid without printing the address itself.
                    logger.error(f"email is abnormal email form: {err}")
        return mixed_data[1:-1]
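A quick standalone check of the extraction regex plus the email_validator gate, using the example from metadata.yml:

import re
from email_validator import validate_email, EmailNotValidError

email_pattern = re.compile(r'([a-zA-Z\d.\-+_]+\s?@\s?[a-zA-Z\d.\-+_]+\.[a-zA-Z0-9]{2,6})')
text = "这个是邮箱号:test_email@gmail.com "
for candidate in email_pattern.findall(text):
    try:
        validate_email(candidate, check_deliverability=False)
        text = text.replace(candidate, "<email>")
    except EmailNotValidError:
        pass  # leave invalid candidates untouched
print(text)  # 这个是邮箱号:<email>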
runtime/ops/mapper/emoji_cleaner/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='EmojiCleaner',
                          module_path="ops.mapper.emoji_cleaner.process")
runtime/ops/mapper/emoji_cleaner/metadata.yml (new file, 16 lines)
@@ -0,0 +1,16 @@
name: '文档表情去除'
name_en: 'Emoticon Removal'
description: '去除文档中表情字符或者emoji符号。'
description_en: 'Removes emoticons or emojis from documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'EmojiCleaner'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: '使用方式很简单,只需要将代码放入Markdown文本中即可,富文本格式可直接复制表情😀使用。'
  after: '使用方式很简单,只需要将代码放入Markdown文本中即可,富文本格式可直接复制表情使用。'
inputs: 'text'
outputs: 'text'
runtime/ops/mapper/emoji_cleaner/process.py (new file, 27 lines)
@@ -0,0 +1,27 @@
"""
Description: removes emoticons and emoji from documents.
Create: 2023/12/7 15:43
"""
import time
from typing import Dict, Any

import emoji
from loguru import logger

from datamate.core.base_op import Mapper


class EmojiCleaner(Mapper):
    @staticmethod
    def _emoji_filter(input_data: str):
        res = []
        for input_s in input_data.split('\n'):
            res.append(emoji.replace_emoji(input_s, replace=''))
        return '\n'.join(res)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._emoji_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, method: EmojiCleaner costs {time.time() - start:6f} s")
        return sample
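emoji.replace_emoji comes from the emoji package (2.x API); a one-line check with the metadata example:

import emoji

print(emoji.replace_emoji("富文本格式可直接复制表情😀使用。", replace=''))
# -> 富文本格式可直接复制表情使用。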
runtime/ops/mapper/extra_space_cleaner/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ExtraSpaceCleaner',
                          module_path="ops.mapper.extra_space_cleaner.process")
runtime/ops/mapper/extra_space_cleaner/metadata.yml (new file, 17 lines)
@@ -0,0 +1,17 @@
name: '多余空格去除'
name_en: 'Redundant Space Removal'
description: '移除文档首尾、句中或标点符号附近多余空格和 tab 等。'
description_en: 'Removes redundant spaces and tabs at the beginning and end of documents,
  in sentences, or near punctuation.'
language: 'python'
vendor: 'huawei'
raw_id: 'ExtraSpaceCleaner'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: ' 人工智能的研究历史有着一条从以“推理”为重 点,到以“知识”为重点,再到以“学习”为重点的自然、清晰的脉络。 '
  after: '人工智能的研究历史有着一条从以“推理”为重点,到以“知识”为重点,再到以“学习”为重点的自然、清晰的脉络。'
inputs: 'text'
outputs: 'text'
runtime/ops/mapper/extra_space_cleaner/process.py (new file, 69 lines)
@@ -0,0 +1,69 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: redundant space removal
Create: 2025/01/13
"""
import re
import time
from pathlib import Path
from typing import Dict, Any

from loguru import logger

from datamate.core.base_op import Mapper


class ExtraSpaceCleaner(Mapper):
    """Removes redundant spaces and blank lines, including leading/trailing spaces and tabs.

    Note: before removing redundant spaces, every space in the document is normalized to U+0020.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Matches the uncommon Unicode space characters that may appear in a document.
        self.white_space_pattern = ('[\u00A0 \u1680 \u2000-\u200D \u2028-\u2029'
                                    ' \u202F \u205F \u3000 \u180E \u2060 \uFEFF]')
        self._file_path = Path(__file__).parent / 'resources' / 'special_token.txt'
        self.escaped_special_chars = self._get_escaped_special_chars()  # load the punctuation list
        # Matches runs of consecutive spaces.
        extra_space_pattern = r" {2,}"
        # Matches mixed runs of spaces and newlines.
        extra_line_pattern = r"( |\n){2,}"
        # Matches redundant spaces between Chinese characters or punctuation marks.
        extra_space_in_chinese_pattern = r"(?<=[\u4e00-\u9fa5" + self.escaped_special_chars + r"]) +(?=[\u4e00-\u9fa5" \
            + self.escaped_special_chars + r"])"
        self.extra_space_re_compile = re.compile(extra_space_pattern)
        self.extra_space_in_chinese_re_compile = re.compile(extra_space_in_chinese_pattern)
        self.extra_line_re_compile = re.compile(extra_line_pattern)
        self.white_space_pattern_compile = re.compile(self.white_space_pattern)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._clean_extra_space(sample[self.text_key])
        logger.info(
            f"fileName: {sample[self.filename_key]}, method: ExtraSpaceCleaner costs {time.time() - start:6f} s")
        return sample

    def _get_escaped_special_chars(self) -> str:
        with open(self._file_path, 'r', encoding='utf-8') as f:
            self._special_token = f.read().splitlines()
        res = ''.join([re.escape(char) for char in self._special_token])  # escape each special character and join into one string
        return res

    def _clean_extra_space(self, input_data: str) -> str:
        # Convert uncommon Unicode spaces, such as U+2008, into a normal half-width space.
        input_data = self.white_space_pattern_compile.sub('\u0020', input_data)
        # Remove redundant spaces and tabs at the document boundaries.
        input_data = input_data.strip()
        # Strip leading/trailing spaces line by line.
        text = "\n".join([line.strip() for line in input_data.split("\n")])
        text = ''.join(['【', text, '】'])
        # Collapse consecutive spaces into a single normal space.
        remove_extra_space = self.extra_space_re_compile.sub("\u0020", text)
        # Remove spaces between Chinese characters and punctuation.
        remove_extra_space_in_chinese = self.extra_space_in_chinese_re_compile.sub("", remove_extra_space)
        # Collapse consecutive newlines.
        remove_duplicate_line = self.extra_line_re_compile.sub("\n", remove_extra_space_in_chinese)
        return remove_duplicate_line[1:-1]
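A compact standalone run of the same normalization pipeline (the special_token handling is omitted, and the character class here is trimmed to a few common Unicode spaces):

import re

text = "\u3000AI 研究  的脉络\u00A0\u00A0很清晰。  \n\n下一段。"
text = re.sub('[\u00A0\u2000-\u200D\u3000\uFEFF]', ' ', text)          # normalize odd Unicode spaces
text = "\n".join(line.strip() for line in text.strip().split("\n"))    # strip each line
text = re.sub(r" {2,}", " ", text)                                     # collapse space runs
text = re.sub(r"(?<=[\u4e00-\u9fa5]) +(?=[\u4e00-\u9fa5])", "", text)  # drop spaces between CJK chars
text = re.sub(r"( |\n){2,}", "\n", text)                               # collapse blank lines
print(text)  # AI 研究的脉络很清晰。\n下一段。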
runtime/ops/mapper/extra_space_cleaner/resources/special_token.txt (new file, 53 lines)
@@ -0,0 +1,53 @@
~
·
!
@
#
¥
%
…
&
*
(
)
—
+
-
=
{
}
|
【
】
、
:
“
”
‘
’
;
《
》
?
,
。
`
!
$
^
(
)
_
[
]
\
:
"
;
'
<
>
?
,
/
.
runtime/ops/mapper/full_width_characters_cleaner/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='FullWidthCharacterCleaner',
                          module_path="ops.mapper.full_width_characters_cleaner.process")
runtime/ops/mapper/full_width_characters_cleaner/metadata.yml (new file, 18 lines)
@@ -0,0 +1,18 @@
name: '全角转半角'
name_en: 'Full-to-Half Width Character'
description: '将文档中的所有全角字符转换成半角字符。'
description_en: 'Converts all full-width characters in documents to half-width characters.'
language: 'python'
vendor: 'huawei'
raw_id: 'FullWidthCharacterCleaner'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: 'Residential and commercial design, site inspections, working drawings,
    Minicad, renderings.'
  after: 'Residential and commercial design, site inspections, working drawings, MiniCad,
    renderings.'
inputs: 'text'
outputs: 'text'
runtime/ops/mapper/full_width_characters_cleaner/process.py (new file, 46 lines)
@@ -0,0 +1,46 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: full-width to half-width character conversion
Create: 2025/01/13
"""
import time
from typing import Dict, Any

from loguru import logger

from datamate.core.base_op import Mapper


class FullWidthCharacterCleaner(Mapper):
    """Converts every full-width character in the document to its half-width equivalent."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._full_to_half_dict = {
            '"': '"', '#': '#', '$': '$', '%': '%', '&': '&', ''': "'", '*': '*', '+': '+',
            '-': '-', '.': '.', '/': '/', '0': '0', '1': '1', '2': '2', '3': '3', '4': '4',
            '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '<': '<', '=': '=', '>': '>',
            '@': '@', 'A': 'A', 'B': 'B', 'C': 'C', 'D': 'D', 'E': 'E', 'F': 'F', 'G': 'G',
            'H': 'H', 'I': 'I', 'J': 'J', 'K': 'K', 'L': 'L', 'M': 'M', 'N': 'N', 'O': 'O',
            'P': 'P', 'Q': 'Q', 'R': 'R', 'S': 'S', 'T': 'T', 'U': 'U', 'V': 'V', 'W': 'W',
            'X': 'X', 'Y': 'Y', 'Z': 'Z', '[': '[', '\': '\\', ']': ']', '^': '^', '_': '_',
            '`': '`', 'a': 'a', 'b': 'b', 'c': 'c', 'd': 'd', 'e': 'e', 'f': 'f', 'g': 'g',
            'h': 'h', 'i': 'i', 'j': 'j', 'k': 'k', 'l': 'l', 'm': 'm', 'n': 'n', 'o': 'o',
            'p': 'p', 'q': 'q', 'r': 'r', 's': 's', 't': 't', 'u': 'u', 'v': 'v', 'w': 'w',
            'x': 'x', 'y': 'y', 'z': 'z', '{': '{', '|': '|', '}': '}', '~': '~'
        }

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._full_width_character_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, "
                    f"method: FullWidthCharactersCleaner costs {time.time() - start:6f} s")
        return sample

    def _full_width_character_filter(self, input_data: str):
        res = []
        for input_str in input_data.split('\n'):
            res.append("".join(self._full_to_half_dict.get(char, char) for char in input_str))
        return '\n'.join(res)
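Full-width forms U+FF01 through U+FF5E sit at a fixed offset of 0xFEE0 from their ASCII counterparts, so the explicit dict above can be sanity-checked against a computed mapping. A sketch, not the operator's actual code path:

def to_half_width(text: str) -> str:
    # Shift full-width ASCII variants (U+FF01-U+FF5E) down by 0xFEE0; map the ideographic space to ' '.
    return "".join(
        chr(ord(c) - 0xFEE0) if 0xFF01 <= ord(c) <= 0xFF5E else (' ' if c == '\u3000' else c)
        for c in text
    )

print(to_half_width("MiniCad,renderings"))  # -> MiniCad,renderings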
runtime/ops/mapper/garble_characters_cleaner/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='GrableCharactersCleaner',
                          module_path="ops.mapper.garble_characters_cleaner.process")
runtime/ops/mapper/garble_characters_cleaner/metadata.yml (new file, 17 lines)
@@ -0,0 +1,17 @@
name: '文档乱码去除'
name_en: 'Garbled Character Removal'
description: '去除文档中的乱码和无意义的unicode。'
description_en: 'Removes garbled characters and meaningless Unicode characters from
  documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'GrableCharactersCleaner'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: '文档乱码����'
  after: '文档乱码'
inputs: 'text'
outputs: 'text'
runtime/ops/mapper/garble_characters_cleaner/process.py (new file, 54 lines)
@@ -0,0 +1,54 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description:
    This operator removes garbled characters from documents.
    Logic:
    1. A regex checks whether each character's Unicode code point falls inside a garbled-character range;
       characters inside a range are removed, all others are kept.
    2. At startup the operator loads charset.json, the configuration of garbled-character ranges. In that
       JSON file each key is a character-set name and each value is a list of code point ranges.

Create: 2025/01/13
"""
import json
import re
import time
from pathlib import Path
from typing import Dict, Any

from loguru import logger

from datamate.core.base_op import Mapper


class GrableCharactersCleaner(Mapper):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._file_path = str(Path(__file__).parent / 'resources' / 'charset.json')
        self.unicode_grable_code_list = self.get_unicode_grable_code_list()  # ranges of garbled Unicode code points
        self.grable_re_compile = re.compile("[" + self.unicode_grable_code_list + "]")

    def get_unicode_grable_code_list(self):
        """Build the character-class body for the garbled-character ranges."""
        res = ""
        with open(self._file_path, 'r', encoding='utf-8') as f:
            charset_number_list = json.load(f)
        for number_ranges in charset_number_list.values():
            for number_range in number_ranges:
                number_range_list = number_range.split(",")
                if len(number_range_list) < 2:
                    logger.error(f"number_range_list size is {len(number_range_list)}, formatting error")
                    continue
                res += number_range_list[0] + "-" + number_range_list[1]
        return res

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._grable_characters_filter(sample[self.text_key])
        logger.info(
            f"fileName: {sample[self.filename_key]}, method: GrableCharactersCleaner costs {time.time() - start:6f} s")
        return sample

    def _grable_characters_filter(self, input_data: str):
        """Remove garbled characters from the document."""
        return self.grable_re_compile.sub("", input_data)
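The compiled pattern is just a character class concatenated from the JSON ranges. A reduced sketch with two of the ranges (the replacement character U+FFFD and the BMP private use area):

import re

ranges = ["\uFFFD,\uFFFD", "\uE000,\uF8FF"]  # subset of charset.json below
body = "".join(lo + "-" + hi for lo, hi in (r.split(",") for r in ranges))
garbled = re.compile("[" + body + "]")
print(garbled.sub("", "文档乱码\uFFFD\uFFFD\uE123"))  # -> 文档乱码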
runtime/ops/mapper/garble_characters_cleaner/resources/charset.json (new file, 24 lines)
@@ -0,0 +1,24 @@
{
  "注音符号东亚": [
    "\u3100,\u312F"
  ],
  "拉丁文补充1": [
    "\u00C0,\u00D6",
    "\u00D8,\u00F6",
    "\u00F8,\u00FF"
  ],
  "拉丁文扩展,A": [
    "\u0100,\u017F"
  ],
  "拉丁文扩展,B": [
    "\u0180,\u024F"
  ],
  "私人使用区域": [
    "\uE000,\uF8FF",
    "\\U000f0000,\\U000ffffd",
    "\\U00100000,\\U0010fffd"
  ],
  "占位符": [
    "\uFFFD,\uFFFD"
  ]
}
runtime/ops/mapper/html_tag_cleaner/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='HtmlTagCleaner',
                          module_path="ops.mapper.html_tag_cleaner.process")
runtime/ops/mapper/html_tag_cleaner/metadata.yml (new file, 16 lines)
@@ -0,0 +1,16 @@
name: 'HTML标签去除'
name_en: 'HTML Tag Removal'
description: '移除文档中HTML标签,如 <html>、<div>、<p> 等。'
description_en: 'Removes HTML tags from documents, such as <html>, <div>, and <p>.'
language: 'python'
vendor: 'huawei'
raw_id: 'HtmlTagCleaner'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: '<p><b>机器学习</b>是<a href="/wiki/%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD" title="人工智能">人工智能</a>的一个分支。</p>'
  after: '机器学习是人工智能的一个分支。'
inputs: 'text'
outputs: 'text'
runtime/ops/mapper/html_tag_cleaner/process.py (new file, 80 lines)
@@ -0,0 +1,80 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: HTML tag removal operator
Create: 2025/01/13
"""
import re
import time
from typing import List, Dict, Any

from loguru import logger

from datamate.core.base_op import Mapper


class HtmlTagCleaner(Mapper):
    """Removes HTML tags such as <html>, <div>, and <p> from documents; XML documents are left untouched."""
    tag_list = [
        '<a>', '<abbr>', '<acronym>', '<address>', '<applet>', '<area>', '<article>', '<aside>',
        '<audio>', '<b>', '<base>', '<basefont>', '<bdi>', '<bdo>', '<bgsound>', '<big>', '<blink>',
        '<blockquote>', '<body>', '<br>', '<button>', '<canvas>', '<caption>', '<center>', '<cite>',
        '<code>', '<col>', '<colgroup>', '<command>', '<content>', '<data>', '<datalist>', '<dd>',
        '<del>', '<details>', '<dfn>', '<dialog>', '<dir>', '<div>', '<dl>', '<dt>', '<em>',
        '<embed>', '<fieldset>', '<figcaption>', '<figure>', '<font>', '<footer>', '<form>', '<frame>',
        '<frameset>', '<h1>', '<h2>', '<h3>', '<h4>', '<h5>', '<h6>', '<head>', '<header>', '<hgroup>',
        '<hr>', '<html>', '<i>', '<iframe>', '<image>', '<img>', '<input>', '<ins>', '<isindex>',
        '<kbd>', '<keygen>', '<label>', '<legend>', '<li>', '<link>', '<listing>', '<main>', '<map>',
        '<mark>', '<marquee>', '<menu>', '<menuitem>', '<meta>', '<meter>', '<nav>', '<nobr>', '<noembed>',
        '<noframes>', '<noscript>', '<object>', '<ol>', '<optgroup>', '<option>', '<output>', '<p>',
        '<param>', '<picture>', '<plaintext>', '<pre>', '<progress>', '<q>', '<rp>', '<rt>', '<rtc>',
        '<ruby>', '<s>', '<samp>', '<script>', '<section>', '<select>', '<shadow>', '<small>',
        '<source>', '<spacer>', '<span>', '<strike>', '<strong>', '<style>', '<sub>', '<summary>',
        '<sup>', '<template>', '<textarea>', '<tfoot>', '<thead>', '<time>', '<title>', '<track>', '<tt>', '<u>',
        '<ul>', '<var>', '<video>', '<wbr>', '<xmp>'
    ]
    preserved_attr_list = ['colspan', 'rowspan']  # tag attributes that must be kept

    @staticmethod
    def _remove_specified_tags(input_data: str, specified_tags: List):
        """Remove the specified HTML tags together with their attributes."""
        html_tag_pattern = '|'.join(
            map(lambda tag: rf'{re.escape(tag[:-1])}(\s[^>]*)?>|</{re.escape(tag[1:-1])}>', specified_tags))
        cleaned_text = re.sub(html_tag_pattern, '', input_data, flags=re.IGNORECASE)
        return cleaned_text

    @staticmethod
    def _remove_tag_attributes(input_data: str, preserved_attrs: List):
        """Remove attributes inside HTML tags while keeping the specified ones."""
        tag_pattern = r'<(\w+)(\s+[^<>]*?)?>'
        attr_pattern = r'\s*(\w+)="([^"]+)"'

        def __remove_unwanted_attrs(m):
            def __remove_attrs(x):
                if x.group(1) in preserved_attrs:
                    return x.group(0)
                else:
                    return ''

            return re.sub(attr_pattern, __remove_attrs, m.group(0))

        cleaned_text = re.sub(tag_pattern, __remove_unwanted_attrs, input_data)
        return cleaned_text

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        if sample[self.filetype_key] != "xml":
            sample[self.text_key] = self._remove_html_tags(sample[self.text_key])
            logger.info(
                f"fileName: {sample[self.filename_key]}, method: HtmlTagCleaner costs {time.time() - start:6f} s")
        else:
            logger.info(f"fileName: {sample[self.filename_key]}, method: HtmlTagCleaner, The file is xml!")
        return sample

    def _remove_html_tags(self, input_data: str):
        # Remove common HTML tags and their attributes (excluding <table>, <tbody>, <tr>, <td>, <th>).
        cleaned_text = self._remove_specified_tags(input_data, self.tag_list)
        # Strip attributes inside table tags (except colspan and rowspan), e.g. <td class="td8" rowspan="3"> -> <td rowspan="3">
        cleaned_text = self._remove_tag_attributes(cleaned_text, self.preserved_attr_list)
        return cleaned_text
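Applying the same per-tag regex on the metadata example (with a shortened href) shows the intended effect; a sketch with the framework plumbing omitted:

import re

text = '<p><b>机器学习</b>是<a href="/wiki/x" title="人工智能">人工智能</a>的一个分支。</p>'
for tag in ('<p>', '<b>', '<a>'):
    # Same construction as _remove_specified_tags: opening tag with optional attributes, or closing tag.
    pattern = rf'{re.escape(tag[:-1])}(\s[^>]*)?>|</{re.escape(tag[1:-1])}>'
    text = re.sub(pattern, '', text, flags=re.IGNORECASE)
print(text)  # -> 机器学习是人工智能的一个分支。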
runtime/ops/mapper/id_number_cleaner/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='AnonymizedIdNumber',
                          module_path="ops.mapper.id_number_cleaner.process")
runtime/ops/mapper/id_number_cleaner/metadata.yml (new file, 16 lines)
@@ -0,0 +1,16 @@
name: '身份证号匿名化'
name_en: 'ID Card Number Anonymization'
description: '身份证号匿名化。'
description_en: 'Anonymizes ID card numbers.'
language: 'python'
vendor: 'huawei'
raw_id: 'AnonymizedIdNumber'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: '这个是身份证号110101190001011009'
  after: '这个是身份证号<id>'
inputs: 'text'
outputs: 'text'
runtime/ops/mapper/id_number_cleaner/process.py (new file, 116 lines)
@@ -0,0 +1,116 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: ID card number anonymization operator
Create: 2024/12/5 15:43
"""
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, Any

import pytz
from loguru import logger

from datamate.core.base_op import Mapper


class AnonymizedIdNumber(Mapper):
    def __init__(self, *args, **kwargs):
        super(AnonymizedIdNumber, self).__init__(*args, **kwargs)
        self.id_number_re_compile = self.get_id_number_re_compile()
        self.id_coefficient = (7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2)
        self.id_verification = ("1", "0", "X", "9", "8", "7", "6", "5", "4", "3", "2")
        self.area_code_enum = self.load_code_list()

    @staticmethod
    def get_id_number_re_compile():
        """Compile the regex for Chinese ID card numbers."""
        # A Chinese ID number has 18 digits: 1-2 province, 3-4 city, 5-6 county/district,
        # 7-14 date of birth, and the last digit is a checksum; the pattern below is strict.
        id_card_pattern = r'(?<=[^0-9])' \
                          r'((1[1-5]|2[1-3]|3[1-7]|4[1-6]|5[0-4]|6[1-5]|71|81|82)' \
                          r'(0[0-9]|1[0-9]|2[0-9]|3[0-4]|4[0-3]|5[1-3]|90)' \
                          r'(0[0-9]|1[0-9]|2[0-9]|3[0-9]|4[0-3]|5[1-7]|6[1-4]|7[1-4]|8[1-7])' \
                          r'(18|19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])' \
                          r'\d{3}[0-9xX])' \
                          r'(?=[^0-9xX])'
        return re.compile(id_card_pattern)

    @staticmethod
    def load_code_list():
        """Load the area code table."""
        area_code_enum_path = str(Path(__file__).parent / 'resources' / 'area_code_enum.txt')
        with open(area_code_enum_path, 'r', encoding='utf-8') as f:
            area_code_list = set(f.read().splitlines())
        return area_code_list

    @staticmethod
    def _verify_birthday_code(birthday_code: str):
        """Check whether the 8-digit date-of-birth segment is valid."""
        year = int(birthday_code[:4])
        month = int(birthday_code[4:6])
        day = int(birthday_code[6:8])
        date_string = "{}-{}-{}".format(year, month, day)
        date_format = "%Y-%m-%d"
        try:
            # Parse the date string.
            date = datetime.strptime(date_string, date_format)
            # Localize to the Shanghai time zone.
            china_tz = pytz.timezone("Asia/Shanghai")
            china_date = china_tz.localize(date)
            # Get the current time.
            current_date = datetime.now(china_tz)
            # A date of birth later than the current time is invalid.
            return china_date <= current_date
        except ValueError:
            return False

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._id_number_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, method: IDNumberCleaner costs {time.time() - start:6f} s")
        return sample

    def _verify_area_code(self, area_code: str):
        """Check whether the 6-digit area code is valid."""
        return area_code in self.area_code_enum

    def _verify_verification_code(self, id_number: str):
        """Validate the ID number's checksum digit."""
        verify_num = id_number[-1]
        # Multiply the first 17 digits by their weights (self.id_coefficient) and sum the products.
        id_sum = sum([int(num) * coe for num, coe in zip(id_number[:-1], self.id_coefficient)])
        # The sum modulo 11 indexes the expected check character, which must equal the last character.
        return verify_num.upper() == self.id_verification[id_sum % 11].upper()

    def _verify_id_number(self, id_number: str):
        """Main validity check for an ID number."""
        return self._verify_verification_code(id_number) and \
            self._verify_birthday_code(id_number[6:14]) and \
            self._verify_area_code(id_number[:6])

    def _verify_similar_id_number(self, id_number: str):
        """Loosely match ID-like strings without strict validity verification."""
        if len(id_number) != 18:
            return False
        if not id_number[:17].isdigit():
            return False
        last_char = id_number[-1].upper()
        return last_char in set('0123456789X')

    def _id_number_filter(self, input_data: str):
        """Anonymize ID card numbers."""
        input_data = ''.join(['【', input_data, '】'])
        # Extract candidate strings that match the ID number regex.
        id_nums = [item.group(1) for item in self.id_number_re_compile.finditer(input_data)]
        # Check whether each candidate is a real ID number.
        for id_num in id_nums:
            if self._verify_id_number(id_num) or self._verify_similar_id_number(id_num):
                # Replace each valid ID number with <id>.
                id_num_pattern = r"(?<=[^0-9]){}(?=[^0-9xX])".format(id_num)
                input_data = re.compile(id_num_pattern).sub("<id>", input_data)
        return input_data[1:-1]
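The checksum is the standard GB 11643 scheme: weight the first 17 digits, take the sum mod 11, and look up the check character. Verifying the metadata example by hand:

weights = (7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2)
check_chars = "10X98765432"  # same table as self.id_verification

id_number = "110101190001011009"  # example from metadata.yml
total = sum(int(d) * w for d, w in zip(id_number[:17], weights))
print(check_chars[total % 11] == id_number[-1])  # True: 58 % 11 == 3 -> '9'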
runtime/ops/mapper/id_number_cleaner/resources/area_code_enum.txt (new file, 3264 lines)
(diff suppressed: file too large)
runtime/ops/mapper/img_denoise/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ImgDenoise',
                          module_path="ops.mapper.img_denoise.process")
runtime/ops/mapper/img_denoise/metadata.yml (new file, 17 lines)
@@ -0,0 +1,17 @@
name: '图片噪点去除'
name_en: 'Image Noise Removal'
description: '去除图片中的噪点,主要适用于自然场景。'
description_en: 'Removes noise from images, which is mainly applicable to natural
  scenery image scenarios.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgDenoise'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'image'
effect:
  before: ''
  after: ''
inputs: 'image'
outputs: 'image'
runtime/ops/mapper/img_denoise/process.py (new file, 60 lines)
@@ -0,0 +1,60 @@
# -- encoding: utf-8 --

"""
Description: image denoising operator
Create: 2025/01/17
"""
import time
from typing import Dict, Any

import cv2
import numpy as np
from loguru import logger

from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper


class ImgDenoise(Mapper):
    def __init__(self, *args, **kwargs):
        super(ImgDenoise, self).__init__(*args, **kwargs)
        self._denoise_threshold = kwargs.get("denoise_threshold", 8)

    @staticmethod
    def _denoise_image(data: object):
        """Apply median-filter denoising."""
        return cv2.medianBlur(data, 3)

    def execute(self, sample: Dict[str, Any]):
        start = time.time()

        img_bytes = sample[self.data_key]

        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
        if img_bytes:
            data = bytes_transform.bytes_to_numpy(img_bytes)
            denoise_images = self._denoise_images_filter(data, file_name)
            sample[self.data_key] = bytes_transform.numpy_to_bytes(denoise_images, file_type)
        logger.info(f"fileName: {file_name}, method: ImgDenoise costs {time.time() - start:6f} s")
        return sample

    def _denoise_images_filter(self, ori_img, file_name):
        # Build a denoised copy of the original image.
        clean_data = self._denoise_image(ori_img)
        # Resize both images to a common size so they can be compared;
        # cast to float to avoid uint8 overflow in the arithmetic below.
        ori = cv2.resize(ori_img, (112, 112)).astype(np.float64)
        dst = cv2.resize(clean_data, (112, 112)).astype(np.float64)
        # Total signal energy of the original image.
        signal = np.sum(ori ** 2)
        # Energy of the difference between the original and denoised images.
        noise = np.sum((ori - dst) ** 2)
        # Signal-to-noise ratio (SNR) of the image.
        snr = 10 * np.log10(signal / noise)
        # Images below the SNR threshold are replaced with their denoised version.
        if snr < self._denoise_threshold:
            logger.info(f"The image SNR is {snr}, which is below the threshold of "
                        f"{self._denoise_threshold}. {file_name} is denoised.")
            return clean_data
        return ori_img
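A standalone version of the SNR gate, run on a synthetic noisy image (assumes only cv2 and numpy):

import cv2
import numpy as np

rng = np.random.default_rng(0)
img = np.full((112, 112), 128, dtype=np.uint8)
noisy = np.clip(img + rng.normal(0, 25, img.shape), 0, 255).astype(np.uint8)

clean = cv2.medianBlur(noisy, 3)
a = noisy.astype(np.float64)
b = clean.astype(np.float64)
snr = 10 * np.log10(np.sum(a ** 2) / np.sum((a - b) ** 2))
print(f"SNR = {snr:.2f} dB; denoise = {snr < 8}")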
runtime/ops/mapper/img_direction_correct/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ImgDirectionCorrect',
                          module_path="ops.mapper.img_direction_correct.process")
runtime/ops/mapper/img_direction_correct/base_model.py (new file, 38 lines)
@@ -0,0 +1,38 @@
# -- encoding: utf-8 --

import gc
import os
from argparse import Namespace
from pathlib import Path


class BaseModel:

    def __init__(self, model_type='vertical'):
        models_path = os.getenv("MODELS_PATH", "/home/models")
        self.resources_path = str(Path(models_path, 'img_direction_correct', 'resources'))
        args = Namespace()
        args.cls_image_shape = '3, 224, 224'
        args.cls_batch_num = 6
        args.cls_thresh = 0.9
        args.use_onnx = False
        args.use_gpu = False
        args.use_npu = False
        args.use_xpu = False
        args.enable_mkldnn = False
        if model_type == 'vertical':
            args.cls_model_dir = str(Path(self.resources_path, 'vertical_model'))
            self.model_name = 'vertical model to detect image 0 or 90 rotated'
            args.label_list = ['0', '90']
        else:
            args.cls_model_dir = str(Path(self.resources_path, 'standard_model'))
            self.model_name = 'standard model to detect image 0 or 180 rotated'
            args.label_list = ['0', '180']

        from paddleocr.tools.infer.predict_cls import TextClassifier
        self.infer = TextClassifier(args)

    def __del__(self):
        del self.infer
        gc.collect()
runtime/ops/mapper/img_direction_correct/metadata.yml (new file, 17 lines)
@@ -0,0 +1,17 @@
name: '图片方向校正'
name_en: 'Image Orientation Correction'
description: '将含有文字的图片校正到文字水平方向,主要适用于文档场景。'
description_en: 'Corrects images to ensure text is presented horizontally, which is
  mainly applicable to document scenarios.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgDirectionCorrect'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'image'
effect:
  before: ''
  after: ''
inputs: 'image'
outputs: 'image'
runtime/ops/mapper/img_direction_correct/process.py (new file, 139 lines)
@@ -0,0 +1,139 @@
# -- encoding: utf-8 --

"""
Description: image orientation correction operator
Create: 2024/1/30 9:26
"""
import math
import time
from typing import Dict, Any

import cv2
import numpy as np
from loguru import logger

from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper

from .base_model import BaseModel


class ImgDirectionCorrect(Mapper):
    def __init__(self, *args, **kwargs):
        super(ImgDirectionCorrect, self).__init__(*args, **kwargs)
        self.img_resize = 1000
        self.limit_size = 30000
        self.use_model = True
        self.vertical_model, self.standard_model = self.get_model(*args, **kwargs)

    @staticmethod
    def _detect_angle(img):
        """Detect the skew angle of an image."""
        # Convert to a single grayscale channel, e.g. [[255 255], [255 255]].
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Invert black and white.
        gray = cv2.bitwise_not(gray)
        # Binarize.
        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
        # Find the rows and columns of all non-zero points.
        ys, xs = np.where(thresh > 0)
        # Stack them into coordinates such as [[306 37], [306 38], [307 38]]: every non-zero pixel.
        coords = np.column_stack([xs, ys])
        # Get the minimum-area bounding rectangle: (center, (width, height), angle).
        rect = cv2.minAreaRect(coords)
        # minAreaRect returns an angle in [0, 90], measured against the nearest axis; room for refinement later.
        # Keeping the correction within 45 degrees needs less padding, which helps recognition.
        angle = rect[-1]  # the last element is the angle
        # At most 45 degrees: rotate counterclockwise by that angle.
        if angle <= 45.0:
            return angle
        # More than 45 degrees: rotate clockwise by (90 - angle).
        return angle - 90

    @staticmethod
    def _detect_direction(image, file_name, model):
        """
        Args:
            image: the image to classify
            file_name: file name
            model: the model to use, vertical_model or standard_model
        Returns: the rotated image
        """
        # cls_res is the model prediction, shaped like: [('90', 0.9815167)]
        _, cls_res, _ = model.infer([image])
        rotate_angle = int(cls_res[0][0])
        pro = float(cls_res[0][1])
        logger.info(
            f"fileName: {file_name}, model {model.model_name} detect result is {rotate_angle} with confidence {pro}")
        if rotate_angle == 90 and pro > 0.89:
            return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
        if rotate_angle == 180 and pro > 0.89:
            return cv2.rotate(image, cv2.ROTATE_180)
        return image

    @staticmethod
    def _rotate_bound(image, angle):
        """Rotate the image by the detected skew angle.
        Args:
            image: the image to process
            angle: the skew angle found by _detect_angle, in degrees
        """
        if angle == 0.0:
            return image
        # Get width and height.
        h, w = image.shape[:2]
        # math.sin/cos expect radians, so convert the degree angle first.
        sinval = math.fabs(math.sin(math.radians(angle)))
        cosval = math.fabs(math.cos(math.radians(angle)))
        dx = max(int((w * cosval + h * sinval - w) / 2), 0)
        dy = max(int((w * sinval + h * cosval - h) / 2), 0)
        dst_img = cv2.copyMakeBorder(image, dy, dy, dx, dx, cv2.BORDER_CONSTANT, value=(255, 255, 255))
        h, w = dst_img.shape[:2]
        rotated_matrix = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1.0)
        dst_img = cv2.warpAffine(dst_img, rotated_matrix, (w, h), borderValue=(255, 255, 255))
        return dst_img

    def init_model(self, *args, **kwargs):
        return BaseModel(model_type='vertical'), BaseModel(model_type='standard')

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
        img_bytes = sample[self.data_key]
        if img_bytes:
            data = bytes_transform.bytes_to_numpy(img_bytes)
            correct_data = self._img_direction_correct(data, file_name, self.vertical_model, self.standard_model)
            sample[self.data_key] = bytes_transform.numpy_to_bytes(correct_data, file_type)
        logger.info(f"fileName: {file_name}, method: ImgDirectionCorrect costs {time.time() - start:6f} s")
        return sample

    def _img_direction_correct(self, img, file_name, vertical_model, standard_model):
        height, width = img.shape[:2]
        if max(height, width) > self.limit_size:
            logger.info(
                f"fileName: {file_name}, method: ImgDirectionCorrect cannot process pixels number larger than 30000")
            return img
        detect_angle_img = self._resize(img)
        # Detect the skew angle.
        angle = self._detect_angle(detect_angle_img)
        # Normalize the image to a 0/90/180/270-degree orientation.
        rotated_img = self._rotate_bound(img, angle)
        # Vertical/horizontal step: a binary model detects 0 vs 90 degrees, reducing the problem to 0 vs 180.
        rotated_img = self._detect_direction(rotated_img, file_name, vertical_model)
        # 0-vs-180 step: a binary model detects 0 vs 180 degrees and fixes upside-down images.
        rotated_img = self._detect_direction(rotated_img, file_name, standard_model)
        return rotated_img

    def _resize(self, image):
        height, width = image.shape[:2]  # original horizontal and vertical dimensions
        temp = max(height, width)
        # Downscale the image if its longest side exceeds the limit; otherwise return it unchanged.
        if temp >= self.img_resize:
            mul_temp = temp / self.img_resize
            if height > width:
                return cv2.resize(image, (int(width / mul_temp), self.img_resize), interpolation=cv2.INTER_AREA)
            elif height < width:
                return cv2.resize(image, (self.img_resize, int(height / mul_temp)), interpolation=cv2.INTER_AREA)
            else:
                return cv2.resize(image, (self.img_resize, self.img_resize), interpolation=cv2.INTER_AREA)
        return image
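The skew estimate is the classic minAreaRect trick: threshold, collect the ink pixels, and read the bounding rectangle's angle. A standalone sketch on a synthetic image (points cast to float32, which cv2.minAreaRect expects):

import cv2
import numpy as np

img = np.full((200, 200), 255, dtype=np.uint8)
cv2.line(img, (20, 150), (180, 120), 0, 3)  # a slightly tilted "text line"

inverted = cv2.bitwise_not(img)
thresh = cv2.threshold(inverted, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
ys, xs = np.where(thresh > 0)
coords = np.column_stack([xs, ys]).astype(np.float32)
angle = cv2.minAreaRect(coords)[-1]
skew = angle if angle <= 45.0 else angle - 90  # same normalization as _detect_angle
print(f"estimated skew: {skew:.1f} degrees")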
runtime/ops/mapper/img_enhanced_brightness/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ImgBrightness',
                          module_path="ops.mapper.img_enhanced_brightness.process")
runtime/ops/mapper/img_enhanced_brightness/metadata.yml (new file, 16 lines)
@@ -0,0 +1,16 @@
name: '图片亮度增强'
name_en: 'Image Brightness Enhancement'
description: '自适应调节图片的亮度。'
description_en: 'Adaptively adjusts image brightness.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgBrightness'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'image'
effect:
  before: ''
  after: ''
inputs: 'image'
outputs: 'image'
runtime/ops/mapper/img_enhanced_brightness/process.py (new file, 100 lines)
@@ -0,0 +1,100 @@
# -- encoding: utf-8 --

"""
Description: image brightness enhancement operator.
Create: 2025/01/13
"""
import time
from typing import Dict, Any

import cv2
import numpy as np
from loguru import logger

from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper


class ImgBrightness(Mapper):
    """Adaptive image brightness enhancement."""

    def __init__(self, *args, **kwargs):
        super(ImgBrightness, self).__init__(*args, **kwargs)
        # Adaptive enhancement parameters (none of these are exposed as operator arguments).
        self.factor_threshold = 1.1  # lower bound of the enhancement factor
        self.standard_mean = 140  # target mean brightness after enhancement
        self.gamma = 1.5  # gamma coefficient for gamma correction: > 1 brightens the image, < 1 darkens it
        self.brightness_upper_bound = 0.35  # above this brightness ratio, linear enhancement is used instead
        self.eps = 1  # small constant guarding against division by zero on all-black images

    @staticmethod
    def _get_grey_mean(src: np.ndarray):
        gray_image = cv2.cvtColor(src, cv2.COLOR_BGR2GRAY)
        return np.mean(gray_image)

    @staticmethod
    def _return_gamma_table(gamma):
        """Return the lookup table for gamma correction."""
        scale = np.power(255, 1 - gamma).astype(np.float64)
        return np.power(np.arange(256), gamma) * scale

    @staticmethod
    def _return_linear_table(factor):
        """Return the lookup table for the linear transform."""
        linear_table = np.arange(256) * factor
        return np.clip(linear_table, 0, 255).astype(np.uint8)

    def enhance_brightness_linear(self, image_data: np.ndarray, file_name):
        average_brightness = self._get_grey_mean(image_data)
        brightness_factor = self.standard_mean / (average_brightness + self.eps)

        # The image is already bright enough; no enhancement needed.
        if brightness_factor <= 1:
            logger.info(f"fileName: {file_name}, method: ImgBrightness not need enhancement")
            return image_data

        brightness_factor = max(brightness_factor, self.factor_threshold)
        linear_table = ImgBrightness._return_linear_table(brightness_factor)
        cv2.LUT(image_data, linear_table, dst=image_data)
        return image_data

    def enhance_brightness(self, image_data: np.ndarray, file_name):
        """
        Adaptive brightness enhancement.

        Args:
            image_data: image as an np.ndarray
        Returns:
            the brightness-enhanced image

        The gamma factor (1.5 is a common empirical value) is stored as the member self.gamma.
        """
        # Compute the mean brightness of the image.
        average_brightness = self._get_grey_mean(image_data)

        # Dark images get gamma correction.
        if average_brightness / 255 <= self.brightness_upper_bound:
            # Precompute the lookup table.
            gamma_table = ImgBrightness._return_gamma_table(1 / self.gamma).astype(np.uint8)
            cv2.LUT(image_data, gamma_table, dst=image_data)

        # Above the gamma-correction bound, fall back to linear enhancement.
        else:
            image_data = self.enhance_brightness_linear(image_data, file_name)

        return image_data

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
        img_bytes = sample[self.data_key]
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
        if img_bytes:
            # Enhance the image.
            img_data = bytes_transform.bytes_to_numpy(img_bytes)
            img_data = self.enhance_brightness(img_data, file_name)
            sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
        logger.info(f"fileName: {file_name}, method: ImgBrightness costs {time.time() - start:6f} s")
        return sample
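Gamma correction through a 256-entry lookup table is cheap because cv2.LUT maps every pixel in a single pass; a minimal standalone version of the same idea:

import cv2
import numpy as np

gamma = 1.5  # applying 1/gamma below brightens dark pixels
table = (np.power(np.arange(256) / 255.0, 1 / gamma) * 255).astype(np.uint8)

dark = np.full((4, 4, 3), 40, dtype=np.uint8)
bright = cv2.LUT(dark, table)
print(dark[0, 0, 0], "->", bright[0, 0, 0])  # 40 -> 74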
runtime/ops/mapper/img_enhanced_contrast/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ImgContrast',
                          module_path="ops.mapper.img_enhanced_contrast.process")
runtime/ops/mapper/img_enhanced_contrast/metadata.yml (new file, 16 lines)
@@ -0,0 +1,16 @@
name: '图片对比度增强'
name_en: 'Image Contrast Enhancement'
description: '自适应调节图片的对比度。'
description_en: 'Adaptively adjusts the image contrast.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgContrast'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'image'
effect:
  before: ''
  after: ''
inputs: 'image'
outputs: 'image'
71
runtime/ops/mapper/img_enhanced_contrast/process.py
Normal file
71
runtime/ops/mapper/img_enhanced_contrast/process.py
Normal file
@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-

"""
Description: Adaptive image contrast enhancement
Create: 2025/01/13
"""

import time
from typing import Dict, Any

import cv2
import numpy as np
from loguru import logger

from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper


class ImgContrast(Mapper):
    """Adaptive image contrast enhancement"""

    def __init__(self, *args, **kwargs):
        super(ImgContrast, self).__init__(*args, **kwargs)
        # Adaptive enhancement parameters
        self.clip_limit = 2  # Contrast-limiting threshold; larger values produce a stronger enhancement (not passed in as a parameter).
        self.tile_grid = 16  # Tile grid size; smaller tiles give a more local equalization effect (not passed in as a parameter).
        self.standard_mean = 100  # Target mean contrast after enhancement (not passed in as a parameter).
        self.eps = 0.5  # Small constant that prevents division by zero on all-black images when computing the enhancement factor (not passed in as a parameter).

    @staticmethod
    def _get_contrast(image: np.ndarray):
        """Compute the mean standard deviation across all channels"""
        _, stddev = cv2.meanStdDev(image)
        contrast_std = np.mean(stddev)
        return contrast_std

    def enhance_contrast(self, image_data: np.ndarray, file_name):
        """Adaptive contrast enhancement"""
        contrast_std = self._get_contrast(image_data)
        contrast_factor = self.standard_mean / (contrast_std + self.eps)

        # The image contrast is already high; no enhancement needed
        if contrast_factor <= 1:
            logger.info(f"fileName: {file_name}, method: ImgContrast not need enhancement")
            return image_data
        # Convert the color image to the Lab color space
        cv2.cvtColor(image_data, cv2.COLOR_BGR2Lab, dst=image_data)

        # Adjust contrast with contrast-limited adaptive histogram equalization (CLAHE)
        clahe = cv2.createCLAHE(clipLimit=self.clip_limit, tileGridSize=(self.tile_grid, self.tile_grid))
        image_data[:, :, 0] = clahe.apply(image_data[:, :, 0])

        # Convert the enhanced Lab image back to the BGR color space
        cv2.cvtColor(image_data, cv2.COLOR_Lab2BGR, dst=image_data)
        return image_data

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
        img_bytes = sample[self.data_key]
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
        if img_bytes:
            # Apply the image enhancement
            img_data = bytes_transform.bytes_to_numpy(img_bytes)
            img_data = self.enhance_contrast(img_data, file_name)
            sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
        logger.info(f"fileName: {file_name}, method: ImgContrast costs {time.time() - start:.6f} s")
        return sample
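Note: the Lab-space CLAHE flow above is easy to exercise outside the pipeline. A minimal standalone sketch, assuming only OpenCV and a local test.jpg (both hypothetical stand-ins, not part of this change):

import cv2

img = cv2.imread("test.jpg")                      # hypothetical input path
lab = cv2.cvtColor(img, cv2.COLOR_BGR2Lab)        # equalize lightness only, keep color
clahe = cv2.createCLAHE(clipLimit=2, tileGridSize=(16, 16))
lab[:, :, 0] = clahe.apply(lab[:, :, 0])
cv2.imwrite("test_clahe.jpg", cv2.cvtColor(lab, cv2.COLOR_Lab2BGR))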
6
runtime/ops/mapper/img_enhanced_saturation/__init__.py
Normal file
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ImgSaturation',
                          module_path="ops.mapper.img_enhanced_saturation.process")
17
runtime/ops/mapper/img_enhanced_saturation/metadata.yml
Normal file
@@ -0,0 +1,17 @@
name: '图片饱和度增强'
name_en: 'Image Saturation Enhancement'
description: '自适应调节图片的饱和度,主要适用于自然场景图片。'
description_en: 'Adaptively adjusts image saturation; mainly applicable to natural-scene images.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgSaturation'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'image'
effect:
  before: ''
  after: ''
inputs: 'image'
outputs: 'image'
81
runtime/ops/mapper/img_enhanced_saturation/process.py
Normal file
@@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-

"""
Description: Adaptive image saturation enhancement
Create: 2025/01/13
"""

import time
from typing import Dict, Any

import cv2
import numpy as np
from loguru import logger

from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper


class ImgSaturation(Mapper):
    """Adaptive image saturation enhancement"""

    def __init__(self, *args, **kwargs):
        super(ImgSaturation, self).__init__(*args, **kwargs)
        # Adaptive enhancement parameters
        self.factor_threshold = 1.1  # Lower bound of the enhancement factor (not passed in as a parameter).
        self.standard_mean = 130  # Target mean saturation after enhancement (not passed in as a parameter).
        self.eps = 1  # Small constant that prevents division by zero on all-black images when computing the enhancement factor (not passed in as a parameter).
        self.zeros_ratio_threshold = 0.1  # Zero-value ratio of the saturation channel; guards near-grayscale images against processing.
        self.red_channel_threshold = 140  # Red-channel threshold used to damp the saturation factor.

    def enhance_saturation(self, image_data: np.ndarray, file_name):
        """Adaptive saturation enhancement"""
        # Convert the image to the HSV color space
        image_hsv = cv2.cvtColor(image_data, cv2.COLOR_BGR2HSV)
        s_channel = image_hsv[:, :, 1].copy()
        del image_hsv

        # Extract the saturation channel.
        # For a normal RGB image the zero-value ratio should be below 0.1;
        # above that, the image can be treated as effectively grayscale
        zero_s_ratio = np.count_nonzero(s_channel == 0) / s_channel.size
        if zero_s_ratio <= self.zeros_ratio_threshold:
            saturation_channel = s_channel
        # An RGB image converted from grayscale has an all-zero S channel in HSV
        else:
            return image_data

        # Compute the saturation statistics
        saturation_mean = np.mean(saturation_channel)
        saturation_factor = self.standard_mean / (saturation_mean + self.eps)

        # The image saturation is already high; no enhancement needed
        if saturation_factor <= 1:
            logger.info(f"fileName: {file_name}, method: ImgSaturation not need enhancement")
            return image_data

        # If the mean of the red channel is too large, cap the saturation factor;
        # otherwise the image turns reddish and the colors get distorted.
        red_channel_mean = np.mean(image_data[:, :, 2])
        if red_channel_mean >= self.red_channel_threshold:
            saturation_factor = min(saturation_factor, 1.5)
        else:
            saturation_factor = max(saturation_factor, self.factor_threshold)

        degrade_image = cv2.cvtColor(image_data, cv2.COLOR_BGR2GRAY)
        degrade_image = cv2.cvtColor(degrade_image, cv2.COLOR_GRAY2BGR)
        cv2.addWeighted(image_data, saturation_factor, degrade_image, 1 - saturation_factor, 0, dst=image_data)
        return image_data

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
        img_bytes = sample[self.data_key]
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
        if img_bytes:
            # Apply the image enhancement
            img_data = bytes_transform.bytes_to_numpy(img_bytes)
            img_data = self.enhance_saturation(img_data, file_name)
            sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
        logger.info(f"fileName: {file_name}, method: ImgSaturation costs {time.time() - start:.6f} s")
        return sample
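Note: the addWeighted blend at the end is the standard "extrapolate away from gray" saturation boost (a factor below 1 would desaturate instead). A minimal standalone sketch, assuming a local photo.jpg (hypothetical path) and only OpenCV:

import cv2

img = cv2.imread("photo.jpg")                     # hypothetical input path
gray = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY), cv2.COLOR_GRAY2BGR)
factor = 1.3                                      # > 1 pushes colors away from gray
boosted = cv2.addWeighted(img, factor, gray, 1 - factor, 0)
cv2.imwrite("photo_saturated.jpg", boosted)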
6
runtime/ops/mapper/img_enhanced_sharpness/__init__.py
Normal file
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ImgSharpness',
                          module_path="ops.mapper.img_enhanced_sharpness.process")
17
runtime/ops/mapper/img_enhanced_sharpness/metadata.yml
Normal file
@@ -0,0 +1,17 @@
name: '图片锐度增强'
name_en: 'Image Sharpness Enhancement'
description: '自适应调节图片的锐度,主要适用于自然场景图片。'
description_en: 'Adaptively adjusts image sharpness; mainly applicable to natural-scene images.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgSharpness'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'image'
effect:
  before: ''
  after: ''
inputs: 'image'
outputs: 'image'
69
runtime/ops/mapper/img_enhanced_sharpness/process.py
Normal file
@@ -0,0 +1,69 @@
# -*- coding: utf-8 -*-

"""
Description: Adaptive image sharpness enhancement
Create: 2025/01/13
"""

import time
from typing import Dict, Any

import cv2
import numpy as np
from loguru import logger

from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper


class ImgSharpness(Mapper):
    """Adaptive image sharpness enhancement"""

    def __init__(self, *args, **kwargs):
        super(ImgSharpness, self).__init__(*args, **kwargs)
        # Adaptive enhancement parameters
        self.factor_threshold = 1.1  # Lower bound of the enhancement factor (not passed in as a parameter).
        self.standard_mean = 100  # Target mean sharpness after enhancement (not passed in as a parameter).
        self.kernel = self._init_kernel()
        self.eps = 1  # Small constant that prevents division by zero on all-black images when computing the enhancement factor (not passed in as a parameter).

    @classmethod
    def _init_kernel(cls):
        kernel = np.array([[1, 1, 1],
                           [1, 5, 1],
                           [1, 1, 1]])
        # Normalize the kernel so it acts as a low-pass (smoothing) filter
        kernel = kernel / np.sum(kernel)
        return kernel

    def enhance_sharpness(self, image_data: np.ndarray, file_name):
        """Adaptive sharpness enhancement"""
        # Convert the image to grayscale and estimate sharpness from the Laplacian response
        image_gray = cv2.cvtColor(image_data, cv2.COLOR_BGR2GRAY)
        sharpness = np.abs(cv2.Laplacian(image_gray, cv2.CV_8U)).mean()
        sharpness_factor = self.standard_mean / (sharpness + self.eps)

        # The image sharpness is already high; no enhancement needed
        if sharpness_factor <= 1:
            logger.info(f"fileName: {file_name}, method: ImgSharpness not need enhancement")
            return image_data

        # Unsharp-mask style blend: extrapolate away from the smoothed image
        filtered_img = cv2.filter2D(image_data, -1, self.kernel)
        cv2.addWeighted(image_data, sharpness_factor, filtered_img, 1.0 - sharpness_factor, 0, dst=image_data)
        return image_data

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
        img_bytes = sample[self.data_key]
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
        if img_bytes:
            # Apply the image enhancement
            img_data = bytes_transform.bytes_to_numpy(img_bytes)
            img_data = self.enhance_sharpness(img_data, file_name)
            sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
        logger.info(f"fileName: {file_name}, method: ImgSharpness costs {time.time() - start:.6f} s")
        return sample
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ImgPerspectiveTransformation',
                          module_path="ops.mapper.img_perspective_transformation.process")
@@ -0,0 +1,17 @@
name: '图片透视变换'
name_en: 'Image Perspective Transformation'
description: '自适应校正图片的视角,主要适用于文档校正场景。'
description_en: 'Adaptively corrects the image perspective; mainly applicable to document-correction scenarios.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgPerspectiveTransformation'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'image'
effect:
  before: ''
  after: ''
inputs: 'image'
outputs: 'image'
147
runtime/ops/mapper/img_perspective_transformation/process.py
Normal file
@@ -0,0 +1,147 @@
# -*- coding: utf-8 -*-

"""
Description: Image perspective transformation
Create: 2025/01/16
"""
import time
from typing import Dict, Any

import cv2
import numpy as np
from loguru import logger

from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper


class ImgPerspectiveTransformation(Mapper):
    """Image perspective transformation operator"""

    def __init__(self, *args, **kwargs):
        super(ImgPerspectiveTransformation, self).__init__(*args, **kwargs)
        self.transform_utils = PerspectiveTransformationUtils()

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
        img_bytes = sample[self.data_key]
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
        if img_bytes:
            img_data = bytes_transform.bytes_to_numpy(img_bytes)
            transform_img = self._transform_img(img_data, file_name)
            sample[self.data_key] = bytes_transform.numpy_to_bytes(transform_img, file_type)
        logger.info(f"fileName: {file_name}, method: ImgPerspectiveTransformation costs {time.time() - start:.6f} s")
        return sample

    def _transform_img(self, image, file_name):
        original_img = image
        ratio = 900 / image.shape[0]
        # Resize to a fixed height
        img_resize = self.transform_utils.resize_img(image)
        # Edge detection
        binary_img = self.transform_utils.get_canny(img_resize)
        # Contour extraction
        max_contour, max_area = self.transform_utils.find_max_contour(binary_img)
        if not max_contour.size:
            return original_img
        # Four vertices of the polygon-fitted convex hull
        boxes = self.transform_utils.get_box_point(max_contour)
        if len(boxes) == 4:
            boxes = self.transform_utils.get_adapt_point(boxes, ratio)
            boxes = self.transform_utils.order_points(boxes)
            warped = self.transform_utils.get_warp_image(image, boxes)
            logger.info(f"fileName: {file_name}, method: ImgPerspectiveTransformation. "
                        "This picture is transformed by perspective.")
            return warped
        return original_img


class PerspectiveTransformationUtils:
    """Utility class for image perspective transformation"""

    @staticmethod
    def resize_img(image, height=900):
        """Resize to a fixed height, preserving the aspect ratio"""
        h, w = image.shape[:2]
        pro = height / h
        size = (int(w * pro), int(height))
        img_resize = cv2.resize(image, size)
        return img_resize

    @staticmethod
    def get_canny(image):
        """Edge detection"""
        # Gaussian blur
        binary = cv2.GaussianBlur(image, (3, 3), 2, 2)
        # Canny edge detection
        binary = cv2.Canny(binary, 60, 240, apertureSize=3)
        # Dilate so the edges close up as much as possible
        kernel = np.ones((3, 3), np.uint8)
        binary = cv2.dilate(binary, kernel, iterations=1)
        return binary

    @staticmethod
    def find_max_contour(image):
        """Find the contour with the largest area"""
        # Find the contours
        contours, _ = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
        # Compare the areas
        max_area = 0.0
        max_contour = np.array([])
        for contour in contours:
            current_area = cv2.contourArea(contour)
            if current_area > max_area:
                max_area = current_area
                max_contour = contour
        return max_contour, max_area

    @staticmethod
    def get_box_point(contour):
        """Four vertices of the polygon-fitted convex hull"""
        # Fit a polygon to the convex hull
        hull = cv2.convexHull(contour)
        epsilon = 0.02 * cv2.arcLength(contour, True)
        approx = cv2.approxPolyDP(hull, epsilon, True)
        approx = approx.reshape((len(approx), 2))
        return approx

    @staticmethod
    def get_adapt_point(box, pro):
        """Map the quadrilateral points back to the original image scale"""
        box_pro = box
        if pro != 1.0:
            box_pro = box / pro
        box_pro = np.trunc(box_pro)
        return box_pro

    @staticmethod
    def order_points(pts):
        """Order the quadrilateral vertices as [top-left, top-right, bottom-right, bottom-left]"""
        rect = np.zeros((4, 2), dtype="float32")
        s = pts.sum(axis=1)
        rect[0] = pts[np.argmin(s)]
        rect[2] = pts[np.argmax(s)]
        diff = np.diff(pts, axis=1)
        rect[1] = pts[np.argmin(diff)]
        rect[3] = pts[np.argmax(diff)]
        return np.intp(rect)

    @staticmethod
    def compute_point_distance(a, b):
        """Euclidean distance between two points, used for the output width/height"""
        return int(np.sqrt(np.sum(np.square(a - b))))

    def get_warp_image(self, image, box):
        """Perspective transformation"""
        w, h = self.compute_point_distance(box[0], box[1]), \
            self.compute_point_distance(box[1], box[2])
        dst_rect = np.array([[0, 0],
                             [w - 1, 0],
                             [w - 1, h - 1],
                             [0, h - 1]], dtype='float32')
        box = np.array(box, dtype='float32')
        matrix = cv2.getPerspectiveTransform(box, dst_rect)
        warped = cv2.warpPerspective(image, matrix, (w, h))
        return warped
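Note: the sum/diff trick in order_points is worth a quick standalone check (made-up corner coordinates): the top-left corner minimizes x + y, the bottom-right maximizes it, and the y - x difference separates top-right (minimum) from bottom-left (maximum).

import numpy as np

pts = np.array([[10, 200], [180, 190], [20, 15], [190, 25]])  # made-up corners
s = pts.sum(axis=1)
d = np.diff(pts, axis=1)  # y - x per point
print(pts[np.argmin(s)], pts[np.argmin(d)], pts[np.argmax(s)], pts[np.argmax(d)])
# -> [20 15] [190 25] [180 190] [10 200], i.e. TL, TR, BR, BL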
6
runtime/ops/mapper/img_resize/__init__.py
Normal file
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ImgResize',
                          module_path="ops.mapper.img_resize.process")
35
runtime/ops/mapper/img_resize/metadata.yml
Normal file
@@ -0,0 +1,35 @@
name: '图片重采样'
name_en: 'Image Resampling'
description: '将图片放大或缩小到指定像素。'
description_en: 'Zooms images in or out to the specified pixel size.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgResize'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'image'
effect:
  before: ''
  after: ''
inputs: 'image'
outputs: 'image'
settings:
  targetSize:
    name: 重采样尺寸
    type: multiple
    properties:
      - type: inputNumber
        name: 宽度
        description: 像素
        defaultVal: 256
        min: 1
        max: 4096
        step: 1
      - type: inputNumber
        name: 高度
        description: 像素
        defaultVal: 256
        min: 1
        max: 4096
        step: 1
40
runtime/ops/mapper/img_resize/process.py
Normal file
@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-

"""
Description: Image resampling
Create: 2025/01/16
"""
import time
from typing import List, Dict, Any

import cv2
import numpy as np
from loguru import logger

from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper


class ImgResize(Mapper):
    def __init__(self, *args, **kwargs):
        super(ImgResize, self).__init__(*args, **kwargs)
        self._target_size = kwargs.get("targetSize", [256, 256])

    @classmethod
    def _img_resize(cls, data: np.ndarray, target_size: List[int]) -> np.ndarray:
        """Resize the image to the target size, clamped to [1, 4096] per side"""
        target_width = max(min(target_size[0], 4096), 1)
        target_height = max(min(target_size[1], 4096), 1)
        resized_img = cv2.resize(data, (target_width, target_height), interpolation=cv2.INTER_AREA)
        return resized_img

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
        img_bytes = sample[self.data_key]
        if img_bytes:
            data = bytes_transform.bytes_to_numpy(img_bytes)
            resized_img = self._img_resize(data, self._target_size)
            sample[self.data_key] = bytes_transform.numpy_to_bytes(resized_img, file_type)
        logger.info(f"fileName: {file_name}, method: ImgResize costs {time.time() - start:.6f} s")
        return sample
6
runtime/ops/mapper/img_shadow_remove/__init__.py
Normal file
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ImgShadowRemove',
                          module_path="ops.mapper.img_shadow_remove.process")
17
runtime/ops/mapper/img_shadow_remove/metadata.yml
Normal file
@@ -0,0 +1,17 @@
name: '图片阴影去除'
name_en: 'Image Shadow Removal'
description: '去除图片中的阴影,主要适用于文档场景。'
description_en: 'Removes shadows from images; mainly applicable to document scenarios.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgShadowRemove'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'image'
effect:
  before: ''
  after: ''
inputs: 'image'
outputs: 'image'
72
runtime/ops/mapper/img_shadow_remove/process.py
Normal file
@@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-

"""
Description: Image shadow removal operator
Create: 2025/01/16
"""
import time
from typing import Dict, Any

import cv2
import numpy as np
from loguru import logger

from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper


class ImgShadowRemove(Mapper):
    """Image shadow removal"""

    def __init__(self, *args, **kwargs):
        super(ImgShadowRemove, self).__init__(*args, **kwargs)
        self.iter_nums = 9  # Number of closing iterations (not passed in as a parameter).
        self.k_size = 3  # Kernel size.
        self.clip_limit = 2  # Contrast-limiting threshold; larger values give a stronger effect.
        self.tile_grid = 8  # Tile grid size; smaller tiles give a more local effect.

    def shadow_removed(self, image_data: np.ndarray):
        """
        Shadow removal.

        Args:
            image_data: image as an np.ndarray
        Returns:
            the image with shadows removed
        """
        # Build the kernel and apply a morphological closing
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (self.k_size, self.k_size))
        closing = cv2.morphologyEx(image_data, cv2.MORPH_CLOSE, kernel, iterations=self.iter_nums)

        # Compute ~(closing - original)
        cv2.bitwise_not(closing - image_data, dst=closing)
        cv2.cvtColor(closing, cv2.COLOR_BGR2Lab, dst=closing)

        # Take the lightness channel of the processed image
        img_l = cv2.split(closing)[0]
        del closing

        # Adjust img_l and substitute it for the lightness channel of the original
        cv2.cvtColor(image_data, cv2.COLOR_BGR2Lab, dst=image_data)
        # Create the CLAHE object
        clahe = cv2.createCLAHE(clipLimit=self.clip_limit, tileGridSize=(self.tile_grid, self.tile_grid))
        # Apply CLAHE
        image_data[:, :, 0] = clahe.apply(img_l)
        del img_l

        cv2.cvtColor(image_data, cv2.COLOR_Lab2BGR, dst=image_data)
        return image_data

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
        img_bytes = sample[self.data_key]
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
        if img_bytes:
            # Apply the shadow removal
            img_data = bytes_transform.bytes_to_numpy(img_bytes)
            img_data = self.shadow_removed(img_data)
            sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
        logger.info(f"fileName: {file_name}, method: ImgShadowRemove costs {time.time() - start:.6f} s")
        return sample
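Note: the closing step above is the classic "estimate the background by morphology, then cancel it out" trick for document shadows. A minimal standalone sketch mirroring just that step, assuming a local doc.jpg (hypothetical path) and only OpenCV/NumPy:

import cv2
import numpy as np

img = cv2.imread("doc.jpg")                       # hypothetical input path
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
# Repeated closing removes the dark text, leaving an estimate of the page
# background (paper plus shadow); the inverted difference cancels the shadow
# component shared by both images.
background = cv2.morphologyEx(img, cv2.MORPH_CLOSE, kernel, iterations=9)
shadow_free = cv2.bitwise_not(background - img)
cv2.imwrite("doc_noshadow.jpg", shadow_free)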
6
runtime/ops/mapper/img_type_unify/__init__.py
Normal file
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ImgTypeUnify',
                          module_path="ops.mapper.img_type_unify.process")
30
runtime/ops/mapper/img_type_unify/metadata.yml
Normal file
@@ -0,0 +1,30 @@
name: '图片格式转换'
name_en: 'Image Format Conversion'
description: '将图片编码格式统一为jpg、jpeg、png、bmp格式'
description_en: 'Converts image formats to JPG, JPEG, PNG, or BMP.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgTypeUnify'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'image'
effect:
  before: ''
  after: ''
inputs: 'image'
outputs: 'image'
settings:
  imgType:
    name: 图片编码格式
    type: select
    defaultVal: jpg
    options:
      - label: jpg
        value: jpg
      - label: png
        value: png
      - label: jpeg
        value: jpeg
      - label: bmp
        value: bmp
41
runtime/ops/mapper/img_type_unify/process.py
Normal file
@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-

"""
Description: Image format conversion
Create: 2025/01/16
"""
import re
import time

from loguru import logger

from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper


class ImgTypeUnify(Mapper):
    def __init__(self, *args, **kwargs):
        super(ImgTypeUnify, self).__init__(*args, **kwargs)
        # Target encoding format; defaults to jpg when no parameter is supplied
        self._setting_type = kwargs.get("imgType", "jpg")

    def execute(self, sample):
        start = time.time()
        file_name = sample[self.filename_key]
        origin_file_type = sample[self.filetype_key]
        if origin_file_type == self._setting_type:
            # The source format already matches the target format; nothing to do
            return sample
        file_path = sample[self.filepath_key]
        # Read the image
        img_bytes = sample[self.data_key]
        if img_bytes:
            origin_data = bytes_transform.bytes_to_numpy(img_bytes)
            # Re-encode the bytes in the target format
            sample[self.data_key] = bytes_transform.numpy_to_bytes(origin_data, "." + self._setting_type)
            # Update the metadata: replace the original extension with the target one
            sample[self.filetype_key] = self._setting_type
            sample[self.filename_key] = re.sub(origin_file_type + "$", self._setting_type, file_name)
            sample[self.filepath_key] = re.sub(origin_file_type + "$", self._setting_type, file_path)
        logger.info(f"fileName: {file_name}, method: ImgTypeUnify costs {time.time() - start:.6f} s")
        return sample
6
runtime/ops/mapper/img_watermark_remove/__init__.py
Normal file
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='ImgWatermarkRemove',
                          module_path="ops.mapper.img_watermark_remove.process")
26
runtime/ops/mapper/img_watermark_remove/metadata.yml
Normal file
@@ -0,0 +1,26 @@
name: '图片水印去除'
name_en: 'Image Watermark Removal'
description: '去除图片中的“知乎”和“抖音”水印。'
description_en: 'Removes the 知乎 and 抖音 watermarks from images.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgWatermarkRemove'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'image'
effect:
  before: ''
  after: ''
inputs: 'image'
outputs: 'image'
settings:
  watermarkStr:
    name: 需要去除的水印文字信息
    type: checkbox
    defaultVal: '知乎,抖音'
    options:
      - label: 知乎
        value: 知乎
      - label: 抖音
        value: 抖音
160
runtime/ops/mapper/img_watermark_remove/process.py
Normal file
@@ -0,0 +1,160 @@
# -*- coding: utf-8 -*-

"""
Description: Image watermark removal
Create: 2025/01/06
"""
import time
from typing import Dict, Any

import cv2
import numpy as np
from loguru import logger

from datamate.common.utils import bytes_to_numpy
from datamate.common.utils import numpy_to_bytes
from datamate.core.base_op import Mapper
from .watermark_ocr_model import WatermarkOcrModel

DEFAULT_MAX_CHARACTERS = 10
DEFAULT_BINARY_THRESHOLD_LOW = 200


class ImgWatermarkRemove(Mapper):
    use_model = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.remove_str = kwargs.get("watermarkStr", "知乎,抖音")
        self.ocr_model = self.get_model(*args, **kwargs)

    @staticmethod
    def _has_kw(result_list, kw_list):
        """
        Check whether the image contains a target watermark; return the list of matched texts
        """
        result_str_list = []
        for line in result_list:
            for kw in kw_list:
                if kw in line[1][0]:
                    result_str_list.append(line[1][0])
                    break
        return result_str_list

    @staticmethod
    def _overlay_mask(background_img, prospect_img, img_over_x, img_over_y):
        back_r, back_c, _ = background_img.shape  # rows and columns of the background image
        is_x_direction_failed = img_over_x > back_c or img_over_x < 0
        is_y_direction_failed = img_over_y > back_r or img_over_y < 0
        if is_x_direction_failed or is_y_direction_failed:
            # The foreground lies outside the background; return the original image
            return background_img
        pro_r, pro_c, _ = prospect_img.shape  # rows and columns of the foreground image
        if img_over_x + pro_c > back_c:  # the foreground is clipped horizontally
            pro_c = back_c - img_over_x  # number of foreground columns to keep
            prospect_img = prospect_img[:, 0:pro_c, :]  # crop the foreground
        if img_over_y + pro_r > back_r:  # the foreground is clipped vertically
            pro_r = back_r - img_over_y  # number of foreground rows to keep
            prospect_img = prospect_img[0:pro_r, :, :]  # crop the foreground

        prospect_img = cv2.cvtColor(prospect_img, cv2.COLOR_BGR2BGRA)  # convert the foreground to a 4-channel image
        prospect_tmp = np.zeros((back_r, back_c, 4), np.uint8)  # temporary foreground layer, same size as the background

        # Place the foreground image into the foreground layer
        prospect_tmp[img_over_y:img_over_y + pro_r, img_over_x: img_over_x + pro_c, :] = prospect_img

        _, binary = cv2.threshold(prospect_img, 254, 255, cv2.THRESH_BINARY)  # threshold the foreground
        prospect_mask = np.zeros((pro_r, pro_c, 1), np.uint8)  # single-channel foreground mask
        prospect_mask[:, :, 0] = binary[:, :, 3]  # opaque pixel values become the mask values

        mask = np.zeros((back_r, back_c, 1), np.uint8)
        mask[img_over_y:img_over_y + prospect_mask.shape[0],
             img_over_x: img_over_x + prospect_mask.shape[1]] = prospect_mask

        mask_not = cv2.bitwise_not(mask)

        prospect_tmp = cv2.bitwise_and(prospect_tmp, prospect_tmp, mask=mask)
        background_img = cv2.bitwise_and(background_img, background_img, mask=mask_not)
        prospect_tmp = cv2.cvtColor(prospect_tmp, cv2.COLOR_BGRA2BGR)  # foreground layer back to a 3-channel image
        return prospect_tmp + background_img  # merge the foreground layer with the background

    def execute(self, sample: Dict[str, Any]):
        start = time.time()
        file_name = sample[self.filename_key]
        file_type = "." + sample[self.filetype_key]
        img_bytes = sample[self.data_key]
        if img_bytes:
            data = bytes_to_numpy(img_bytes)
            correct_data = self._watermark_remove(data, file_name, self.ocr_model)
            sample[self.data_key] = numpy_to_bytes(correct_data, file_type)
        logger.info(f"fileName: {file_name}, method: ImgWatermarkRemove costs {time.time() - start:.6f} s")
        return sample

    def delete_watermark(self, result_list, kw_list, data):
        """
        Blur out the watermarks that match the targets
        """
        # Collect the positions of all matching text boxes
        text_axes_list = []
        for line in result_list:
            for kw in kw_list:
                if kw in line[1][0]:
                    min_width = int(min(line[0][0][0], line[0][3][0]))
                    max_width = int(max(line[0][1][0], line[0][2][0]))
                    min_height = int(min(line[0][0][1], line[0][1][1]))
                    max_height = int(max(line[0][2][1], line[0][3][1]))
                    text_axes_list.append([min_width, min_height, max_width, max_height])
                    break
        # Remove the watermarks
        delt = DEFAULT_MAX_CHARACTERS  # enlarge the text box by this margin
        img = data
        for text_axes in text_axes_list:
            height, width = img.shape[0:2]
            # Crop the region; coordinates are [y0:y1, x0:x1]
            min_width = text_axes[0] - delt if text_axes[0] - delt >= 0 else 0
            min_height = text_axes[1] - delt if text_axes[1] - delt >= 0 else 0
            max_width = text_axes[2] + delt if text_axes[2] + delt <= width else width
            max_height = text_axes[3] + delt if text_axes[3] + delt <= height else height
            cropped = img[min_height:max_height, min_width:max_width]
            # Binarize: zero out every color outside [200,200,200]-[250,250,250]
            start_rgb = DEFAULT_BINARY_THRESHOLD_LOW
            thresh = cv2.inRange(cropped, np.array([start_rgb, start_rgb, start_rgb]), np.array([250, 250, 250]))
            # Structuring element: a 3x3 all-ones kernel, uint8 so it can operate on images
            kernel = np.ones((3, 3), np.uint8)
            # Grow the region to repair: dilation enlarges the white area, iterations controls by how much
            hi_mask = cv2.dilate(thresh, kernel, iterations=10)
            # cv2.inpaint arguments:
            #   cropped: 8-bit 1- or 3-channel input image
            #   hi_mask: 8-bit 1-channel repair mask; nonzero pixels mark the region to repair
            #   5: radius of the circular neighborhood considered around each point
            #   flags: INPAINT_NS (Navier-Stokes based) or INPAINT_TELEA (Alexandru Telea's method)
            specular = cv2.inpaint(cropped, hi_mask, 5, flags=cv2.INPAINT_TELEA)
            result = self._overlay_mask(img, specular, min_width, min_height)
            img = result
        return img

    def init_model(self, *args, **kwargs):
        return WatermarkOcrModel(*args, **kwargs).ocr_model

    def _watermark_remove(self, data, file_name, model):
        """
        Watermark removal entry point
        """
        remove_str = self.remove_str
        # If no watermark text was selected, return the original image
        if remove_str == "":
            return data
        # Accept both fullwidth and ASCII commas (the default value uses a fullwidth comma)
        kw_list = remove_str.replace(',', ',').split(',')
        # Load the model
        ocr_model = model
        try:
            result = ocr_model.ocr(data, cls=True)
        except RuntimeError as e:
            logger.error(f"fileName: {file_name}, method: ocr predict error {e}")
            return data
        if result and result[0]:
            logger.info(f"fileName: {file_name}, method: ocrModel detect watermark info {str(result)}")
            return self.delete_watermark(result[0], kw_list, data)
        else:
            logger.info(f"fileName: {file_name}, method: ImgWatermarkRemove not need remove target ocr")
            return data
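Note: the inRange-dilate-inpaint sequence is the heart of the removal. A minimal standalone sketch of just that step, assuming a local shot.png with a light-gray text watermark (hypothetical input, OpenCV/NumPy only):

import cv2
import numpy as np

img = cv2.imread("shot.png")                       # hypothetical input path
# Mask the near-white watermark pixels ([200..250] in every channel), then
# dilate so the repaired region fully covers the glyph edges.
mask = cv2.inRange(img, np.array([200, 200, 200]), np.array([250, 250, 250]))
mask = cv2.dilate(mask, np.ones((3, 3), np.uint8), iterations=10)
clean = cv2.inpaint(img, mask, 5, flags=cv2.INPAINT_TELEA)
cv2.imwrite("shot_clean.png", clean)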
@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-

import gc
import os
from pathlib import Path


class WatermarkOcrModel:

    def __init__(self, *args, **kwargs):
        models_path = os.getenv("MODELS_PATH", "/home/models")
        self.resources_path = str(Path(models_path, 'img_watermark_remove', 'resources'))
        self.det_model_dir = str(Path(self.resources_path, 'ch_PP-OCRv4_det_infer'))
        self.rec_model_dir = str(Path(self.resources_path, 'ch_PP-OCRv4_rec_infer'))
        self.cls_model_dir = str(Path(self.resources_path, 'ch_ppocr_mobild_v2_cls_infer'))

        # Imported lazily so paddleocr is only required when this model is actually used
        from paddleocr import PaddleOCR
        self.ocr_model = PaddleOCR(det_model_dir=self.det_model_dir, cls_model_dir=self.cls_model_dir,
                                   rec_model_dir=self.rec_model_dir,
                                   use_angle_cls=True,
                                   lang='ch')

    def __del__(self):
        del self.ocr_model
        gc.collect()
@@ -0,0 +1,7 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='InvisibleCharactersCleaner',
                          module_path="ops.mapper.invisible_characters_cleaner.process")
16
runtime/ops/mapper/invisible_characters_cleaner/metadata.yml
Normal file
@@ -0,0 +1,16 @@
name: '不可见字符去除'
name_en: 'Invisible Character Removal'
description: '去除文档中的不可见字符,例如 0-31 号字符中的部分字符。'
description_en: 'Removes invisible characters from documents, for example, some of the characters numbered 0 to 31.'
language: 'python'
vendor: 'huawei'
raw_id: 'InvisibleCharactersCleaner'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: "对“材料”怎样下\x04定义才臻于 严格和科学?"
  after: '对“材料”怎样下定义才臻于严格和科学?'
inputs: 'text'
outputs: 'text'
30
runtime/ops/mapper/invisible_characters_cleaner/process.py
Normal file
@@ -0,0 +1,30 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: Invisible character removal
Create: 2025/01/13
"""
import re
import time
from typing import Dict, Any

from loguru import logger

from datamate.core.base_op import Mapper


class InvisibleCharactersCleaner(Mapper):
    @staticmethod
    def _invisible_characters_filter(input_data: str):
        # Strip the invisible ASCII characters 0-7, 14-19, 21-31 and 127-160
        # (no '|' separators: inside a character class they would match literal pipes)
        invisible_char_pattern = '[\x00-\x07\x0E-\x13\x15-\x1F\x7F-\xA0]'
        invisible_chars_re = re.compile(invisible_char_pattern)
        return invisible_chars_re.sub('', input_data)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._invisible_characters_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, "
                    f"method: InvisibleCharactersCleaner costs {time.time() - start:.6f} s")
        return sample
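Note: the character class above is easy to sanity-check in isolation (standalone sketch, toy string from the metadata example):

import re

pattern = re.compile('[\x00-\x07\x0E-\x13\x15-\x1F\x7F-\xA0]')
print(pattern.sub('', '对“材料”怎样下\x04定义才臻于严格和科学?'))
# -> 对“材料”怎样下定义才臻于严格和科学? (the \x04 control character is gone)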
6
runtime/ops/mapper/ip_address_cleaner/__init__.py
Normal file
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='AnonymizedIpAddress',
                          module_path="ops.mapper.ip_address_cleaner.process")
16
runtime/ops/mapper/ip_address_cleaner/metadata.yml
Normal file
@@ -0,0 +1,16 @@
name: 'IP地址匿名化'
name_en: 'IP Address Anonymization'
description: 'IP地址匿名化'
description_en: 'Anonymizes IP addresses.'
language: 'python'
vendor: 'huawei'
raw_id: 'AnonymizedIpAddress'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: '这个是IP地址:10.x.x.10'
  after: '这个是IP地址:<ip>'
inputs: 'text'
outputs: 'text'
74
runtime/ops/mapper/ip_address_cleaner/process.py
Normal file
@@ -0,0 +1,74 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: IP address anonymization operator
Create: 2024/12/26 15:43
"""
import ipaddress
import re
import time
from typing import Dict, Any

from loguru import logger

from datamate.core.base_op import Mapper


class AnonymizedIpAddress(Mapper):
    def __init__(self, *args, **kwargs):
        # IP address validation.
        # Single-digit X.X.X.X addresses look like four-level section numbers; to avoid
        # cleaning those by mistake, they are only anonymized next to markers such as "IP"/"IP地址".
        super().__init__(*args, **kwargs)
        self.ipv4_1_and_prefix_pattern = r'ip(地址| address|v4)?( |:|:)*(?<![\.\d])'
        # X.X.X.X with single-digit octets
        self.ipv4_pattern = r'(?<![\.\d])\d\.\d\.\d\.\d(?![\.\d])'
        self.ipv4_re_compile = re.compile(r"(?<![\d.])(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?![.\d])")
        self.ipv6_re_compile = re.compile(r"(?<![0-9a-fA-F:])(([0-9a-fA-F]{0,4}:)+[0-9a-fA-F]{0,4})(?![0-9a-fA-F:])")

    @staticmethod
    def verify_ip_address(ip):
        """Check whether a string is a valid IP address"""
        try:
            ipaddress.ip_address(ip)
        except ValueError:
            return False
        return True

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._ip_address_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, method: IPAddressCleaner costs {time.time() - start:.6f} s")
        return sample

    def filter_ipv4(self, ipv4, line):
        """Anonymize an IPv4 address"""
        if not self.verify_ip_address(ipv4):
            return line
        ipv4_format = ipv4.replace(".", "\\.")
        # Addresses with a multi-digit octet are anonymized directly
        if not re.search(self.ipv4_pattern, "【" + ipv4 + "】"):
            line = re.compile(r"(?<![\d.])" + ipv4_format + r"(?![.\d])").sub("<ip>", line)
        elif re.search(self.ipv4_1_and_prefix_pattern + ipv4_format + r"(?![.\d])", line, re.IGNORECASE):
            # Single-digit-octet addresses require an "IP" marker nearby; when one is found,
            # every single-digit-octet address in the paragraph is anonymized
            line = re.compile(self.ipv4_pattern).sub("<ip>", line)
        return line

    def _ip_address_filter(self, input_data: str):
        """Anonymize IPv4 and IPv6 addresses"""
        lines = input_data.split("\n")
        line_list = []
        for line in lines:
            # Pad both ends so addresses at the start or end of a paragraph still match
            line = ''.join(['【', line, '】'])
            ipv4_groups = self.ipv4_re_compile.findall(line)
            for ipv4 in ipv4_groups:
                line = self.filter_ipv4(ipv4, line)
            ipv6_groups = self.ipv6_re_compile.findall(line)
            for group in ipv6_groups:
                ipv6 = group[0]
                if ipv6 and self.verify_ip_address(ipv6):
                    line = re.compile(r"(?<![0-9a-fA-F:])" + ipv6 + "(?![0-9a-fA-F:])").sub("<ip>", line)
            line_list.append(line[1:-1])
        text = "\n".join([line.strip() for line in line_list])
        return text
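Note: the validate-then-substitute pattern above (regex finds candidates, ipaddress rejects false positives such as 999.1.1.1) can be checked standalone:

import ipaddress
import re

text = "连接 192.168.0.1 和 fe80::1 失败"
for m in re.findall(r"(?<![\d.])(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?![.\d])", text):
    try:
        ipaddress.ip_address(m)          # rejects syntactically plausible non-addresses
    except ValueError:
        continue
    text = text.replace(m, "<ip>")
print(text)  # -> 连接 <ip> 和 fe80::1 失败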
6
runtime/ops/mapper/knowledge_relation_slice/__init__.py
Normal file
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='KnowledgeRelationSlice',
                          module_path="ops.mapper.knowledge_relation_slice.process")
108
runtime/ops/mapper/knowledge_relation_slice/graph_sim_func.py
Normal file
@@ -0,0 +1,108 @@
#!/usr/bin/python3.9
# -*- coding: utf-8 -*-


import math
from multiprocessing import cpu_count

from six import iteritems
from six.moves import range
from loguru import logger

PARAM_K1 = 1.5
PARAM_B = 0.75
EPSILON = 0.25


def effective_n_jobs(n_jobs):
    if n_jobs == 0:
        raise ValueError('n_jobs == 0 in Parallel has no meaning')
    elif n_jobs is None:
        return 1
    elif n_jobs < 0:
        n_jobs = max(cpu_count() + 1 + n_jobs, 1)
    return n_jobs


class SimilarityAlgBM25(object):

    def __init__(self, corpus_docs):
        self.corpus_files_size = 0
        self.avg_dl = 0
        self.doc_file_freqs = []
        self.idf_dict = {}
        self.doc_len = []
        self._initialize(corpus_docs)

    def get_sim_score(self, document, index):
        score = 0
        doc_freqs = self.doc_file_freqs[index]
        for word in document:
            if word not in doc_freqs:
                continue
            try:
                score += (self.idf_dict[word] * doc_freqs[word] * (PARAM_K1 + 1)
                          / (doc_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avg_dl)))
            except KeyError:
                logger.warning('key not found in idf dict: {}', word)
        return score

    def get_sim_scores(self, document):
        scores = []
        for index in range(self.corpus_files_size):
            cur_score = self.get_sim_score(document, index)
            scores.append(cur_score)
        return scores

    def get_scores_bow(self, document):
        scores = []
        for index in range(self.corpus_files_size):
            score = self.get_sim_score(document, index)
            if score > 0:
                scores.append((index, score))
        return scores

    def _initialize(self, corpus_files):
        """
        Calculates frequencies of terms in documents and in corpus_files.
        Also computes inverse document frequencies.
        """
        nd = {}  # word -> number of documents with word
        num_doc = 0
        for document_file in corpus_files:
            self.corpus_files_size += 1
            self.doc_len.append(len(document_file))
            num_doc += len(document_file)

            frequencies_dict = {}
            for word in document_file:
                if word not in frequencies_dict:
                    frequencies_dict[word] = 0
                frequencies_dict[word] += 1
            self.doc_file_freqs.append(frequencies_dict)

            for word, _ in iteritems(frequencies_dict):
                if word not in nd:
                    nd[word] = 0
                nd[word] += 1

        self.avg_dl = float(num_doc) / self.corpus_files_size
        # collect idf sum to calculate an average idf for epsilon value
        idf_sum = 0

        negative_idfs_list = []
        for word, freq in iteritems(nd):
            idf = math.log(self.corpus_files_size - freq + 0.5) - math.log(freq + 0.5)
            self.idf_dict[word] = idf
            idf_sum += idf
            if idf < 0:
                negative_idfs_list.append(word)
        self.average_idf = float(idf_sum) / len(self.idf_dict)

        # Words that appear in more than half the documents get a negative idf;
        # clamp them to a small positive value proportional to the average idf
        eps = EPSILON * self.average_idf
        for word in negative_idfs_list:
            self.idf_dict[word] = eps
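Note: SimilarityAlgBM25 expects a pre-tokenized corpus. A minimal usage sketch on a toy corpus (assuming the module above is importable; adjust the import path to the actual package layout):

from graph_sim_func import SimilarityAlgBM25   # hypothetical import path

corpus = [["data", "cleaning", "pipeline"],
          ["image", "enhancement"],
          ["text", "cleaning", "operator"]]
bm = SimilarityAlgBM25(corpus)
print(bm.get_sim_scores(["cleaning", "text"]))
# the third document should score highest: it matches both query terms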
@@ -0,0 +1,184 @@
#!/usr/bin/python3.9
# -*- coding: utf-8 -*-


__all__ = ['build_llm_prompt', 'get_json_list']

import math

import jieba
from loguru import logger

from . import graph_sim_func as bm25
from .knowledge_slice import TextSegmentationOperator


def build_llm_prompt(text):
    # QA-pair generation prompt; kept in Chinese because the target LLM consumes Chinese prompts
    prompt = """
===
<Role>:
你是一位问答对QA智能撰写专家,你擅长根据给定的内容给出准确、完整、详细的多个问答对。

===
<Instructions>:
- 你需要根据已知信息(context),准确、详细的生成多个QA对。
- 生成的问答对中答案少于10个中文字符时,放弃该问答对。
- 确保所有问答对的答案都是已知信息的一部分,且可以组成已知信息,确保没有信息遗漏。
- 仅根据已知信息生成问答对,答案要详细,且不能创造臆想已知信息中没有的内容。
- 确保生成的多个QA对之间不要进行排序,Q:或A:前后不要出现数字序号。
- Q:使用疑问句方式,问号结尾;A:使用陈述句方式,句号结尾,确保回答完整。
- 输出格式如下:
Q:......
A:......

===
<task>
满足上述条件的情况下,现根据context:'''{}'''
生成的多个QA问答对为:

"""

    return prompt.format(text)


class KnowledgeSlice:
    # edatamate slicing-operator wrapper
    def __init__(self, file_text, chunk_size=500, overlap_size=100):
        self.file_text = file_text
        self.slice_op = TextSegmentationOperator(chunk_size, overlap_size)

    def execute(self):
        try:
            chunks = self.slice_op.process(self.file_text)
        except Exception as err:
            logger.exception(f"split text failed, error is: {err}")
            chunks = []

        return chunks


class BM25Model:
    def __init__(self, data_list):
        self.data_list = data_list
        self.corpus = self.load_corpus()

    def bm25_similarity(self, query, num_best=1):
        query = jieba.lcut(query)
        bm = bm25.SimilarityAlgBM25(self.corpus)
        scores = bm.get_sim_scores(query)
        id_score = [(i, score) for i, score in enumerate(scores)]
        id_score.sort(key=lambda e: e[1], reverse=True)

        return id_score[0: num_best]

    def load_corpus(self):
        corpus = [jieba.lcut(data) for data in self.data_list]

        return corpus


class KnowledgeGraph:
    # Segments a document and creates relations between the knowledge slices
    def __init__(self, corpus_file_string, chunk_size=500, overlap_size=100, kg_relation=True):
        self.corpus_file_string = corpus_file_string
        self.chunk_size = chunk_size
        self.overlap_size = overlap_size
        self.kg_relation = kg_relation
        self.slicing_corpus = []
        self.knowledge_slice = KnowledgeSlice(self.corpus_file_string, self.chunk_size, self.overlap_size)

    @staticmethod
    def update_gallery_list(gallery_list, iterated_dict):
        # Collect the gallery entries whose indices are not yet in iterated_dict
        gallery_list_update = []
        gallery_list_index = []
        for i, _ in enumerate(gallery_list):
            if i not in iterated_dict:
                gallery_list_update.append(gallery_list[i])
                gallery_list_index.append(i)

        return gallery_list_update, gallery_list_index

    def document_slicing(self):
        json_list = []
        all_slices_info = self.knowledge_slice.execute()

        for _, item in enumerate(all_slices_info):
            json_list.append({
                "slice_data": item
            })

        self.slicing_corpus = json_list

    def build_knowledge_relation(self, slicing_corpus_list):
        # Build the knowledge relation for each paragraph
        if not self.kg_relation:
            return slicing_corpus_list
        iterated_dict = {}
        kr_result_json_list = []
        gallery_list = []
        kr_relation_list = []

        if len(slicing_corpus_list) < 3:
            return slicing_corpus_list

        for _, item in enumerate(slicing_corpus_list):
            gallery_list.append(item['slice_data'])

        for k, item in enumerate(slicing_corpus_list):
            if k not in iterated_dict:
                iterated_dict[k] = 1
                cur_gallery_list, cur_gallery_src_index = self.update_gallery_list(gallery_list, iterated_dict)
                if len(cur_gallery_list) < 1:
                    kr_result_json_list.append({
                        "slice_data": item['slice_data']
                    })
                    return kr_result_json_list
                bm25_class = BM25Model(cur_gallery_list)
                id_scores = bm25_class.bm25_similarity(item['slice_data'], 1)
                kr_result_doc = item['slice_data'] + cur_gallery_list[id_scores[0][0]]
                kr_result_json_list.append({
                    "slice_data": kr_result_doc
                })
                if cur_gallery_src_index[id_scores[0][0]] not in iterated_dict:
                    iterated_dict[cur_gallery_src_index[id_scores[0][0]]] = 1
            else:
                continue

        return kr_result_json_list

    def build_graph_efficiently(self, search_space_size=50):
        # Build the knowledge relation in an efficient way, chunk by chunk
        knowledge_total_num = len(self.slicing_corpus)
        knowledge_chunk_num = math.ceil(knowledge_total_num / search_space_size)
        knowledge_relation_result = []

        for i in range(0, knowledge_chunk_num):
            cur_max_index = (i + 1) * search_space_size
            if cur_max_index > knowledge_total_num:
                corpus_list = self.slicing_corpus[i * search_space_size:]
            else:
                corpus_list = self.slicing_corpus[i * search_space_size:cur_max_index]
            # Build the knowledge relation for this chunk
            cur_knowledge_relation_result = self.build_knowledge_relation(corpus_list)
            knowledge_relation_result.extend(cur_knowledge_relation_result)

        return knowledge_relation_result

    def knowledge_corpus_list_json(self):
        # Process the corpus and return the structured information as a json list
        self.document_slicing()
        kr_result_list_json = self.build_graph_efficiently()

        return kr_result_list_json


def get_json_list(txt_string, chunk_size=500, overlap_size=100, kg_relation=True):
    if len(txt_string) > 0:
        kg_extract = KnowledgeGraph(txt_string, chunk_size, overlap_size, kg_relation)
        kr_result_json_list = kg_extract.knowledge_corpus_list_json()
    else:
        kr_result_json_list = []

    return kr_result_json_list
@@ -0,0 +1,23 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-


from typing import List

from loguru import logger
from datamate.common.utils.text_splitter import TextSplitter


class TextSegmentationOperator:
    def __init__(self, chunk_size, chunk_overlap):
        try:
            self.text_splitter = TextSplitter(-1, chunk_size, chunk_overlap)
        except Exception as err:
            logger.exception(f"init text splitter failed, error is: {err}")
            raise err

    def process(self, input_data: str) -> List[str]:
        if input_data.strip() == "":
            logger.info("input text is empty, return empty chunks.")
            return []
        return self.text_splitter.split_text(input_data)
16
runtime/ops/mapper/knowledge_relation_slice/metadata.yml
Normal file
@@ -0,0 +1,16 @@
name: '知识库关系切片'
name_en: 'Knowledge base relationship slicing'
description: '知识库关系切片'
description_en: 'Knowledge base relationship slicing.'
language: 'python'
vendor: 'huawei'
raw_id: 'KnowledgeRelationSlice'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: ''
  after: ''
inputs: 'text'
outputs: 'text'
46
runtime/ops/mapper/knowledge_relation_slice/process.py
Normal file
@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-

"""
Description: Knowledge-base relationship slicing
Create: 2023/11/7 9:26
"""
import json
import time
from typing import Dict, Any

from loguru import logger

from datamate.core.base_op import Mapper

from .knowledge_relation import get_json_list

# Slice length
CHUNK_SIZE = 500
# Overlap length between adjacent slices
OVERLAP_SIZE = 100


class KnowledgeRelationSlice(Mapper):
    def __init__(self, *args, **kwargs):
        super(KnowledgeRelationSlice, self).__init__(*args, **kwargs)
        self.chunk_size = kwargs.get("chunk_size", CHUNK_SIZE)
        self.overlap_size = kwargs.get("overlap_size", OVERLAP_SIZE)

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start_time = time.time()

        chunk_item = get_json_list(sample[self.text_key], chunk_size=self.chunk_size, overlap_size=self.overlap_size)
        chunk_item_json = json.dumps(chunk_item, ensure_ascii=False)
        sample[self.text_key] = chunk_item_json

        cost_time = time.time() - start_time
        logger.info(f'Generate knowledgeRelation slice num: {len(chunk_item)}, Cost time: {cost_time} s')

        return sample
6
runtime/ops/mapper/legend_cleaner/__init__.py
Normal file
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='LegendCleaner',
                          module_path="ops.mapper.legend_cleaner.process")
16
runtime/ops/mapper/legend_cleaner/metadata.yml
Normal file
@@ -0,0 +1,16 @@
name: '图注表注去除'
name_en: 'Figure and Table Description Removal'
description: '去除文档中的图注、表注等内容。'
description_en: 'Removes figure and table descriptions from documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'LegendCleaner'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: '图1.1.1 图注名称'
  after: ''
inputs: 'text'
outputs: 'text'
41
runtime/ops/mapper/legend_cleaner/process.py
Normal file
@@ -0,0 +1,41 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: Figure and table description removal
Create: 2024/12/5 15:43
"""
import re
import time
from typing import Dict, Any

from loguru import logger

from datamate.core.base_op import Mapper


class LegendCleaner(Mapper):
    @staticmethod
    def _get_legend_re_compile():
        chinese_legend_prefix = r"(图|表|图片|表格)"
        chinese_legend_number = r"(\d+((\.|-)\d+)*|[a-zA-Z]{1,2}((\.|-)\d+)*)"
        chinese_legend_pattern = r"(?<=\n)" + chinese_legend_prefix + "( )*" + chinese_legend_number + " +.*\n"
        english_legend_prefix = r"(Figure|Table|Fig\.?)"
        english_legend_number = r"(S?\d+((\.|-)\d+)*|[a-zA-Z]{1,2}\d?((\.|-)\d+)*)"
        english_legend_pattern = (r"(?<=\n)" + english_legend_prefix + "( )*"
                                  + english_legend_number + r"(\.|:)? +.*\n")
        legend_re_compile = re.compile('|'.join([chinese_legend_pattern, english_legend_pattern]), re.IGNORECASE)
        return legend_re_compile

    @classmethod
    def _clean_legend(cls, input_data: str):
        """Remove figure and table descriptions from the document"""
        input_data = ''.join(['\n', input_data, '\n'])
        text = cls._get_legend_re_compile().sub("", input_data)
        return text[1:-1]

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._clean_legend(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, method: LegendCleaner costs {time.time() - start:.6f} s")
        return sample
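Note: a quick standalone check of the caption patterns above (the same Chinese/English alternatives recompiled inline on a toy document):

import re

pattern = re.compile(
    r"(?<=\n)(图|表|图片|表格)( )*(\d+((\.|-)\d+)*|[a-zA-Z]{1,2}((\.|-)\d+)*) +.*\n"
    r"|(?<=\n)(Figure|Table|Fig\.?)( )*(S?\d+((\.|-)\d+)*|[a-zA-Z]{1,2}\d?((\.|-)\d+)*)(\.|:)? +.*\n",
    re.IGNORECASE)
doc = "\n正文第一段\n图1.1.1 系统架构示意\nFigure 2: pipeline overview\n正文第二段\n"
print(pattern.sub("", doc))  # both caption lines are removed, the body text stays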
6
runtime/ops/mapper/phone_number_cleaner/__init__.py
Normal file
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='AnonymizedPhoneNumber',
                          module_path="ops.mapper.phone_number_cleaner.process")
16
runtime/ops/mapper/phone_number_cleaner/metadata.yml
Normal file
@@ -0,0 +1,16 @@
name: '电话号码匿名化'
name_en: 'Phone Number Anonymization'
description: '电话号码匿名化'
description_en: 'Anonymizes phone numbers.'
language: 'python'
vendor: 'huawei'
raw_id: 'AnonymizedPhoneNumber'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: '这个是电话号码:13111111111'
  after: '这个是电话号码:<tel>'
inputs: 'text'
outputs: 'text'
51
runtime/ops/mapper/phone_number_cleaner/process.py
Normal file
@@ -0,0 +1,51 @@
|
||||
#!/user/bin/python
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Description: 电话号码匿名化
|
||||
Create: 2024/12/26 15:43
|
||||
"""
|
||||
import re
|
||||
import time
|
||||
from typing import Dict, Any
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from datamate.core.base_op import Mapper
|
||||
|
||||
|
||||
class AnonymizedPhoneNumber(Mapper):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.phone_re_compile = self.get_phone_re_compile()
|
||||
|
||||
@staticmethod
|
||||
def get_phone_re_compile():
|
||||
"""按照格式粗略匹配电话号码,支持以下格式电话号码
|
||||
前缀:(0086)、(86)、(0086)、(86) 、无
|
||||
电话号码:第一位1,第二位3-9,后续数字可以为0-9,数字按照3-4-4进行间隔,间隔符为空格、-、无
|
||||
固定电话号码:0AX-CXXX-XXXX、0BXX-CXXX-XXXX、0BXX-CXX-XXXX A为1-2、B为3-9、C为2-8、X为0-9
|
||||
约束:电话号码前后皆为非数字
|
||||
"""
|
||||
number_prefix = r'([\((]?\+?(00)?86[)\)]?[- ]?)?'
|
||||
cellphone_pattern = r"1[3-9]\d[- ]?\d{4}[- ]?\d{4}"
|
||||
landline_pattern = (r'[((]?(0?[12]\d)[))]?[ -]?[2-8]\d{3}[ -]?\d{4}'
|
||||
r'|[((]?(0?[3-9]\d{2})[))]?[ -]?[2-8]\d{2}\d?[ -]?\d{4}')
|
||||
phone_numbers_pattern = rf'(?<=[^\d]){number_prefix}({cellphone_pattern}|{landline_pattern})(?=[^\d])'
|
||||
phone_re_compile = re.compile(phone_numbers_pattern)
|
||||
return phone_re_compile
|
||||
|
||||
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
|
||||
start = time.time()
|
||||
sample[self.text_key] = self._phone_number_filter(sample[self.text_key])
|
||||
logger.info(
|
||||
f"fileName: {sample[self.filename_key]}, method: PhoneNumberCleaner costs {time.time() - start:6f} s")
|
||||
return sample
|
||||
|
||||
def _phone_number_filter(self, input_data: str):
|
||||
""" 电话号码匿名化"""
|
||||
# 正则匹配:电话号码前需匹配不是数字的字符串
|
||||
# 为避免处于文章开头和结尾的电话号码不可被识别,需要在输入字符串的前后手动加上字符串
|
||||
input_data = ''.join(['【', input_data, '】'])
|
||||
input_data = self.phone_re_compile.sub("<tel>", input_data)
|
||||
return input_data[1:-1]
|
||||
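As a quick sanity check, here is a minimal standalone sketch of the same matching logic, outside the operator framework. The sample text is illustrative; the sentinel padding and [1:-1] slicing mirror _phone_number_filter.

import re

number_prefix = r'([\((]?\+?(00)?86[)\)]?[- ]?)?'
cellphone_pattern = r"1[3-9]\d[- ]?\d{4}[- ]?\d{4}"
landline_pattern = (r'[((]?(0?[12]\d)[))]?[ -]?[2-8]\d{3}[ -]?\d{4}'
                    r'|[((]?(0?[3-9]\d{2})[))]?[ -]?[2-8]\d{2}\d?[ -]?\d{4}')
pattern = re.compile(rf'(?<=[^\d]){number_prefix}({cellphone_pattern}|{landline_pattern})(?=[^\d])')

text = '联系方式:13111111111,固话 010-6666-8888。'
padded = '【' + text + '】'                 # sentinels so numbers at the edges still match
print(pattern.sub('<tel>', padded)[1:-1])   # -> 联系方式:<tel>,固话 <tel>。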
6
runtime/ops/mapper/political_word_cleaner/__init__.py
Normal file
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='PoliticalWordCleaner',
                          module_path="ops.mapper.political_word_cleaner.process")
16
runtime/ops/mapper/political_word_cleaner/metadata.yml
Normal file
@@ -0,0 +1,16 @@
name: '政治文本匿名化'
name_en: 'Political Text Anonymization'
description: '将政治文本进行匿名化。'
description_en: 'Anonymizes political texts.'
language: 'python'
vendor: 'huawei'
raw_id: 'PoliticalWordCleaner'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: '特别字符:改革历程'
  after: '特别字符:***'
inputs: 'text'
outputs: 'text'
67
runtime/ops/mapper/political_word_cleaner/process.py
Normal file
@@ -0,0 +1,67 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: Political text filtering
Create: 2024/12/26 15:43
"""
import time
from pathlib import Path
from typing import Dict, Any

from loguru import logger

from datamate.common.utils.aho_corasick import AhoCorasic
from datamate.core.base_op import Mapper


class PoliticalWordCleaner(Mapper):
    """Plugin that filters political terms in externally supplied text."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        root_path = Path(__file__).parent / 'resources'
        political_file_path = str(root_path / 'political.txt')
        special_symbols_path = str(root_path / 'special_symbols.txt')
        self.special_symbols = self.load_words_list(special_symbols_path)
        self.political_words = self.load_words_list(political_file_path)
        self.ac_automaton = AhoCorasic(self.political_words)

    @staticmethod
    def load_words_list(path):
        """Load a word list from file."""
        with open(path, 'r', encoding='utf-8') as f:
            words = set(f.read().splitlines())
        return words

    @staticmethod
    def words_replace(target_strings: list, text: str):
        """
        Replace the target strings with asterisks.

        Args:
            target_strings: matched sensitive words to mask.
            text: text to be cleansed.
        Returns:
            Cleansed text.
        """
        target_strings.sort(key=lambda x: -len(x))
        for s in target_strings:
            text = text.replace(s, '*' * len(s))
        return text

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._political_word_filter(sample[self.text_key])
        logger.info(
            f"fileName: {sample[self.filename_key]}, method: PoliticalWordCleaner costs {time.time() - start:.6f} s")
        return sample

    def _political_word_filter(self, text):
        """Main word-filtering routine; processes the text line by line."""
        filtered_rows = []
        for row in text.split('\n'):
            matched_words = self.ac_automaton.search(row, self.special_symbols)
            filtered_rows.append(self.words_replace(matched_words, row))
        return '\n'.join(filtered_rows)
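A minimal sketch of the masking behavior. It is standalone: a plain substring scan stands in for the AhoCorasic automaton, whose internal API is not shown in this diff, and the sample row reuses the effect example from metadata.yml.

# Illustrative stand-in: find listed words by simple scanning, then mask them
# the same way words_replace does (longest match replaced first).
def words_replace(target_strings, text):
    target_strings.sort(key=lambda x: -len(x))
    for s in target_strings:
        text = text.replace(s, '*' * len(s))
    return text

row = '特别字符:改革历程'
matched = [w for w in ['改革历程'] if w in row]   # stand-in for ac_automaton.search(...)
print(words_replace(matched, row))                # -> 特别字符:****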
@@ -0,0 +1,321 @@
习近平
平近习
xjp
习太子
习明泽
老习
温家宝
温加宝
温x
温jia宝
温宝宝
温加饱
温加保
张培莉
温云松
温如春
温jb
胡温
胡x
胡jt
胡boss
胡总
胡王八
hujintao
胡jintao
胡j涛
胡惊涛
胡景涛
胡紧掏
湖紧掏
胡紧套
锦涛
hjt
胡派
胡主席
刘永清
胡海峰
胡海清
江泽民
民泽江
江胡
江主席
江书记
江浙闽
江沢民
江浙民
茳泽民
zemin
ze民
老江
老j
江core
江x
江派
江zm
jzm
江戏子
江蛤蟆
江某某
江贼
江猪
江氏集团
江绵恒
江绵康
王冶坪
江泽慧
邓小平
平小邓
xiao平
邓xp
邓晓平
邓朴方
邓榕
邓质方
毛泽东
猫泽东
猫则东
猫贼洞
毛zd
毛zx
z东
ze东
泽d
zedong
毛太祖
毛相
主席画像
改革历程
朱镕基
朱容基
朱镕鸡
朱容鸡
朱云来
李鹏
李peng
里鹏
李月月鸟
李小鹏
李小琳
华主席
华国
国锋
国峰
锋同志
白春礼
薄熙来
薄一波
蔡赴朝
蔡武
曹刚川
常万全
陈炳德
陈德铭
陈建国
陈良宇
陈绍基
陈同海
陈至立
戴秉国
丁一平
董建华
杜德印
杜世成
傅锐
郭伯雄
郭金龙
贺国强
胡春华
耀邦
华建敏
黄华华
黄丽满
黄兴国
回良玉
贾庆林
贾廷安
靖志远
李长春
李春城
李建国
李克强
李岚清
李沛瑶
李荣融
李瑞环
李铁映
李先念
李学举
李源潮
栗智
梁光烈
廖锡龙
林树森
林炎志
林左鸣
令计划
柳斌杰
刘奇葆
刘少奇
刘延东
刘云山
刘志军
龙新民
路甬祥
罗箭
吕祖善
马飚
马恺
孟建柱
欧广源
强卫
沈跃跃
宋平顺
粟戎生
苏树林
孙家正
铁凝
屠光绍
王东明
汪东兴
王鸿举
王沪宁
王乐泉
王洛林
王岐山
王胜俊
王太华
王学军
王兆国
王振华
吴邦国
吴定富
吴官正
无官正
吴胜利
吴仪
奚国华
习仲勋
徐才厚
许其亮
徐绍史
杨洁篪
叶剑英
由喜贵
于幼军
俞正声
袁纯清
曾培炎
曾庆红
曾宪梓
曾荫权
张德江
张定发
张高丽
张立昌
张荣坤
张志国
赵洪祝
紫阳
周生贤
周永康
朱海仑
中南海
大陆当局
中国当局
北京当局
共产党
党产共
共贪党
阿共
产党共
公产党
工产党
共c党
共x党
共铲
供产
共惨
供铲党
供铲谠
供铲裆
共残党
共残主义
共产主义的幽灵
拱铲
老共
中共
中珙
中gong
gc党
贡挡
gong党
g产
狗产蛋
共残裆
恶党
邪党
共产专制
共产王朝
裆中央
土共
土g
共狗
g匪
共匪
仇共
症腐
政腐
政付
正府
政俯
政f
zhengfu
政zhi
挡中央
档中央
中国zf
中央zf
国wu院
中华帝国
gong和
大陆官方
北京政权
江泽民
胡锦涛
温家宝
习近平
习仲勋
贺国强
贺子珍
周永康
李长春
李德生
王岐山
姚依林
回良玉
李源潮
李干成
戴秉国
黄镇
刘延东
刘瑞龙
俞正声
黄敬
薄熙
薄一波
周小川
周建南
温云松
徐明
江泽慧
江绵恒
江绵康
李小鹏
李鹏
李小琳
朱云来
朱容基
法轮功
李洪志
新疆骚乱
@@ -0,0 +1,50 @@
!

.
,
#
$
%
&
*
(
)
|
?
/
@
"
'
;
[
]
{
}
+
~
-
_
=
^
<
>
!
。
,
¥
(
)
?
、
“
‘
;
【
】
——
…
……
《
》
:
:
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='DuplicateSentencesFilter',
                          module_path="ops.mapper.remove_duplicate_sentences.process")
16
runtime/ops/mapper/remove_duplicate_sentences/metadata.yml
Normal file
@@ -0,0 +1,16 @@
name: '文档局部内容去重'
name_en: 'Partial Content Deduplication'
description: '文档局部内容去重。'
description_en: 'Deduplicates partial file content.'
language: 'python'
vendor: 'huawei'
raw_id: 'DuplicateSentencesFilter'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: '这是一个重复的句子。 这是一个重复的句子。 这是一个重复的句子。 这是一个重复的句子。 这是一个重复的句子。'
  after: '这是一个重复的句子。'
inputs: 'text'
outputs: 'text'
68
runtime/ops/mapper/remove_duplicate_sentences/process.py
Normal file
@@ -0,0 +1,68 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: Partial document content deduplication
Create: 2025/01/07
"""
import re
import time
from collections import Counter
from typing import Dict, Any

from loguru import logger

from datamate.core.base_op import Filter


def duplicate_sentences_filter(input_data: str, file_name: str, duplicate_th: int = 5) -> str:
    """Partial content deduplication: remove paragraphs or sentences that recur too often.

    Works at paragraph granularity: paragraphs repeated at least `duplicate_th` times are
    dropped except for their first occurrence, which keeps its original content, leading
    and trailing whitespace included.

    Args:
        input_data: input text
        file_name: file name
        duplicate_th: repetition threshold; paragraphs occurring this many times or more
            are deduplicated (default 5)
    Returns:
        str: cleansed text
    """
    paragraphs = input_data.split("\n")
    trust_set = {'<table>', '<tbody>', '<tr>', '<td>', '</table>', '</tbody>', '</tr>', '</td>', ""}

    # Single pass over the text to count how often each stripped paragraph occurs
    order_paragraphs = []
    paragraph_counts = Counter([line.strip() for line in re.split("\\n", input_data)])

    try:
        for paragraph in paragraphs:
            # Elements of trust_set are exempt from deduplication
            if paragraph.strip() in trust_set:
                order_paragraphs.append(paragraph)
                continue
            paragraph_strip = paragraph.strip()
            if duplicate_th > paragraph_counts[paragraph_strip] >= 0:
                order_paragraphs.append(paragraph)
            elif paragraph_counts[paragraph_strip] >= duplicate_th:
                # Keep the first occurrence, then mark the paragraph so later copies are dropped
                order_paragraphs.append(paragraph)
                paragraph_counts[paragraph_strip] = -1

    except Exception as err:
        logger.exception(f"fileName: {file_name}, method: RemoveDuplicateSentences. An error occurred while "
                         f"filtering duplicate sentences. The error is: {err}")
        return input_data

    # Reassemble the deduplicated paragraphs into text
    result_text = '\n'.join(order_paragraphs)
    return result_text


class DuplicateSentencesFilter(Filter):
    """Partial document content deduplication plugin"""

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        duplicate_th = 5  # paragraph repetition threshold
        file_name = sample[self.filename_key]
        start = time.time()
        sample[self.text_key] = duplicate_sentences_filter(sample[self.text_key], file_name, duplicate_th)
        logger.info(f"fileName: {file_name}, RemoveDuplicateSentences costs {time.time() - start:.6f} s")
        return sample
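A quick, hedged usage sketch of duplicate_sentences_filter as defined above (the file name and sample paragraphs are illustrative). A paragraph that appears five times crosses the default threshold, so only its first occurrence survives; paragraphs below the threshold pass through untouched.

text = '\n'.join(['标题', '这是一个重复的句子。', '正文A', '这是一个重复的句子。', '正文B',
                  '这是一个重复的句子。', '这是一个重复的句子。', '这是一个重复的句子。'])
# '这是一个重复的句子。' occurs 5 times (>= duplicate_th), so only the first copy is kept.
print(duplicate_sentences_filter(text, 'demo.txt', duplicate_th=5).split('\n'))
# -> ['标题', '这是一个重复的句子。', '正文A', '正文B']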
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='SexualAndViolentWordCleaner',
                          module_path="ops.mapper.sexual_and_violent_word_cleaner.process")
@@ -0,0 +1,16 @@
name: '暴力色情文本匿名化'
name_en: 'Violent and Pornographic Text Anonymization'
description: '将暴力、色情文本进行匿名化。'
description_en: 'Anonymizes violent and pornographic texts.'
language: 'python'
vendor: 'huawei'
raw_id: 'SexualAndViolentWordCleaner'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: '特别字符:炸药'
  after: '特别字符:***'
inputs: 'text'
outputs: 'text'
@@ -0,0 +1,70 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

"""
Description: Violent and pornographic text filtering
Create: 2024/12/26 15:43
"""
import time
from pathlib import Path
from typing import Dict, Any

from loguru import logger

from datamate.common.utils.aho_corasick import AhoCorasic
from datamate.core.base_op import Mapper


class SexualAndViolentWordCleaner(Mapper):
    """Plugin that filters violent and pornographic terms in externally supplied text."""
    root_path = Path(__file__).parent / 'resources'
    VIOLENT_FILE_PATH = str(root_path / 'violent.txt')
    SEXUAL_FILE_PATH = str(root_path / 'sexual.txt')
    SPECIAL_SYMBOLS_PATH = str(root_path / 'special_symbols.txt')

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.violent_words = self.load_words_list(self.VIOLENT_FILE_PATH)
        self.sexual_words = self.load_words_list(self.SEXUAL_FILE_PATH)
        self.special_symbols = self.load_words_list(self.SPECIAL_SYMBOLS_PATH)
        self.words = self.violent_words | self.sexual_words
        self.ac_automaton = AhoCorasic(self.words)

    @staticmethod
    def load_words_list(path):
        """Load a word list from file."""
        with open(path, 'r', encoding='utf-8') as f:
            words = set(f.read().splitlines())
        return words

    @staticmethod
    def words_replace(target_strings: list, text: str):
        """
        Replace the target strings with asterisks.

        Args:
            target_strings: matched sensitive words to mask.
            text: text to be cleansed.
        Returns:
            Cleansed text.
        """
        target_strings.sort(key=lambda x: -len(x))
        for s in target_strings:
            text = text.replace(s, '*' * len(s))
        return text

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        start = time.time()
        sample[self.text_key] = self._sexual_and_violent_word_filter(sample[self.text_key])
        logger.info(f"fileName: {sample[self.filename_key]}, "
                    f"method: SexualAndViolentWordCleaner costs {time.time() - start:.6f} s")
        return sample

    def _sexual_and_violent_word_filter(self, text):
        """Main word-filtering routine; processes the text line by line."""
        filtered_rows = []
        for row in text.split('\n'):
            matched_words = self.ac_automaton.search(row, self.special_symbols)
            filtered_rows.append(self.words_replace(matched_words, row))
        return '\n'.join(filtered_rows)
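One detail worth noting in words_replace (shared with PoliticalWordCleaner): matches are sorted longest-first so that a shorter word that is a substring of a longer one cannot punch a hole in the longer replacement. A tiny standalone illustration with hypothetical overlapping terms:

# Hypothetical overlapping terms: '炸药' is a substring of '出售炸药'.
def mask(words, text):
    words.sort(key=lambda x: -len(x))   # longest match first, as in words_replace
    for w in words:
        text = text.replace(w, '*' * len(w))
    return text

print(mask(['炸药', '出售炸药'], '网上出售炸药是违法的'))  # -> 网上****是违法的

Replacing in the opposite order would first turn '炸药' into '**', after which '出售炸药' no longer matches and the longer term is only partially masked.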
@@ -0,0 +1,288 @@
爱液
按摩棒
拔出来
爆草
包二奶
暴干
暴奸
暴乳
爆乳
暴淫
被操
被插
被干
逼奸
仓井空
插暴
操逼
操黑
操烂
肏你
肏死
操死
操我
厕奴
插比
插b
插逼
插进
插你
插我
插阴
潮吹
潮喷
成人电影
成人论坛
成人色情
成人网站
成人文学
成人小说
艳情小说
成人游戏
吃精
抽插
春药
大波
大力抽送
大乳
荡妇
荡女
盗撮
发浪
放尿
肥逼
粉穴
干死你
干穴
肛交
肛门
龟头
裹本
国产av
豪乳
黑逼
后穴
虎骑
换妻俱乐部
黄片
几吧
鸡吧
鸡巴
鸡奸
妓女
奸情
叫床
脚交
精液
就去日
巨屌
菊花洞
菊门
巨奶
巨乳
菊穴
开苞
口爆
口活
口交
口射
口淫
狂操
狂插
浪逼
浪妇
浪叫
浪女
漏乳
露b
乱交
乱伦
轮暴
轮操
轮奸
裸陪
买春
美逼
美少妇
美乳
美腿
美穴
美幼
秘唇
迷奸
密穴
蜜穴
蜜液
摸奶
摸胸
母奸
奈美
奶子
男奴
内射
嫩逼
嫩女
嫩穴
捏弄
女优
炮友
砲友
喷精
屁眼
前凸后翘
强jian
强暴
强奸处女
情趣用品
情色
拳交
全裸
群交
人妻
人兽
日逼
日烂
肉棒
肉逼
肉唇
肉洞
肉缝
肉棍
肉茎
肉具
揉乳
肉穴
肉欲
乳爆
乳房
乳沟
乳交
乳头
骚逼
骚比
骚女
骚水
骚穴
色逼
色情网站
色区
色色
色诱
色欲
色b
射爽
射颜
食精
释欲
兽奸
兽交
手淫
兽欲
熟妇
熟母
熟女
爽片
双臀
死逼
丝袜
丝诱
松岛枫
酥痒
汤加丽
套弄
体奸
体位
舔脚
舔阴
调教
偷欢
推油
脱内裤
文做
舞女
吸精
夏川纯
相奸
小逼
小穴
小xue
性感妖娆
性感诱惑
性虎
性饥渴
性技巧
性交
性奴
性虐
性息
性欲
穴口
穴图
亚情
颜射
阳具
杨思敏
要射了
一夜欢
一夜情
一ye情
阴部
淫虫
阴唇
淫荡
阴道
淫电影
阴阜
淫妇
淫河
阴核
阴户
淫贱
淫叫
淫教师
阴茎
阴精
淫浪
淫媚
淫糜
淫魔
淫母
淫女
淫虐
淫妻
淫情
淫色
淫声浪语
淫兽学园
淫书
淫术炼金士
淫水
淫娃
淫威
淫亵
淫样
淫液
淫照
阴b
应召
幼交
欲火
欲女
玉乳
玉穴
援交
原味内衣
援助交际
招鸡
招妓
抓胸
自慰
作爱
a片
fuck
gay片
g点
h动画
h动漫
失身粉
淫荡自慰器
@@ -0,0 +1,50 @@
!

.
,
#
$
%
&
*
(
)
|
?
/
@
"
'
;
[
]
{
}
+
~
-
_
=
^
<
>
!
。
,
¥
(
)
?
、
“
‘
;
【
】
——
…
……
《
》
:
:
@@ -0,0 +1,452 @@
出售雷管
炸药
出售炸药
出售炸药电话
制作火药配方
炸药出售
出售雷管炸药
火药配方
在家里做原子弹
硝酸甘油炸弹制作
TNT
炸弹的制作
硝铵炸药配方
硝酸甘油制作
炸弹配方
简易炸药
火药制作简易炸弹
炸弹制作
硝酸甘油
硝酸甘油炸弹制作方法
手把手教你做炸弹
恐怖分子傻瓜手册
氢弹手工制作方法
起爆器
自制燃烧弹
燃烧弹
弹药制造方法
液体炸药配方
炸药配方与制作
自制炸药配方
tnt炸药成分
炸药的制作
tnt炸药配方
各种炸药配方大全
炸药的成分
燃烧弹的制作方
黑火药的配方
黑火药的制作
黑火药比例
制作燃烧弹
黑火药配方比例
黑火药的成分
原子弹配方
原子弹DIY教程
C4炸药
雷管出售
制作硝化甘油
DIY核弹
核弹制作方法
原子弹制作手册
简易原子弹教程
黑火药配方
自制炸药方法
教你怎么用土办法做武器
zz炸弹的制作方法
C4炸药的成分
炸药制造生产工艺
原子弹制造完全手册
汽车炸弹制作
人体炸弹制作流程
炸弹制作从入门
鸡尾酒炸弹制作
集束炸弹制作
硝酸铵的简便制取方法
莫洛托夫燃烧弹
塑胶炸弹
用火药混合塑料
高性能爆炸物质混合
原子弹制作方法
手教你制作原子弹
轻型巡航导弹的制作
黑索金的制造过程
Zha药制造进程
购买自爆材料
可塑炸弹制作方法
C4塑胶炸药
水浴法制TNT
简易核弹头的制造
高爆炸药的基本配方
如何制硝化甘油
燃烧剂的简易制备
简易大威力炸弹
炸弹制作入门
硝酸甘油炸药制作
基本配方是硝酸甘
燃烧剂的简易制作
制作性能优良的燃烧弹
酸甘油炸药
简易导弹制作
核弹头的制造
可塑炸弹制作
炸药硝酸甘油制法
原子弹制作方法
火药的配方
手教你做原子弹
怎么用土办法做武器
手把手教你制作原子弹
燃烧弹的制作方法
怎么自制燃烧弹
TNT炸弹制作
高爆炸药基本配方
爆破和杀伤燃烧弹
炸弹制作从入门到精
写的炸弹制作教程
原子弹制作方法
教你制作原子弹
下硝化甘油的制作方法
DIY原子弹
原子弹制作
炸药及其制造方法
ATOM弹制造
三分钟教你原子弹
火药原理
简易炸弹
简易燃烧弹
土炸药成份
你怎么用土办法做武器
燃烧弹制作
燃烧瓶的制作方法
炸弹制作手册
原子弹制作方法DIY
汽油三分之一白糖混合
炸药的制作配方
燃烧弹的制作方
炸药的配方
如何自制燃烧弹
标准炸弹教程
黑火药和硝酸钾农药混合
恐怖分子自己制造炸弹
手柄制作炸弹引爆器
制作炸弹首先要
燃烧瓶制作
出售炸药
出售气枪
出售炸药QQ
出售气枪QQ
出售气枪电话
出售手枪QQ
出售手枪电话
出售猎枪
出售猎枪QQ
出售猎枪电话
出售汽枪
出售步枪
出售狙击枪
出售手狗
出售气狗
出售猎狗
手枪出售
64式手枪出售
92式手枪出售
高仿真枪
出售54式手枪
出售M1911
出售各类猎狗
出售国产军用54手枪
出售火枪游戏
出售进口金属枪模
出售仿真枪
出售麻醉枪免收任何定金
出售手枪
出售外国枪模
出售袖珍手枪
娥眉气枪出售
防身手枪QQ
防身武器手枪
仿真枪QQ
钢珠枪小口径步枪
高压狗QQ
高压气枪
气枪子弹
高压气枪电狗专卖
高压气枪
气枪货到付款
高压气枪专卖店
各类军用枪
各类军用枪支
各式气枪出售
工字牌气枪出售专卖
气枪
工字汽狗麻醉枪
供应军用手枪
供应汽枪
购买枪支QQ
国产汽狗专卖
气枪货
集团出售手枪
集团出售手枪气枪
进口气枪
气枪子弹
军用54出售
军用77出售
军用手枪钢珠枪
里有汽枪出售
猎枪出售
猎枪仿真枪QQ
猎枪麻醉枪钢珠枪
猎枪哪里有卖
猎枪手枪买卖
猎枪销售手枪销售气枪
猎枪专卖店
猎枪子弹底火专卖
麻醉枪出售可验货
麻醉枪猎枪军用
买卖54狗
买卖64狗
买卖警察枪支
买卖军狗
买卖军用枪支
卖64手枪
卖92手枪
卖钢珠枪QQ
哪里能买到自制手枪
哪里有钢珠枪
哪里有连发猎枪买卖
哪里有卖双管猎
哪里有气枪卖
哪里有枪出售
哪里有真枪出售
哪里有真枪卖买
哪能买到真枪
出售手枪气枪猎
气狗
高压气枪
气狗出售
进口气狗
气枪专卖网
气枪出售
气枪专卖网
猎枪
钢珠枪
气枪仿真枪出售
气枪购买
气枪组装
气枪货到付款
汽狗出售
气枪价格
仿真气枪
气枪价格
气枪专卖店
气枪看货加QQ
气枪联系QQ
气枪猎枪麻醉枪
气枪迷魂药
汽狗高压汽枪
铅弹
上海工字气枪
全金属仿真枪专卖
热武器QQ
三箭气枪出售
枪猎
手槍买卖自制手枪
售枪购买手枪
售手枪QQ
司马仿真枪模型QQ
淘宝气枪货到付款
提供气狗专卖店
提供专业及时的气狗
秃鹰气枪货到付款
锡峰气枪出售
仙鹤气枪出售
买卖77式手枪
销售电手狗
销售气手狗
销售小口径步枪
销售运动步枪
小口径秃鹰沙漠
有军用枪支出售
长期出售手枪
折叠狗QQ
转让猎枪
子弹货到付款
自制手枪哪里买
自制手枪哪里有卖
阻击枪
汽枪
高压气枪
组装手枪
最新销售汽狗
出售警用手铐
仿真枪
猎枪
真枪
手枪
阻击枪
弓弩专营
弓弩专卖网货到付款
弓弩专卖网
弓弩专卖货到付款
弓弩专卖店网
弓弩专卖店
弓弩专卖
弓弩制作
弓弩直销
弓弩有限公司
弓弩营销中心
弓弩一百专卖店
弓弩销售
弓弩网
弓弩图纸
弓弩特许经销
弓弩狩猎网
自制手弩
追风弓弩麻醉箭专卖
专业弓弩网
中国战神军用弓弩
中国弩弓专卖
中国弓弩专卖网
中国弓弩直销
中国弓弩网
中国弓弩狩猎网
中国弓驽网
制作简易弓弩
郑州弓弩专卖
赵氏弓弩专卖网
赵氏弓弩专卖店
赵氏弓弩专卖
赵氏弓弩销售
小型弓弩专卖店
小猎人弓弩网
狩猎器材弓弩专卖
狩猎器材弓弩
狩猎弓弩专卖网
狩猎弓弩专卖
狩猎弓弩麻醉箭
手枪式折叠三用弩
三利达弓弩专卖网
三利达弓弩直营
三利达弓弩配件
三步倒药箭批发
三步倒弩箭专卖
三步倒麻醉弩箭销售
三步倒麻醉箭专卖
三步倒麻醉箭
三步倒捕狗药
军用弓弩专卖网
军用弓弩专卖店
军用弓弩批发
军用弓弩公司
供应三利达弓弩麻醉箭
供应三步倒麻醉箭
供应秦氏弓弩
供应弩用麻醉箭
供应弩捕狗箭
供应麻醉箭三步倒
供应麻醉箭批发
供应麻醉箭
供应军用弩折叠弩
供应军用弓弩专卖
供应精品弓弩
供应弓弩麻醉箭
供应弓弩
供应钢珠弓弩
弓弩商城专卖
弓弩商城
弓弩亲兄弟货到付款
弓弩批发
弓弩免定金货到付款
弓弩麻醉箭
弓弩麻醉镖
弓弩论坛
钢珠弓弩专卖网
钢珠弓弩专卖店
打狗弓弩三步倒
麻醉弓弩专卖店
出售军刀
出售军刺
出售弹簧刀
出售三棱刀
出售跳刀
军刀网
南方军刀网
户外军刀网
三棱军刺专卖
出售开山刀军刺
西点军刀网
军刀专卖
戈博军刀
阿兰德龙户外
出售军品军刀
勃朗宁军刀
军刀军品网
阿兰得龙野营刀具网
出售军刺军刀
警用刀具出售
折刀专卖网
阳江军品军刀网
野营刀专卖
砍刀精品折刀专卖
匕首蝴蝶甩刀专卖
军刀专卖军刺
军刀专卖刀具批发
军刀图片砍刀
军刀网军刀专卖
军刀价格军用刀具
军品军刺网
军刀军刺甩棍
阳江刀具批发网
北方先锋军刀
正品军刺出售
野营军刀出售
开山刀砍刀出售
仿品军刺出售
军刀直刀专卖
手工猎刀专卖
自动跳刀专卖
军刀电棍销售
军刀甩棍销售
美国军刀出售
极端武力折刀
防卫棍刀户外刀具
阿兰德龙野营刀
仿品军刺网
野营砍刀户外军刀
手工猎刀户外刀具
中国户外刀具网
西点军品军刀网
野营开山刀军刺
三利达弓弩军刀
尼泊尔军刀出售
防卫野营砍刀出售
防卫著名军刀出售
防卫棍刀出售
防卫甩棍出售
防卫电棍出售
军刺野营砍刀出售
著名精品折刀出售
战术军刀出售
刺刀专卖网
户外军刀出售
阳江刀具直销网
冷钢刀具直销网
防卫刀具直销网
极端武力直销网
刀具直销网
军刀直销网
直刀匕首直销网
军刀匕首直销网
折刀砍刀军品网
野营刀具军品网
阳江刀具军品网
冷钢刀具军品网
防卫刀具军品网
极端武力军品网
军用刀具军品网
军刀直刀军品网
折刀砍刀专卖
野营刀具专卖
阳江刀具专卖
冷钢刀具专卖
防卫刀具专卖
出售美军现役军刀
6
runtime/ops/mapper/text_to_word/__init__.py
Normal file
@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-

from datamate.core.base_op import OPERATORS

OPERATORS.register_module(module_name='TextToWord',
                          module_path="ops.mapper.text_to_word.process")
16
runtime/ops/mapper/text_to_word/metadata.yml
Normal file
@@ -0,0 +1,16 @@
name: '转换为Word'
name_en: 'Convert-to-Word'
description: '将抽取结果转换为docx的word文件。'
description_en: 'Converts extraction results to Word files in DOCX format.'
language: 'python'
vendor: 'huawei'
raw_id: 'TextToWord'
version: '1.0.0'
types:
  - 'cleanse'
modal: 'text'
effect:
  before: ''
  after: ''
inputs: 'text'
outputs: 'text'
Some files were not shown because too many files have changed in this diff