init datamate

This commit is contained in:
Dallas98
2025-10-21 23:00:48 +08:00
commit 1c97afed7d
692 changed files with 135442 additions and 0 deletions

View File

@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
import sys
from pathlib import Path
from datamate.common.utils.custom_importer import CustomImporter
def _configure_importer():
base_path = Path(__file__).resolve().parent
sys.meta_path.append(CustomImporter(base_path))
_configure_importer()
def _import_operators():
from . import content_cleaner
from . import credit_card_number_cleaner
from . import email_cleaner
from . import emoji_cleaner
from . import extra_space_cleaner
from . import full_width_characters_cleaner
from . import garble_characters_cleaner
from . import html_tag_cleaner
from . import id_number_cleaner
from . import img_watermark_remove
from . import invisible_characters_cleaner
from . import ip_address_cleaner
from . import legend_cleaner
from . import phone_number_cleaner
from . import political_word_cleaner
from . import sexual_and_violent_word_cleaner
from . import text_to_word
from . import traditional_chinese
from . import unicode_space_cleaner
from . import url_cleaner
from . import xml_tag_cleaner
from . import img_enhanced_brightness
from . import img_enhanced_contrast
from . import img_enhanced_saturation
from . import img_enhanced_sharpness
from . import img_perspective_transformation
from . import img_direction_correct
from . import img_denoise
from . import img_shadow_remove
from . import img_type_unify
from . import img_resize
from . import remove_duplicate_sentences
from . import knowledge_relation_slice
_import_operators()

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ContentCleaner',
module_path="ops.mapper.content_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '文档目录去除'
name_en: 'Document Contents Removal'
description: '去除文档中的目录。'
description_en: 'Removes tables of contents from documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'ContentCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,64 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: document table-of-contents removal
Create: 2025/01/13
"""
import re
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class ContentCleaner(Mapper):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.no_content_count = 3 # threshold of consecutive lines that do not look like TOC entries
# TOC heading
self.content_text_pattern = r"^ *(目 *录|CONTENT(S)?) *$"
# TOC line: prefix format
self.content_preface_pattern = r"^ *(前言|About This Document|\d+(\.\d+)*|[a-zA-Z]+(\.\d+)*)"
# TOC line: middle format
self.content_middle_pattern = r"\.{7,}"
# TOC line: ending format
self.content_end_pattern = r"(\d|错误!未定义书签。|[IXV]+) *$"
self.content_pattern = self.content_preface_pattern + ".*" + self.content_end_pattern
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._content_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: ContentCleaner costs {time.time() - start:6f} s")
return sample
def _content_filter(self, input_data: str):
count = 0 # number of consecutive lines that do not match the TOC structure; after 3 such lines we assume the body text has started
# start and end indexes of the TOC block
content_start_index, content_end_index = -1, -1
lines = input_data.split("\n")
for i, line in enumerate(lines):
if content_start_index >= 0 and count >= self.no_content_count:
break
# first, match the TOC heading ("目录" / "CONTENTS")
if content_start_index < 0 and re.match(self.content_text_pattern, line, re.IGNORECASE):
content_start_index = i
content_end_index = i
# match the two supported TOC line forms:
# 1. starts and ends with the expected formats; 2. the line contains a run of 7 or more dots
elif content_start_index >= 0 and (re.match(self.content_pattern, line, re.IGNORECASE)
or re.search(self.content_middle_pattern, line)):
content_end_index = i
count = 0
elif content_start_index >= 0 and not (re.match(self.content_pattern, line, re.IGNORECASE)
or re.search(self.content_middle_pattern, line)):
count += 1
if 0 <= content_start_index < content_end_index:
res = "\n".join(lines[:content_start_index] + lines[content_end_index + 1:])
else:
# if only the TOC heading was found, keep it; if the structure does not look like a TOC, return the original text
res = "\n".join(lines)
return res
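A minimal, self-contained sketch (reusing the patterns defined above) of how a TOC heading and TOC entry lines are recognized; the sample lines are made up:

import re

content_text_pattern = r"^ *(目 *录|CONTENT(S)?) *$"
content_preface_pattern = r"^ *(前言|About This Document|\d+(\.\d+)*|[a-zA-Z]+(\.\d+)*)"
content_middle_pattern = r"\.{7,}"
content_end_pattern = r"(\d|错误!未定义书签。|[IXV]+) *$"
content_pattern = content_preface_pattern + ".*" + content_end_pattern

for line in ["CONTENTS", "1.1 Overview .......... 3", "This paragraph is body text."]:
    is_heading = bool(re.match(content_text_pattern, line, re.IGNORECASE))
    is_entry = bool(re.match(content_pattern, line, re.IGNORECASE) or re.search(content_middle_pattern, line))
    print(line, "->", "heading" if is_heading else "entry" if is_entry else "body")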

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='AnonymizedCreditCardNumber',
module_path="ops.mapper.credit_card_number_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '信用卡号匿名化'
name_en: 'Credit Card Number Anonymization'
description: '信用卡号匿名化'
description_en: 'Anonymizes credit card numbers.'
language: 'python'
vendor: 'huawei'
raw_id: 'AnonymizedCreditCardNumber'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '这个是信用卡号:4111111111111111'
after: '这个是信用卡号:<credit_card_number>'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,83 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: credit card number anonymization
Create: 2024/12/5 15:43
"""
from loguru import logger
import re
import time
from typing import Dict, Any
from datamate.core.base_op import Mapper
class AnonymizedCreditCardNumber(Mapper):
def __init__(self, *args, **kwargs):
super(AnonymizedCreditCardNumber, self).__init__(*args, **kwargs)
self.re_compile = self._get_credit_card_re_compile()
@staticmethod
def _verify_credit_card_num(credit_card_num: str):
"""Validate a credit card number with the Luhn checksum"""
# reverse the digits (right to left)
digits = [int(x) for x in reversed(credit_card_num) if x.isdigit()]
# double every second digit (d * 2)
even_digits = [d * 2 for d in digits[1::2]]
# if doubling yields a two-digit number, add its digits together
even_digits = [d // 10 + d % 10 for d in even_digits]
# sum all single digits from the previous step
even_sum = sum(even_digits)
# sum the digits in the odd positions, counting from the right
odd_sum = sum(digits[::2])
# the number is valid if even_sum + odd_sum is divisible by 10
if (odd_sum + even_sum) % 10 == 0:
return True
return False
@staticmethod
def _get_credit_card_re_compile():
separator_symbol = r"([- ]?)"
# American Express: 15 digits starting with 34 or 37, formatted NNNN-NNNNNN-NNNNN or NNNN NNNNNN NNNNN
american_express = "3[47][0-9]{2}" + separator_symbol + "[0-9]{6}" + separator_symbol + "[0-9]{5}"
# China UnionPay: 16 digits starting with 62 or 60, formatted NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN
china_union_pay = r"(6[02]\d{2})" + r"(%s\d{%d}){%d}" % (separator_symbol, 4, 3)
# Diner's Club: 14 digits starting with 300-305, 36, 38, 39 or 3095, formatted NNNN-NNNNNN-NNNN or NNNN NNNNNN NNNN
diners_club = r"(30[0-5]\d|3[689]\d{2}|3095)" + separator_symbol + r"[0-9]{6}" + separator_symbol + r"[0-9]{4}"
# Discover: 16 digits starting with 6011, 644-649 or 65, formatted NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN
discover = r"(64[4-9]\d|65\d{2}|6011)" + r"(%s\d{%d}){%d}" % (separator_symbol, 4, 3)
# JCB: 16 digits starting with 3528 to 3589, formatted NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNNNNNN
jcb = r"(352[89]|35[3-8]\d)" + separator_symbol + r"[0-9]{4}" + (
r"((%s\d{%d}){%d}" % (separator_symbol, 4, 2) + ")|" + separator_symbol + r"[0-9]{8}")
# Mastercard: 16 digits starting with 51-55 or 2221-2720, formatted NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN
master_card = r"(5[1-5]\d{2}|222[1-9]|22[3-9]\d|2[3-6]\d{2}|27[01]\d|2720)" + r"(%s\d{%d}){%d}" \
% (separator_symbol, 4, 3)
# Visa: 16 digits starting with 4, formatted NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN
visa = r"4\d{3}" + r"(%s\d{%d}){%d}" % (separator_symbol, 4, 3)
credit_card_pattern = r"(?<=[^\d])(%s|%s|%s|%s|%s|%s|%s)(?=[^\d])" % (
american_express, china_union_pay, diners_club,
discover, jcb, master_card, visa)
credit_card_re_compile = re.compile(credit_card_pattern)
return credit_card_re_compile
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._credit_card_number_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: CreditCardNumberCleaner costs {time.time() - start:6f} s")
return sample
def _credit_card_number_filter(self, input_data: str):
"""Anonymize credit card numbers in the text"""
input_data = ''.join(['\n', input_data, '\n'])  # newline sentinels (assumed padding) so the boundary lookarounds match; stripped by the final [1:-1]
# extract substrings that match the credit card regex
credit_card_nums = [item.group(1) for item in self.re_compile.finditer(input_data)]
# check whether each extracted string is a genuine credit card number
for credit_card_num in credit_card_nums:
if self._verify_credit_card_num(credit_card_num):
# replace valid credit card numbers with <credit_card_number>
credit_card_num_pattern = r"(?<=[^\d]){}(?=[^\d])".format(credit_card_num)
input_data = re.compile(credit_card_num_pattern).sub("<credit_card_number>", input_data)
return input_data[1:-1]
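For reference, a standalone sketch of the same Luhn checksum used by _verify_credit_card_num, applied to the sample number from the operator spec (4111111111111111):

def luhn_valid(number: str) -> bool:
    digits = [int(c) for c in reversed(number) if c.isdigit()]
    doubled = [d * 2 for d in digits[1::2]]        # double every second digit from the right
    doubled = [d // 10 + d % 10 for d in doubled]  # collapse two-digit results to a single digit
    return (sum(doubled) + sum(digits[::2])) % 10 == 0

print(luhn_valid("4111111111111111"))  # True, so the operator would replace it with <credit_card_number>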

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='EmailNumberCleaner',
module_path="ops.mapper.email_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '邮件地址匿名化'
name_en: 'Email Address Anonymization'
description: '邮件地址匿名化'
description_en: 'Anonymizes email addresses.'
language: 'python'
vendor: 'huawei'
raw_id: 'EmailNumberCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '这个是邮箱号:test_email@gmail.com'
after: '这个是邮箱号:<email>'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,47 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: email address anonymization
Create: 2025/01/15
"""
from loguru import logger
import re
import time
from typing import Dict, Any
from email_validator import validate_email, EmailNotValidError
from datamate.core.base_op import Mapper
class EmailNumberCleaner(Mapper):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.front_email_pattern = r'(?<=[^0-9a-zA-Z\!\#\$\%\&\'\*\+\-\/\=\?\^\_\`\{\|\}\~\-])'
self.back_email_pattern = r'(?=[^0-9a-zA-Z\!\#\$\%\&\'\*\+\-\/\=\?\^\_\`\{\|\}\~\-])'
self.email_pattern = r'([a-zA-Z\d.\-+_]+\s?@\s?[a-zA-Z\d.\-+_]+\.[a-zA-Z0-9]{2,6})'
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._email_number_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: EmailCleaner costs {time.time() - start:6f} s")
return sample
def _email_number_filter(self, input_data: str):
"""Anonymize email addresses"""
mixed_data = ''.join(['\n', input_data, '\n'])  # newline sentinels (assumed padding) so the boundary lookarounds match; stripped by the final [1:-1]
paired_emails = re.compile(self.front_email_pattern + self.email_pattern + self.back_email_pattern).findall(
mixed_data)
if paired_emails:
for email in paired_emails:
try:
# validate the email address
validate_email(email, check_deliverability=False)
mixed_data = re.compile(self.front_email_pattern + re.escape(email) + self.back_email_pattern).sub(
"<email>", mixed_data, count=1)
except EmailNotValidError as err:
# log that the email address is invalid (without printing the address itself)
logger.error(f"email is abnormal email form: {err}")
return mixed_data[1:-1]
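A small sketch of the same match-then-validate flow, using the email pattern above together with the email_validator package; the sample text is made up:

import re
from email_validator import validate_email, EmailNotValidError

email_pattern = r'([a-zA-Z\d.\-+_]+\s?@\s?[a-zA-Z\d.\-+_]+\.[a-zA-Z0-9]{2,6})'
text = "contact: test_email@gmail.com, broken: foo@bar"
for candidate in re.findall(email_pattern, text):
    try:
        validate_email(candidate, check_deliverability=False)  # syntax-only validation
        text = text.replace(candidate, "<email>", 1)
    except EmailNotValidError:
        pass  # leave strings that only look like emails unchanged
print(text)  # contact: <email>, broken: foo@bar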

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='EmojiCleaner',
module_path="ops.mapper.emoji_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '文档表情去除'
name_en: 'Emoticon Removal'
description: '去除文档中表情字符或者emoji符号。'
description_en: 'Removes emoticons or emojis from documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'EmojiCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '使用方式很简单,只需要将代码放入Markdown文本中即可,富文本格式可直接复制表情😀使用。'
after: '使用方式很简单,只需要将代码放入Markdown文本中即可,富文本格式可直接复制表情使用。'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,27 @@
"""
Description: emoticon and emoji removal
Create: 2023/12/7 15:43
"""
import time
from typing import Dict, Any
import emoji
from loguru import logger
from datamate.core.base_op import Mapper
class EmojiCleaner(Mapper):
@staticmethod
def _emoji_filter(input_data: str):
res = []
for input_s in input_data.split('\n'):
res.append(emoji.replace_emoji(input_s, replace=''))
return '\n'.join(res)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._emoji_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: EmojiCleaner costs {time.time() - start:6f} s")
return sample
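The heavy lifting is done by the emoji package's replace_emoji; a quick sketch on the sample string from the operator spec:

import emoji

text = "富文本格式可直接复制表情😀使用。"
print(emoji.replace_emoji(text, replace=''))  # the 😀 emoji is stripped, the rest is unchanged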

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ExtraSpaceCleaner',
module_path="ops.mapper.extra_space_cleaner.process")

View File

@@ -0,0 +1,17 @@
name: '多余空格去除'
name_en: 'Redundant Space Removal'
description: '移除文档首尾、句中或标点符号附近多余空格和 tab 等。'
description_en: 'Removes redundant spaces and tabs at the beginning and end of documents,
in sentences, or near punctuations.'
language: 'python'
vendor: 'huawei'
raw_id: 'ExtraSpaceCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: ' 人工智能的研究历史有着一条从以“推理”为重 点,到以“知识”为重点,再到以“学习”为重点的自然、清晰的脉络。 '
after: '人工智能的研究历史有着一条从以“推理”为重点,到以“知识”为重点,再到以“学习”为重点的自然、清晰的脉络。'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,69 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: redundant space removal
Create: 2025/01/13
"""
import re
import time
from pathlib import Path
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class ExtraSpaceCleaner(Mapper):
"""Remove redundant spaces and blank lines, including leading and trailing spaces and tabs.
Note: before removal, every space character in the document is normalized to \u0020.
"""
def __init__(self, *args, **kwargs):
# matches uncommon Unicode space characters
super().__init__(*args, **kwargs)
self.white_space_pattern = ('[\u00A0 \u1680 \u2000-\u200D \u2028-\u2029'
' \u202F \u205F \u3000 \u180E \u2060 \uFEFF]')
self._file_path = Path(__file__).parent / 'resources' / 'special_token.txt'
self.escaped_special_chars = self._get_escaped_special_chars() # load punctuation characters
# matches runs of two or more spaces
extra_space_pattern = r" {2,}"
# matches mixed runs of spaces and newlines
extra_line_pattern = r"( |\n){2,}"
# matches redundant spaces between Chinese characters or punctuation
extra_space_in_chinese_pattern = r"(?<=[\u4e00-\u9fa5" + self.escaped_special_chars + r"]) +(?=[\u4e00-\u9fa5" \
+ self.escaped_special_chars + r"])"
self.extra_space_re_compile = re.compile(extra_space_pattern)
self.extra_space_in_chinese_re_compile = re.compile(extra_space_in_chinese_pattern)
self.extra_line_re_compile = re.compile(extra_line_pattern)
self.white_space_pattern_compile = re.compile(self.white_space_pattern)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._clean_extra_space(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: ExtraSpaceCleaner costs {time.time() - start:6f} s")
return sample
def _get_escaped_special_chars(self) -> str:
with open(self._file_path, 'r', encoding='utf-8') as f:
self._special_token = f.read().splitlines()
res = ''.join([re.escape(char) for char in self._special_token]) # escape the special characters and concatenate them into one string
return res
def _clean_extra_space(self, input_data: str) -> str:
# convert uncommon Unicode spaces in the document (e.g. \u2008) to a normal half-width space
input_data = self.white_space_pattern_compile.sub('\u0020', input_data)
# remove redundant spaces and tabs at the start and end of the document, within sentences, and around punctuation
input_data = input_data.strip()
# strip leading/trailing spaces line by line
text = "\n".join([line.strip() for line in input_data.split("\n")])
text = ''.join(['\n', text, '\n'])  # newline sentinels (assumed padding), stripped again by the final [1:-1]
# collapse runs of spaces into a single normal space
remove_extra_space = self.extra_space_re_compile.sub("\u0020", text)
# remove spaces between Chinese characters and punctuation
remove_extra_space_in_chinese = self.extra_space_in_chinese_re_compile.sub("", remove_extra_space)
# collapse consecutive newlines
remove_duplicate_line = self.extra_line_re_compile.sub("\n", remove_extra_space_in_chinese)
return remove_duplicate_line[1:-1]
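A rough standalone sketch of the same normalization steps (the sentinel padding is omitted for brevity); the sample text is made up:

import re

text = "  人工智能 的研究   历史\n\n\nhas  a clear   thread.  "
text = re.sub('[\u00A0\u1680\u2000-\u200D\u2028\u2029\u202F\u205F\u3000\u180E\u2060\uFEFF]', ' ', text)  # normalize unusual Unicode spaces
text = "\n".join(line.strip() for line in text.strip().split("\n"))    # strip each line
text = re.sub(r" {2,}", " ", text)                                     # collapse runs of spaces
text = re.sub(r"(?<=[\u4e00-\u9fa5]) +(?=[\u4e00-\u9fa5])", "", text)  # drop spaces between Chinese characters
text = re.sub(r"( |\n){2,}", "\n", text)                               # collapse blank lines
print(text)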

View File

@@ -0,0 +1,53 @@
~
·
@
#
%
&
*
+
-
=
{
}
|
`
!
$
^
(
)
_
[
]
\
:
"
;
'
<
>
?
,
/
.

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='FullWidthCharacterCleaner',
module_path="ops.mapper.full_width_characters_cleaner.process")

View File

@@ -0,0 +1,18 @@
name: '全角转半角'
name_en: 'Full-to-Half Width Character'
description: '将文档中的所有全角字符转换成半角字符。'
description_en: 'Converts all full-width characters in documents to half-width characters.'
language: 'python'
vendor: 'huawei'
raw_id: 'FullWidthCharacterCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: 'Residential and commercial design, site inspections, working drawings,
Minicad, renderings.'
after: 'Residential and commercial design, site inspections, working drawings, MiniCad,
renderings.'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,46 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: full-width to half-width character conversion
Create: 2025/01/13
"""
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class FullWidthCharacterCleaner(Mapper):
"""Convert all full-width characters in the document to half-width characters"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# mapping from full-width characters to their half-width (ASCII) counterparts
self._full_to_half_dict = {
'＂': '"', '＃': '#', '＄': '$', '％': '%', '＆': '&', '＇': "'", '＊': '*', '＋': '+',
'－': '-', '．': '.', '／': '/', '０': '0', '１': '1', '２': '2', '３': '3', '４': '4',
'５': '5', '６': '6', '７': '7', '８': '8', '９': '9', '＜': '<', '＝': '=', '＞': '>',
'＠': '@', 'Ａ': 'A', 'Ｂ': 'B', 'Ｃ': 'C', 'Ｄ': 'D', 'Ｅ': 'E', 'Ｆ': 'F', 'Ｇ': 'G',
'Ｈ': 'H', 'Ｉ': 'I', 'Ｊ': 'J', 'Ｋ': 'K', 'Ｌ': 'L', 'Ｍ': 'M', 'Ｎ': 'N', 'Ｏ': 'O',
'Ｐ': 'P', 'Ｑ': 'Q', 'Ｒ': 'R', 'Ｓ': 'S', 'Ｔ': 'T', 'Ｕ': 'U', 'Ｖ': 'V', 'Ｗ': 'W',
'Ｘ': 'X', 'Ｙ': 'Y', 'Ｚ': 'Z', '［': '[', '＼': '\\', '］': ']', '＾': '^', '＿': '_',
'｀': '`', 'ａ': 'a', 'ｂ': 'b', 'ｃ': 'c', 'ｄ': 'd', 'ｅ': 'e', 'ｆ': 'f', 'ｇ': 'g',
'ｈ': 'h', 'ｉ': 'i', 'ｊ': 'j', 'ｋ': 'k', 'ｌ': 'l', 'ｍ': 'm', 'ｎ': 'n', 'ｏ': 'o',
'ｐ': 'p', 'ｑ': 'q', 'ｒ': 'r', 'ｓ': 's', 'ｔ': 't', 'ｕ': 'u', 'ｖ': 'v', 'ｗ': 'w',
'ｘ': 'x', 'ｙ': 'y', 'ｚ': 'z', '｛': '{', '｜': '|', '｝': '}', '～': '~'
}
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._full_width_character_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: FullWidthCharactersCleaner costs {time.time() - start:6f} s")
return sample
def _full_width_character_filter(self, input_data: str):
res = []
for input_str in input_data.split('\n'):
res.append("".join(self._full_to_half_dict.get(char, char) for char in input_str))
return '\n'.join(res)
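As a side note, the same mapping can be generated programmatically, since the full-width forms sit at a fixed Unicode offset (0xFEE0) above their ASCII counterparts; a hedged sketch covering the same character set as the table above:

full_to_half = {chr(0xFEE0 + ord(c)): c for c in
                "\"#$%&'*+-./0123456789<=>@"
                "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
                "abcdefghijklmnopqrstuvwxyz{|}~"}
print("".join(full_to_half.get(ch, ch) for ch in "ＭｉｎｉＣＡＤ ２０２５"))  # MiniCAD 2025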

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='GrableCharactersCleaner',
module_path="ops.mapper.garble_characters_cleaner.process")

View File

@@ -0,0 +1,17 @@
name: '文档乱码去除'
name_en: 'Garbled Character Removal'
description: '去除文档中的乱码和无意义的unicode。'
description_en: 'Removes garbled characters and meaningless Unicode characters from
documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'GrableCharactersCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '文档乱码����'
after: '文档乱码'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,54 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description:
This operator removes garbled characters from documents.
Logic:
1. A regex checks whether each character's Unicode code point falls within a garbled range; characters inside a range are removed, the rest are kept.
2. At start-up, the garbled-range configuration file charset.json is loaded; in it, each key is a character-set name and each value is a list of Unicode code point ranges.
Create: 2025/01/13
"""
import json
import re
import time
from pathlib import Path
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class GrableCharactersCleaner(Mapper):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._file_path = str(Path(__file__).parent / 'resources' / 'charset.json')
self.unicode_grable_code_list = self.get_unicode_grable_code_list() # concatenation of the Unicode code point ranges treated as garbled
self.grable_re_compile = re.compile("[" + self.unicode_grable_code_list + "]")
def get_unicode_grable_code_list(self):
"""Build the character-class body covering the garbled Unicode ranges"""
res = ""
with open(self._file_path, 'r', encoding='utf-8') as f:
charset_number_list = json.load(f)
for number_ranges in charset_number_list.values():
for number_range in number_ranges:
number_range_list = number_range.split(",")
if len(number_range_list) < 2:
logger.error(f"number_range_list size is {len(number_range_list)}, formatting error")
continue
res += number_range_list[0] + "-" + number_range_list[1]
return res
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._grable_characters_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: GrableCharactersCleaner costs {time.time() - start:6f} s")
return sample
def _grable_characters_filter(self, input_data: str):
"""Remove garbled characters from the document"""
return self.grable_re_compile.sub("", input_data)
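A rough sketch of how the character class is assembled from the JSON ranges and then applied (the tiny in-line dict stands in for charset.json):

import re

charset = {"placeholder": ["\uFFFD,\uFFFD"], "private use area": ["\uE000,\uF8FF"]}
body = ""
for ranges in charset.values():
    for rng in ranges:
        low, high = rng.split(",")
        body += low + "-" + high
garble_re = re.compile("[" + body + "]")
print(garble_re.sub("", "文档乱码\ufffd\ufffd"))  # 文档乱码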

View File

@@ -0,0 +1,24 @@
{
"注音符号东亚": [
"\u3100,\u312F"
],
"拉丁文补充1": [
"\u00C0,\u00D6",
"\u00D8,\u00F6",
"\u00F8,\u00FF"
],
"拉丁文扩展,A": [
"\u0100,\u017F"
],
"拉丁文扩展,B": [
"\u0180,\u024F"
],
"私人使用区域": [
"\uE000,\uF8FF",
"\\U000f0000,\\U000ffffd",
"\\U00100000,\\U0010fffd"
],
"占位符": [
"\uFFFD,\uFFFD"
]
}

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='HtmlTagCleaner',
module_path="ops.mapper.html_tag_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: 'HTML标签去除'
name_en: 'HTML Tag Removal'
description: '移除文档中HTML标签,如 <html>、<div>、<p> 等。'
description_en: 'Removes HTML tags from documents, such as <html>, <div>, and <p>.'
language: 'python'
vendor: 'huawei'
raw_id: 'HtmlTagCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '<p><b>机器学习</b>是<a href="/wiki/%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD" title="人工智能">人工智能</a>的一个分支。</p>'
after: '机器学习是人工智能的一个分支。'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,80 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: HTML tag removal operator
Create: 2025/01/13
"""
import re
import time
from typing import List, Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class HtmlTagCleaner(Mapper):
"""Remove HTML tags such as <html>, <div>, and <p> from documents; XML documents are left untouched"""
tag_list = [
'<a>', '<abbr>', '<acronym>', '<address>', '<applet>', '<area>', '<article>', '<aside>',
'<audio>', '<b>', '<base>', '<basefont>', '<bdi>', '<bdo>', '<bgsound>', '<big>', '<blink>',
'<blockquote>', '<body>', '<br>', '<button>', '<canvas>', '<caption>', '<center>', '<cite>',
'<code>', '<col>', '<colgroup>', '<command>', '<content>', '<data>', '<datalist>', '<dd>',
'<del>', '<details>', '<dfn>', '<dialog>', '<dir>', '<div>', '<dl>', '<dt>', '<em>',
'<embed>', '<fieldset>', '<figcaption>', '<figure>', '<font>', '<footer>', '<form>', '<frame>',
'<frameset>', '<h1>', '<h2>', '<h3>', '<h4>', '<h5>', '<h6>', '<head>', '<header>', '<hgroup>',
'<hr>', '<html>', '<i>', '<iframe>', '<image>', '<img>', '<input>', '<ins>', '<isindex>',
'<kbd>', '<keygen>', '<label>', '<legend>', '<li>', '<link>', '<listing>', '<main>', '<map>',
'<mark>', '<marquee>', '<menu>', '<menuitem>', '<meta>', '<meter>', '<nav>', '<nobr>', '<noembed>',
'<noframes>', '<noscript>', '<object>', '<ol>', '<optgroup>', '<option>', '<output>', '<p>',
'<param>', '<picture>', '<plaintext>', '<pre>', '<progress>', '<q>', '<rp>', '<rt>', '<rtc>',
'<ruby>', '<s>', '<samp>', '<script>', '<section>', '<select>', '<shadow>', '<small>',
'<source>', '<spacer>', '<span>', '<strike>', '<strong>', '<style>', '<sub>', '<summary>',
'<sup>', '<template>', '<textarea>', '<tfoot>', '<thead>', '<time>', '<title>', '<track>', '<tt>', '<u>',
'<ul>', '<var>', '<video>', '<wbr>', '<xmp>'
]
preserved_attr_list = ['colspan', 'rowspan'] # tag attributes that should be preserved
@staticmethod
def _remove_specified_tags(input_data: str, specified_tags: List):
"""Remove the specified HTML tags together with their attributes"""
html_tag_pattern = '|'.join(
map(lambda tag: rf'{re.escape(tag[:-1])}(\s[^>]*)?>|</{re.escape(tag[1:-1])}>', specified_tags))
cleaned_text = re.sub(html_tag_pattern, '', input_data, flags=re.IGNORECASE)
return cleaned_text
@staticmethod
def _remove_tag_attributes(input_data: str, preserved_attrs: List):
"""Strip attributes from HTML tags while keeping the specified ones"""
tag_pattern = r'<(\w+)(\s+[^<>]*?)?>'
attr_pattern = r'\s*(\w+)="([^"]+)"'
def __remove_unwanted_attrs(m):
def __remove_attrs(x):
if x.group(1) in preserved_attrs:
return x.group(0)
else:
return ''
return re.sub(attr_pattern, __remove_attrs, m.group(0))
cleaned_text = re.sub(tag_pattern, __remove_unwanted_attrs, input_data)
return cleaned_text
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
if sample[self.filetype_key] != "xml":
sample[self.text_key] = self._remove_html_tags(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: HtmlTagCleaner costs {time.time() - start:6f} s")
else:
logger.info(f"fileName: {sample[self.filename_key]}, method: HtmlTagCleaner, The file is xml!")
return sample
def _remove_html_tags(self, input_data: str):
# remove common HTML tags and their attributes (excluding <table>, <tbody>, <tr>, <td>, <th>)
cleaned_text = self._remove_specified_tags(input_data, self.tag_list)
# strip attributes inside the remaining tags (except colspan and rowspan), e.g. <td class="td8" rowspan="3"> -> <td rowspan="3">
cleaned_text = self._remove_tag_attributes(cleaned_text, self.preserved_attr_list)
return cleaned_text
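A short sketch of the tag-stripping behaviour, using the same pattern construction as _remove_specified_tags on the sample from the operator spec:

import re

tags = ['<p>', '<b>', '<a>']
pattern = '|'.join(rf'{re.escape(t[:-1])}(\s[^>]*)?>|</{re.escape(t[1:-1])}>' for t in tags)
html = '<p><b>机器学习</b>是<a href="/wiki/x" title="人工智能">人工智能</a>的一个分支。</p>'
print(re.sub(pattern, '', html, flags=re.IGNORECASE))  # 机器学习是人工智能的一个分支。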

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='AnonymizedIdNumber',
module_path="ops.mapper.id_number_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '身份证号匿名化'
name_en: 'ID Card Number Anonymization'
description: '身份证号匿名化。'
description_en: 'Anonymizes ID card numbers.'
language: 'python'
vendor: 'huawei'
raw_id: 'AnonymizedIdNumber'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '这个是身份证号110101190001011009'
after: '这个是身份证号<id>'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,116 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: ID card number anonymization operator
Create: 2024/12/5 15:43
"""
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, Any
from loguru import logger
import pytz
from datamate.core.base_op import Mapper
class AnonymizedIdNumber(Mapper):
def __init__(self, *args, **kwargs):
super(AnonymizedIdNumber, self).__init__(*args, **kwargs)
self.id_number_re_compile = self.get_id_number_re_compile()
self.id_coefficient = (7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2)
self.id_verification = ("1", "0", "X", "9", "8", "7", "6", "5", "4", "3", "2")
self.area_code_enum = self.load_code_list()
@staticmethod
def get_id_number_re_compile():
"""Build the compiled regex for Chinese ID card numbers"""
# A Chinese ID number has 18 characters: digits 1-2 province, 3-4 city, 5-6 county, 7-14 birth date, and the last character is a check digit; each part is strictly constrained
id_card_pattern = r'(?<=[^0-9])' \
r'((1[1-5]|2[1-3]|3[1-7]|4[1-6]|5[0-4]|6[1-5]|71|81|82)' \
r'(0[0-9]|1[0-9]|2[0-9]|3[0-4]|4[0-3]|5[1-3]|90)' \
r'(0[0-9]|1[0-9]|2[0-9]|3[0-9]|4[0-3]|5[1-7]|6[1-4]|7[1-4]|8[1-7])' \
r'(18|19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])' \
r'\d{3}[0-9xX])' \
r'(?=[^0-9xX])'
return re.compile(id_card_pattern)
@staticmethod
def load_code_list():
"""Load the administrative area code table"""
area_code_enum_path = str(Path(__file__).parent / 'resources' / 'area_code_enum.txt')
with open(area_code_enum_path, 'r', encoding='utf-8') as f:
area_code_list = set(f.read().splitlines())
return area_code_list
@staticmethod
def _verify_birthday_code(birthday_code: str):
"""Check whether the 8-digit birth date code is valid"""
year = int(birthday_code[:4])
month = int(birthday_code[4:6])
day = int(birthday_code[6:8])
date_string = "{}-{}-{}".format(year, month, day)
date_format = "%Y-%m-%d"
try:
# parse the date string into a datetime
date = datetime.strptime(date_string, date_format)
# localize to the Asia/Shanghai time zone
china_tz = pytz.timezone("Asia/Shanghai")
china_date = china_tz.localize(date)
# get the current time
current_date = datetime.now(china_tz)
# a birth date later than the current time is invalid
return china_date <= current_date
except ValueError:
return False
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._id_number_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: IDNumberCleaner costs {time.time() - start:6f} s")
return sample
def _verify_area_code(self, area_code: str):
"""Check whether the 6-digit area code is valid"""
return area_code in self.area_code_enum
def _verify_verification_code(self, id_number: str):
"""Verify the check digit of an ID card number"""
verify_num = id_number[-1]
# multiply the first 17 digits by the coefficients in self.id_coefficient and sum the products
id_sum = sum([int(num) * coe for num, coe in zip(id_number[:-1], self.id_coefficient)])
# the sum modulo 11 indexes self.id_verification, which must match the last character of the ID number
return verify_num.upper() == self.id_verification[id_sum % 11].upper()
def _verify_id_number(self, id_number: str):
"""Main validity check for an ID card number"""
return self._verify_verification_code(id_number) and \
self._verify_birthday_code(id_number[6:14]) and \
self._verify_area_code(id_number[:6])
def _verify_similar_id_number(self, id_number: str):
"""Loosely match ID-card-like strings without strict validity checks"""
if len(id_number) != 18:
return False
if not id_number[:17].isdigit():
return False
last_char = id_number[-1].upper()
return last_char in set('0123456789X')
def _id_number_filter(self, input_data: str):
"""Anonymize ID card numbers"""
input_data = ''.join(['\n', input_data, '\n'])  # newline sentinels (assumed padding) so the boundary lookarounds match; stripped by the final [1:-1]
# extract substrings that match the ID card regex
id_nums = [item.group(1) for item in self.id_number_re_compile.finditer(input_data)]
# check whether each extracted string is a genuine ID number
for id_num in id_nums:
if self._verify_id_number(id_num) or self._verify_similar_id_number(id_num):
# replace valid ID numbers with <id>
id_num_pattern = r"(?<=[^0-9]){}(?=[^0-9xX])".format(id_num)
input_data = re.compile(id_num_pattern).sub("<id>", input_data)
return input_data[1:-1]
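For reference, a standalone sketch of the check-digit computation used by _verify_verification_code, applied to the sample number from the operator spec (110101190001011009):

coefficients = (7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2)
check_chars = ("1", "0", "X", "9", "8", "7", "6", "5", "4", "3", "2")
id_number = "110101190001011009"
total = sum(int(d) * c for d, c in zip(id_number[:17], coefficients))
print(check_chars[total % 11] == id_number[-1].upper())  # True, so the operator would replace it with <id>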

File diff suppressed because it is too large.

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgDenoise',
module_path="ops.mapper.img_denoise.process")

View File

@@ -0,0 +1,17 @@
name: '图片噪点去除'
name_en: 'Image Noise Removal'
description: '去除图片中的噪点,主要适用于自然场景。'
description_en: 'Removes noises from images, which is mainly applicable to natural
scenery image scenarios.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgDenoise'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,60 @@
# -- encoding: utf-8 --
"""
Description: image noise removal operator
Create: 2025/01/17
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgDenoise(Mapper):
def __init__(self, *args, **kwargs):
super(ImgDenoise, self).__init__(*args, **kwargs)
self._denoise_threshold = kwargs.get("denoise_threshold", 8)
@staticmethod
def _denoise_image(data: object):
"""Apply median-blur denoising"""
return cv2.medianBlur(data, 3)
def execute(self, sample: Dict[str, Any]):
start = time.time()
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
if img_bytes:
data = bytes_transform.bytes_to_numpy(img_bytes)
denoise_images = self._denoise_images_filter(data, file_name)
sample[self.data_key] = bytes_transform.numpy_to_bytes(denoise_images, file_type)
logger.info(f"fileName: {file_name}, method: ImgDenoise costs {time.time() - start:6f} s")
return sample
def _denoise_images_filter(self, ori_img, file_name):
# get a denoised copy of the original image
clean_data = self._denoise_image(ori_img)
# resize both images to the same size so they can be compared
ori = cv2.resize(ori_img, (112, 112))
dst = cv2.resize(clean_data, (112, 112))
# signal power of the original image (cast to float to avoid uint8 overflow)
signal = np.sum(ori.astype(np.float64) ** 2)
# noise power: squared difference between the original and the denoised image
noise = np.sum((ori.astype(np.float64) - dst.astype(np.float64)) ** 2)
# signal-to-noise ratio (SNR) of the image
snr = 10 * np.log10(signal / noise)
# images whose SNR falls below the threshold are replaced with the denoised copy
if snr < self._denoise_threshold:
logger.info(f"The image SNR is {snr}, which is below the threshold of "
f"{self._denoise_threshold}. {file_name} is denoised.")
return clean_data
return ori_img
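The filtering criterion above is the usual SNR in decibels, 10·log10(signal/noise); a tiny hedged sketch with synthetic data:

import numpy as np

ori = np.full((112, 112), 120.0)               # hypothetical flat image
dst = ori + np.random.normal(0, 5, ori.shape)  # denoised copy that differs slightly
snr = 10 * np.log10(np.sum(ori ** 2) / np.sum((ori - dst) ** 2))
print(f"snr = {snr:.1f} dB")  # around 27-28 dB here; values below the threshold of 8 would trigger denoising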

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgDirectionCorrect',
module_path="ops.mapper.img_direction_correct.process")

View File

@@ -0,0 +1,38 @@
# -- encoding: utf-8 --
import gc
import os
from pathlib import Path
from argparse import Namespace
class BaseModel:
def __init__(self, model_type='vertical'):
models_path = os.getenv("MODELS_PATH", "/home/models")
self.resources_path = str(Path(models_path, 'img_direction_correct', 'resources'))
args = Namespace()
args.cls_image_shape = '3, 224, 224'
args.cls_batch_num = 6
args.cls_thresh = 0.9
args.use_onnx = False
args.use_gpu = False
args.use_npu = False
args.use_xpu = False
args.enable_mkldnn = False
if model_type == 'vertical':
args.cls_model_dir = str(Path(self.resources_path, 'vertical_model'))
self.model_name = 'vertical model to detect image 0 or 90 rotated'
args.label_list = ['0', '90']
else:
args.cls_model_dir = str(Path(self.resources_path, 'standard_model'))
self.model_name = 'standard model to detect image 0 or 180 rotated'
args.label_list = ['0', '180']
from paddleocr.tools.infer.predict_cls import TextClassifier
self.infer = TextClassifier(args)
def __del__(self):
del self.infer
gc.collect()

View File

@@ -0,0 +1,17 @@
name: '图片方向校正'
name_en: 'Image Orientation Correction'
description: '将含有文字的图片校正到文字水平方向,主要适用于文档场景。'
description_en: 'Corrects images to ensure text is presented horizontally, which is
mainly applicable to document scenarios.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgDirectionCorrect'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,139 @@
# -- encoding: utf-8 --
"""
Description: image orientation correction operator
Create: 2024/1/30 9:26
"""
import math
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
from .base_model import BaseModel
class ImgDirectionCorrect(Mapper):
def __init__(self, *args, **kwargs):
super(ImgDirectionCorrect, self).__init__(*args, **kwargs)
self.img_resize = 1000
self.limit_size = 30000
self.use_model = True
self.vertical_model, self.standard_model = self.get_model(*args, **kwargs)
@staticmethod
def _detect_angle(img):
"""Detect the skew angle of the image"""
# convert to a single grayscale channel, e.g. [[255 255],[255 255]]
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# invert black and white
gray = cv2.bitwise_not(gray)
# binarize
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# find the rows and columns of all non-zero pixels
ys, xs = np.where(thresh > 0)
# stack them into coordinates of the non-zero pixels, e.g. [[306 37][306 38][307 38]]
coords = np.column_stack([xs, ys])
# minimum-area bounding rectangle: returns (center, (width, height), angle)
rect = cv2.minAreaRect(coords)
# minAreaRect returns an angle in [0, 90], measured against the nearest axis; there is room for optimization here
# an angle below 45 degrees needs less padding, which helps recognition accuracy
angle = rect[-1] # the last element is the angle
# at most 45 degrees: return the angle as-is (counter-clockwise rotation)
if angle <= 45.0:
return angle
# above 45 degrees: rotate clockwise, i.e. return angle - 90
return angle - 90
@staticmethod
def _detect_direction(image, file_name, model):
"""
Args:
image: the image to classify
file_name: file name
model: the classifier to use, vertical_model or standard_model
Returns: the rotated image
"""
# cls_res is the model prediction, formatted like [('90', 0.9815167)]
_, cls_res, _ = model.infer([image])
rotate_angle = int(cls_res[0][0])
pro = float(cls_res[0][1])
logger.info(
f"fileName: {file_name}, model {model.model_name} detect result is {rotate_angle} with confidence {pro}")
if rotate_angle == 90 and pro > 0.89:
return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
if rotate_angle == 180 and pro > 0.89:
return cv2.rotate(image, cv2.ROTATE_180)
return image
@staticmethod
def _rotate_bound(image, angle):
"""Rotate the image to compensate for the detected skew angle
Args:
image: the image to process
angle: the skew angle detected by _detect_angle
"""
if angle == 0.0:
return image
# get height and width
h, w = image.shape[:2]
sinval = math.fabs(math.sin(angle))
cosval = math.fabs(math.cos(angle))
dx = max(int((w * cosval + h * sinval - w) / 2), 0)
dy = max(int((w * sinval + h * cosval - h) / 2), 0)
dst_img = cv2.copyMakeBorder(image, dy, dy, dx, dx, cv2.BORDER_CONSTANT, value=(255, 255, 255))
h, w = dst_img.shape[:2]
rotated_matrix = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1.0)
dst_img = cv2.warpAffine(dst_img, rotated_matrix, (w, h), borderValue=(255, 255, 255))
return dst_img
def init_model(self, *args, **kwargs):
return BaseModel(model_type='vertical'), BaseModel(model_type='standard')
def execute(self, sample: Dict[str, Any]):
start = time.time()
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
img_bytes = sample[self.data_key]
if img_bytes:
data = bytes_transform.bytes_to_numpy(img_bytes)
correct_data = self._img_direction_correct(data, file_name, self.vertical_model, self.standard_model)
sample[self.data_key] = bytes_transform.numpy_to_bytes(correct_data, file_type)
logger.info(f"fileName: {file_name}, method: ImgDirectionCorrect costs {time.time() - start:6f} s")
return sample
def _img_direction_correct(self, img, file_name, vertical_model, standard_model):
height, width = img.shape[:2]
if max(height, width) > self.limit_size:
logger.info(
f"fileName: {file_name}, method: ImgDirectionCorrect cannot process images whose longest side exceeds 30000 pixels")
return img
detect_angle_img = self._resize(img)
# detect the skew angle
angle = self._detect_angle(detect_angle_img)
# deskew the image so its remaining rotation is one of 0, 90, 180 or 270 degrees
rotated_img = self._rotate_bound(img, angle)
# vertical/horizontal classification: a binary model detects 0 vs 90 degrees, leaving a 0/180 image
rotated_img = self._detect_direction(rotated_img, file_name, vertical_model)
# 0/180 classification: a binary model detects 0 vs 180 degrees and turns the image upright
rotated_img = self._detect_direction(rotated_img, file_name, standard_model)
return rotated_img
def _resize(self, image):
height, width = image.shape[:2] # height and width of the original image
temp = max(height, width)
# if the longest side exceeds the limit, shrink the image; otherwise return it unchanged
if temp >= self.img_resize:
mul_temp = temp / self.img_resize
if height > width:
return cv2.resize(image, (int(width / mul_temp), self.img_resize), interpolation=cv2.INTER_AREA)
elif height < width:
return cv2.resize(image, (self.img_resize, int(height / mul_temp)), interpolation=cv2.INTER_AREA)
else:
return cv2.resize(image, (self.img_resize, self.img_resize), interpolation=cv2.INTER_AREA)
return image

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgBrightness',
module_path="ops.mapper.img_enhanced_brightness.process")

View File

@@ -0,0 +1,16 @@
name: '图片亮度增强'
name_en: 'Image Brightness Enhancement'
description: '自适应调节图片的亮度。'
description_en: 'Adapts and adjusts image brightness.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgBrightness'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,100 @@
# -- encoding: utf-8 --
"""
Description: image brightness enhancement operator.
Create: 2025/01/13
"""
import time
from typing import Dict, Any
import numpy as np
import cv2
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgBrightness(Mapper):
"""Adaptive image brightness enhancement"""
def __init__(self, *args, **kwargs):
super(ImgBrightness, self).__init__(*args, **kwargs)
# adaptive enhancement parameters
self.factor_threshold = 1.1 # lower bound of the enhancement factor (not exposed as a parameter)
self.standard_mean = 140 # target mean brightness after enhancement (not exposed as a parameter)
self.gamma = 1.5 # gamma coefficient for gamma correction; values above 1 brighten the image, values below 1 darken it (not exposed as a parameter)
self.brightness_upper_bound = 0.35 # upper bound for non-linear brightening: above this ratio, linear brightening is used instead (not exposed as a parameter)
self.eps = 1 # small epsilon that prevents division by zero on all-black images when computing the brightness factor (not exposed as a parameter)
@staticmethod
def _get_grey_mean(src: np.ndarray):
gray_image = cv2.cvtColor(src, cv2.COLOR_BGR2GRAY)
return np.mean(gray_image)
@staticmethod
def _return_gamma_table(gamma):
"""Return the lookup table for gamma correction"""
scale = np.power(255, 1 - gamma).astype(np.float64)
return np.power(np.arange(256), gamma) * scale
@staticmethod
def _return_linear_table(factor):
"""Return the lookup table for the linear transform"""
linear_table = np.arange(256) * factor
return np.clip(linear_table, 0, 255).astype(np.uint8)
def enhance_brightness_linear(self, image_data: np.ndarray, file_name):
average_brightness = self._get_grey_mean(image_data)
brightness_factor = self.standard_mean / (average_brightness + self.eps)
# the image is already bright enough; no enhancement needed
if brightness_factor <= 1:
logger.info(f"fileName: {file_name}, method: ImgBrightness not need enhancement")
return image_data
brightness_factor = max(brightness_factor, self.factor_threshold)
linear_table = ImgBrightness._return_linear_table(brightness_factor)
cv2.LUT(image_data, linear_table, dst=image_data)
return image_data
def enhance_brightness(self, image_data: np.ndarray, file_name):
'''
Adaptive brightness enhancement.
Args:
image_data: image as an np.ndarray
(the gamma factor, commonly 1.5 in practice, is stored as a member variable rather than passed as an argument)
Returns:
the brightness-enhanced image
'''
# compute the average brightness of the image
average_brightness = self._get_grey_mean(image_data)
# apply gamma correction when the image is dark enough
if average_brightness / 255 <= self.brightness_upper_bound:
# precompute the lookup table
gamma_table = ImgBrightness._return_gamma_table(1 / self.gamma).astype(np.uint8)
cv2.LUT(image_data, gamma_table, dst=image_data)
# otherwise the brightness exceeds the bound for non-linear adjustment, so use linear adjustment
else:
image_data = self.enhance_brightness_linear(image_data, file_name)
return image_data
def execute(self, sample: Dict[str, Any]):
start = time.time()
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
if img_bytes:
# enhance the image
img_data = bytes_transform.bytes_to_numpy(img_bytes)
img_data = self.enhance_brightness(img_data, file_name)
sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
logger.info(f"fileName: {file_name}, method: ImgBrightness costs {time.time() - start:6f} s")
return sample
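A brief sketch of the gamma lookup table used above, output = input^g · 255^(1 - g); with g = 1/1.5 dark pixels are lifted much more than bright ones:

import numpy as np

g = 1 / 1.5
gamma_table = (np.power(np.arange(256), g) * np.power(255, 1 - g)).astype(np.uint8)
print(gamma_table[[0, 64, 128, 255]])  # dark inputs are raised the most, while 255 maps to 255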

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgContrast',
module_path="ops.mapper.img_enhanced_contrast.process")

View File

@@ -0,0 +1,16 @@
name: '图片对比度增强'
name_en: 'Image Contrast Enhancement'
description: '自适应调节图片的对比度。'
description_en: 'Adapts and adjusts the image contrast.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgContrast'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,71 @@
# -- encoding: utf-8 --
"""
Description: adaptive image contrast enhancement
Version:
Create: 2025/01/13
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgContrast(Mapper):
"""Adaptive image contrast enhancement"""
def __init__(self, *args, **kwargs):
super(ImgContrast, self).__init__(*args, **kwargs)
# adaptive enhancement parameters
self.clip_limit = 2 # contrast limiting threshold; larger values give a stronger enhancement (not exposed as a parameter)
self.tile_grid = 16 # grid size for tiling the image; smaller grids give a more local equalization (not exposed as a parameter)
self.standard_mean = 100 # target mean contrast after enhancement (not exposed as a parameter)
self.eps = 0.5 # small epsilon that prevents division by zero on all-black images when computing the contrast factor (not exposed as a parameter)
@staticmethod
def _get_contrast(image: np.ndarray):
"""Compute the mean standard deviation over all channels"""
_, stddev = cv2.meanStdDev(image)
contrast_std = np.mean(stddev)
return contrast_std
def enhance_contrast(self, image_data: np.ndarray, file_name):
"""Adaptive contrast enhancement"""
contrast_std = self._get_contrast(image_data)
contrast_factor = self.standard_mean / (contrast_std + self.eps)
# contrast is already high enough; no enhancement needed
if contrast_factor <= 1:
logger.info(f"fileName: {file_name}, method: ImgContrast not need enhancement")
return image_data
# convert the colour image to the Lab colour space
cv2.cvtColor(image_data, cv2.COLOR_BGR2Lab, dst=image_data)
# adjust contrast with CLAHE (contrast limited adaptive histogram equalization)
clahe = cv2.createCLAHE(clipLimit=self.clip_limit, tileGridSize=(self.tile_grid, self.tile_grid))
image_data[:, :, 0] = clahe.apply(image_data[:, :, 0])
# convert the enhanced Lab image back to BGR
cv2.cvtColor(image_data, cv2.COLOR_Lab2BGR, dst=image_data)
return image_data
def execute(self, sample: Dict[str, Any]):
start = time.time()
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
if img_bytes:
# enhance the image
img_data = bytes_transform.bytes_to_numpy(img_bytes)
img_data = self.enhance_contrast(img_data, file_name)
sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
logger.info(f"fileName: {file_name}, method: ImgContrast costs {time.time() - start:6f} s")
return sample

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgSaturation',
module_path="ops.mapper.img_enhanced_saturation.process")

View File

@@ -0,0 +1,17 @@
name: '图片饱和度增强'
name_en: 'Image Saturation Enhancement'
description: '自适应调节图片的饱和度,主要适用于自然场景图片。'
description_en: 'Adapts and adjusts the saturation of images, which is mainly applicable
to natural scenery image scenarios.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgSaturation'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,81 @@
# -- encoding: utf-8 --
"""
Description: adaptive image saturation enhancement
Version:
Create: 2025/01/13
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgSaturation(Mapper):
"""Adaptive image saturation enhancement"""
def __init__(self, *args, **kwargs):
super(ImgSaturation, self).__init__(*args, **kwargs)
# adaptive enhancement parameters
self.factor_threshold = 1.1 # lower bound of the enhancement factor (not exposed as a parameter)
self.standard_mean = 130 # target mean saturation after enhancement (not exposed as a parameter)
self.eps = 1 # small epsilon that prevents division by zero on all-black images when computing the saturation factor (not exposed as a parameter)
self.zeros_ratio_threshold = 0.1 # share of zero values allowed in the saturation channel; guards against processing near-grayscale images
self.red_channel_threshold = 140 # red channel threshold used to cap the saturation factor
def enhance_saturation(self, image_data: np.ndarray, file_name):
"""Adaptive saturation enhancement"""
# convert the image to the HSV colour space
image_hsv = cv2.cvtColor(image_data, cv2.COLOR_BGR2HSV)
s_channel = image_hsv[:, :, 1].copy()
del image_hsv
# extract the saturation channel
# for a normal RGB image the share of zero-valued saturation pixels should be below 0.1; above that the image is effectively grayscale
zero_s_ratio = np.count_nonzero(s_channel == 0) / s_channel.size
if zero_s_ratio <= self.zeros_ratio_threshold:
saturation_channel = s_channel
# an RGB image converted from grayscale has an all-zero S channel in HSV
else:
return image_data
# saturation statistics
saturation_mean = np.mean(saturation_channel)
saturation_factor = self.standard_mean / (saturation_mean + self.eps)
# saturation is already high enough; no enhancement needed
if saturation_factor <= 1:
logger.info(f"fileName: {file_name}, method: ImgSaturation not need enhancement")
return image_data
# compute the mean of the red channel; if it is too high, cap the saturation factor, otherwise the image turns reddish and colours distort
red_channel_mean = np.mean(image_data[:, :, 2])
if red_channel_mean >= self.red_channel_threshold:
saturation_factor = min(saturation_factor, 1.5)
else:
saturation_factor = max(saturation_factor, self.factor_threshold)
degrade_image = cv2.cvtColor(image_data, cv2.COLOR_BGR2GRAY)
degrade_image = cv2.cvtColor(degrade_image, cv2.COLOR_GRAY2BGR)
cv2.addWeighted(image_data, saturation_factor, degrade_image, 1 - saturation_factor, 0, dst=image_data)
return image_data
def execute(self, sample: Dict[str, Any]):
start = time.time()
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
if img_bytes:
# enhance the image
img_data = bytes_transform.bytes_to_numpy(img_bytes)
img_data = self.enhance_saturation(img_data, file_name)
sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
logger.info(f"fileName: {file_name}, method: ImgSaturation costs {time.time() - start:6f} s")
return sample

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgSharpness',
module_path="ops.mapper.img_enhanced_sharpness.process")

View File

@@ -0,0 +1,17 @@
name: '图片锐度增强'
name_en: 'Image Sharpness Enhancement'
description: '自适应调节图片的锐度,主要适用于自然场景图片。'
description_en: 'Adapts and adjusts the image sharpness, which is mainly applicable
to natural scenery image scenarios.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgSharpness'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,69 @@
# -- encoding: utf-8 --
"""
Description: adaptive image sharpness enhancement
Version:
Create: 2025/01/13
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgSharpness(Mapper):
"""Adaptive image sharpness enhancement"""
def __init__(self, *args, **kwargs):
super(ImgSharpness, self).__init__(*args, **kwargs)
# adaptive enhancement parameters
self.factor_threshold = 1.1 # lower bound of the enhancement factor (not exposed as a parameter)
self.standard_mean = 100 # target mean sharpness after enhancement (not exposed as a parameter)
self.kernel = self._init_kernel()
self.eps = 1 # small epsilon that prevents division by zero on all-black images when computing the sharpness factor (not exposed as a parameter)
@classmethod
def _init_kernel(cls):
kernel = np.array([[1, 1, 1],
[1, 5, 1],
[1, 1, 1]])
# normalize the convolution kernel
kernel = kernel / np.sum(kernel)
return kernel
def enhance_sharpness(self, image_data: np.ndarray, file_name):
"""Adaptive sharpness enhancement"""
# convert the image to grayscale and measure sharpness via the Laplacian
image_gray = cv2.cvtColor(image_data, cv2.COLOR_BGR2GRAY)
sharpness = np.abs(cv2.Laplacian(image_gray, cv2.CV_8U)).mean()
sharpness_factor = self.standard_mean / (sharpness + self.eps)
# the image is already sharp enough; no enhancement needed
if sharpness_factor <= 1:
logger.info(f"fileName: {file_name}, method: ImgSharpness not need enhancement")
return image_data
filtered_img = cv2.filter2D(image_data, -1, self.kernel)
cv2.addWeighted(image_data, sharpness_factor, filtered_img, 1.0 - sharpness_factor, 0, dst=image_data)
return image_data
def execute(self, sample: Dict[str, Any]):
start = time.time()
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
if img_bytes:
# enhance the image
img_data = bytes_transform.bytes_to_numpy(img_bytes)
img_data = self.enhance_sharpness(img_data, file_name)
sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
logger.info(f"fileName: {file_name}, method: ImgSharpness costs {time.time() - start:6f} s")
return sample

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgPerspectiveTransformation',
module_path="ops.mapper.img_perspective_transformation.process")

View File

@@ -0,0 +1,17 @@
name: '图片透视变换'
name_en: 'Image Perspective Transformation'
description: '自适应校正图片的视角,主要适用于文档校正场景。'
description_en: 'Adapts and corrects image perspectives, which is mainly applicable
to document correction scenarios.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgPerspectiveTransformation'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,147 @@
# -- encoding: utf-8 --
"""
Description: image perspective transformation operator
Create: 2025/01/16
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgPerspectiveTransformation(Mapper):
"""Image perspective transformation operator"""
def __init__(self, *args, **kwargs):
super(ImgPerspectiveTransformation, self).__init__(*args, **kwargs)
self.transform_utils = PerspectiveTransformationUtils()
def execute(self, sample: Dict[str, Any]):
start = time.time()
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
if img_bytes:
img_data = bytes_transform.bytes_to_numpy(img_bytes)
transform_img = self._transform_img(img_data, file_name)
sample[self.data_key] = bytes_transform.numpy_to_bytes(transform_img, file_type)
logger.info(f"fileName: {file_name}, method: ImgPerspectiveTransformation costs {time.time() - start:6f} s")
return sample
def _transform_img(self, image, file_name):
original_img = image
ratio = 900 / image.shape[0]
# resize to a fixed height
img_resize = self.transform_utils.resize_img(image)
# edge detection
binary_img = self.transform_utils.get_canny(img_resize)
# find the largest contour
max_contour, max_area = self.transform_utils.find_max_contour(binary_img)
if not max_contour.size:
return original_img
# fit a polygon to the convex hull and take its four corners
boxes = self.transform_utils.get_box_point(max_contour)
if len(boxes) == 4:
boxes = self.transform_utils.get_adapt_point(boxes, ratio)
boxes = self.transform_utils.order_points(boxes)
warped = self.transform_utils.get_warp_image(image, boxes)
logger.info(f"fileName: {file_name}, method: ImgPerspectiveTransformation. "
"This picture is transformed by perspective.")
return warped
return original_img
class PerspectiveTransformationUtils:
"""Utility class for image perspective transformation"""
@staticmethod
def resize_img(image, height=900):
"""Resize the image to a fixed height"""
h, w = image.shape[:2]
pro = height / h
size = (int(w * pro), int(height))
img_resize = cv2.resize(image, size)
return img_resize
@staticmethod
def get_canny(image):
"""Edge detection"""
# Gaussian blur
binary = cv2.GaussianBlur(image, (3, 3), 2, 2)
# Canny edge detection
binary = cv2.Canny(binary, 60, 240, apertureSize=3)
# dilate to close gaps in the edges
kernel = np.ones((3, 3), np.uint8)
binary = cv2.dilate(binary, kernel, iterations=1)
return binary
@staticmethod
def find_max_contour(image):
"""Find the contour with the largest area"""
# find external contours
contours, _ = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
# keep the contour with the largest area
max_area = 0.0
max_contour = np.array([])
for contour in contours:
current_area = cv2.contourArea(contour)
if current_area > max_area:
max_area = current_area
max_contour = contour
return max_contour, max_area
@staticmethod
def get_box_point(contour):
"""Approximate the convex hull of the contour with a polygon and return its vertices"""
# convex hull of the contour
hull = cv2.convexHull(contour)
epsilon = 0.02 * cv2.arcLength(contour, True)
approx = cv2.approxPolyDP(hull, epsilon, True)
approx = approx.reshape((len(approx), 2))
return approx
@staticmethod
def get_adapt_point(box, pro):
"""Scale the quadrilateral points back to the original image size"""
box_pro = box
if pro != 1.0:
box_pro = box / pro
box_pro = np.trunc(box_pro)
return box_pro
@staticmethod
def order_points(pts):
"""Order the quadrilateral vertices as [top-left, top-right, bottom-right, bottom-left]"""
rect = np.zeros((4, 2), dtype="float32")
s = pts.sum(axis=1)
rect[0] = pts[np.argmin(s)]
rect[2] = pts[np.argmax(s)]
diff = np.diff(pts, axis=1)
rect[1] = pts[np.argmin(diff)]
rect[3] = pts[np.argmax(diff)]
return np.intp(rect)
@staticmethod
def compute_point_distance(a, b):
"""Euclidean distance between two points (used as width/height)"""
return int(np.sqrt(np.sum(np.square(a - b))))
def get_warp_image(self, image, box):
"""Apply the perspective transform"""
w, h = self.compute_point_distance(box[0], box[1]), \
self.compute_point_distance(box[1], box[2])
dst_rect = np.array([[0, 0],
[w - 1, 0],
[w - 1, h - 1],
[0, h - 1]], dtype='float32')
box = np.array(box, dtype='float32')
matrix = cv2.getPerspectiveTransform(box, dst_rect)
warped = cv2.warpPerspective(image, matrix, (w, h))
return warped
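A condensed sketch of the final warp step (order the corners, then map them onto an axis-aligned rectangle with getPerspectiveTransform); the corner coordinates are made up:

import cv2
import numpy as np

box = np.array([[10, 8], [410, 20], [400, 300], [5, 290]], dtype='float32')  # tl, tr, br, bl (hypothetical)
w = int(np.linalg.norm(box[0] - box[1]))
h = int(np.linalg.norm(box[1] - box[2]))
dst = np.array([[0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1]], dtype='float32')
matrix = cv2.getPerspectiveTransform(box, dst)
image = np.zeros((320, 420, 3), dtype=np.uint8)  # stand-in for the input image
warped = cv2.warpPerspective(image, matrix, (w, h))
print(warped.shape)  # (h, w, 3)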

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgResize',
module_path="ops.mapper.img_resize.process")

View File

@@ -0,0 +1,35 @@
name: '图片重采样'
name_en: 'Image Resampling'
description: '将图片放大或缩小到指定像素。'
description_en: 'Zooms in or out images to specified pixels.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgResize'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'
settings:
targetSize:
name: 重采样尺寸
type: multiple
properties:
- type: inputNumber
name: 宽度
description: 像素
defaultVal: 256
min: 1
max: 4096
step: 1
- type: inputNumber
name: 高度
description: 像素
defaultVal: 256
min: 1
max: 4096
step: 1

View File

@@ -0,0 +1,40 @@
# -- encoding: utf-8 --
"""
Description: image resampling operator
Create: 2025/01/16
"""
import time
from typing import List, Dict, Any
from loguru import logger
import cv2
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgResize(Mapper):
def __init__(self, *args, **kwargs):
super(ImgResize, self).__init__(*args, **kwargs)
self._target_size = kwargs.get("targetSize", [256, 256])
@classmethod
def _img_resize(cls, data: List[float], target_size: List[int]) -> List[float]:
"""Resize the image to the specified target size"""
target_width = max(min(target_size[0], 4096), 1)
target_height = max(min(target_size[1], 4096), 1)
resized_img = cv2.resize(data, (target_width, target_height), interpolation=cv2.INTER_AREA)
return resized_img
def execute(self, sample: Dict[str, Any]):
start = time.time()
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
img_bytes = sample[self.data_key]
if img_bytes:
data = bytes_transform.bytes_to_numpy(img_bytes)
resized_img = self._img_resize(data, self._target_size)
sample[self.data_key] = bytes_transform.numpy_to_bytes(resized_img, file_type)
logger.info(f"fileName: {file_name}, method: ImgResize costs {time.time() - start:6f} s")
return sample

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgShadowRemove',
module_path="ops.mapper.img_shadow_remove.process")

View File

@@ -0,0 +1,17 @@
name: '图片阴影去除'
name_en: 'Image Shadow Removal'
description: '去除图片中的阴影,主要适用于文档场景。'
description_en: 'Removes shadows from images, which is mainly applicable to document
scenarios.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgShadowRemove'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,72 @@
# -- encoding: utf-8 --
"""
Description: image shadow removal operator
Create: 2025/01/16
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgShadowRemove(Mapper):
"""Image shadow removal"""
def __init__(self, *args, **kwargs):
super(ImgShadowRemove, self).__init__(*args, **kwargs)
self.iter_nums = 9 # number of closing iterations (not exposed as a parameter)
self.k_size = 3 # kernel size
self.clip_limit = 2 # contrast limiting threshold; larger values give a stronger effect
self.tile_grid = 8 # grid size for tiling the image; smaller values give a more local effect
def shadow_removed(self, image_data: np.ndarray):
'''
Remove shadows from the image.
Args:
image_data: image as an np.ndarray
Returns:
the image with shadows removed
'''
# build the kernel and apply a morphological closing
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (self.k_size, self.k_size))
closing = cv2.morphologyEx(image_data, cv2.MORPH_CLOSE, kernel, iterations=self.iter_nums)
# compute ~(closing - original)
cv2.bitwise_not(closing - image_data, dst=closing)
cv2.cvtColor(closing, cv2.COLOR_BGR2Lab, dst=closing)
# take the lightness channel of the processed image
img_l = cv2.split(closing)[0]
del closing
# after adjusting img_l, it will replace the lightness channel of the original image
cv2.cvtColor(image_data, cv2.COLOR_BGR2Lab, dst=image_data)
# create the CLAHE object
clahe = cv2.createCLAHE(clipLimit=self.clip_limit, tileGridSize=(self.tile_grid, self.tile_grid))
# apply CLAHE to the lightness channel
image_data[:, :, 0] = clahe.apply(img_l)
del img_l
cv2.cvtColor(image_data, cv2.COLOR_Lab2BGR, dst=image_data)
return image_data
def execute(self, sample: Dict[str, Any]):
start = time.time()
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
if img_bytes:
# 进行阴影去除
img_data = bytes_transform.bytes_to_numpy(img_bytes)
img_data = self.shadow_removed(img_data)
sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
logger.info(f"fileName: {file_name}, method: ImageShadowRemove costs {time.time() - start:6f} s")
return sample
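
The same pipeline, sketched standalone to make the three steps visible: a morphological closing estimates the shadow layer, bitwise_not(closing - image) flattens the illumination, and CLAHE on the Lab lightness channel restores local contrast. The input path is hypothetical:

import cv2

img = cv2.imread("scanned_page.jpg")            # hypothetical input path
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
closing = cv2.morphologyEx(img, cv2.MORPH_CLOSE, kernel, iterations=9)
flat = cv2.bitwise_not(closing - img)           # shadow-flattened image
img_l = cv2.split(cv2.cvtColor(flat, cv2.COLOR_BGR2Lab))[0]
lab = cv2.cvtColor(img, cv2.COLOR_BGR2Lab)
clahe = cv2.createCLAHE(clipLimit=2, tileGridSize=(8, 8))
lab[:, :, 0] = clahe.apply(img_l)               # replace the lightness channel
cv2.imwrite("scanned_page_no_shadow.jpg", cv2.cvtColor(lab, cv2.COLOR_Lab2BGR))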

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgTypeUnify',
module_path="ops.mapper.img_type_unify.process")

View File

@@ -0,0 +1,30 @@
name: '图片格式转换'
name_en: 'Image Format Conversion'
description: '将图片编码格式统一为jpg、jpeg、png、bmp格式'
description_en: 'Converts image formats to JPG, JPEG, PNG, or BMP.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgTypeUnify'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'
settings:
imgType:
name: 图片编码格式
type: select
defaultVal: jpg
options:
- label: jpg
value: jpg
- label: png
value: png
- label: jpeg
value: jpeg
- label: bmp
value: bmp

View File

@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
"""
Description: 图片格式转换插件
Create: 2025/01/16
"""
import re
import time
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgTypeUnify(Mapper):
def __init__(self, *args, **kwargs):
super(ImgTypeUnify, self).__init__(*args, **kwargs)
"""勾选图片编码格式统一,未输入参数时,默认设置为jpg格式"""
self._setting_type = kwargs.get("imgType", "jpg")
def execute(self, sample):
start = time.time()
file_name = sample[self.filename_key]
origin_file_type = sample[self.filetype_key]
if origin_file_type == self._setting_type:
# 原文件格式与目标文件编码格式一致,无需处理
return sample
file_path = sample[self.filepath_key]
# 读取图片
img_bytes = sample[self.data_key]
if img_bytes:
origin_data = bytes_transform.bytes_to_numpy(img_bytes)
# 按指定编码格式转字节
sample[self.data_key] = bytes_transform.numpy_to_bytes(origin_data, "." + self._setting_type)
            # 修改meta数据:将文件名和路径中原有的扩展名替换为目标格式
            sample[self.filetype_key] = self._setting_type
            sample[self.filename_key] = re.sub(origin_file_type + "$", self._setting_type, file_name)
            sample[self.filepath_key] = re.sub(origin_file_type + "$", self._setting_type, file_path)
logger.info(f"fileName: {file_name}, method: ImgTypeUnify costs {time.time() - start:6f} s")
return sample
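
A standalone re-encoding sketch, assuming bytes_transform wraps cv2.imdecode/imencode; the file paths are hypothetical:

import cv2
import numpy as np

with open("photo.png", "rb") as f:                         # hypothetical input
    img = cv2.imdecode(np.frombuffer(f.read(), np.uint8), cv2.IMREAD_COLOR)
ok, encoded = cv2.imencode(".jpg", img)                    # target format given as extension
if ok:
    with open("photo.jpg", "wb") as f:                     # hypothetical output
        f.write(encoded.tobytes())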

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgWatermarkRemove',
module_path="ops.mapper.img_watermark_remove.process")

View File

@@ -0,0 +1,26 @@
name: '图片水印去除'
name_en: 'Image Watermark Removal'
description: '去除图片中的“知乎”和“抖音”水印。'
description_en: 'Removes the 知乎 and 抖音 watermarks from images.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgWatermarkRemove'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'
settings:
watermarkStr:
name: 需要去除的水印文字信息
type: checkbox
defaultVal: '知乎,抖音'
options:
- label: 知乎
value: 知乎
- label: 抖音
value: 抖音

View File

@@ -0,0 +1,160 @@
# -*- coding: utf-8 -*-
"""
Description: 图片水印去除插件
Create: 2025/01/06
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_to_numpy
from datamate.common.utils import numpy_to_bytes
from datamate.core.base_op import Mapper
from .watermark_ocr_model import WatermarkOcrModel
DEFAULT_MAX_CHARACTERS = 10
DEFAULT_BINARY_THRESHOLD_LOW = 200
class ImgWatermarkRemove(Mapper):
use_model = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.remove_str = kwargs.get("watermarkStr", "知乎,抖音")
self.ocr_model = self.get_model(*args, **kwargs)
@staticmethod
def _has_kw(result_list, kw_list):
"""
图片是否包含目标水印,返回匹配到的文字列表
"""
result_str_list = []
for line in result_list:
for kw in kw_list:
if kw in line[1][0]:
result_str_list.append(line[1][0])
break
return result_str_list
@staticmethod
def _overlay_mask(background_img, prospect_img, img_over_x, img_over_y):
back_r, back_c, _ = background_img.shape # 背景图像行数、列数
is_x_direction_failed = img_over_x > back_c or img_over_x < 0
is_y_direction_failed = img_over_y > back_r or img_over_y < 0
if is_x_direction_failed or is_y_direction_failed:
# 前景图不在背景图范围内, 直接返回原图
return background_img
pro_r, pro_c, _ = prospect_img.shape # 前景图像行数、列数
if img_over_x + pro_c > back_c: # 如果水平方向展示不全
pro_c = back_c - img_over_x # 截取前景图的列数
prospect_img = prospect_img[:, 0:pro_c, :] # 截取前景图
if img_over_y + pro_r > back_r: # 如果垂直方向展示不全
pro_r = back_r - img_over_y # 截取前景图的行数
prospect_img = prospect_img[0:pro_r, :, :] # 截取前景图
prospect_img = cv2.cvtColor(prospect_img, cv2.COLOR_BGR2BGRA) # 前景图转为4通道图像
prospect_tmp = np.zeros((back_r, back_c, 4), np.uint8) # 与背景图像等大的临时前景图层
# 前景图像放到前景图层里
prospect_tmp[img_over_y:img_over_y + pro_r, img_over_x: img_over_x + pro_c, :] = prospect_img
_, binary = cv2.threshold(prospect_img, 254, 255, cv2.THRESH_BINARY) # 前景图阈值处理
prospect_mask = np.zeros((pro_r, pro_c, 1), np.uint8) # 单通道前景图像掩模
prospect_mask[:, :, 0] = binary[:, :, 3] # 不透明像素的值作为掩模的值
mask = np.zeros((back_r, back_c, 1), np.uint8)
mask[img_over_y:img_over_y + prospect_mask.shape[0],
img_over_x: img_over_x + prospect_mask.shape[1]] = prospect_mask
mask_not = cv2.bitwise_not(mask)
prospect_tmp = cv2.bitwise_and(prospect_tmp, prospect_tmp, mask=mask)
background_img = cv2.bitwise_and(background_img, background_img, mask=mask_not)
prospect_tmp = cv2.cvtColor(prospect_tmp, cv2.COLOR_BGRA2BGR) # 前景图层转为三通道图像
return prospect_tmp + background_img # 前景图层与背景图像相加合并
def execute(self, sample: Dict[str, Any]):
start = time.time()
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
img_bytes = sample[self.data_key]
if img_bytes:
data = bytes_to_numpy(img_bytes)
correct_data = self._watermark_remove(data, file_name, self.ocr_model)
sample[self.data_key] = numpy_to_bytes(correct_data, file_type)
logger.info(f"fileName: {file_name}, method: ImgWatermarkRemove costs {time.time() - start:6f} s")
return sample
def delete_watermark(self, result_list, kw_list, data):
"""
将符合目标的水印,模糊化处理
"""
# 获取所有符合目标的文本框位置
text_axes_list = []
for line in result_list:
for kw in kw_list:
if kw in line[1][0]:
min_width = int(min(line[0][0][0], line[0][3][0]))
max_width = int(max(line[0][1][0], line[0][2][0]))
min_hight = int(min(line[0][0][1], line[0][1][1]))
max_hight = int(max(line[0][2][1], line[0][3][1]))
text_axes_list.append([min_width, min_hight, max_width, max_hight])
break
# 去除水印
delt = DEFAULT_MAX_CHARACTERS # 文本框范围扩大
img = data
for text_axes in text_axes_list:
hight, width = img.shape[0:2]
# 截取图片
min_width = text_axes[0] - delt if text_axes[0] - delt >= 0 else 0
min_hight = text_axes[1] - delt if text_axes[1] - delt >= 0 else 0
max_width = text_axes[2] + delt if text_axes[2] + delt <= width else width
max_hight = text_axes[3] + delt if text_axes[3] + delt <= hight else hight
cropped = img[min_hight:max_hight, min_width:max_width] # 裁剪坐标为[y0:y1, x0:x1]
# 图片二值化处理,把[200,200,200]-[250,250,250]以外的颜色变成0
start_rgb = DEFAULT_BINARY_THRESHOLD_LOW
thresh = cv2.inRange(cropped, np.array([start_rgb, start_rgb, start_rgb]), np.array([250, 250, 250]))
# 创建形状和尺寸的结构元素
            kernel = np.ones((3, 3), np.uint8)  # 设置卷积核3*3全是1;将当前的数组作为图像类型来进行各种操作,就要转换到uint8类型
# 扩展待修复区域
hi_mask = cv2.dilate(thresh, kernel, iterations=10) # 膨胀操作,白色区域增大,iterations迭代次数
specular = cv2.inpaint(cropped, hi_mask, 5, flags=cv2.INPAINT_TELEA)
# imgSY:输入8位1通道或3通道图像。
# hi_mask:修复掩码,8位1通道图像。非零像素表示需要修复的区域。
# specular:输出与imgSY具有相同大小和类型的图像。
# 5:算法考虑的每个点的圆形邻域的半径。
            # flags:INPAINT_NS基于Navier-Stokes的方法、Alexandru Telea的INPAINT_TELEA方法
result = self._overlay_mask(img, specular, min_width, min_hight)
img = result
return img
def init_model(self, *args, **kwargs):
return WatermarkOcrModel(*args, **kwargs).ocr_model
def _watermark_remove(self, data, file_name, model):
"""
去除水印的方法
"""
remove_str = self.remove_str
# 勾选去水印的信息为空,则直接返回原图
if remove_str == "":
return data
        # 勾选项可能以半角或全角逗号连接,统一后再切分
        kw_list = remove_str.replace(',', ',').split(',')
# 加载模型
ocr_model = model
try:
result = ocr_model.ocr(data, cls=True)
except RuntimeError as e:
logger.error(f"fileName: {file_name}, method: ocr predict error {e}")
return data
if result and result[0]:
logger.info(f"fileName: {file_name}, method: ocrModel detect watermark info {str(result)}")
return self.delete_watermark(result[0], kw_list, data)
else:
logger.info(f"fileName: {file_name}, method: ImgWatermarkRemove not need remove target ocr")
return data
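
The indexing in _has_kw and delete_watermark assumes the classic PaddleOCR result layout: one list per image, and each detected line given as [four corner points, (text, confidence)]. A hypothetical result illustrates the shape being relied on:

result = [[
    [[[10, 10], [120, 10], [120, 40], [10, 40]], ("知乎", 0.98)],
    [[[10, 60], [200, 60], [200, 90], [10, 90]], ("正文内容", 0.95)],
]]
for line in result[0]:
    box, (text, confidence) = line
    print(text, confidence, box[0])   # line[1][0] is the text, line[0] the box corners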

View File

@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
import gc
import os
from pathlib import Path
class WatermarkOcrModel:
def __init__(self, *args, **kwargs):
models_path = os.getenv("MODELS_PATH", "/home/models")
self.resources_path = str(Path(models_path, 'img_watermark_remove', 'resources'))
self.det_model_dir = str(Path(self.resources_path, 'ch_PP-OCRv4_det_infer'))
self.rec_model_dir = str(Path(self.resources_path, 'ch_PP-OCRv4_rec_infer'))
self.cls_model_dir = str(Path(self.resources_path, 'ch_ppocr_mobild_v2_cls_infer'))
from paddleocr import PaddleOCR
self.ocr_model = PaddleOCR(det_model_dir=self.det_model_dir, cls_model_dir=self.cls_model_dir,
rec_model_dir=self.rec_model_dir,
use_angle_cls=True,
lang='ch')
def __del__(self):
del self.ocr_model
gc.collect()

View File

@@ -0,0 +1,7 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='InvisibleCharactersCleaner',
module_path="ops.mapper.invisible_characters_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '不可见字符去除'
name_en: 'Invisible Character Removal'
description: '去除文档中的不可见字符,例如 0-31 号字符中的部分字符。'
description_en: 'Removes invisible characters from documents, for example, some of the characters numbered 0 to 31.'
language: 'python'
vendor: 'huawei'
raw_id: 'InvisibleCharactersCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: "对“材料”怎样下\x04定义才臻于 严格和科学?"
after: '对“材料”怎样下定义才臻于严格和科学?'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,30 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 不可见字符去除
Create: 2025/01/13
"""
import re
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class InvisibleCharactersCleaner(Mapper):
@staticmethod
def _invisible_characters_filter(input_data: str):
        # 移除ASCII中不可见字符,包括0-7、14-19、21-31、127-160的字符
        invisible_char_pattern = '[\x00-\x07\x0E-\x13\x15-\x1F\x7F-\xA0]'
invisible_chars_re = re.compile(invisible_char_pattern)
return invisible_chars_re.sub('', input_data)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._invisible_characters_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: InvisibleCharactersCleaner costs {time.time() - start:6f} s")
return sample
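
A quick standalone check of the character class, using a variant of the example from the operator metadata:

import re

invisible_chars_re = re.compile('[\x00-\x07\x0E-\x13\x15-\x1F\x7F-\xA0]')
before = "对“材料”怎样下\x04定义才臻于严格和科学?"
print(invisible_chars_re.sub('', before))   # -> 对“材料”怎样下定义才臻于严格和科学?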

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='AnonymizedIpAddress',
module_path="ops.mapper.ip_address_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: 'IP地址匿名化'
name_en: 'IP Address Anonymization'
description: 'IP地址匿名化'
description_en: 'Anonymizes IP addresses.'
language: 'python'
vendor: 'huawei'
raw_id: 'AnonymizedIpAddress'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '这个是IP地址:10.x.x.10'
after: '这个是IP地址:<ip>'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,74 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: IP地址匿名化插件
Create: 2024/12/26 15:43
"""
import ipaddress
import re
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class AnonymizedIpAddress(Mapper):
def __init__(self, *args, **kwargs):
# IP地址校验
# X.X.X.X与四级目录格式相同,避免误清洗,该格式的IP地址必须匹配 IP/IP地址等字样
super().__init__(*args, **kwargs)
self.ipv4_1_and_prefix_pattern = r'ip(地址| address|v4)?( |:|:)*(?<![\.\d])'
# X.X.X.X
self.ipv4_pattern = r'(?<![\.\d])\d\.\d\.\d\.\d(?![\.\d])'
self.ipv4_re_compile = re.compile(r"(?<![\d.])(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?![.\d])")
self.ipv6_re_compile = re.compile(r"(?<![0-9a-fA-F:])(([0-9a-fA-F]{0,4}:)+[0-9a-fA-F]{0,4})(?![0-9a-fA-F:])")
@staticmethod
def verify_ip_address(ip):
"""验证字符串是否为合法ip地址"""
try:
ipaddress.ip_address(ip)
except ValueError:
return False
return True
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._ip_address_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: IPAddressCleaner costs {time.time() - start:6f} s")
return sample
def filter_ipv4(self, ipv4, line):
"""ipv4地址匿名化"""
if not self.verify_ip_address(ipv4):
return line
ipv4_format = ipv4.replace(".", "\\.")
# 非单字节ip地址直接匿名化
if not re.search(self.ipv4_pattern, "" + ipv4 + ""):
line = re.compile(r"(?<![\d.])" + ipv4_format + r"(?![.\d])").sub("<ip>", line)
elif re.search(self.ipv4_1_and_prefix_pattern + ipv4_format + r"(?![.\d])", line, re.IGNORECASE):
# 单字节ip地址需搜索关键字眼,有关键字眼则段落中单字节ip地址匿名化
line = re.compile(self.ipv4_pattern).sub("<ip>", line)
return line
def _ip_address_filter(self, input_data: str):
""" IPv4、IPv6地址匿名化"""
lines = input_data.split("\n")
line_list = []
for line in lines:
            # 为防止IP地址处于段落开头或结尾不能被匹配,需要在字符串首尾加空格占位符
            line = ''.join([' ', line, ' '])
ipv4_groups = self.ipv4_re_compile.findall(line)
for ipv4 in ipv4_groups:
line = self.filter_ipv4(ipv4, line)
ipv6_groups = self.ipv6_re_compile.findall(line)
for group in ipv6_groups:
ipv6 = group[0]
if ipv6 and self.verify_ip_address(ipv6):
line = re.compile(r"(?<![0-9a-fA-F:])" + ipv6 + "(?![0-9a-fA-F:])").sub("<ip>", line)
line_list.append(line[1:-1])
text = "\n".join([line.strip() for line in line_list])
return text
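
A standalone check of the IPv4 branch: every dotted quad is validated with ipaddress and masked, while the single-digit-octet form (which also looks like a section number) is skipped here for brevity; the full operator only masks that form when an "IP"/"ip地址" keyword appears nearby:

import ipaddress
import re

ipv4_re = re.compile(r"(?<![\d.])(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?![.\d])")
single_octet_re = re.compile(r"^\d\.\d\.\d\.\d$")
line = " 这个是IP地址:10.21.34.10,目录编号1.2.3.4保持不变 "
for ip in ipv4_re.findall(line):
    try:
        ipaddress.ip_address(ip)
    except ValueError:
        continue
    if not single_octet_re.match(ip):
        line = line.replace(ip, "<ip>")
print(line.strip())   # -> 这个是IP地址:<ip>,目录编号1.2.3.4保持不变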

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='KnowledgeRelationSlice',
module_path="ops.mapper.knowledge_relation_slice.process")

View File

@@ -0,0 +1,108 @@
#!/usr/bin/python3.9
# -*- coding: utf-8 -*-
import math
from multiprocessing import Pool, cpu_count
from six import iteritems
from six.moves import range
from loguru import logger
PARAM_K1 = 1.5
PARAM_B = 0.75
EPSILON = 0.25
def effective_n_jobs(n_jobs):
if n_jobs == 0:
raise ValueError('n_jobs == 0 in Parallel has no meaning')
elif n_jobs is None:
return 1
elif n_jobs < 0:
n_jobs = max(cpu_count() + 1 + n_jobs, 1)
return n_jobs
class SimilarityAlgBM25(object):
def __init__(self, corpus_docs):
self.corpus_files_size = 0
self.avg_dl = 0
self.doc_file_freqs = []
self.idf_dict = {}
self.doc_len = []
self._initialize(corpus_docs)
def get_sim_score(self, document, index):
score = 0
doc_freqs = self.doc_file_freqs[index]
for word in document:
if word not in doc_freqs:
continue
try:
score += (self.idf_dict[word] * doc_freqs[word] * (PARAM_K1 + 1)
/ (doc_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avg_dl)))
            except KeyError:
                logger.warning('key not found in idf dict: {}', word)
return score
def get_sim_scores(self, document):
scores = []
for index in range(self.corpus_files_size):
cur_score = self.get_sim_score(document, index)
scores.append(cur_score)
return scores
def get_scores_bow(self, document):
scores = []
for index in range(self.corpus_files_size):
score = self.get_sim_score(document, index)
if score > 0:
scores.append((index, score))
return scores
def _initialize(self, corpus_files):
"""
Calculates frequencies of terms in documents and in corpus_files.
Also computes inverse document frequencies.
"""
nd = {} # word -> number of documents with word
num_doc = 0
for document_file in corpus_files:
self.corpus_files_size += 1
self.doc_len.append(len(document_file))
num_doc += len(document_file)
frequencies_dict = {}
for word in document_file:
if word not in frequencies_dict:
frequencies_dict[word] = 0
frequencies_dict[word] += 1
self.doc_file_freqs.append(frequencies_dict)
for word, _ in iteritems(frequencies_dict):
if word not in nd:
nd[word] = 0
nd[word] += 1
self.avg_dl = float(num_doc) / self.corpus_files_size
# collect idf sum to calculate an average idf for epsilon value
idf_sum = 0
negative_idfs_list = []
for word, freq in iteritems(nd):
idf = math.log(self.corpus_files_size - freq + 0.5) - math.log(freq + 0.5)
self.idf_dict[word] = idf
idf_sum += idf
if idf < 0:
negative_idfs_list.append(word)
self.average_idf = float(idf_sum) / len(self.idf_dict)
eps = EPSILON * self.average_idf
for word in negative_idfs_list:
self.idf_dict[word] = eps
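
The scorer implements standard Okapi BM25 weighting: score(q, d) = sum over query terms w of idf(w) * f(w, d) * (k1 + 1) / (f(w, d) + k1 * (1 - b + b * |d| / avgdl)), with k1 = 1.5, b = 0.75, and negative idf values floored at EPSILON times the average idf. A tiny end-to-end check; the import path is hypothetical:

import jieba
from graph_sim_func import SimilarityAlgBM25   # hypothetical import path

corpus = ["机器学习是人工智能的一个分支", "今天的天气很好", "深度学习属于机器学习的方法"]
bm25 = SimilarityAlgBM25([jieba.lcut(doc) for doc in corpus])
scores = bm25.get_sim_scores(jieba.lcut("什么是机器学习"))
best = max(range(len(scores)), key=scores.__getitem__)
print(best, corpus[best])   # expected to point at a 机器学习 paragraph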

View File

@@ -0,0 +1,184 @@
#!/usr/bin/python3.9
# -*- coding: utf-8 -*-
__all__ = ['build_llm_prompt', 'get_json_list']
import math
import jieba
from loguru import logger
from . import graph_sim_func as bm25
from .knowledge_slice import TextSegmentationOperator
def build_llm_prompt(text):
#
prompt = """
===
<Role>:
你是一位问答对QA智能撰写专家,你擅长根据给定的内容给出准确、完整、详细的多个问答对。
===
<Instructions>:
- 你需要根据已知信息(context),准确、详细的生成多个QA对。
- 生成的问答对中答案少于10个中文字符时,放弃该问答对。
- 确保所有问答对的答案都是已知信息的一部分,且可以组成已知信息,确保没有信息遗漏。
- 仅根据已知信息生成问答对,答案要详细,且不能创造臆想已知信息中没有的内容。
- 确保生成的多个QA对之间不要进行排序,Q:或A:前后不要出现数字序号。
- Q:使用疑问句方式,问号结尾;A:使用陈述句方式,句号结尾,确保回答完整。
- 输出格式如下:
Q:......
A:......
===
<task>
满足上述条件的情况下,现根据context:'''{}'''
生成的多个QA问答对为:
"""
return prompt.format(text)
class KnowledgeSlice:
# edatamate切片算法插件
def __init__(self, file_text, chunk_size=500, overlap_size=100):
self.file_text = file_text
self.slice_op = TextSegmentationOperator(chunk_size, overlap_size)
def execute(self):
try:
chunks = self.slice_op.process(self.file_text)
except Exception as err:
logger.exception(f"split text failed, error is: {err}")
chunks = []
return chunks
class BM25Model:
def __init__(self, data_list):
self.data_list = data_list
self.corpus = self.load_corpus()
def bm25_similarity(self, query, num_best=1):
query = jieba.lcut(query)
bm = bm25.SimilarityAlgBM25(self.corpus)
scores = bm.get_sim_scores(query)
id_score = [(i, score) for i, score in enumerate(scores)]
id_score.sort(key=lambda e: e[1], reverse=True)
return id_score[0: num_best]
def load_corpus(self):
corpus = [jieba.lcut(data) for data in self.data_list]
return corpus
class KnowledgeGraph:
    # class for document segmentation and creating relations between knowledge slices
def __init__(self, corpus_file_string, chunk_size=500, overlap_size=100, kg_relation=True):
self.corpus_file_string = corpus_file_string
self.chunk_size = chunk_size
self.overlap_size = overlap_size
self.kg_relation = kg_relation
self.slicing_corpus = []
self.knowledge_slice = KnowledgeSlice(self.corpus_file_string, self.chunk_size, self.overlap_size)
@staticmethod
def update_gallery_list(gallery_list, iterated_dict):
        # get the gallery entries that are not yet in iterated_dict
gallery_list_update = []
gallery_list_index = []
for i, _ in enumerate(gallery_list):
if i not in iterated_dict:
gallery_list_update.append(gallery_list[i])
gallery_list_index.append(i)
return gallery_list_update, gallery_list_index
def document_slicing(self):
json_list = []
all_slices_info = self.knowledge_slice.execute()
for _, item in enumerate(all_slices_info):
json_list.append({
"slice_data": item
})
self.slicing_corpus = json_list
def build_knowledge_relation(self, slicing_corpus_list):
# knowledge relation for each paragraph
if not self.kg_relation:
return slicing_corpus_list
iterated_dict = {}
kr_result_json_list = []
gallery_list = []
kr_relation_list = []
if len(slicing_corpus_list) < 3:
return slicing_corpus_list
for _, item in enumerate(slicing_corpus_list):
gallery_list.append(item['slice_data'])
for k, item in enumerate(slicing_corpus_list):
if k not in iterated_dict:
iterated_dict[k] = 1
cur_gallery_list, cur_gallery_src_index = self.update_gallery_list(gallery_list, iterated_dict)
if len(cur_gallery_list) < 1:
kr_result_json_list.append({
"slice_data": item['slice_data']
})
return kr_result_json_list
bm25_class = BM25Model(cur_gallery_list)
id_scores = bm25_class.bm25_similarity(item['slice_data'], 1)
kr_result_doc = item['slice_data'] + cur_gallery_list[id_scores[0][0]]
kr_result_json_list.append({
"slice_data": kr_result_doc
})
if cur_gallery_src_index[id_scores[0][0]] not in iterated_dict:
iterated_dict[cur_gallery_src_index[id_scores[0][0]]] = 1
else:
continue
return kr_result_json_list
def build_graph_efficiently(self, search_space_size=50):
        # build knowledge relation in an efficient way
knowledge_total_num = len(self.slicing_corpus)
knowledge_chunk_num = math.ceil(knowledge_total_num / search_space_size)
knowledge_relation_result = []
for i in range(0, knowledge_chunk_num):
cur_max_index = (i + 1) * search_space_size
if cur_max_index > knowledge_total_num:
corpus_list = self.slicing_corpus[i * search_space_size:]
else:
corpus_list = self.slicing_corpus[i * search_space_size:cur_max_index]
# to do knowledge relation
cur_knowledge_relation_result = self.build_knowledge_relation(corpus_list)
knowledge_relation_result.extend(cur_knowledge_relation_result)
return knowledge_relation_result
def knowledge_corpus_list_json(self):
        # process the corpus and return structured information as a json list
self.document_slicing()
kr_result_list_json = self.build_graph_efficiently()
return kr_result_list_json
def get_json_list(txt_string, chunk_size=500, overlap_size=100, kg_relation=True):
if len(txt_string) > 0:
kg_extract = KnowledgeGraph(txt_string, chunk_size, overlap_size, kg_relation)
kr_result_json_list = kg_extract.knowledge_corpus_list_json()
else:
kr_result_json_list = []
return kr_result_json_list
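
A minimal driver for the slicing-plus-relation pipeline; the import path and sample text are hypothetical:

from knowledge_relation import build_llm_prompt, get_json_list   # hypothetical import path

text = "数据工程是指对数据进行采集、清洗、切片和标注的过程。" * 100
slices = get_json_list(text, chunk_size=500, overlap_size=100, kg_relation=True)
print(len(slices))                                      # number of slice_data entries
print(build_llm_prompt(slices[0]["slice_data"])[:80])   # prompt built from the first slice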

View File

@@ -0,0 +1,23 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from typing import List
from loguru import logger
from datamate.common.utils.text_splitter import TextSplitter
class TextSegmentationOperator:
def __init__(self, chunk_size, chunk_overlap):
try:
self.text_splitter = TextSplitter(-1, chunk_size, chunk_overlap)
except Exception as err:
logger.exception(f"init text splitter failed, error is: {err}")
raise err
def process(self, input_data: str) -> List[str]:
if input_data.strip() == "":
logger.info("input text is empty, return empty chunks.")
return []
return self.text_splitter.split_text(input_data)

View File

@@ -0,0 +1,16 @@
name: '知识库关系切片'
name_en: 'Knowledge Base Relationship Slicing'
description: '知识库关系切片'
description_en: 'Knowledge base relationship slicing.'
language: 'python'
vendor: 'huawei'
raw_id: 'KnowledgeRelationSlice'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
"""
Description: 知识库关系切片插件
Create: 2023/11/7 9:26
"""
import json
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
from .knowledge_relation import get_json_list
# 切片长度
CHUNK_SIZE = 500
# 相邻切片重合长度
OVERLAP_SIZE = 100
class KnowledgeRelationSlice(Mapper):
def __init__(self, *args, **kwargs):
super(KnowledgeRelationSlice, self).__init__(*args, **kwargs)
        self.chunk_size = kwargs.get("chunk_size", CHUNK_SIZE)
        self.overlap_size = kwargs.get("overlap_size", OVERLAP_SIZE)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start_time = time.time()
chunk_item = get_json_list(sample[self.text_key], chunk_size=self.chunk_size, overlap_size=self.overlap_size)
chunk_item_json = json.dumps(chunk_item, ensure_ascii=False)
sample[self.text_key] = chunk_item_json
cost_time = time.time() - start_time
logger.info(f'Generate knowledgeRelation slice num: {len(chunk_item)}, Cost time: {cost_time} s')
return sample

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='LegendCleaner',
module_path="ops.mapper.legend_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '图注表注去除'
name_en: 'Figure and Table Description Removal'
description: '去除文档中的图注、表注等内容。'
description_en: 'Removes figure and table description from documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'LegendCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '图1.1.1 图注名称'
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,41 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 图注表注去除
Create: 2024/12/5 15:43
"""
import re
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class LegendCleaner(Mapper):
@staticmethod
def _get_legend_re_compile():
chinese_legend_prefix = r"(图|表|图片|表格)"
chinese_legend_number = r"(\d+((\.|-)\d+)*|[a-zA-Z]{1,2}((\.|-)\d+)*)"
chinese_legend_pattern = r"(?<=\n)" + chinese_legend_prefix + "( )*" + chinese_legend_number + " +.*\n"
english_legend_pattern = r"(Figure|Table|Fig\.?)"
english_legend_number = r"(S?\d+((\.|-)\d+)*|[a-zA-Z]{1,2}\d?((\.|-)\d+)*)"
english_legend_pattern = (r"(?<=\n)" + english_legend_pattern + "( )*"
+ english_legend_number + r"(\.|:)? +.*\n")
legend_re_compile = re.compile('|'.join([chinese_legend_pattern, english_legend_pattern]), re.IGNORECASE)
return legend_re_compile
    @classmethod
    def _clean_legend(cls, input_data: str):
"""移除文档中图注表注等"""
input_data = ''.join(['\n', input_data, '\n'])
text = cls._get_legend_re_compile().sub("", input_data)
return text[1:-1]
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
        sample[self.text_key] = self._clean_legend(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: LegendCleaner costs {time.time() - start:6f} s")
return sample
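
A quick standalone check of the Chinese caption pattern against the example in the operator metadata:

import re

prefix = r"(图|表|图片|表格)"
number = r"(\d+((\.|-)\d+)*|[a-zA-Z]{1,2}((\.|-)\d+)*)"
pattern = re.compile(r"(?<=\n)" + prefix + "( )*" + number + " +.*\n", re.IGNORECASE)
text = "\n图1.1.1 图注名称\n正文内容\n"
print(pattern.sub("", text)[1:-1])   # -> 正文内容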

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='AnonymizedPhoneNumber',
module_path="ops.mapper.phone_number_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '电话号码匿名化'
name_en: 'Phone Number Anonymization'
description: '电话号码匿名化'
description_en: 'Anonymizes phone numbers.'
language: 'python'
vendor: 'huawei'
raw_id: 'AnonymizedPhoneNumber'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '这个是电话号码:13111111111'
after: '这个是电话号码:<tel>'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,51 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 电话号码匿名化
Create: 2024/12/26 15:43
"""
import re
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class AnonymizedPhoneNumber(Mapper):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.phone_re_compile = self.get_phone_re_compile()
@staticmethod
def get_phone_re_compile():
"""按照格式粗略匹配电话号码,支持以下格式电话号码
前缀:(0086)、(86)、(0086)、(86) 、无
电话号码:第一位1,第二位3-9,后续数字可以为0-9,数字按照3-4-4进行间隔,间隔符为空格、-、无
固定电话号码:0AX-CXXX-XXXX、0BXX-CXXX-XXXX、0BXX-CXX-XXXX A为1-2、B为3-9、C为2-8、X为0-9
约束:电话号码前后皆为非数字
"""
number_prefix = r'([\((]?\+?(00)?86[)\)]?[- ]?)?'
cellphone_pattern = r"1[3-9]\d[- ]?\d{4}[- ]?\d{4}"
landline_pattern = (r'[((]?(0?[12]\d)[))]?[ -]?[2-8]\d{3}[ -]?\d{4}'
r'|[((]?(0?[3-9]\d{2})[))]?[ -]?[2-8]\d{2}\d?[ -]?\d{4}')
phone_numbers_pattern = rf'(?<=[^\d]){number_prefix}({cellphone_pattern}|{landline_pattern})(?=[^\d])'
phone_re_compile = re.compile(phone_numbers_pattern)
return phone_re_compile
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._phone_number_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: PhoneNumberCleaner costs {time.time() - start:6f} s")
return sample
def _phone_number_filter(self, input_data: str):
""" 电话号码匿名化"""
# 正则匹配:电话号码前需匹配不是数字的字符串
        # 为避免处于文章开头和结尾的电话号码不可被识别,需要在输入字符串的前后手动加上空格占位符
        input_data = ''.join([' ', input_data, ' '])
input_data = self.phone_re_compile.sub("<tel>", input_data)
return input_data[1:-1]
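
A standalone check of the cellphone branch; the pattern requires a non-digit on both sides, so the text is padded with spaces first, mirroring the operator:

import re

cellphone = r"1[3-9]\d[- ]?\d{4}[- ]?\d{4}"
pattern = re.compile(rf"(?<=[^\d])({cellphone})(?=[^\d])")
text = " 这个是电话号码:13111111111 "
print(pattern.sub("<tel>", text).strip())   # -> 这个是电话号码:<tel>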

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='PoliticalWordCleaner',
module_path="ops.mapper.political_word_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '政治文本匿名化'
name_en: 'Political Text Anonymization'
description: '将政治文本进行匿名化。'
description_en: 'Anonymizes political texts.'
language: 'python'
vendor: 'huawei'
raw_id: 'PoliticalWordCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '特别字符:改革历程'
after: '特别字符:***'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,67 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 政治文本过滤
Create: 2024/12/26 15:43
"""
import time
from pathlib import Path
from typing import Dict, Any
from loguru import logger
from datamate.common.utils.aho_corasick import AhoCorasic
from datamate.core.base_op import Mapper
class PoliticalWordCleaner(Mapper):
"""外部输入的政治文本过滤插件"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
root_path = Path(__file__).parent / 'resources'
political_file_path = str(root_path / 'political.txt')
special_symbols_path = str(root_path / 'special_symbols.txt')
self.special_symbols = self.load_words_list(special_symbols_path)
self.political_words = self.load_words_list(political_file_path)
self.ac_automaton = AhoCorasic(self.political_words)
@staticmethod
def load_words_list(path):
"""词表加载"""
with open(path, 'r', encoding='utf-8') as f:
words = set(f.read().splitlines())
return words
@staticmethod
def words_replace(target_strings: list, text: str):
"""
目标字符串替换。
Args:
            target_strings: 待替换的目标字符串列表。
text: 待清洗文本。
returns:
清洗后文本。
"""
target_strings.sort(key=lambda x: -len(x))
for s in target_strings:
tmp_text = text.replace(s, '*' * len(s))
text = tmp_text
return text
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._political_word_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: PoliticalWordCleaner costs {time.time() - start:6f} s")
return sample
def _political_word_filter(self, text):
"""词语过滤主函数,分行过滤"""
filtered_rows = []
for row in text.split('\n'):
matched_words = self.ac_automaton.search(row, self.special_symbols)
filtered_rows.append(self.words_replace(matched_words, row))
return '\n'.join(filtered_rows)

View File

@@ -0,0 +1,321 @@
习近平
平近习
xjp
习太子
习明泽
老习
温家宝
温加宝
温x
温jia宝
温宝宝
温加饱
温加保
张培莉
温云松
温如春
温jb
胡温
胡x
胡jt
胡boss
胡总
胡王八
hujintao
胡jintao
胡j涛
胡惊涛
胡景涛
胡紧掏
湖紧掏
胡紧套
锦涛
hjt
胡派
胡主席
刘永清
胡海峰
胡海清
江泽民
民泽江
江胡
江主席
江书记
江浙闽
江沢民
江浙民
茳泽民
zemin
ze民
老江
老j
江core
江x
江派
江zm
jzm
江戏子
江蛤蟆
江某某
江贼
江猪
江氏集团
江绵恒
江绵康
王冶坪
江泽慧
邓小平
平小邓
xiao平
邓xp
邓晓平
邓朴方
邓榕
邓质方
毛泽东
猫泽东
猫则东
猫贼洞
毛zd
毛zx
z东
ze东
泽d
zedong
毛太祖
毛相
主席画像
改革历程
朱镕基
朱容基
朱镕鸡
朱容鸡
朱云来
李鹏
李peng
里鹏
李月月鸟
李小鹏
李小琳
华主席
华国
国锋
国峰
锋同志
白春礼
薄熙来
薄一波
蔡赴朝
蔡武
曹刚川
常万全
陈炳德
陈德铭
陈建国
陈良宇
陈绍基
陈同海
陈至立
戴秉国
丁一平
董建华
杜德印
杜世成
傅锐
郭伯雄
郭金龙
贺国强
胡春华
耀邦
华建敏
黄华华
黄丽满
黄兴国
回良玉
贾庆林
贾廷安
靖志远
李长春
李春城
李建国
李克强
李岚清
李沛瑶
李荣融
李瑞环
李铁映
李先念
李学举
李源潮
栗智
梁光烈
廖锡龙
林树森
林炎志
林左鸣
令计划
柳斌杰
刘奇葆
刘少奇
刘延东
刘云山
刘志军
龙新民
路甬祥
罗箭
吕祖善
马飚
马恺
孟建柱
欧广源
强卫
沈跃跃
宋平顺
粟戎生
苏树林
孙家正
铁凝
屠光绍
王东明
汪东兴
王鸿举
王沪宁
王乐泉
王洛林
王岐山
王胜俊
王太华
王学军
王兆国
王振华
吴邦国
吴定富
吴官正
无官正
吴胜利
吴仪
奚国华
习仲勋
徐才厚
许其亮
徐绍史
杨洁篪
叶剑英
由喜贵
于幼军
俞正声
袁纯清
曾培炎
曾庆红
曾宪梓
曾荫权
张德江
张定发
张高丽
张立昌
张荣坤
张志国
赵洪祝
紫阳
周生贤
周永康
朱海仑
中南海
大陆当局
中国当局
北京当局
共产党
党产共
共贪党
阿共
产党共
公产党
工产党
共c党
共x党
共铲
供产
共惨
供铲党
供铲谠
供铲裆
共残党
共残主义
共产主义的幽灵
拱铲
老共
中共
中珙
中gong
gc党
贡挡
gong党
g产
狗产蛋
共残裆
恶党
邪党
共产专制
共产王朝
裆中央
土共
土g
共狗
g匪
共匪
仇共
症腐
政腐
政付
正府
政俯
政f
zhengfu
政zhi
挡中央
档中央
中国zf
中央zf
国wu院
中华帝国
gong和
大陆官方
北京政权
江泽民
胡锦涛
温家宝
习近平
习仲勋
贺国强
贺子珍
周永康
李长春
李德生
王岐山
姚依林
回良玉
李源潮
李干成
戴秉国
黄镇
刘延东
刘瑞龙
俞正声
黄敬
薄熙
薄一波
周小川
周建南
温云松
徐明
江泽慧
江绵恒
江绵康
李小鹏
李鹏
李小琳
朱云来
朱容基
法轮功
李洪志
新疆骚乱

View File

@@ -0,0 +1,50 @@
!
.
,
#
$
%
&
*
(
)
|
?
/
@
"
'
;
[
]
{
}
+
~
-
_
=
^
<
>
——
……
:

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='DuplicateSentencesFilter',
module_path="ops.mapper.remove_duplicate_sentences.process")

View File

@@ -0,0 +1,16 @@
name: '文档局部内容去重'
name_en: 'Partial Content Deduplication'
description: '文档局部内容去重。'
description_en: 'Deduplicates partial file content.'
language: 'python'
vendor: 'huawei'
raw_id: 'DuplicateSentencesFilter'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '这是一个重复的句子。 这是一个重复的句子。 这是一个重复的句子。 这是一个重复的句子。 这是一个重复的句子。'
after: '这是一个重复的句子。'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,68 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 文档局部内容去重
Create: 2025/01/07
"""
import re
import time
from collections import Counter
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Filter
def duplicate_sentences_filter(input_data: str, file_name: str, duplicate_th: int = 5) -> str:
""" 文本局部内容去重:去除某些重复出现的段落或句子
以段落为基本单位,去除重复次数超过规定阈值的段落, 只保留第一次出现的段落的原始内容, 且不去除段落的首尾空格。
Args:
input_data: 输入数据
file_name: 文件名称
duplicate_th: 最大重复次数阈值,默认小于5次
Returns:
str: 清洗后数据
"""
paragraphs = input_data.split("\n")
trust_set = {'<table>', '<tbody>', '<tr>', '<td>', '</table>', '</tbody>', '</tr>', '</td>', ""}
# 进行一次遍历,记录每个段落的出现位置
order_paragraphs = []
paragraph_counts = Counter([line.strip() for line in re.split("\\n", input_data)])
try:
for paragraph in paragraphs:
# trust_set 中的元素不纳入统计
if paragraph.strip() in trust_set:
order_paragraphs.append(paragraph)
continue
paragraph_strip = paragraph.strip()
if duplicate_th > paragraph_counts[paragraph_strip] >= 0:
order_paragraphs.append(paragraph)
elif paragraph_counts[paragraph_strip] >= duplicate_th:
order_paragraphs.append(paragraph)
paragraph_counts[paragraph_strip] = -1
except Exception as err:
logger.exception(f"fileName: {file_name}, method: RemoveDuplicateSentencess. An error occurred when using "
f"filtering duplicate sentences. The error is: {err}")
return input_data
# 将去重后的段落重新组合成文本
result_text = '\n'.join(order_paragraphs)
return result_text
class DuplicateSentencesFilter(Filter):
"""文档局部内容去重插件"""
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
duplicate_th = 5 # 段落重复次数阈值
file_name = sample[self.filename_key]
start = time.time()
sample[self.text_key] = duplicate_sentences_filter(sample[self.text_key], file_name, duplicate_th)
logger.info(f"fileName: {file_name}, RemoveDuplicateSentencess costs {time.time() - start:6f} s")
return sample
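
A quick check of the module-level helper (assuming this file is importable; the module path is hypothetical):

from process import duplicate_sentences_filter   # hypothetical import path

text = "\n".join(["这是一个重复的句子。"] * 6 + ["结尾段落。"])
print(duplicate_sentences_filter(text, "demo.txt", duplicate_th=5))
# -> 这是一个重复的句子。
#    结尾段落。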

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='SexualAndViolentWordCleaner',
module_path="ops.mapper.sexual_and_violent_word_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '暴力色情文本匿名化'
name_en: 'Violent and Pornographic Text Anonymization'
description: '将暴力、色情文本进行匿名化。'
description_en: 'Anonymizes violent and pornographic texts.'
language: 'python'
vendor: 'huawei'
raw_id: 'SexualAndViolentWordCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '特别字符:炸药'
after: '特别字符:***'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,70 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 暴力色情文本过滤
Create: 2024/12/26 15:43
"""
import time
from pathlib import Path
from typing import Dict, Any
from loguru import logger
from datamate.common.utils.aho_corasick import AhoCorasic
from datamate.core.base_op import Mapper
class SexualAndViolentWordCleaner(Mapper):
"""外部输入的暴力、色情文本过滤插件"""
root_path = Path(__file__).parent / 'resources'
VIOLENT_FILE_PATH = str(root_path / 'violent.txt')
SEXUAL_FILE_PATH = str(root_path / 'sexual.txt')
SPECIAL_SYMBOLS_PATH = str(root_path / 'special_symbols.txt')
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.violent_words = self.load_words_list(self.VIOLENT_FILE_PATH)
self.sexual_words = self.load_words_list(self.SEXUAL_FILE_PATH)
self.special_symbols = self.load_words_list(self.SPECIAL_SYMBOLS_PATH)
self.words = self.violent_words | self.sexual_words
self.ac_automaton = AhoCorasic(self.words)
@staticmethod
def load_words_list(path):
"""词表加载"""
with open(path, 'r', encoding='utf-8') as f:
words = set(f.read().splitlines())
return words
@staticmethod
def words_replace(target_strings: list, text: str):
"""
目标字符串替换。
Args:
            target_strings: 待替换的目标字符串列表。
text: 待清洗文本。
returns:
清洗后文本。
"""
target_strings.sort(key=lambda x: -len(x))
for s in target_strings:
tmp_text = text.replace(s, '*' * len(s))
text = tmp_text
return text
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._sexual_and_violent_word_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: SexualAndViolentWordCleaner costs {time.time() - start:6f} s")
return sample
def _sexual_and_violent_word_filter(self, text):
"""词语过滤主函数,分行过滤"""
filtered_rows = []
for row in text.split('\n'):
matched_words = self.ac_automaton.search(row, self.special_symbols)
filtered_rows.append(self.words_replace(matched_words, row))
return '\n'.join(filtered_rows)

View File

@@ -0,0 +1,288 @@
爱液
按摩棒
拔出来
爆草
包二奶
暴干
暴奸
暴乳
爆乳
暴淫
被操
被插
被干
逼奸
仓井空
插暴
操逼
操黑
操烂
肏你
肏死
操死
操我
厕奴
插比
插b
插逼
插进
插你
插我
插阴
潮吹
潮喷
成人电影
成人论坛
成人色情
成人网站
成人文学
成人小说
艳情小说
成人游戏
吃精
抽插
春药
大波
大力抽送
大乳
荡妇
荡女
盗撮
发浪
放尿
肥逼
粉穴
干死你
干穴
肛交
肛门
龟头
裹本
国产av
豪乳
黑逼
后穴
虎骑
换妻俱乐部
黄片
几吧
鸡吧
鸡巴
鸡奸
妓女
奸情
叫床
脚交
精液
就去日
巨屌
菊花洞
菊门
巨奶
巨乳
菊穴
开苞
口爆
口活
口交
口射
口淫
狂操
狂插
浪逼
浪妇
浪叫
浪女
漏乳
露b
乱交
乱伦
轮暴
轮操
轮奸
裸陪
买春
美逼
美少妇
美乳
美腿
美穴
美幼
秘唇
迷奸
密穴
蜜穴
蜜液
摸奶
摸胸
母奸
奈美
奶子
男奴
内射
嫩逼
嫩女
嫩穴
捏弄
女优
炮友
砲友
喷精
屁眼
前凸后翘
强jian
强暴
强奸处女
情趣用品
情色
拳交
全裸
群交
人妻
人兽
日逼
日烂
肉棒
肉逼
肉唇
肉洞
肉缝
肉棍
肉茎
肉具
揉乳
肉穴
肉欲
乳爆
乳房
乳沟
乳交
乳头
骚逼
骚比
骚女
骚水
骚穴
色逼
色情网站
色区
色色
色诱
色欲
色b
射爽
射颜
食精
释欲
兽奸
兽交
手淫
兽欲
熟妇
熟母
熟女
爽片
双臀
死逼
丝袜
丝诱
松岛枫
酥痒
汤加丽
套弄
体奸
体位
舔脚
舔阴
调教
偷欢
推油
脱内裤
文做
舞女
吸精
夏川纯
相奸
小逼
小穴
小xue
性感妖娆
性感诱惑
性虎
性饥渴
性技巧
性交
性奴
性虐
性息
性欲
穴口
穴图
亚情
颜射
阳具
杨思敏
要射了
一夜欢
一夜情
一ye情
阴部
淫虫
阴唇
淫荡
阴道
淫电影
阴阜
淫妇
淫河
阴核
阴户
淫贱
淫叫
淫教师
阴茎
阴精
淫浪
淫媚
淫糜
淫魔
淫母
淫女
淫虐
淫妻
淫情
淫色
淫声浪语
淫兽学园
淫书
淫术炼金士
淫水
淫娃
淫威
淫亵
淫样
淫液
淫照
阴b
应召
幼交
欲火
欲女
玉乳
玉穴
援交
原味内衣
援助交际
招鸡
招妓
抓胸
自慰
作爱
a片
fuck
gay片
g点
h动画
h动漫
失身粉
淫荡自慰器

View File

@@ -0,0 +1,50 @@
!
.
,
#
$
%
&
*
(
)
|
?
/
@
"
'
;
[
]
{
}
+
~
-
_
=
^
<
>
——
……
:

View File

@@ -0,0 +1,452 @@
出售雷管
炸药
出售炸药
出售炸药电话
制作火药配方
炸药出售
出售雷管炸药
火药配方
在家里做原子弹
硝酸甘油炸弹制作
TNT
炸弹的制作
硝铵炸药配方
硝酸甘油制作
炸弹配方
简易炸药
火药制作简易炸弹
炸弹制作
硝酸甘油
硝酸甘油炸弹制作方法
手把手教你做炸弹
恐怖分子傻瓜手册
氢弹手工制作方法
起爆器
自制燃烧弹
燃烧弹
弹药制造方法
液体炸药配方
炸药配方与制作
自制炸药配方
tnt炸药成分
炸药的制作
tnt炸药配方
各种炸药配方大全
炸药的成分
燃烧弹的制作方
黑火药的配方
黑火药的制作
黑火药比例
制作燃烧弹
黑火药配方比例
黑火药的成分
原子弹配方
原子弹DIY教程
C4炸药
雷管出售
制作硝化甘油
DIY核弹
核弹制作方法
原子弹制作手册
简易原子弹教程
黑火药配方
自制炸药方法
教你怎么用土办法做武器
zz炸弹的制作方法
C4炸药的成分
炸药制造生产工艺
原子弹制造完全手册
汽车炸弹制作
人体炸弹制作流程
炸弹制作从入门
鸡尾酒炸弹制作
集束炸弹制作
硝酸铵的简便制取方法
莫洛托夫燃烧弹
塑胶炸弹
用火药混合塑料
高性能爆炸物质混合
原子弹制作方法
手教你制作原子弹
轻型巡航导弹的制作
黑索金的制造过程
Zha药制造进程
购买自爆材料
可塑炸弹制作方法
C4塑胶炸药
水浴法制TNT
简易核弹头的制造
高爆炸药的基本配方
如何制硝化甘油
燃烧剂的简易制备
简易大威力炸弹
炸弹制作入门
硝酸甘油炸药制作
基本配方是硝酸甘
燃烧剂的简易制作
制作性能优良的燃烧弹
酸甘油炸药
简易导弹制作
核弹头的制造
可塑炸弹制作
炸药硝酸甘油制法
原子弹制作方法
火药的配方
手教你做原子弹
怎么用土办法做武器
手把手教你制作原子弹
燃烧弹的制作方法
怎么自制燃烧弹
TNT炸弹制作
高爆炸药基本配方
爆破和杀伤燃烧弹
炸弹制作从入门到精
写的炸弹制作教程
原子弹制作方法
教你制作原子弹
下硝化甘油的制作方法
DIY原子弹
原子弹制作
炸药及其制造方法
ATOM弹制造
三分钟教你原子弹
火药原理
简易炸弹
简易燃烧弹
土炸药成份
你怎么用土办法做武器
燃烧弹制作
燃烧瓶的制作方法
炸弹制作手册
原子弹制作方法DIY
汽油三分之一白糖混合
炸药的制作配方
燃烧弹的制作方
炸药的配方
如何自制燃烧弹
标准炸弹教程
黑火药和硝酸钾农药混合
恐怖分子自己制造炸弹
手柄制作炸弹引爆器
制作炸弹首先要
燃烧瓶制作
出售炸药
出售气枪
出售炸药QQ
出售气枪QQ
出售气枪电话
出售手枪QQ
出售手枪电话
出售猎枪
出售猎枪QQ
出售猎枪电话
出售汽枪
出售步枪
出售狙击枪
出售手狗
出售气狗
出售猎狗
手枪出售
64式手枪出售
92式手枪出售
高仿真枪
出售54式手枪
出售M1911
出售各类猎狗
出售国产军用54手枪
出售火枪游戏
出售进口金属枪模
出售仿真枪
出售麻醉枪免收任何定金
出售手枪
出售外国枪模
出售袖珍手枪
娥眉气枪出售
防身手枪QQ
防身武器手枪
仿真枪QQ
钢珠枪小口径步枪
高压狗QQ
高压气枪
气枪子弹
高压气枪电狗专卖
高压气枪
气枪货到付款
高压气枪专卖店
各类军用枪
各类军用枪支
各式气枪出售
工字牌气枪出售专卖
气枪
工字汽狗麻醉枪
供应军用手枪
供应汽枪
购买枪支QQ
国产汽狗专卖
气枪货
集团出售手枪
集团出售手枪气枪
进口气枪
气枪子弹
军用54出售
军用77出售
军用手枪钢珠枪
里有汽枪出售
猎枪出售
猎枪仿真枪QQ
猎枪麻醉枪钢珠枪
猎枪哪里有卖
猎枪手枪买卖
猎枪销售手枪销售气枪
猎枪专卖店
猎枪子弹底火专卖
麻醉枪出售可验货
麻醉枪猎枪军用
买卖54狗
买卖64狗
买卖警察枪支
买卖军狗
买卖军用枪支
卖64手枪
卖92手枪
卖钢珠枪QQ
哪里能买到自制手枪
哪里有钢珠枪
哪里有连发猎枪买卖
哪里有卖双管猎
哪里有气枪卖
哪里有枪出售
哪里有真枪出售
哪里有真枪卖买
哪能买到真枪
出售手枪气枪猎
气狗
高压气枪
气狗出售
进口气狗
气枪专卖网
气枪出售
气枪专卖网
猎枪
钢珠枪
气枪仿真枪出售
气枪购买
气枪组装
气枪货到付款
汽狗出售
气枪价格
仿真气枪
气枪价格
气枪专卖店
气枪看货加QQ
气枪联系QQ
气枪猎枪麻醉枪
气枪迷魂药
汽狗高压汽枪
铅弹
上海工字气枪
全金属仿真枪专卖
热武器QQ
三箭气枪出售
枪猎
手槍买卖自制手枪
售枪购买手枪
售手枪QQ
司马仿真枪模型QQ
淘宝气枪货到付款
提供气狗专卖店
提供专业及时的气狗
秃鹰气枪货到付款
锡峰气枪出售
仙鹤气枪出售
买卖77式手枪
销售电手狗
销售气手狗
销售小口径步枪
销售运动步枪
小口径秃鹰沙漠
有军用枪支出售
长期出售手枪
折叠狗QQ
转让猎枪
子弹货到付款
自制手枪哪里买
自制手枪哪里有卖
阻击枪
汽枪
高压气枪
组装手枪
最新销售汽狗
出售警用手铐
仿真枪
猎枪
真枪
手枪
阻击枪
弓弩专营
弓弩专卖网货到付款
弓弩专卖网
弓弩专卖货到付款
弓弩专卖店网
弓弩专卖店
弓弩专卖
弓弩制作
弓弩直销
弓弩有限公司
弓弩营销中心
弓弩一百专卖店
弓弩销售
弓弩网
弓弩图纸
弓弩特许经销
弓弩狩猎网
自制手弩
追风弓弩麻醉箭专卖
专业弓弩网
中国战神军用弓弩
中国弩弓专卖
中国弓弩专卖网
中国弓弩直销
中国弓弩网
中国弓弩狩猎网
中国弓驽网
制作简易弓弩
郑州弓弩专卖
赵氏弓弩专卖网
赵氏弓弩专卖店
赵氏弓弩专卖
赵氏弓弩销售
小型弓弩专卖店
小猎人弓弩网
狩猎器材弓弩专卖
狩猎器材弓弩
狩猎弓弩专卖网
狩猎弓弩专卖
狩猎弓弩麻醉箭
手枪式折叠三用弩
三利达弓弩专卖网
三利达弓弩直营
三利达弓弩配件
三步倒药箭批发
三步倒弩箭专卖
三步倒麻醉弩箭销售
三步倒麻醉箭专卖
三步倒麻醉箭
三步倒捕狗药
军用弓弩专卖网
军用弓弩专卖店
军用弓弩批发
军用弓弩公司
供应三利达弓弩麻醉箭
供应三步倒麻醉箭
供应秦氏弓弩
供应弩用麻醉箭
供应弩捕狗箭
供应麻醉箭三步倒
供应麻醉箭批发
供应麻醉箭
供应军用弩折叠弩
供应军用弓弩专卖
供应精品弓弩
供应弓弩麻醉箭
供应弓弩
供应钢珠弓弩
弓弩商城专卖
弓弩商城
弓弩亲兄弟货到付款
弓弩批发
弓弩免定金货到付款
弓弩麻醉箭
弓弩麻醉镖
弓弩论坛
钢珠弓弩专卖网
钢珠弓弩专卖店
打狗弓弩三步倒
麻醉弓弩专卖店
出售军刀
出售军刺
出售弹簧刀
出售三棱刀
出售跳刀
军刀网
南方军刀网
户外军刀网
三棱军刺专卖
出售开山刀军刺
西点军刀网
军刀专卖
戈博军刀
阿兰德龙户外
出售军品军刀
勃朗宁军刀
军刀军品网
阿兰得龙野营刀具网
出售军刺军刀
警用刀具出售
折刀专卖网
阳江军品军刀网
野营刀专卖
砍刀精品折刀专卖
匕首蝴蝶甩刀专卖
军刀专卖军刺
军刀专卖刀具批发
军刀图片砍刀
军刀网军刀专卖
军刀价格军用刀具
军品军刺网
军刀军刺甩棍
阳江刀具批发网
北方先锋军刀
正品军刺出售
野营军刀出售
开山刀砍刀出售
仿品军刺出售
军刀直刀专卖
手工猎刀专卖
自动跳刀专卖
军刀电棍销售
军刀甩棍销售
美国军刀出售
极端武力折刀
防卫棍刀户外刀具
阿兰德龙野营刀
仿品军刺网
野营砍刀户外军刀
手工猎刀户外刀具
中国户外刀具网
西点军品军刀网
野营开山刀军刺
三利达弓弩军刀
尼泊尔军刀出售
防卫野营砍刀出售
防卫著名军刀出售
防卫棍刀出售
防卫甩棍出售
防卫电棍出售
军刺野营砍刀出售
著名精品折刀出售
战术军刀出售
刺刀专卖网
户外军刀出售
阳江刀具直销网
冷钢刀具直销网
防卫刀具直销网
极端武力直销网
刀具直销网
军刀直销网
直刀匕首直销网
军刀匕首直销网
折刀砍刀军品网
野营刀具军品网
阳江刀具军品网
冷钢刀具军品网
防卫刀具军品网
极端武力军品网
军用刀具军品网
军刀直刀军品网
折刀砍刀专卖
野营刀具专卖
阳江刀具专卖
冷钢刀具专卖
防卫刀具专卖
出售美军现役军刀

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='TextToWord',
module_path="ops.mapper.text_to_word.process")

View File

@@ -0,0 +1,16 @@
name: '转换为Word'
name_en: 'Convert-to-Word'
description: '将抽取结果转换为docx的word文件。'
description_en: 'Converts extraction results to Word files in DOCX format.'
language: 'python'
vendor: 'huawei'
raw_id: 'TextToWord'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

Some files were not shown because too many files have changed in this diff.