init datamate

This commit is contained in:
Dallas98
2025-10-21 23:00:48 +08:00
commit 1c97afed7d
692 changed files with 135442 additions and 0 deletions

View File

@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
import sys
from pathlib import Path
from datamate.common.utils.custom_importer import CustomImporter
def _configure_importer():
base_path = Path(__file__).resolve().parent
sys.meta_path.append(CustomImporter(base_path))
_configure_importer()
def _import_operators():
from . import content_cleaner
from . import credit_card_number_cleaner
from . import email_cleaner
from . import emoji_cleaner
from . import extra_space_cleaner
from . import full_width_characters_cleaner
from . import garble_characters_cleaner
from . import html_tag_cleaner
from . import id_number_cleaner
from . import img_watermark_remove
from . import invisible_characters_cleaner
from . import ip_address_cleaner
from . import legend_cleaner
from . import phone_number_cleaner
from . import political_word_cleaner
from . import sexual_and_violent_word_cleaner
from . import text_to_word
from . import traditional_chinese
from . import unicode_space_cleaner
from . import url_cleaner
from . import xml_tag_cleaner
from . import img_enhanced_brightness
from . import img_enhanced_contrast
from . import img_enhanced_saturation
from . import img_enhanced_sharpness
from . import img_perspective_transformation
from . import img_direction_correct
from . import img_denoise
from . import img_shadow_remove
from . import img_type_unify
from . import img_resize
from . import remove_duplicate_sentences
from . import knowledge_relation_slice
_import_operators()

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ContentCleaner',
module_path="ops.mapper.content_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '文档目录去除'
name_en: 'Document Contents Removal'
description: '去除文档中的目录。'
description_en: 'Removes tables of contents from documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'ContentCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,64 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: document table-of-contents removal
Create: 2025/01/13
"""
import re
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class ContentCleaner(Mapper):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.no_content_count = 3 # threshold of consecutive lines that do not look like TOC entries
# TOC heading
self.content_text_pattern = r"^ *(目 *录|CONTENT(S)?) *$"
# TOC line: prefix format
self.content_preface_pattern = r"^ *(前言|About This Document|\d+(\.\d+)*|[a-zA-Z]+(\.\d+)*)"
# TOC line: middle format
self.content_middle_pattern = r"\.{7,}"
# TOC line: ending format
self.content_end_pattern = r"(\d|错误!未定义书签。|[IXV]+) *$"
self.content_pattern = self.content_preface_pattern + ".*" + self.content_end_pattern
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._content_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: ContentCleaner costs {time.time() - start:6f} s")
return sample
def _content_filter(self, input_data: str):
count = 0 # number of consecutive lines that do not match the TOC structure; after 3 such lines we assume the body text has started
# start and end indexes of the TOC block
content_start_index, content_end_index = -1, -1
lines = input_data.split("\n")
for i, line in enumerate(lines):
if content_start_index >= 0 and count >= self.no_content_count:
break
# first, match the TOC heading ("目录" / "CONTENTS")
if content_start_index < 0 and re.match(self.content_text_pattern, line, re.IGNORECASE):
content_start_index = i
content_end_index = i
# match the two supported TOC line forms:
# 1. starts and ends with the expected formats; 2. the line contains a run of 7 or more dots
elif content_start_index >= 0 and (re.match(self.content_pattern, line, re.IGNORECASE)
or re.search(self.content_middle_pattern, line)):
content_end_index = i
count = 0
elif content_start_index >= 0 and not (re.match(self.content_pattern, line, re.IGNORECASE)
or re.search(self.content_middle_pattern, line)):
count += 1
if 0 <= content_start_index < content_end_index:
res = "\n".join(lines[:content_start_index] + lines[content_end_index + 1:])
else:
# if only the TOC heading was found, keep it; if the structure does not look like a TOC, return the original text
res = "\n".join(lines)
return res
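A minimal, self-contained sketch (reusing the patterns defined above) of how a TOC heading and TOC entry lines are recognized; the sample lines are made up:

import re

content_text_pattern = r"^ *(目 *录|CONTENT(S)?) *$"
content_preface_pattern = r"^ *(前言|About This Document|\d+(\.\d+)*|[a-zA-Z]+(\.\d+)*)"
content_middle_pattern = r"\.{7,}"
content_end_pattern = r"(\d|错误!未定义书签。|[IXV]+) *$"
content_pattern = content_preface_pattern + ".*" + content_end_pattern

for line in ["CONTENTS", "1.1 Overview .......... 3", "This paragraph is body text."]:
    is_heading = bool(re.match(content_text_pattern, line, re.IGNORECASE))
    is_entry = bool(re.match(content_pattern, line, re.IGNORECASE) or re.search(content_middle_pattern, line))
    print(line, "->", "heading" if is_heading else "entry" if is_entry else "body")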

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='AnonymizedCreditCardNumber',
module_path="ops.mapper.credit_card_number_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '信用卡号匿名化'
name_en: 'Credit Card Number Anonymization'
description: '信用卡号匿名化'
description_en: 'Anonymizes credit card numbers.'
language: 'python'
vendor: 'huawei'
raw_id: 'AnonymizedCreditCardNumber'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '这个是信用卡号:4111111111111111'
after: '这个是信用卡号:<credit_card_number>'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,83 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: credit card number anonymization
Create: 2024/12/5 15:43
"""
from loguru import logger
import re
import time
from typing import Dict, Any
from datamate.core.base_op import Mapper
class AnonymizedCreditCardNumber(Mapper):
def __init__(self, *args, **kwargs):
super(AnonymizedCreditCardNumber, self).__init__(*args, **kwargs)
self.re_compile = self._get_credit_card_re_compile()
@staticmethod
def _verify_credit_card_num(credit_card_num: str):
"""Validate a credit card number with the Luhn checksum"""
# reverse the digits (right to left)
digits = [int(x) for x in reversed(credit_card_num) if x.isdigit()]
# double every second digit (d * 2)
even_digits = [d * 2 for d in digits[1::2]]
# if doubling yields a two-digit number, add its digits together
even_digits = [d // 10 + d % 10 for d in even_digits]
# sum all single digits from the previous step
even_sum = sum(even_digits)
# sum the digits in the odd positions, counting from the right
odd_sum = sum(digits[::2])
# the number is valid if even_sum + odd_sum is divisible by 10
if (odd_sum + even_sum) % 10 == 0:
return True
return False
@staticmethod
def _get_credit_card_re_compile():
separator_symbol = r"([- ]?)"
# American Express: 15 digits starting with 34 or 37, formatted NNNN-NNNNNN-NNNNN or NNNN NNNNNN NNNNN
american_express = "3[47][0-9]{2}" + separator_symbol + "[0-9]{6}" + separator_symbol + "[0-9]{5}"
# China UnionPay: 16 digits starting with 62 or 60, formatted NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN
china_union_pay = r"(6[02]\d{2})" + r"(%s\d{%d}){%d}" % (separator_symbol, 4, 3)
# Diner's Club: 14 digits starting with 300-305, 36, 38, 39 or 3095, formatted NNNN-NNNNNN-NNNN or NNNN NNNNNN NNNN
diners_club = r"(30[0-5]\d|3[689]\d{2}|3095)" + separator_symbol + r"[0-9]{6}" + separator_symbol + r"[0-9]{4}"
# Discover: 16 digits starting with 6011, 644-649 or 65, formatted NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN
discover = r"(64[4-9]\d|65\d{2}|6011)" + r"(%s\d{%d}){%d}" % (separator_symbol, 4, 3)
# JCB: 16 digits starting with 3528 to 3589, formatted NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNNNNNN
jcb = r"(352[89]|35[3-8]\d)" + separator_symbol + r"[0-9]{4}" + (
r"((%s\d{%d}){%d}" % (separator_symbol, 4, 2) + ")|" + separator_symbol + r"[0-9]{8}")
# Mastercard: 16 digits starting with 51-55 or 2221-2720, formatted NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN
master_card = r"(5[1-5]\d{2}|222[1-9]|22[3-9]\d|2[3-6]\d{2}|27[01]\d|2720)" + r"(%s\d{%d}){%d}" \
% (separator_symbol, 4, 3)
# Visa: 16 digits starting with 4, formatted NNNN-NNNN-NNNN-NNNN or NNNN NNNN NNNN NNNN
visa = r"4\d{3}" + r"(%s\d{%d}){%d}" % (separator_symbol, 4, 3)
credit_card_pattern = r"(?<=[^\d])(%s|%s|%s|%s|%s|%s|%s)(?=[^\d])" % (
american_express, china_union_pay, diners_club,
discover, jcb, master_card, visa)
credit_card_re_compile = re.compile(credit_card_pattern)
return credit_card_re_compile
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._credit_card_number_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: CreditCardNumberCleaner costs {time.time() - start:6f} s")
return sample
def _credit_card_number_filter(self, input_data: str):
"""Anonymize credit card numbers in the text"""
input_data = ''.join(['\n', input_data, '\n'])  # newline sentinels (assumed padding) so the boundary lookarounds match; stripped by the final [1:-1]
# extract substrings that match the credit card regex
credit_card_nums = [item.group(1) for item in self.re_compile.finditer(input_data)]
# check whether each extracted string is a genuine credit card number
for credit_card_num in credit_card_nums:
if self._verify_credit_card_num(credit_card_num):
# replace valid credit card numbers with <credit_card_number>
credit_card_num_pattern = r"(?<=[^\d]){}(?=[^\d])".format(credit_card_num)
input_data = re.compile(credit_card_num_pattern).sub("<credit_card_number>", input_data)
return input_data[1:-1]
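For reference, a standalone sketch of the same Luhn checksum used by _verify_credit_card_num, applied to the sample number from the operator spec (4111111111111111):

def luhn_valid(number: str) -> bool:
    digits = [int(c) for c in reversed(number) if c.isdigit()]
    doubled = [d * 2 for d in digits[1::2]]        # double every second digit from the right
    doubled = [d // 10 + d % 10 for d in doubled]  # collapse two-digit results to a single digit
    return (sum(doubled) + sum(digits[::2])) % 10 == 0

print(luhn_valid("4111111111111111"))  # True, so the operator would replace it with <credit_card_number>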

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='EmailNumberCleaner',
module_path="ops.mapper.email_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '邮件地址匿名化'
name_en: 'Email Address Anonymization'
description: '邮件地址匿名化'
description_en: 'Anonymizes email addresses.'
language: 'python'
vendor: 'huawei'
raw_id: 'EmailNumberCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '这个是邮箱号:test_email@gmail.com'
after: '这个是邮箱号:<email>'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,47 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: email address anonymization
Create: 2025/01/15
"""
from loguru import logger
import re
import time
from typing import Dict, Any
from email_validator import validate_email, EmailNotValidError
from datamate.core.base_op import Mapper
class EmailNumberCleaner(Mapper):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.front_email_pattern = r'(?<=[^0-9a-zA-Z\!\#\$\%\&\'\*\+\-\/\=\?\^\_\`\{\|\}\~\-])'
self.back_email_pattern = r'(?=[^0-9a-zA-Z\!\#\$\%\&\'\*\+\-\/\=\?\^\_\`\{\|\}\~\-])'
self.email_pattern = r'([a-zA-Z\d.\-+_]+\s?@\s?[a-zA-Z\d.\-+_]+\.[a-zA-Z0-9]{2,6})'
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._email_number_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: EmailCleaner costs {time.time() - start:6f} s")
return sample
def _email_number_filter(self, input_data: str):
"""Anonymize email addresses"""
mixed_data = ''.join(['\n', input_data, '\n'])  # newline sentinels (assumed padding) so the boundary lookarounds match; stripped by the final [1:-1]
paired_emails = re.compile(self.front_email_pattern + self.email_pattern + self.back_email_pattern).findall(
mixed_data)
if paired_emails:
for email in paired_emails:
try:
# validate the email address
validate_email(email, check_deliverability=False)
mixed_data = re.compile(self.front_email_pattern + re.escape(email) + self.back_email_pattern).sub(
"<email>", mixed_data, count=1)
except EmailNotValidError as err:
# log that the email address is invalid (without printing the address itself)
logger.error(f"email is abnormal email form: {err}")
return mixed_data[1:-1]
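A small sketch of the same match-then-validate flow, using the email pattern above together with the email_validator package; the sample text is made up:

import re
from email_validator import validate_email, EmailNotValidError

email_pattern = r'([a-zA-Z\d.\-+_]+\s?@\s?[a-zA-Z\d.\-+_]+\.[a-zA-Z0-9]{2,6})'
text = "contact: test_email@gmail.com, broken: foo@bar"
for candidate in re.findall(email_pattern, text):
    try:
        validate_email(candidate, check_deliverability=False)  # syntax-only validation
        text = text.replace(candidate, "<email>", 1)
    except EmailNotValidError:
        pass  # leave strings that only look like emails unchanged
print(text)  # contact: <email>, broken: foo@bar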

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='EmojiCleaner',
module_path="ops.mapper.emoji_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '文档表情去除'
name_en: 'Emoticon Removal'
description: '去除文档中表情字符或者emoji符号。'
description_en: 'Removes emoticons or emojis from documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'EmojiCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '使用方式很简单,只需要将代码放入Markdown文本中即可,富文本格式可直接复制表情😀使用。'
after: '使用方式很简单,只需要将代码放入Markdown文本中即可,富文本格式可直接复制表情使用。'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,27 @@
"""
Description: emoticon and emoji removal
Create: 2023/12/7 15:43
"""
import time
from typing import Dict, Any
import emoji
from loguru import logger
from datamate.core.base_op import Mapper
class EmojiCleaner(Mapper):
@staticmethod
def _emoji_filter(input_data: str):
res = []
for input_s in input_data.split('\n'):
res.append(emoji.replace_emoji(input_s, replace=''))
return '\n'.join(res)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._emoji_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: EmojiCleaner costs {time.time() - start:6f} s")
return sample
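The heavy lifting is done by the emoji package's replace_emoji; a quick sketch on the sample string from the operator spec:

import emoji

text = "富文本格式可直接复制表情😀使用。"
print(emoji.replace_emoji(text, replace=''))  # the 😀 emoji is stripped, the rest is unchanged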

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ExtraSpaceCleaner',
module_path="ops.mapper.extra_space_cleaner.process")

View File

@@ -0,0 +1,17 @@
name: '多余空格去除'
name_en: 'Redundant Space Removal'
description: '移除文档首尾、句中或标点符号附近多余空格和 tab 等。'
description_en: 'Removes redundant spaces and tabs at the beginning and end of documents,
in sentences, or near punctuations.'
language: 'python'
vendor: 'huawei'
raw_id: 'ExtraSpaceCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: ' 人工智能的研究历史有着一条从以“推理”为重 点,到以“知识”为重点,再到以“学习”为重点的自然、清晰的脉络。 '
after: '人工智能的研究历史有着一条从以“推理”为重点,到以“知识”为重点,再到以“学习”为重点的自然、清晰的脉络。'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,69 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: redundant space removal
Create: 2025/01/13
"""
import re
import time
from pathlib import Path
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class ExtraSpaceCleaner(Mapper):
"""Remove redundant spaces and blank lines, including leading and trailing spaces and tabs.
Note: before removal, every space character in the document is normalized to \u0020.
"""
def __init__(self, *args, **kwargs):
# matches uncommon Unicode space characters
super().__init__(*args, **kwargs)
self.white_space_pattern = ('[\u00A0 \u1680 \u2000-\u200D \u2028-\u2029'
' \u202F \u205F \u3000 \u180E \u2060 \uFEFF]')
self._file_path = Path(__file__).parent / 'resources' / 'special_token.txt'
self.escaped_special_chars = self._get_escaped_special_chars() # load punctuation characters
# matches runs of two or more spaces
extra_space_pattern = r" {2,}"
# matches mixed runs of spaces and newlines
extra_line_pattern = r"( |\n){2,}"
# matches redundant spaces between Chinese characters or punctuation
extra_space_in_chinese_pattern = r"(?<=[\u4e00-\u9fa5" + self.escaped_special_chars + r"]) +(?=[\u4e00-\u9fa5" \
+ self.escaped_special_chars + r"])"
self.extra_space_re_compile = re.compile(extra_space_pattern)
self.extra_space_in_chinese_re_compile = re.compile(extra_space_in_chinese_pattern)
self.extra_line_re_compile = re.compile(extra_line_pattern)
self.white_space_pattern_compile = re.compile(self.white_space_pattern)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._clean_extra_space(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: ExtraSpaceCleaner costs {time.time() - start:6f} s")
return sample
def _get_escaped_special_chars(self) -> str:
with open(self._file_path, 'r', encoding='utf-8') as f:
self._special_token = f.read().splitlines()
res = ''.join([re.escape(char) for char in self._special_token]) # escape the special characters and concatenate them into one string
return res
def _clean_extra_space(self, input_data: str) -> str:
# convert uncommon Unicode spaces in the document (e.g. \u2008) to a normal half-width space
input_data = self.white_space_pattern_compile.sub('\u0020', input_data)
# remove redundant spaces and tabs at the start and end of the document, within sentences, and around punctuation
input_data = input_data.strip()
# strip leading/trailing spaces line by line
text = "\n".join([line.strip() for line in input_data.split("\n")])
text = ''.join(['\n', text, '\n'])  # newline sentinels (assumed padding), stripped again by the final [1:-1]
# collapse runs of spaces into a single normal space
remove_extra_space = self.extra_space_re_compile.sub("\u0020", text)
# remove spaces between Chinese characters and punctuation
remove_extra_space_in_chinese = self.extra_space_in_chinese_re_compile.sub("", remove_extra_space)
# collapse consecutive newlines
remove_duplicate_line = self.extra_line_re_compile.sub("\n", remove_extra_space_in_chinese)
return remove_duplicate_line[1:-1]
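A rough standalone sketch of the same normalization steps (the sentinel padding is omitted for brevity); the sample text is made up:

import re

text = "  人工智能 的研究   历史\n\n\nhas  a clear   thread.  "
text = re.sub('[\u00A0\u1680\u2000-\u200D\u2028\u2029\u202F\u205F\u3000\u180E\u2060\uFEFF]', ' ', text)  # normalize unusual Unicode spaces
text = "\n".join(line.strip() for line in text.strip().split("\n"))    # strip each line
text = re.sub(r" {2,}", " ", text)                                     # collapse runs of spaces
text = re.sub(r"(?<=[\u4e00-\u9fa5]) +(?=[\u4e00-\u9fa5])", "", text)  # drop spaces between Chinese characters
text = re.sub(r"( |\n){2,}", "\n", text)                               # collapse blank lines
print(text)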

View File

@@ -0,0 +1,53 @@
~
·
@
#
%
&
*
+
-
=
{
}
|
`
!
$
^
(
)
_
[
]
\
:
"
;
'
<
>
?
,
/
.

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='FullWidthCharacterCleaner',
module_path="ops.mapper.full_width_characters_cleaner.process")

View File

@@ -0,0 +1,18 @@
name: '全角转半角'
name_en: 'Full-to-Half Width Character'
description: '将文档中的所有全角字符转换成半角字符。'
description_en: 'Converts all full-width characters in documents to half-width characters.'
language: 'python'
vendor: 'huawei'
raw_id: 'FullWidthCharacterCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: 'Residential and commercial design, site inspections, working drawings,
Minicad, renderings.'
after: 'Residential and commercial design, site inspections, working drawings, MiniCad,
renderings.'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,46 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: full-width to half-width character conversion
Create: 2025/01/13
"""
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class FullWidthCharacterCleaner(Mapper):
"""Convert all full-width characters in the document to half-width characters"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# mapping from full-width characters to their half-width (ASCII) counterparts
self._full_to_half_dict = {
'＂': '"', '＃': '#', '＄': '$', '％': '%', '＆': '&', '＇': "'", '＊': '*', '＋': '+',
'－': '-', '．': '.', '／': '/', '０': '0', '１': '1', '２': '2', '３': '3', '４': '4',
'５': '5', '６': '6', '７': '7', '８': '8', '９': '9', '＜': '<', '＝': '=', '＞': '>',
'＠': '@', 'Ａ': 'A', 'Ｂ': 'B', 'Ｃ': 'C', 'Ｄ': 'D', 'Ｅ': 'E', 'Ｆ': 'F', 'Ｇ': 'G',
'Ｈ': 'H', 'Ｉ': 'I', 'Ｊ': 'J', 'Ｋ': 'K', 'Ｌ': 'L', 'Ｍ': 'M', 'Ｎ': 'N', 'Ｏ': 'O',
'Ｐ': 'P', 'Ｑ': 'Q', 'Ｒ': 'R', 'Ｓ': 'S', 'Ｔ': 'T', 'Ｕ': 'U', 'Ｖ': 'V', 'Ｗ': 'W',
'Ｘ': 'X', 'Ｙ': 'Y', 'Ｚ': 'Z', '［': '[', '＼': '\\', '］': ']', '＾': '^', '＿': '_',
'｀': '`', 'ａ': 'a', 'ｂ': 'b', 'ｃ': 'c', 'ｄ': 'd', 'ｅ': 'e', 'ｆ': 'f', 'ｇ': 'g',
'ｈ': 'h', 'ｉ': 'i', 'ｊ': 'j', 'ｋ': 'k', 'ｌ': 'l', 'ｍ': 'm', 'ｎ': 'n', 'ｏ': 'o',
'ｐ': 'p', 'ｑ': 'q', 'ｒ': 'r', 'ｓ': 's', 'ｔ': 't', 'ｕ': 'u', 'ｖ': 'v', 'ｗ': 'w',
'ｘ': 'x', 'ｙ': 'y', 'ｚ': 'z', '｛': '{', '｜': '|', '｝': '}', '～': '~'
}
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._full_width_character_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: FullWidthCharactersCleaner costs {time.time() - start:6f} s")
return sample
def _full_width_character_filter(self, input_data: str):
res = []
for input_str in input_data.split('\n'):
res.append("".join(self._full_to_half_dict.get(char, char) for char in input_str))
return '\n'.join(res)
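As a side note, the same mapping can be generated programmatically, since the full-width forms sit at a fixed Unicode offset (0xFEE0) above their ASCII counterparts; a hedged sketch covering the same character set as the table above:

full_to_half = {chr(0xFEE0 + ord(c)): c for c in
                "\"#$%&'*+-./0123456789<=>@"
                "ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`"
                "abcdefghijklmnopqrstuvwxyz{|}~"}
print("".join(full_to_half.get(ch, ch) for ch in "ＭｉｎｉＣＡＤ ２０２５"))  # MiniCAD 2025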

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='GrableCharactersCleaner',
module_path="ops.mapper.garble_characters_cleaner.process")

View File

@@ -0,0 +1,17 @@
name: '文档乱码去除'
name_en: 'Garbled Character Removal'
description: '去除文档中的乱码和无意义的unicode。'
description_en: 'Removes garbled characters and meaningless Unicode characters from
documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'GrableCharactersCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '文档乱码����'
after: '文档乱码'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,54 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description:
This operator removes garbled characters from documents.
Logic:
1. A regex checks whether each character's Unicode code point falls within a garbled range; characters inside a range are removed, the rest are kept.
2. At start-up, the garbled-range configuration file charset.json is loaded; in it, each key is a character-set name and each value is a list of Unicode code point ranges.
Create: 2025/01/13
"""
import json
import re
import time
from pathlib import Path
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class GrableCharactersCleaner(Mapper):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._file_path = str(Path(__file__).parent / 'resources' / 'charset.json')
self.unicode_grable_code_list = self.get_unicode_grable_code_list() # concatenation of the Unicode code point ranges treated as garbled
self.grable_re_compile = re.compile("[" + self.unicode_grable_code_list + "]")
def get_unicode_grable_code_list(self):
"""Build the character-class body covering the garbled Unicode ranges"""
res = ""
with open(self._file_path, 'r', encoding='utf-8') as f:
charset_number_list = json.load(f)
for number_ranges in charset_number_list.values():
for number_range in number_ranges:
number_range_list = number_range.split(",")
if len(number_range_list) < 2:
logger.error(f"number_range_list size is {len(number_range_list)}, formatting error")
continue
res += number_range_list[0] + "-" + number_range_list[1]
return res
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._grable_characters_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: GrableCharactersCleaner costs {time.time() - start:6f} s")
return sample
def _grable_characters_filter(self, input_data: str):
"""Remove garbled characters from the document"""
return self.grable_re_compile.sub("", input_data)
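A rough sketch of how the character class is assembled from the JSON ranges and then applied (the tiny in-line dict stands in for charset.json):

import re

charset = {"placeholder": ["\uFFFD,\uFFFD"], "private use area": ["\uE000,\uF8FF"]}
body = ""
for ranges in charset.values():
    for rng in ranges:
        low, high = rng.split(",")
        body += low + "-" + high
garble_re = re.compile("[" + body + "]")
print(garble_re.sub("", "文档乱码\ufffd\ufffd"))  # 文档乱码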

View File

@@ -0,0 +1,24 @@
{
"注音符号东亚": [
"\u3100,\u312F"
],
"拉丁文补充1": [
"\u00C0,\u00D6",
"\u00D8,\u00F6",
"\u00F8,\u00FF"
],
"拉丁文扩展,A": [
"\u0100,\u017F"
],
"拉丁文扩展,B": [
"\u0180,\u024F"
],
"私人使用区域": [
"\uE000,\uF8FF",
"\\U000f0000,\\U000ffffd",
"\\U00100000,\\U0010fffd"
],
"占位符": [
"\uFFFD,\uFFFD"
]
}

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='HtmlTagCleaner',
module_path="ops.mapper.html_tag_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: 'HTML标签去除'
name_en: 'HTML Tag Removal'
description: '移除文档中HTML标签,如 <html>、<div>、<p> 等。'
description_en: 'Removes HTML tags from documents, such as <html>, <div>, and <p>.'
language: 'python'
vendor: 'huawei'
raw_id: 'HtmlTagCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '<p><b>机器学习</b>是<a href="/wiki/%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD" title="人工智能">人工智能</a>的一个分支。</p>'
after: '机器学习是人工智能的一个分支。'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,80 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: HTML tag removal operator
Create: 2025/01/13
"""
import re
import time
from typing import List, Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class HtmlTagCleaner(Mapper):
"""Remove HTML tags such as <html>, <div>, and <p> from documents; XML documents are left untouched"""
tag_list = [
'<a>', '<abbr>', '<acronym>', '<address>', '<applet>', '<area>', '<article>', '<aside>',
'<audio>', '<b>', '<base>', '<basefont>', '<bdi>', '<bdo>', '<bgsound>', '<big>', '<blink>',
'<blockquote>', '<body>', '<br>', '<button>', '<canvas>', '<caption>', '<center>', '<cite>',
'<code>', '<col>', '<colgroup>', '<command>', '<content>', '<data>', '<datalist>', '<dd>',
'<del>', '<details>', '<dfn>', '<dialog>', '<dir>', '<div>', '<dl>', '<dt>', '<em>',
'<embed>', '<fieldset>', '<figcaption>', '<figure>', '<font>', '<footer>', '<form>', '<frame>',
'<frameset>', '<h1>', '<h2>', '<h3>', '<h4>', '<h5>', '<h6>', '<head>', '<header>', '<hgroup>',
'<hr>', '<html>', '<i>', '<iframe>', '<image>', '<img>', '<input>', '<ins>', '<isindex>',
'<kbd>', '<keygen>', '<label>', '<legend>', '<li>', '<link>', '<listing>', '<main>', '<map>',
'<mark>', '<marquee>', '<menu>', '<menuitem>', '<meta>', '<meter>', '<nav>', '<nobr>', '<noembed>',
'<noframes>', '<noscript>', '<object>', '<ol>', '<optgroup>', '<option>', '<output>', '<p>',
'<param>', '<picture>', '<plaintext>', '<pre>', '<progress>', '<q>', '<rp>', '<rt>', '<rtc>',
'<ruby>', '<s>', '<samp>', '<script>', '<section>', '<select>', '<shadow>', '<small>',
'<source>', '<spacer>', '<span>', '<strike>', '<strong>', '<style>', '<sub>', '<summary>',
'<sup>', '<template>', '<textarea>', '<tfoot>', '<thead>', '<time>', '<title>', '<track>', '<tt>', '<u>',
'<ul>', '<var>', '<video>', '<wbr>', '<xmp>'
]
preserved_attr_list = ['colspan', 'rowspan'] # tag attributes that should be preserved
@staticmethod
def _remove_specified_tags(input_data: str, specified_tags: List):
"""Remove the specified HTML tags together with their attributes"""
html_tag_pattern = '|'.join(
map(lambda tag: rf'{re.escape(tag[:-1])}(\s[^>]*)?>|</{re.escape(tag[1:-1])}>', specified_tags))
cleaned_text = re.sub(html_tag_pattern, '', input_data, flags=re.IGNORECASE)
return cleaned_text
@staticmethod
def _remove_tag_attributes(input_data: str, preserved_attrs: List):
"""Strip attributes from HTML tags while keeping the specified ones"""
tag_pattern = r'<(\w+)(\s+[^<>]*?)?>'
attr_pattern = r'\s*(\w+)="([^"]+)"'
def __remove_unwanted_attrs(m):
def __remove_attrs(x):
if x.group(1) in preserved_attrs:
return x.group(0)
else:
return ''
return re.sub(attr_pattern, __remove_attrs, m.group(0))
cleaned_text = re.sub(tag_pattern, __remove_unwanted_attrs, input_data)
return cleaned_text
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
if sample[self.filetype_key] != "xml":
sample[self.text_key] = self._remove_html_tags(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: HtmlTagCleaner costs {time.time() - start:6f} s")
else:
logger.info(f"fileName: {sample[self.filename_key]}, method: HtmlTagCleaner, The file is xml!")
return sample
def _remove_html_tags(self, input_data: str):
# remove common HTML tags and their attributes (excluding <table>, <tbody>, <tr>, <td>, <th>)
cleaned_text = self._remove_specified_tags(input_data, self.tag_list)
# strip attributes inside the remaining tags (except colspan and rowspan), e.g. <td class="td8" rowspan="3"> -> <td rowspan="3">
cleaned_text = self._remove_tag_attributes(cleaned_text, self.preserved_attr_list)
return cleaned_text
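A short sketch of the tag-stripping behaviour, using the same pattern construction as _remove_specified_tags on the sample from the operator spec:

import re

tags = ['<p>', '<b>', '<a>']
pattern = '|'.join(rf'{re.escape(t[:-1])}(\s[^>]*)?>|</{re.escape(t[1:-1])}>' for t in tags)
html = '<p><b>机器学习</b>是<a href="/wiki/x" title="人工智能">人工智能</a>的一个分支。</p>'
print(re.sub(pattern, '', html, flags=re.IGNORECASE))  # 机器学习是人工智能的一个分支。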

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='AnonymizedIdNumber',
module_path="ops.mapper.id_number_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '身份证号匿名化'
name_en: 'ID Card Number Anonymization'
description: '身份证号匿名化。'
description_en: 'Anonymizes ID card numbers.'
language: 'python'
vendor: 'huawei'
raw_id: 'AnonymizedIdNumber'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '这个是身份证号110101190001011009'
after: '这个是身份证号<id>'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,116 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: ID card number anonymization operator
Create: 2024/12/5 15:43
"""
import re
import time
from datetime import datetime
from pathlib import Path
from typing import Dict, Any
from loguru import logger
import pytz
from datamate.core.base_op import Mapper
class AnonymizedIdNumber(Mapper):
def __init__(self, *args, **kwargs):
super(AnonymizedIdNumber, self).__init__(*args, **kwargs)
self.id_number_re_compile = self.get_id_number_re_compile()
self.id_coefficient = (7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2)
self.id_verification = ("1", "0", "X", "9", "8", "7", "6", "5", "4", "3", "2")
self.area_code_enum = self.load_code_list()
@staticmethod
def get_id_number_re_compile():
"""Build the compiled regex for Chinese ID card numbers"""
# A Chinese ID number has 18 characters: digits 1-2 province, 3-4 city, 5-6 county, 7-14 birth date, and the last character is a check digit; each part is strictly constrained
id_card_pattern = r'(?<=[^0-9])' \
r'((1[1-5]|2[1-3]|3[1-7]|4[1-6]|5[0-4]|6[1-5]|71|81|82)' \
r'(0[0-9]|1[0-9]|2[0-9]|3[0-4]|4[0-3]|5[1-3]|90)' \
r'(0[0-9]|1[0-9]|2[0-9]|3[0-9]|4[0-3]|5[1-7]|6[1-4]|7[1-4]|8[1-7])' \
r'(18|19|20)\d{2}(0[1-9]|1[0-2])(0[1-9]|[12][0-9]|3[01])' \
r'\d{3}[0-9xX])' \
r'(?=[^0-9xX])'
return re.compile(id_card_pattern)
@staticmethod
def load_code_list():
"""Load the administrative area code table"""
area_code_enum_path = str(Path(__file__).parent / 'resources' / 'area_code_enum.txt')
with open(area_code_enum_path, 'r', encoding='utf-8') as f:
area_code_list = set(f.read().splitlines())
return area_code_list
@staticmethod
def _verify_birthday_code(birthday_code: str):
"""Check whether the 8-digit birth date code is valid"""
year = int(birthday_code[:4])
month = int(birthday_code[4:6])
day = int(birthday_code[6:8])
date_string = "{}-{}-{}".format(year, month, day)
date_format = "%Y-%m-%d"
try:
# parse the date string into a datetime
date = datetime.strptime(date_string, date_format)
# localize to the Asia/Shanghai time zone
china_tz = pytz.timezone("Asia/Shanghai")
china_date = china_tz.localize(date)
# get the current time
current_date = datetime.now(china_tz)
# a birth date later than the current time is invalid
return china_date <= current_date
except ValueError:
return False
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._id_number_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: IDNumberCleaner costs {time.time() - start:6f} s")
return sample
def _verify_area_code(self, area_code: str):
"""Check whether the 6-digit area code is valid"""
return area_code in self.area_code_enum
def _verify_verification_code(self, id_number: str):
"""Verify the check digit of an ID card number"""
verify_num = id_number[-1]
# multiply the first 17 digits by the coefficients in self.id_coefficient and sum the products
id_sum = sum([int(num) * coe for num, coe in zip(id_number[:-1], self.id_coefficient)])
# the sum modulo 11 indexes self.id_verification, which must match the last character of the ID number
return verify_num.upper() == self.id_verification[id_sum % 11].upper()
def _verify_id_number(self, id_number: str):
"""Main validity check for an ID card number"""
return self._verify_verification_code(id_number) and \
self._verify_birthday_code(id_number[6:14]) and \
self._verify_area_code(id_number[:6])
def _verify_similar_id_number(self, id_number: str):
"""Loosely match ID-card-like strings without strict validity checks"""
if len(id_number) != 18:
return False
if not id_number[:17].isdigit():
return False
last_char = id_number[-1].upper()
return last_char in set('0123456789X')
def _id_number_filter(self, input_data: str):
"""Anonymize ID card numbers"""
input_data = ''.join(['\n', input_data, '\n'])  # newline sentinels (assumed padding) so the boundary lookarounds match; stripped by the final [1:-1]
# extract substrings that match the ID card regex
id_nums = [item.group(1) for item in self.id_number_re_compile.finditer(input_data)]
# check whether each extracted string is a genuine ID number
for id_num in id_nums:
if self._verify_id_number(id_num) or self._verify_similar_id_number(id_num):
# replace valid ID numbers with <id>
id_num_pattern = r"(?<=[^0-9]){}(?=[^0-9xX])".format(id_num)
input_data = re.compile(id_num_pattern).sub("<id>", input_data)
return input_data[1:-1]
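For reference, a standalone sketch of the check-digit computation used by _verify_verification_code, applied to the sample number from the operator spec (110101190001011009):

coefficients = (7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2)
check_chars = ("1", "0", "X", "9", "8", "7", "6", "5", "4", "3", "2")
id_number = "110101190001011009"
total = sum(int(d) * c for d, c in zip(id_number[:17], coefficients))
print(check_chars[total % 11] == id_number[-1].upper())  # True, so the operator would replace it with <id>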

File diff suppressed because it is too large.

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgDenoise',
module_path="ops.mapper.img_denoise.process")

View File

@@ -0,0 +1,17 @@
name: '图片噪点去除'
name_en: 'Image Noise Removal'
description: '去除图片中的噪点,主要适用于自然场景。'
description_en: 'Removes noises from images, which is mainly applicable to natural
scenery image scenarios.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgDenoise'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,60 @@
# -- encoding: utf-8 --
"""
Description: image noise removal operator
Create: 2025/01/17
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgDenoise(Mapper):
def __init__(self, *args, **kwargs):
super(ImgDenoise, self).__init__(*args, **kwargs)
self._denoise_threshold = kwargs.get("denoise_threshold", 8)
@staticmethod
def _denoise_image(data: object):
"""Apply median-blur denoising"""
return cv2.medianBlur(data, 3)
def execute(self, sample: Dict[str, Any]):
start = time.time()
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
if img_bytes:
data = bytes_transform.bytes_to_numpy(img_bytes)
denoise_images = self._denoise_images_filter(data, file_name)
sample[self.data_key] = bytes_transform.numpy_to_bytes(denoise_images, file_type)
logger.info(f"fileName: {file_name}, method: ImgDenoise costs {time.time() - start:6f} s")
return sample
def _denoise_images_filter(self, ori_img, file_name):
# get a denoised copy of the original image
clean_data = self._denoise_image(ori_img)
# resize both images to the same size so they can be compared
ori = cv2.resize(ori_img, (112, 112))
dst = cv2.resize(clean_data, (112, 112))
# signal power of the original image (cast to float to avoid uint8 overflow)
signal = np.sum(ori.astype(np.float64) ** 2)
# noise power: squared difference between the original and the denoised image
noise = np.sum((ori.astype(np.float64) - dst.astype(np.float64)) ** 2)
# signal-to-noise ratio (SNR) of the image
snr = 10 * np.log10(signal / noise)
# images whose SNR falls below the threshold are replaced with the denoised copy
if snr < self._denoise_threshold:
logger.info(f"The image SNR is {snr}, which is below the threshold of "
f"{self._denoise_threshold}. {file_name} is denoised.")
return clean_data
return ori_img
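The filtering criterion above is the usual SNR in decibels, 10·log10(signal/noise); a tiny hedged sketch with synthetic data:

import numpy as np

ori = np.full((112, 112), 120.0)               # hypothetical flat image
dst = ori + np.random.normal(0, 5, ori.shape)  # denoised copy that differs slightly
snr = 10 * np.log10(np.sum(ori ** 2) / np.sum((ori - dst) ** 2))
print(f"snr = {snr:.1f} dB")  # around 27-28 dB here; values below the threshold of 8 would trigger denoising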

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgDirectionCorrect',
module_path="ops.mapper.img_direction_correct.process")

View File

@@ -0,0 +1,38 @@
# -- encoding: utf-8 --
import gc
import os
from pathlib import Path
from argparse import Namespace
class BaseModel:
def __init__(self, model_type='vertical'):
models_path = os.getenv("MODELS_PATH", "/home/models")
self.resources_path = str(Path(models_path, 'img_direction_correct', 'resources'))
args = Namespace()
args.cls_image_shape = '3, 224, 224'
args.cls_batch_num = 6
args.cls_thresh = 0.9
args.use_onnx = False
args.use_gpu = False
args.use_npu = False
args.use_xpu = False
args.enable_mkldnn = False
if model_type == 'vertical':
args.cls_model_dir = str(Path(self.resources_path, 'vertical_model'))
self.model_name = 'vertical model to detect image 0 or 90 rotated'
args.label_list = ['0', '90']
else:
args.cls_model_dir = str(Path(self.resources_path, 'standard_model'))
self.model_name = 'standard model to detect image 0 or 180 rotated'
args.label_list = ['0', '180']
from paddleocr.tools.infer.predict_cls import TextClassifier
self.infer = TextClassifier(args)
def __del__(self):
del self.infer
gc.collect()

View File

@@ -0,0 +1,17 @@
name: '图片方向校正'
name_en: 'Image Orientation Correction'
description: '将含有文字的图片校正到文字水平方向,主要适用于文档场景。'
description_en: 'Corrects images to ensure text is presented horizontally, which is
mainly applicable to document scenarios.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgDirectionCorrect'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,139 @@
# -- encoding: utf-8 --
"""
Description: image orientation correction operator
Create: 2024/1/30 9:26
"""
import math
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
from .base_model import BaseModel
class ImgDirectionCorrect(Mapper):
def __init__(self, *args, **kwargs):
super(ImgDirectionCorrect, self).__init__(*args, **kwargs)
self.img_resize = 1000
self.limit_size = 30000
self.use_model = True
self.vertical_model, self.standard_model = self.get_model(*args, **kwargs)
@staticmethod
def _detect_angle(img):
"""Detect the skew angle of the image"""
# convert to a single grayscale channel, e.g. [[255 255],[255 255]]
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# invert black and white
gray = cv2.bitwise_not(gray)
# binarize
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# find the rows and columns of all non-zero pixels
ys, xs = np.where(thresh > 0)
# stack them into coordinates of the non-zero pixels, e.g. [[306 37][306 38][307 38]]
coords = np.column_stack([xs, ys])
# minimum-area bounding rectangle: returns (center, (width, height), angle)
rect = cv2.minAreaRect(coords)
# minAreaRect returns an angle in [0, 90], measured against the nearest axis; there is room for optimization here
# an angle below 45 degrees needs less padding, which helps recognition accuracy
angle = rect[-1] # the last element is the angle
# at most 45 degrees: return the angle as-is (counter-clockwise rotation)
if angle <= 45.0:
return angle
# above 45 degrees: rotate clockwise, i.e. return angle - 90
return angle - 90
@staticmethod
def _detect_direction(image, file_name, model):
"""
Args:
image: the image to classify
file_name: file name
model: the classifier to use, vertical_model or standard_model
Returns: the rotated image
"""
# cls_res is the model prediction, formatted like [('90', 0.9815167)]
_, cls_res, _ = model.infer([image])
rotate_angle = int(cls_res[0][0])
pro = float(cls_res[0][1])
logger.info(
f"fileName: {file_name}, model {model.model_name} detect result is {rotate_angle} with confidence {pro}")
if rotate_angle == 90 and pro > 0.89:
return cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)
if rotate_angle == 180 and pro > 0.89:
return cv2.rotate(image, cv2.ROTATE_180)
return image
@staticmethod
def _rotate_bound(image, angle):
"""Rotate the image to compensate for the detected skew angle
Args:
image: the image to process
angle: the skew angle detected by _detect_angle
"""
if angle == 0.0:
return image
# get height and width
h, w = image.shape[:2]
sinval = math.fabs(math.sin(angle))
cosval = math.fabs(math.cos(angle))
dx = max(int((w * cosval + h * sinval - w) / 2), 0)
dy = max(int((w * sinval + h * cosval - h) / 2), 0)
dst_img = cv2.copyMakeBorder(image, dy, dy, dx, dx, cv2.BORDER_CONSTANT, value=(255, 255, 255))
h, w = dst_img.shape[:2]
rotated_matrix = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1.0)
dst_img = cv2.warpAffine(dst_img, rotated_matrix, (w, h), borderValue=(255, 255, 255))
return dst_img
def init_model(self, *args, **kwargs):
return BaseModel(model_type='vertical'), BaseModel(model_type='standard')
def execute(self, sample: Dict[str, Any]):
start = time.time()
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
img_bytes = sample[self.data_key]
if img_bytes:
data = bytes_transform.bytes_to_numpy(img_bytes)
correct_data = self._img_direction_correct(data, file_name, self.vertical_model, self.standard_model)
sample[self.data_key] = bytes_transform.numpy_to_bytes(correct_data, file_type)
logger.info(f"fileName: {file_name}, method: ImgDirectionCorrect costs {time.time() - start:6f} s")
return sample
def _img_direction_correct(self, img, file_name, vertical_model, standard_model):
height, width = img.shape[:2]
if max(height, width) > self.limit_size:
logger.info(
f"fileName: {file_name}, method: ImgDirectionCorrect cannot process images whose longest side exceeds 30000 pixels")
return img
detect_angle_img = self._resize(img)
# detect the skew angle
angle = self._detect_angle(detect_angle_img)
# deskew the image so its remaining rotation is one of 0, 90, 180 or 270 degrees
rotated_img = self._rotate_bound(img, angle)
# vertical/horizontal classification: a binary model detects 0 vs 90 degrees, leaving a 0/180 image
rotated_img = self._detect_direction(rotated_img, file_name, vertical_model)
# 0/180 classification: a binary model detects 0 vs 180 degrees and turns the image upright
rotated_img = self._detect_direction(rotated_img, file_name, standard_model)
return rotated_img
def _resize(self, image):
height, width = image.shape[:2] # height and width of the original image
temp = max(height, width)
# if the longest side exceeds the limit, shrink the image; otherwise return it unchanged
if temp >= self.img_resize:
mul_temp = temp / self.img_resize
if height > width:
return cv2.resize(image, (int(width / mul_temp), self.img_resize), interpolation=cv2.INTER_AREA)
elif height < width:
return cv2.resize(image, (self.img_resize, int(height / mul_temp)), interpolation=cv2.INTER_AREA)
else:
return cv2.resize(image, (self.img_resize, self.img_resize), interpolation=cv2.INTER_AREA)
return image

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgBrightness',
module_path="ops.mapper.img_enhanced_brightness.process")

View File

@@ -0,0 +1,16 @@
name: '图片亮度增强'
name_en: 'Image Brightness Enhancement'
description: '自适应调节图片的亮度。'
description_en: 'Adapts and adjusts image brightness.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgBrightness'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,100 @@
# -- encoding: utf-8 --
"""
Description: image brightness enhancement operator.
Create: 2025/01/13
"""
import time
from typing import Dict, Any
import numpy as np
import cv2
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgBrightness(Mapper):
"""Adaptive image brightness enhancement"""
def __init__(self, *args, **kwargs):
super(ImgBrightness, self).__init__(*args, **kwargs)
# adaptive enhancement parameters
self.factor_threshold = 1.1 # lower bound of the enhancement factor (not exposed as a parameter)
self.standard_mean = 140 # target mean brightness after enhancement (not exposed as a parameter)
self.gamma = 1.5 # gamma coefficient for gamma correction; values above 1 brighten the image, values below 1 darken it (not exposed as a parameter)
self.brightness_upper_bound = 0.35 # upper bound for non-linear brightening: above this ratio, linear brightening is used instead (not exposed as a parameter)
self.eps = 1 # small epsilon that prevents division by zero on all-black images when computing the brightness factor (not exposed as a parameter)
@staticmethod
def _get_grey_mean(src: np.ndarray):
gray_image = cv2.cvtColor(src, cv2.COLOR_BGR2GRAY)
return np.mean(gray_image)
@staticmethod
def _return_gamma_table(gamma):
"""Return the lookup table for gamma correction"""
scale = np.power(255, 1 - gamma).astype(np.float64)
return np.power(np.arange(256), gamma) * scale
@staticmethod
def _return_linear_table(factor):
"""Return the lookup table for the linear transform"""
linear_table = np.arange(256) * factor
return np.clip(linear_table, 0, 255).astype(np.uint8)
def enhance_brightness_linear(self, image_data: np.ndarray, file_name):
average_brightness = self._get_grey_mean(image_data)
brightness_factor = self.standard_mean / (average_brightness + self.eps)
# the image is already bright enough; no enhancement needed
if brightness_factor <= 1:
logger.info(f"fileName: {file_name}, method: ImgBrightness not need enhancement")
return image_data
brightness_factor = max(brightness_factor, self.factor_threshold)
linear_table = ImgBrightness._return_linear_table(brightness_factor)
cv2.LUT(image_data, linear_table, dst=image_data)
return image_data
def enhance_brightness(self, image_data: np.ndarray, file_name):
'''
Adaptive brightness enhancement.
Args:
image_data: image as an np.ndarray
(the gamma factor, commonly 1.5 in practice, is stored as a member variable rather than passed as an argument)
Returns:
the brightness-enhanced image
'''
# compute the average brightness of the image
average_brightness = self._get_grey_mean(image_data)
# apply gamma correction when the image is dark enough
if average_brightness / 255 <= self.brightness_upper_bound:
# precompute the lookup table
gamma_table = ImgBrightness._return_gamma_table(1 / self.gamma).astype(np.uint8)
cv2.LUT(image_data, gamma_table, dst=image_data)
# otherwise the brightness exceeds the bound for non-linear adjustment, so use linear adjustment
else:
image_data = self.enhance_brightness_linear(image_data, file_name)
return image_data
def execute(self, sample: Dict[str, Any]):
start = time.time()
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
if img_bytes:
# enhance the image
img_data = bytes_transform.bytes_to_numpy(img_bytes)
img_data = self.enhance_brightness(img_data, file_name)
sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
logger.info(f"fileName: {file_name}, method: ImgBrightness costs {time.time() - start:6f} s")
return sample
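A brief sketch of the gamma lookup table used above, output = input^g · 255^(1 - g); with g = 1/1.5 dark pixels are lifted much more than bright ones:

import numpy as np

g = 1 / 1.5
gamma_table = (np.power(np.arange(256), g) * np.power(255, 1 - g)).astype(np.uint8)
print(gamma_table[[0, 64, 128, 255]])  # dark inputs are raised the most, while 255 maps to 255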

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgContrast',
module_path="ops.mapper.img_enhanced_contrast.process")

View File

@@ -0,0 +1,16 @@
name: '图片对比度增强'
name_en: 'Image Contrast Enhancement'
description: '自适应调节图片的对比度。'
description_en: 'Adapts and adjusts the image contrast.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgContrast'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,71 @@
# -- encoding: utf-8 --
"""
Description: adaptive image contrast enhancement
Version:
Create: 2025/01/13
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgContrast(Mapper):
"""Adaptive image contrast enhancement"""
def __init__(self, *args, **kwargs):
super(ImgContrast, self).__init__(*args, **kwargs)
# adaptive enhancement parameters
self.clip_limit = 2 # contrast limiting threshold; larger values give a stronger enhancement (not exposed as a parameter)
self.tile_grid = 16 # grid size for tiling the image; smaller grids give a more local equalization (not exposed as a parameter)
self.standard_mean = 100 # target mean contrast after enhancement (not exposed as a parameter)
self.eps = 0.5 # small epsilon that prevents division by zero on all-black images when computing the contrast factor (not exposed as a parameter)
@staticmethod
def _get_contrast(image: np.ndarray):
"""Compute the mean standard deviation over all channels"""
_, stddev = cv2.meanStdDev(image)
contrast_std = np.mean(stddev)
return contrast_std
def enhance_contrast(self, image_data: np.ndarray, file_name):
"""Adaptive contrast enhancement"""
contrast_std = self._get_contrast(image_data)
contrast_factor = self.standard_mean / (contrast_std + self.eps)
# contrast is already high enough; no enhancement needed
if contrast_factor <= 1:
logger.info(f"fileName: {file_name}, method: ImgContrast not need enhancement")
return image_data
# convert the colour image to the Lab colour space
cv2.cvtColor(image_data, cv2.COLOR_BGR2Lab, dst=image_data)
# adjust contrast with CLAHE (contrast limited adaptive histogram equalization)
clahe = cv2.createCLAHE(clipLimit=self.clip_limit, tileGridSize=(self.tile_grid, self.tile_grid))
image_data[:, :, 0] = clahe.apply(image_data[:, :, 0])
# convert the enhanced Lab image back to BGR
cv2.cvtColor(image_data, cv2.COLOR_Lab2BGR, dst=image_data)
return image_data
def execute(self, sample: Dict[str, Any]):
start = time.time()
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
if img_bytes:
# enhance the image
img_data = bytes_transform.bytes_to_numpy(img_bytes)
img_data = self.enhance_contrast(img_data, file_name)
sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
logger.info(f"fileName: {file_name}, method: ImgContrast costs {time.time() - start:6f} s")
return sample

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgSaturation',
module_path="ops.mapper.img_enhanced_saturation.process")

View File

@@ -0,0 +1,17 @@
name: '图片饱和度增强'
name_en: 'Image Saturation Enhancement'
description: '自适应调节图片的饱和度,主要适用于自然场景图片。'
description_en: 'Adapts and adjusts the saturation of images, which is mainly applicable
to natural scenery image scenarios.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgSaturation'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,81 @@
# -- encoding: utf-8 --
"""
Description: adaptive image saturation enhancement
Version:
Create: 2025/01/13
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgSaturation(Mapper):
"""Adaptive image saturation enhancement"""
def __init__(self, *args, **kwargs):
super(ImgSaturation, self).__init__(*args, **kwargs)
# adaptive enhancement parameters
self.factor_threshold = 1.1 # lower bound of the enhancement factor (not exposed as a parameter)
self.standard_mean = 130 # target mean saturation after enhancement (not exposed as a parameter)
self.eps = 1 # small epsilon that prevents division by zero on all-black images when computing the saturation factor (not exposed as a parameter)
self.zeros_ratio_threshold = 0.1 # share of zero values allowed in the saturation channel; guards against processing near-grayscale images
self.red_channel_threshold = 140 # red channel threshold used to cap the saturation factor
def enhance_saturation(self, image_data: np.ndarray, file_name):
"""Adaptive saturation enhancement"""
# convert the image to the HSV colour space
image_hsv = cv2.cvtColor(image_data, cv2.COLOR_BGR2HSV)
s_channel = image_hsv[:, :, 1].copy()
del image_hsv
# extract the saturation channel
# for a normal RGB image the share of zero-valued saturation pixels should be below 0.1; above that the image is effectively grayscale
zero_s_ratio = np.count_nonzero(s_channel == 0) / s_channel.size
if zero_s_ratio <= self.zeros_ratio_threshold:
saturation_channel = s_channel
# an RGB image converted from grayscale has an all-zero S channel in HSV
else:
return image_data
# saturation statistics
saturation_mean = np.mean(saturation_channel)
saturation_factor = self.standard_mean / (saturation_mean + self.eps)
# saturation is already high enough; no enhancement needed
if saturation_factor <= 1:
logger.info(f"fileName: {file_name}, method: ImgSaturation not need enhancement")
return image_data
# compute the mean of the red channel; if it is too high, cap the saturation factor, otherwise the image turns reddish and colours distort
red_channel_mean = np.mean(image_data[:, :, 2])
if red_channel_mean >= self.red_channel_threshold:
saturation_factor = min(saturation_factor, 1.5)
else:
saturation_factor = max(saturation_factor, self.factor_threshold)
degrade_image = cv2.cvtColor(image_data, cv2.COLOR_BGR2GRAY)
degrade_image = cv2.cvtColor(degrade_image, cv2.COLOR_GRAY2BGR)
cv2.addWeighted(image_data, saturation_factor, degrade_image, 1 - saturation_factor, 0, dst=image_data)
return image_data
def execute(self, sample: Dict[str, Any]):
start = time.time()
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
if img_bytes:
# enhance the image
img_data = bytes_transform.bytes_to_numpy(img_bytes)
img_data = self.enhance_saturation(img_data, file_name)
sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
logger.info(f"fileName: {file_name}, method: ImgSaturation costs {time.time() - start:6f} s")
return sample

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgSharpness',
module_path="ops.mapper.img_enhanced_sharpness.process")

View File

@@ -0,0 +1,17 @@
name: '图片锐度增强'
name_en: 'Image Sharpness Enhancement'
description: '自适应调节图片的锐度,主要适用于自然场景图片。'
description_en: 'Adapts and adjusts the image sharpness, which is mainly applicable
to natural scenery image scenarios.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgSharpness'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,69 @@
# -- encoding: utf-8 --
"""
Description: adaptive image sharpness enhancement
Version:
Create: 2025/01/13
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgSharpness(Mapper):
"""Adaptive image sharpness enhancement"""
def __init__(self, *args, **kwargs):
super(ImgSharpness, self).__init__(*args, **kwargs)
# adaptive enhancement parameters
self.factor_threshold = 1.1 # lower bound of the enhancement factor (not exposed as a parameter)
self.standard_mean = 100 # target mean sharpness after enhancement (not exposed as a parameter)
self.kernel = self._init_kernel()
self.eps = 1 # small epsilon that prevents division by zero on all-black images when computing the sharpness factor (not exposed as a parameter)
@classmethod
def _init_kernel(cls):
kernel = np.array([[1, 1, 1],
[1, 5, 1],
[1, 1, 1]])
# normalize the convolution kernel
kernel = kernel / np.sum(kernel)
return kernel
def enhance_sharpness(self, image_data: np.ndarray, file_name):
"""Adaptive sharpness enhancement"""
# convert the image to grayscale and measure sharpness via the Laplacian
image_gray = cv2.cvtColor(image_data, cv2.COLOR_BGR2GRAY)
sharpness = np.abs(cv2.Laplacian(image_gray, cv2.CV_8U)).mean()
sharpness_factor = self.standard_mean / (sharpness + self.eps)
# the image is already sharp enough; no enhancement needed
if sharpness_factor <= 1:
logger.info(f"fileName: {file_name}, method: ImgSharpness not need enhancement")
return image_data
filtered_img = cv2.filter2D(image_data, -1, self.kernel)
cv2.addWeighted(image_data, sharpness_factor, filtered_img, 1.0 - sharpness_factor, 0, dst=image_data)
return image_data
def execute(self, sample: Dict[str, Any]):
start = time.time()
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
if img_bytes:
# enhance the image
img_data = bytes_transform.bytes_to_numpy(img_bytes)
img_data = self.enhance_sharpness(img_data, file_name)
sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
logger.info(f"fileName: {file_name}, method: ImgSharpness costs {time.time() - start:6f} s")
return sample

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgPerspectiveTransformation',
module_path="ops.mapper.img_perspective_transformation.process")

View File

@@ -0,0 +1,17 @@
name: '图片透视变换'
name_en: 'Image Perspective Transformation'
description: '自适应校正图片的视角,主要适用于文档校正场景。'
description_en: 'Adapts and corrects image perspectives, which is mainly applicable
to document correction scenarios.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgPerspectiveTransformation'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,147 @@
# -- encoding: utf-8 --
"""
Description: image perspective transformation operator
Create: 2025/01/16
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgPerspectiveTransformation(Mapper):
"""Image perspective transformation operator"""
def __init__(self, *args, **kwargs):
super(ImgPerspectiveTransformation, self).__init__(*args, **kwargs)
self.transform_utils = PerspectiveTransformationUtils()
def execute(self, sample: Dict[str, Any]):
start = time.time()
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
if img_bytes:
img_data = bytes_transform.bytes_to_numpy(img_bytes)
transform_img = self._transform_img(img_data, file_name)
sample[self.data_key] = bytes_transform.numpy_to_bytes(transform_img, file_type)
logger.info(f"fileName: {file_name}, method: ImgPerspectiveTransformation costs {time.time() - start:6f} s")
return sample
def _transform_img(self, image, file_name):
original_img = image
ratio = 900 / image.shape[0]
# resize to a fixed height
img_resize = self.transform_utils.resize_img(image)
# edge detection
binary_img = self.transform_utils.get_canny(img_resize)
# find the largest contour
max_contour, max_area = self.transform_utils.find_max_contour(binary_img)
if not max_contour.size:
return original_img
# fit a polygon to the convex hull and take its four corners
boxes = self.transform_utils.get_box_point(max_contour)
if len(boxes) == 4:
boxes = self.transform_utils.get_adapt_point(boxes, ratio)
boxes = self.transform_utils.order_points(boxes)
warped = self.transform_utils.get_warp_image(image, boxes)
logger.info(f"fileName: {file_name}, method: ImgPerspectiveTransformation. "
"This picture is transformed by perspective.")
return warped
return original_img
class PerspectiveTransformationUtils:
"""Utility class for image perspective transformation"""
@staticmethod
def resize_img(image, height=900):
"""Resize the image to a fixed height"""
h, w = image.shape[:2]
pro = height / h
size = (int(w * pro), int(height))
img_resize = cv2.resize(image, size)
return img_resize
@staticmethod
def get_canny(image):
"""Edge detection"""
# Gaussian blur
binary = cv2.GaussianBlur(image, (3, 3), 2, 2)
# Canny edge detection
binary = cv2.Canny(binary, 60, 240, apertureSize=3)
# dilate to close gaps in the edges
kernel = np.ones((3, 3), np.uint8)
binary = cv2.dilate(binary, kernel, iterations=1)
return binary
@staticmethod
def find_max_contour(image):
"""Find the contour with the largest area"""
# find external contours
contours, _ = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
# keep the contour with the largest area
max_area = 0.0
max_contour = np.array([])
for contour in contours:
current_area = cv2.contourArea(contour)
if current_area > max_area:
max_area = current_area
max_contour = contour
return max_contour, max_area
@staticmethod
def get_box_point(contour):
"""Approximate the convex hull of the contour with a polygon and return its vertices"""
# convex hull of the contour
hull = cv2.convexHull(contour)
epsilon = 0.02 * cv2.arcLength(contour, True)
approx = cv2.approxPolyDP(hull, epsilon, True)
approx = approx.reshape((len(approx), 2))
return approx
@staticmethod
def get_adapt_point(box, pro):
"""Scale the quadrilateral points back to the original image size"""
box_pro = box
if pro != 1.0:
box_pro = box / pro
box_pro = np.trunc(box_pro)
return box_pro
@staticmethod
def order_points(pts):
"""Order the quadrilateral vertices as [top-left, top-right, bottom-right, bottom-left]"""
rect = np.zeros((4, 2), dtype="float32")
s = pts.sum(axis=1)
rect[0] = pts[np.argmin(s)]
rect[2] = pts[np.argmax(s)]
diff = np.diff(pts, axis=1)
rect[1] = pts[np.argmin(diff)]
rect[3] = pts[np.argmax(diff)]
return np.intp(rect)
@staticmethod
def compute_point_distance(a, b):
"""Euclidean distance between two points (used as width/height)"""
return int(np.sqrt(np.sum(np.square(a - b))))
def get_warp_image(self, image, box):
"""Apply the perspective transform"""
w, h = self.compute_point_distance(box[0], box[1]), \
self.compute_point_distance(box[1], box[2])
dst_rect = np.array([[0, 0],
[w - 1, 0],
[w - 1, h - 1],
[0, h - 1]], dtype='float32')
box = np.array(box, dtype='float32')
matrix = cv2.getPerspectiveTransform(box, dst_rect)
warped = cv2.warpPerspective(image, matrix, (w, h))
return warped
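A condensed sketch of the final warp step (order the corners, then map them onto an axis-aligned rectangle with getPerspectiveTransform); the corner coordinates are made up:

import cv2
import numpy as np

box = np.array([[10, 8], [410, 20], [400, 300], [5, 290]], dtype='float32')  # tl, tr, br, bl (hypothetical)
w = int(np.linalg.norm(box[0] - box[1]))
h = int(np.linalg.norm(box[1] - box[2]))
dst = np.array([[0, 0], [w - 1, 0], [w - 1, h - 1], [0, h - 1]], dtype='float32')
matrix = cv2.getPerspectiveTransform(box, dst)
image = np.zeros((320, 420, 3), dtype=np.uint8)  # stand-in for the input image
warped = cv2.warpPerspective(image, matrix, (w, h))
print(warped.shape)  # (h, w, 3)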

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgResize',
module_path="ops.mapper.img_resize.process")

View File

@@ -0,0 +1,35 @@
name: '图片重采样'
name_en: 'Image Resampling'
description: '将图片放大或缩小到指定像素。'
description_en: 'Zooms in or out images to specified pixels.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgResize'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'
settings:
targetSize:
name: 重采样尺寸
type: multiple
properties:
- type: inputNumber
name: 宽度
description: 像素
defaultVal: 256
min: 1
max: 4096
step: 1
- type: inputNumber
name: 高度
description: 像素
defaultVal: 256
min: 1
max: 4096
step: 1

View File

@@ -0,0 +1,40 @@
# -- encoding: utf-8 --
"""
Description: image resampling operator
Create: 2025/01/16
"""
import time
from typing import List, Dict, Any
from loguru import logger
import cv2
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgResize(Mapper):
def __init__(self, *args, **kwargs):
super(ImgResize, self).__init__(*args, **kwargs)
self._target_size = kwargs.get("targetSize", [256, 256])
@classmethod
def _img_resize(cls, data: List[float], target_size: List[int]) -> List[float]:
"""Resize the image to the specified target size"""
target_width = max(min(target_size[0], 4096), 1)
target_height = max(min(target_size[1], 4096), 1)
resized_img = cv2.resize(data, (target_width, target_height), interpolation=cv2.INTER_AREA)
return resized_img
def execute(self, sample: Dict[str, Any]):
start = time.time()
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
img_bytes = sample[self.data_key]
if img_bytes:
data = bytes_transform.bytes_to_numpy(img_bytes)
resized_img = self._img_resize(data, self._target_size)
sample[self.data_key] = bytes_transform.numpy_to_bytes(resized_img, file_type)
logger.info(f"fileName: {file_name}, method: ImgResize costs {time.time() - start:6f} s")
return sample

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgShadowRemove',
module_path="ops.mapper.img_shadow_remove.process")

View File

@@ -0,0 +1,17 @@
name: '图片阴影去除'
name_en: 'Image Shadow Removal'
description: '去除图片中的阴影,主要适用于文档场景。'
description_en: 'Removes shadows from images, which is mainly applicable to document
scenarios.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgShadowRemove'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,72 @@
# -- encoding: utf-8 --
"""
Description: image shadow removal operator
Create: 2025/01/16
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgShadowRemove(Mapper):
"""Image shadow removal"""
def __init__(self, *args, **kwargs):
super(ImgShadowRemove, self).__init__(*args, **kwargs)
self.iter_nums = 9 # number of closing iterations (not exposed as a parameter)
self.k_size = 3 # kernel size
self.clip_limit = 2 # contrast limiting threshold; larger values give a stronger effect
self.tile_grid = 8 # grid size for tiling the image; smaller values give a more local effect
def shadow_removed(self, image_data: np.ndarray):
'''
Remove shadows from the image.
Args:
image_data: image as an np.ndarray
Returns:
the image with shadows removed
'''
# build the kernel and apply a morphological closing
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (self.k_size, self.k_size))
closing = cv2.morphologyEx(image_data, cv2.MORPH_CLOSE, kernel, iterations=self.iter_nums)
# compute ~(closing - original)
cv2.bitwise_not(closing - image_data, dst=closing)
cv2.cvtColor(closing, cv2.COLOR_BGR2Lab, dst=closing)
# take the lightness channel of the processed image
img_l = cv2.split(closing)[0]
del closing
# after adjusting img_l, it will replace the lightness channel of the original image
cv2.cvtColor(image_data, cv2.COLOR_BGR2Lab, dst=image_data)
# create the CLAHE object
clahe = cv2.createCLAHE(clipLimit=self.clip_limit, tileGridSize=(self.tile_grid, self.tile_grid))
# apply CLAHE to the lightness channel
image_data[:, :, 0] = clahe.apply(img_l)
del img_l
cv2.cvtColor(image_data, cv2.COLOR_Lab2BGR, dst=image_data)
return image_data
def execute(self, sample: Dict[str, Any]):
start = time.time()
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
if img_bytes:
# 进行阴影去除
img_data = bytes_transform.bytes_to_numpy(img_bytes)
img_data = self.shadow_removed(img_data)
sample[self.data_key] = bytes_transform.numpy_to_bytes(img_data, file_type)
logger.info(f"fileName: {file_name}, method: ImageShadowRemove costs {time.time() - start:6f} s")
return sample
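
The same pipeline, sketched standalone to make the three steps visible: a morphological closing estimates the shadow layer, bitwise_not(closing - image) flattens the illumination, and CLAHE on the Lab lightness channel restores local contrast. The input path is hypothetical:

import cv2

img = cv2.imread("scanned_page.jpg")            # hypothetical input path
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
closing = cv2.morphologyEx(img, cv2.MORPH_CLOSE, kernel, iterations=9)
flat = cv2.bitwise_not(closing - img)           # shadow-flattened image
img_l = cv2.split(cv2.cvtColor(flat, cv2.COLOR_BGR2Lab))[0]
lab = cv2.cvtColor(img, cv2.COLOR_BGR2Lab)
clahe = cv2.createCLAHE(clipLimit=2, tileGridSize=(8, 8))
lab[:, :, 0] = clahe.apply(img_l)               # replace the lightness channel
cv2.imwrite("scanned_page_no_shadow.jpg", cv2.cvtColor(lab, cv2.COLOR_Lab2BGR))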

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgTypeUnify',
module_path="ops.mapper.img_type_unify.process")

View File

@@ -0,0 +1,30 @@
name: '图片格式转换'
name_en: 'Image Format Conversion'
description: '将图片编码格式统一为jpg、jpeg、png、bmp格式'
description_en: 'Converts image formats to JPG, JPEG, PNG, or BMP.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgTypeUnify'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'
settings:
imgType:
name: 图片编码格式
type: select
defaultVal: jpg
options:
- label: jpg
value: jpg
- label: png
value: png
- label: jpeg
value: jpeg
- label: bmp
value: bmp

View File

@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
"""
Description: 图片格式转换插件
Create: 2025/01/16
"""
import re
import time
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Mapper
class ImgTypeUnify(Mapper):
def __init__(self, *args, **kwargs):
super(ImgTypeUnify, self).__init__(*args, **kwargs)
"""勾选图片编码格式统一,未输入参数时,默认设置为jpg格式"""
self._setting_type = kwargs.get("imgType", "jpg")
def execute(self, sample):
start = time.time()
file_name = sample[self.filename_key]
origin_file_type = sample[self.filetype_key]
if origin_file_type == self._setting_type:
# 原文件格式与目标文件编码格式一致,无需处理
return sample
file_path = sample[self.filepath_key]
# 读取图片
img_bytes = sample[self.data_key]
if img_bytes:
origin_data = bytes_transform.bytes_to_numpy(img_bytes)
# 按指定编码格式转字节
sample[self.data_key] = bytes_transform.numpy_to_bytes(origin_data, "." + self._setting_type)
            # 修改meta数据:将文件名和路径中原有的扩展名替换为目标格式
            sample[self.filetype_key] = self._setting_type
            sample[self.filename_key] = re.sub(origin_file_type + "$", self._setting_type, file_name)
            sample[self.filepath_key] = re.sub(origin_file_type + "$", self._setting_type, file_path)
logger.info(f"fileName: {file_name}, method: ImgTypeUnify costs {time.time() - start:6f} s")
return sample
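
A standalone re-encoding sketch, assuming bytes_transform wraps cv2.imdecode/imencode; the file paths are hypothetical:

import cv2
import numpy as np

with open("photo.png", "rb") as f:                         # hypothetical input
    img = cv2.imdecode(np.frombuffer(f.read(), np.uint8), cv2.IMREAD_COLOR)
ok, encoded = cv2.imencode(".jpg", img)                    # target format given as extension
if ok:
    with open("photo.jpg", "wb") as f:                     # hypothetical output
        f.write(encoded.tobytes())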

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgWatermarkRemove',
module_path="ops.mapper.img_watermark_remove.process")

View File

@@ -0,0 +1,26 @@
name: '图片水印去除'
name_en: 'Image Watermark Removal'
description: '去除图片中的“知乎”和“抖音”水印。'
description_en: 'Removes the 知乎 and 抖音 watermarks from images.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgWatermarkRemove'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'
settings:
watermarkStr:
name: 需要去除的水印文字信息
type: checkbox
defaultVal: '知乎,抖音'
options:
- label: 知乎
value: 知乎
- label: 抖音
value: 抖音

View File

@@ -0,0 +1,160 @@
# -*- coding: utf-8 -*-
"""
Description: 图片水印去除插件
Create: 2025/01/06
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_to_numpy
from datamate.common.utils import numpy_to_bytes
from datamate.core.base_op import Mapper
from .watermark_ocr_model import WatermarkOcrModel
DEFAULT_MAX_CHARACTERS = 10
DEFAULT_BINARY_THRESHOLD_LOW = 200
class ImgWatermarkRemove(Mapper):
use_model = True
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.remove_str = kwargs.get("watermarkStr", "知乎,抖音")
self.ocr_model = self.get_model(*args, **kwargs)
@staticmethod
def _has_kw(result_list, kw_list):
"""
图片是否包含目标水印,返回匹配到的文字列表
"""
result_str_list = []
for line in result_list:
for kw in kw_list:
if kw in line[1][0]:
result_str_list.append(line[1][0])
break
return result_str_list
@staticmethod
def _overlay_mask(background_img, prospect_img, img_over_x, img_over_y):
back_r, back_c, _ = background_img.shape # 背景图像行数、列数
is_x_direction_failed = img_over_x > back_c or img_over_x < 0
is_y_direction_failed = img_over_y > back_r or img_over_y < 0
if is_x_direction_failed or is_y_direction_failed:
# 前景图不在背景图范围内, 直接返回原图
return background_img
pro_r, pro_c, _ = prospect_img.shape # 前景图像行数、列数
if img_over_x + pro_c > back_c: # 如果水平方向展示不全
pro_c = back_c - img_over_x # 截取前景图的列数
prospect_img = prospect_img[:, 0:pro_c, :] # 截取前景图
if img_over_y + pro_r > back_r: # 如果垂直方向展示不全
pro_r = back_r - img_over_y # 截取前景图的行数
prospect_img = prospect_img[0:pro_r, :, :] # 截取前景图
prospect_img = cv2.cvtColor(prospect_img, cv2.COLOR_BGR2BGRA) # 前景图转为4通道图像
prospect_tmp = np.zeros((back_r, back_c, 4), np.uint8) # 与背景图像等大的临时前景图层
# 前景图像放到前景图层里
prospect_tmp[img_over_y:img_over_y + pro_r, img_over_x: img_over_x + pro_c, :] = prospect_img
_, binary = cv2.threshold(prospect_img, 254, 255, cv2.THRESH_BINARY) # 前景图阈值处理
prospect_mask = np.zeros((pro_r, pro_c, 1), np.uint8) # 单通道前景图像掩模
prospect_mask[:, :, 0] = binary[:, :, 3] # 不透明像素的值作为掩模的值
mask = np.zeros((back_r, back_c, 1), np.uint8)
mask[img_over_y:img_over_y + prospect_mask.shape[0],
img_over_x: img_over_x + prospect_mask.shape[1]] = prospect_mask
mask_not = cv2.bitwise_not(mask)
prospect_tmp = cv2.bitwise_and(prospect_tmp, prospect_tmp, mask=mask)
background_img = cv2.bitwise_and(background_img, background_img, mask=mask_not)
prospect_tmp = cv2.cvtColor(prospect_tmp, cv2.COLOR_BGRA2BGR) # 前景图层转为三通道图像
return prospect_tmp + background_img # 前景图层与背景图像相加合并
def execute(self, sample: Dict[str, Any]):
start = time.time()
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
img_bytes = sample[self.data_key]
if img_bytes:
data = bytes_to_numpy(img_bytes)
correct_data = self._watermark_remove(data, file_name, self.ocr_model)
sample[self.data_key] = numpy_to_bytes(correct_data, file_type)
logger.info(f"fileName: {file_name}, method: ImgWatermarkRemove costs {time.time() - start:6f} s")
return sample
def delete_watermark(self, result_list, kw_list, data):
"""
将符合目标的水印,模糊化处理
"""
# 获取所有符合目标的文本框位置
text_axes_list = []
for line in result_list:
for kw in kw_list:
if kw in line[1][0]:
min_width = int(min(line[0][0][0], line[0][3][0]))
max_width = int(max(line[0][1][0], line[0][2][0]))
min_hight = int(min(line[0][0][1], line[0][1][1]))
max_hight = int(max(line[0][2][1], line[0][3][1]))
text_axes_list.append([min_width, min_hight, max_width, max_hight])
break
# 去除水印
delt = DEFAULT_MAX_CHARACTERS # 文本框范围扩大
img = data
for text_axes in text_axes_list:
hight, width = img.shape[0:2]
# 截取图片
min_width = text_axes[0] - delt if text_axes[0] - delt >= 0 else 0
min_hight = text_axes[1] - delt if text_axes[1] - delt >= 0 else 0
max_width = text_axes[2] + delt if text_axes[2] + delt <= width else width
max_hight = text_axes[3] + delt if text_axes[3] + delt <= hight else hight
cropped = img[min_hight:max_hight, min_width:max_width] # 裁剪坐标为[y0:y1, x0:x1]
# 图片二值化处理,把[200,200,200]-[250,250,250]以外的颜色变成0
start_rgb = DEFAULT_BINARY_THRESHOLD_LOW
thresh = cv2.inRange(cropped, np.array([start_rgb, start_rgb, start_rgb]), np.array([250, 250, 250]))
# 创建形状和尺寸的结构元素
            kernel = np.ones((3, 3), np.uint8)  # 设置卷积核3*3全是1;将当前的数组作为图像类型来进行各种操作,就要转换到uint8类型
# 扩展待修复区域
hi_mask = cv2.dilate(thresh, kernel, iterations=10) # 膨胀操作,白色区域增大,iterations迭代次数
specular = cv2.inpaint(cropped, hi_mask, 5, flags=cv2.INPAINT_TELEA)
# imgSY:输入8位1通道或3通道图像。
# hi_mask:修复掩码,8位1通道图像。非零像素表示需要修复的区域。
# specular:输出与imgSY具有相同大小和类型的图像。
# 5:算法考虑的每个点的圆形邻域的半径。
            # flags:INPAINT_NS基于Navier-Stokes的方法、Alexandru Telea的INPAINT_TELEA方法
result = self._overlay_mask(img, specular, min_width, min_hight)
img = result
return img
def init_model(self, *args, **kwargs):
return WatermarkOcrModel(*args, **kwargs).ocr_model
def _watermark_remove(self, data, file_name, model):
"""
去除水印的方法
"""
remove_str = self.remove_str
# 勾选去水印的信息为空,则直接返回原图
if remove_str == "":
return data
        # 勾选项可能以半角或全角逗号连接,统一后再切分
        kw_list = remove_str.replace(',', ',').split(',')
# 加载模型
ocr_model = model
try:
result = ocr_model.ocr(data, cls=True)
except RuntimeError as e:
logger.error(f"fileName: {file_name}, method: ocr predict error {e}")
return data
if result and result[0]:
logger.info(f"fileName: {file_name}, method: ocrModel detect watermark info {str(result)}")
return self.delete_watermark(result[0], kw_list, data)
else:
logger.info(f"fileName: {file_name}, method: ImgWatermarkRemove not need remove target ocr")
return data
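
The indexing in _has_kw and delete_watermark assumes the classic PaddleOCR result layout: one list per image, and each detected line given as [four corner points, (text, confidence)]. A hypothetical result illustrates the shape being relied on:

result = [[
    [[[10, 10], [120, 10], [120, 40], [10, 40]], ("知乎", 0.98)],
    [[[10, 60], [200, 60], [200, 90], [10, 90]], ("正文内容", 0.95)],
]]
for line in result[0]:
    box, (text, confidence) = line
    print(text, confidence, box[0])   # line[1][0] is the text, line[0] the box corners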

View File

@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
import gc
import os
from pathlib import Path
class WatermarkOcrModel:
def __init__(self, *args, **kwargs):
models_path = os.getenv("MODELS_PATH", "/home/models")
self.resources_path = str(Path(models_path, 'img_watermark_remove', 'resources'))
self.det_model_dir = str(Path(self.resources_path, 'ch_PP-OCRv4_det_infer'))
self.rec_model_dir = str(Path(self.resources_path, 'ch_PP-OCRv4_rec_infer'))
self.cls_model_dir = str(Path(self.resources_path, 'ch_ppocr_mobild_v2_cls_infer'))
from paddleocr import PaddleOCR
self.ocr_model = PaddleOCR(det_model_dir=self.det_model_dir, cls_model_dir=self.cls_model_dir,
rec_model_dir=self.rec_model_dir,
use_angle_cls=True,
lang='ch')
def __del__(self):
del self.ocr_model
gc.collect()

View File

@@ -0,0 +1,7 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='InvisibleCharactersCleaner',
module_path="ops.mapper.invisible_characters_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '不可见字符去除'
name_en: 'Invisible Character Removal'
description: '去除文档中的不可见字符,例如 0-31 号字符中的部分字符。'
description_en: 'Removes invisible characters from documents, for example, some of the characters numbered 0 to 31.'
language: 'python'
vendor: 'huawei'
raw_id: 'InvisibleCharactersCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: "对“材料”怎样下\x04定义才臻于 严格和科学?"
after: '对“材料”怎样下定义才臻于严格和科学?'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,30 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 不可见字符去除
Create: 2025/01/13
"""
import re
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class InvisibleCharactersCleaner(Mapper):
@staticmethod
def _invisible_characters_filter(input_data: str):
        # 移除ASCII中不可见字符,包括0-7、14-19、21-31、127-160的字符
        invisible_char_pattern = '[\x00-\x07\x0E-\x13\x15-\x1F\x7F-\xA0]'
invisible_chars_re = re.compile(invisible_char_pattern)
return invisible_chars_re.sub('', input_data)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._invisible_characters_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: InvisibleCharactersCleaner costs {time.time() - start:6f} s")
return sample
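
A quick standalone check of the character class, using a variant of the example from the operator metadata:

import re

invisible_chars_re = re.compile('[\x00-\x07\x0E-\x13\x15-\x1F\x7F-\xA0]')
before = "对“材料”怎样下\x04定义才臻于严格和科学?"
print(invisible_chars_re.sub('', before))   # -> 对“材料”怎样下定义才臻于严格和科学?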

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='AnonymizedIpAddress',
module_path="ops.mapper.ip_address_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: 'IP地址匿名化'
name_en: 'IP Address Anonymization'
description: 'IP地址匿名化'
description_en: 'Anonymizes IP addresses.'
language: 'python'
vendor: 'huawei'
raw_id: 'AnonymizedIpAddress'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '这个是IP地址:10.x.x.10'
after: '这个是IP地址:<ip>'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,74 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: IP地址匿名化插件
Create: 2024/12/26 15:43
"""
import ipaddress
import re
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class AnonymizedIpAddress(Mapper):
def __init__(self, *args, **kwargs):
# IP地址校验
# X.X.X.X与四级目录格式相同,避免误清洗,该格式的IP地址必须匹配 IP/IP地址等字样
super().__init__(*args, **kwargs)
self.ipv4_1_and_prefix_pattern = r'ip(地址| address|v4)?( |:|:)*(?<![\.\d])'
# X.X.X.X
self.ipv4_pattern = r'(?<![\.\d])\d\.\d\.\d\.\d(?![\.\d])'
self.ipv4_re_compile = re.compile(r"(?<![\d.])(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?![.\d])")
self.ipv6_re_compile = re.compile(r"(?<![0-9a-fA-F:])(([0-9a-fA-F]{0,4}:)+[0-9a-fA-F]{0,4})(?![0-9a-fA-F:])")
@staticmethod
def verify_ip_address(ip):
"""验证字符串是否为合法ip地址"""
try:
ipaddress.ip_address(ip)
except ValueError:
return False
return True
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._ip_address_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: IPAddressCleaner costs {time.time() - start:6f} s")
return sample
def filter_ipv4(self, ipv4, line):
"""ipv4地址匿名化"""
if not self.verify_ip_address(ipv4):
return line
ipv4_format = ipv4.replace(".", "\\.")
# 非单字节ip地址直接匿名化
if not re.search(self.ipv4_pattern, "" + ipv4 + ""):
line = re.compile(r"(?<![\d.])" + ipv4_format + r"(?![.\d])").sub("<ip>", line)
elif re.search(self.ipv4_1_and_prefix_pattern + ipv4_format + r"(?![.\d])", line, re.IGNORECASE):
# 单字节ip地址需搜索关键字眼,有关键字眼则段落中单字节ip地址匿名化
line = re.compile(self.ipv4_pattern).sub("<ip>", line)
return line
def _ip_address_filter(self, input_data: str):
""" IPv4、IPv6地址匿名化"""
lines = input_data.split("\n")
line_list = []
for line in lines:
            # 为防止IP地址处于段落开头或结尾不能被匹配,需要在字符串首尾加空格占位符
            line = ''.join([' ', line, ' '])
ipv4_groups = self.ipv4_re_compile.findall(line)
for ipv4 in ipv4_groups:
line = self.filter_ipv4(ipv4, line)
ipv6_groups = self.ipv6_re_compile.findall(line)
for group in ipv6_groups:
ipv6 = group[0]
if ipv6 and self.verify_ip_address(ipv6):
line = re.compile(r"(?<![0-9a-fA-F:])" + ipv6 + "(?![0-9a-fA-F:])").sub("<ip>", line)
line_list.append(line[1:-1])
text = "\n".join([line.strip() for line in line_list])
return text
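
A standalone check of the IPv4 branch: every dotted quad is validated with ipaddress and masked, while the single-digit-octet form (which also looks like a section number) is skipped here for brevity; the full operator only masks that form when an "IP"/"ip地址" keyword appears nearby:

import ipaddress
import re

ipv4_re = re.compile(r"(?<![\d.])(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(?![.\d])")
single_octet_re = re.compile(r"^\d\.\d\.\d\.\d$")
line = " 这个是IP地址:10.21.34.10,目录编号1.2.3.4保持不变 "
for ip in ipv4_re.findall(line):
    try:
        ipaddress.ip_address(ip)
    except ValueError:
        continue
    if not single_octet_re.match(ip):
        line = line.replace(ip, "<ip>")
print(line.strip())   # -> 这个是IP地址:<ip>,目录编号1.2.3.4保持不变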

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='KnowledgeRelationSlice',
module_path="ops.mapper.knowledge_relation_slice.process")

View File

@@ -0,0 +1,108 @@
#!/usr/bin/python3.9
# -*- coding: utf-8 -*-
import math
from multiprocessing import Pool, cpu_count
from six import iteritems
from six.moves import range
from loguru import logger
PARAM_K1 = 1.5
PARAM_B = 0.75
EPSILON = 0.25
def effective_n_jobs(n_jobs):
if n_jobs == 0:
raise ValueError('n_jobs == 0 in Parallel has no meaning')
elif n_jobs is None:
return 1
elif n_jobs < 0:
n_jobs = max(cpu_count() + 1 + n_jobs, 1)
return n_jobs
class SimilarityAlgBM25(object):
def __init__(self, corpus_docs):
self.corpus_files_size = 0
self.avg_dl = 0
self.doc_file_freqs = []
self.idf_dict = {}
self.doc_len = []
self._initialize(corpus_docs)
def get_sim_score(self, document, index):
score = 0
doc_freqs = self.doc_file_freqs[index]
for word in document:
if word not in doc_freqs:
continue
try:
score += (self.idf_dict[word] * doc_freqs[word] * (PARAM_K1 + 1)
/ (doc_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avg_dl)))
            except KeyError:
                logger.warning('key not found in idf dict: {}', word)
return score
def get_sim_scores(self, document):
scores = []
for index in range(self.corpus_files_size):
cur_score = self.get_sim_score(document, index)
scores.append(cur_score)
return scores
def get_scores_bow(self, document):
scores = []
for index in range(self.corpus_files_size):
score = self.get_sim_score(document, index)
if score > 0:
scores.append((index, score))
return scores
def _initialize(self, corpus_files):
"""
Calculates frequencies of terms in documents and in corpus_files.
Also computes inverse document frequencies.
"""
nd = {} # word -> number of documents with word
num_doc = 0
for document_file in corpus_files:
self.corpus_files_size += 1
self.doc_len.append(len(document_file))
num_doc += len(document_file)
frequencies_dict = {}
for word in document_file:
if word not in frequencies_dict:
frequencies_dict[word] = 0
frequencies_dict[word] += 1
self.doc_file_freqs.append(frequencies_dict)
for word, _ in iteritems(frequencies_dict):
if word not in nd:
nd[word] = 0
nd[word] += 1
self.avg_dl = float(num_doc) / self.corpus_files_size
# collect idf sum to calculate an average idf for epsilon value
idf_sum = 0
negative_idfs_list = []
for word, freq in iteritems(nd):
idf = math.log(self.corpus_files_size - freq + 0.5) - math.log(freq + 0.5)
self.idf_dict[word] = idf
idf_sum += idf
if idf < 0:
negative_idfs_list.append(word)
self.average_idf = float(idf_sum) / len(self.idf_dict)
eps = EPSILON * self.average_idf
for word in negative_idfs_list:
self.idf_dict[word] = eps
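
The scorer implements standard Okapi BM25 weighting: score(q, d) = sum over query terms w of idf(w) * f(w, d) * (k1 + 1) / (f(w, d) + k1 * (1 - b + b * |d| / avgdl)), with k1 = 1.5, b = 0.75, and negative idf values floored at EPSILON times the average idf. A tiny end-to-end check; the import path is hypothetical:

import jieba
from graph_sim_func import SimilarityAlgBM25   # hypothetical import path

corpus = ["机器学习是人工智能的一个分支", "今天的天气很好", "深度学习属于机器学习的方法"]
bm25 = SimilarityAlgBM25([jieba.lcut(doc) for doc in corpus])
scores = bm25.get_sim_scores(jieba.lcut("什么是机器学习"))
best = max(range(len(scores)), key=scores.__getitem__)
print(best, corpus[best])   # expected to point at a 机器学习 paragraph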

View File

@@ -0,0 +1,184 @@
#!/usr/bin/python3.9
# -*- coding: utf-8 -*-
__all__ = ['build_llm_prompt', 'get_json_list']
import math
import jieba
from loguru import logger
from . import graph_sim_func as bm25
from .knowledge_slice import TextSegmentationOperator
def build_llm_prompt(text):
#
prompt = """
===
<Role>:
你是一位问答对QA智能撰写专家,你擅长根据给定的内容给出准确、完整、详细的多个问答对。
===
<Instructions>:
- 你需要根据已知信息(context),准确、详细的生成多个QA对。
- 生成的问答对中答案少于10个中文字符时,放弃该问答对。
- 确保所有问答对的答案都是已知信息的一部分,且可以组成已知信息,确保没有信息遗漏。
- 仅根据已知信息生成问答对,答案要详细,且不能创造臆想已知信息中没有的内容。
- 确保生成的多个QA对之间不要进行排序,Q:或A:前后不要出现数字序号。
- Q:使用疑问句方式,问号结尾;A:使用陈述句方式,句号结尾,确保回答完整。
- 输出格式如下:
Q:......
A:......
===
<task>
满足上述条件的情况下,现根据context:'''{}'''
生成的多个QA问答对为:
"""
return prompt.format(text)
class KnowledgeSlice:
# edatamate切片算法插件
def __init__(self, file_text, chunk_size=500, overlap_size=100):
self.file_text = file_text
self.slice_op = TextSegmentationOperator(chunk_size, overlap_size)
def execute(self):
try:
chunks = self.slice_op.process(self.file_text)
except Exception as err:
logger.exception(f"split text failed, error is: {err}")
chunks = []
return chunks
class BM25Model:
def __init__(self, data_list):
self.data_list = data_list
self.corpus = self.load_corpus()
def bm25_similarity(self, query, num_best=1):
query = jieba.lcut(query)
bm = bm25.SimilarityAlgBM25(self.corpus)
scores = bm.get_sim_scores(query)
id_score = [(i, score) for i, score in enumerate(scores)]
id_score.sort(key=lambda e: e[1], reverse=True)
return id_score[0: num_best]
def load_corpus(self):
corpus = [jieba.lcut(data) for data in self.data_list]
return corpus
class KnowledgeGraph:
    # class for document segmentation and creating relations between knowledge slices
def __init__(self, corpus_file_string, chunk_size=500, overlap_size=100, kg_relation=True):
self.corpus_file_string = corpus_file_string
self.chunk_size = chunk_size
self.overlap_size = overlap_size
self.kg_relation = kg_relation
self.slicing_corpus = []
self.knowledge_slice = KnowledgeSlice(self.corpus_file_string, self.chunk_size, self.overlap_size)
@staticmethod
def update_gallery_list(gallery_list, iterated_dict):
        # get the gallery entries that are not yet in iterated_dict
gallery_list_update = []
gallery_list_index = []
for i, _ in enumerate(gallery_list):
if i not in iterated_dict:
gallery_list_update.append(gallery_list[i])
gallery_list_index.append(i)
return gallery_list_update, gallery_list_index
def document_slicing(self):
json_list = []
all_slices_info = self.knowledge_slice.execute()
for _, item in enumerate(all_slices_info):
json_list.append({
"slice_data": item
})
self.slicing_corpus = json_list
def build_knowledge_relation(self, slicing_corpus_list):
# knowledge relation for each paragraph
if not self.kg_relation:
return slicing_corpus_list
iterated_dict = {}
kr_result_json_list = []
gallery_list = []
kr_relation_list = []
if len(slicing_corpus_list) < 3:
return slicing_corpus_list
for _, item in enumerate(slicing_corpus_list):
gallery_list.append(item['slice_data'])
for k, item in enumerate(slicing_corpus_list):
if k not in iterated_dict:
iterated_dict[k] = 1
cur_gallery_list, cur_gallery_src_index = self.update_gallery_list(gallery_list, iterated_dict)
if len(cur_gallery_list) < 1:
kr_result_json_list.append({
"slice_data": item['slice_data']
})
return kr_result_json_list
bm25_class = BM25Model(cur_gallery_list)
id_scores = bm25_class.bm25_similarity(item['slice_data'], 1)
kr_result_doc = item['slice_data'] + cur_gallery_list[id_scores[0][0]]
kr_result_json_list.append({
"slice_data": kr_result_doc
})
if cur_gallery_src_index[id_scores[0][0]] not in iterated_dict:
iterated_dict[cur_gallery_src_index[id_scores[0][0]]] = 1
else:
continue
return kr_result_json_list
def build_graph_efficiently(self, search_space_size=50):
        # build knowledge relation in an efficient way
knowledge_total_num = len(self.slicing_corpus)
knowledge_chunk_num = math.ceil(knowledge_total_num / search_space_size)
knowledge_relation_result = []
for i in range(0, knowledge_chunk_num):
cur_max_index = (i + 1) * search_space_size
if cur_max_index > knowledge_total_num:
corpus_list = self.slicing_corpus[i * search_space_size:]
else:
corpus_list = self.slicing_corpus[i * search_space_size:cur_max_index]
# to do knowledge relation
cur_knowledge_relation_result = self.build_knowledge_relation(corpus_list)
knowledge_relation_result.extend(cur_knowledge_relation_result)
return knowledge_relation_result
def knowledge_corpus_list_json(self):
        # process the corpus and return structured information as a json list
self.document_slicing()
kr_result_list_json = self.build_graph_efficiently()
return kr_result_list_json
def get_json_list(txt_string, chunk_size=500, overlap_size=100, kg_relation=True):
if len(txt_string) > 0:
kg_extract = KnowledgeGraph(txt_string, chunk_size, overlap_size, kg_relation)
kr_result_json_list = kg_extract.knowledge_corpus_list_json()
else:
kr_result_json_list = []
return kr_result_json_list
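
A minimal driver for the slicing-plus-relation pipeline; the import path and sample text are hypothetical:

from knowledge_relation import build_llm_prompt, get_json_list   # hypothetical import path

text = "数据工程是指对数据进行采集、清洗、切片和标注的过程。" * 100
slices = get_json_list(text, chunk_size=500, overlap_size=100, kg_relation=True)
print(len(slices))                                      # number of slice_data entries
print(build_llm_prompt(slices[0]["slice_data"])[:80])   # prompt built from the first slice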

View File

@@ -0,0 +1,23 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
from typing import List
from loguru import logger
from datamate.common.utils.text_splitter import TextSplitter
class TextSegmentationOperator:
def __init__(self, chunk_size, chunk_overlap):
try:
self.text_splitter = TextSplitter(-1, chunk_size, chunk_overlap)
except Exception as err:
logger.exception(f"init text splitter failed, error is: {err}")
raise err
def process(self, input_data: str) -> List[str]:
if input_data.strip() == "":
logger.info("input text is empty, return empty chunks.")
return []
return self.text_splitter.split_text(input_data)

View File

@@ -0,0 +1,16 @@
name: '知识库关系切片'
name_en: 'Knowledge Base Relationship Slicing'
description: '知识库关系切片'
description_en: 'Knowledge base relationship slicing.'
language: 'python'
vendor: 'huawei'
raw_id: 'KnowledgeRelationSlice'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,46 @@
# -*- coding: utf-8 -*-
"""
Description: 知识库关系切片插件
Create: 2023/11/7 9:26
"""
import json
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
from .knowledge_relation import get_json_list
# 切片长度
CHUNK_SIZE = 500
# 相邻切片重合长度
OVERLAP_SIZE = 100
class KnowledgeRelationSlice(Mapper):
def __init__(self, *args, **kwargs):
super(KnowledgeRelationSlice, self).__init__(*args, **kwargs)
        self.chunk_size = kwargs.get("chunk_size", CHUNK_SIZE)
        self.overlap_size = kwargs.get("overlap_size", OVERLAP_SIZE)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start_time = time.time()
chunk_item = get_json_list(sample[self.text_key], chunk_size=self.chunk_size, overlap_size=self.overlap_size)
chunk_item_json = json.dumps(chunk_item, ensure_ascii=False)
sample[self.text_key] = chunk_item_json
cost_time = time.time() - start_time
logger.info(f'Generate knowledgeRelation slice num: {len(chunk_item)}, Cost time: {cost_time} s')
return sample

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='LegendCleaner',
module_path="ops.mapper.legend_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '图注表注去除'
name_en: 'Figure and Table Description Removal'
description: '去除文档中的图注、表注等内容。'
description_en: 'Removes figure and table description from documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'LegendCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '图1.1.1 图注名称'
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,41 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 图注表注去除
Create: 2024/12/5 15:43
"""
import re
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class LegendCleaner(Mapper):
@staticmethod
def _get_legend_re_compile():
chinese_legend_prefix = r"(图|表|图片|表格)"
chinese_legend_number = r"(\d+((\.|-)\d+)*|[a-zA-Z]{1,2}((\.|-)\d+)*)"
chinese_legend_pattern = r"(?<=\n)" + chinese_legend_prefix + "( )*" + chinese_legend_number + " +.*\n"
english_legend_pattern = r"(Figure|Table|Fig\.?)"
english_legend_number = r"(S?\d+((\.|-)\d+)*|[a-zA-Z]{1,2}\d?((\.|-)\d+)*)"
english_legend_pattern = (r"(?<=\n)" + english_legend_pattern + "( )*"
+ english_legend_number + r"(\.|:)? +.*\n")
legend_re_compile = re.compile('|'.join([chinese_legend_pattern, english_legend_pattern]), re.IGNORECASE)
return legend_re_compile
    @classmethod
    def _clean_legend(cls, input_data: str):
"""移除文档中图注表注等"""
input_data = ''.join(['\n', input_data, '\n'])
text = cls._get_legend_re_compile().sub("", input_data)
return text[1:-1]
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
        sample[self.text_key] = self._clean_legend(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: LegendCleaner costs {time.time() - start:6f} s")
return sample
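
A quick standalone check of the Chinese caption pattern against the example in the operator metadata:

import re

prefix = r"(图|表|图片|表格)"
number = r"(\d+((\.|-)\d+)*|[a-zA-Z]{1,2}((\.|-)\d+)*)"
pattern = re.compile(r"(?<=\n)" + prefix + "( )*" + number + " +.*\n", re.IGNORECASE)
text = "\n图1.1.1 图注名称\n正文内容\n"
print(pattern.sub("", text)[1:-1])   # -> 正文内容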

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='AnonymizedPhoneNumber',
module_path="ops.mapper.phone_number_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '电话号码匿名化'
name_en: 'Phone Number Anonymization'
description: '电话号码匿名化'
description_en: 'Anonymizes phone numbers.'
language: 'python'
vendor: 'huawei'
raw_id: 'AnonymizedPhoneNumber'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '这个是电话号码:13111111111'
after: '这个是电话号码:<tel>'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,51 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 电话号码匿名化
Create: 2024/12/26 15:43
"""
import re
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class AnonymizedPhoneNumber(Mapper):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.phone_re_compile = self.get_phone_re_compile()
@staticmethod
def get_phone_re_compile():
"""按照格式粗略匹配电话号码,支持以下格式电话号码
前缀:(0086)、(86)、(0086)、(86) 、无
电话号码:第一位1,第二位3-9,后续数字可以为0-9,数字按照3-4-4进行间隔,间隔符为空格、-、无
固定电话号码:0AX-CXXX-XXXX、0BXX-CXXX-XXXX、0BXX-CXX-XXXX A为1-2、B为3-9、C为2-8、X为0-9
约束:电话号码前后皆为非数字
"""
number_prefix = r'([\((]?\+?(00)?86[)\)]?[- ]?)?'
cellphone_pattern = r"1[3-9]\d[- ]?\d{4}[- ]?\d{4}"
landline_pattern = (r'[((]?(0?[12]\d)[))]?[ -]?[2-8]\d{3}[ -]?\d{4}'
r'|[((]?(0?[3-9]\d{2})[))]?[ -]?[2-8]\d{2}\d?[ -]?\d{4}')
phone_numbers_pattern = rf'(?<=[^\d]){number_prefix}({cellphone_pattern}|{landline_pattern})(?=[^\d])'
phone_re_compile = re.compile(phone_numbers_pattern)
return phone_re_compile
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._phone_number_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: PhoneNumberCleaner costs {time.time() - start:6f} s")
return sample
def _phone_number_filter(self, input_data: str):
""" 电话号码匿名化"""
# 正则匹配:电话号码前需匹配不是数字的字符串
        # 为避免处于文章开头和结尾的电话号码不可被识别,需要在输入字符串的前后手动加上空格占位符
        input_data = ''.join([' ', input_data, ' '])
input_data = self.phone_re_compile.sub("<tel>", input_data)
return input_data[1:-1]
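
A standalone check of the cellphone branch; the pattern requires a non-digit on both sides, so the text is padded with spaces first, mirroring the operator:

import re

cellphone = r"1[3-9]\d[- ]?\d{4}[- ]?\d{4}"
pattern = re.compile(rf"(?<=[^\d])({cellphone})(?=[^\d])")
text = " 这个是电话号码:13111111111 "
print(pattern.sub("<tel>", text).strip())   # -> 这个是电话号码:<tel>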

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='PoliticalWordCleaner',
module_path="ops.mapper.political_word_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '政治文本匿名化'
name_en: 'Political Text Anonymization'
description: '将政治文本进行匿名化。'
description_en: 'Anonymizes political texts.'
language: 'python'
vendor: 'huawei'
raw_id: 'PoliticalWordCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '特别字符:改革历程'
after: '特别字符:***'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,67 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 政治文本过滤
Create: 2024/12/26 15:43
"""
import time
from pathlib import Path
from typing import Dict, Any
from loguru import logger
from datamate.common.utils.aho_corasick import AhoCorasic
from datamate.core.base_op import Mapper
class PoliticalWordCleaner(Mapper):
"""外部输入的政治文本过滤插件"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
root_path = Path(__file__).parent / 'resources'
political_file_path = str(root_path / 'political.txt')
special_symbols_path = str(root_path / 'special_symbols.txt')
self.special_symbols = self.load_words_list(special_symbols_path)
self.political_words = self.load_words_list(political_file_path)
self.ac_automaton = AhoCorasic(self.political_words)
@staticmethod
def load_words_list(path):
"""词表加载"""
with open(path, 'r', encoding='utf-8') as f:
words = set(f.read().splitlines())
return words
@staticmethod
def words_replace(target_strings: list, text: str):
"""
目标字符串替换。
Args:
            target_strings: 待替换的目标字符串列表。
text: 待清洗文本。
returns:
清洗后文本。
"""
target_strings.sort(key=lambda x: -len(x))
for s in target_strings:
tmp_text = text.replace(s, '*' * len(s))
text = tmp_text
return text
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._political_word_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: PoliticalWordCleaner costs {time.time() - start:6f} s")
return sample
def _political_word_filter(self, text):
"""词语过滤主函数,分行过滤"""
filtered_rows = []
for row in text.split('\n'):
matched_words = self.ac_automaton.search(row, self.special_symbols)
filtered_rows.append(self.words_replace(matched_words, row))
return '\n'.join(filtered_rows)

View File

@@ -0,0 +1,321 @@
习近平
平近习
xjp
习太子
习明泽
老习
温家宝
温加宝
温x
温jia宝
温宝宝
温加饱
温加保
张培莉
温云松
温如春
温jb
胡温
胡x
胡jt
胡boss
胡总
胡王八
hujintao
胡jintao
胡j涛
胡惊涛
胡景涛
胡紧掏
湖紧掏
胡紧套
锦涛
hjt
胡派
胡主席
刘永清
胡海峰
胡海清
江泽民
民泽江
江胡
江主席
江书记
江浙闽
江沢民
江浙民
茳泽民
zemin
ze民
老江
老j
江core
江x
江派
江zm
jzm
江戏子
江蛤蟆
江某某
江贼
江猪
江氏集团
江绵恒
江绵康
王冶坪
江泽慧
邓小平
平小邓
xiao平
邓xp
邓晓平
邓朴方
邓榕
邓质方
毛泽东
猫泽东
猫则东
猫贼洞
毛zd
毛zx
z东
ze东
泽d
zedong
毛太祖
毛相
主席画像
改革历程
朱镕基
朱容基
朱镕鸡
朱容鸡
朱云来
李鹏
李peng
里鹏
李月月鸟
李小鹏
李小琳
华主席
华国
国锋
国峰
锋同志
白春礼
薄熙来
薄一波
蔡赴朝
蔡武
曹刚川
常万全
陈炳德
陈德铭
陈建国
陈良宇
陈绍基
陈同海
陈至立
戴秉国
丁一平
董建华
杜德印
杜世成
傅锐
郭伯雄
郭金龙
贺国强
胡春华
耀邦
华建敏
黄华华
黄丽满
黄兴国
回良玉
贾庆林
贾廷安
靖志远
李长春
李春城
李建国
李克强
李岚清
李沛瑶
李荣融
李瑞环
李铁映
李先念
李学举
李源潮
栗智
梁光烈
廖锡龙
林树森
林炎志
林左鸣
令计划
柳斌杰
刘奇葆
刘少奇
刘延东
刘云山
刘志军
龙新民
路甬祥
罗箭
吕祖善
马飚
马恺
孟建柱
欧广源
强卫
沈跃跃
宋平顺
粟戎生
苏树林
孙家正
铁凝
屠光绍
王东明
汪东兴
王鸿举
王沪宁
王乐泉
王洛林
王岐山
王胜俊
王太华
王学军
王兆国
王振华
吴邦国
吴定富
吴官正
无官正
吴胜利
吴仪
奚国华
习仲勋
徐才厚
许其亮
徐绍史
杨洁篪
叶剑英
由喜贵
于幼军
俞正声
袁纯清
曾培炎
曾庆红
曾宪梓
曾荫权
张德江
张定发
张高丽
张立昌
张荣坤
张志国
赵洪祝
紫阳
周生贤
周永康
朱海仑
中南海
大陆当局
中国当局
北京当局
共产党
党产共
共贪党
阿共
产党共
公产党
工产党
共c党
共x党
共铲
供产
共惨
供铲党
供铲谠
供铲裆
共残党
共残主义
共产主义的幽灵
拱铲
老共
中共
中珙
中gong
gc党
贡挡
gong党
g产
狗产蛋
共残裆
恶党
邪党
共产专制
共产王朝
裆中央
土共
土g
共狗
g匪
共匪
仇共
症腐
政腐
政付
正府
政俯
政f
zhengfu
政zhi
挡中央
档中央
中国zf
中央zf
国wu院
中华帝国
gong和
大陆官方
北京政权
江泽民
胡锦涛
温家宝
习近平
习仲勋
贺国强
贺子珍
周永康
李长春
李德生
王岐山
姚依林
回良玉
李源潮
李干成
戴秉国
黄镇
刘延东
刘瑞龙
俞正声
黄敬
薄熙
薄一波
周小川
周建南
温云松
徐明
江泽慧
江绵恒
江绵康
李小鹏
李鹏
李小琳
朱云来
朱容基
法轮功
李洪志
新疆骚乱

View File

@@ -0,0 +1,50 @@
!
.
,
#
$
%
&
*
(
)
|
?
/
@
"
'
;
[
]
{
}
+
~
-
_
=
^
<
>
——
……
:

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='DuplicateSentencesFilter',
module_path="ops.mapper.remove_duplicate_sentences.process")

View File

@@ -0,0 +1,16 @@
name: '文档局部内容去重'
name_en: 'Partial Content Deduplication'
description: '文档局部内容去重。'
description_en: 'Deduplicates partial file content.'
language: 'python'
vendor: 'huawei'
raw_id: 'DuplicateSentencesFilter'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '这是一个重复的句子。 这是一个重复的句子。 这是一个重复的句子。 这是一个重复的句子。 这是一个重复的句子。'
after: '这是一个重复的句子。'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,68 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 文档局部内容去重
Create: 2025/01/07
"""
import re
import time
from collections import Counter
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Filter
def duplicate_sentences_filter(input_data: str, file_name: str, duplicate_th: int = 5) -> str:
""" 文本局部内容去重:去除某些重复出现的段落或句子
以段落为基本单位,去除重复次数超过规定阈值的段落, 只保留第一次出现的段落的原始内容, 且不去除段落的首尾空格。
Args:
input_data: 输入数据
file_name: 文件名称
duplicate_th: 最大重复次数阈值,默认小于5次
Returns:
str: 清洗后数据
"""
paragraphs = input_data.split("\n")
trust_set = {'<table>', '<tbody>', '<tr>', '<td>', '</table>', '</tbody>', '</tr>', '</td>', ""}
# 进行一次遍历,记录每个段落的出现位置
order_paragraphs = []
paragraph_counts = Counter([line.strip() for line in re.split("\\n", input_data)])
try:
for paragraph in paragraphs:
# trust_set 中的元素不纳入统计
if paragraph.strip() in trust_set:
order_paragraphs.append(paragraph)
continue
paragraph_strip = paragraph.strip()
if duplicate_th > paragraph_counts[paragraph_strip] >= 0:
order_paragraphs.append(paragraph)
elif paragraph_counts[paragraph_strip] >= duplicate_th:
order_paragraphs.append(paragraph)
paragraph_counts[paragraph_strip] = -1
except Exception as err:
logger.exception(f"fileName: {file_name}, method: RemoveDuplicateSentencess. An error occurred when using "
f"filtering duplicate sentences. The error is: {err}")
return input_data
# 将去重后的段落重新组合成文本
result_text = '\n'.join(order_paragraphs)
return result_text
class DuplicateSentencesFilter(Filter):
"""文档局部内容去重插件"""
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
duplicate_th = 5 # 段落重复次数阈值
file_name = sample[self.filename_key]
start = time.time()
sample[self.text_key] = duplicate_sentences_filter(sample[self.text_key], file_name, duplicate_th)
logger.info(f"fileName: {file_name}, RemoveDuplicateSentencess costs {time.time() - start:6f} s")
return sample
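
A quick check of the module-level helper (assuming this file is importable; the module path is hypothetical):

from process import duplicate_sentences_filter   # hypothetical import path

text = "\n".join(["这是一个重复的句子。"] * 6 + ["结尾段落。"])
print(duplicate_sentences_filter(text, "demo.txt", duplicate_th=5))
# -> 这是一个重复的句子。
#    结尾段落。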

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='SexualAndViolentWordCleaner',
module_path="ops.mapper.sexual_and_violent_word_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '暴力色情文本匿名化'
name_en: 'Violent and Pornographic Text Anonymization'
description: '将暴力、色情文本进行匿名化。'
description_en: 'Anonymizes violent and pornographic texts.'
language: 'python'
vendor: 'huawei'
raw_id: 'SexualAndViolentWordCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '特别字符:炸药'
after: '特别字符:***'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,70 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 暴力色情文本过滤
Create: 2024/12/26 15:43
"""
import time
from pathlib import Path
from typing import Dict, Any
from loguru import logger
from datamate.common.utils.aho_corasick import AhoCorasic
from datamate.core.base_op import Mapper
class SexualAndViolentWordCleaner(Mapper):
"""外部输入的暴力、色情文本过滤插件"""
root_path = Path(__file__).parent / 'resources'
VIOLENT_FILE_PATH = str(root_path / 'violent.txt')
SEXUAL_FILE_PATH = str(root_path / 'sexual.txt')
SPECIAL_SYMBOLS_PATH = str(root_path / 'special_symbols.txt')
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.violent_words = self.load_words_list(self.VIOLENT_FILE_PATH)
self.sexual_words = self.load_words_list(self.SEXUAL_FILE_PATH)
self.special_symbols = self.load_words_list(self.SPECIAL_SYMBOLS_PATH)
self.words = self.violent_words | self.sexual_words
self.ac_automaton = AhoCorasic(self.words)
@staticmethod
def load_words_list(path):
"""词表加载"""
with open(path, 'r', encoding='utf-8') as f:
words = set(f.read().splitlines())
return words
@staticmethod
def words_replace(target_strings: list, text: str):
"""
目标字符串替换。
Args:
            target_strings: 待替换的目标字符串列表。
text: 待清洗文本。
returns:
清洗后文本。
"""
target_strings.sort(key=lambda x: -len(x))
for s in target_strings:
tmp_text = text.replace(s, '*' * len(s))
text = tmp_text
return text
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._sexual_and_violent_word_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: SexualAndViolentWordCleaner costs {time.time() - start:6f} s")
return sample
def _sexual_and_violent_word_filter(self, text):
"""词语过滤主函数,分行过滤"""
filtered_rows = []
for row in text.split('\n'):
matched_words = self.ac_automaton.search(row, self.special_symbols)
filtered_rows.append(self.words_replace(matched_words, row))
return '\n'.join(filtered_rows)

View File

@@ -0,0 +1,288 @@
爱液
按摩棒
拔出来
爆草
包二奶
暴干
暴奸
暴乳
爆乳
暴淫
被操
被插
被干
逼奸
仓井空
插暴
操逼
操黑
操烂
肏你
肏死
操死
操我
厕奴
插比
插b
插逼
插进
插你
插我
插阴
潮吹
潮喷
成人电影
成人论坛
成人色情
成人网站
成人文学
成人小说
艳情小说
成人游戏
吃精
抽插
春药
大波
大力抽送
大乳
荡妇
荡女
盗撮
发浪
放尿
肥逼
粉穴
干死你
干穴
肛交
肛门
龟头
裹本
国产av
豪乳
黑逼
后穴
虎骑
换妻俱乐部
黄片
几吧
鸡吧
鸡巴
鸡奸
妓女
奸情
叫床
脚交
精液
就去日
巨屌
菊花洞
菊门
巨奶
巨乳
菊穴
开苞
口爆
口活
口交
口射
口淫
狂操
狂插
浪逼
浪妇
浪叫
浪女
漏乳
露b
乱交
乱伦
轮暴
轮操
轮奸
裸陪
买春
美逼
美少妇
美乳
美腿
美穴
美幼
秘唇
迷奸
密穴
蜜穴
蜜液
摸奶
摸胸
母奸
奈美
奶子
男奴
内射
嫩逼
嫩女
嫩穴
捏弄
女优
炮友
砲友
喷精
屁眼
前凸后翘
强jian
强暴
强奸处女
情趣用品
情色
拳交
全裸
群交
人妻
人兽
日逼
日烂
肉棒
肉逼
肉唇
肉洞
肉缝
肉棍
肉茎
肉具
揉乳
肉穴
肉欲
乳爆
乳房
乳沟
乳交
乳头
骚逼
骚比
骚女
骚水
骚穴
色逼
色情网站
色区
色色
色诱
色欲
色b
射爽
射颜
食精
释欲
兽奸
兽交
手淫
兽欲
熟妇
熟母
熟女
爽片
双臀
死逼
丝袜
丝诱
松岛枫
酥痒
汤加丽
套弄
体奸
体位
舔脚
舔阴
调教
偷欢
推油
脱内裤
文做
舞女
吸精
夏川纯
相奸
小逼
小穴
小xue
性感妖娆
性感诱惑
性虎
性饥渴
性技巧
性交
性奴
性虐
性息
性欲
穴口
穴图
亚情
颜射
阳具
杨思敏
要射了
一夜欢
一夜情
一ye情
阴部
淫虫
阴唇
淫荡
阴道
淫电影
阴阜
淫妇
淫河
阴核
阴户
淫贱
淫叫
淫教师
阴茎
阴精
淫浪
淫媚
淫糜
淫魔
淫母
淫女
淫虐
淫妻
淫情
淫色
淫声浪语
淫兽学园
淫书
淫术炼金士
淫水
淫娃
淫威
淫亵
淫样
淫液
淫照
阴b
应召
幼交
欲火
欲女
玉乳
玉穴
援交
原味内衣
援助交际
招鸡
招妓
抓胸
自慰
作爱
a片
fuck
gay片
g点
h动画
h动漫
失身粉
淫荡自慰器

View File

@@ -0,0 +1,50 @@
!
.
,
#
$
%
&
*
(
)
|
?
/
@
"
'
;
[
]
{
}
+
~
-
_
=
^
<
>
——
……
:

View File

@@ -0,0 +1,452 @@
出售雷管
炸药
出售炸药
出售炸药电话
制作火药配方
炸药出售
出售雷管炸药
火药配方
在家里做原子弹
硝酸甘油炸弹制作
TNT
炸弹的制作
硝铵炸药配方
硝酸甘油制作
炸弹配方
简易炸药
火药制作简易炸弹
炸弹制作
硝酸甘油
硝酸甘油炸弹制作方法
手把手教你做炸弹
恐怖分子傻瓜手册
氢弹手工制作方法
起爆器
自制燃烧弹
燃烧弹
弹药制造方法
液体炸药配方
炸药配方与制作
自制炸药配方
tnt炸药成分
炸药的制作
tnt炸药配方
各种炸药配方大全
炸药的成分
燃烧弹的制作方
黑火药的配方
黑火药的制作
黑火药比例
制作燃烧弹
黑火药配方比例
黑火药的成分
原子弹配方
原子弹DIY教程
C4炸药
雷管出售
制作硝化甘油
DIY核弹
核弹制作方法
原子弹制作手册
简易原子弹教程
黑火药配方
自制炸药方法
教你怎么用土办法做武器
zz炸弹的制作方法
C4炸药的成分
炸药制造生产工艺
原子弹制造完全手册
汽车炸弹制作
人体炸弹制作流程
炸弹制作从入门
鸡尾酒炸弹制作
集束炸弹制作
硝酸铵的简便制取方法
莫洛托夫燃烧弹
塑胶炸弹
用火药混合塑料
高性能爆炸物质混合
原子弹制作方法
手教你制作原子弹
轻型巡航导弹的制作
黑索金的制造过程
Zha药制造进程
购买自爆材料
可塑炸弹制作方法
C4塑胶炸药
水浴法制TNT
简易核弹头的制造
高爆炸药的基本配方
如何制硝化甘油
燃烧剂的简易制备
简易大威力炸弹
炸弹制作入门
硝酸甘油炸药制作
基本配方是硝酸甘
燃烧剂的简易制作
制作性能优良的燃烧弹
酸甘油炸药
简易导弹制作
核弹头的制造
可塑炸弹制作
炸药硝酸甘油制法
原子弹制作方法
火药的配方
手教你做原子弹
怎么用土办法做武器
手把手教你制作原子弹
燃烧弹的制作方法
怎么自制燃烧弹
TNT炸弹制作
高爆炸药基本配方
爆破和杀伤燃烧弹
炸弹制作从入门到精
写的炸弹制作教程
原子弹制作方法
教你制作原子弹
下硝化甘油的制作方法
DIY原子弹
原子弹制作
炸药及其制造方法
ATOM弹制造
三分钟教你原子弹
火药原理
简易炸弹
简易燃烧弹
土炸药成份
你怎么用土办法做武器
燃烧弹制作
燃烧瓶的制作方法
炸弹制作手册
原子弹制作方法DIY
汽油三分之一白糖混合
炸药的制作配方
燃烧弹的制作方
炸药的配方
如何自制燃烧弹
标准炸弹教程
黑火药和硝酸钾农药混合
恐怖分子自己制造炸弹
手柄制作炸弹引爆器
制作炸弹首先要
燃烧瓶制作
出售炸药
出售气枪
出售炸药QQ
出售气枪QQ
出售气枪电话
出售手枪QQ
出售手枪电话
出售猎枪
出售猎枪QQ
出售猎枪电话
出售汽枪
出售步枪
出售狙击枪
出售手狗
出售气狗
出售猎狗
手枪出售
64式手枪出售
92式手枪出售
高仿真枪
出售54式手枪
出售M1911
出售各类猎狗
出售国产军用54手枪
出售火枪游戏
出售进口金属枪模
出售仿真枪
出售麻醉枪免收任何定金
出售手枪
出售外国枪模
出售袖珍手枪
娥眉气枪出售
防身手枪QQ
防身武器手枪
仿真枪QQ
钢珠枪小口径步枪
高压狗QQ
高压气枪
气枪子弹
高压气枪电狗专卖
高压气枪
气枪货到付款
高压气枪专卖店
各类军用枪
各类军用枪支
各式气枪出售
工字牌气枪出售专卖
气枪
工字汽狗麻醉枪
供应军用手枪
供应汽枪
购买枪支QQ
国产汽狗专卖
气枪货
集团出售手枪
集团出售手枪气枪
进口气枪
气枪子弹
军用54出售
军用77出售
军用手枪钢珠枪
里有汽枪出售
猎枪出售
猎枪仿真枪QQ
猎枪麻醉枪钢珠枪
猎枪哪里有卖
猎枪手枪买卖
猎枪销售手枪销售气枪
猎枪专卖店
猎枪子弹底火专卖
麻醉枪出售可验货
麻醉枪猎枪军用
买卖54狗
买卖64狗
买卖警察枪支
买卖军狗
买卖军用枪支
卖64手枪
卖92手枪
卖钢珠枪QQ
哪里能买到自制手枪
哪里有钢珠枪
哪里有连发猎枪买卖
哪里有卖双管猎
哪里有气枪卖
哪里有枪出售
哪里有真枪出售
哪里有真枪卖买
哪能买到真枪
出售手枪气枪猎
气狗
高压气枪
气狗出售
进口气狗
气枪专卖网
气枪出售
气枪专卖网
猎枪
钢珠枪
气枪仿真枪出售
气枪购买
气枪组装
气枪货到付款
汽狗出售
气枪价格
仿真气枪
气枪价格
气枪专卖店
气枪看货加QQ
气枪联系QQ
气枪猎枪麻醉枪
气枪迷魂药
汽狗高压汽枪
铅弹
上海工字气枪
全金属仿真枪专卖
热武器QQ
三箭气枪出售
枪猎
手槍买卖自制手枪
售枪购买手枪
售手枪QQ
司马仿真枪模型QQ
淘宝气枪货到付款
提供气狗专卖店
提供专业及时的气狗
秃鹰气枪货到付款
锡峰气枪出售
仙鹤气枪出售
买卖77式手枪
销售电手狗
销售气手狗
销售小口径步枪
销售运动步枪
小口径秃鹰沙漠
有军用枪支出售
长期出售手枪
折叠狗QQ
转让猎枪
子弹货到付款
自制手枪哪里买
自制手枪哪里有卖
阻击枪
汽枪
高压气枪
组装手枪
最新销售汽狗
出售警用手铐
仿真枪
猎枪
真枪
手枪
阻击枪
弓弩专营
弓弩专卖网货到付款
弓弩专卖网
弓弩专卖货到付款
弓弩专卖店网
弓弩专卖店
弓弩专卖
弓弩制作
弓弩直销
弓弩有限公司
弓弩营销中心
弓弩一百专卖店
弓弩销售
弓弩网
弓弩图纸
弓弩特许经销
弓弩狩猎网
自制手弩
追风弓弩麻醉箭专卖
专业弓弩网
中国战神军用弓弩
中国弩弓专卖
中国弓弩专卖网
中国弓弩直销
中国弓弩网
中国弓弩狩猎网
中国弓驽网
制作简易弓弩
郑州弓弩专卖
赵氏弓弩专卖网
赵氏弓弩专卖店
赵氏弓弩专卖
赵氏弓弩销售
小型弓弩专卖店
小猎人弓弩网
狩猎器材弓弩专卖
狩猎器材弓弩
狩猎弓弩专卖网
狩猎弓弩专卖
狩猎弓弩麻醉箭
手枪式折叠三用弩
三利达弓弩专卖网
三利达弓弩直营
三利达弓弩配件
三步倒药箭批发
三步倒弩箭专卖
三步倒麻醉弩箭销售
三步倒麻醉箭专卖
三步倒麻醉箭
三步倒捕狗药
军用弓弩专卖网
军用弓弩专卖店
军用弓弩批发
军用弓弩公司
供应三利达弓弩麻醉箭
供应三步倒麻醉箭
供应秦氏弓弩
供应弩用麻醉箭
供应弩捕狗箭
供应麻醉箭三步倒
供应麻醉箭批发
供应麻醉箭
供应军用弩折叠弩
供应军用弓弩专卖
供应精品弓弩
供应弓弩麻醉箭
供应弓弩
供应钢珠弓弩
弓弩商城专卖
弓弩商城
弓弩亲兄弟货到付款
弓弩批发
弓弩免定金货到付款
弓弩麻醉箭
弓弩麻醉镖
弓弩论坛
钢珠弓弩专卖网
钢珠弓弩专卖店
打狗弓弩三步倒
麻醉弓弩专卖店
出售军刀
出售军刺
出售弹簧刀
出售三棱刀
出售跳刀
军刀网
南方军刀网
户外军刀网
三棱军刺专卖
出售开山刀军刺
西点军刀网
军刀专卖
戈博军刀
阿兰德龙户外
出售军品军刀
勃朗宁军刀
军刀军品网
阿兰得龙野营刀具网
出售军刺军刀
警用刀具出售
折刀专卖网
阳江军品军刀网
野营刀专卖
砍刀精品折刀专卖
匕首蝴蝶甩刀专卖
军刀专卖军刺
军刀专卖刀具批发
军刀图片砍刀
军刀网军刀专卖
军刀价格军用刀具
军品军刺网
军刀军刺甩棍
阳江刀具批发网
北方先锋军刀
正品军刺出售
野营军刀出售
开山刀砍刀出售
仿品军刺出售
军刀直刀专卖
手工猎刀专卖
自动跳刀专卖
军刀电棍销售
军刀甩棍销售
美国军刀出售
极端武力折刀
防卫棍刀户外刀具
阿兰德龙野营刀
仿品军刺网
野营砍刀户外军刀
手工猎刀户外刀具
中国户外刀具网
西点军品军刀网
野营开山刀军刺
三利达弓弩军刀
尼泊尔军刀出售
防卫野营砍刀出售
防卫著名军刀出售
防卫棍刀出售
防卫甩棍出售
防卫电棍出售
军刺野营砍刀出售
著名精品折刀出售
战术军刀出售
刺刀专卖网
户外军刀出售
阳江刀具直销网
冷钢刀具直销网
防卫刀具直销网
极端武力直销网
刀具直销网
军刀直销网
直刀匕首直销网
军刀匕首直销网
折刀砍刀军品网
野营刀具军品网
阳江刀具军品网
冷钢刀具军品网
防卫刀具军品网
极端武力军品网
军用刀具军品网
军刀直刀军品网
折刀砍刀专卖
野营刀具专卖
阳江刀具专卖
冷钢刀具专卖
防卫刀具专卖
出售美军现役军刀

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='TextToWord',
module_path="ops.mapper.text_to_word.process")

View File

@@ -0,0 +1,16 @@
name: '转换为Word'
name_en: 'Convert-to-Word'
description: '将抽取结果转换为docx的word文件。'
description_en: 'Converts extraction results to Word files in DOCX format.'
language: 'python'
vendor: 'huawei'
raw_id: 'TextToWord'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

Some files were not shown because too many files have changed in this diff.