init datamate

2025-10-21 23:00:48 +08:00
commit 1c97afed7d
692 changed files with 135442 additions and 0 deletions
--- a/runtime/ops/filter/init.py
+++ b/runtime/ops/filter/init.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+
+import sys
+from pathlib import Path
+from datamate.common.utils.custom_importer import CustomImporter
+
+
+def _configure_importer():
+    base_path = Path(__file__).resolve().parent
+    sys.meta_path.append(CustomImporter(base_path))
+
+
+_configure_importer()
+
+
+def _import_operators():
+    """Import every filter operator sub-package for its import-time side effects.
+
+    The sub-package init modules visible in this commit call
+    ``OPERATORS.register_module`` when imported; the remaining ones presumably
+    do the same (not shown here -- verify).
+    """
+    # Text filters.
+    from . import file_with_high_repeat_phrase_rate_filter
+    from . import file_with_high_repeat_word_rate_filter
+    from . import file_with_high_special_char_rate_filter
+    from . import remove_file_with_many_sensitive_words
+    from . import remove_file_with_short_or_long_length
+    from . import remove_duplicate_file
+    # Image filters.
+    from . import img_blurred_images_cleaner
+    from . import img_duplicated_images_cleaner
+    from . import img_similar_images_cleaner
+    from . import img_advertisement_images_cleaner
+
+
+_import_operators()
--- a/runtime/ops/filter/file_with_high_repeat_phrase_rate_filter/init.py
+++ b/runtime/ops/filter/file_with_high_repeat_phrase_rate_filter/init.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+# Register the operator name with the OPERATORS registry together with the module path of its process module.
+OPERATORS.register_module(module_name='FileWithHighRepeatPhraseRateFilter',
+                          module_path="ops.filter.file_with_high_repeat_phrase_rate_filter.process")
--- a/runtime/ops/filter/file_with_high_repeat_phrase_rate_filter/metadata.yml
+++ b/runtime/ops/filter/file_with_high_repeat_phrase_rate_filter/metadata.yml
@@ -0,0 +1,31 @@
+name: '文档词重复率检查'
+description: '去除重复词过多的文档。'
+language: 'Python'
+vendor: 'Huawei'
+raw_id: 'FileWithHighRepeatPhraseRateFilter'
+version: '1.0.0'
+types:
+  - 'cleanse'
+modal: 'text'
+effect:
+  before: '机器机器机器机器机器机器机器机器机器机器学习学习学习学习学习'
+  after: ''
+inputs: 'text'
+outputs: 'text'
+settings:
+  repeatPhraseRatio:
+    name: 文档词重复率
+    description: 某个词的统计数/文档总词数 > 设定值，该文档被去除。
+    type: slider
+    defaultVal: 0.5
+    min: 0
+    max: 1
+    step: 0.1
+  hitStopwords:
+    name: 去除停用词
+    description: 统计重复词时，选择是否要去除停用词。
+    type: switch
+    defaultVal: false
+    required: true
+    checkedLabel: 去除
+    unCheckedLabel: 不去除
--- a/runtime/ops/filter/file_with_high_repeat_phrase_rate_filter/process.py
+++ b/runtime/ops/filter/file_with_high_repeat_phrase_rate_filter/process.py
@@ -0,0 +1,73 @@
+#!/usr/bin/python
+# -- encoding: utf-8 --
+
+"""
+Description: 词重复率过高文档过滤插件
+Create: 2023/11/7 9:26
+"""
+import re
+import time
+
+from collections import Counter
+from pathlib import Path
+from typing import Dict, Any
+from loguru import logger
+
+import jieba
+from datamate.core.base_op import Filter
+
+
+class FileWithHighRepeatPhraseRateFilter(Filter):
+    """词重复率过高文档过滤插件"""
+    PUNCTUATION_PATTERN = re.compile(r'^[\u3000-\u303F\uff00-\uffef\s\W_]+$')
+
+    def __init__(self, *args, **kwargs):
+        super(FileWithHighRepeatPhraseRateFilter, self).__init__(*args, **kwargs)
+        self._min_threshold = kwargs.get("repeatPhraseRatio", 0.5)  # 重复词符占全文的比例阈值，默认值为0.5
+        self._hit_stopword_trigger = kwargs.get("hitStopwords", False)  # 计算重复词率时是否去除停用词，默认为False不去除，True为去除
+        self._file_path = Path(__file__).parent / 'resources' / 'hit_stopwords.txt'
+        self._hit_stopwords = []
+        if self._hit_stopword_trigger:
+            with open(self._file_path, 'r', encoding='utf-8') as f:
+                self._hit_stopwords = f.read().splitlines()
+
+    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
+        start = time.time()
+        sample[self.text_key] = self._file_with_high_repeat_phrase_rate_filter(sample[self.text_key],
+                                                                               sample[self.filename_key])
+        logger.info(f"fileName: {sample[self.filename_key]}, "
+                    f"method: FileWithHighRepeatPhraseRateFilter costs {(time.time() - start):6f} s")
+        return sample
+
+    def _tokenize_by_jieba(self, text: str):
+        """基于jieba对输入文本进行切分
+
+        Args:
+            text: 输入文档内容
+        Returns:
+            words_list: 切割后的词列表
+        """
+
+        for word in jieba.lcut(text):
+            if not self.PUNCTUATION_PATTERN.match(word) and word not in self._hit_stopwords:
+                yield word
+
+    def _file_with_high_repeat_phrase_rate_filter(self, input_data: str, file_name):
+        if len(input_data) < 2:  # 词语长度至少2个字符
+            return input_data
+        words_list = self._tokenize_by_jieba(input_data)
+        words_count = dict(Counter(words_list))
+        words_count_max, words_total_count = 0, 0
+        for words in words_count:
+            # 只统计中文、字母，且长度大于1的词语
+            if len(words) > 1 and words.isalpha():
+                words_count_max = max(words_count_max, words_count.get(words))
+                words_total_count += words_count.get(words)
+        output_data = input_data
+        repeat_phrase_rate = words_count_max / words_total_count if words_total_count > 0 else 0
+        if repeat_phrase_rate >= self._min_threshold:
+            # 只要有一个词重复率高于阈值，就会过滤文档
+            output_data = ""
+            logger.info(f"The repeat phrase rate of the input data is {repeat_phrase_rate}. "
+                        f"Threshold is {self._min_threshold}. The document {file_name} is filtered.")
+        return output_data
--- a/runtime/ops/filter/file_with_high_repeat_phrase_rate_filter/resources/hit_stopwords.txt
+++ b/runtime/ops/filter/file_with_high_repeat_phrase_rate_filter/resources/hit_stopwords.txt
--- a/runtime/ops/filter/file_with_high_repeat_word_rate_filter/init.py
+++ b/runtime/ops/filter/file_with_high_repeat_word_rate_filter/init.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+# Register the operator name with the OPERATORS registry together with the module path of its process module.
+OPERATORS.register_module(module_name='FileWithHighRepeatWordRateFilter',
+                          module_path="ops.filter.file_with_high_repeat_word_rate_filter.process")
--- a/runtime/ops/filter/file_with_high_repeat_word_rate_filter/metadata.yml
+++ b/runtime/ops/filter/file_with_high_repeat_word_rate_filter/metadata.yml
@@ -0,0 +1,25 @@
+name: '文档字重复率检查'
+name_en: 'Character Repetition Rate Check'
+description: '去除重复字过多的文档。'
+description_en: 'Filters out files that contain excessive repeated characters.'
+language: 'python'
+vendor: 'huawei'
+raw_id: 'FileWithHighRepeatWordRateFilter'
+version: '1.0.0'
+types:
+  - 'cleanse'
+modal: 'text'
+effect:
+  before: '机器学学学学学学学学学学学学学学学学学学学学学学学学学学学学学学习'
+  after: ''
+inputs: 'text'
+outputs: 'text'
+settings:
+  repeatWordRatio:
+    name: 文档字重复率
+    description: 某个字的统计数/文档总字数 > 设定值，该文档被去除。
+    type: slider
+    defaultVal: 0.5
+    min: 0
+    max: 1
+    step: 0.1
--- a/runtime/ops/filter/file_with_high_repeat_word_rate_filter/process.py
+++ b/runtime/ops/filter/file_with_high_repeat_word_rate_filter/process.py
@@ -0,0 +1,51 @@
+#!/usr/bin/python
+# -- encoding: utf-8 --
+
+"""
+Description: 检查文档字重复率插件
+Create: 2023/11/7 9:26
+"""
+import re
+import time
+
+from collections import Counter
+from typing import Dict, Any
+from loguru import logger
+
+from datamate.core.base_op import Filter
+
+
+class FileWithHighRepeatWordRateFilter(Filter):
+    """检查文档字重复率插件"""
+
+    def __init__(self, *args, **kwargs):
+        super(FileWithHighRepeatWordRateFilter, self).__init__(*args, **kwargs)
+        self._min_threshold = kwargs.get("repeatWordRatio", 0.5)  # 重复字符占整行的比例阈值，默认值为0.5
+
+    @staticmethod
+    def _extract_word(input_data):
+        # 只统计中文字的重复率
+        extracted_word = re.sub(r'[^\u4e00-\u9fff]', '', input_data)
+        return extracted_word
+
+    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
+        start = time.time()
+        sample[self.text_key] = self._file_with_high_repeat_word_rate_filter(sample[self.text_key],
+                                                                             sample[self.filename_key])
+        logger.info(f"fileName: {sample[self.filename_key]}, "
+                    f"method: FileWithHighRepeatWordRateFilter costs {(time.time() - start):6f} s")
+        return sample
+
+    def _file_with_high_repeat_word_rate_filter(self, input_data: str, file_name):
+        tmp = self._extract_word(input_data)
+        if not tmp:
+            return input_data
+        output_data = input_data
+        words_count = Counter(tmp)
+        max_value = max(words_count.values())
+        repeat_word_rate = max_value / len(tmp)
+        if repeat_word_rate >= self._min_threshold:
+            output_data = ""
+            logger.info(f"The repeat word rate of the input data is {repeat_word_rate}. "
+                        f"Threshold is {self._min_threshold}. The document %s is filtered.")
+        return output_data
--- a/runtime/ops/filter/file_with_high_special_char_rate_filter/init.py
+++ b/runtime/ops/filter/file_with_high_special_char_rate_filter/init.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+# Register the operator name with the OPERATORS registry together with the module path of its process module.
+OPERATORS.register_module(module_name='FileWithHighSpecialCharRateFilter',
+                          module_path="ops.filter.file_with_high_special_char_rate_filter.process")
--- a/runtime/ops/filter/file_with_high_special_char_rate_filter/metadata.yml
+++ b/runtime/ops/filter/file_with_high_special_char_rate_filter/metadata.yml
@@ -0,0 +1,25 @@
+name: '文档特殊字符率检查'
+name_en: 'Special Character Rate Check'
+description: '去除特殊字符过多的文档。'
+description_en: 'Filters out files that contain excessive special characters.'
+language: 'python'
+vendor: 'huawei'
+raw_id: 'FileWithHighSpecialCharRateFilter'
+version: '1.0.0'
+types:
+  - 'cleanse'
+modal: 'text'
+effect:
+  before: '你好！@！@#！￥！@#'
+  after: ''
+inputs: 'text'
+outputs: 'text'
+settings:
+  specialCharRatio:
+    name: 文档特殊字符率
+    description: 特殊字符的统计数/文档总字数 > 设定值，该文档被去除。
+    type: slider
+    defaultVal: 0.3
+    min: 0
+    max: 1
+    step: 0.1
--- a/runtime/ops/filter/file_with_high_special_char_rate_filter/process.py
+++ b/runtime/ops/filter/file_with_high_special_char_rate_filter/process.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python
+# -- encoding: utf-8 --
+
+"""
+Description: 文档特殊字符率检查
+Create: 2023/11/7 9:26
+"""
+import time
+
+from pathlib import Path
+from typing import Dict, Any
+from loguru import logger
+
+from datamate.core.base_op import Filter
+
+
+class FileWithHighSpecialCharRateFilter(Filter):
+    """检查文档特殊字符率"""
+
+    def __init__(self, *args, **kwargs):
+        super(FileWithHighSpecialCharRateFilter, self).__init__(*args, **kwargs)
+        self._min_threshold = kwargs.get("specialCharRatio", 0.3)  # 特殊字符占全文比例阈值，默认值为0.3
+        self._file_path = Path(__file__).parent / 'resources' / 'special_token.txt'
+        with open(self._file_path, 'r', encoding='utf-8') as f:
+            self._special_token = set(f.read().splitlines())
+
+    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
+        start = time.time()
+        sample[self.text_key] = self._file_with_high_special_char_rate_filter(sample[self.text_key],
+                                                                              sample[self.filename_key])
+        logger.info(f"fileName: {sample[self.filename_key]}, "
+                    f"method: FileWithHighSpecialCharRateFilter costs {(time.time() - start):6f} s")
+        return sample
+
+    def _file_with_high_special_char_rate_filter(self, input_data: str, file_name):
+        if not input_data:
+            return ""
+
+        output_data = input_data
+        total = 0
+        for token in self._special_token:
+            total += input_data.count(token)
+
+        special_char_rate = total / len(input_data)
+        if special_char_rate >= self._min_threshold:
+            logger.info(f"The special char rate of the input data is {special_char_rate}. "
+                        f"Threshold is {self._min_threshold}. The document {file_name} is filtered.")
+            output_data = ""
+        return output_data
--- a/runtime/ops/filter/file_with_high_special_char_rate_filter/resources/special_token.txt
+++ b/runtime/ops/filter/file_with_high_special_char_rate_filter/resources/special_token.txt
@@ -0,0 +1,50 @@
+~
+·
+！
+@
+#
+￥
+%
+…
+&
+*
+（
+）
+—
+
+-
+=
+{
+}
+|
+【
+】
+、
+：
+“
+；
+‘
+《
+》
+？
+，
+。
+`
+!
+$
+^
+(
+)
+_
+[
+]
+\
+:
+"
+;
+'
+<
+>
+?
+,
+/
--- a/runtime/ops/filter/img_advertisement_images_cleaner/init.py
+++ b/runtime/ops/filter/img_advertisement_images_cleaner/init.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+# Register the operator name with the OPERATORS registry together with the module path of its process module.
+OPERATORS.register_module(module_name='ImgAdvertisementImagesCleaner',
+                          module_path="ops.filter.img_advertisement_images_cleaner.process")
--- a/runtime/ops/filter/img_advertisement_images_cleaner/metadata.yml
+++ b/runtime/ops/filter/img_advertisement_images_cleaner/metadata.yml
@@ -0,0 +1,16 @@
+name: '广告图片过滤'
+name_en: 'Ad Image Filter'
+description: '去除包含二维码的图片。'
+description_en: 'Removes images containing QR codes.'
+language: 'python'
+vendor: 'huawei'
+raw_id: 'ImgAdvertisementImagesCleaner'
+version: '1.0.0'
+types:
+  - 'cleanse'
+modal: 'image'
+effect:
+  before: ''
+  after: ''
+inputs: 'image'
+outputs: 'image'
--- a/runtime/ops/filter/img_advertisement_images_cleaner/process.py
+++ b/runtime/ops/filter/img_advertisement_images_cleaner/process.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+
+"""
+Description:
+Create: 2024/1/22 20:49
+"""
+import time
+from typing import Dict, Any
+
+import cv2
+import numpy as np
+from loguru import logger
+
+from datamate.common.utils import bytes_transform
+from datamate.core.base_op import Filter
+
+from .wechat_qrcode_model import WechatQRCodeModel
+
+
+class ImgAdvertisementImagesCleaner(Filter):
+    """去除广告图片的插件，当前仅支持去除二维码"""
+
+    def __init__(self, *args, **kwargs):
+        super(ImgAdvertisementImagesCleaner, self).__init__(*args, **kwargs)
+        self.img_resize = 1000  # 大图片的最长边压缩为1000
+        self.use_model = True
+        self.model = self.get_model(*args, **kwargs)
+
+    @staticmethod
+    def _detect_qr_code_using_anchor_point(img):
+        # 有些二维码和边缘紧贴，无法识别出整个矩形，所以我们先对图片大小进行扩展
+        expand_length = 10
+        edge = expand_length // 2
+        h, w = img.shape[:2]
+        image_extend = np.zeros((img.shape[0] + expand_length, img.shape[1] + expand_length, 3), np.uint8)
+        image_extend[:] = 255
+        image_extend[edge:edge + h, edge:edge + w] = img
+
+        # 转灰度、二值化、找轮廓
+        gray = cv2.cvtColor(image_extend, cv2.COLOR_BGR2GRAY)
+        # 中值滤波
+        blur_image = cv2.medianBlur(gray, 5)
+        _, thresh = cv2.threshold(blur_image, 127, 255, cv2.THRESH_BINARY)
+        contours, hir = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+
+        # 三个“回”字特征轮廓存储
+        parent_contours_list = []
+        hir_list = hir[0]
+        for i, item in enumerate(hir_list):
+            # 判断A轮廓是否有B轮廓
+            if item[2] == -1:
+                continue
+            else:
+                hir_b_index = item[2]
+            # 判断B轮廓是否有C轮廓
+            if hir_list[hir_b_index][2] == -1:
+                continue
+            hir_c_index = hir_list[hir_b_index][2]
+            # 计算A轮廓的周长和C轮廓周长的比值
+            hir_c_arc_length = cv2.arcLength(contours[hir_c_index], True)
+            if hir_c_arc_length:
+                error = cv2.arcLength(contours[i], True) / hir_c_arc_length
+                # 二维码每一个“回”的黑白框框的比例大概为1:1:3:1:1
+                # 理论上，A轮廓周长为28，C轮廓周长为12，A/C = error = 2.3333
+                if 1.5 <= error <= 3:
+                    parent_contours_list.append(contours[i])
+
+        # 若找到3个以上“回”字，该图片含有二维码
+        return len(parent_contours_list) >= 3
+
+    @staticmethod
+    def _detect_qr_code_using_wechat_model(img, file_name, model):
+        res = ""
+        try:
+            res, points = model.detectAndDecode(img)
+        except UnicodeDecodeError as ex:
+            res = ex.object.decode('ISO-8859-1').split(" ")[0]
+        except Exception as err:
+            logger.exception(f"fileName: {file_name}, method: ImgAdvertisementImagesCleaner. "
+                             f"An error occurred when using the WeChat model to detect the QR code. "
+                             f"The error is: {err}")
+        if res:
+            return True
+        return False
+
+    def init_model(self, *args, **kwargs):
+        return WechatQRCodeModel(*args, **kwargs).wechat_qr_model
+
+    def resize_img(self, image):
+        """图片等比压缩"""
+        height, width = image.shape[:2]  # 获取原图像的水平方向尺寸和垂直方向尺寸。
+        temp = max(height, width)
+        # 若图片最长边大于限值，对图片进行压缩，否则返回原图
+        if temp >= self.img_resize:
+            mul_temp = temp / self.img_resize
+            if height > width:
+                res = cv2.resize(image, (int(width / mul_temp), self.img_resize), interpolation=cv2.INTER_AREA)
+            elif height < width:
+                res = cv2.resize(image, (self.img_resize, int(height / mul_temp)), interpolation=cv2.INTER_AREA)
+            else:
+                res = cv2.resize(image, (self.img_resize, self.img_resize), interpolation=cv2.INTER_AREA)
+            return res
+        return image
+
+    def execute(self, sample: Dict[str, Any]):
+        start = time.time()
+        file_name = sample[self.filename_key]
+        file_type = "." + sample[self.filetype_key]
+        img_bytes = sample[self.data_key]
+        if img_bytes:
+            data = bytes_transform.bytes_to_numpy(img_bytes)
+            image = self._detect_advertisement_img(data, file_name, self.model)
+            sample[self.data_key] = bytes_transform.numpy_to_bytes(image, file_type)
+            logger.info(f"fileName: {file_name}, "
+                        f"method: ImgAdvertisementImagesCleaner costs {(time.time() - start):6f} s")
+        return sample
+
+    def _detect_advertisement_img(self, img, file_name, model):
+        """检测含有二维码的图片"""
+        img_resize = self.resize_img(img)
+        if self._detect_qr_code_using_wechat_model(img_resize, file_name, model) \
+                or self._detect_qr_code_using_anchor_point(img_resize):
+            logger.info(f"fileName: {file_name}, method: ImgAdvertisementImagesCleaner. "
+                        "The image contains advertisement. The image is filtered out.")
+            return np.array([])
+        return img
--- a/runtime/ops/filter/img_advertisement_images_cleaner/wechat_qrcode_model.py
+++ b/runtime/ops/filter/img_advertisement_images_cleaner/wechat_qrcode_model.py
@@ -0,0 +1,23 @@
+# -- encoding: utf-8 --
+
+import gc
+import os
+from pathlib import Path
+
+import cv2
+
+
+class WechatQRCodeModel:
+
+    def __init__(self, *args, **kwargs):
+        models_path = os.getenv("MODELS_PATH", "/home/models")
+        self.resources_path = str(Path(models_path, 'img_QRcode_detect', 'resources'))
+        self.wechat_qr_model = cv2.wechat_qrcode_WeChatQRCode(
+            str(Path(self.resources_path, 'detect.prototxt')),
+            str(Path(self.resources_path, 'detect.caffemodel')),
+            str(Path(self.resources_path, 'sr.prototxt')),
+            str(Path(self.resources_path, 'sr.caffemodel')))
+
+    def __del__(self):
+        del self.wechat_qr_model
+        gc.collect()
--- a/runtime/ops/filter/img_blurred_images_cleaner/init.py
+++ b/runtime/ops/filter/img_blurred_images_cleaner/init.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+# Register the operator name with the OPERATORS registry together with the module path of its process module.
+OPERATORS.register_module(module_name='ImgBlurredImagesCleaner',
+                          module_path="ops.filter.img_blurred_images_cleaner.process")
--- a/runtime/ops/filter/img_blurred_images_cleaner/metadata.yml
+++ b/runtime/ops/filter/img_blurred_images_cleaner/metadata.yml
@@ -0,0 +1,25 @@
+name: '模糊图片过滤'
+name_en: 'Fuzzy Image Filter'
+description: '去除模糊的图片。'
+description_en: 'Filters out fuzzy images.'
+language: 'python'
+vendor: 'huawei'
+raw_id: 'ImgBlurredImagesCleaner'
+version: '1.0.0'
+types:
+  - 'cleanse'
+modal: 'image'
+effect:
+  before: ''
+  after: ''
+inputs: 'image'
+outputs: 'image'
+settings:
+  blurredThreshold:
+    name: 梯度函数值
+    description: 梯度函数值取值越小，图片模糊度越高。
+    type: slider
+    defaultVal: 1000
+    min: 1
+    max: 10000
+    step: 1
--- a/runtime/ops/filter/img_blurred_images_cleaner/process.py
+++ b/runtime/ops/filter/img_blurred_images_cleaner/process.py
@@ -0,0 +1,50 @@
+# -- encoding: utf-8 --
+
+"""
+Description:
+Create: 2025/01/17
+"""
+import time
+
+from typing import Dict, Any
+
+import cv2
+import numpy as np
+from loguru import logger
+
+
+from datamate.common.utils import bytes_transform
+from datamate.core.base_op import Filter
+
+
+class ImgBlurredImagesCleaner(Filter):
+    """过滤模糊度低于阈值的图片插件"""
+
+    def __init__(self, *args, **kwargs):
+        super(ImgBlurredImagesCleaner, self).__init__(*args, **kwargs)
+        # 设置模糊度阈值
+        self._blurred_threshold = kwargs.get("blurredThreshold", 1000)
+
+    def execute(self, sample: Dict[str, Any]):
+        start = time.time()
+        img_bytes = sample[self.data_key]
+        file_name = sample[self.filename_key]
+        file_type = "." + sample[self.filetype_key]
+        if img_bytes:
+            data = bytes_transform.bytes_to_numpy(img_bytes)
+            blurred_images = self._blurred_images_filter(data, file_name)
+            sample[self.data_key] = bytes_transform.numpy_to_bytes(blurred_images, file_type)
+        logger.info(f"fileName: ｛file_name｝, method: ImagesBlurredCleaner costs {(time.time() - start):6f} s")
+        return sample
+
+    def _blurred_images_filter(self, image, file_name):
+        # 为方便与其他图片比较可以将图片resize到同一个大小
+        img_resize = cv2.resize(image, (112, 112))
+        # 将图片压缩为单通道的灰度图
+        gray = cv2.cvtColor(img_resize, cv2.COLOR_BGR2GRAY)
+        score = cv2.Laplacian(gray, cv2.CV_64F).var()
+        if score <= self._blurred_threshold:
+            logger.info(f"The image blur is {self._blurred_threshold}, "
+                        f"which exceeds the threshold of ｛score｝. ｛file_name｝ is filtered out.")
+            return np.array([])
+        return image
--- a/runtime/ops/filter/img_duplicated_images_cleaner/init.py
+++ b/runtime/ops/filter/img_duplicated_images_cleaner/init.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+# Register the operator name with the OPERATORS registry together with the module path of its process module.
+OPERATORS.register_module(module_name='ImgDuplicatedImagesCleaner',
+                          module_path="ops.filter.img_duplicated_images_cleaner.process")
--- a/runtime/ops/filter/img_duplicated_images_cleaner/metadata.yml
+++ b/runtime/ops/filter/img_duplicated_images_cleaner/metadata.yml
@@ -0,0 +1,16 @@
+name: '重复图片去除'
+name_en: 'Duplicate Image Removal'
+description: '去除重复的图片。'
+description_en: 'Removes duplicate images.'
+language: 'python'
+vendor: 'huawei'
+raw_id: 'ImgDuplicatedImagesCleaner'
+version: '1.0.0'
+types:
+  - 'cleanse'
+modal: 'image'
+effect:
+  before: ''
+  after: ''
+inputs: 'image'
+outputs: 'image'
--- a/runtime/ops/filter/img_duplicated_images_cleaner/process.py
+++ b/runtime/ops/filter/img_duplicated_images_cleaner/process.py
@@ -0,0 +1,109 @@
+# -- encoding: utf-8 --
+
+"""
+Description:
+基于MD5值计算当前图片与数据集中其它图片是否相同。相同该图片过滤，保留原数据集图片。
+将文件特征数据即MD5值，存到数据库。根据任务uuid获取历史文件特征，遍历特征并进行去重比较
+Create: 2025/1/7
+"""
+
+import json
+import time
+from pathlib import Path
+from typing import Dict, Any
+
+import cv2
+from Crypto.Hash import MD5
+from sqlalchemy import text
+from loguru import logger
+
+from datamate.sql_manager.sql_manager import SQLManager
+from datamate.common.utils import get_now_time
+from datamate.common.utils import bytes_to_numpy, numpy_to_bytes
+from datamate.core.base_op import Filter
+
+
+class ImgDuplicatedImagesCleaner(Filter):
+    """去除重复图片插件
+    基于MD5值计算当前图片与数据集中其它图片是否相同。相同该图片过滤，保留原数据集图片。
+    """
+
+    def __init__(self, *args, **kwargs):
+        # task_uuid为标识该数据集的唯一标志
+        super().__init__(*args, **kwargs)
+        self.task_uuid = kwargs.get("uuid", "")
+        self.img_resize = 200  # 图片压缩尺寸
+        # 获取数据库sql
+        self.sql_dict = self.load_sql_dict()
+        # 获取数据库连接池
+        self.conn = None  # 数据库连接
+        self.trans = None  # 数据库事务
+
+    @staticmethod
+    def load_sql_dict():
+        """获取sql语句"""
+        sql_config_path = str(Path(__file__).parent / 'sql' / 'sql_config.json')
+        with open(sql_config_path, 'r', encoding='utf-8') as f:
+            return json.load(f)
+
+    def compute_md5(self, img_bytes: bytes) -> str:
+        """将图片统一转化为png无损格式，计算每张图像的md5值"""
+        if not img_bytes:
+            return ""
+        img = bytes_to_numpy(img_bytes)
+        height, width = img.shape[:2]  # 获取原图像的水平方向尺寸和垂直方向尺寸。
+        res = cv2.resize(img, (int(width / height * self.img_resize), self.img_resize), interpolation=cv2.INTER_AREA)
+        img_bytes = numpy_to_bytes(res, ".png")
+        hash_md5 = MD5.new()
+        hash_md5.update(img_bytes)
+        return hash_md5.hexdigest()
+
+    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
+        """重复图片去重算子执行入口"""
+        start = time.time()
+        file_name = sample[self.filename_key]
+        self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
+        img_data = self._duplicate_images_filter(file_name, sample[self.data_key])
+        sample[self.data_key] = img_data
+        logger.info(
+            f"fileName: {file_name}, method: DuplicateImagesCleaner costs {(time.time() - start):6f} s")
+        return sample
+
+    def execute_sql(self, md5: str, file_name: str,
+                    img_bytes: bytes) -> bytes:
+        """从数据库中获取文件特征、比较MD5，插入新的文件特征"""
+        timestamp = get_now_time('Asia/Shanghai', '%Y-%m-%d %H:%M:%S', file_name,
+                                 "DuplicateImagesCleaner")
+        query_sql = str(self.sql_dict.get("query_sql"))
+        insert_sql = str(self.sql_dict.get("insert_sql"))
+        create_tables_sql = str(self.sql_dict.get("create_tables_sql"))
+        query_sql_params = {"task_uuid": self.task_uuid, "file_feature": md5}
+        insert_sql_params = {"task_uuid": self.task_uuid, "file_feature": md5, "file_name": file_name.encode("utf-8"),
+                             "timestamp": timestamp}
+
+        db_manager = SQLManager()
+        try:
+            self.conn = db_manager.create_connect()
+        except Exception as e:
+            logger.error(f"fileName: {file_name}, database connection failed: {str(e)}")
+            raise RuntimeError(82000, str(e)) from None
+
+        with self.conn as connection:
+            connection.execute(text(create_tables_sql))
+            # 判断是否有重复文件
+            result = connection.execute(text(query_sql, query_sql_params)).fetchall()
+            # 查询记录为空，无重复图片, 插入新文件特征
+            if not result:
+                connection.execute(text(insert_sql, insert_sql_params))
+                return img_bytes
+            logger.info(f"taskId: {self.task_uuid} fileName: {file_name}, method: Duplicate ImagesCleaner. "
+                        f"The image is duplicated and filtered ")
+        return b""
+
+    def _duplicate_images_filter(self, file_name: str, img_bytes: bytes) -> bytes:
+        """重复图片去重算子执行逻辑"""
+        # 如果文件为空，则无需去重，返回原图
+        if not img_bytes:
+            return img_bytes
+        md5 = self.compute_md5(img_bytes)
+        return self.execute_sql(md5, file_name, img_bytes)
--- a/runtime/ops/filter/img_duplicated_images_cleaner/sql/sql_config.json
+++ b/runtime/ops/filter/img_duplicated_images_cleaner/sql/sql_config.json
@@ -0,0 +1,5 @@
+{
+  "query_sql": "SELECT * FROM operator_duplicate_img_features WHERE task_uuid = :task_uuid AND file_feature = :file_feature",
+  "insert_sql": "INSERT INTO operator_duplicate_img_features (task_uuid, file_feature, file_name, timestamp) VALUES (:task_uuid, :file_feature, :file_name, :timestamp)",
+  "create_tables_sql": "CREATE TABLE IF NOT EXISTS operator_duplicate_img_features (id INT AUTO_INCREMENT PRIMARY KEY,task_uuid VARCHAR(255),file_feature TEXT,file_name TEXT,timestamp DATETIME);"
+}
--- a/runtime/ops/filter/img_similar_images_cleaner/init.py
+++ b/runtime/ops/filter/img_similar_images_cleaner/init.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+# Register the operator name with the OPERATORS registry together with the module path of its process module.
+OPERATORS.register_module(module_name='ImgSimilarImagesCleaner',
+                          module_path="ops.filter.img_similar_images_cleaner.process")
--- a/runtime/ops/filter/img_similar_images_cleaner/metadata.yml
+++ b/runtime/ops/filter/img_similar_images_cleaner/metadata.yml
@@ -0,0 +1,25 @@
+name: '相似图片去除'
+name_en: 'Similar Image Removal'
+description: '去除相似的图片。'
+description_en: 'Removes similar images.'
+language: 'python'
+vendor: 'huawei'
+raw_id: 'ImgSimilarImagesCleaner'
+version: '1.0.0'
+types:
+  - 'cleanse'
+modal: 'image'
+effect:
+  before: ''
+  after: ''
+inputs: 'image'
+outputs: 'image'
+settings:
+  similarThreshold:
+    name: 相似度
+    description: 相似度取值越大，图片相似度越高。
+    type: slider
+    defaultVal: 0.8
+    min: 0
+    max: 1
+    step: 0.01
--- a/runtime/ops/filter/img_similar_images_cleaner/process.py
+++ b/runtime/ops/filter/img_similar_images_cleaner/process.py
@@ -0,0 +1,238 @@
+# -- encoding: utf-8 --
+
+"""
+Description:
+    1.本算子结合感知哈希算法和ORB两个算法判断图片的相似性
+    2.感知哈希算法则是从图像的整体结构和特征维度来计算图片的相似度。
+    3.ORB算法可以用来对图像中的关键点快速创建特征向量，这些特征向量可以用来识别图像中的对象。通过比较两张图片的特征向量计算相似度。
+    4.感知哈希算法和ORB算法计算相似度高于0.75，则选择二者较大值；若低于0.75，则选择二者最小值作为相似度
+    5.将文件特征数据存到数据库。根据任务uuid获取历史文件特征，遍历特征并进行去重比较
+Create: 2025/1/7
+"""
+import json
+import time
+import zlib
+from pathlib import Path
+from typing import List, Dict, Any
+
+import cv2
+import numpy as np
+from sqlalchemy import text
+from loguru import logger
+
+from datamate.sql_manager.sql_manager import SQLManager
+from datamate.common.utils import get_now_time
+from datamate.common.utils import bytes_to_numpy
+from datamate.core.base_op import Filter
+
+MAX_RETRIES = 5
+BASE_DELAY = 1
+MAX_DELAY = 30  # 最大延时设置为30秒
+JITTER_FACTOR = 0.25  # 抖动因子为等待时间的25%
+MAX_FEATURES_NUM = 200
+
+
+def get_orb_des(image: np.ndarray) -> np.ndarray:
+    """Extract the ORB descriptor matrix of ``image``.
+
+    Args:
+        image: Decoded image array.
+
+    Returns:
+        The descriptor matrix computed by ORB, or an empty array when the
+        image is empty or no feature could be extracted.
+    """
+    if image.size == 0:
+        return np.array([])
+    detector = cv2.ORB_create()  # ORB detector with OpenCV defaults
+    detector.setMaxFeatures(MAX_FEATURES_NUM)  # cap the number of key points
+    _, descriptors = detector.detectAndCompute(image, None)
+    # detectAndCompute yields None instead of a matrix when nothing was found.
+    return np.array([]) if descriptors is None else descriptors
+
+
+class ImgSimilarImagesCleaner(Filter):
+    """Operator that removes images similar to ones already seen in the same task.
+
+    Every image is described by a perceptual hash (pHash) and an ORB descriptor
+    matrix. The features of previously processed images are read from the
+    ``operator_similar_img_features`` table (statements in
+    ``sql/sql_config.json``) for the current ``task_uuid``. If none of them is
+    similar enough the new features are inserted, otherwise the image is dropped.
+    """
+
+    DEFAULT_SIMILAR_THRESHOLD = 0.8  # default similarity threshold
+    DEFAULT_TASK_UUID = "uuid"  # fallback task UUID when none is configured
+    DEFAULT_ORB_RATIO = 0.8  # default ratio-test factor for ORB matches
+    DEFAULT_MIX_SIMILARITY = 0.75  # default cut-off for picking the max or the min similarity
+    DEFAULT_IMG_RESIZE = 200  # default image height used before ORB extraction
+    DEFAULT_PAGE_SIZE = 500  # default number of rows fetched per page
+
+    def __init__(self, *args, **kwargs):
+        """Read the operator settings from ``kwargs`` and load the SQL statements."""
+        super().__init__(*args, **kwargs)
+        # Images whose similarity reaches this value are treated as similar.
+        self.similar_threshold = kwargs.get("similarThreshold", self.DEFAULT_SIMILAR_THRESHOLD)
+        # task_uuid identifies the dataset whose images are compared with each other.
+        # NOTE(review): without a "uuid" kwarg every task shares DEFAULT_TASK_UUID -- confirm this is intended.
+        self.task_uuid = kwargs.get("uuid", self.DEFAULT_TASK_UUID)
+        self.orb_ratio = self.DEFAULT_ORB_RATIO  # ratio-test factor, empirical value
+        self.mix_similarity = self.DEFAULT_MIX_SIMILARITY  # max/min selection cut-off, empirical value
+        self.img_resize = self.DEFAULT_IMG_RESIZE  # image height used before ORB extraction
+        self.conn = None  # database connection
+        self.trans = None  # database transaction
+        self.page_size = self.DEFAULT_PAGE_SIZE  # rows per page
+        # SQL statements used by this operator
+        self.sql_dict = self.load_sql_dict()
+
+    @staticmethod
+    def load_sql_dict():
+        """Load the SQL statements from ``sql/sql_config.json`` next to this module."""
+        sql_config_path = str(Path(__file__).parent / 'sql' / 'sql_config.json')
+        with open(sql_config_path, 'r', encoding='utf-8') as f:
+            return json.load(f)
+
+    @staticmethod
+    def get_p_hash(image: np.ndarray) -> str:
+        """Compute the perceptual hash of ``image`` as a string of ``'0'``/``'1'``.
+
+        The image is shrunk to 8x8, converted with ``COLOR_BGR2GRAY`` and passed
+        through a DCT; each coefficient yields ``'1'`` when it is non-negative
+        and ``'0'`` otherwise.
+
+        Args:
+            image: Decoded image; presumably BGR, since ``COLOR_BGR2GRAY`` is
+                applied -- verify against ``bytes_to_numpy``.
+
+        Returns:
+            The hash string, or ``""`` for an empty image.
+        """
+        hashed_value = ""
+        if not image.size:
+            return hashed_value
+        gray_image = cv2.cvtColor(cv2.resize(image, (8, 8), interpolation=cv2.INTER_AREA), cv2.COLOR_BGR2GRAY)
+        dct_image = cv2.dct(np.float32(gray_image))
+        hashed_value = ''.join(['1' if x >= 0 else '0' for x in dct_image[:8, :8].flatten()])
+        return hashed_value
+
+    @staticmethod
+    def get_phash_similarity(hash_comparison: str, hash_compared: str) -> float:
+        """Return ``1 - hamming_distance / len(hash_comparison)`` for two hash strings.
+
+        Args:
+            hash_comparison: Hash of the current image.
+            hash_compared: Hash of a previously stored image.
+
+        Returns:
+            The similarity, or ``0.0`` when either hash is empty.
+        """
+        # An empty hash means an empty image: nothing to compare.
+        if not hash_comparison or not hash_compared:
+            return 0.0
+        # Hamming distance: number of positions whose bits differ.
+        distance = sum(
+            bit_comparison != bit_compared for bit_comparison, bit_compared in zip(hash_comparison, hash_compared))
+        similarity = 1 - distance / len(hash_comparison)
+        return similarity
+
+    @staticmethod
+    def _parse_shape(shape_text: str) -> tuple:
+        """Parse a stored ``str(ndarray.shape)`` such as ``"(200, 32)"`` or ``"(0,)"``.
+
+        Only integers are accepted, so database content is never evaluated as code.
+        """
+        parts = shape_text.strip().strip("()").split(",")
+        return tuple(int(part) for part in parts if part.strip())
+
+    def filter_similar_images(self, img: np.ndarray, file_name: str) -> np.ndarray:
+        """Check whether a similar image was already recorded for the task.
+
+        Args:
+            img: Decoded image.
+            file_name: Name of the file being processed.
+
+        Returns:
+            ``img`` when it is empty or no similar image exists, otherwise an
+            empty array.
+        """
+        # An empty file needs no de-duplication: return it unchanged.
+        if not img.size:
+            return img
+        p_hash = self.get_p_hash(img)
+        height, width = img.shape[:2]
+        # Scale to a fixed height while keeping the aspect ratio. The width is
+        # clamped to 1 because very tall images would otherwise truncate to a
+        # zero width, which cv2.resize rejects.
+        target_width = max(1, int(width / height * self.img_resize))
+        img_resize = cv2.resize(img, (target_width, self.img_resize), interpolation=cv2.INTER_AREA)
+        des_matrix = get_orb_des(img_resize)
+        return self.execute_sql(p_hash, des_matrix, file_name, img)
+
+    def get_orb_similarity(self, des_matrix: np.ndarray, des_matrix_history: np.ndarray, file_name: str,
+                           file_name_history: str) -> float:
+        """Return the share of ORB matches that pass the ratio test.
+
+        Args:
+            des_matrix: Descriptor matrix of the current image.
+            des_matrix_history: Descriptor matrix of a stored image.
+            file_name: Name of the current file (used for logging only).
+            file_name_history: Name of the stored file (used for logging only).
+
+        Returns:
+            ``good_matches / all_matches``, or ``0.0`` when a matrix is empty,
+            there are no matches, or matching fails.
+        """
+        # An empty descriptor matrix means no features: similarity is 0.
+        if not des_matrix.size or not des_matrix_history.size:
+            return 0.0
+        # The matrix with fewer rows becomes the train set; on a tie the one
+        # with the smaller trace is used.
+        train_matrix, query_matrix = des_matrix, des_matrix_history
+        if train_matrix.shape[0] > des_matrix_history.shape[0]:
+            train_matrix, query_matrix = des_matrix_history, des_matrix
+        elif des_matrix.shape[0] == des_matrix_history.shape[0]:
+            if np.trace(des_matrix) > np.trace(des_matrix_history):
+                train_matrix, query_matrix = des_matrix_history, des_matrix
+
+        try:
+            # Two nearest neighbours per query descriptor.
+            matches = (cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=False).
+                       knnMatch(query_matrix, trainDescriptors=train_matrix, k=2))
+            if not matches:
+                return 0.0
+            # Ratio test: keep a match only when the best neighbour is clearly
+            # closer than the second best.
+            count = 0
+            for (m, n) in matches:
+                if m.distance < self.orb_ratio * n.distance:
+                    count += 1
+            orb_similarity = count / len(matches)
+            return orb_similarity
+        except Exception as e:
+            # ASCII braces are required here: the previous fullwidth braces
+            # were emitted literally instead of being interpolated.
+            logger.exception(f"taskId: {self.task_uuid}, failed to compare the similarity between "
+                             f"{file_name} and {file_name_history}: {e}")
+            return 0.0
+
+    def execute_sql(self, p_hash: str, des_matrix: np.ndarray, file_name: str,
+                    img: np.ndarray) -> np.ndarray:
+        """Fetch stored features, compare similarity and insert the new features.
+
+        Args:
+            p_hash: Perceptual hash of the current image.
+            des_matrix: ORB descriptor matrix of the current image.
+            file_name: Name of the file being processed.
+            img: Decoded image, returned unchanged when it is kept.
+
+        Returns:
+            ``img`` when no similar image is stored, otherwise an empty array.
+
+        Raises:
+            RuntimeError: If the database connection cannot be created.
+        """
+        des_matrix_binary = zlib.compress(des_matrix.tobytes())  # compress the raw descriptor bytes
+        timestamp = get_now_time('Asia/Shanghai', '%Y-%m-%d %H:%M:%S', file_name,
+                                 "ImgSimilarCleaner")
+        query_task_uuid_sql = str(self.sql_dict.get("query_task_uuid_sql"))
+        insert_sql = str(self.sql_dict.get("insert_sql"))
+        create_tables_sql = str(self.sql_dict.get("create_tables_sql"))
+
+        db_manager = SQLManager()
+        try:
+            self.conn = db_manager.create_connect()
+        except Exception as e:
+            logger.error(f"fileName: {file_name}, database connection failed: {str(e)}")
+            raise RuntimeError(82000, str(e)) from None
+
+        # NOTE(review): presumably leaving the block commits and closes the
+        # connection -- verify against SQLManager.create_connect.
+        with self.conn as connection:
+            connection.execute(text(create_tables_sql))
+            # All rows of the task are fetched only to learn how many pages to scan.
+            # NOTE(review): a COUNT(*) statement would avoid loading the stored blobs.
+            result = connection.execute(text(query_task_uuid_sql), {"task_uuid": self.task_uuid}).fetchall()
+            total_count = len(result)
+            if self.has_similar_images(connection, des_matrix, file_name, p_hash, total_count):
+                return np.array([])
+
+            insert_data = {
+                "task_uuid": self.task_uuid,
+                "p_hash": p_hash,
+                "des_matrix": des_matrix_binary,
+                "matrix_shape": str(des_matrix.shape),
+                "file_name": file_name.encode("utf-8").hex(),
+                "timestamp": timestamp
+            }
+            connection.execute(text(insert_sql), insert_data)
+        return img
+
+    def has_similar_images(self, connection, des_matrix, file_name, p_hash, total_count):
+        """Scan the stored features page by page and report whether one is similar.
+
+        Args:
+            connection: Open database connection.
+            des_matrix: ORB descriptor matrix of the current image.
+            file_name: Name of the file being processed.
+            p_hash: Perceptual hash of the current image.
+            total_count: Number of rows stored for the task.
+
+        Returns:
+            ``True`` as soon as one page contains a similar image, else ``False``.
+        """
+        query_sql = self.sql_dict.get("query_sql")
+        for i in range(0, total_count, self.page_size):
+            # "ge" is bound to LIMIT and "le" to OFFSET in query_sql.
+            rows = connection.execute(
+                text(query_sql), {"task_uuid": self.task_uuid, "ge": self.page_size, "le": i}).fetchall()
+            # No rows on this page: nothing left to compare for this task.
+            if not rows:
+                break
+            # Compare the current image with every image of the page.
+            if self.determine_similar_images(rows, p_hash, des_matrix, file_name):
+                return True
+        return False
+
+    def determine_similar_images(self, file_features: List, p_hash: str, des_matrix: np.ndarray,
+                                 file_name: str) -> bool:
+        """Decide whether any stored feature row reaches the similarity threshold.
+
+        Args:
+            file_features: Rows of ``operator_similar_img_features`` in column
+                order (id, task_uuid, p_hash, des_matrix, matrix_shape,
+                file_name, timestamp).
+            p_hash: Perceptual hash of the current image.
+            des_matrix: ORB descriptor matrix of the current image.
+            file_name: Name of the file being processed.
+
+        Returns:
+            ``True`` when a stored image is similar, otherwise ``False``.
+        """
+        for signature in file_features:
+            pash_feature, orb_feature, matrix_shape, file_name_history = signature[2], signature[3], signature[4], \
+                signature[5]
+            if not pash_feature:
+                # Empty stored hash: the stored image was empty, skip the comparison.
+                continue
+            # Undo the zlib compression applied before the insert.
+            decompressed_data = zlib.decompress(orb_feature)
+            # Rebuild the matrix; the shape is parsed strictly instead of
+            # being passed to eval(), which would run database content as code.
+            des_matrix_history = np.frombuffer(decompressed_data, dtype=np.uint8).reshape(
+                self._parse_shape(matrix_shape))
+            # File names are stored as the hex form of their UTF-8 bytes.
+            bytes_data = bytes.fromhex(file_name_history)
+            file_name_decoded = bytes_data.decode('utf-8')
+
+            phash_similarity = self.get_phash_similarity(p_hash, pash_feature)
+            orb_similarity = self.get_orb_similarity(des_matrix, des_matrix_history, file_name, file_name_decoded)
+            max_similarity = max(phash_similarity, orb_similarity)
+            min_similarity = min(phash_similarity, orb_similarity)
+            # Trust the higher score only when it reaches the cut-off,
+            # otherwise fall back to the lower one.
+            if max_similarity >= self.mix_similarity:
+                result = max_similarity
+            else:
+                result = min_similarity
+            similarity = round(result, 2)
+            if similarity >= self.similar_threshold:
+                # loguru formats with str.format, so %-style placeholders were
+                # printed literally; an f-string is used instead.
+                logger.info(
+                    f"fileName: {file_name}, method: ImgSimilarCleaner, dataset: {self.task_uuid}. "
+                    f"This picture is similar to {file_name_decoded}, "
+                    f"and the similarity is {similarity:.4f}. The picture is filtered.")
+                return True
+        return False
+
+    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
+        """Entry point: blank out the image data of ``sample`` when it is a similar image.
+
+        Args:
+            sample: Record holding the file name under ``self.filename_key``
+                and the image bytes under ``self.data_key``.
+
+        Returns:
+            The same ``sample``; its data is set to ``b""`` when the filter
+            returned an empty array.
+        """
+        start = time.time()
+        file_name = sample[self.filename_key]
+        img_bytes = sample[self.data_key]
+        data = bytes_to_numpy(img_bytes) if img_bytes else np.array([])
+        similar_images = self.filter_similar_images(data, file_name)
+        # An empty result marks a similar (or empty) image: clear the data.
+        if not similar_images.size:
+            sample[self.data_key] = b""
+        logger.info(f"fileName: {file_name}, method: ImgSimilarCleaner costs {(time.time() - start):6f} s")
+        return sample
--- a/runtime/ops/filter/img_similar_images_cleaner/sql/sql_config.json
+++ b/runtime/ops/filter/img_similar_images_cleaner/sql/sql_config.json
@@ -0,0 +1,6 @@
+{
+  "query_sql": "SELECT * FROM operator_similar_img_features WHERE task_uuid = :task_uuid ORDER BY timestamp LIMIT :ge OFFSET :le",
+  "insert_sql": "INSERT INTO operator_similar_img_features (task_uuid,p_hash,des_matrix,matrix_shape,file_name,timestamp) VALUES (:task_uuid,:p_hash,:des_matrix,:matrix_shape,:file_name,:timestamp)",
+  "query_task_uuid_sql": "SELECT * FROM operator_similar_img_features WHERE task_uuid = :task_uuid",
+  "create_tables_sql": "CREATE TABLE IF NOT EXISTS operator_similar_img_features (id INT AUTO_INCREMENT PRIMARY KEY,task_uuid VARCHAR(255),p_hash TEXT,des_matrix BLOB,matrix_shape TEXT,file_name TEXT,timestamp DATETIME);"
+}
--- a/runtime/ops/filter/remove_duplicate_file/init.py
+++ b/runtime/ops/filter/remove_duplicate_file/init.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+OPERATORS.register_module(module_name='DuplicateFilesFilter',
+                          module_path="ops.filter.remove_duplicate_file.process")
--- a/runtime/ops/filter/remove_duplicate_file/metadata.yml
+++ b/runtime/ops/filter/remove_duplicate_file/metadata.yml
@@ -0,0 +1,25 @@
+name: '相似文档去除'
+name_en: 'Similar Document Removal'
+description: '相似文档去除。'
+description_en: 'Removes similar documents.'
+language: 'python'
+vendor: 'huawei'
+raw_id: 'DuplicateFilesFilter'
+version: '1.0.0'
+types:
+  - 'cleanse'
+modal: 'text'
+effect:
+  before: '这篇文档跟数据集中的另一篇文档内容几乎一样，执行该算子后，这篇文档会被去除。'
+  after: ''
+inputs: 'text'
+outputs: 'text'
+settings:
+  fileDuplicateThreshold:
+    name: 文档相似度
+    description: 基于MinHash算法和Jaccard相似度，计算当前文档与数据集中其它文档相似性，超过设定值，该文档被去除。
+    type: slider
+    defaultVal: 0.5
+    min: 0
+    max: 1
+    step: 0.1
--- a/runtime/ops/filter/remove_duplicate_file/process.py
+++ b/runtime/ops/filter/remove_duplicate_file/process.py
@@ -0,0 +1,158 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""
+Description: 文档局部内容去重
+Create: 2025/01/07
+"""
+
+import json
+import re
+import time
+from pathlib import Path
+from typing import List, Dict, Any
+
+import numpy as np
+from datasketch import MinHash
+from sqlalchemy import text
+from loguru import logger
+
+from datamate.sql_manager.sql_manager import SQLManager
+from datamate.common.utils import get_now_time
+from datamate.core.base_op import Filter
+
+
+class DuplicateFilesFilter(Filter):
+    """Operator that removes documents similar to ones already seen in the same task.
+
+    A MinHash signature is computed for the current document and compared with
+    the signatures stored in the ``operators_similar_text_features`` table for
+    the same ``task_uuid``. When the estimated Jaccard similarity reaches the
+    configured threshold the document text is replaced by an empty string,
+    otherwise the new signature is stored.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Read the operator settings from ``kwargs`` and load the SQL statements."""
+        super().__init__(*args, **kwargs)
+        # Characters on which a document is split into tokens. The ASCII double
+        # quotes are escaped explicitly: written as a bare "" they ended the
+        # literal (implicit string concatenation) and were silently dropped.
+        self.punctuation_pattern = "。.？?！!，,；;：:（）()【】{}[]“”\"\"‘’''/\n"
+        # Similarity threshold, 0.5 by default
+        self.duplicate_th = kwargs.get("fileDuplicateThreshold", 0.5)
+        # task_uuid identifies the dataset whose documents are compared with each other
+        self.task_uuid = kwargs.get("uuid", "")
+        # Database connection
+        self.conn = None
+        # Database transaction
+        self.trans = None
+        # Rows fetched per page
+        self.page_size = 500
+        # SQL statements used by this operator
+        self.sql_dict = self.load_sql_dict()
+
+    @staticmethod
+    def load_sql_dict():
+        """Load the SQL statements from ``sql/sql_config.json`` next to this module."""
+        sql_config_path = str(Path(__file__).parent / 'sql' / 'sql_config.json')
+        with open(sql_config_path, 'r', encoding='utf-8') as f:
+            return json.load(f)
+
+    def get_minhash(self, input_text: str) -> MinHash:
+        """Build the MinHash signature of a document.
+
+        The text is split on the characters of ``self.punctuation_pattern`` and
+        every resulting piece (stripped, UTF-8 encoded) is added to the signature.
+
+        Args:
+            input_text: Document content.
+
+        Returns:
+            The MinHash signature of the document.
+        """
+        text_minhash = MinHash()
+        for word in re.split(f"[{re.escape(self.punctuation_pattern)}]", input_text.strip()):
+            text_minhash.update(word.strip().encode('utf8'))
+        return text_minhash
+
+    def deduplicate_files(self, sample: Dict[str, Any], file_name: str) -> str:
+        """Remove the document when a similar one was already recorded.
+
+        Args:
+            sample: Record holding the document text under ``self.text_key``.
+            file_name: Name of the file being processed.
+
+        Returns:
+            The original text when it is empty or no similar document exists,
+            otherwise an empty string.
+        """
+        input_text = sample[self.text_key]
+        if not input_text:
+            return input_text
+        text_minhash = self.get_minhash(input_text)
+        return self.execute_sql(text_minhash, file_name, input_text)
+
+    def execute_sql(self, text_minhash: MinHash, file_name: str,
+                    input_text: str) -> str:
+        """Fetch stored features, compare similarity and insert the new feature.
+
+        Args:
+            text_minhash: MinHash signature of the current document.
+            file_name: Name of the file being processed.
+            input_text: Document content, returned unchanged when it is kept.
+
+        Returns:
+            ``input_text`` when no similar document is stored, otherwise ``""``.
+
+        Raises:
+            RuntimeError: If the database connection cannot be created.
+        """
+        timestamp = get_now_time('Asia/Shanghai', '%Y-%m-%d %H:%M:%S', file_name,
+                                 "DuplicateFilesFilter")
+        minhash_values = text_minhash.hashvalues
+        # Serialise the NumPy array of hash values as text for the TEXT column.
+        minhash_values_string = np.array2string(minhash_values)
+        query_task_uuid_sql = self.sql_dict.get("query_task_uuid_sql")
+        insert_sql = self.sql_dict.get("insert_sql")
+        create_tables_sql = self.sql_dict.get("create_tables_sql")
+        db_manager = SQLManager()
+        try:
+            self.conn = db_manager.create_connect()
+        except Exception as e:
+            logger.error(f"fileName: {file_name}, database connection failed: {str(e)}")
+            raise RuntimeError(82000, str(e)) from None
+        # NOTE(review): presumably leaving the block commits and closes the
+        # connection -- verify against SQLManager.create_connect.
+        with self.conn as connection:
+            connection.execute(text(create_tables_sql))
+            # All rows of the task are fetched only to learn how many pages to scan.
+            result = connection.execute(text(query_task_uuid_sql), {"task_uuid": self.task_uuid}).fetchall()
+            total_count = len(result)
+            if self.has_similar_text(connection, file_name, text_minhash, total_count):
+                return ""
+            insert_data = {
+                "task_uuid": self.task_uuid,
+                "file_feature": minhash_values_string,
+                "file_name": file_name.encode("utf-8").hex(),
+                "timestamp": timestamp
+            }
+            connection.execute(text(insert_sql), insert_data)
+        return input_text
+
+    def has_similar_text(self, connection, file_name, text_minhash, total_count) -> bool:
+        """Scan the stored features page by page and report whether one is similar.
+
+        Args:
+            connection: Open database connection.
+            file_name: Name of the file being processed.
+            text_minhash: MinHash signature of the current document.
+            total_count: Number of rows stored for the task.
+
+        Returns:
+            ``True`` as soon as one page contains a similar document, else ``False``.
+        """
+        query_sql = self.sql_dict.get("query_sql")
+        for i in range(0, total_count, self.page_size):
+            # "ge" is bound to LIMIT and "le" to OFFSET in query_sql.
+            rows = connection.execute(
+                text(query_sql), {"task_uuid": self.task_uuid, "ge": self.page_size, "le": i}).fetchall()
+            # No rows on this page: nothing left to compare for this task.
+            if not rows:
+                break
+            # Compare the current document with every document of the page.
+            if self.determine_similar_text(rows, text_minhash, file_name):
+                return True
+        return False
+
+    def determine_similar_text(self, file_features: List, text_minhash: MinHash, file_name: str) -> bool:
+        """Decide whether any stored signature reaches the similarity threshold.
+
+        Args:
+            file_features: Rows of ``operators_similar_text_features`` in column
+                order (id, task_uuid, file_feature, file_name, timestamp).
+            text_minhash: MinHash signature of the current document.
+            file_name: Name of the file being processed.
+
+        Returns:
+            ``True`` when a stored document is similar, otherwise ``False``.
+        """
+        for signature in file_features:
+            # Stored signature text and stored (hex-encoded) file name.
+            file_feature, file_name_history = signature[2], signature[3]
+            if not file_feature:
+                continue
+            # Rebuild the stored signature from its "[v1 v2 ...]" text form.
+            minhash_obj = MinHash(num_perm=128)
+            minhash_obj.hashvalues = np.fromstring(file_feature.strip('[]'), dtype=np.uint64, sep=' ')
+            similarity = text_minhash.jaccard(minhash_obj)
+
+            if similarity >= self.duplicate_th:
+                # File names are stored as the hex form of their UTF-8 bytes;
+                # decoding is only needed for the log line below.
+                file_name_decoded = bytes.fromhex(file_name_history).decode('utf-8')
+                logger.info(f"taskId: {self.task_uuid}, fileName: {file_name} is similar to {file_name_decoded}, "
+                            f"and the similarity is {similarity:.4f}")
+                return True
+        return False
+
+    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
+        """Entry point: blank out the text of ``sample`` when it is a similar document.
+
+        Args:
+            sample: Record holding the file name under ``self.filename_key``
+                and the text under ``self.text_key``.
+
+        Returns:
+            The same ``sample`` with its text kept or replaced by ``""``.
+        """
+        start = time.time()
+        file_name = sample[self.filename_key]
+        # Fall back to the sample's "instance_id" when no task UUID was configured.
+        self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
+        sample[self.text_key] = self.deduplicate_files(sample, file_name)
+        logger.info(f"taskId: {self.task_uuid} fileName: {file_name}, "
+                    f"method: DuplicateFilesFilter costs {(time.time() - start):6f} s")
+        return sample
--- a/runtime/ops/filter/remove_duplicate_file/sql/sql_config.json
+++ b/runtime/ops/filter/remove_duplicate_file/sql/sql_config.json
@@ -0,0 +1,6 @@
+{
+  "query_sql": "SELECT * FROM operators_similar_text_features WHERE task_uuid = :task_uuid ORDER BY timestamp LIMIT :ge OFFSET :le",
+  "create_tables_sql": "CREATE TABLE IF NOT EXISTS operators_similar_text_features (id INT AUTO_INCREMENT PRIMARY KEY, task_uuid VARCHAR(255),file_feature TEXT,file_name TEXT,timestamp DATETIME);",
+  "insert_sql": "INSERT INTO operators_similar_text_features (task_uuid, file_feature, file_name, timestamp) VALUES (:task_uuid, :file_feature, :file_name, :timestamp)",
+  "query_task_uuid_sql": "SELECT * FROM operators_similar_text_features WHERE task_uuid = :task_uuid"
+}
--- a/runtime/ops/filter/remove_file_with_many_sensitive_words/init.py
+++ b/runtime/ops/filter/remove_file_with_many_sensitive_words/init.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+OPERATORS.register_module(module_name='FileWithManySensitiveWordsFilter',
+                          module_path="ops.filter.remove_file_with_many_sensitive_words.process")
--- a/runtime/ops/filter/remove_file_with_many_sensitive_words/metadata.yml
+++ b/runtime/ops/filter/remove_file_with_many_sensitive_words/metadata.yml
@@ -0,0 +1,25 @@
+name: '文档敏感词率检查'
+name_en: 'Sensitive Word Rate Check'
+description: '去除敏感词过多的文档。'
+description_en: 'Filters out files that contain excessive sensitive phrases.'
+language: 'python'
+vendor: 'huawei'
+raw_id: 'FileWithManySensitiveWordsFilter'
+version: '1.0.0'
+types:
+  - 'cleanse'
+modal: 'text'
+effect:
+  before: '出售硝酸甘油出售硝酸甘油出售硝酸甘油出售硝酸甘油'
+  after: ''
+inputs: 'text'
+outputs: 'text'
+settings:
+  sensitiveWordsRate:
+    name: 文档敏感词率
+    description: 敏感词的字数/文档总字数 > 设定值，该文档被去除。
+    type: slider
+    defaultVal: 0.01
+    min: 0
+    max: 1
+    step: 0.01
--- a/runtime/ops/filter/remove_file_with_many_sensitive_words/process.py
+++ b/runtime/ops/filter/remove_file_with_many_sensitive_words/process.py
@@ -0,0 +1,116 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""
+Description: 过滤敏感词过多的文档（支持自定义阈值）
+Create: 2023/12/7 15:43
+"""
+import sys
+import time
+from pathlib import Path
+from typing import Dict, Any
+
+from loguru import logger
+
+from datamate.core.base_op import Filter
+from datamate.common.utils.aho_corasick import build_trie, add_fail_pointer
+
+sys.setrecursionlimit(5000)
+
+
+class AhoCorasic:
+    """Target-string search based on an Aho-Corasick automaton."""
+
+    def __init__(self, words):
+        """Build the trie for ``words`` and attach the fail pointers."""
+        trie = build_trie(words)
+        self._root = add_fail_pointer(trie)
+
+    def search_and_count(self, text: str, special_symbols: set):
+        """
+        Match sensitive words and count how many characters they cover.
+
+        Args:
+            text: Text to scan.
+            special_symbols: Characters that are skipped during matching.
+        Returns:
+            Number of characters counted as belonging to sensitive words.
+        """
+        root = self._root
+        current = root
+        counted = 0
+        run_len = 0  # length of the run currently being matched
+
+        for ch in text:
+            # Special symbols neither advance nor reset the automaton.
+            if ch in special_symbols:
+                continue
+
+            # Follow fail pointers until a node accepts ch or the root is reached.
+            while ch not in current.child and current != root:
+                if current.fail == root:
+                    # Falling back to the root discards the current run.
+                    run_len = 0
+                current = current.fail
+
+            if ch not in current.child:
+                # Only reachable at the root: no word starts with ch.
+                run_len = 0
+                continue
+
+            current = current.child.get(ch)
+            run_len += 1
+            if current.word:
+                # End of a word: add the run and start a new one.
+                counted += run_len
+                run_len = 0
+        return counted
+
+
+class FileWithManySensitiveWordsFilter(Filter):
+    """Filter plugin that removes documents containing too many sensitive words.
+
+    The word lists (violent, sexual, political) and the special-symbol list
+    are read from the ``resources`` directory next to this module.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Load the word lists and build the Aho-Corasick automaton."""
+        super().__init__(*args, **kwargs)
+        resource_dir = Path(__file__).parent / 'resources'
+        # Documents whose sensitive-word rate reaches this value are removed (default 0.01).
+        self._file_sensitive_words_rate = kwargs.get("sensitiveWordsRate", 0.01)
+        self.violent_words = self.load_words_list(str(resource_dir / 'violent.txt'))
+        self.sexual_words = self.load_words_list(str(resource_dir / 'sexual.txt'))
+        self.political_words = self.load_words_list(str(resource_dir / 'political.txt'))
+        self.special_symbols = self.load_words_list(str(resource_dir / 'special_symbols.txt'))
+        # Symbols and line/tab characters are excluded from the character count.
+        self.symbols = self.special_symbols | {"\n", "\t", "\r"}
+        self.words = self.violent_words | self.sexual_words | self.political_words
+        self.ac_automaton = AhoCorasic(self.words)
+
+    @staticmethod
+    def load_words_list(path):
+        """Read ``path`` and return its lines as a set."""
+        with open(path, 'r', encoding='utf-8') as f:
+            return set(f.read().splitlines())
+
+    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
+        """Entry point: blank out the text of ``sample`` when it has too many sensitive words."""
+        start = time.time()
+        filtered_text = self._file_with_many_sensitive_words_filter(sample[self.text_key],
+                                                                    sample[self.filename_key])
+        sample[self.text_key] = filtered_text
+        logger.info(f"fileName: {sample[self.filename_key]}, "
+                    f"method: FileWithManySensitiveWordsFilter costs {(time.time() - start):6f} s")
+        return sample
+
+    def _file_with_many_sensitive_words_filter(self, input_data: str, file_name):
+        """Return ``""`` when the sensitive-word rate reaches the threshold, else the input text."""
+        # Count every character that is not a symbol.
+        total_count = sum(1 for s in input_data if s not in self.symbols)
+        if not total_count:
+            return input_data
+
+        # Rate = characters of sensitive words / total characters, symbols excluded.
+        # NOTE(review): metadata.yml documents the rule as "> threshold" while
+        # the code compares with ">=" -- confirm which one is intended.
+        sensitive_count = self.ac_automaton.search_and_count(input_data, self.special_symbols)
+        sensitive_rate = sensitive_count / total_count
+        if sensitive_rate < self._file_sensitive_words_rate:
+            return input_data
+        logger.info(f"This document contains too many sensitive words. "
+                    f"The proportion of sensitive words is {sensitive_rate}. "
+                    f"Threshold is {self._file_sensitive_words_rate}. The document {file_name} is filtered.")
+        return ""
--- a/runtime/ops/filter/remove_file_with_many_sensitive_words/resources/political.txt
+++ b/runtime/ops/filter/remove_file_with_many_sensitive_words/resources/political.txt
@@ -0,0 +1,321 @@
+习近平
+平近习
+xjp
+习太子
+习明泽
+老习
+温家宝
+温加宝
+温x
+温jia宝
+温宝宝
+温加饱
+温加保
+张培莉
+温云松
+温如春
+温jb
+胡温
+胡x
+胡jt
+胡boss
+胡总
+胡王八
+hujintao
+胡jintao
+胡j涛
+胡惊涛
+胡景涛
+胡紧掏
+湖紧掏
+胡紧套
+锦涛
+hjt
+胡派
+胡主席
+刘永清
+胡海峰
+胡海清
+江泽民
+民泽江
+江胡
+江主席
+江书记
+江浙闽
+江沢民
+江浙民
+茳泽民
+zemin
+ze民
+老江
+老j
+江core
+江x
+江派
+江zm
+jzm
+江戏子
+江蛤蟆
+江某某
+江贼
+江猪
+江氏集团
+江绵恒
+江绵康
+王冶坪
+江泽慧
+邓小平
+平小邓
+xiao平
+邓xp
+邓晓平
+邓朴方
+邓榕
+邓质方
+毛泽东
+猫泽东
+猫则东
+猫贼洞
+毛zd
+毛zx
+z东
+ze东
+泽d
+zedong
+毛太祖
+毛相
+主席画像
+改革历程
+朱镕基
+朱容基
+朱镕鸡
+朱容鸡
+朱云来
+李鹏
+李peng
+里鹏
+李月月鸟
+李小鹏
+李小琳
+华主席
+华国
+国锋
+国峰
+锋同志
+白春礼
+薄熙来
+薄一波
+蔡赴朝
+蔡武
+曹刚川
+常万全
+陈炳德
+陈德铭
+陈建国
+陈良宇
+陈绍基
+陈同海
+陈至立
+戴秉国
+丁一平
+董建华
+杜德印
+杜世成
+傅锐
+郭伯雄
+郭金龙
+贺国强
+胡春华
+耀邦
+华建敏
+黄华华
+黄丽满
+黄兴国
+回良玉
+贾庆林
+贾廷安
+靖志远
+李长春
+李春城
+李建国
+李克强
+李岚清
+李沛瑶
+李荣融
+李瑞环
+李铁映
+李先念
+李学举
+李源潮
+栗智
+梁光烈
+廖锡龙
+林树森
+林炎志
+林左鸣
+令计划
+柳斌杰
+刘奇葆
+刘少奇
+刘延东
+刘云山
+刘志军
+龙新民
+路甬祥
+罗箭
+吕祖善
+马飚
+马恺
+孟建柱
+欧广源
+强卫
+沈跃跃
+宋平顺
+粟戎生
+苏树林
+孙家正
+铁凝
+屠光绍
+王东明
+汪东兴
+王鸿举
+王沪宁
+王乐泉
+王洛林
+王岐山
+王胜俊
+王太华
+王学军
+王兆国
+王振华
+吴邦国
+吴定富
+吴官正
+无官正
+吴胜利
+吴仪
+奚国华
+习仲勋
+徐才厚
+许其亮
+徐绍史
+杨洁篪
+叶剑英
+由喜贵
+于幼军
+俞正声
+袁纯清
+曾培炎
+曾庆红
+曾宪梓
+曾荫权
+张德江
+张定发
+张高丽
+张立昌
+张荣坤
+张志国
+赵洪祝
+紫阳
+周生贤
+周永康
+朱海仑
+中南海
+大陆当局
+中国当局
+北京当局
+共产党
+党产共
+共贪党
+阿共
+产党共
+公产党
+工产党
+共c党
+共x党
+共铲
+供产
+共惨
+供铲党
+供铲谠
+供铲裆
+共残党
+共残主义
+共产主义的幽灵
+拱铲
+老共
+中共
+中珙
+中gong
+gc党
+贡挡
+gong党
+g产
+狗产蛋
+共残裆
+恶党
+邪党
+共产专制
+共产王朝
+裆中央
+土共
+土g
+共狗
+g匪
+共匪
+仇共
+症腐
+政腐
+政付
+正府
+政俯
+政f
+zhengfu
+政zhi
+挡中央
+档中央
+中国zf
+中央zf
+国wu院
+中华帝国
+gong和
+大陆官方
+北京政权
+江泽民
+胡锦涛
+温家宝
+习近平
+习仲勋
+贺国强
+贺子珍
+周永康
+李长春
+李德生
+王岐山
+姚依林
+回良玉
+李源潮
+李干成
+戴秉国
+黄镇
+刘延东
+刘瑞龙
+俞正声
+黄敬
+薄熙
+薄一波
+周小川
+周建南
+温云松
+徐明
+江泽慧
+江绵恒
+江绵康
+李小鹏
+李鹏
+李小琳
+朱云来
+朱容基
+法轮功
+李洪志
+新疆骚乱
--- a/runtime/ops/filter/remove_file_with_many_sensitive_words/resources/sexual.txt
+++ b/runtime/ops/filter/remove_file_with_many_sensitive_words/resources/sexual.txt
@@ -0,0 +1,288 @@
+爱液
+按摩棒
+拔出来
+爆草
+包二奶
+暴干
+暴奸
+暴乳
+爆乳
+暴淫
+被操
+被插
+被干
+逼奸
+仓井空
+插暴
+操逼
+操黑
+操烂
+肏你
+肏死
+操死
+操我
+厕奴
+插比
+插b
+插逼
+插进
+插你
+插我
+插阴
+潮吹
+潮喷
+成人电影
+成人论坛
+成人色情
+成人网站
+成人文学
+成人小说
+艳情小说
+成人游戏
+吃精
+抽插
+春药
+大波
+大力抽送
+大乳
+荡妇
+荡女
+盗撮
+发浪
+放尿
+肥逼
+粉穴
+干死你
+干穴
+肛交
+肛门
+龟头
+裹本
+国产av
+豪乳
+黑逼
+后穴
+虎骑
+换妻俱乐部
+黄片
+几吧
+鸡吧
+鸡巴
+鸡奸
+妓女
+奸情
+叫床
+脚交
+精液
+就去日
+巨屌
+菊花洞
+菊门
+巨奶
+巨乳
+菊穴
+开苞
+口爆
+口活
+口交
+口射
+口淫
+狂操
+狂插
+浪逼
+浪妇
+浪叫
+浪女
+漏乳
+露b
+乱交
+乱伦
+轮暴
+轮操
+轮奸
+裸陪
+买春
+美逼
+美少妇
+美乳
+美腿
+美穴
+美幼
+秘唇
+迷奸
+密穴
+蜜穴
+蜜液
+摸奶
+摸胸
+母奸
+奈美
+奶子
+男奴
+内射
+嫩逼
+嫩女
+嫩穴
+捏弄
+女优
+炮友
+砲友
+喷精
+屁眼
+前凸后翘
+强jian
+强暴
+强奸处女
+情趣用品
+情色
+拳交
+全裸
+群交
+人妻
+人兽
+日逼
+日烂
+肉棒
+肉逼
+肉唇
+肉洞
+肉缝
+肉棍
+肉茎
+肉具
+揉乳
+肉穴
+肉欲
+乳爆
+乳房
+乳沟
+乳交
+乳头
+骚逼
+骚比
+骚女
+骚水
+骚穴
+色逼
+色情网站
+色区
+色色
+色诱
+色欲
+色b
+射爽
+射颜
+食精
+释欲
+兽奸
+兽交
+手淫
+兽欲
+熟妇
+熟母
+熟女
+爽片
+双臀
+死逼
+丝袜
+丝诱
+松岛枫
+酥痒
+汤加丽
+套弄
+体奸
+体位
+舔脚
+舔阴
+调教
+偷欢
+推油
+脱内裤
+文做
+舞女
+吸精
+夏川纯
+相奸
+小逼
+小穴
+小xue
+性感妖娆
+性感诱惑
+性虎
+性饥渴
+性技巧
+性交
+性奴
+性虐
+性息
+性欲
+穴口
+穴图
+亚情
+颜射
+阳具
+杨思敏
+要射了
+一夜欢
+一夜情
+一ye情
+阴部
+淫虫
+阴唇
+淫荡
+阴道
+淫电影
+阴阜
+淫妇
+淫河
+阴核
+阴户
+淫贱
+淫叫
+淫教师
+阴茎
+阴精
+淫浪
+淫媚
+淫糜
+淫魔
+淫母
+淫女
+淫虐
+淫妻
+淫情
+淫色
+淫声浪语
+淫兽学园
+淫书
+淫术炼金士
+淫水
+淫娃
+淫威
+淫亵
+淫样
+淫液
+淫照
+阴b
+应召
+幼交
+欲火
+欲女
+玉乳
+玉穴
+援交
+原味内衣
+援助交际
+招鸡
+招妓
+抓胸
+自慰
+作爱
+a片
+fuck
+gay片
+g点
+h动画
+h动漫
+失身粉
+淫荡自慰器
--- a/runtime/ops/filter/remove_file_with_many_sensitive_words/resources/special_symbols.txt
+++ b/runtime/ops/filter/remove_file_with_many_sensitive_words/resources/special_symbols.txt
@@ -0,0 +1,50 @@
+!
+ 
+.
+,
+#
+$
+%
+&
+*
+(
+)
+|
+?
+/
+@
+"
+'
+;
+[
+]
+{
+}
+
+~
+-
+_
+=
+^
+<
+>
+！
+。
+，
+￥
+（
+）
+？
+、
+“
+‘
+；
+【
+】
+——
+…
+……
+《
+》
+:
+：
--- a/runtime/ops/filter/remove_file_with_many_sensitive_words/resources/violent.txt
+++ b/runtime/ops/filter/remove_file_with_many_sensitive_words/resources/violent.txt
@@ -0,0 +1,452 @@
+出售雷管
+炸药
+出售炸药
+出售炸药电话
+制作火药配方
+炸药出售
+出售雷管炸药
+火药配方
+在家里做原子弹
+硝酸甘油炸弹制作
+TNT
+炸弹的制作
+硝铵炸药配方
+硝酸甘油制作
+炸弹配方
+简易炸药
+火药制作简易炸弹
+炸弹制作
+硝酸甘油
+硝酸甘油炸弹制作方法
+手把手教你做炸弹
+恐怖分子傻瓜手册
+氢弹手工制作方法
+起爆器
+自制燃烧弹
+燃烧弹
+弹药制造方法
+液体炸药配方
+炸药配方与制作
+自制炸药配方
+tnt炸药成分
+炸药的制作
+tnt炸药配方
+各种炸药配方大全
+炸药的成分
+燃烧弹的制作方
+黑火药的配方
+黑火药的制作
+黑火药比例
+制作燃烧弹
+黑火药配方比例
+黑火药的成分
+原子弹配方
+原子弹DIY教程
+C4炸药
+雷管出售
+制作硝化甘油
+DIY核弹
+核弹制作方法
+原子弹制作手册
+简易原子弹教程
+黑火药配方
+自制炸药方法
+教你怎么用土办法做武器
+zz炸弹的制作方法
+C4炸药的成分
+炸药制造生产工艺
+原子弹制造完全手册
+汽车炸弹制作
+人体炸弹制作流程
+炸弹制作从入门
+鸡尾酒炸弹制作
+集束炸弹制作
+硝酸铵的简便制取方法
+莫洛托夫燃烧弹
+塑胶炸弹
+用火药混合塑料
+高性能爆炸物质混合
+原子弹制作方法
+手教你制作原子弹
+轻型巡航导弹的制作
+黑索金的制造过程
+Zha药制造进程
+购买自爆材料
+可塑炸弹制作方法
+C4塑胶炸药
+水浴法制TNT
+简易核弹头的制造
+高爆炸药的基本配方
+如何制硝化甘油
+燃烧剂的简易制备
+简易大威力炸弹
+炸弹制作入门
+硝酸甘油炸药制作
+基本配方是硝酸甘
+燃烧剂的简易制作
+制作性能优良的燃烧弹
+酸甘油炸药
+简易导弹制作
+核弹头的制造
+可塑炸弹制作
+炸药硝酸甘油制法
+原子弹制作方法
+火药的配方
+手教你做原子弹
+怎么用土办法做武器
+手把手教你制作原子弹
+燃烧弹的制作方法
+怎么自制燃烧弹
+TNT炸弹制作
+高爆炸药基本配方
+爆破和杀伤燃烧弹
+炸弹制作从入门到精
+写的炸弹制作教程
+原子弹制作方法
+教你制作原子弹
+下硝化甘油的制作方法
+DIY原子弹
+原子弹制作
+炸药及其制造方法
+ATOM弹制造
+三分钟教你原子弹
+火药原理
+简易炸弹
+简易燃烧弹
+土炸药成份
+你怎么用土办法做武器
+燃烧弹制作
+燃烧瓶的制作方法
+炸弹制作手册
+原子弹制作方法DIY
+汽油三分之一白糖混合
+炸药的制作配方
+燃烧弹的制作方
+炸药的配方
+如何自制燃烧弹
+标准炸弹教程
+黑火药和硝酸钾农药混合
+恐怖分子自己制造炸弹
+手柄制作炸弹引爆器
+制作炸弹首先要
+燃烧瓶制作
+出售炸药
+出售气枪
+出售炸药QQ
+出售气枪QQ
+出售气枪电话
+出售手枪QQ
+出售手枪电话
+出售猎枪
+出售猎枪QQ
+出售猎枪电话
+出售汽枪
+出售步枪
+出售狙击枪
+出售手狗
+出售气狗
+出售猎狗
+手枪出售
+64式手枪出售
+92式手枪出售
+高仿真枪
+出售54式手枪
+出售M1911
+出售各类猎狗
+出售国产军用54手枪
+出售火枪游戏
+出售进口金属枪模
+出售仿真枪
+出售麻醉枪免收任何定金
+出售手枪
+出售外国枪模
+出售袖珍手枪
+娥眉气枪出售
+防身手枪QQ
+防身武器手枪
+仿真枪QQ
+钢珠枪小口径步枪
+高压狗QQ
+高压气枪
+气枪子弹
+高压气枪电狗专卖
+高压气枪
+气枪货到付款
+高压气枪专卖店
+各类军用枪
+各类军用枪支
+各式气枪出售
+工字牌气枪出售专卖
+气枪
+工字汽狗麻醉枪
+供应军用手枪
+供应汽枪
+购买枪支QQ
+国产汽狗专卖
+气枪货
+集团出售手枪
+集团出售手枪气枪
+进口气枪
+气枪子弹
+军用54出售
+军用77出售
+军用手枪钢珠枪
+里有汽枪出售
+猎枪出售
+猎枪仿真枪QQ
+猎枪麻醉枪钢珠枪
+猎枪哪里有卖
+猎枪手枪买卖
+猎枪销售手枪销售气枪
+猎枪专卖店
+猎枪子弹底火专卖
+麻醉枪出售可验货
+麻醉枪猎枪军用
+买卖54狗
+买卖64狗
+买卖警察枪支
+买卖军狗
+买卖军用枪支
+卖64手枪
+卖92手枪
+卖钢珠枪QQ
+哪里能买到自制手枪
+哪里有钢珠枪
+哪里有连发猎枪买卖
+哪里有卖双管猎
+哪里有气枪卖
+哪里有枪出售
+哪里有真枪出售
+哪里有真枪卖买
+哪能买到真枪
+出售手枪气枪猎
+气狗
+高压气枪
+气狗出售
+进口气狗
+气枪专卖网
+气枪出售
+气枪专卖网
+猎枪
+钢珠枪
+气枪仿真枪出售
+气枪购买
+气枪组装
+气枪货到付款
+汽狗出售
+气枪价格
+仿真气枪
+气枪价格
+气枪专卖店
+气枪看货加QQ
+气枪联系QQ
+气枪猎枪麻醉枪
+气枪迷魂药
+汽狗高压汽枪
+铅弹
+上海工字气枪
+全金属仿真枪专卖
+热武器QQ
+三箭气枪出售
+枪猎
+手槍买卖自制手枪
+售枪购买手枪
+售手枪QQ
+司马仿真枪模型QQ
+淘宝气枪货到付款
+提供气狗专卖店
+提供专业及时的气狗
+秃鹰气枪货到付款
+锡峰气枪出售
+仙鹤气枪出售
+买卖77式手枪
+销售电手狗
+销售气手狗
+销售小口径步枪
+销售运动步枪
+小口径秃鹰沙漠
+有军用枪支出售
+长期出售手枪
+折叠狗QQ
+转让猎枪
+子弹货到付款
+自制手枪哪里买
+自制手枪哪里有卖
+阻击枪
+汽枪
+高压气枪
+组装手枪
+最新销售汽狗
+出售警用手铐
+仿真枪
+猎枪
+真枪
+手枪
+阻击枪
+弓弩专营
+弓弩专卖网货到付款
+弓弩专卖网
+弓弩专卖货到付款
+弓弩专卖店网
+弓弩专卖店
+弓弩专卖
+弓弩制作
+弓弩直销
+弓弩有限公司
+弓弩营销中心
+弓弩一百专卖店
+弓弩销售
+弓弩网
+弓弩图纸
+弓弩特许经销
+弓弩狩猎网
+自制手弩
+追风弓弩麻醉箭专卖
+专业弓弩网
+中国战神军用弓弩
+中国弩弓专卖
+中国弓弩专卖网
+中国弓弩直销
+中国弓弩网
+中国弓弩狩猎网
+中国弓驽网
+制作简易弓弩
+郑州弓弩专卖
+赵氏弓弩专卖网
+赵氏弓弩专卖店
+赵氏弓弩专卖
+赵氏弓弩销售
+小型弓弩专卖店
+小猎人弓弩网
+狩猎器材弓弩专卖
+狩猎器材弓弩
+狩猎弓弩专卖网
+狩猎弓弩专卖
+狩猎弓弩麻醉箭
+手枪式折叠三用弩
+三利达弓弩专卖网
+三利达弓弩直营
+三利达弓弩配件
+三步倒药箭批发
+三步倒弩箭专卖
+三步倒麻醉弩箭销售
+三步倒麻醉箭专卖
+三步倒麻醉箭
+三步倒捕狗药
+军用弓弩专卖网
+军用弓弩专卖店
+军用弓弩批发
+军用弓弩公司
+供应三利达弓弩麻醉箭
+供应三步倒麻醉箭
+供应秦氏弓弩
+供应弩用麻醉箭
+供应弩捕狗箭
+供应麻醉箭三步倒
+供应麻醉箭批发
+供应麻醉箭
+供应军用弩折叠弩
+供应军用弓弩专卖
+供应精品弓弩
+供应弓弩麻醉箭
+供应弓弩
+供应钢珠弓弩
+弓弩商城专卖
+弓弩商城
+弓弩亲兄弟货到付款
+弓弩批发
+弓弩免定金货到付款
+弓弩麻醉箭
+弓弩麻醉镖
+弓弩论坛
+钢珠弓弩专卖网
+钢珠弓弩专卖店
+打狗弓弩三步倒
+麻醉弓弩专卖店
+出售军刀
+出售军刺
+出售弹簧刀
+出售三棱刀
+出售跳刀
+军刀网
+南方军刀网
+户外军刀网
+三棱军刺专卖
+出售开山刀军刺
+西点军刀网
+军刀专卖
+戈博军刀
+阿兰德龙户外
+出售军品军刀
+勃朗宁军刀
+军刀军品网
+阿兰得龙野营刀具网
+出售军刺军刀
+警用刀具出售
+折刀专卖网
+阳江军品军刀网
+野营刀专卖
+砍刀精品折刀专卖
+匕首蝴蝶甩刀专卖
+军刀专卖军刺
+军刀专卖刀具批发
+军刀图片砍刀
+军刀网军刀专卖
+军刀价格军用刀具
+军品军刺网
+军刀军刺甩棍
+阳江刀具批发网
+北方先锋军刀
+正品军刺出售
+野营军刀出售
+开山刀砍刀出售
+仿品军刺出售
+军刀直刀专卖
+手工猎刀专卖
+自动跳刀专卖
+军刀电棍销售
+军刀甩棍销售
+美国军刀出售
+极端武力折刀
+防卫棍刀户外刀具
+阿兰德龙野营刀
+仿品军刺网
+野营砍刀户外军刀
+手工猎刀户外刀具
+中国户外刀具网
+西点军品军刀网
+野营开山刀军刺
+三利达弓弩军刀
+尼泊尔军刀出售
+防卫野营砍刀出售
+防卫著名军刀出售
+防卫棍刀出售
+防卫甩棍出售
+防卫电棍出售
+军刺野营砍刀出售
+著名精品折刀出售
+战术军刀出售
+刺刀专卖网
+户外军刀出售
+阳江刀具直销网
+冷钢刀具直销网
+防卫刀具直销网
+极端武力直销网
+刀具直销网
+军刀直销网
+直刀匕首直销网
+军刀匕首直销网
+折刀砍刀军品网
+野营刀具军品网
+阳江刀具军品网
+冷钢刀具军品网
+防卫刀具军品网
+极端武力军品网
+军用刀具军品网
+军刀直刀军品网
+折刀砍刀专卖
+野营刀具专卖
+阳江刀具专卖
+冷钢刀具专卖
+防卫刀具专卖
+出售美军现役军刀
--- a/runtime/ops/filter/remove_file_with_short_or_long_length/init.py
+++ b/runtime/ops/filter/remove_file_with_short_or_long_length/init.py
@@ -0,0 +1,6 @@
+# -*- coding: utf-8 -*-
+
+from datamate.core.base_op import OPERATORS
+
+# Register the operator with the shared OPERATORS registry at import time.
+# `module_name` is the operator's class name; `module_path` is the dotted path of
+# this package's `process` module, where that class is defined.
+OPERATORS.register_module(module_name='FileWithShortOrLongLengthFilter',
+                          module_path="ops.filter.remove_file_with_short_or_long_length.process")
--- a/runtime/ops/filter/remove_file_with_short_or_long_length/metadata.yml
+++ b/runtime/ops/filter/remove_file_with_short_or_long_length/metadata.yml
@@ -0,0 +1,34 @@
+name: '文档字数检查'
+name_en: 'Word Count Check'
+description: '字数不在指定范围会被过滤掉。'
+description_en: 'Filters out documents whose word count is not in the specified range.'
+language: 'python'
+vendor: 'huawei'
+raw_id: 'FileWithShortOrLongLengthFilter'
+version: '1.0.0'
+types:
+  - 'cleanse'
+modal: 'text'
+effect:
+  before: '过短文本'
+  after: ''
+inputs: 'text'
+outputs: 'text'
+settings:
+  fileLength:
+    name: 文档字数
+    description: '过滤字数不在指定范围内的文档，如[10,10000000]。若输入为空，则不对字数上/下限做限制。'
+    type: range
+    properties:
+      - name: fileMinimumLength
+        type: inputNumber
+        defaultVal: 10
+        min: 0
+        max: 10000000000000000
+        step: 1
+      - name: fileMaximumLength
+        type: inputNumber
+        defaultVal: 10000000
+        min: 0
+        max: 10000000000000000
+        step: 1
--- a/runtime/ops/filter/remove_file_with_short_or_long_length/process.py
+++ b/runtime/ops/filter/remove_file_with_short_or_long_length/process.py
@@ -0,0 +1,54 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""
+Description: Filter out documents whose character count is outside the specified range (custom thresholds supported).
+Create: 2025/01/16
+"""
+
+import re
+import time
+from typing import Dict, Any
+
+from loguru import logger
+
+from datamate.core.base_op import Filter
+
+
+class FileWithShortOrLongLengthFilter(Filter):
+    """检查文档字数目，词数目不在指定范围会被过滤掉（支持自定义阈值）"""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        file_length_list = kwargs.get("fileLength", [10, 10000000])  # [下限，上限]，默认字数下限为10, 默认字数上限为10000000
+        if len(file_length_list) != 2:  # 要求传入字数目上限和字数目下限
+            logger.error(f"method: FileWithShortOrLongLengthFilter expected 2 arguments, got {len(file_length_list)}")
+            raise RuntimeError(82001, "method: FileWithShortOrLongLengthFilter expected 2 arguments") from None
+        # 用户不输入下限参数时前端传入''，则不对字数目下限控制
+        self._file_minimum_length = 0 if not file_length_list[0] else file_length_list[0]
+        # 用户不输入上限参数时前端传入''，则不对字数目上限控制
+        self._file_maximum_length = float("inf") if not file_length_list[1] else file_length_list[1]
+
+    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
+        start = time.time()
+        sample[self.text_key] = self._file_with_short_or_long_length_filter(sample[self.text_key],
+                                                                            sample[self.filename_key])
+        logger.info(f"fileName: {sample[self.filename_key]}, "
+                    f"method: FileWithShortOrLongLengthFilter costs {(time.time() - start):6f} s")
+        return sample
+
+    def _strip_unicode_whitespace(self, text: str):
+        # 常见 Unicode 空格符（涵盖普通空格、全角空格、零宽空格等）
+        pattern = r'[\u0020\u00A0\u1680\u2000-\u200F\u202F\u205F\u3000]+'
+        # 匹配首尾的空格符
+        pattern = fr'^{pattern}|{pattern}$'
+        return re.sub(pattern, '', text)
+
+    def _file_with_short_or_long_length_filter(self, input_data: str, file_name):
+        input_data_tmp = self._strip_unicode_whitespace(input_data)
+        if len(input_data_tmp) < self._file_minimum_length or len(input_data_tmp) > self._file_maximum_length:
+            logger.info(f"The length of input_data is: {len(input_data_tmp)}, "
+                        f"which is not within the threshold range of {self._file_minimum_length} "
+                        f"and {self._file_maximum_length}. {file_name} is filtered.")
+            return ""
+        return input_data