init datamate

Dallas98
2025-10-21 23:00:48 +08:00
commit 1c97afed7d
692 changed files with 135442 additions and 0 deletions

runtime/ops/README.md

@@ -0,0 +1,89 @@
# Custom Operator Development Guide
## Operator Specification
### Operator Metadata Format
Every custom operator must include a `metadata.yml` file:
```yaml
name: '落盘算子'
name_en: 'save file operator'
description: '将文件内容保存为文件。'
description_en: 'Save the file data as a file.'
language: 'Python'
vendor: 'Huawei'
raw_id: 'FileExporter'
version: '1.0.0'
types:
- 'collect'
modal: 'others'
effect:
before: ''
after: ''
inputs: 'all'
outputs: 'all'
```
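Operators that expose tunable parameters additionally declare a `settings` block in `metadata.yml`. The sketch below mirrors the slider setting used by the repeat-phrase filter elsewhere in this commit; treat the exact schema as illustrative rather than exhaustive:
```yaml
settings:
  repeatPhraseRatio:          # key is passed to the operator as a keyword argument
    name: 文档词重复率
    description: 某个词的统计数/文档总词数 > 设定值,该文档被去除。
    type: slider              # other types seen in this commit: switch, range, inputNumber
    defaultVal: 0.5
    min: 0
    max: 1
    step: 0.1
```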
### Operator Implementation
Create a `process.py` file:
```python
# -*- coding: utf-8 -*-
"""
Description: JSON text extraction
Create: 2024/06/06 15:43
"""
import time
from loguru import logger
from typing import Dict, Any
from datamate.core.base_op import Mapper
class TextFormatter(Mapper):
"""把输入的json文件流抽取为txt"""
def __init__(self, *args, **kwargs):
super(TextFormatter, self).__init__(*args, **kwargs)
@staticmethod
def _extract_json(byte_io):
"""将默认使用utf-8编码的Json文件流解码,抽取为txt"""
# 用utf-8-sig的格式进行抽取,可以避免uft-8 BOM编码格式的文件在抽取后产生隐藏字符作为前缀。
return byte_io.decode("utf-8-sig").replace("\r\n", "\n")
def byte_read(self, sample: Dict[str, Any]):
filepath = sample[self.filepath_key]
with open(filepath, "rb") as file:
byte_data = file.read()
sample[self.data_key] = byte_data
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
try:
self.byte_read(sample)
sample[self.text_key] = self._extract_json(sample[self.data_key])
sample[self.data_key] = b"" # 将sample[self.data_key]置空
logger.info(
f"fileName: {sample[self.filename_key]}, method: TextFormatter costs {(time.time() - start):6f} s")
except UnicodeDecodeError as err:
logger.exception(f"fileName: {sample[self.filename_key]}, method: TextFormatter causes decode error: {err}")
raise
return sample
```
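`execute()` receives a single sample dictionary and returns it after mutation. A minimal sketch of its shape follows; the key names here are assumptions for illustration, since the real keys come from the `Mapper` base class via `self.filepath_key`, `self.data_key`, `self.text_key` and `self.filename_key`:
```python
# Illustrative only: concrete key names are assumptions, the real ones are
# supplied by the Mapper base class.
sample = {
    "filepath": "/data/input/example.json",  # read by byte_read()
    "filename": "example.json",
    "data": b"",   # raw bytes, filled by byte_read() and cleared after extraction
    "text": "",    # extracted text, filled by execute()
}
```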
Create an `__init__.py` file:
```python
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='TextFormatter',
module_path="ops.formatter.text_formatter.process")
```
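Judging from the `module_path` used at registration time, each operator is packaged as a directory under `ops/<category>/<operator_name>/` containing the three files above. The layout below is inferred from this commit rather than prescribed by it:
```
ops/
└── formatter/
    └── text_formatter/
        ├── __init__.py    # registers TextFormatter with OPERATORS
        ├── metadata.yml   # operator metadata
        └── process.py     # operator implementation
```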


@@ -0,0 +1,49 @@
{
"name": "text_length_filter",
"displayName": "文本长度过滤器",
"version": "1.0.0",
"author": "DataMate Team",
"description": "根据文本长度过滤数据,支持最小和最大长度限制",
"category": "数据清洗",
"type": "CUSTOM",
"inputs": [
{
"name": "input_data",
"type": "array",
"description": "输入文本数组",
"required": true
}
],
"outputs": [
{
"name": "filtered_data",
"type": "array",
"description": "过滤后的文本数组"
}
],
"parameters": [
{
"name": "min_length",
"type": "integer",
"description": "最小文本长度",
"default": 10,
"min": 0
},
{
"name": "max_length",
"type": "integer",
"description": "最大文本长度",
"default": 1000,
"min": 1
},
{
"name": "text_field",
"type": "string",
"description": "文本字段名称(如果输入是对象数组)",
"default": "text"
}
],
"tags": ["文本处理", "数据过滤", "长度检查"],
"documentation": "https://docs.datamate.com/operators/text-length-filter",
"repository": "https://github.com/datamate/operators/tree/main/text-length-filter"
}


@@ -0,0 +1,135 @@
"""
文本长度过滤器算子
根据设定的最小和最大长度过滤文本数据
"""
import json
import logging
from typing import Dict, Any, List, Union
logger = logging.getLogger(__name__)
class TextLengthFilter:
"""文本长度过滤器算子"""
def __init__(self):
self.name = "text_length_filter"
self.version = "1.0.0"
def execute(self, config: Dict[str, Any], context: Dict[str, Any]) -> Dict[str, Any]:
"""执行文本长度过滤"""
logger.info(f"开始执行算子: {self.name}")
# 获取参数
parameters = config.get('parameters', {})
min_length = parameters.get('min_length', 10)
max_length = parameters.get('max_length', 1000)
text_field = parameters.get('text_field', 'text')
logger.info(f"过滤参数: min_length={min_length}, max_length={max_length}, text_field={text_field}")
# 验证参数
if min_length < 0:
raise ValueError("min_length must be >= 0")
if max_length < min_length:
raise ValueError("max_length must be >= min_length")
# 读取输入数据
input_path = context['input_path']
with open(input_path, 'r', encoding='utf-8') as f:
input_data = json.load(f)
if not isinstance(input_data, list):
raise ValueError("输入数据必须是数组格式")
logger.info(f"输入数据条数: {len(input_data)}")
# 执行过滤
filtered_data = []
stats = {
'total_input': len(input_data),
'too_short': 0,
'too_long': 0,
'filtered_out': 0,
'kept': 0
}
for i, item in enumerate(input_data):
try:
# 提取文本内容
if isinstance(item, str):
text = item
elif isinstance(item, dict) and text_field in item:
text = str(item[text_field])
else:
logger.warning(f"跳过无法处理的数据项 {i}: {type(item)}")
stats['filtered_out'] += 1
continue
# 检查长度
text_length = len(text)
if text_length < min_length:
stats['too_short'] += 1
stats['filtered_out'] += 1
elif text_length > max_length:
stats['too_long'] += 1
stats['filtered_out'] += 1
else:
filtered_data.append(item)
stats['kept'] += 1
# 进度报告
if (i + 1) % 1000 == 0:
progress = (i + 1) / len(input_data) * 100
logger.info(f"处理进度: {progress:.1f}% ({i + 1}/{len(input_data)})")
except Exception as e:
logger.warning(f"处理数据项 {i} 时出错: {e}")
stats['filtered_out'] += 1
continue
# 保存结果
output_path = context['output_path']
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(filtered_data, f, ensure_ascii=False, indent=2)
# 准备返回结果
result = {
'status': 'success',
'statistics': stats,
'filter_rate': stats['filtered_out'] / stats['total_input'] * 100 if stats['total_input'] > 0 else 0,
'output_path': output_path
}
logger.info(f"过滤完成: {stats}")
logger.info(f"过滤率: {result['filter_rate']:.2f}%")
return result
def validate_config(self, config: Dict[str, Any]) -> List[str]:
"""验证配置参数"""
errors = []
parameters = config.get('parameters', {})
min_length = parameters.get('min_length')
max_length = parameters.get('max_length')
if min_length is not None and not isinstance(min_length, int):
errors.append("min_length must be an integer")
if max_length is not None and not isinstance(max_length, int):
errors.append("max_length must be an integer")
if min_length is not None and min_length < 0:
errors.append("min_length must be >= 0")
if min_length is not None and max_length is not None and max_length < min_length:
errors.append("max_length must be >= min_length")
return errors
def create_operator():
"""算子工厂函数"""
return TextLengthFilter()
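# --- Illustrative usage (not part of the original operator) -----------------
# A minimal sketch of driving the operator standalone, based only on the
# execute(config, context) contract above. The file paths are made up, and
# input.json is assumed to hold a JSON array of strings or objects.
if __name__ == "__main__":
    op = create_operator()
    result = op.execute(
        config={"parameters": {"min_length": 10, "max_length": 1000, "text_field": "text"}},
        context={"input_path": "input.json", "output_path": "filtered.json"},
    )
    print(result["statistics"])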


@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
import sys
from pathlib import Path
from datamate.common.utils.custom_importer import CustomImporter
def _configure_importer():
base_path = Path(__file__).resolve().parent
sys.meta_path.append(CustomImporter(base_path))
_configure_importer()
def _import_operators():
from . import file_with_high_repeat_phrase_rate_filter
from . import file_with_high_repeat_word_rate_filter
from . import file_with_high_special_char_rate_filter
from . import remove_file_with_many_sensitive_words
from . import remove_file_with_short_or_long_length
from . import remove_duplicate_file
from . import img_blurred_images_cleaner
from . import img_duplicated_images_cleaner
from . import img_similar_images_cleaner
from . import img_advertisement_images_cleaner
_import_operators()


@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='FileWithHighRepeatPhraseRateFilter',
module_path="ops.filter.file_with_high_repeat_phrase_rate_filter.process")


@@ -0,0 +1,31 @@
name: '文档词重复率检查'
description: '去除重复词过多的文档。'
language: 'Python'
vendor: 'Huawei'
raw_id: 'FileWithHighRepeatPhraseRateFilter'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '机器机器机器机器机器机器机器机器机器机器学习学习学习学习学习'
after: ''
inputs: 'text'
outputs: 'text'
settings:
repeatPhraseRatio:
name: 文档词重复率
description: 某个词的统计数/文档总词数 > 设定值,该文档被去除。
type: slider
defaultVal: 0.5
min: 0
max: 1
step: 0.1
hitStopwords:
name: 去除停用词
description: 统计重复词时,选择是否要去除停用词。
type: switch
defaultVal: false
required: true
checkedLabel: 去除
unCheckedLabel: 不去除


@@ -0,0 +1,73 @@
#!/usr/bin/python
# -- encoding: utf-8 --
"""
Description: 词重复率过高文档过滤插件
Create: 2023/11/7 9:26
"""
import re
import time
from collections import Counter
from pathlib import Path
from typing import Dict, Any
from loguru import logger
import jieba
from datamate.core.base_op import Filter
class FileWithHighRepeatPhraseRateFilter(Filter):
"""词重复率过高文档过滤插件"""
PUNCTUATION_PATTERN = re.compile(r'^[\u3000-\u303F\uff00-\uffef\s\W_]+$')
def __init__(self, *args, **kwargs):
super(FileWithHighRepeatPhraseRateFilter, self).__init__(*args, **kwargs)
self._min_threshold = kwargs.get("repeatPhraseRatio", 0.5) # 重复词符占全文的比例阈值,默认值为0.5
self._hit_stopword_trigger = kwargs.get("hitStopwords", False) # 计算重复词率时是否去除停用词,默认为False不去除,True为去除
self._file_path = Path(__file__).parent / 'resources' / 'hit_stopwords.txt'
self._hit_stopwords = []
if self._hit_stopword_trigger:
with open(self._file_path, 'r', encoding='utf-8') as f:
self._hit_stopwords = f.read().splitlines()
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._file_with_high_repeat_phrase_rate_filter(sample[self.text_key],
sample[self.filename_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: FileWithHighRepeatPhraseRateFilter costs {(time.time() - start):6f} s")
return sample
def _tokenize_by_jieba(self, text: str):
"""基于jieba对输入文本进行切分
Args:
text: 输入文档内容
Returns:
words_list: 切割后的词列表
"""
for word in jieba.lcut(text):
if not self.PUNCTUATION_PATTERN.match(word) and word not in self._hit_stopwords:
yield word
def _file_with_high_repeat_phrase_rate_filter(self, input_data: str, file_name):
if len(input_data) < 2: # 词语长度至少2个字符
return input_data
words_list = self._tokenize_by_jieba(input_data)
words_count = dict(Counter(words_list))
words_count_max, words_total_count = 0, 0
for words in words_count:
# 只统计中文、字母,且长度大于1的词语
if len(words) > 1 and words.isalpha():
words_count_max = max(words_count_max, words_count.get(words))
words_total_count += words_count.get(words)
output_data = input_data
repeat_phrase_rate = words_count_max / words_total_count if words_total_count > 0 else 0
if repeat_phrase_rate >= self._min_threshold:
# 只要有一个词重复率高于阈值,就会过滤文档
output_data = ""
logger.info(f"The repeat phrase rate of the input data is {repeat_phrase_rate}. "
f"Threshold is {self._min_threshold}. The document {file_name} is filtered.")
return output_data


@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='FileWithHighRepeatWordRateFilter',
module_path="ops.filter.file_with_high_repeat_word_rate_filter.process")


@@ -0,0 +1,25 @@
name: '文档字重复率检查'
name_en: 'Word Repetition Rate Check'
description: '去除重复字过多的文档。'
description_en: 'Filters out files that contain excessive repeated words.'
language: 'python'
vendor: 'huawei'
raw_id: 'FileWithHighRepeatWordRateFilter'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '机器学学学学学学学学学学学学学学学学学学学学学学学学学学学学学学习'
after: ''
inputs: 'text'
outputs: 'text'
settings:
repeatWordRatio:
name: 文档字重复率
description: 某个字的统计数/文档总字数 > 设定值,该文档被去除。
type: slider
defaultVal: 0.5
min: 0
max: 1
step: 0.1


@@ -0,0 +1,51 @@
#!/usr/bin/python
# -- encoding: utf-8 --
"""
Description: 检查文档字重复率插件
Create: 2023/11/7 9:26
"""
import re
import time
from collections import Counter
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Filter
class FileWithHighRepeatWordRateFilter(Filter):
"""检查文档字重复率插件"""
def __init__(self, *args, **kwargs):
super(FileWithHighRepeatWordRateFilter, self).__init__(*args, **kwargs)
self._min_threshold = kwargs.get("repeatWordRatio", 0.5) # 重复字符占整行的比例阈值,默认值为0.5
@staticmethod
def _extract_word(input_data):
# 只统计中文字的重复率
extracted_word = re.sub(r'[^\u4e00-\u9fff]', '', input_data)
return extracted_word
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._file_with_high_repeat_word_rate_filter(sample[self.text_key],
sample[self.filename_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: FileWithHighRepeatWordRateFilter costs {(time.time() - start):6f} s")
return sample
def _file_with_high_repeat_word_rate_filter(self, input_data: str, file_name):
tmp = self._extract_word(input_data)
if not tmp:
return input_data
output_data = input_data
words_count = Counter(tmp)
max_value = max(words_count.values())
repeat_word_rate = max_value / len(tmp)
if repeat_word_rate >= self._min_threshold:
output_data = ""
logger.info(f"The repeat word rate of the input data is {repeat_word_rate}. "
f"Threshold is {self._min_threshold}. The document %s is filtered.")
return output_data
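# Worked example (illustrative, not part of the original file): for the input
# "命命命命中", _extract_word() keeps all five Chinese characters, Counter() reports
# a maximum count of 4 for "命", so repeat_word_rate = 4 / 5 = 0.8 >= 0.5 (the
# default threshold) and the document content is replaced with "".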


@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='FileWithHighSpecialCharRateFilter',
module_path="ops.filter.file_with_high_special_char_rate_filter.process")


@@ -0,0 +1,25 @@
name: '文档特殊字符率检查'
name_en: 'Special Character Rate Check'
description: '去除特殊字符过多的文档。'
description_en: 'Filters out files that contain excessive special characters.'
language: 'python'
vendor: 'huawei'
raw_id: 'FileWithHighSpecialCharRateFilter'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '你好!@!@#!¥!@#'
after: ''
inputs: 'text'
outputs: 'text'
settings:
specialCharRatio:
name: 文档特殊字符率
description: 特殊字符的统计数/文档总字数 > 设定值,该文档被去除。
type: slider
defaultVal: 0.3
min: 0
max: 1
step: 0.1


@@ -0,0 +1,49 @@
#!/usr/bin/python
# -- encoding: utf-8 --
"""
Description: 文档特殊字符率检查
Create: 2023/11/7 9:26
"""
import time
from pathlib import Path
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Filter
class FileWithHighSpecialCharRateFilter(Filter):
"""检查文档特殊字符率"""
def __init__(self, *args, **kwargs):
super(FileWithHighSpecialCharRateFilter, self).__init__(*args, **kwargs)
self._min_threshold = kwargs.get("specialCharRatio", 0.3) # 特殊字符占全文比例阈值,默认值为0.3
self._file_path = Path(__file__).parent / 'resources' / 'special_token.txt'
with open(self._file_path, 'r', encoding='utf-8') as f:
self._special_token = set(f.read().splitlines())
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._file_with_high_special_char_rate_filter(sample[self.text_key],
sample[self.filename_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: FileWithHighSpecialCharRateFilter costs {(time.time() - start):6f} s")
return sample
def _file_with_high_special_char_rate_filter(self, input_data: str, file_name):
if not input_data:
return ""
output_data = input_data
total = 0
for token in self._special_token:
total += input_data.count(token)
special_char_rate = total / len(input_data)
if special_char_rate >= self._min_threshold:
logger.info(f"The special char rate of the input data is {special_char_rate}. "
f"Threshold is {self._min_threshold}. The document {file_name} is filtered.")
output_data = ""
return output_data
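# Worked example (illustrative, not part of the original file): with the bundled
# special_token.txt, the input "你好!!!" contains 3 special characters out of 5,
# so special_char_rate = 0.6 >= 0.3 (the default threshold) and the document
# content is replaced with "".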


@@ -0,0 +1,50 @@
~
·
@
#
%
&
*
+
-
=
{
}
|
`
!
$
^
(
)
_
[
]
\
:
"
;
'
<
>
?
,
/


@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgAdvertisementImagesCleaner',
module_path="ops.filter.img_advertisement_images_cleaner.process")


@@ -0,0 +1,16 @@
name: '广告图片过滤'
name_en: 'Ad Image Filter'
description: '去除包含二维码的图片。'
description_en: 'Removes images containing QR codes.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgAdvertisementImagesCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'


@@ -0,0 +1,127 @@
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Description:
Create: 2024/1/22 20:49
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Filter
from .wechat_qrcode_model import WechatQRCodeModel
class ImgAdvertisementImagesCleaner(Filter):
"""去除广告图片的插件,当前仅支持去除二维码"""
def __init__(self, *args, **kwargs):
super(ImgAdvertisementImagesCleaner, self).__init__(*args, **kwargs)
self.img_resize = 1000 # 大图片的最长边压缩为1000
self.use_model = True
self.model = self.get_model(*args, **kwargs)
@staticmethod
def _detect_qr_code_using_anchor_point(img):
# 有些二维码和边缘紧贴,无法识别出整个矩形,所以我们先对图片大小进行扩展
expand_length = 10
edge = expand_length // 2
h, w = img.shape[:2]
image_extend = np.zeros((img.shape[0] + expand_length, img.shape[1] + expand_length, 3), np.uint8)
image_extend[:] = 255
image_extend[edge:edge + h, edge:edge + w] = img
# 转灰度、二值化、找轮廓
gray = cv2.cvtColor(image_extend, cv2.COLOR_BGR2GRAY)
# 中值滤波
blur_image = cv2.medianBlur(gray, 5)
_, thresh = cv2.threshold(blur_image, 127, 255, cv2.THRESH_BINARY)
contours, hir = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
# 三个“回”字特征轮廓存储
parent_contours_list = []
hir_list = hir[0]
for i, item in enumerate(hir_list):
# 判断A轮廓是否有B轮廓
if item[2] == -1:
continue
else:
hir_b_index = item[2]
# 判断B轮廓是否有C轮廓
if hir_list[hir_b_index][2] == -1:
continue
hir_c_index = hir_list[hir_b_index][2]
# 计算A轮廓的周长和C轮廓周长的比值
hir_c_arc_length = cv2.arcLength(contours[hir_c_index], True)
if hir_c_arc_length:
error = cv2.arcLength(contours[i], True) / hir_c_arc_length
# 二维码每一个“回”的黑白框框的比例大概为1:1:3:1:1
# 理论上,A轮廓周长为28,C轮廓周长为12,A/C = error = 2.3333
if 1.5 <= error <= 3:
parent_contours_list.append(contours[i])
# 若找到3个以上“回”字,该图片含有二维码
return len(parent_contours_list) >= 3
@staticmethod
def _detect_qr_code_using_wechat_model(img, file_name, model):
res = ""
try:
res, points = model.detectAndDecode(img)
except UnicodeDecodeError as ex:
res = ex.object.decode('ISO-8859-1').split(" ")[0]
except Exception as err:
logger.exception(f"fileName: {file_name}, method: ImgAdvertisementImagesCleaner. "
f"An error occurred when using the WeChat model to detect the QR code. "
f"The error is: {err}")
if res:
return True
return False
def init_model(self, *args, **kwargs):
return WechatQRCodeModel(*args, **kwargs).wechat_qr_model
def resize_img(self, image):
"""图片等比压缩"""
height, width = image.shape[:2] # 获取原图像的水平方向尺寸和垂直方向尺寸。
temp = max(height, width)
# 若图片最长边大于限值,对图片进行压缩,否则返回原图
if temp >= self.img_resize:
mul_temp = temp / self.img_resize
if height > width:
res = cv2.resize(image, (int(width / mul_temp), self.img_resize), interpolation=cv2.INTER_AREA)
elif height < width:
res = cv2.resize(image, (self.img_resize, int(height / mul_temp)), interpolation=cv2.INTER_AREA)
else:
res = cv2.resize(image, (self.img_resize, self.img_resize), interpolation=cv2.INTER_AREA)
return res
return image
def execute(self, sample: Dict[str, Any]):
start = time.time()
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
img_bytes = sample[self.data_key]
if img_bytes:
data = bytes_transform.bytes_to_numpy(img_bytes)
image = self._detect_advertisement_img(data, file_name, self.model)
sample[self.data_key] = bytes_transform.numpy_to_bytes(image, file_type)
logger.info(f"fileName: {file_name}, "
f"method: ImgAdvertisementImagesCleaner costs {(time.time() - start):6f} s")
return sample
def _detect_advertisement_img(self, img, file_name, model):
"""检测含有二维码的图片"""
img_resize = self.resize_img(img)
if self._detect_qr_code_using_wechat_model(img_resize, file_name, model) \
or self._detect_qr_code_using_anchor_point(img_resize):
logger.info(f"fileName: {file_name}, method: ImgAdvertisementImagesCleaner. "
"The image contains advertisement. The image is filtered out.")
return np.array([])
return img


@@ -0,0 +1,23 @@
# -- encoding: utf-8 --
import gc
import os
from pathlib import Path
import cv2
class WechatQRCodeModel:
def __init__(self, *args, **kwargs):
models_path = os.getenv("MODELS_PATH", "/home/models")
self.resources_path = str(Path(models_path, 'img_QRcode_detect', 'resources'))
self.wechat_qr_model = cv2.wechat_qrcode_WeChatQRCode(
str(Path(self.resources_path, 'detect.prototxt')),
str(Path(self.resources_path, 'detect.caffemodel')),
str(Path(self.resources_path, 'sr.prototxt')),
str(Path(self.resources_path, 'sr.caffemodel')))
def __del__(self):
del self.wechat_qr_model
gc.collect()


@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgBlurredImagesCleaner',
module_path="ops.filter.img_blurred_images_cleaner.process")


@@ -0,0 +1,25 @@
name: '模糊图片过滤'
name_en: 'Fuzzy Image Filter'
description: '去除模糊的图片。'
description_en: 'Filters out fuzzy images.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgBlurredImagesCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'
settings:
blurredThreshold:
name: 梯度函数值
description: 梯度函数值取值越小,图片模糊度越高。
type: slider
defaultVal: 1000
min: 1
max: 10000
step: 1


@@ -0,0 +1,50 @@
# -- encoding: utf-8 --
"""
Description:
Create: 2025/01/17
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import bytes_transform
from datamate.core.base_op import Filter
class ImgBlurredImagesCleaner(Filter):
"""过滤模糊度低于阈值的图片插件"""
def __init__(self, *args, **kwargs):
super(ImgBlurredImagesCleaner, self).__init__(*args, **kwargs)
# 设置模糊度阈值
self._blurred_threshold = kwargs.get("blurredThreshold", 1000)
def execute(self, sample: Dict[str, Any]):
start = time.time()
img_bytes = sample[self.data_key]
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
if img_bytes:
data = bytes_transform.bytes_to_numpy(img_bytes)
blurred_images = self._blurred_images_filter(data, file_name)
sample[self.data_key] = bytes_transform.numpy_to_bytes(blurred_images, file_type)
logger.info(f"fileName: {file_name}, method: ImagesBlurredCleaner costs {(time.time() - start):6f} s")
return sample
def _blurred_images_filter(self, image, file_name):
# 为方便与其他图片比较可以将图片resize到同一个大小
img_resize = cv2.resize(image, (112, 112))
# 将图片压缩为单通道的灰度图
gray = cv2.cvtColor(img_resize, cv2.COLOR_BGR2GRAY)
score = cv2.Laplacian(gray, cv2.CV_64F).var()
if score <= self._blurred_threshold:
logger.info(f"The image blur is {self._blurred_threshold}, "
f"which exceeds the threshold of {score}. {file_name} is filtered out.")
return np.array([])
return image


@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgDuplicatedImagesCleaner',
module_path="ops.filter.img_duplicated_images_cleaner.process")


@@ -0,0 +1,16 @@
name: '重复图片去除'
name_en: 'Duplicate Image Removal'
description: '去除重复的图片。'
description_en: 'Removes duplicate images.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgDuplicatedImagesCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'


@@ -0,0 +1,109 @@
# -- encoding: utf-8 --
"""
Description:
基于MD5值计算当前图片与数据集中其它图片是否相同。相同则该图片被过滤,保留原数据集图片。
将文件特征数据即MD5值,存到数据库。根据任务uuid获取历史文件特征,遍历特征并进行去重比较
Create: 2025/1/7
"""
import json
import time
from pathlib import Path
from typing import Dict, Any
import cv2
from Crypto.Hash import MD5
from sqlalchemy import text
from loguru import logger
from datamate.sql_manager.sql_manager import SQLManager
from datamate.common.utils import get_now_time
from datamate.common.utils import bytes_to_numpy, numpy_to_bytes
from datamate.core.base_op import Filter
class ImgDuplicatedImagesCleaner(Filter):
"""去除重复图片插件
基于MD5值计算当前图片与数据集中其它图片是否相同。相同该图片过滤,保留原数据集图片。
"""
def __init__(self, *args, **kwargs):
# task_uuid为标识该数据集的唯一标志
super().__init__(*args, **kwargs)
self.task_uuid = kwargs.get("uuid", "")
self.img_resize = 200 # 图片压缩尺寸
# 获取数据库sql
self.sql_dict = self.load_sql_dict()
# 获取数据库连接池
self.conn = None # 数据库连接
self.trans = None # 数据库事务
@staticmethod
def load_sql_dict():
"""获取sql语句"""
sql_config_path = str(Path(__file__).parent / 'sql' / 'sql_config.json')
with open(sql_config_path, 'r', encoding='utf-8') as f:
return json.load(f)
def compute_md5(self, img_bytes: bytes) -> str:
"""将图片统一转化为png无损格式,计算每张图像的md5值"""
if not img_bytes:
return ""
img = bytes_to_numpy(img_bytes)
height, width = img.shape[:2] # 获取原图像的水平方向尺寸和垂直方向尺寸。
res = cv2.resize(img, (int(width / height * self.img_resize), self.img_resize), interpolation=cv2.INTER_AREA)
img_bytes = numpy_to_bytes(res, ".png")
hash_md5 = MD5.new()
hash_md5.update(img_bytes)
return hash_md5.hexdigest()
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
"""重复图片去重算子执行入口"""
start = time.time()
file_name = sample[self.filename_key]
self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
img_data = self._duplicate_images_filter(file_name, sample[self.data_key])
sample[self.data_key] = img_data
logger.info(
f"fileName: {file_name}, method: DuplicateImagesCleaner costs {(time.time() - start):6f} s")
return sample
def execute_sql(self, md5: str, file_name: str,
img_bytes: bytes) -> bytes:
"""从数据库中获取文件特征、比较MD5,插入新的文件特征"""
timestamp = get_now_time('Asia/Shanghai', '%Y-%m-%d %H:%M:%S', file_name,
"DuplicateImagesCleaner")
query_sql = str(self.sql_dict.get("query_sql"))
insert_sql = str(self.sql_dict.get("insert_sql"))
create_tables_sql = str(self.sql_dict.get("create_tables_sql"))
query_sql_params = {"task_uuid": self.task_uuid, "file_feature": md5}
insert_sql_params = {"task_uuid": self.task_uuid, "file_feature": md5, "file_name": file_name.encode("utf-8"),
"timestamp": timestamp}
db_manager = SQLManager()
try:
self.conn = db_manager.create_connect()
except Exception as e:
logger.error(f"fileName: {file_name}, database connection failed: {str(e)}")
raise RuntimeError(82000, str(e)) from None
with self.conn as connection:
connection.execute(text(create_tables_sql))
# 判断是否有重复文件
result = connection.execute(text(query_sql), query_sql_params).fetchall()
# 查询记录为空,无重复图片, 插入新文件特征
if not result:
connection.execute(text(insert_sql), insert_sql_params)
return img_bytes
logger.info(f"taskId: {self.task_uuid} fileName: {file_name}, method: Duplicate ImagesCleaner. "
f"The image is duplicated and filtered ")
return b""
def _duplicate_images_filter(self, file_name: str, img_bytes: bytes) -> bytes:
"""重复图片去重算子执行逻辑"""
# 如果文件为空,则无需去重,返回原图
if not img_bytes:
return img_bytes
md5 = self.compute_md5(img_bytes)
return self.execute_sql(md5, file_name, img_bytes)


@@ -0,0 +1,5 @@
{
"query_sql": "SELECT * FROM operator_duplicate_img_features WHERE task_uuid = :task_uuid AND file_feature = :file_feature",
"insert_sql": "INSERT INTO operator_duplicate_img_features (task_uuid, file_feature, file_name, timestamp) VALUES (:task_uuid, :file_feature, :file_name, :timestamp)",
"create_tables_sql": "CREATE TABLE IF NOT EXISTS operator_duplicate_img_features (id INT AUTO_INCREMENT PRIMARY KEY,task_uuid VARCHAR(255),file_feature TEXT,file_name TEXT,timestamp DATETIME);"
}


@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgSimilarImagesCleaner',
module_path="ops.filter.img_similar_images_cleaner.process")


@@ -0,0 +1,25 @@
name: '相似图片去除'
name_en: 'Similar Image Removal'
description: '去除相似的图片。'
description_en: 'Removes similar images.'
language: 'python'
vendor: 'huawei'
raw_id: 'ImgSimilarImagesCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'
settings:
similarThreshold:
name: 相似度
description: 相似度取值越大,图片相似度越高。
type: slider
defaultVal: 0.8
min: 0
max: 1
step: 0.01


@@ -0,0 +1,238 @@
# -- encoding: utf-8 --
"""
Description:
1.本算子结合感知哈希算法和ORB两个算法判断图片的相似性
2.感知哈希算法则是从图像的整体结构和特征维度来计算图片的相似度。
3.ORB算法可以用来对图像中的关键点快速创建特征向量,这些特征向量可以用来识别图像中的对象。通过比较两张图片的特征向量计算相似度。
4.感知哈希算法和ORB算法计算相似度高于0.75,则选择二者较大值;若低于0.75,则选择二者最小值作为相似度
5.将文件特征数据存到数据库。根据任务uuid获取历史文件特征,遍历特征并进行去重比较
Create: 2025/1/7
"""
import json
import time
import zlib
from pathlib import Path
from typing import List, Dict, Any
import cv2
import numpy as np
from sqlalchemy import text
from loguru import logger
from datamate.sql_manager.sql_manager import SQLManager
from datamate.common.utils import get_now_time
from datamate.common.utils import bytes_to_numpy
from datamate.core.base_op import Filter
MAX_RETRIES = 5
BASE_DELAY = 1
MAX_DELAY = 30 # 最大延时设置为30秒
JITTER_FACTOR = 0.25 # 抖动因子为等待时间的25%
MAX_FEATURES_NUM = 200
def get_orb_des(image: np.ndarray) -> np.ndarray:
"""检测图像中的特征点kp和计算这些特征点的描述符矩阵des_matrix"""
if not image.size:
return np.array([])
orb = cv2.ORB_create() # 初始化ORB检测器
orb.setMaxFeatures(MAX_FEATURES_NUM) # 设置最大特征点数量为200
kp, des_matrix = orb.detectAndCompute(image, None)
if des_matrix is None:
# 若没有提取出图像特征,描述符矩阵置为空
des_matrix = np.array([])
return des_matrix
class ImgSimilarImagesCleaner(Filter):
"""去除相似图片的插件"""
DEFAULT_SIMILAR_THRESHOLD = 0.8 # 默认相似度阈值
DEFAULT_TASK_UUID = "uuid" # 默认任务UUID
DEFAULT_ORB_RATIO = 0.8 # 默认特征点距离比率
DEFAULT_MIX_SIMILARITY = 0.75 # 默认相似度算法阈值
DEFAULT_IMG_RESIZE = 200 # 默认图片压缩尺寸
DEFAULT_PAGE_SIZE = 500 # 默认每页数据量
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.similar_threshold = kwargs.get("similarThreshold", self.DEFAULT_SIMILAR_THRESHOLD) # 默认相似度阈值为0.8
# task_uuid为标识该数据集的唯一标志
self.task_uuid = kwargs.get("uuid", self.DEFAULT_TASK_UUID)
self.orb_ratio = self.DEFAULT_ORB_RATIO # 特征点距离的比率,该数值为经验值
self.mix_similarity = self.DEFAULT_MIX_SIMILARITY # 选择相似度算法的阈值,该数值为经验值
self.img_resize = self.DEFAULT_IMG_RESIZE # 图片压缩尺寸
self.conn = None # 数据库连接
self.trans = None # 数据库事务
self.page_size = self.DEFAULT_PAGE_SIZE # 每页数据量
# 获取数据库sql
self.sql_dict = self.load_sql_dict()
@staticmethod
def load_sql_dict():
"""获取sql语句"""
sql_config_path = str(Path(__file__).parent / 'sql' / 'sql_config.json')
with open(sql_config_path, 'r', encoding='utf-8') as f:
return json.load(f)
@staticmethod
def get_p_hash(image: np.ndarray) -> str:
"""计算pHash值"""
hashed_value = ""
if not image.size:
return hashed_value
gray_image = cv2.cvtColor(cv2.resize(image, (8, 8), interpolation=cv2.INTER_AREA), cv2.COLOR_BGR2GRAY)
dct_image = cv2.dct(np.float32(gray_image))
hashed_value = ''.join(['1' if x >= 0 else '0' for x in dct_image[:8, :8].flatten()])
return hashed_value
@staticmethod
def get_phash_similarity(hash_comparison: str, hash_compared: str) -> float:
"""通过计算汉明距离,获取图片相似度"""
# 若哈希值为空,则相似度为0
if not hash_comparison or not hash_compared:
return 0.0
# 计算汉明距离
distance = sum(
bit_comparison != bit_compared for bit_comparison, bit_compared in zip(hash_comparison, hash_compared))
similarity = 1 - distance / len(hash_comparison)
return similarity
def filter_similar_images(self, img: np.ndarray, file_name: str) -> np.ndarray:
"""判断数据集中是否存在相似图片"""
# 如果文件为空,则无需去重,返回原图
if not img.size:
return img
p_hash = self.get_p_hash(img)
height, width = img.shape[:2] # 获取原图像的水平方向尺寸和垂直方向尺寸。
img_resize = cv2.resize(img, (int(width / height * self.img_resize), self.img_resize),
interpolation=cv2.INTER_AREA)
des_matrix = get_orb_des(img_resize)
return self.execute_sql(p_hash, des_matrix, file_name, img)
def get_orb_similarity(self, des_matrix: np.ndarray, des_matrix_history: np.ndarray, file_name: str,
file_name_history: str) -> float:
"""获取图片orb相似度"""
# 若描述符矩阵为空,则相似度为0
if not des_matrix.size or not des_matrix_history.size:
return 0.0
# 根据矩阵对角线上元素和的大小,选择描述符矩阵作为训练或查询矩阵
train_matrix, query_matrix = des_matrix, des_matrix_history
if train_matrix.shape[0] > des_matrix_history.shape[0]:
train_matrix, query_matrix = des_matrix_history, des_matrix
elif des_matrix.shape[0] == des_matrix_history.shape[0]:
if np.trace(des_matrix) > np.trace(des_matrix_history):
train_matrix, query_matrix = des_matrix_history, des_matrix
try:
# knn筛选结果
matches = (cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=False).
knnMatch(query_matrix, trainDescriptors=train_matrix, k=2))
if not matches:
return 0.0
# 遍历每一对特征点,筛选距离更近的特征点
count = 0
for (m, n) in matches:
if m.distance < self.orb_ratio * n.distance:
count += 1
orb_similarity = count / len(matches)
return orb_similarity
except Exception as e:
logger.exception(f"taskId: {self.task_uuid}, failed to compare the similarity between "
f"{file_name} and {file_name_history}: {e}")
return 0.0
def execute_sql(self, p_hash: str, des_matrix: np.ndarray, file_name: str,
img: np.ndarray) -> np.ndarray:
des_matrix_binary = zlib.compress(des_matrix.tobytes()) # 使用 zlib 进行压缩数组
timestamp = get_now_time('Asia/Shanghai', '%Y-%m-%d %H:%M:%S', file_name,
"ImgSimilarCleaner")
query_task_uuid_sql = str(self.sql_dict.get("query_task_uuid_sql"))
insert_sql = str(self.sql_dict.get("insert_sql"))
create_tables_sql = str(self.sql_dict.get("create_tables_sql"))
db_manager = SQLManager()
try:
self.conn = db_manager.create_connect()
except Exception as e:
logger.error(f"fileName: {file_name}, database connection failed: {str(e)}")
raise RuntimeError(82000, str(e)) from None
with self.conn as connection:
"""从数据库中获取文件特征、比较相似度,插入新的文件特征"""
connection.execute(text(create_tables_sql))
result = connection.execute(text(query_task_uuid_sql), {"task_uuid": self.task_uuid}).fetchall()
total_count = len(result)
if self.has_similar_images(connection, des_matrix, file_name, p_hash, total_count):
return np.array([])
insert_data = {
"task_uuid": self.task_uuid,
"p_hash": p_hash,
"des_matrix": des_matrix_binary,
"matrix_shape": str(des_matrix.shape),
"file_name": file_name.encode("utf-8").hex(),
"timestamp": timestamp
}
connection.execute(text(insert_sql), insert_data)
return img
def has_similar_images(self, connection, des_matrix, file_name, p_hash, total_count):
for i in range(0, total_count, self.page_size):
query_sql = self.sql_dict.get("query_sql")
rows = connection.execute(text(query_sql), {"task_uuid": self.task_uuid, "ge": self.page_size, "le": i}).fetchall()
# 对应任务uuid,最后一页没有数据,跳出循环
if not rows:
break
# 对两张图片进行相似度比较
if self.determine_similar_images(rows, p_hash, des_matrix, file_name):
return True
return False
def determine_similar_images(self, file_features: List, p_hash: str, des_matrix: np.ndarray,
file_name: str) -> bool:
"""根据文件特征,判断两张图片相似度是否超过指定阈值"""
for signature in file_features:
pash_feature, orb_feature, matrix_shape, file_name_history = signature[2], signature[3], signature[4], \
signature[5]
if not pash_feature:
# 若图片为空,p_hash、des_matrix为空,跳过比对
continue
# 解压缩数据
decompressed_data = zlib.decompress(orb_feature)
# 将字节流转换回矩阵
des_matrix_history = np.frombuffer(decompressed_data, dtype=np.uint8).reshape(eval(matrix_shape))
# 移除转义字符 '\' 并将十六进制字符串转换为字节序列
bytes_data = bytes.fromhex(file_name_history)
# 解码字节序列为 UTF-8 编码的字符串
file_name_decoded = bytes_data.decode('utf-8')
phash_similarity = self.get_phash_similarity(p_hash, pash_feature)
orb_similarity = self.get_orb_similarity(des_matrix, des_matrix_history, file_name, file_name_decoded)
max_similarity = max(phash_similarity, orb_similarity)
min_similarity = min(phash_similarity, orb_similarity)
if max_similarity >= self.mix_similarity:
result = max_similarity
else:
result = min_similarity
similarity = round(result, 2)
if similarity >= self.similar_threshold:
logger.info(
f"fileName: {file_name}, method: ImgSimilarCleaner, dataset: {self.task_uuid}. "
f"This picture is similar to {file_name_decoded}, and the similarity is {similarity:.4f}. "
f"The picture is filtered.")
return True
return False
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
"""去除相似图片算子执行入口"""
start = time.time()
file_name = sample[self.filename_key]
img_bytes = sample[self.data_key]
data = bytes_to_numpy(img_bytes) if img_bytes else np.array([])
similar_images = self.filter_similar_images(data, file_name)
# 若相似图片,sample[self.data_key]设为空
if not similar_images.size:
sample[self.data_key] = b""
logger.info(f"fileName: {file_name}, method: ImgSimilarCleaner costs {(time.time() - start):6f} s")
return sample


@@ -0,0 +1,6 @@
{
"query_sql": "SELECT * FROM operator_similar_img_features WHERE task_uuid = :task_uuid ORDER BY timestamp LIMIT :ge OFFSET :le",
"insert_sql": "INSERT INTO operator_similar_img_features (task_uuid,p_hash,des_matrix,matrix_shape,file_name,timestamp) VALUES (:task_uuid,:p_hash,:des_matrix,:matrix_shape,:file_name,:timestamp)",
"query_task_uuid_sql": "SELECT * FROM operator_similar_img_features WHERE task_uuid = :task_uuid",
"create_tables_sql": "CREATE TABLE IF NOT EXISTS operator_similar_img_features (id INT AUTO_INCREMENT PRIMARY KEY,task_uuid VARCHAR(255),p_hash TEXT,des_matrix BLOB,matrix_shape TEXT,file_name TEXT,timestamp DATETIME);"
}


@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='DuplicateFilesFilter',
module_path="ops.filter.remove_duplicate_file.process")


@@ -0,0 +1,25 @@
name: '相似文档去除'
name_en: 'Similar Document Removal'
description: '相似文档去除。'
description_en: 'Removes similar documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'DuplicateFilesFilter'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '这篇文档跟数据集中的另一篇文档内容几乎一样,执行该算子后,这篇文档会被去除。'
after: ''
inputs: 'text'
outputs: 'text'
settings:
fileDuplicateThreshold:
name: 文档相似度
description: 基于MinHash算法和Jaccard相似度,计算当前文档与数据集中其它文档相似性,超过设定值,该文档被去除。
type: slider
defaultVal: 0.5
min: 0
max: 1
step: 0.1


@@ -0,0 +1,158 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 文档局部内容去重
Create: 2025/01/07
"""
import json
import re
import time
from pathlib import Path
from typing import List, Dict, Any
import numpy as np
from datasketch import MinHash
from sqlalchemy import text
from loguru import logger
from datamate.sql_manager.sql_manager import SQLManager
from datamate.common.utils import get_now_time
from datamate.core.base_op import Filter
class DuplicateFilesFilter(Filter):
"""相似文档去除插件
基于MinHash计算当前文档与数据集中其它文档相似性,相似性高于设定阈值则返回空。
"""
def __init__(self, *args, **kwargs):
# 标点符号
super().__init__(*args, **kwargs)
self.punctuation_pattern = "。.??!!,,;;::()()【】{}[]“”""‘’''/\n"
# 默认相似度阈值为0.5
self.duplicate_th = kwargs.get("fileDuplicateThreshold", 0.5)
# task_uuid为标识该数据集的唯一标志
self.task_uuid = kwargs.get("uuid", "")
# 数据库连接
self.conn = None
# 数据库事务
self.trans = None
# 每页数据量
self.page_size = 500
# 获取数据库sql
self.sql_dict = self.load_sql_dict()
@staticmethod
def load_sql_dict():
"""获取sql语句"""
sql_config_path = str(Path(__file__).parent / 'sql' / 'sql_config.json')
with open(sql_config_path, 'r', encoding='utf-8') as f:
return json.load(f)
def get_minhash(self, input_text: str) -> MinHash:
"""获取输入文档的minhash
Args:
input_text: 输入文档内容
Returns:
text_minhash: 输入文档对应的minhash值
"""
text_minhash = MinHash()
for word in re.split(f"[{re.escape(self.punctuation_pattern)}]", input_text.strip()):
text_minhash.update(word.strip().encode('utf8'))
return text_minhash
def deduplicate_files(self, sample: Dict[str, Any], file_name: str) -> str:
"""去除相似文件
Args:
content: 待处理的Content对象
file_name: 文件名称
Returns:
input_text: 去重后的文件内容,大于相似度值返回空,否则返回原始文本内容。
"""
input_text = sample[self.text_key]
if not input_text:
return input_text
text_minhash = self.get_minhash(input_text)
return self.execute_sql(text_minhash, file_name, input_text)
def execute_sql(self, text_minhash: MinHash, file_name: str,
input_text: str) -> str:
"""从数据库中获取文件特征、比较相似度,插入新的文件特征"""
timestamp = get_now_time('Asia/Shanghai', '%Y-%m-%d %H:%M:%S', file_name,
"DuplicateFilesFilter")
minhash_values = text_minhash.hashvalues
# 将 NumPy 数组转换为字符串
minhash_values_string = np.array2string(minhash_values)
query_task_uuid_sql = self.sql_dict.get("query_task_uuid_sql")
insert_sql = self.sql_dict.get("insert_sql")
create_tables_sql = self.sql_dict.get("create_tables_sql")
db_manager = SQLManager()
try:
self.conn = db_manager.create_connect()
except Exception as e:
logger.error(f"fileName: {file_name}, database connection failed: {str(e)}")
raise RuntimeError(82000, str(e)) from None
with self.conn as connection:
connection.execute(text(create_tables_sql))
result = connection.execute(text(query_task_uuid_sql), {"task_uuid": self.task_uuid}).fetchall()
total_count = len(result)
if self.has_similar_text(connection, file_name, text_minhash, total_count):
return ""
insert_data = {
"task_uuid": self.task_uuid,
"file_feature": minhash_values_string,
"file_name": file_name.encode("utf-8").hex(),
"timestamp": timestamp
}
connection.execute(text(insert_sql), insert_data)
return input_text
def has_similar_text(self, connection, file_name, text_minhash, total_count) -> bool:
query_sql = self.sql_dict.get("query_sql")
for i in range(0, total_count, self.page_size):
rows = connection.execute(
text(query_sql), {"task_uuid": self.task_uuid, "ge": self.page_size, "le": i}).fetchall()
# 对应任务uuid,最后一页没有数据,跳出循环
if not rows:
break
# 对两个文本进行相似度比较
if self.determine_similar_text(rows, text_minhash, file_name):
return True
return False
def determine_similar_text(self, file_features: List, text_minhash: MinHash, file_name: str) -> bool:
for signature in file_features:
# 历史文件特征和历史文件名称
file_feature, file_name_history = signature[2], signature[3]
if not file_feature:
continue
minhash_obj = MinHash(num_perm=128)
minhash_obj.hashvalues = np.fromstring(file_feature.strip('[]'), dtype=np.uint64, sep=' ')
similarity = text_minhash.jaccard(minhash_obj)
# 移除转义字符 '\' 并将十六进制字符串转换为字节序列
bytes_data = bytes.fromhex(file_name_history)
# 解码字节序列为 UTF-8 编码的字符串
file_name_decoded = bytes_data.decode('utf-8')
if similarity >= self.duplicate_th:
logger.info(f"taskId: {self.task_uuid}, fileName: {file_name} is similar to {file_name_decoded}, "
f"and the similarity is {similarity:4f}")
return True
return False
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
file_name = sample[self.filename_key]
self.task_uuid = sample.get("instance_id") if not self.task_uuid else self.task_uuid
sample[self.text_key] = self.deduplicate_files(sample, file_name)
logger.info(f"taskId: {self.task_uuid} fileName: {file_name}, "
f"method: DuplicateFilesFilter costs {(time.time() - start):6f} s")
return sample


@@ -0,0 +1,6 @@
{
"query_sql": "SELECT * FROM operators_similar_text_features WHERE task_uuid = :task_uuid ORDER BY timestamp LIMIT :ge OFFSET :le",
"create_tables_sql": "CREATE TABLE IF NOT EXISTS operators_similar_text_features (id INT AUTO_INCREMENT PRIMARY KEY, task_uuid VARCHAR(255),file_feature TEXT,file_name TEXT,timestamp DATETIME);",
"insert_sql": "INSERT INTO operators_similar_text_features (task_uuid, file_feature, file_name, timestamp) VALUES (:task_uuid, :file_feature, :file_name, :timestamp)",
"query_task_uuid_sql": "SELECT * FROM operators_similar_text_features WHERE task_uuid = :task_uuid"
}


@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='FileWithManySensitiveWordsFilter',
module_path="ops.filter.remove_file_with_many_sensitive_words.process")


@@ -0,0 +1,25 @@
name: '文档敏感词率检查'
name_en: 'Sensitive Word Rate Check'
description: '去除敏感词过多的文档。'
description_en: 'Filters out files that contain excessive sensitive phrases.'
language: 'python'
vendor: 'huawei'
raw_id: 'FileWithManySensitiveWordsFilter'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '出售硝酸甘油出售硝酸甘油出售硝酸甘油出售硝酸甘油'
after: ''
inputs: 'text'
outputs: 'text'
settings:
sensitiveWordsRate:
name: 文档敏感词率
description: 敏感词的字数/文档总字数 > 设定值,该文档被去除。
type: slider
defaultVal: 0.01
min: 0
max: 1
step: 0.01


@@ -0,0 +1,116 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 过滤语言概率太低的文档(支持自定义阈值)
Create: 2023/12/7 15:43
"""
import sys
import time
from pathlib import Path
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Filter
from datamate.common.utils.aho_corasick import build_trie, add_fail_pointer
sys.setrecursionlimit(5000)
class AhoCorasic:
"""AC自动机算法进行目标字符串搜索"""
def __init__(self, words):
self._root = add_fail_pointer(build_trie(words))
def search_and_count(self, text: str, special_symbols: set):
"""
匹配敏感词,统计敏感词字数。
Args:
text: 文本
special_symbols: 特殊字符(需跳过)
Returns:
统计敏感词字数
"""
target_count = 0
node = self._root
valid_len = 0 # 当前遍历的有效长度
for _, s in enumerate(text):
if s in special_symbols: # 跳过特殊字符
continue
matched = True
while s not in node.child: # 当node.child没有字符s
if node == self._root: # 当node为root(无node.fail),有效长度归0且跳出
valid_len = 0
matched = False
break
elif node.fail == self._root: # node.fail为root场景,有效长度归0,但可继续
valid_len = 0
node = node.fail # 移动到失败指针节点
if not matched:
continue
node = node.child.get(s)
valid_len += 1
if node.word: # node是单词尾字母
target_count += valid_len
valid_len = 0
return target_count
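# Illustrative trace (assuming build_trie/add_fail_pointer construct a standard
# character-level Aho-Corasick automaton): with words = {"雷管", "炸药"},
# AhoCorasic(words).search_and_count("出售雷管和炸药", special_symbols=set())
# matches both words and returns 4, i.e. two characters per matched word.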
class FileWithManySensitiveWordsFilter(Filter):
"""外部输入的暴力、色情文本过滤插件"""
def __init__(self, *args, **kwargs):
super(FileWithManySensitiveWordsFilter, self).__init__(*args, **kwargs)
root_path = Path(__file__).parent / 'resources'
violent_file_path = str(root_path / 'violent.txt')
sexual_file_path = str(root_path / 'sexual.txt')
political_file_path = str(root_path / 'political.txt')
special_symbols_path = str(root_path / 'special_symbols.txt')
self._file_sensitive_words_rate = kwargs.get("sensitiveWordsRate", 0.01) # 参数默认值为0.01
self.violent_words = self.load_words_list(violent_file_path)
self.sexual_words = self.load_words_list(sexual_file_path)
self.political_words = self.load_words_list(political_file_path)
self.special_symbols = self.load_words_list(special_symbols_path)
self.symbols = self.special_symbols | {"\n", "\t", "\r"} # 符号,不纳入文本字数统计
self.words = self.violent_words | self.sexual_words | self.political_words
self.ac_automaton = AhoCorasic(self.words)
@staticmethod
def load_words_list(path):
"""词表加载"""
with open(path, 'r', encoding='utf-8') as f:
words = set(f.read().splitlines())
return words
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._file_with_many_sensitive_words_filter(sample[self.text_key],
sample[self.filename_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: FileWithManySensitiveWordsFilter costs {(time.time() - start):6f} s")
return sample
def _file_with_many_sensitive_words_filter(self, input_data: str, file_name):
"""过滤敏感词过多的文档"""
total_count = 0
for s in input_data:
if s not in self.symbols:
total_count += 1
if total_count == 0:
return input_data
# 敏感词率 = 敏感词字数 / 总字数,符号不纳入统计
sensitive_rate = self.ac_automaton.search_and_count(input_data, self.special_symbols) / total_count
if sensitive_rate >= self._file_sensitive_words_rate:
logger.info(f"This document contains too many sensitive words. "
f"The proportion of sensitive words is {sensitive_rate}. "
f"Threshold is {self._file_sensitive_words_rate}. The document {file_name} is filtered.")
return ""
else:
return input_data


@@ -0,0 +1,321 @@
习近平
平近习
xjp
习太子
习明泽
老习
温家宝
温加宝
温x
温jia宝
温宝宝
温加饱
温加保
张培莉
温云松
温如春
温jb
胡温
胡x
胡jt
胡boss
胡总
胡王八
hujintao
胡jintao
胡j涛
胡惊涛
胡景涛
胡紧掏
湖紧掏
胡紧套
锦涛
hjt
胡派
胡主席
刘永清
胡海峰
胡海清
江泽民
民泽江
江胡
江主席
江书记
江浙闽
江沢民
江浙民
茳泽民
zemin
ze民
老江
老j
江core
江x
江派
江zm
jzm
江戏子
江蛤蟆
江某某
江贼
江猪
江氏集团
江绵恒
江绵康
王冶坪
江泽慧
邓小平
平小邓
xiao平
邓xp
邓晓平
邓朴方
邓榕
邓质方
毛泽东
猫泽东
猫则东
猫贼洞
毛zd
毛zx
z东
ze东
泽d
zedong
毛太祖
毛相
主席画像
改革历程
朱镕基
朱容基
朱镕鸡
朱容鸡
朱云来
李鹏
李peng
里鹏
李月月鸟
李小鹏
李小琳
华主席
华国
国锋
国峰
锋同志
白春礼
薄熙来
薄一波
蔡赴朝
蔡武
曹刚川
常万全
陈炳德
陈德铭
陈建国
陈良宇
陈绍基
陈同海
陈至立
戴秉国
丁一平
董建华
杜德印
杜世成
傅锐
郭伯雄
郭金龙
贺国强
胡春华
耀邦
华建敏
黄华华
黄丽满
黄兴国
回良玉
贾庆林
贾廷安
靖志远
李长春
李春城
李建国
李克强
李岚清
李沛瑶
李荣融
李瑞环
李铁映
李先念
李学举
李源潮
栗智
梁光烈
廖锡龙
林树森
林炎志
林左鸣
令计划
柳斌杰
刘奇葆
刘少奇
刘延东
刘云山
刘志军
龙新民
路甬祥
罗箭
吕祖善
马飚
马恺
孟建柱
欧广源
强卫
沈跃跃
宋平顺
粟戎生
苏树林
孙家正
铁凝
屠光绍
王东明
汪东兴
王鸿举
王沪宁
王乐泉
王洛林
王岐山
王胜俊
王太华
王学军
王兆国
王振华
吴邦国
吴定富
吴官正
无官正
吴胜利
吴仪
奚国华
习仲勋
徐才厚
许其亮
徐绍史
杨洁篪
叶剑英
由喜贵
于幼军
俞正声
袁纯清
曾培炎
曾庆红
曾宪梓
曾荫权
张德江
张定发
张高丽
张立昌
张荣坤
张志国
赵洪祝
紫阳
周生贤
周永康
朱海仑
中南海
大陆当局
中国当局
北京当局
共产党
党产共
共贪党
阿共
产党共
公产党
工产党
共c党
共x党
共铲
供产
共惨
供铲党
供铲谠
供铲裆
共残党
共残主义
共产主义的幽灵
拱铲
老共
中共
中珙
中gong
gc党
贡挡
gong党
g产
狗产蛋
共残裆
恶党
邪党
共产专制
共产王朝
裆中央
土共
土g
共狗
g匪
共匪
仇共
症腐
政腐
政付
正府
政俯
政f
zhengfu
政zhi
挡中央
档中央
中国zf
中央zf
国wu院
中华帝国
gong和
大陆官方
北京政权
江泽民
胡锦涛
温家宝
习近平
习仲勋
贺国强
贺子珍
周永康
李长春
李德生
王岐山
姚依林
回良玉
李源潮
李干成
戴秉国
黄镇
刘延东
刘瑞龙
俞正声
黄敬
薄熙
薄一波
周小川
周建南
温云松
徐明
江泽慧
江绵恒
江绵康
李小鹏
李鹏
李小琳
朱云来
朱容基
法轮功
李洪志
新疆骚乱


@@ -0,0 +1,288 @@
爱液
按摩棒
拔出来
爆草
包二奶
暴干
暴奸
暴乳
爆乳
暴淫
被操
被插
被干
逼奸
仓井空
插暴
操逼
操黑
操烂
肏你
肏死
操死
操我
厕奴
插比
插b
插逼
插进
插你
插我
插阴
潮吹
潮喷
成人电影
成人论坛
成人色情
成人网站
成人文学
成人小说
艳情小说
成人游戏
吃精
抽插
春药
大波
大力抽送
大乳
荡妇
荡女
盗撮
发浪
放尿
肥逼
粉穴
干死你
干穴
肛交
肛门
龟头
裹本
国产av
豪乳
黑逼
后穴
虎骑
换妻俱乐部
黄片
几吧
鸡吧
鸡巴
鸡奸
妓女
奸情
叫床
脚交
精液
就去日
巨屌
菊花洞
菊门
巨奶
巨乳
菊穴
开苞
口爆
口活
口交
口射
口淫
狂操
狂插
浪逼
浪妇
浪叫
浪女
漏乳
露b
乱交
乱伦
轮暴
轮操
轮奸
裸陪
买春
美逼
美少妇
美乳
美腿
美穴
美幼
秘唇
迷奸
密穴
蜜穴
蜜液
摸奶
摸胸
母奸
奈美
奶子
男奴
内射
嫩逼
嫩女
嫩穴
捏弄
女优
炮友
砲友
喷精
屁眼
前凸后翘
强jian
强暴
强奸处女
情趣用品
情色
拳交
全裸
群交
人妻
人兽
日逼
日烂
肉棒
肉逼
肉唇
肉洞
肉缝
肉棍
肉茎
肉具
揉乳
肉穴
肉欲
乳爆
乳房
乳沟
乳交
乳头
骚逼
骚比
骚女
骚水
骚穴
色逼
色情网站
色区
色色
色诱
色欲
色b
射爽
射颜
食精
释欲
兽奸
兽交
手淫
兽欲
熟妇
熟母
熟女
爽片
双臀
死逼
丝袜
丝诱
松岛枫
酥痒
汤加丽
套弄
体奸
体位
舔脚
舔阴
调教
偷欢
推油
脱内裤
文做
舞女
吸精
夏川纯
相奸
小逼
小穴
小xue
性感妖娆
性感诱惑
性虎
性饥渴
性技巧
性交
性奴
性虐
性息
性欲
穴口
穴图
亚情
颜射
阳具
杨思敏
要射了
一夜欢
一夜情
一ye情
阴部
淫虫
阴唇
淫荡
阴道
淫电影
阴阜
淫妇
淫河
阴核
阴户
淫贱
淫叫
淫教师
阴茎
阴精
淫浪
淫媚
淫糜
淫魔
淫母
淫女
淫虐
淫妻
淫情
淫色
淫声浪语
淫兽学园
淫书
淫术炼金士
淫水
淫娃
淫威
淫亵
淫样
淫液
淫照
阴b
应召
幼交
欲火
欲女
玉乳
玉穴
援交
原味内衣
援助交际
招鸡
招妓
抓胸
自慰
作爱
a片
fuck
gay片
g点
h动画
h动漫
失身粉
淫荡自慰器


@@ -0,0 +1,50 @@
!
.
,
#
$
%
&
*
(
)
|
?
/
@
"
'
;
[
]
{
}
+
~
-
_
=
^
<
>
——
……
:


@@ -0,0 +1,452 @@
出售雷管
炸药
出售炸药
出售炸药电话
制作火药配方
炸药出售
出售雷管炸药
火药配方
在家里做原子弹
硝酸甘油炸弹制作
TNT
炸弹的制作
硝铵炸药配方
硝酸甘油制作
炸弹配方
简易炸药
火药制作简易炸弹
炸弹制作
硝酸甘油
硝酸甘油炸弹制作方法
手把手教你做炸弹
恐怖分子傻瓜手册
氢弹手工制作方法
起爆器
自制燃烧弹
燃烧弹
弹药制造方法
液体炸药配方
炸药配方与制作
自制炸药配方
tnt炸药成分
炸药的制作
tnt炸药配方
各种炸药配方大全
炸药的成分
燃烧弹的制作方
黑火药的配方
黑火药的制作
黑火药比例
制作燃烧弹
黑火药配方比例
黑火药的成分
原子弹配方
原子弹DIY教程
C4炸药
雷管出售
制作硝化甘油
DIY核弹
核弹制作方法
原子弹制作手册
简易原子弹教程
黑火药配方
自制炸药方法
教你怎么用土办法做武器
zz炸弹的制作方法
C4炸药的成分
炸药制造生产工艺
原子弹制造完全手册
汽车炸弹制作
人体炸弹制作流程
炸弹制作从入门
鸡尾酒炸弹制作
集束炸弹制作
硝酸铵的简便制取方法
莫洛托夫燃烧弹
塑胶炸弹
用火药混合塑料
高性能爆炸物质混合
原子弹制作方法
手教你制作原子弹
轻型巡航导弹的制作
黑索金的制造过程
Zha药制造进程
购买自爆材料
可塑炸弹制作方法
C4塑胶炸药
水浴法制TNT
简易核弹头的制造
高爆炸药的基本配方
如何制硝化甘油
燃烧剂的简易制备
简易大威力炸弹
炸弹制作入门
硝酸甘油炸药制作
基本配方是硝酸甘
燃烧剂的简易制作
制作性能优良的燃烧弹
酸甘油炸药
简易导弹制作
核弹头的制造
可塑炸弹制作
炸药硝酸甘油制法
原子弹制作方法
火药的配方
手教你做原子弹
怎么用土办法做武器
手把手教你制作原子弹
燃烧弹的制作方法
怎么自制燃烧弹
TNT炸弹制作
高爆炸药基本配方
爆破和杀伤燃烧弹
炸弹制作从入门到精
写的炸弹制作教程
原子弹制作方法
教你制作原子弹
下硝化甘油的制作方法
DIY原子弹
原子弹制作
炸药及其制造方法
ATOM弹制造
三分钟教你原子弹
火药原理
简易炸弹
简易燃烧弹
土炸药成份
你怎么用土办法做武器
燃烧弹制作
燃烧瓶的制作方法
炸弹制作手册
原子弹制作方法DIY
汽油三分之一白糖混合
炸药的制作配方
燃烧弹的制作方
炸药的配方
如何自制燃烧弹
标准炸弹教程
黑火药和硝酸钾农药混合
恐怖分子自己制造炸弹
手柄制作炸弹引爆器
制作炸弹首先要
燃烧瓶制作
出售炸药
出售气枪
出售炸药QQ
出售气枪QQ
出售气枪电话
出售手枪QQ
出售手枪电话
出售猎枪
出售猎枪QQ
出售猎枪电话
出售汽枪
出售步枪
出售狙击枪
出售手狗
出售气狗
出售猎狗
手枪出售
64式手枪出售
92式手枪出售
高仿真枪
出售54式手枪
出售M1911
出售各类猎狗
出售国产军用54手枪
出售火枪游戏
出售进口金属枪模
出售仿真枪
出售麻醉枪免收任何定金
出售手枪
出售外国枪模
出售袖珍手枪
娥眉气枪出售
防身手枪QQ
防身武器手枪
仿真枪QQ
钢珠枪小口径步枪
高压狗QQ
高压气枪
气枪子弹
高压气枪电狗专卖
高压气枪
气枪货到付款
高压气枪专卖店
各类军用枪
各类军用枪支
各式气枪出售
工字牌气枪出售专卖
气枪
工字汽狗麻醉枪
供应军用手枪
供应汽枪
购买枪支QQ
国产汽狗专卖
气枪货
集团出售手枪
集团出售手枪气枪
进口气枪
气枪子弹
军用54出售
军用77出售
军用手枪钢珠枪
里有汽枪出售
猎枪出售
猎枪仿真枪QQ
猎枪麻醉枪钢珠枪
猎枪哪里有卖
猎枪手枪买卖
猎枪销售手枪销售气枪
猎枪专卖店
猎枪子弹底火专卖
麻醉枪出售可验货
麻醉枪猎枪军用
买卖54狗
买卖64狗
买卖警察枪支
买卖军狗
买卖军用枪支
卖64手枪
卖92手枪
卖钢珠枪QQ
哪里能买到自制手枪
哪里有钢珠枪
哪里有连发猎枪买卖
哪里有卖双管猎
哪里有气枪卖
哪里有枪出售
哪里有真枪出售
哪里有真枪卖买
哪能买到真枪
出售手枪气枪猎
气狗
高压气枪
气狗出售
进口气狗
气枪专卖网
气枪出售
气枪专卖网
猎枪
钢珠枪
气枪仿真枪出售
气枪购买
气枪组装
气枪货到付款
汽狗出售
气枪价格
仿真气枪
气枪价格
气枪专卖店
气枪看货加QQ
气枪联系QQ
气枪猎枪麻醉枪
气枪迷魂药
汽狗高压汽枪
铅弹
上海工字气枪
全金属仿真枪专卖
热武器QQ
三箭气枪出售
枪猎
手槍买卖自制手枪
售枪购买手枪
售手枪QQ
司马仿真枪模型QQ
淘宝气枪货到付款
提供气狗专卖店
提供专业及时的气狗
秃鹰气枪货到付款
锡峰气枪出售
仙鹤气枪出售
买卖77式手枪
销售电手狗
销售气手狗
销售小口径步枪
销售运动步枪
小口径秃鹰沙漠
有军用枪支出售
长期出售手枪
折叠狗QQ
转让猎枪
子弹货到付款
自制手枪哪里买
自制手枪哪里有卖
阻击枪
汽枪
高压气枪
组装手枪
最新销售汽狗
出售警用手铐
仿真枪
猎枪
真枪
手枪
阻击枪
弓弩专营
弓弩专卖网货到付款
弓弩专卖网
弓弩专卖货到付款
弓弩专卖店网
弓弩专卖店
弓弩专卖
弓弩制作
弓弩直销
弓弩有限公司
弓弩营销中心
弓弩一百专卖店
弓弩销售
弓弩网
弓弩图纸
弓弩特许经销
弓弩狩猎网
自制手弩
追风弓弩麻醉箭专卖
专业弓弩网
中国战神军用弓弩
中国弩弓专卖
中国弓弩专卖网
中国弓弩直销
中国弓弩网
中国弓弩狩猎网
中国弓驽网
制作简易弓弩
郑州弓弩专卖
赵氏弓弩专卖网
赵氏弓弩专卖店
赵氏弓弩专卖
赵氏弓弩销售
小型弓弩专卖店
小猎人弓弩网
狩猎器材弓弩专卖
狩猎器材弓弩
狩猎弓弩专卖网
狩猎弓弩专卖
狩猎弓弩麻醉箭
手枪式折叠三用弩
三利达弓弩专卖网
三利达弓弩直营
三利达弓弩配件
三步倒药箭批发
三步倒弩箭专卖
三步倒麻醉弩箭销售
三步倒麻醉箭专卖
三步倒麻醉箭
三步倒捕狗药
军用弓弩专卖网
军用弓弩专卖店
军用弓弩批发
军用弓弩公司
供应三利达弓弩麻醉箭
供应三步倒麻醉箭
供应秦氏弓弩
供应弩用麻醉箭
供应弩捕狗箭
供应麻醉箭三步倒
供应麻醉箭批发
供应麻醉箭
供应军用弩折叠弩
供应军用弓弩专卖
供应精品弓弩
供应弓弩麻醉箭
供应弓弩
供应钢珠弓弩
弓弩商城专卖
弓弩商城
弓弩亲兄弟货到付款
弓弩批发
弓弩免定金货到付款
弓弩麻醉箭
弓弩麻醉镖
弓弩论坛
钢珠弓弩专卖网
钢珠弓弩专卖店
打狗弓弩三步倒
麻醉弓弩专卖店
出售军刀
出售军刺
出售弹簧刀
出售三棱刀
出售跳刀
军刀网
南方军刀网
户外军刀网
三棱军刺专卖
出售开山刀军刺
西点军刀网
军刀专卖
戈博军刀
阿兰德龙户外
出售军品军刀
勃朗宁军刀
军刀军品网
阿兰得龙野营刀具网
出售军刺军刀
警用刀具出售
折刀专卖网
阳江军品军刀网
野营刀专卖
砍刀精品折刀专卖
匕首蝴蝶甩刀专卖
军刀专卖军刺
军刀专卖刀具批发
军刀图片砍刀
军刀网军刀专卖
军刀价格军用刀具
军品军刺网
军刀军刺甩棍
阳江刀具批发网
北方先锋军刀
正品军刺出售
野营军刀出售
开山刀砍刀出售
仿品军刺出售
军刀直刀专卖
手工猎刀专卖
自动跳刀专卖
军刀电棍销售
军刀甩棍销售
美国军刀出售
极端武力折刀
防卫棍刀户外刀具
阿兰德龙野营刀
仿品军刺网
野营砍刀户外军刀
手工猎刀户外刀具
中国户外刀具网
西点军品军刀网
野营开山刀军刺
三利达弓弩军刀
尼泊尔军刀出售
防卫野营砍刀出售
防卫著名军刀出售
防卫棍刀出售
防卫甩棍出售
防卫电棍出售
军刺野营砍刀出售
著名精品折刀出售
战术军刀出售
刺刀专卖网
户外军刀出售
阳江刀具直销网
冷钢刀具直销网
防卫刀具直销网
极端武力直销网
刀具直销网
军刀直销网
直刀匕首直销网
军刀匕首直销网
折刀砍刀军品网
野营刀具军品网
阳江刀具军品网
冷钢刀具军品网
防卫刀具军品网
极端武力军品网
军用刀具军品网
军刀直刀军品网
折刀砍刀专卖
野营刀具专卖
阳江刀具专卖
冷钢刀具专卖
防卫刀具专卖
出售美军现役军刀


@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='FileWithShortOrLongLengthFilter',
module_path="ops.filter.remove_file_with_short_or_long_length.process")


@@ -0,0 +1,34 @@
name: '文档字数检查'
name_en: 'Word Count Check'
description: '字数不在指定范围会被过滤掉。'
description_en: 'Filters out documents whose word count is not in the specified range.'
language: 'python'
vendor: 'huawei'
raw_id: 'FileWithShortOrLongLengthFilter'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '过短文本'
after: ''
inputs: 'text'
outputs: 'text'
settings:
fileLength:
name: 文档字数
description: '过滤字数不在指定范围内的文档,如[10,10000000]。若输入为空,则不对字数上/下限做限制。'
type: range
properties:
- name: fileMinimumLength
type: inputNumber
defaultVal: 10
min: 0
max: 10000000000000000
step: 1
- name: fileMaximumLength
type: inputNumber
defaultVal: 10000000
min: 0
max: 10000000000000000
step: 1


@@ -0,0 +1,54 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 词数目不在指定范围会被过滤掉(支持自定义阈值)
Create: 2025/01/16
"""
import re
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Filter
class FileWithShortOrLongLengthFilter(Filter):
"""检查文档字数目,词数目不在指定范围会被过滤掉(支持自定义阈值)"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
file_length_list = kwargs.get("fileLength", [10, 10000000]) # [下限,上限],默认字数下限为10, 默认字数上限为10000000
if len(file_length_list) != 2: # 要求传入字数目上限和字数目下限
logger.error(f"method: FileWithShortOrLongLengthFilter expected 2 arguments, got {len(file_length_list)}")
raise RuntimeError(82001, "method: FileWithShortOrLongLengthFilter expected 2 arguments") from None
# 用户不输入下限参数时前端传入'',则不对字数目下限控制
self._file_minimum_length = 0 if not file_length_list[0] else file_length_list[0]
# 用户不输入上限参数时前端传入'',则不对字数目上限控制
self._file_maximum_length = float("inf") if not file_length_list[1] else file_length_list[1]
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._file_with_short_or_long_length_filter(sample[self.text_key],
sample[self.filename_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: FileWithShortOrLongLengthFilter costs {(time.time() - start):6f} s")
return sample
def _strip_unicode_whitespace(self, text: str):
# 常见 Unicode 空格符(涵盖普通空格、全角空格、零宽空格等)
pattern = r'[\u0020\u00A0\u1680\u2000-\u200F\u202F\u205F\u3000]+'
# 匹配首尾的空格符
pattern = fr'^{pattern}|{pattern}$'
return re.sub(pattern, '', text)
def _file_with_short_or_long_length_filter(self, input_data: str, file_name):
input_data_tmp = self._strip_unicode_whitespace(input_data)
if len(input_data_tmp) < self._file_minimum_length or len(input_data_tmp) > self._file_maximum_length:
logger.info(f"The length of input_data is: {len(input_data_tmp)}, "
f"which is not within the threshold range of {self._file_minimum_length} "
f"and {self._file_maximum_length}. {file_name} is filtered.")
return ""
return input_data


@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
import sys
from pathlib import Path
from datamate.core.base_op import OPERATORS
from datamate.common.utils.custom_importer import CustomImporter
def _configure_importer():
base_path = Path(__file__).resolve().parent
sys.meta_path.append(CustomImporter(base_path))
_configure_importer()
def _import_operators():
from . import text_formatter
from . import word_formatter
from . import img_formatter
from . import file_exporter
from . import slide_formatter
_import_operators()


@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='FileExporter',
module_path="ops.formatter.file_exporter.process")


@@ -0,0 +1,16 @@
name: '落盘算子'
name_en: 'save file operator'
description: '将文件内容保存为文件。'
description_en: 'Save the file data as a file.'
language: 'Python'
vendor: 'Huawei'
raw_id: 'FileExporter'
version: '1.0.0'
types:
- 'collect'
modal: 'others'
effect:
before: ''
after: ''
inputs: 'all'
outputs: 'all'


@@ -0,0 +1,144 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: Json文本抽取
Create: 2024/06/06 15:43
"""
import time
import os
import uuid
from typing import Tuple, Dict, Any
from loguru import logger
from datamate.core.constant import Fields
from datamate.core.base_op import Mapper
from datamate.common.utils import check_valid_path
class FileExporter(Mapper):
"""把输入的json文件流抽取为txt"""
def __init__(self, *args, **kwargs):
super(FileExporter, self).__init__(*args, **kwargs)
self.last_ops = True
self.text_support_ext = kwargs.get("text_support_ext", ['txt', 'html', 'md', 'markdown',
'xml', 'json', 'doc', 'docx', 'pdf'])
self.data_support_ext = kwargs.get("data_support_ext", ['jpg', 'jpeg', 'png', 'bmp'])
self.medical_support_ext = kwargs.get("medical_support_ext", ['svs', 'tif', 'tiff'])
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
file_name = sample[self.filename_key]
file_type = sample[self.filetype_key]
try:
start = time.time()
if file_type in self.text_support_ext:
sample, save_path = self.get_textfile_handler(sample)
elif file_type in self.data_support_ext:
sample, save_path = self.get_datafile_handler(sample)
elif file_type in self.medical_support_ext:
sample, save_path = self.get_medicalfile_handler(sample)
else:
raise TypeError(f"{file_type} is unsupported! please check support_ext in FileExporter Ops")
if sample[self.text_key] == '' and sample[self.data_key] == b'':
sample[self.filesize_key] = "0"
return sample
if save_path:
self.save_file(sample, save_path)
sample[self.text_key] = ''
sample[self.data_key] = b''
sample[Fields.result] = True
file_type = save_path.split('.')[-1]
sample[self.filetype_key] = file_type
base_name, _ = os.path.splitext(file_name)
new_file_name = base_name + '.' + file_type
sample[self.filename_key] = new_file_name
base_name, _ = os.path.splitext(save_path)
sample[self.filepath_key] = base_name
file_size = os.path.getsize(base_name)
sample[self.filesize_key] = f"{file_size}"
logger.info(f"origin file named {file_name} has been save to {save_path}")
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: FileExporter costs {time.time() - start:.6f} s")
except UnicodeDecodeError as err:
logger.error(f"fileName: {sample[self.filename_key]}, "
f"method: FileExporter causes decode error: {err}")
raise
return sample
def get_save_path(self, sample: Dict[str, Any], target_type) -> str:
export_path = os.path.abspath(sample[self.export_path_key])
file_name = sample[self.filename_key]
new_file_name = os.path.splitext(file_name)[0] + '.' + target_type
if not check_valid_path(export_path):
os.makedirs(export_path, exist_ok=True)
res = os.path.join(export_path, new_file_name)
return res
def get_textfile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
target_type = sample.get("target_type", None)
# target_type存在则保存为扫描件, docx格式
if target_type:
sample = self._get_from_data(sample)
save_path = self.get_save_path(sample, target_type)
# 不存在则保存为txt文件,正常文本清洗
else:
sample = self._get_from_text(sample)
save_path = self.get_save_path(sample, 'txt')
return sample, save_path
def get_datafile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
target_type = sample.get("target_type", None)
# target_type存在, 图转文保存为target_type,markdown格式
if target_type:
sample = self._get_from_text(sample)
save_path = self.get_save_path(sample, target_type)
# 不存在则保存为原本图片文件格式,正常图片清洗
else:
sample = self._get_from_data(sample)
save_path = self.get_save_path(sample, sample[self.filetype_key])
return sample, save_path
def get_medicalfile_handler(self, sample: Dict[str, Any]) -> Tuple[Dict, str]:
target_type = 'png'
sample = self._get_from_data(sample)
save_path = self.get_save_path(sample, target_type)
return sample, save_path
def save_file(self, sample, save_path):
file_name, _ = os.path.splitext(save_path)
# 以二进制格式保存文件
file_sample = sample[self.text_key].encode('utf-8') if sample[self.text_key] else sample[self.data_key]
with open(file_name, 'wb') as f:
f.write(file_sample)
# 获取父目录路径
parent_dir = os.path.dirname(file_name)
os.chmod(parent_dir, 0o770)
os.chmod(file_name, 0o640)
def _get_from_data(self, sample: Dict[str, Any]) -> Dict[str, Any]:
sample[self.data_key] = bytes(sample[self.data_key])
sample[self.text_key] = ''
return sample
def _get_from_text(self, sample: Dict[str, Any]) -> Dict[str, Any]:
sample[self.data_key] = b''
sample[self.text_key] = str(sample[self.text_key])
return sample
def _get_uuid(self):
res = str(uuid.uuid4())
return res

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ImgFormatter',
module_path="ops.formatter.img_formatter.process")

View File

@@ -0,0 +1,16 @@
name: '读取图片文件'
name_en: 'Image File Reader'
description: '读取图片文件。'
description_en: 'Reads image files.'
language: 'Python'
vendor: 'Huawei'
raw_id: 'ImgFormatter'
version: '1.0.0'
types:
- 'collect'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,35 @@
# -- encoding: utf-8 --
"""
Description:
Create: 2024/1/30 15:24
"""
import time
from typing import Dict, Any
import cv2
import numpy as np
from loguru import logger
from datamate.common.utils import numpy_to_bytes
from datamate.core.base_op import Mapper
class ImgFormatter(Mapper):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
file_name = sample[self.filename_key]
file_type = "." + sample[self.filetype_key]
file_path = sample[self.filepath_key]
img_data = _img_extract(file_path)
sample[self.data_key] = numpy_to_bytes(img_data, file_type)
logger.info(f"fileName: {file_name}, method: ImgExtract costs {(time.time() - start):6f} s")
return sample
def _img_extract(file_path):
return cv2.imdecode(np.fromfile(file_path, dtype=np.uint8), -1)
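For reference, a small hedged sketch of the same decode path; `样例图片.png` is a placeholder path, and the point of `np.fromfile` + `cv2.imdecode` is that it also handles paths `cv2.imread` may fail on (for example, non-ASCII paths on Windows):
```python
import cv2
import numpy as np

# Illustrative only: decode an image from raw bytes read via numpy.
img = cv2.imdecode(np.fromfile("样例图片.png", dtype=np.uint8), -1)  # -1: keep original channels/depth
print(None if img is None else img.shape)
```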

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='SlideFormatter',
module_path="ops.formatter.slide_formatter.process")

View File

@@ -0,0 +1,16 @@
name: '病理图片内容抽取'
name_en: 'Pathology Image Content Extraction'
description: '解析病理图片。'
description_en: 'Analyze pathological images.'
language: 'python'
vendor: 'huawei'
raw_id: 'SlideFormatter'
version: '1.0.0'
types:
- 'collect'
modal: 'image'
effect:
before: ''
after: ''
inputs: 'image'
outputs: 'image'

View File

@@ -0,0 +1,36 @@
# -- encoding: utf-8 --
"""
Description: 医疗图片解析载入
Create: 2025/02/08 11:00
"""
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class SlideFormatter(Mapper):
def __init__(self, *args, **kwargs):
super(SlideFormatter, self).__init__(*args, **kwargs)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
'''
Read the medical image and its corresponding mask file, each as an Image type in the Content value. Return the Content.
'''
start = time.time()
file_type = sample[self.filetype_key]
types_openslide = ['svs', 'tif', 'dcm', 'vms', 'vmu',
'ndpi', 'scn', 'mrxs', 'tiff', 'svslide',
'bif', 'czi', 'sdpc']
if file_type not in types_openslide:
raise TypeError(f"Format not supported: {file_type}. Supported formats are: {', '.join(types_openslide)}.")
file_name = sample[self.filename_key]
logger.info(f"fileName: {file_name}, method: SlideFormatter costs {(time.time() - start):6f} s")
# Not really loading the slide, instead, use path as lazy loading.
return sample

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='TextFormatter',
module_path="ops.formatter.text_formatter.process")

View File

@@ -0,0 +1,16 @@
name: 'TXT文本抽取'
name_en: 'TXT Text Extraction'
description: '抽取TXT中的文本'
description_en: 'Extracts text from TXT files.'
language: 'python'
vendor: 'huawei'
raw_id: 'TxtFormatter'
version: '1.0.0'
types:
- 'collect'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,44 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: Json文本抽取
Create: 2024/06/06 15:43
"""
import time
from loguru import logger
from typing import Dict, Any
from datamate.core.base_op import Mapper
class TextFormatter(Mapper):
"""把输入的json文件流抽取为txt"""
def __init__(self, *args, **kwargs):
super(TextFormatter, self).__init__(*args, **kwargs)
@staticmethod
def _extract_json(byte_io):
"""将默认使用utf-8编码的Json文件流解码,抽取为txt"""
# 用utf-8-sig的格式进行抽取,可以避免utf-8 BOM编码格式的文件在抽取后产生隐藏字符作为前缀。
return byte_io.decode("utf-8-sig").replace("\r\n", "\n")
def byte_read(self, sample: Dict[str, Any]):
filepath = sample[self.filepath_key]
with open(filepath, "rb") as file:
byte_data = file.read()
sample[self.data_key] = byte_data
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
try:
self.byte_read(sample)
sample[self.text_key] = self._extract_json(sample[self.data_key])
sample[self.data_key] = b"" # 将sample[self.data_key]置空
logger.info(
f"fileName: {sample[self.filename_key]}, method: TextFormatter costs {(time.time() - start):6f} s")
except UnicodeDecodeError as err:
logger.exception(f"fileName: {sample[self.filename_key]}, method: TextFormatter causes decode error: {err}")
raise
return sample
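A quick illustration (not part of the operator) of why `utf-8-sig` is used above: it strips a UTF-8 BOM that a plain `utf-8` decode would keep as an invisible prefix.
```python
raw = b"\xef\xbb\xbf{\"key\": 1}\r\nnext line"              # UTF-8 BOM + JSON-ish text with CRLF
print(repr(raw.decode("utf-8")))                            # '\ufeff{"key": 1}\r\nnext line' - hidden prefix
print(repr(raw.decode("utf-8-sig").replace("\r\n", "\n")))  # '{"key": 1}\nnext line'
```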

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='WordFormatter',
module_path="ops.formatter.word_formatter.process")

View File

@@ -0,0 +1,16 @@
name: 'Word文本抽取'
name_en: 'Word Text Extraction'
description: '抽取Word中的文本'
description_en: 'Extracts text from Word files.'
language: 'java'
vendor: 'huawei'
raw_id: 'WordFormatter'
version: '1.0.0'
types:
- 'collect'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,68 @@
# -- encoding: utf-8 --
"""
Description:
Create: 2024/1/30 15:24
"""
from loguru import logger
import os
import subprocess
import time
from typing import Dict, Any
from datamate.common.utils import check_valid_path
from datamate.core.base_op import Mapper
class WordFormatter(Mapper):
SEPERATOR = ' | '
def __init__(self, *args, **kwargs):
super(WordFormatter, self).__init__(*args, **kwargs)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
file_name = sample[self.filename_key]
file_path = sample[self.filepath_key]
file_type = sample[self.filetype_key]
txt_content = self.word2html(file_path, file_type)
sample[self.text_key] = txt_content
logger.info(f"fileName: {file_name}, method: WordFormatter costs {(time.time() - start):6f} s")
return sample
@staticmethod
def word2html(file_path, file_type):
check_valid_path(file_path)
file_dir = file_path.rsplit('/', 1)[0]
file_name = file_path.rsplit('/', 1)[1]
html_file_path = os.path.join(file_dir, f"{file_name}.txt")
current_file_path = os.path.dirname(os.path.abspath(__file__))
txt_content = ''
try:
process = subprocess.Popen(
['java', '-jar', f'{current_file_path}/../../../java_operator/WordFormatter-1.0.jar', file_path,
html_file_path, file_type], shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
stdout, _ = process.communicate(timeout=24 * 60 * 60)  # stderr 已合并到 stdout
if process.returncode == 0:
logger.info(f"Convert {file_path} to text successfully")
else:
logger.error(f"Convert {file_path} failed, error: {stdout.strip().decode('utf-8')}.")
raise RuntimeError(f"WordFormatter jar exited with code {process.returncode}")
except subprocess.CalledProcessError as e:
logger.error(f"Convert failed: {e}, return code: {e.returncode}")
except FileNotFoundError:
logger.error("java command not found, please make sure it is available in PATH")
except Exception as e:
logger.error(f"An unexpected error occurred, convert failed: {e}")
try:
with open(html_file_path, 'r', encoding='utf-8') as file:
txt_content = file.read()
os.remove(html_file_path)
logger.info("Tmp text file removed")
except FileNotFoundError:
logger.error(f"Tmp file {html_file_path} does not exist")
except PermissionError:
logger.error(f"You are not allowed to delete tmp file {html_file_path}")
logger.info(f"Extract text from {file_path} finished")
return txt_content

View File

@@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
"""
since:
"""
import sys
from pathlib import Path
from datamate.common.utils.custom_importer import CustomImporter
def _configure_importer():
base_path = Path(__file__).resolve().parent
sys.meta_path.append(CustomImporter(base_path))
_configure_importer()
def _import_operators():
from . import qa_condition_evaluator
from . import text_quality_evaluation
_import_operators()

View File

@@ -0,0 +1,10 @@
# -*- coding: utf-8 -*-
"""
since:
"""
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='QAConditionEvaluator',
module_path="ops.llms.qa_condition_evaluator.process")

View File

@@ -0,0 +1,16 @@
name: 'QA评估'
name_en: 'QA Assessment'
description: '通过用户维度和相应描述进行QA对评估。'
description_en: 'Perform QA assessment based on the user dimension and corresponding description.'
language: 'python'
vendor: 'huawei'
raw_id: 'QAConditionEvaluator'
version: '1.0.0'
types:
- 'consolidate'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,98 @@
# -- encoding: utf-8 --
"""
Description: 基于LLM通过用户设置维度和相应描述进行QA对评估
Create: 2023/11/7 9:26
"""
import json
import re
import time
from pathlib import Path
from typing import List, Dict, Any
from loguru import logger
from datamate.core.base_op import LLM
class QAConditionEvaluator(LLM):
def __init__(self, *args, **kwargs):
super(QAConditionEvaluator, self).__init__(*args, **kwargs)
self.pattern = r'结果[::] ?[YN]'
self.template_path = Path(__file__).parent / "resources/template.txt"
self.examples_path = Path(__file__).parent / "resources/examples.json"
self.task_id = kwargs.get("taskId", "default_id")
self.dimensions = kwargs.get("dimension", [
{
"dimension": "回答是否有针对性",
"description": "回答应对问题中的所有疑问点提供正面、直接的回答,"
"不应引起疑惑。同时,答案不应有任何内容的遗漏,需构成一个完整的陈述。"
},
{
"dimension": "问题是否独立",
"description": "仅分析问题,问题的主体和客体都比较明确,即使有省略,也符合语言习惯。"
"在不需要补充其他信息的情况下不会引起疑惑。"
},
{
"dimension": "语法是否错误",
"description": "问题为疑问句,答案为陈述句; 不存在词语搭配不当的情况;连接词和标点符号不存在错用情况;"
"逻辑混乱的情况不存在;语法结构都正确且完整;"
}
])
self.llm = self.get_llm(*args, **kwargs)
self.prompts = self.build_llm_prompt(*args, **kwargs)
@staticmethod
def _process_examples(dimension_example: List) -> str:
if not dimension_example:
return "\n"
res = "\n以下是一些案例供你参考:"
for single_example in dimension_example:
res += (f"\n问题:{single_example['question']}"
f"\n回答:{single_example['answer']}"
f"\n分析思路:{single_example['evaluate']}"
f"\n结果:{single_example['result']}\n")
return res
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
qas = json.loads(sample[self.text_key])
single_content_res = []
for qa in qas:
single_qa_res = []
for dimension, prompt in self.prompts.items():
local_result = self._llm_call_parse(qa, prompt, retry=2)
single_qa_res.append({"dimension": dimension, "result": local_result})
qa_response = {"qaId": qa["qaId"], "result": single_qa_res}
single_content_res.append(qa_response)
sample[self.text_key] = "Sucess"
self.save_sample(single_content_res, sample)
cost_time = time.time() - start
logger.info(f"task id: {self.task_id}, method: QAConditionEvaluator costs {cost_time:.6f} s")
return sample
def build_llm_prompt(self, *args, **kwargs) -> Dict:
templates = self.template_path.read_text(encoding="utf-8")
examples_dict = json.loads(self.examples_path.read_text(encoding="utf-8"))
prompts_dict = {}
for dimension in self.dimensions:
name, des = dimension["dimension"], dimension["description"]
dimension_example = self._process_examples(examples_dict.get(name))
dimension_prompt = templates.format(criterion=des, examples=dimension_example, question="{question}",
answer="{answer}")
prompts_dict[name] = dimension_prompt
return prompts_dict
def _llm_call_parse(self, data: Dict, prompt: str, retry: int = 2):
try:
for _ in range(retry):
response = self.llm(prompt.format(question=data["question"], answer=data["answer"]))
result = re.findall(self.pattern, response)
if result:
return "Y" in result[0]
except RuntimeError as e:
logger.error(f"method: QAConditionEvaluator execution error, cause by {e}")
return False
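A small sketch of the result-extraction step above: the regex only needs the `结果:Y/N` marker, so everything else in the model response is ignored.
```python
import re

pattern = r'结果[::] ?[YN]'   # same pattern as self.pattern above
for resp in ["分析思路:答案完整且直接。\n结果:Y", "结果: N", "模型没有按模板输出"]:
    hits = re.findall(pattern, resp)
    print(hits, "->", bool(hits) and "Y" in hits[0])
```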

View File

@@ -0,0 +1,36 @@
{
"回答是否有针对性": [
{
"question": "下发 IO 测试的命令格式是什么??",
"answer": "下发 IO 测试的命令格式为:",
"evaluate": "回答正面直接的回答了问题(命令格式是什么),但是答案以冒号结尾并不是完整的陈述,因此结果是N",
"result": "N"
},
{
"question": "HyperCDP的约束关系是什么?",
"answer": "HyperCDP特性和OceanStor Dorado的其他功能之间存在一定的约束关系。",
"evaluate": "回答提及到了HyperCDP特性和OceanStor Dorado的其他功能之间存在约束关系,并未直接回答约束关系是什么,因此结果是N。",
"result": "N"
}
],
"问题是否独立": [
{
"question": "这段文档主要讲述了什么内容?",
"answer": "这段文档主要讲述了在治疗月经失调和怀孕问题时,需要根据患者的具体症状来调整治疗方法。主要方法包括温养肾气、调理气血、调补冲任、疏肝解郁、燥湿化痰、理气调经以及活血调经等。",
"evaluate": "问题中省略了“这段文档”的具体指代,如果没有额外的上下文信息,单独这个问题可能会引起疑惑,因为听者或读者可能不知道指的是哪篇文章。在大多数情况下,为了使问题更加明确,提问者应该提供文章的标题、作者或相关背景信息。",
"result": "N"
},
{
"question": "这些鱼体内的不饱和脂肪酸含量非常高,这是为什么?",
"answer": "因为这些鱼体内的脂肪酸构成比较特殊,不饱和脂肪酸的含量非常高,而饱和脂肪酸的含量则较低。",
"evaluate": "在中文提问的语言习惯中,问题中使用“这些”来指代一组已经提及或已知的鱼是合适的。然而,问题本身并没有提供足够的信息来确定“这些鱼”具体指的是哪些鱼,这就造成了主体的不明确。",
"result": "N"
},
{
"question": "如何食用黑芝麻以达到最佳效果?",
"answer": "我们平日里吃到的芝麻制品多为芝麻酱和芝麻香油。吃整粒的芝麻对于营养的吸收来说并不是最好的,因为芝麻的外面有一层稍硬的膜,只有把它碾碎,其中的营养素才能被吸收。所以,芝麻最好碾碎了再吃。",
"evaluate": "在这个问题中,主体“黑芝麻”是具体的食物,客体“食用方法以达到最佳效果”也是明确的,因为人们通常关心如何食用某种食物以获得最大的健康益处。因此这个问题是符合标准的。",
"result": "Y"
}
]
}

View File

@@ -0,0 +1,107 @@
# QA评估插件
## 背景
基于维度和描述对QA对进行评估,支持用户自定义维度。
### 约束:
- 维度小于10个
- 维度名称低于20个字
- 依赖大模型服务,服务输入输出如下:
```python
# 输入
request_template = {
"prompt": "你好",
"max_length": 2024,
"top_n": 0.9,
"temperature": 0.9
}
# 输出
response_template = {
"response":"XXX"
}
```
#### 默认3个维度:
- 问题是否独立
- 回答是否有针对性
- 语法是否错误
## 调用接口输入
```python
inputs = [[
{
"businessData": {
"params": {
"taskId":1,
"LLMUrl":"https://x.x.x.x:xxxx/qwen",
"LLMHeaders":{"Content-Type": "application/json","User-Agent":"Client"},
"LLMBody":{
"prompt": "你好",
"max_length": 2024,
"top_n": 0.9,
"temperature": 0.9
},
"dimension":[
{"dimension":"回答是否有针对性",
"description":"回答应对问题中的所有疑问点提供正面、直接的回答,不应引起疑惑。同时,答案不应有任何内容的遗漏,需构成一个完整的陈述。"
},
{"dimension":"问题是否独立",
"description":"仅分析问题,问题的主体和客体都比较明确,即使有省略,也符合语言习惯。在不需要补充其他信息的情况下不会引起疑惑。"
},
{"dimension":"语法是否错误",
"description":"问题为疑问句,答案为陈述句; 不存在词语搭配不当的情况;连接词和标点符号不存在错用情况;逻辑混乱的情况不存在;语法结构都正确且完整;"
}
]
}
},
"passData": {
"data": "",
"text": "[{\"question\":\"什么是秋燥、秋困和秋冻?\",\"answer\":\"秋燥、秋困和秋冻是秋天常见的三种症状和养生问题。秋燥是指秋天天气干燥,导致人体水分流失,出现皮肤发痒、嘴唇起皮、鼻咽干燥等症状;秋困是指秋天天气凉爽,人体代谢下降,导致人感到无精打采、呵欠连天、昏昏欲睡等症状;秋冻是指秋天气温下降,人体需要适应气温的变化,不能一下子穿上很多衣服,让身体适应气温的变化。\",\"qaId\":1}]",
"meta": {
}
},
"contextData": {}
}
]]
```
## 调用接口输出
```python
outputs = [
{
"businessData": {
"params": {
"taskId": 1,
"LLMUrl": "https://x.x.x.x:xxxx/qwen",
"LLMHeaders": {
"Content-Type": "application/json",
"User-Agent": "Client"
},
"LLMBody": {
"prompt": "你好",
"max_length": 2024,
"top_n": 0.9,
"temperature": 0.9
},
"dimension": [
{
"dimension": "回答是否有针对性",
"description": "回答应对问题中的所有疑问点提供正面、直接的回答,不应引起疑惑。同时,答案不应有任何内容的遗漏,需构成一个完整的陈述。"
},
{
"dimension": "问题是否独立",
"description": "仅分析问题,问题的主体和客体都比较明确,即使有省略,也符合语言习惯。在不需要补充其他信息的情况下不会引起疑惑。"
},
{
"dimension": "语法是否错误",
"description": "问题为疑问句,答案为陈述句; 不存在词语搭配不当的情况;连接词和标点符号不存在错用情况;逻辑混乱的情况不存在;语法结构都正确且完整;"
}
]
}
},
"passData": {
"data": "",
"text": "[{\"qaId\": 1, \"result\": [{\"dimension\": \"\回\答\是\否\有\针\对\性\", \"result\": true}, {\"dimension\": \"\问\题\是\否\独\立\", \"result\": true}, {\"dimension\": \"\语\法\是\否\错\误\", \"result\": true}]}]",
"meta": {}
},
"contextData": {}
}
]
```

View File

@@ -0,0 +1,17 @@
你将会获得一个问答对,判断问答对是否满足以下标准:
标准:"{criterion}"
要求:
1. 结合以上标准,一步一步的分析问答对是否满足标准,按照模板输出你的回答。
2. 如果你对自己的判断没有较强的信心,直接算作不满足标准。
3. 你的最终裁定应该是'Y'表示是(符合标准)或'N'表示否(不符合标准)。
4. 如果你的回答不符合模板格式和规范,重新思考回答。
{examples}
问答对:
问题:"{question}"
答案:"{answer}"
模板:
结果:[插入结果N或Y]
分析思路:XXX
"""

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='TextQualityEvaluation',
module_path="ops.llms.text_quality_evaluation.process")

View File

@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
"""
Description: 指令数据生成常量
Create: 2023/11/20 16:20
"""
EVAL_DIMENSION_MAP = [
{
"dimension": "完备性",
"description": "数据的记录和信息是否是完整的,是否存在缺失的情况",
"score_name": "qua_score"
},
{
"dimension": "一致性",
"description": "同一指标在不同地方的结果是否一致",
"score_name": "logic_score"
},
{
"dimension": "有效性",
"description": "该样本涉及某领域的信息量",
"score_name": "effective_score"
}
]
BUSINESS_EVAL_DIMENSION_MAP = [
{
"dimension": "金融",
"description": "涉及保险合同、保险问答、年报、资产负债表、金融新闻、保险从业资格CICE、基金从业资格、期货从业资格、注册会计师(CPA"
")、理财规划师、税务师、精算师-金融数学、经济师、证券从业资格、银行从业资格等相关金融行业知识",
"score_name": "finance_score"
},
{
"dimension": "存储",
"description": "存储",
"score_name": "storage_score"
},
{
"dimension": "医疗",
"description": "涵盖中医科、儿科、内科、口腔科、外科、妇产科、心理科学、急诊科、感染与免疫科、生殖健康科、男性健康科、皮肤性病科、眼耳鼻喉科、神经科学、肿瘤科等医疗相关领域",
"score_name": "medical_score"
}
]

View File

@@ -0,0 +1,16 @@
name: '文本质量评估'
name_en: 'Text Quality Evaluation'
description: '通过用户维度和相应描述进行文本评估。'
description_en: 'Text evaluation is performed based on user dimensions and corresponding descriptions.'
language: 'python'
vendor: 'huawei'
raw_id: 'TextQualityEvaluation'
version: '1.0.0'
types:
- 'consolidate'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,113 @@
# -- encoding: utf-8 --
"""
Description: 基于LLM通过用户设置维度和相应描述进行文本质量评估
Create: 2025/3/14 11:00
"""
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
from typing import Dict, Any
from loguru import logger
from datamate.common.utils.text_splitter import TextSplitter
from datamate.core.base_op import LLM
from .constant import EVAL_DIMENSION_MAP, BUSINESS_EVAL_DIMENSION_MAP
from .prompt_config import TEXT_QUALITY_EVALUATE_TEMPLATE
CHUNK_SIZE = 4000
CHUNK_OVERLAP = 0
class TextQualityEvaluation(LLM):
def __init__(self, *args, **kwargs):
super(TextQualityEvaluation, self).__init__(*args, **kwargs)
self.total_length = 0
self.text_list = []
self.total_scores = [0, 0, 0, 0, 0, 0]
self.text_splitter = TextSplitter(1024 * 1024, CHUNK_SIZE, CHUNK_OVERLAP)
self.pattern = r'\d+\.\d+'
self.task_id = kwargs.get("taskId", "default_id")
self.llm = self.get_llm(*args, **kwargs)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
tmp_text_list = self.text_splitter.split_text(sample[self.text_key])
logger.info(f"task id: {self.task_id}, the length of chunks: {len(tmp_text_list)}")
self.text_list = tmp_text_list
text_res = {}
self._evaluate_concurrently_text(text_res)
sample[self.text_key] = "Success"
self.save_sample([text_res], sample)
cost_time = time.time() - start
logger.info(f"task id: {self.task_id}, method: TextQualityEvaluation costs {cost_time:.6f} s")
self.text_list = []
return sample
def _evaluate_concurrently_text(self, text_res, max_workers: int = 5):
for eval_dimension in EVAL_DIMENSION_MAP + BUSINESS_EVAL_DIMENSION_MAP:
text_res[eval_dimension["score_name"]] = 0
self.total_scores = [0, 0, 0, 0, 0, 0]
self.total_length = 0
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# 使用 partial 绑定多参数
future_to_params = {
executor.submit(
partial(self.get_current_score_concurrently, text)): text
for text in self.text_list
}
for future in as_completed(future_to_params):
self.parse_execute_result(future, future_to_params)
for _, eval_dimension in enumerate(EVAL_DIMENSION_MAP + BUSINESS_EVAL_DIMENSION_MAP):
total_score = self.total_scores[_]
text_res[eval_dimension["score_name"]] = 0
if self.total_length > 0:
text_res[eval_dimension["score_name"]] = total_score / self.total_length
def parse_execute_result(self, future, future_to_params):
text = future_to_params[future]
try:
scores = future.result()
if scores and len(scores) == len(self.total_scores):
self.total_length += len(text)
for _, score in enumerate(scores):
self.total_scores[_] = self.total_scores[_] + score * len(text)
except Exception as e:
logger.error(f"Evaluate error, error details: {e}")
def get_current_score_concurrently(self, text, retry: int = 2):
dimension_list = []
for eval_dimension in EVAL_DIMENSION_MAP + BUSINESS_EVAL_DIMENSION_MAP:
dimension = eval_dimension["dimension"] + ":" + eval_dimension["description"]
dimension_list.append(dimension)
prompt = TEXT_QUALITY_EVALUATE_TEMPLATE.format(context=text, dimension0=dimension_list[0],
dimension1=dimension_list[1], dimension2=dimension_list[2],
dimension3=dimension_list[3], dimension4=dimension_list[4],
dimension5=dimension_list[5])
retry_time = 0
while True:
try:
return self.get_scores(prompt)
except RuntimeError as e:
if retry_time < retry:
retry_time += 1
else:
logger.warning(f"Request LLM error, details: {e}")
return []
def get_scores(self, prompt):
response = self.llm(prompt)
scores_str_list = response.split(",")
scores = []
for scores_str in scores_str_list:
decimals = re.findall(self.pattern, scores_str)
if decimals:
score = float(decimals[-1])
if 0 <= score <= 1:
scores.append(score)
logger.info(f"current evaluate scores: {scores}")
return scores
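A hedged sketch of the aggregation above: each chunk's per-dimension scores are weighted by chunk length and normalized by the total evaluated length (the values below are made up for illustration).
```python
chunks = [("短文本", [0.8, 0.6]), ("一段明显更长的文本内容", [0.4, 0.5])]   # (chunk, per-dimension scores)
total_len = sum(len(text) for text, _ in chunks)
weighted = [sum(scores[i] * len(text) for text, scores in chunks) / total_len for i in range(2)]
print(weighted)  # length-weighted average per dimension
```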

View File

@@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
"""
Description: prompt 配置文件
Create: 2024/02/07
"""
TEXT_QUALITY_EVALUATE_TEMPLATE = """
===
<Role>:
你是一位擅长文本质量评估的数据处理专家。
===
<Instructions>:
你擅长根据已知的Context内容, 结合每个评估标准Dimension,给出该标准下文本质量评估结果,结果为0-1的小数:
- 充分理解Context内容,质量评估时要覆盖Context的主要内容,不能随意臆想和编造。
- 如果你对自己的判断没有较强的信心,直接算作不满足标准,输出0.0分。
- 总计会有六个评估标准,分别是Dimension0~Dimension5,每个评估标准都需要给出对应标准下的评估分数,分数为0-1的小数。
- 每个评估标准都只输出最终的打分,不能输出额外的内容;每个评估标准的评估结果之间用英文逗号“,”分开。
===
<Task>
请基于下面的参考信息和<Instructions>,生成符合要求的内容。
输入:
参考信息Context是: "{context}"
第一个评估标准Dimension0是: "{dimension0}"
第二个评估标准Dimension1是: "{dimension1}"
第三个评估标准Dimension2是: "{dimension2}"
第四个评估标准Dimension3是: "{dimension3}"
第五个评估标准Dimension4是: "{dimension4}"
第六个评估标准Dimension5是: "{dimension5}"
输出:
"""

View File

@@ -0,0 +1,98 @@
{
"对文本逻辑连贯性的评分,范围1-5分": [
{
"question": "今天天气很好,我吃了苹果。数学题很难,天空是蓝色的。狗会叫,鸟会飞。1234567890。",
"answer": "1",
"evaluate": "这是一段完全没有逻辑的文字,主题不断跳跃,没有任何结构可循。",
"result": "1"
},
{
"question": "我今天早上吃了面包,然后去了公园。天气很好,但突然下起了雨。我思考人生的意义,然后决定回家吃冰淇淋。",
"answer": "2",
"evaluate": "内容尚可理解,但逻辑连贯性较差,主题跳跃明显。",
"result": "2"
},
{
"question": "人工智能正在改变世界。它可以帮助我们解决复杂的问题,但也带来了伦理挑战。例如,自动驾驶汽车需要做出道德决策。此外,人工智能还可以用于医疗诊断。",
"answer": "3",
"evaluate": "内容结构尚可,逻辑基本连贯,但存在少量混乱或跳跃。",
"result": "3"
},
{
"question": "人工智能正在改变世界。它可以帮助我们解决复杂的问题,但也带来了伦理挑战。例如,自动驾驶汽车需要做出道德决策。此外,人工智能还可以用于医疗诊断。这些应用展示了其潜力和局限性。",
"answer": "4",
"evaluate": "内容结构清晰,逻辑连贯,仅有极小混乱或跳跃。",
"result": "4"
},
{
"question": "人工智能正在改变世界。它可以帮助我们解决复杂的问题,但也带来了伦理挑战。例如,自动驾驶汽车需要做出道德决策。此外,人工智能还可以用于医疗诊断。这些应用展示了其潜力和局限性,同时也引发了关于技术与人类关系的深入讨论。",
"answer": "5",
"evaluate": "内容结构清晰,逻辑严密,无任何混乱或跳跃。",
"result": "5"
}
],
"对文本格式一致性的评分,范围1-5分": [
{
"question": "巴黎的埃菲尔铁塔很高,伦敦的塔桥很老,纽约的自由女神像很美。东京的涩谷很有名,新加坡的滨海湾很繁华。",
"answer": "1",
"evaluate": "这是一段完全没有格式一致性的文字,段落之间没有任何分隔,内容完全混乱。",
"result": "1"
},
{
"question": "巴黎的埃菲尔铁塔很高,伦敦的塔桥很老,纽约的自由女神像很美。东京的涩谷很有名,新加坡的滨海湾很繁华。这些地方都很有特色,但描述方式不统一。",
"answer": "2",
"evaluate": "内容尚可理解,但格式一致性较差,段落之间没有任何分隔,存在较多格式混乱。",
"result": "2"
},
{
"question": "巴黎的埃菲尔铁塔很高。伦敦的塔桥很老。纽约的自由女神像很美。东京的涩谷很有名。新加坡的滨海湾很繁华。这些地方都有独特的建筑风格。",
"answer": "3",
"evaluate": "内容结构尚可,格式基本一致,但存在少量格式混乱或不一致。",
"result": "3"
},
{
"question": "巴黎的埃菲尔铁塔很高。\n伦敦的塔桥很老。\n纽约的自由女神像很美。\n东京的涩谷很有名。\n新加坡的滨海湾很繁华。\n这些地方都有独特的建筑风格。",
"answer": "4",
"evaluate": "内容结构清晰,格式一致,仅有极小格式混乱或不一致。",
"result": "4"
},
{
"question": "### 世界著名建筑\n- **巴黎的埃菲尔铁塔**:高耸入云,象征浪漫。\n- **伦敦的塔桥**:历史悠久,充满工业风格。\n- **纽约的自由女神像**:象征自由,举世闻名。\n- **东京的涩谷**:现代都市的代表,充满活力。\n- **新加坡的滨海湾**:融合自然与现代建筑,令人惊叹。\n\n这些地方都有独特的建筑风格,展现了不同的文化特色。",
"answer": "5",
"evaluate": "内容结构清晰,格式完全一致,无任何混乱或格式错误。",
"result": "5"
}
],
"对文本信息完整性的评分,范围1-5分": [
{
"question": "这款手机很好。",
"answer": "1",
"evaluate": "这是一段完全没有信息完整性的文字,内容过于简单,没有任何具体信息。",
"result": "1"
},
{
"question": "这款手机很好,屏幕很大。",
"answer": "2",
"evaluate": "内容尚可理解,但信息完整性较差,缺乏关键细节,如性能、价格等。",
"result": "2"
},
{
"question": "这款手机很好,屏幕很大,运行速度快。",
"answer": "3",
"evaluate": "内容结构尚可,信息基本完整,但存在关键信息遗漏,如摄像头质量、价格等。",
"result": "3"
},
{
"question": "这款手机很好,屏幕很大,运行速度快,摄像头也很清晰。",
"answer": "4",
"evaluate": "内容结构清晰,信息较为完整,仅有少量关键信息遗漏。",
"result": "4"
},
{
"question": "### 这款手机的评测\n- **屏幕**:6.7英寸AMOLED,显示效果出色。\n- **性能**:搭载最新处理器,运行速度快,流畅无卡顿。\n- **摄像头**:4800万像素主摄,支持夜景模式,成像清晰。\n- **价格**:起售价为899美元,性价比高。\n- **优点**:屏幕显示效果好,性能强劲。\n- **缺点**:电池容量较小,续航一般。\n\n总体来说,这是一款综合表现优秀的手机。",
"answer": "5",
"evaluate": "内容结构清晰,信息完整且详细,涵盖了所有关键方面。",
"result": "5"
}
]
}

View File

@@ -0,0 +1,17 @@
你将会获得一个问答对,判断问答对是否满足以下标准:
标准:"{criterion}"
要求:
1. 结合以上标准,一步一步的分析question文本是否满足标准,这里的question不是指一个问题,只是输入的文本,按照模板输出每个维度的分数,你的result就是分数。额外输入一个维度平均分
2. 如果你对自己的判断没有较强的信心,直接算作不满足标准。
3. 你的最终裁定应该是1-5的评分,严格按照examples中打分的标准。
4. 如果你的回答不符合模板格式和规范,重新思考回答。
{examples}
问答对:
问题:"{question}"
答案:"{answer}"
模板:
结果:[1或2或3或4或5]
分析思路:XXX
"""

View File

@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
import sys
from pathlib import Path
from datamate.common.utils.custom_importer import CustomImporter
def _configure_importer():
base_path = Path(__file__).resolve().parent
sys.meta_path.append(CustomImporter(base_path))
_configure_importer()
def _import_operators():
from . import content_cleaner
from . import credit_card_number_cleaner
from . import email_cleaner
from . import emoji_cleaner
from . import extra_space_cleaner
from . import full_width_characters_cleaner
from . import garble_characters_cleaner
from . import html_tag_cleaner
from . import id_number_cleaner
from . import img_watermark_remove
from . import invisible_characters_cleaner
from . import ip_address_cleaner
from . import legend_cleaner
from . import phone_number_cleaner
from . import political_word_cleaner
from . import sexual_and_violent_word_cleaner
from . import text_to_word
from . import traditional_chinese
from . import unicode_space_cleaner
from . import url_cleaner
from . import xml_tag_cleaner
from . import img_enhanced_brightness
from . import img_enhanced_contrast
from . import img_enhanced_saturation
from . import img_enhanced_sharpness
from . import img_perspective_transformation
from . import img_direction_correct
from . import img_denoise
from . import img_shadow_remove
from . import img_type_unify
from . import img_resize
from . import remove_duplicate_sentences
from . import knowledge_relation_slice
_import_operators()

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ContentCleaner',
module_path="ops.mapper.content_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '文档目录去除'
name_en: 'Document Contents Removal'
description: '去除文档中的目录。'
description_en: 'Removes tables of contents from documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'ContentCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: ''
after: ''
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,64 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 文档目录去除
Create: 2025/01/13
"""
import re
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class ContentCleaner(Mapper):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.no_content_count = 3 # 连续不符合目录结构的行数阈值
# 目录标题
self.content_text_pattern = r"^ *(目 *录|CONTENT(S)?) *$"
# 目录行 前缀格式
self.content_preface_pattern = r"^ *(前言|About This Document|\d+(\.\d+)*|[a-zA-Z]+(\.\d+)*)"
# 目录行 中间格式
self.content_middle_pattern = r"\.{7,}"
# 目录行 结尾格式
self.content_end_pattern = r"(\d|错误!未定义书签。|[IXV]+) *$"
self.content_pattern = self.content_preface_pattern + ".*" + self.content_end_pattern
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._content_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: ContentCleaner costs {time.time() - start:6f} s")
return sample
def _content_filter(self, input_data: str):
count = 0 # 记录不符合目录结构的次数,连续3行不满足要求,则认为已经进入正文
# 目录起始和结束索引
content_start_index, content_end_index = -1, -1
lines = input_data.split("\n")
for i, line in enumerate(lines):
if content_start_index >= 0 and count >= self.no_content_count:
break
# 首先匹配目录或content字眼
if content_start_index < 0 and re.match(self.content_text_pattern, line, re.IGNORECASE):
content_start_index = i
content_end_index = i
# 匹配两种形式的目录行
# 1. 以指定格式开始、指定格式结尾;2.该行包含点数量超过7个
elif content_start_index >= 0 and (re.match(self.content_pattern, line, re.IGNORECASE)
or re.search(self.content_middle_pattern, line)):
content_end_index = i
count = 0
elif content_start_index >= 0 and not (re.match(self.content_pattern, line, re.IGNORECASE)
or re.search(self.content_middle_pattern, line)):
count += 1
if 0 <= content_start_index < content_end_index:
res = "\n".join(lines[:content_start_index] + lines[content_end_index + 1:])
else:
# 只有目录关键字时,关键字不去除;或不符合目录结构,返回原文
res = "\n".join(lines)
return res
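For reference, a simplified sketch of the line shapes the cleaner treats as table-of-contents lines (patterns trimmed from the ones above; the sample lines are made up):
```python
import re

toc_title = re.compile(r"^ *(目 *录|CONTENT(S)?) *$", re.IGNORECASE)
toc_line = re.compile(r"^ *(前言|\d+(\.\d+)*).*(\d|[IXV]+) *$")   # numbered prefix ... trailing page number
dot_leader = re.compile(r"\.{7,}")                                # long runs of dots

print(bool(toc_title.match("目 录")))                    # True
print(bool(toc_line.match("1.2 安装部署 15")))            # True
print(bool(dot_leader.search("附录A ............. 3")))   # True
```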

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='AnonymizedCreditCardNumber',
module_path="ops.mapper.credit_card_number_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '信用卡号匿名化'
name_en: 'Credit Card Number Anonymization'
description: '信用卡号匿名化'
description_en: 'Anonymizes credit card numbers.'
language: 'python'
vendor: 'huawei'
raw_id: 'AnonymizedCreditCardNumber'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '这个是信用卡号:4111111111111111'
after: '这个是信用卡号:<credit_card_number>'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,83 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 信用卡号匿名化
Create: 2024/12/5 15:43
"""
from loguru import logger
import re
import time
from typing import Dict, Any
from datamate.core.base_op import Mapper
class AnonymizedCreditCardNumber(Mapper):
def __init__(self, *args, **kwargs):
super(AnonymizedCreditCardNumber, self).__init__(*args, **kwargs)
self.re_compile = self._get_credit_card_re_compile()
@staticmethod
def _verify_credit_card_num(credit_card_num: str):
"""信用卡号码校验"""
# 从右到左翻转
digits = [int(x) for x in reversed(credit_card_num) if x.isdigit()]
# 对偶数位数字翻倍 d*2
even_digits = [d * 2 for d in digits[1::2]]
# 如果对某个数字翻倍之后结果是一个两位数,将这两位数字加在一起
even_digits = [d // 10 + d % 10 for d in even_digits]
# 将上一步所有一位数相加
even_sum = sum(even_digits)
# 将卡号里从右到左奇数位上所有数字相加
odd_sum = sum(digits[::2])
# 将even_sum和odd_sum相加,能被10整数为合法,否则不合法
if (odd_sum + even_sum) % 10 == 0:
return True
return False
@staticmethod
def _get_credit_card_re_compile():
separator_symbol = r"([- ]?)"
# American Express 以 34 或 37 开头的 15 位数号码 格式:NNNN-NNNNNN-NNNNN 或 NNNN NNNNNN NNNNN
american_express = "3[47][0-9]{2}" + separator_symbol + "[0-9]{6}" + separator_symbol + "[0-9]{5}"
# 中国银联 以 62 或 60 开头,是一个 16 位数号码。 格式:NNNN-NNNN-NNNN-NNNN 或 NNNN NNNN NNNN NNNN
china_union_pay = r"(6[02]\d{2})" + r"(%s\d{%d}){%d}" % (separator_symbol, 4, 3)
# Diner's Club 以 300–305、36、38 或 39、3095 开头, 14 位数号码 格式:NNNN-NNNNNN-NNNN 或 NNNN NNNNNN NNNN。
diners_club = r"(30[0-5]\d|3[689]\d{2}|3095)" + separator_symbol + r"[0-9]{6}" + separator_symbol + r"[0-9]{4}"
# Discover 以 6011、644–649 或 65 开头的 16 位数号码 格式:NNNN-NNNN-NNNN-NNNN 或 NNNN NNNN NNNN NNNN
discover = r"(64[4-9]\d|65\d{2}|6011)" + r"(%s\d{%d}){%d}" % (separator_symbol, 4, 3)
# JCB 以 3528 到 3589 开头的 16 位数字, 格式:NNNN-NNNN-NNNN-NNNN 或 NNNN NNNN NNNNNNNN
jcb = r"(352[89]|35[3-8]\d)" + separator_symbol + r"[0-9]{4}" + (
r"((%s\d{%d}){%d}" % (separator_symbol, 4, 2) + ")|" + separator_symbol + r"[0-9]{8}")
# Mastercard 以 51–55 或 2221–2720 开头的 16 位数字 格式:NNNN-NNNN-NNNN-NNNN 或 NNNN NNNN NNNN NNNN
master_card = r"(5[1-5]\d{2}|222[1-9]|22[3-9]\d|2[3-6]\d{2}|27[01]\d|2720)" + r"(%s\d{%d}){%d}" \
% (separator_symbol, 4, 3)
# visa 以4开头 16 位数号码 格式:NNNN-NNNN-NNNN-NNNN 或 NNNN NNNN NNNN NNNN
visa = r"4\d{3}" + r"(%s\d{%d}){%d}" % (separator_symbol, 4, 3)
credit_card_pattern = r"(?<=[^\d])(%s|%s|%s|%s|%s|%s|%s)(?=[^\d])" % (
american_express, china_union_pay, diners_club,
discover, jcb, master_card, visa)
credit_card_re_compile = re.compile(credit_card_pattern)
return credit_card_re_compile
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._credit_card_number_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: CreditCardNumberCleaner costs {time.time() - start:6f} s")
return sample
def _credit_card_number_filter(self, input_data: str):
"""提取信用卡号号码"""
# 首尾各补一个非数字哨兵字符(此处假定用全角空格),使边界处的 (?<=[^\d]) / (?=[^\d]) 断言能够匹配,结尾通过 [1:-1] 去掉
input_data = ''.join(['\u3000', input_data, '\u3000'])
# 抽取符合信用卡正则匹配的字符串
credit_card_nums = [item.group(1) for item in self.re_compile.finditer(input_data)]
# 判断抽取的字符串是不是真实的信用卡号
for credit_card_num in credit_card_nums:
if self._verify_credit_card_num(credit_card_num):
# 替换有效信用卡号号码为<credit_card_number>
credit_card_num_pattern = r"(?<=[^\d]){}(?=[^\d])".format(credit_card_num)
input_data = re.compile(credit_card_num_pattern).sub("<credit_card_number>", input_data)
return input_data[1:-1]
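A stand-alone sketch of the Luhn check implemented in `_verify_credit_card_num` (the numbers below are well-known test values, not real cards):
```python
def luhn_ok(number: str) -> bool:
    digits = [int(c) for c in reversed(number) if c.isdigit()]
    doubled = [d * 2 for d in digits[1::2]]            # double every second digit from the right
    doubled = [d // 10 + d % 10 for d in doubled]      # two-digit results: add their digits
    return (sum(digits[::2]) + sum(doubled)) % 10 == 0

print(luhn_ok("4111111111111111"))  # True  - passes the checksum, so it would be anonymized
print(luhn_ok("4111111111111112"))  # False - fails the checksum, left untouched
```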

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='EmailNumberCleaner',
module_path="ops.mapper.email_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '邮件地址匿名化'
name_en: 'Email Address Anonymization'
description: '邮件地址匿名化'
description_en: 'Anonymizes email addresses.'
language: 'python'
vendor: 'huawei'
raw_id: 'EmailNumberCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '这个是邮箱号:test_email@gmail.com'
after: '这个是邮箱号:<email>'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,47 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 邮件地址匿名化
Create: 2025/01/15
"""
from loguru import logger
import re
import time
from typing import Dict, Any
from email_validator import validate_email, EmailNotValidError
from datamate.core.base_op import Mapper
class EmailNumberCleaner(Mapper):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.front_email_pattern = r'(?<=[^0-9a-zA-Z\!\#\$\%\&\'\*\+\-\/\=\?\^\_\`\{\|\}\~\-])'
self.back_email_pattern = r'(?=[^0-9a-zA-Z\!\#\$\%\&\'\*\+\-\/\=\?\^\_\`\{\|\}\~\-])'
self.email_pattern = r'([a-zA-Z\d.\-+_]+\s?@\s?[a-zA-Z\d.\-+_]+\.[a-zA-Z0-9]{2,6})'
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._email_number_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: EmailCleaner costs {time.time() - start:6f} s")
return sample
def _email_number_filter(self, input_data: str):
""" 邮箱匿名化"""
# 首尾各补一个不属于邮箱字符集的哨兵字符(此处假定用全角空格),使边界断言能够匹配,结尾通过 [1:-1] 去掉
mixed_data = ''.join(['\u3000', input_data, '\u3000'])
paired_emails = re.compile(self.front_email_pattern + self.email_pattern + self.back_email_pattern).findall(
mixed_data)
if paired_emails:
for email in paired_emails:
try:
# 验证电子邮件地址
validate_email(email, check_deliverability=False)
mixed_data = re.compile(self.front_email_pattern + re.escape(email) + self.back_email_pattern).sub(
"<email>", mixed_data, count=1)
except EmailNotValidError as err:
# 日志打印该电子邮件地址无效(不显示具体电子邮件地址)
logger.error(f"email is abnormal email form: {err}")
return mixed_data[1:-1]
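A minimal sketch of the validation call used above: `check_deliverability=False` keeps it a pure syntax check, with no DNS lookup.
```python
from email_validator import validate_email, EmailNotValidError

for candidate in ["test_email@gmail.com", "not-an-email@"]:
    try:
        validate_email(candidate, check_deliverability=False)
        print(candidate, "-> would be replaced with <email>")
    except EmailNotValidError as err:
        print(candidate, "-> left as-is:", err)
```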

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='EmojiCleaner',
module_path="ops.mapper.emoji_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: '文档表情去除'
name_en: 'Emoticon Removal'
description: '去除文档中表情字符或者emoji符号。'
description_en: 'Removes emoticons or emojis from documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'EmojiCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '使用方式很简单,只需要将代码放入Markdown文本中即可,富文本格式可直接复制表情😀使用。'
after: '使用方式很简单,只需要将代码放入Markdown文本中即可,富文本格式可直接复制表情使用。'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,27 @@
"""
Description: 文档表情去除
Create: 2023/12/7 15:43
"""
import time
from typing import Dict, Any
import emoji
from loguru import logger
from datamate.core.base_op import Mapper
class EmojiCleaner(Mapper):
@staticmethod
def _emoji_filter(input_data: str):
res = []
for input_s in input_data.split('\n'):
res.append(emoji.replace_emoji(input_s, replace=''))
return '\n'.join(res)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._emoji_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, method: EmojiCleaner costs {time.time() - start:6f} s")
return sample

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='ExtraSpaceCleaner',
module_path="ops.mapper.extra_space_cleaner.process")

View File

@@ -0,0 +1,17 @@
name: '多余空格去除'
name_en: 'Redundant Space Removal'
description: '移除文档首尾、句中或标点符号附近多余空格和 tab 等。'
description_en: 'Removes redundant spaces and tabs at the beginning and end of documents,
in sentences, or near punctuations.'
language: 'python'
vendor: 'huawei'
raw_id: 'ExtraSpaceCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: ' 人工智能的研究历史有着一条从以“推理”为重 点,到以“知识”为重点,再到以“学习”为重点的自然、清晰的脉络。 '
after: '人工智能的研究历史有着一条从以“推理”为重点,到以“知识”为重点,再到以“学习”为重点的自然、清晰的脉络。'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,69 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 多余空格去除
Create: 2025/01/13
"""
import re
import time
from pathlib import Path
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class ExtraSpaceCleaner(Mapper):
"""去除多余空格、多余空行,包括文档首尾空格、首尾tab
【注意】去除多余空格前,会先将文档中所有空格规范化为\u0020
"""
def __init__(self, *args, **kwargs):
# 匹配文档中非常见的unicode 空格
super().__init__(*args, **kwargs)
self.white_space_pattern = ('[\u00A0 \u1680 \u2000-\u200D \u2028-\u2029'
' \u202F \u205F \u3000 \u180E \u2060 \uFEFF]')
self._file_path = Path(__file__).parent / 'resources' / 'special_token.txt'
self.escaped_special_chars = self._get_escaped_special_chars() # 加载标点符号
# 匹配文章中,连续多个空格
extra_space_pattern = r" {2,}"
# 匹配多个空格、换行符混排情况
extra_line_pattern = r"( |\n){2,}"
# 匹配中文、符号间多余空格
extra_space_in_chinese_pattern = r"(?<=[\u4e00-\u9fa5" + self.escaped_special_chars + r"]) +(?=[\u4e00-\u9fa5" \
+ self.escaped_special_chars + r"])"
self.extra_space_re_compile = re.compile(extra_space_pattern)
self.extra_space_in_chinese_re_compile = re.compile(extra_space_in_chinese_pattern)
self.extra_line_re_compile = re.compile(extra_line_pattern)
self.white_space_pattern_compile = re.compile(self.white_space_pattern)
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._clean_extra_space(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: ExtraSpaceCleaner costs {time.time() - start:6f} s")
return sample
def _get_escaped_special_chars(self) -> str:
with open(self._file_path, 'r', encoding='utf-8') as f:
self._special_token = f.read().splitlines()
res = ''.join([re.escape(char) for char in self._special_token]) # 将特殊字符转义并拼接成字符串
return res
def _clean_extra_space(self, input_data: str) -> str:
# 将文档中非常见的 unicode 空格,如 u2008,转换为正常空格(半角空格)
input_data = self.white_space_pattern_compile.sub('\u0020', input_data)
# 移除文档首尾、句中或标点符号附近多余空格和 tab
input_data = input_data.strip()
# 逐行移除首尾空格
text = "\n".join([line.strip() for line in input_data.split("\n")])
# 首尾各补一个哨兵字符(此处假定用全角空格,不会被后续空格正则合并),结尾通过 [1:-1] 去掉
text = ''.join(['\u3000', text, '\u3000'])
# 连续空格替换为一个正常空格
remove_extra_space = self.extra_space_re_compile.sub("\u0020", text)
# 去除中文、符号间的空格
remove_extra_space_in_chinese = self.extra_space_in_chinese_re_compile.sub("", remove_extra_space)
# 去除连续换行符
remove_duplicate_line = self.extra_line_re_compile.sub("\n", remove_extra_space_in_chinese)
return remove_duplicate_line[1:-1]
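A simplified sketch of the space handling above (regexes trimmed; not the full operator, which also loads `special_token.txt`):
```python
import re

text = "\u3000 人工智能 的研究  历史 \u00A0"
text = re.sub(r'[\u00A0\u1680\u2000-\u200D\u202F\u205F\u3000]', '\u0020', text)  # normalize odd Unicode spaces
text = re.sub(r' {2,}', ' ', text.strip())                                        # collapse runs of spaces
text = re.sub(r'(?<=[\u4e00-\u9fa5]) +(?=[\u4e00-\u9fa5])', '', text)             # drop spaces between CJK chars
print(text)  # -> 人工智能的研究历史
```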

View File

@@ -0,0 +1,53 @@
~
·
@
#
%
&
*
+
-
=
{
}
|
`
!
$
^
(
)
_
[
]
\
:
"
;
'
<
>
?
,
/
.

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='FullWidthCharacterCleaner',
module_path="ops.mapper.full_width_characters_cleaner.process")

View File

@@ -0,0 +1,18 @@
name: '全角转半角'
name_en: 'Full-to-Half Width Character'
description: '将文档中的所有全角字符转换成半角字符。'
description_en: 'Converts all full-width characters in documents to half-width characters.'
language: 'python'
vendor: 'huawei'
raw_id: 'FullWidthCharacterCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: 'Residential and commercial design, site inspections, working drawings,
Minicad, renderings.'
after: 'Residential and commercial design, site inspections, working drawings, MiniCad,
renderings.'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,46 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description: 全角转半角
Create: 2025/01/13
"""
import time
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class FullWidthCharacterCleaner(Mapper):
"""将文档中的所有全角字符转换成半角字符"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._full_to_half_dict = {
'': '"', '': '#', '': '$', '': '%', '': '&', '': "'", '': '*', '': '+',
'': '-', '': '.', '': '/', '': '0', '': '1', '': '2', '': '3', '': '4',
'': '5', '': '6', '': '7', '': '8', '': '9', '': '<', '': '=', '': '>',
'': '@', '': 'A', '': 'B', '': 'C', '': 'D', '': 'E', '': 'F', '': 'G',
'': 'H', '': 'I', '': 'J', '': 'K', '': 'L', '': 'M', '': 'N', '': 'O',
'': 'P', '': 'Q', '': 'R', '': 'S', '': 'T', '': 'U', '': 'V', '': 'W',
'': 'X', '': 'Y', '': 'Z', '': '[', '': '\\', '': ']', '': '^', '_': '_',
'': '`', '': 'a', '': 'b', '': 'c', '': 'd', '': 'e', '': 'f', '': 'g',
'': 'h', '': 'i', '': 'j', '': 'k', '': 'l', '': 'm', '': 'n', '': 'o',
'': 'p', '': 'q', '': 'r', '': 's', '': 't', '': 'u', '': 'v', '': 'w',
'': 'x', '': 'y', '': 'z', '': '{', '': '|', '': '}', '': '~'
}
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._full_width_character_filter(sample[self.text_key])
logger.info(f"fileName: {sample[self.filename_key]}, "
f"method: FullWidthCharactersCleaner costs {time.time() - start:6f} s")
return sample
def _full_width_character_filter(self, input_data: str):
res = []
for input_str in input_data.split('\n'):
res.append("".join(self._full_to_half_dict.get(char, char) for char in input_str))
return '\n'.join(res)

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='GrableCharactersCleaner',
module_path="ops.mapper.garble_characters_cleaner.process")

View File

@@ -0,0 +1,17 @@
name: '文档乱码去除'
name_en: 'Garbled Character Removal'
description: '去除文档中的乱码和无意义的unicode。'
description_en: 'Removes garbled characters and meaningless Unicode characters from
documents.'
language: 'python'
vendor: 'huawei'
raw_id: 'GrableCharactersCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '文档乱码����'
after: '文档乱码'
inputs: 'text'
outputs: 'text'

View File

@@ -0,0 +1,54 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Description:
本插件实现将文档中乱码去除功能
实现逻辑:
1. 正则判断该字符的unicode编码是否在乱码范围内。若在范围内,则去除,不在范围内,则保留。
2. 运行前,加载乱码字符范围的配置文件,即charset.json。该json文件中,key为字符集名称,value为unicode编码范围的集合。
Create: 2025/01/13
"""
import json
import re
import time
from pathlib import Path
from typing import Dict, Any
from loguru import logger
from datamate.core.base_op import Mapper
class GrableCharactersCleaner(Mapper):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._file_path = str(Path(__file__).parent / 'resources' / 'charset.json')
self.unicode_grable_code_list = self.get_unicode_grable_code_list() # 乱码unicode编码的十进制范围的集合
self.grable_re_compile = re.compile("[" + self.unicode_grable_code_list + "]")
def get_unicode_grable_code_list(self):
"""获取乱码unicode编码范围"""
res = ""
with open(self._file_path, 'r', encoding='utf-8') as f:
charset_number_list = json.load(f)
for number_ranges in charset_number_list.values():
for number_range in number_ranges:
number_range_list = number_range.split(",")
if len(number_range_list) < 2:
logger.error(f"number_range_list size is {len(number_range_list)}, formatting error")
continue
res += number_range_list[0] + "-" + number_range_list[1]
return res
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
start = time.time()
sample[self.text_key] = self._grable_characters_filter(sample[self.text_key])
logger.info(
f"fileName: {sample[self.filename_key]}, method: GrableCharactersCleaner costs {time.time() - start:6f} s")
return sample
def _grable_characters_filter(self, input_data: str):
"""去除文档中的乱码"""
return self.grable_re_compile.sub("", input_data)
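A minimal sketch of the range-based removal: a character class is built from the configured Unicode ranges (only the U+FFFD placeholder range is shown here) and matching characters are deleted.
```python
import re

garbled_ranges = "\uFFFD-\uFFFD"                    # in the real operator this comes from charset.json
garbled_re = re.compile("[" + garbled_ranges + "]")
print(garbled_re.sub("", "文档乱码\ufffd\ufffd"))     # -> 文档乱码
```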

View File

@@ -0,0 +1,24 @@
{
"注音符号东亚": [
"\u3100,\u312F"
],
"拉丁文补充1": [
"\u00C0,\u00D6",
"\u00D8,\u00F6",
"\u00F8,\u00FF"
],
"拉丁文扩展,A": [
"\u0100,\u017F"
],
"拉丁文扩展,B": [
"\u0180,\u024F"
],
"私人使用区域": [
"\uE000,\uF8FF",
"\\U000f0000,\\U000ffffd",
"\\U00100000,\\U0010fffd"
],
"占位符": [
"\uFFFD,\uFFFD"
]
}

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from datamate.core.base_op import OPERATORS
OPERATORS.register_module(module_name='HtmlTagCleaner',
module_path="ops.mapper.html_tag_cleaner.process")

View File

@@ -0,0 +1,16 @@
name: 'HTML标签去除'
name_en: 'HTML Tag Removal'
description: '移除文档中HTML标签,如 <html>、<dev>、<p> 等。'
description_en: 'Removes HTML tags from documents, such as <html>, <dev>, and <p>.'
language: 'python'
vendor: 'huawei'
raw_id: 'HtmlTagCleaner'
version: '1.0.0'
types:
- 'cleanse'
modal: 'text'
effect:
before: '<p><b>机器学习</b>是<a href="/wiki/%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD" title="人工智能">人工智能</a>的一个分支。</p>'
after: '机器学习是人工智能的一个分支。'
inputs: 'text'
outputs: 'text'

Some files were not shown because too many files have changed in this diff.