feature: 增加水印去除/高级匿名化算子 (#151)

* feature: 增加水印去除算子

* feature: clean code

* feature: clean code

* feature: 增加高级匿名化算子
This commit is contained in:
hhhhsc701
2025-12-10 18:12:47 +08:00
committed by GitHub
parent cbb146d3d7
commit 19a04df276
15 changed files with 197 additions and 274 deletions

View File

@@ -105,8 +105,9 @@ VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取P
('ImgShadowRemove', '图片阴影去除', '去除图片中的阴影,主要适用于文档场景。', '1.0.0', 'image', 'image', null, null, '', 'false'),
('ImgSharpness', '图片锐度增强', '自适应调节图片的锐度,主要适用于自然场景图片。', '1.0.0', 'image', 'image', null, null, '', 'false'),
('ImgSimilarImagesCleaner', '相似图片去除', '去除相似的图片。', '1.0.0', 'image', 'image', null, '{"similarThreshold": {"name": "相似度", "description": "相似度取值越大,图片相似度越高。", "type": "slider", "defaultVal": 0.8, "min": 0, "max": 1, "step": 0.01}}', '', 'false'),
('ImgTypeUnify', '图片格式转换', '将图片编码格式统一为jpg、jpeg、png、bmp格式。', '1.0.0', 'image', 'image', null, '{"imgType": {"name": "图片编码格式", "type": "select", "defaultVal": "jpg", "options": [{"label": "jpg", "value": "jpg"}, {"label": "png", "value": "png"}, {"label": "jpeg", "value": "jpeg"}, {"label": "bmp", "value": "bmp"}]}}', '', 'false');
('ImgTypeUnify', '图片格式转换', '将图片编码格式统一为jpg、jpeg、png、bmp格式。', '1.0.0', 'image', 'image', null, '{"imgType": {"name": "图片编码格式", "type": "select", "defaultVal": "jpg", "options": [{"label": "jpg", "value": "jpg"}, {"label": "png", "value": "png"}, {"label": "jpeg", "value": "jpeg"}, {"label": "bmp", "value": "bmp"}]}}', '', 'false'),
('ImgDirectionCorrect', '图片方向校正', '将含有文字的图片校正到文字水平方向,主要适用于文档场景。', '1.0.0', 'image', 'image', null, null, '', 'false'),
('PiiDetector', '高级匿名化', '高级匿名化算子,检测命名实体并匿名化。', '1.0.0', 'text', 'text', null, null, '', 'false');
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
SELECT c.id, o.id
@@ -119,7 +120,8 @@ AND o.id IN ('FileWithShortOrLongLengthFilter', 'FileWithHighRepeatPhraseRateFil
'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'MineruFormatter');
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'MineruFormatter',
'PiiDetector');
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
SELECT c.id, o.id
@@ -128,4 +130,4 @@ FROM t_operator_category c
WHERE c.id IN ('de36b61c-9e8a-4422-8c31-d30585c7100f', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3')
AND o.id IN ('ImgBlurredImagesCleaner', 'ImgBrightness', 'ImgContrast', 'ImgDenoise',
'ImgDuplicatedImagesCleaner', 'ImgPerspectiveTransformation', 'ImgResize', 'ImgSaturation',
'ImgShadowRemove', 'ImgSharpness', 'ImgSimilarImagesCleaner', 'ImgTypeUnify');
'ImgShadowRemove', 'ImgSharpness', 'ImgSimilarImagesCleaner', 'ImgTypeUnify', 'ImgDirectionCorrect');