You've already forked DataMate
feature: 增加水印去除/高级匿名化算子 (#151)
* feature: 增加水印去除算子
* feature: clean code
* feature: clean code
* feature: 增加高级匿名化算子
This commit is contained in:
@@ -59,43 +59,45 @@ VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', '文本清洗模板', '文本清
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', '图片清洗模板', '图片清洗模板');
|
||||
|
||||
INSERT IGNORE INTO t_operator_instance(instance_id, operator_id, op_index, settings_override)
|
||||
VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 2, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatWordRateFilter', 3, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatPhraseRateFilter', 4, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighSpecialCharRateFilter', 5, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithManySensitiveWordsFilter', 6, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'UnicodeSpaceCleaner', 7, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'ExtraSpaceCleaner', 8, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FullWidthCharacterCleaner', 9, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'InvisibleCharactersCleaner', 10, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'ContentCleaner', 11, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'LegendCleaner', 12, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmojiCleaner', 13, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'HtmlTagCleaner', 14, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'TraditionalChineseCleaner', 15, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'GrableCharactersCleaner', 16, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'XMLTagCleaner', 17, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateSentencesFilter', 18, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateFilesFilter', 19, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'SexualAndViolentWordCleaner', 20, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'PoliticalWordCleaner', 21, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedPhoneNumber', 22, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedCreditCardNumber', 23, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmailNumberCleaner', 24, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIpAddress', 25, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIdNumber', 26, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 27, null);
|
||||
VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 1, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatWordRateFilter', 2, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatPhraseRateFilter', 3, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighSpecialCharRateFilter', 4, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithManySensitiveWordsFilter', 5, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'UnicodeSpaceCleaner', 6, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'ExtraSpaceCleaner', 7, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FullWidthCharacterCleaner', 8, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'InvisibleCharactersCleaner', 9, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'ContentCleaner', 10, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'LegendCleaner', 11, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmojiCleaner', 12, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'HtmlTagCleaner', 13, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'TraditionalChineseCleaner', 14, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'GrableCharactersCleaner', 15, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'XMLTagCleaner', 16, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateSentencesFilter', 17, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateFilesFilter', 18, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'SexualAndViolentWordCleaner', 19, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'PoliticalWordCleaner', 20, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedPhoneNumber', 21, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedCreditCardNumber', 22, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmailNumberCleaner', 23, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIpAddress', 24, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIdNumber', 25, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 26, null),
|
||||
('26ae585c-8310-4679-adc0-e53215e6e69b', 'PiiDetector', 27, null);
|
||||
|
||||
INSERT IGNORE INTO t_operator_instance(instance_id, operator_id, op_index, settings_override)
|
||||
VALUES ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner', 2, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDuplicatedImagesCleaner', 3, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSimilarImagesCleaner', 4, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBrightness', 5, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgContrast', 6, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSaturation', 7, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSharpness', 8, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDenoise', 9, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgShadowRemove', 10, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgPerspectiveTransformation', 11, null),
|
||||
VALUES ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner', 1, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDuplicatedImagesCleaner', 2, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSimilarImagesCleaner', 3, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBrightness', 4, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgContrast', 5, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSaturation', 6, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSharpness', 7, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDenoise', 8, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgShadowRemove', 9, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgPerspectiveTransformation', 10, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDirectionCorrect', 11, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgResize', 12, null),
|
||||
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgTypeUnify', 13, null);
|
||||
@@ -105,8 +105,9 @@ VALUES ('MineruFormatter', 'MinerU PDF文本抽取', '基于MinerU API,抽取P
|
||||
('ImgShadowRemove', '图片阴影去除', '去除图片中的阴影,主要适用于文档场景。', '1.0.0', 'image', 'image', null, null, '', 'false'),
|
||||
('ImgSharpness', '图片锐度增强', '自适应调节图片的锐度,主要适用于自然场景图片。', '1.0.0', 'image', 'image', null, null, '', 'false'),
|
||||
('ImgSimilarImagesCleaner', '相似图片去除', '去除相似的图片。', '1.0.0', 'image', 'image', null, '{"similarThreshold": {"name": "相似度", "description": "相似度取值越大,图片相似度越高。", "type": "slider", "defaultVal": 0.8, "min": 0, "max": 1, "step": 0.01}}', '', 'false'),
|
||||
('ImgTypeUnify', '图片格式转换', '将图片编码格式统一为jpg、jpeg、png、bmp格式。', '1.0.0', 'image', 'image', null, '{"imgType": {"name": "图片编码格式", "type": "select", "defaultVal": "jpg", "options": [{"label": "jpg", "value": "jpg"}, {"label": "png", "value": "png"}, {"label": "jpeg", "value": "jpeg"}, {"label": "bmp", "value": "bmp"}]}}', '', 'false');
|
||||
|
||||
('ImgTypeUnify', '图片格式转换', '将图片编码格式统一为jpg、jpeg、png、bmp格式。', '1.0.0', 'image', 'image', null, '{"imgType": {"name": "图片编码格式", "type": "select", "defaultVal": "jpg", "options": [{"label": "jpg", "value": "jpg"}, {"label": "png", "value": "png"}, {"label": "jpeg", "value": "jpeg"}, {"label": "bmp", "value": "bmp"}]}}', '', 'false'),
|
||||
('ImgDirectionCorrect', '图片方向校正', '将含有文字的图片校正到文字水平方向,主要适用于文档场景。', '1.0.0', 'image', 'image', null, null, '', 'false'),
|
||||
('PiiDetector', '高级匿名化', '高级匿名化算子,检测命名实体并匿名化。', '1.0.0', 'text', 'text', null, null, '', 'false');
|
||||
|
||||
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
|
||||
SELECT c.id, o.id
|
||||
@@ -119,7 +120,8 @@ AND o.id IN ('FileWithShortOrLongLengthFilter', 'FileWithHighRepeatPhraseRateFil
|
||||
'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
|
||||
'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
|
||||
'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
|
||||
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'MineruFormatter');
|
||||
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner', 'MineruFormatter',
|
||||
'PiiDetector');
|
||||
|
||||
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
|
||||
SELECT c.id, o.id
|
||||
@@ -128,4 +130,4 @@ FROM t_operator_category c
|
||||
WHERE c.id IN ('de36b61c-9e8a-4422-8c31-d30585c7100f', '9eda9d5d-072b-499b-916c-797a0a8750e1', '96a3b07a-3439-4557-a835-525faad60ca3')
|
||||
AND o.id IN ('ImgBlurredImagesCleaner', 'ImgBrightness', 'ImgContrast', 'ImgDenoise',
|
||||
'ImgDuplicatedImagesCleaner', 'ImgPerspectiveTransformation', 'ImgResize', 'ImgSaturation',
|
||||
'ImgShadowRemove', 'ImgSharpness', 'ImgSimilarImagesCleaner', 'ImgTypeUnify');
|
||||
'ImgShadowRemove', 'ImgSharpness', 'ImgSimilarImagesCleaner', 'ImgTypeUnify', 'ImgDirectionCorrect');
|
||||
|
||||
@@ -3,7 +3,12 @@ FROM ghcr.io/astral-sh/uv:python3.11-bookworm
|
||||
RUN --mount=type=cache,target=/var/cache/apt \
|
||||
--mount=type=cache,target=/var/lib/apt \
|
||||
apt update \
|
||||
&& apt install -y libgl1 libglib2.0-0 vim libmagic1 libreoffice dos2unix
|
||||
&& apt install -y libgl1 libglib2.0-0 vim libmagic1 libreoffice dos2unix swig
|
||||
|
||||
RUN mkdir -p /home/models \
|
||||
&& wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/ch/ch_ppocr_mobile_v2.0_cls_infer.tar \
|
||||
&& tar -xf ch_ppocr_mobile_v2.0_cls_infer.tar -C /home/models \
|
||||
&& rm -f ch_*.tar
|
||||
|
||||
COPY runtime/python-executor /opt/runtime
|
||||
COPY runtime/ops /opt/runtime/datamate/ops
|
||||
@@ -16,7 +21,8 @@ WORKDIR /opt/runtime
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install -e . --system \
|
||||
&& uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system
|
||||
&& uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \
|
||||
&& python -m spacy download zh_core_web_sm
|
||||
|
||||
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
|
||||
&& chmod +x /opt/runtime/start.sh \
|
||||
|
||||
Reference in New Issue
Block a user