[Feature] Refactor project to use 'datamate' naming convention for services and configurations (#14)

* Enhance CleaningTaskService to track cleaning process progress and update ExecutorType to DATAMATE

* Refactor project to use 'datamate' naming convention for services and configurations
This commit is contained in:
hhhhsc701
2025-10-22 17:53:16 +08:00
committed by GitHub
parent 175d9ded93
commit 31ef8bc265
39 changed files with 312 additions and 737 deletions

View File

@@ -22,6 +22,7 @@ CREATE TABLE IF NOT EXISTS t_clean_task
dest_dataset_name varchar(64),
before_size bigint,
after_size bigint,
file_count int,
created_at timestamp default current_timestamp,
started_at timestamp,
finished_at timestamp,

View File

@@ -111,113 +111,30 @@ VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0',
-- Seed rows for t_operator_category_relation, written as literal
-- (category_id, operator_id) pairs.
-- Layout of the list: text operators appear under categories 3, 8 and 11;
-- FileExporter under 7, 8 and 11; the Img* operators under 4 and 11 only
-- (this list contains no category-8 rows for them).
-- NOTE(review): IGNORE presumably relies on a unique key over
-- (category_id, operator_id) so that already-present pairs are skipped on
-- re-run -- confirm against the table definition.
-- NOTE(review): the meaning of the numeric category ids is not visible here.
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
VALUES (3, 'TextFormatter'),
(7, 'FileExporter'),
(8, 'TextFormatter'),
(8, 'FileExporter'),
(3, 'FileWithShortOrLongLengthFilter'),
(3, 'FileWithHighRepeatPhraseRateFilter'),
(3, 'FileWithHighRepeatWordRateFilter'),
(3, 'FileWithHighSpecialCharRateFilter'),
(3, 'FileWithManySensitiveWordsFilter'),
(3, 'DuplicateFilesFilter'),
(3, 'DuplicateSentencesFilter'),
(3, 'AnonymizedCreditCardNumber'),
(3, 'AnonymizedIdNumber'),
(3, 'AnonymizedIpAddress'),
(3, 'AnonymizedPhoneNumber'),
(3, 'AnonymizedUrlCleaner'),
(3, 'HtmlTagCleaner'),
(3, 'XMLTagCleaner'),
(3, 'ContentCleaner'),
(3, 'EmailNumberCleaner'),
(3, 'EmojiCleaner'),
(3, 'ExtraSpaceCleaner'),
(3, 'FullWidthCharacterCleaner'),
(3, 'GrableCharactersCleaner'),
(3, 'InvisibleCharactersCleaner'),
(3, 'LegendCleaner'),
(3, 'PoliticalWordCleaner'),
(3, 'SexualAndViolentWordCleaner'),
(3, 'TraditionalChineseCleaner'),
(3, 'UnicodeSpaceCleaner'),
(4, 'ImgFormatter'),
(4, 'ImgBlurredImagesCleaner'),
(4, 'ImgBrightness'),
(4, 'ImgContrast'),
(4, 'ImgDenoise'),
(4, 'ImgDuplicatedImagesCleaner'),
(4, 'ImgPerspectiveTransformation'),
(4, 'ImgResize'),
(4, 'ImgSaturation'),
(4, 'ImgShadowRemove'),
(4, 'ImgSharpness'),
(4, 'ImgSimilarImagesCleaner'),
(4, 'ImgTypeUnify'),
(8, 'FileWithShortOrLongLengthFilter'),
(8, 'FileWithHighRepeatPhraseRateFilter'),
(8, 'FileWithHighRepeatWordRateFilter'),
(8, 'FileWithHighSpecialCharRateFilter'),
(8, 'FileWithManySensitiveWordsFilter'),
(8, 'DuplicateFilesFilter'),
(8, 'DuplicateSentencesFilter'),
(8, 'AnonymizedCreditCardNumber'),
(8, 'AnonymizedIdNumber'),
(8, 'AnonymizedIpAddress'),
(8, 'AnonymizedPhoneNumber'),
(8, 'AnonymizedUrlCleaner'),
(8, 'HtmlTagCleaner'),
(8, 'XMLTagCleaner'),
(8, 'ContentCleaner'),
(8, 'EmailNumberCleaner'),
(8, 'EmojiCleaner'),
(8, 'ExtraSpaceCleaner'),
(8, 'FullWidthCharacterCleaner'),
(8, 'GrableCharactersCleaner'),
(8, 'InvisibleCharactersCleaner'),
(8, 'LegendCleaner'),
(8, 'PoliticalWordCleaner'),
(8, 'SexualAndViolentWordCleaner'),
(8, 'TraditionalChineseCleaner'),
(8, 'UnicodeSpaceCleaner'),
(11, 'TextFormatter'),
(11, 'FileExporter'),
(11, 'FileWithShortOrLongLengthFilter'),
(11, 'FileWithHighRepeatPhraseRateFilter'),
(11, 'FileWithHighRepeatWordRateFilter'),
(11, 'FileWithHighSpecialCharRateFilter'),
(11, 'FileWithManySensitiveWordsFilter'),
(11, 'DuplicateFilesFilter'),
(11, 'DuplicateSentencesFilter'),
(11, 'AnonymizedCreditCardNumber'),
(11, 'AnonymizedIdNumber'),
(11, 'AnonymizedIpAddress'),
(11, 'AnonymizedPhoneNumber'),
(11, 'AnonymizedUrlCleaner'),
(11, 'HtmlTagCleaner'),
(11, 'XMLTagCleaner'),
(11, 'ContentCleaner'),
(11, 'EmailNumberCleaner'),
(11, 'EmojiCleaner'),
(11, 'ExtraSpaceCleaner'),
(11, 'FullWidthCharacterCleaner'),
(11, 'GrableCharactersCleaner'),
(11, 'InvisibleCharactersCleaner'),
(11, 'LegendCleaner'),
(11, 'PoliticalWordCleaner'),
(11, 'SexualAndViolentWordCleaner'),
(11, 'TraditionalChineseCleaner'),
(11, 'UnicodeSpaceCleaner'),
(11, 'ImgFormatter'),
(11, 'ImgBlurredImagesCleaner'),
(11, 'ImgBrightness'),
(11, 'ImgContrast'),
(11, 'ImgDenoise'),
(11, 'ImgDuplicatedImagesCleaner'),
(11, 'ImgPerspectiveTransformation'),
(11, 'ImgResize'),
(11, 'ImgSaturation'),
(11, 'ImgShadowRemove'),
(11, 'ImgSharpness'),
(11, 'ImgSimilarImagesCleaner'),
(11, 'ImgTypeUnify');
-- Link each text operator named below to each of categories 3, 8 and 11.
-- A bare SELECT here would only return rows and write nothing; the
-- INSERT IGNORE header makes the statement populate
-- t_operator_category_relation, in the same form as the two statements
-- that follow it.
-- The cross join pairs every matching category row with every matching
-- operator row, so only ids present in both source tables yield pairs.
-- NOTE(review): IGNORE presumably relies on a unique key over
-- (category_id, operator_id) to skip pairs that already exist -- confirm
-- against the table definition.
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
SELECT c.id, o.id
FROM t_operator_category c
CROSS JOIN t_operator o
WHERE c.id IN (3, 8, 11)
AND o.id IN ('TextFormatter', 'FileWithShortOrLongLengthFilter', 'FileWithHighRepeatPhraseRateFilter',
'FileWithHighRepeatWordRateFilter', 'FileWithHighSpecialCharRateFilter', 'FileWithManySensitiveWordsFilter',
'DuplicateFilesFilter', 'DuplicateSentencesFilter', 'AnonymizedCreditCardNumber', 'AnonymizedIdNumber',
'AnonymizedIpAddress', 'AnonymizedPhoneNumber', 'AnonymizedUrlCleaner', 'HtmlTagCleaner', 'XMLTagCleaner',
'ContentCleaner', 'EmailNumberCleaner', 'EmojiCleaner', 'ExtraSpaceCleaner', 'FullWidthCharacterCleaner',
'GrableCharactersCleaner', 'InvisibleCharactersCleaner', 'LegendCleaner', 'PoliticalWordCleaner',
'SexualAndViolentWordCleaner', 'TraditionalChineseCleaner', 'UnicodeSpaceCleaner');
-- Link each image operator named below to each of categories 4, 8 and 11.
-- Pairs come from a cross join, so only ids present in both
-- t_operator_category and t_operator produce rows.
-- NOTE(review): IGNORE presumably relies on a unique key over
-- (category_id, operator_id) to skip existing pairs -- confirm in the DDL.
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
SELECT
    cat.id,
    op.id
FROM t_operator_category AS cat
CROSS JOIN t_operator AS op
WHERE cat.id IN (4, 8, 11)
    AND op.id IN (
        'ImgFormatter',
        'ImgBlurredImagesCleaner',
        'ImgBrightness',
        'ImgContrast',
        'ImgDenoise',
        'ImgDuplicatedImagesCleaner',
        'ImgPerspectiveTransformation',
        'ImgResize',
        'ImgSaturation',
        'ImgShadowRemove',
        'ImgSharpness',
        'ImgSimilarImagesCleaner',
        'ImgTypeUnify'
    );
-- Link the FileExporter operator to each of categories 7, 8 and 11.
-- Pairs come from a cross join, so a row is produced only for category ids
-- that exist in t_operator_category, and only if t_operator has FileExporter.
-- NOTE(review): IGNORE presumably relies on a unique key over
-- (category_id, operator_id) to skip existing pairs -- confirm in the DDL.
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
SELECT
    cat.id,
    op.id
FROM t_operator_category AS cat
CROSS JOIN t_operator AS op
WHERE op.id = 'FileExporter'
    AND cat.id IN (7, 8, 11);