You've already forked DataMate
init datamate
This commit is contained in:
1
scripts/db/00-database-init.sql
Normal file
1
scripts/db/00-database-init.sql
Normal file
@@ -0,0 +1 @@
|
||||
-- Create the application schema when it does not exist yet.
-- The character set is pinned to utf8mb4 because the init scripts seed Chinese text;
-- without it the schema inherits whatever default the server happens to be configured with.
CREATE DATABASE IF NOT EXISTS datamate
    DEFAULT CHARACTER SET utf8mb4;
|
||||
103
scripts/db/data-cleaning-init.sql
Normal file
103
scripts/db/data-cleaning-init.sql
Normal file
@@ -0,0 +1,103 @@
|
||||
USE datamate;
|
||||
|
||||
-- Cleaning templates. Seed rows in t_operator_instance reference a template through
-- instance_id = t_clean_template.id.
CREATE TABLE IF NOT EXISTS t_clean_template
(
    -- PRIMARY KEY already implies NOT NULL and uniqueness; the previous extra
    -- "not null unique" only made MySQL build a second, redundant unique index on id.
    id          VARCHAR(64) PRIMARY KEY,
    name        VARCHAR(64),
    description VARCHAR(256),
    created_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- No ON UPDATE clause: updated_at only changes when a writer sets it explicitly.
    updated_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    created_by  VARCHAR(256)
);
|
||||
|
||||
-- Cleaning tasks: source/destination dataset references, size counters and
-- lifecycle timestamps.
CREATE TABLE IF NOT EXISTS t_clean_task
(
    id                VARCHAR(64) PRIMARY KEY,
    name              VARCHAR(64),
    description       VARCHAR(256),
    status            VARCHAR(256),
    src_dataset_id    VARCHAR(64),
    src_dataset_name  VARCHAR(64),
    dest_dataset_id   VARCHAR(64),
    dest_dataset_name VARCHAR(64),
    before_size       BIGINT,   -- NOTE(review): unit not stated here, presumably bytes; confirm
    after_size        BIGINT,
    created_at        TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- Explicitly nullable: with explicit_defaults_for_timestamp = OFF a bare TIMESTAMP
    -- column is implicitly NOT NULL with a zero-date default, which strict SQL modes
    -- reject and which cannot represent "not started / not finished yet".
    started_at        TIMESTAMP NULL,
    finished_at       TIMESTAMP NULL,
    created_by        VARCHAR(256)
);
|
||||
|
||||
-- Operators attached to an instance. The seed data in this script uses template ids
-- as instance_id and numbers op_index from 1 within each instance.
CREATE TABLE IF NOT EXISTS t_operator_instance
(
    instance_id       VARCHAR(256),
    operator_id       VARCHAR(256),
    op_index          INT,
    settings_override TEXT,
    -- One row per (instance, operator, position).
    PRIMARY KEY (instance_id, operator_id, op_index)
);
|
||||
|
||||
-- Per-file cleaning results: source and destination file attributes plus a
-- status and a free-text result, keyed by (instance_id, dest_file_id).
CREATE TABLE IF NOT EXISTS t_clean_result
(
    instance_id  VARCHAR(64),
    src_file_id  VARCHAR(64),
    dest_file_id VARCHAR(64),
    src_name     VARCHAR(256),
    dest_name    VARCHAR(256),
    src_type     VARCHAR(256),
    dest_type    VARCHAR(256),
    src_size     BIGINT,
    dest_size    BIGINT,
    status       VARCHAR(256),
    result       TEXT,
    PRIMARY KEY (instance_id, dest_file_id)
);
|
||||
|
||||
-- Seed the three built-in cleaning templates (empty, text, image).
-- INSERT IGNORE skips rows whose id already exists, so re-running is harmless.
INSERT IGNORE INTO t_clean_template
    (id, name, description)
VALUES
    ('ac2f2582-a990-11f0-9768-00155d09c825', '空模板', '空模板'),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'text文本清洗模板', 'text文本清洗模板'),
    ('4421504e-c6c9-4760-b55a-509d17429597', '图片清洗模板', '图片清洗模板');
|
||||
|
||||
-- Seed the operator chains of the two non-empty built-in templates.
-- op_index numbers the operators within each template; no settings are overridden.
INSERT IGNORE INTO t_operator_instance
    (instance_id, operator_id, op_index, settings_override)
VALUES
    -- Text cleaning template (26ae585c-...)
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'TextFormatter',                       1, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter',     2, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatWordRateFilter',    3, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatPhraseRateFilter',  4, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighSpecialCharRateFilter',   5, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithManySensitiveWordsFilter',    6, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'UnicodeSpaceCleaner',                 7, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'ExtraSpaceCleaner',                   8, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FullWidthCharacterCleaner',           9, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'InvisibleCharactersCleaner',         10, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'ContentCleaner',                     11, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'LegendCleaner',                      12, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmojiCleaner',                       13, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'HtmlTagCleaner',                     14, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'TraditionalChineseCleaner',          15, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'GrableCharactersCleaner',            16, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'XMLTagCleaner',                      17, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateSentencesFilter',           18, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateFilesFilter',               19, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'SexualAndViolentWordCleaner',        20, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'PoliticalWordCleaner',               21, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedPhoneNumber',              22, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedCreditCardNumber',         23, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmailNumberCleaner',                 24, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIpAddress',                25, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIdNumber',                 26, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner',               27, NULL),
    ('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileExporter',                       28, NULL),
    -- Image cleaning template (4421504e-...)
    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgFormatter',                        1, NULL),
    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner',             2, NULL),
    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDuplicatedImagesCleaner',          3, NULL),
    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSimilarImagesCleaner',             4, NULL),
    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBrightness',                       5, NULL),
    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgContrast',                         6, NULL),
    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSaturation',                       7, NULL),
    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSharpness',                        8, NULL),
    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDenoise',                          9, NULL),
    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgShadowRemove',                    10, NULL),
    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgPerspectiveTransformation',       11, NULL),
    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgResize',                          12, NULL),
    ('4421504e-c6c9-4760-b55a-509d17429597', 'ImgTypeUnify',                       13, NULL),
    ('4421504e-c6c9-4760-b55a-509d17429597', 'FileExporter',                       14, NULL);
|
||||
160
scripts/db/data-collection-init.sql
Normal file
160
scripts/db/data-collection-init.sql
Normal file
@@ -0,0 +1,160 @@
|
||||
-- 数据归集服务数据库初始化脚本
|
||||
-- 适用于datamate数据库
|
||||
|
||||
USE datamate;
|
||||
|
||||
-- =====================================
|
||||
-- DDL语句 - 数据库表结构定义
|
||||
-- =====================================
|
||||
|
||||
-- Drop the existing data-collection tables so this script can be re-run
-- (intended for the debugging phase; this destroys any data in them).
-- t_dc_task_log is dropped as well: it is created further down with a plain
-- CREATE TABLE, so leaving it out made every re-run fail with "table already exists".
DROP TABLE IF EXISTS t_dc_task_executions;
DROP TABLE IF EXISTS t_dc_collection_tasks;
DROP TABLE IF EXISTS t_dc_task_log;
DROP TABLE IF EXISTS t_dc_datax_templates;
|
||||
|
||||
-- Task execution detail table: one row per execution of a collection task,
-- with progress counters, timing and the configuration used for the run.
CREATE TABLE t_dc_task_executions (
    id                VARCHAR(36) PRIMARY KEY COMMENT '执行记录ID(UUID)',
    task_id           VARCHAR(36) NOT NULL COMMENT '任务ID',
    task_name         VARCHAR(255) NOT NULL COMMENT '任务名称',
    status            VARCHAR(20) DEFAULT 'RUNNING' COMMENT '执行状态:RUNNING/SUCCESS/FAILED/STOPPED',
    progress          DECIMAL(5,2) DEFAULT 0.00 COMMENT '进度百分比',
    records_total     BIGINT DEFAULT 0 COMMENT '总记录数',
    records_processed BIGINT DEFAULT 0 COMMENT '已处理记录数',
    records_success   BIGINT DEFAULT 0 COMMENT '成功记录数',
    records_failed    BIGINT DEFAULT 0 COMMENT '失败记录数',
    throughput        DECIMAL(10,2) DEFAULT 0.00 COMMENT '吞吐量(条/秒)',
    data_size_bytes   BIGINT DEFAULT 0 COMMENT '数据量(字节)',
    started_at        TIMESTAMP NULL COMMENT '开始时间',
    completed_at      TIMESTAMP NULL COMMENT '完成时间',
    duration_seconds  INT DEFAULT 0 COMMENT '执行时长(秒)',
    config            JSON COMMENT '执行配置',
    error_message     TEXT COMMENT '错误信息',
    datax_job_id      TEXT COMMENT 'datax任务ID',
    result            TEXT COMMENT '执行结果',
    -- Defaults to the insert time, like created_at on the sibling tables. Previously
    -- there was no default, so rows inserted without created_at (including the seed
    -- rows in this script) were stored with a NULL creation time.
    created_at        TIMESTAMP NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    INDEX idx_task_id (task_id),
    INDEX idx_status (status),
    INDEX idx_started_at (started_at)
) COMMENT='任务执行明细表';
|
||||
|
||||
-- Data collection task table: task definition, DataX configuration, schedule
-- and audit columns.
CREATE TABLE t_dc_collection_tasks (
    id                  VARCHAR(36)  NOT NULL COMMENT '任务ID(UUID)',
    name                VARCHAR(255) NOT NULL COMMENT '任务名称',
    description         TEXT COMMENT '任务描述',
    sync_mode           VARCHAR(20) DEFAULT 'ONCE' COMMENT '同步模式:ONCE/SCHEDULED',
    config              TEXT NOT NULL COMMENT '归集配置(DataX配置),包含源端和目标端配置信息',
    schedule_expression VARCHAR(255) COMMENT 'Cron调度表达式',
    status              VARCHAR(20) DEFAULT 'DRAFT' COMMENT '任务状态:DRAFT/READY/RUNNING/SUCCESS/FAILED/STOPPED',
    retry_count         INT DEFAULT 3 COMMENT '重试次数',
    timeout_seconds     INT DEFAULT 3600 COMMENT '超时时间(秒)',
    max_records         BIGINT COMMENT '最大处理记录数',
    sort_field          VARCHAR(100) COMMENT '增量字段',
    last_execution_id   VARCHAR(36) COMMENT '最后执行ID(UUID)',
    created_at          TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    -- Refreshed automatically by MySQL on every row update.
    updated_at          TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    created_by          VARCHAR(255) COMMENT '创建者',
    updated_by          VARCHAR(255) COMMENT '更新者',
    PRIMARY KEY (id),
    KEY idx_status (status),
    KEY idx_created_at (created_at),
    KEY idx_schedule (schedule_expression)
) COMMENT='数据归集任务表';
|
||||
|
||||
-- Task execution log table: process id, log file path, error text and result
-- for each run of a task.
CREATE TABLE t_dc_task_log (
    id          VARCHAR(36)  NOT NULL COMMENT '执行记录ID(UUID)',
    task_id     VARCHAR(36)  NOT NULL COMMENT '任务ID',
    task_name   VARCHAR(255) NOT NULL COMMENT '任务名称',
    sync_mode   VARCHAR(20) DEFAULT 'FULL' COMMENT '同步模式:FULL/INCREMENTAL',
    status      VARCHAR(20) DEFAULT 'RUNNING' COMMENT '执行状态:RUNNING/SUCCESS/FAILED/STOPPED',
    start_time  TIMESTAMP NULL COMMENT '开始时间',
    end_time    TIMESTAMP NULL COMMENT '结束时间',
    duration    BIGINT COMMENT '执行时长(毫秒)',
    process_id  VARCHAR(50) COMMENT '进程ID',
    log_path    VARCHAR(500) COMMENT '日志文件路径',
    error_msg   LONGTEXT COMMENT '错误信息',
    result      LONGTEXT COMMENT '执行结果',
    retry_times INT DEFAULT 0 COMMENT '重试次数',
    create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    PRIMARY KEY (id)
) COMMENT='任务执行记录表';
|
||||
|
||||
|
||||
-- DataX template configuration table: named job templates per
-- (source type, target type) pair; is_system flags built-in templates.
CREATE TABLE t_dc_datax_templates (
    id               VARCHAR(36)  NOT NULL COMMENT '模板ID(UUID)',
    name             VARCHAR(255) NOT NULL UNIQUE COMMENT '模板名称',
    source_type      VARCHAR(50)  NOT NULL COMMENT '源数据源类型',
    target_type      VARCHAR(50)  NOT NULL COMMENT '目标数据源类型',
    template_content TEXT NOT NULL COMMENT '模板内容',
    description      TEXT COMMENT '模板描述',
    version          VARCHAR(20) DEFAULT '1.0.0' COMMENT '版本号',
    is_system        BOOLEAN DEFAULT FALSE COMMENT '是否系统模板',
    created_at       TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at       TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    created_by       VARCHAR(255) COMMENT '创建者',
    PRIMARY KEY (id),
    KEY idx_source_target (source_type, target_type),
    KEY idx_system (is_system)
) COMMENT='DataX模板配置表';
|
||||
|
||||
-- =====================================
|
||||
-- DML语句 - 数据操作
|
||||
-- =====================================
|
||||
|
||||
-- Seed the built-in DataX template (MySQL -> MySQL).
-- The job document is built with JSON_OBJECT / JSON_ARRAY and stored in the TEXT
-- column template_content. The '${...}' tokens are stored verbatim; presumably the
-- application substitutes them when a task is created (confirm against the caller).
INSERT INTO t_dc_datax_templates
    (id, name, source_type, target_type, template_content, description, is_system, created_by)
VALUES (
    'e4272e51-d431-4681-a370-1b3d0b036cd0',
    'MySQL到MySQL',
    'MYSQL',
    'MYSQL',
    JSON_OBJECT('job', JSON_OBJECT(
        'setting', JSON_OBJECT('speed', JSON_OBJECT('channel', 3)),
        'content', JSON_ARRAY(JSON_OBJECT(
            -- Reader side: jdbcUrl is an array here.
            'reader', JSON_OBJECT(
                'name', 'mysqlreader',
                'parameter', JSON_OBJECT(
                    'username', '${source.username}',
                    'password', '${source.password}',
                    'column', JSON_ARRAY('*'),
                    'splitPk', '${source.splitPk:id}',
                    'connection', JSON_ARRAY(JSON_OBJECT(
                        'jdbcUrl', JSON_ARRAY('${source.jdbcUrl}'),
                        'table', JSON_ARRAY('${source.table}')
                    ))
                )
            ),
            -- Writer side: jdbcUrl is a plain string here.
            'writer', JSON_OBJECT(
                'name', 'mysqlwriter',
                'parameter', JSON_OBJECT(
                    'writeMode', 'insert',
                    'username', '${target.username}',
                    'password', '${target.password}',
                    'column', JSON_ARRAY('*'),
                    'session', JSON_ARRAY('set session sql_mode="PIPES_AS_CONCAT"'),
                    'preSql', JSON_ARRAY('${target.preSql:}'),
                    'connection', JSON_ARRAY(JSON_OBJECT(
                        'jdbcUrl', '${target.jdbcUrl}',
                        'table', JSON_ARRAY('${target.table}')
                    ))
                )
            )
        ))
    )),
    'MySQL到MySQL数据同步模板',
    TRUE,
    'system'
);
|
||||
|
||||
-- Insert mock task-execution rows (two successful runs).
-- Start/end times are computed relative to NOW() at load time.
INSERT INTO t_dc_task_executions
    (id, task_id, task_name, status, progress,
     records_total, records_processed, records_success, records_failed,
     throughput, data_size_bytes, started_at, completed_at, duration_seconds, config)
VALUES
    -- Run that started one day ago and lasted two minutes.
    ('12128059-a266-4d4f-b647-3cb8c24b8aad', '54cefc4d-3071-43d9-9fbf-baeb87932acd', '用户数据同步', 'SUCCESS', 100.00,
     15000, 15000, 15000, 0,
     125.50, 2048576,
     DATE_SUB(NOW(), INTERVAL 1 DAY),
     DATE_SUB(NOW(), INTERVAL 1 DAY) + INTERVAL 2 MINUTE,
     120,
     JSON_OBJECT('batchSize', 1000, 'parallelism', 3)),
    -- Run that started twelve hours ago and lasted ninety seconds.
    ('9d418e0c-fa54-4f01-8633-3a5ad57f46a1', '3039a5c8-c894-42ab-ad49-5c2c5eccda31', '订单增量同步', 'SUCCESS', 100.00,
     8500, 8500, 8500, 0,
     94.44, 1536000,
     DATE_SUB(NOW(), INTERVAL 12 HOUR),
     DATE_SUB(NOW(), INTERVAL 12 HOUR) + INTERVAL 90 SECOND,
     90,
     JSON_OBJECT('batchSize', 2000, 'parallelism', 2));
|
||||
|
||||
15
scripts/db/data-common-init.sql
Normal file
15
scripts/db/data-common-init.sql
Normal file
@@ -0,0 +1,15 @@
|
||||
-- 数据归集服务数据库初始化脚本
|
||||
-- 适用于datamate数据库
|
||||
|
||||
USE datamate;
|
||||
|
||||
-- Chunked file-upload request table: tracks how many files of a request have been
-- uploaded, where they are stored, and which service the request belongs to.
CREATE TABLE IF NOT EXISTS `t_chunk_upload_request` (
    `id`                VARCHAR(36) NOT NULL COMMENT 'UUID',
    `total_file_num`    INT COMMENT '总文件数',
    `uploaded_file_num` INT COMMENT '已上传文件数',
    `upload_path`       VARCHAR(256) COMMENT '文件路径',
    -- Defaults to the insert time when the writer does not supply a deadline.
    `timeout`           TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '上传请求超时时间',
    `service_id`        VARCHAR(64) COMMENT '上传请求所属服务:DATA-MANAGEMENT(数据管理);',
    `check_info`        TEXT COMMENT '业务信息',
    PRIMARY KEY (`id`)
) COMMENT = '文件切片上传请求表';
|
||||
156
scripts/db/data-management-init.sql
Normal file
156
scripts/db/data-management-init.sql
Normal file
@@ -0,0 +1,156 @@
|
||||
-- DataMate Platform 数据库初始化脚本
|
||||
-- 适用于现有datamate数据库环境
|
||||
|
||||
-- 使用现有的datamate数据库
|
||||
USE datamate;
|
||||
|
||||
-- 删除已存在的表(如果需要重新创建)
|
||||
-- 原有表名保留,但本脚本新建以 t_dm_ 为前缀的新表,并使用 UUID 主键
|
||||
-- 可按需手工迁移旧数据到新表
|
||||
|
||||
-- ===========================================
|
||||
-- 数据管理(Data Management)模块表(UUID 主键,t_dm_ 前缀)
|
||||
-- ===========================================
|
||||
|
||||
-- Dataset table (medical imaging, text, Q&A and other dataset types).
-- Holds descriptive attributes, aggregate counters, JSON metadata and audit columns.
CREATE TABLE IF NOT EXISTS t_dm_datasets (
    id             VARCHAR(36)  NOT NULL COMMENT 'UUID',
    name           VARCHAR(255) NOT NULL COMMENT '数据集名称',
    description    TEXT COMMENT '数据集描述',
    dataset_type   VARCHAR(50)  NOT NULL COMMENT '数据集类型:IMAGE/TEXT/QA/MULTIMODAL/OTHER',
    category       VARCHAR(100) COMMENT '数据集分类:医学影像/问答/文献等',
    path           VARCHAR(500) COMMENT '数据存储路径',
    format         VARCHAR(50) COMMENT '数据格式:DCM/JPG/JSON/CSV等',
    schema_info    JSON COMMENT '数据结构信息',
    size_bytes     BIGINT DEFAULT 0 COMMENT '数据大小(字节)',
    file_count     BIGINT DEFAULT 0 COMMENT '文件数量',
    record_count   BIGINT DEFAULT 0 COMMENT '记录数量',
    retention_days INTEGER DEFAULT 0 COMMENT '数据保留天数(0表示长期保留)',
    tags           JSON COMMENT '标签列表',
    metadata       JSON COMMENT '元数据信息',
    status         VARCHAR(50) DEFAULT 'DRAFT' COMMENT '状态:DRAFT/ACTIVE/ARCHIVED',
    is_public      BOOLEAN DEFAULT FALSE COMMENT '是否公开',
    is_featured    BOOLEAN DEFAULT FALSE COMMENT '是否推荐',
    version        BIGINT NOT NULL DEFAULT 0 COMMENT '版本号',
    created_at     TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at     TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    created_by     VARCHAR(255) COMMENT '创建者',
    updated_by     VARCHAR(255) COMMENT '更新者',
    PRIMARY KEY (id),
    KEY idx_dm_dataset_type (dataset_type),
    KEY idx_dm_category (category),
    KEY idx_dm_format (format),
    KEY idx_dm_status (status),
    KEY idx_dm_public (is_public),
    KEY idx_dm_featured (is_featured),
    KEY idx_dm_created_at (created_at)
) COMMENT='数据集表(UUID 主键)';
|
||||
|
||||
-- Dataset file table: one row per file belonging to a dataset.
-- Rows are removed automatically when the owning dataset is deleted (ON DELETE CASCADE).
CREATE TABLE IF NOT EXISTS t_dm_dataset_files (
    id               VARCHAR(36)   NOT NULL COMMENT 'UUID',
    dataset_id       VARCHAR(36)   NOT NULL COMMENT '所属数据集ID(UUID)',
    file_name        VARCHAR(255)  NOT NULL COMMENT '文件名',
    file_path        VARCHAR(1000) NOT NULL COMMENT '文件路径',
    file_type        VARCHAR(50) COMMENT '文件格式:JPG/PNG/DCM/TXT等',
    file_size        BIGINT DEFAULT 0 COMMENT '文件大小(字节)',
    check_sum        VARCHAR(64) COMMENT '文件校验和',
    tags             JSON COMMENT '文件标签信息',
    metadata         JSON COMMENT '文件元数据',
    status           VARCHAR(50) DEFAULT 'ACTIVE' COMMENT '文件状态:ACTIVE/DELETED/PROCESSING',
    upload_time      TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '上传时间',
    last_access_time TIMESTAMP NULL COMMENT '最后访问时间',
    created_at       TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at       TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    PRIMARY KEY (id),
    FOREIGN KEY (dataset_id) REFERENCES t_dm_datasets(id) ON DELETE CASCADE,
    KEY idx_dm_dataset (dataset_id),
    KEY idx_dm_file_type (file_type),
    KEY idx_dm_file_status (status),
    KEY idx_dm_upload_time (upload_time)
) COMMENT='数据集文件表(UUID 主键)';
|
||||
|
||||
-- Dataset statistics table: at most one row per dataset per day
-- (enforced by uk_dm_dataset_date); rows cascade-delete with the dataset.
CREATE TABLE IF NOT EXISTS t_dm_dataset_statistics (
    id              VARCHAR(36) NOT NULL COMMENT 'UUID',
    dataset_id      VARCHAR(36) NOT NULL COMMENT '数据集ID(UUID)',
    stat_date       DATE NOT NULL COMMENT '统计日期',
    total_files     BIGINT DEFAULT 0 COMMENT '总文件数',
    total_size      BIGINT DEFAULT 0 COMMENT '总大小(字节)',
    processed_files BIGINT DEFAULT 0 COMMENT '已处理文件数',
    error_files     BIGINT DEFAULT 0 COMMENT '错误文件数',
    download_count  BIGINT DEFAULT 0 COMMENT '下载次数',
    view_count      BIGINT DEFAULT 0 COMMENT '查看次数',
    quality_metrics JSON COMMENT '质量指标',
    created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    PRIMARY KEY (id),
    FOREIGN KEY (dataset_id) REFERENCES t_dm_datasets(id) ON DELETE CASCADE,
    UNIQUE KEY uk_dm_dataset_date (dataset_id, stat_date),
    KEY idx_dm_stat_date (stat_date)
) COMMENT='数据集统计信息表(UUID 主键)';
|
||||
|
||||
-- Tag table: uniquely named tags with an optional category, colour and usage counter.
CREATE TABLE IF NOT EXISTS t_dm_tags (
    id          VARCHAR(36)  NOT NULL COMMENT 'UUID',
    name        VARCHAR(100) NOT NULL UNIQUE COMMENT '标签名称',
    description TEXT COMMENT '标签描述',
    category    VARCHAR(50) COMMENT '标签分类',
    color       VARCHAR(7) COMMENT '标签颜色(十六进制)',
    usage_count BIGINT DEFAULT 0 COMMENT '使用次数',
    created_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    PRIMARY KEY (id),
    KEY idx_dm_tag_category (category),
    KEY idx_dm_tag_usage_count (usage_count)
) COMMENT='标签表(UUID 主键)';
|
||||
|
||||
-- Dataset <-> tag junction table. A link row disappears automatically when either
-- the dataset or the tag it points to is deleted.
CREATE TABLE IF NOT EXISTS t_dm_dataset_tags (
    dataset_id VARCHAR(36) NOT NULL COMMENT '数据集ID(UUID)',
    tag_id     VARCHAR(36) NOT NULL COMMENT '标签ID(UUID)',
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    PRIMARY KEY (dataset_id, tag_id),
    FOREIGN KEY (dataset_id) REFERENCES t_dm_datasets(id) ON DELETE CASCADE,
    FOREIGN KEY (tag_id) REFERENCES t_dm_tags(id) ON DELETE CASCADE
) COMMENT='数据集标签关联表(UUID 外键)';
|
||||
|
||||
-- ===========================================
|
||||
-- 非数据管理表(如 users、t_data_sources)保持不变
|
||||
-- ===========================================
|
||||
|
||||
-- User table (created only when it does not exist yet).
CREATE TABLE IF NOT EXISTS users (
    id            BIGINT PRIMARY KEY AUTO_INCREMENT,
    -- The UNIQUE attribute already creates an index on username and on email, so the
    -- former extra idx_username / idx_email were exact duplicates and have been removed.
    username      VARCHAR(255) NOT NULL UNIQUE COMMENT '用户名',
    email         VARCHAR(255) NOT NULL UNIQUE COMMENT '邮箱',
    password_hash VARCHAR(255) NOT NULL COMMENT '密码哈希',
    full_name     VARCHAR(255) COMMENT '真实姓名',
    avatar_url    VARCHAR(500) COMMENT '头像URL',
    role          VARCHAR(50) NOT NULL DEFAULT 'USER' COMMENT '角色:ADMIN/USER',
    organization  VARCHAR(255) COMMENT '所属机构',
    enabled       BOOLEAN NOT NULL DEFAULT TRUE COMMENT '是否启用',
    last_login_at TIMESTAMP NULL COMMENT '最后登录时间',
    created_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    INDEX idx_role (role),
    INDEX idx_enabled (enabled)
) COMMENT='用户表';
|
||||
|
||||
-- 插入初始数据
|
||||
|
||||
-- Insert the default users (an administrator and a regular user).
-- INSERT IGNORE skips a row when its username or email already exists.
-- NOTE(review): both accounts are seeded with the same hard-coded password hash that is
-- committed with this script; change these credentials before exposing a deployment.
INSERT IGNORE INTO users
    (username, email, password_hash, full_name, role, organization)
VALUES
    ('admin',
     'admin@datamate.com',
     '$2a$10$N.zmdr9k7uOCQb376NoUnuTJ8iAt6Z5EHsM8lE9lBOsl7q7U3.XUO',
     '系统管理员',
     'ADMIN',
     'DataMate'),
    ('knowledge_user',
     'knowledge@datamate.com',
     '$2a$10$N.zmdr9k7uOCQb376NoUnuTJ8iAt6Z5EHsM8lE9lBOsl7q7U3.XUO',
     '知识库用户',
     'USER',
     '三甲医院');
|
||||
|
||||
-- View: one-row summary over all datasets in t_dm_datasets.
-- Every SUM is wrapped in COALESCE because SUM over zero rows (an empty table) or over
-- only-NULL inputs returns NULL; the view now reports 0 in those cases, consistent with
-- the COUNT columns.
CREATE OR REPLACE VIEW v_dm_dataset_summary AS
SELECT
    COUNT(*)                                                          AS total_datasets,
    COALESCE(SUM(CASE WHEN status = 'ACTIVE' THEN 1 ELSE 0 END), 0)   AS active_datasets,
    COALESCE(SUM(CASE WHEN is_public = TRUE THEN 1 ELSE 0 END), 0)    AS public_datasets,
    COALESCE(SUM(CASE WHEN is_featured = TRUE THEN 1 ELSE 0 END), 0)  AS featured_datasets,
    COALESCE(SUM(file_count), 0)                                      AS total_files,
    COALESCE(SUM(record_count), 0)                                    AS total_records,
    COUNT(DISTINCT dataset_type)                                      AS dataset_types,
    COUNT(DISTINCT category)                                          AS categories
FROM t_dm_datasets;
|
||||
223
scripts/db/data-operator-init.sql
Normal file
223
scripts/db/data-operator-init.sql
Normal file
@@ -0,0 +1,223 @@
|
||||
USE datamate;
|
||||
|
||||
-- Operator catalogue. The seed data in this script uses the operator's
-- code name (e.g. 'TextFormatter') as id.
CREATE TABLE IF NOT EXISTS t_operator
(
    id          VARCHAR(64) PRIMARY KEY,
    name        VARCHAR(64),
    description VARCHAR(256),
    version     VARCHAR(256),
    inputs      VARCHAR(256),
    outputs     VARCHAR(256),
    runtime     TEXT,
    settings    TEXT,   -- seed rows store a JSON document describing the operator's parameters
    file_name   TEXT,
    is_star     BOOL,
    created_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- No ON UPDATE clause: updated_at only changes when a writer sets it explicitly.
    updated_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP
);
|
||||
|
||||
-- Operator category tree. parent_id points at another category's id;
-- the seed data uses parent_id = 0 for top-level categories.
CREATE TABLE IF NOT EXISTS t_operator_category
(
    id        INT PRIMARY KEY AUTO_INCREMENT,
    name      VARCHAR(64),
    type      VARCHAR(64),
    parent_id INT
);
|
||||
|
||||
-- Many-to-many link between operator categories and operators.
CREATE TABLE IF NOT EXISTS t_operator_category_relation
(
    category_id INT,
    operator_id VARCHAR(64),
    PRIMARY KEY (category_id, operator_id)
);
|
||||
|
||||
-- View: one row per (category, operator) link, with the operator's attributes and the
-- category's id and name. It is driven from the relation table with LEFT JOINs, so a
-- link whose operator or category row is missing still appears, with NULLs on that side.
CREATE OR REPLACE VIEW v_operator AS
SELECT
    o.id          AS operator_id,
    o.name        AS operator_name,
    o.description,
    o.version,
    o.inputs,
    o.outputs,
    o.runtime,
    o.settings,
    o.is_star,
    o.created_at,
    o.updated_at,
    toc.id        AS category_id,
    toc.name      AS category_name
FROM t_operator_category_relation AS tocr
LEFT JOIN t_operator AS o
    ON tocr.operator_id = o.id
LEFT JOIN t_operator_category AS toc
    ON tocr.category_id = toc.id;
|
||||
|
||||
-- Seed the predefined category tree. Ids are fixed because the relation seed data
-- refers to them; parent_id = 0 marks a top-level category.
INSERT IGNORE INTO t_operator_category
    (id, name, type, parent_id)
VALUES
    -- Top-level category 1 and its children
    (1,  '模态',     'predefined', 0),
    -- Top-level category 2 (children are ids 8 and 9)
    (2,  '语言',     'predefined', 0),
    (3,  '文本',     'predefined', 1),
    (4,  '图片',     'predefined', 1),
    (5,  '音频',     'predefined', 1),
    (6,  '视频',     'predefined', 1),
    (7,  '多模态',   'predefined', 1),
    (8,  'Python',   'predefined', 2),
    (9,  'Java',     'predefined', 2),
    -- Top-level category 10 and its children
    (10, '来源',     'predefined', 0),
    (11, '系统预置', 'predefined', 10),
    (12, '用户上传', 'predefined', 10),
    -- Top-level category 13 and its child
    (13, '收藏状态', 'predefined', 0),
    (14, '已收藏',   'predefined', 13);
|
||||
|
||||
-- Seed the built-in operators.
-- Column order: id, name, description, version, inputs, outputs, runtime, settings,
-- file_name, is_star.
-- Two fixes compared with the previous version of this statement:
--   * is_star is a BOOL column, so it now receives the boolean literal FALSE on every row.
--     Most rows used the string 'false', which is not a valid integer value and was only
--     accepted because INSERT IGNORE downgrades the conversion error to a warning.
--   * ImgBlurredImagesCleaner, ImgResize, ImgSimilarImagesCleaner and ImgTypeUnify had their
--     parameter JSON in the runtime column and NULL in settings; the JSON is now in
--     settings, matching the text operators.
INSERT IGNORE INTO t_operator
    (id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
VALUES
    -- General / text operators
    ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', NULL, NULL, '', FALSE),
    ('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', NULL, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', FALSE),
    ('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', NULL, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', FALSE),
    ('FileWithHighSpecialCharRateFilter', '文档特殊字符率检查', '去除特殊字符过多的文档。', '1.0.0', 'text', 'text', NULL, '{"specialCharRatio": {"name": "文档特殊字符率", "description": "特殊字符的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.3, "min": 0, "max": 1, "step": 0.1}}', '', FALSE),
    ('DuplicateFilesFilter', '相似文档去除', '相似文档去除。', '1.0.0', 'text', 'text', NULL, '{"fileDuplicateThreshold": {"name": "文档相似度", "description": "基于MinHash算法和Jaccard相似度,计算当前文档与数据集中其它文档相似性,超过设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', FALSE),
    ('FileWithManySensitiveWordsFilter', '文档敏感词率检查', '去除敏感词过多的文档。', '1.0.0', 'text', 'text', NULL, '{"sensitiveWordsRate": {"name": "文档敏感词率", "description": "敏感词的字数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.01, "min": 0, "max": 1, "step": 0.01}}', '', FALSE),
    ('FileWithShortOrLongLengthFilter', '文档字数检查', '字数不在指定范围会被过滤掉。', '1.0.0', 'text', 'text', NULL, '{"fileLength": {"name": "文档字数", "description": "过滤字数不在指定范围内的文档,如[10,10000000]。若输入为空,则不对字数上/下限做限制。", "type": "range", "defaultVal": [10, 10000000], "min": 0, "max": 10000000000000000, "step": 1}}', '', FALSE),
    ('ContentCleaner', '文档目录去除', '去除文档中的目录。', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('AnonymizedCreditCardNumber', '信用卡号匿名化', '信用卡号匿名化', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('EmailNumberCleaner', '邮件地址匿名化', '邮件地址匿名化', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('EmojiCleaner', '文档表情去除', '去除文档中表情字符或者emoji符号。', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('ExtraSpaceCleaner', '多余空格去除', '移除文档首尾、句中或标点符号附近多余空格和 tab 等。', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('FullWidthCharacterCleaner', '全角转半角', '将文档中的所有全角字符转换成半角字符。', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('GrableCharactersCleaner', '文档乱码去除', '去除文档中的乱码和无意义的unicode。', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('HtmlTagCleaner', 'HTML标签去除', '移除文档中HTML标签,如 <html>、<dev>、<p> 等。', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('AnonymizedIdNumber', '身份证号匿名化', '身份证号匿名化。', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('InvisibleCharactersCleaner', '不可见字符去除', '去除文档中的不可见字符,例如 0-31 号字符中的部分字符。', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('AnonymizedIpAddress', 'IP地址匿名化', 'IP地址匿名化', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('LegendCleaner', '图注表注去除', '去除文档中的图注、表注等内容。', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('AnonymizedPhoneNumber', '电话号码匿名化', '电话号码匿名化', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('PoliticalWordCleaner', '政治文本匿名化', '将政治文本进行匿名化。', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('DuplicateSentencesFilter', '文档局部内容去重', '文档局部内容去重。', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('SexualAndViolentWordCleaner', '暴力色情文本匿名化', '将暴力、色情文本进行匿名化。', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('TraditionalChineseCleaner', '繁体转简体', '将繁体转换为简体。', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('UnicodeSpaceCleaner', '空格标准化', '将文档中不同的 unicode 空格,如 u2008,转换为正常空格\\u0020。', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('AnonymizedUrlCleaner', 'URL网址匿名化', '将文档中的url网址匿名化。', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    ('XMLTagCleaner', 'XML标签去除', '去除XML中的标签。', '1.0.0', 'text', 'text', NULL, NULL, '', FALSE),
    -- Image operators
    ('ImgFormatter', '读取图片文件', '读取图片文件。', '1.0.0', 'image', 'image', NULL, NULL, '', FALSE),
    ('ImgBlurredImagesCleaner', '模糊图片过滤', '去除模糊的图片。', '1.0.0', 'image', 'image', NULL, '{"blurredThreshold": {"name": "梯度函数值", "name_en": "Gradient Value", "description": "梯度函数值取值越小,图片模糊度越高。", "description_en": "A smaller gradient value indicates a higher image blur.", "type": "slider", "defaultVal": 1000, "min": 1, "max": 10000, "step": 1}}', '', FALSE),
    ('ImgBrightness', '图片亮度增强', '自适应调节图片的亮度。', '1.0.0', 'image', 'image', NULL, NULL, '', FALSE),
    ('ImgContrast', '图片对比度增强', '自适应调节图片的对比度。', '1.0.0', 'image', 'image', NULL, NULL, '', FALSE),
    ('ImgDenoise', '图片噪点去除', '去除图片中的噪点,主要适用于自然场景。', '1.0.0', 'image', 'image', NULL, NULL, '', FALSE),
    ('ImgDuplicatedImagesCleaner', '重复图片去除', '去除重复的图片。', '1.0.0', 'image', 'image', NULL, NULL, '', FALSE),
    ('ImgPerspectiveTransformation', '图片透视变换', '自适应校正图片的视角,主要适用于文档校正场景。', '1.0.0', 'image', 'image', NULL, NULL, '', FALSE),
    ('ImgResize', '图片重采样', '将图片放大或缩小到指定像素。', '1.0.0', 'image', 'image', NULL, '{"targetSize": {"name": "重采样尺寸", "name_en": "Resample Size", "type": "multiple", "properties": [{"type": "inputNumber", "name": "宽度", "name_en": "Width", "description": "像素", "description_en": "Pixel", "defaultVal": 256, "min": 1, "max": 4096, "step": 1}, {"type": "inputNumber", "name": "高度", "name_en": "Height", "description": "像素", "description_en": "Pixel", "defaultVal": 256, "min": 1, "max": 4096, "step": 1}]}}', '', FALSE),
    ('ImgSaturation', '图片饱和度增强', '自适应调节图片的饱和度,主要适用于自然场景图片。', '1.0.0', 'image', 'image', NULL, NULL, '', FALSE),
    ('ImgShadowRemove', '图片阴影去除', '去除图片中的阴影,主要适用于文档场景。', '1.0.0', 'image', 'image', NULL, NULL, '', FALSE),
    ('ImgSharpness', '图片锐度增强', '自适应调节图片的锐度,主要适用于自然场景图片。', '1.0.0', 'image', 'image', NULL, NULL, '', FALSE),
    ('ImgSimilarImagesCleaner', '相似图片去除', '去除相似的图片。', '1.0.0', 'image', 'image', NULL, '{"similarThreshold": {"name": "相似度", "name_en": "Similarity", "description": "相似度取值越大,图片相似度越高。", "description_en": "A larger similarity value indicates a higher image similarity.", "type": "slider", "defaultVal": 0.8, "min": 0, "max": 1, "step": 0.01}}', '', FALSE),
    ('ImgTypeUnify', '图片格式转换', '将图片编码格式统一为jpg、jpeg、png、bmp格式。', '1.0.0', 'image', 'image', NULL, '{"imgType": {"name": "图片编码格式", "name_en": "Image Encoding Format", "type": "select", "defaultVal": "jpg", "options": [{"label": "jpg", "label_en": "jpg", "value": "jpg"}, {"label": "png", "label_en": "png", "value": "png"}, {"label": "jpeg", "label_en": "jpeg", "value": "jpeg"}, {"label": "bmp", "label_en": "bmp", "value": "bmp"}]}}', '', FALSE);
|
||||
|
||||
|
||||
-- Seed the operator <-> category links. Category ids refer to the fixed ids seeded
-- into t_operator_category (3, 4, 7, 8 and 11 are used here).
-- INSERT IGNORE skips pairs that already exist, so re-running is harmless.
INSERT IGNORE INTO t_operator_category_relation
    (category_id, operator_id)
VALUES
    -- Entry/exit operators
    (3, 'TextFormatter'), (7, 'FileExporter'),
    (8, 'TextFormatter'), (8, 'FileExporter'),
    -- Category 3: text operators
    (3, 'FileWithShortOrLongLengthFilter'), (3, 'FileWithHighRepeatPhraseRateFilter'),
    (3, 'FileWithHighRepeatWordRateFilter'), (3, 'FileWithHighSpecialCharRateFilter'),
    (3, 'FileWithManySensitiveWordsFilter'), (3, 'DuplicateFilesFilter'),
    (3, 'DuplicateSentencesFilter'), (3, 'AnonymizedCreditCardNumber'),
    (3, 'AnonymizedIdNumber'), (3, 'AnonymizedIpAddress'),
    (3, 'AnonymizedPhoneNumber'), (3, 'AnonymizedUrlCleaner'),
    (3, 'HtmlTagCleaner'), (3, 'XMLTagCleaner'),
    (3, 'ContentCleaner'), (3, 'EmailNumberCleaner'),
    (3, 'EmojiCleaner'), (3, 'ExtraSpaceCleaner'),
    (3, 'FullWidthCharacterCleaner'), (3, 'GrableCharactersCleaner'),
    (3, 'InvisibleCharactersCleaner'), (3, 'LegendCleaner'),
    (3, 'PoliticalWordCleaner'), (3, 'SexualAndViolentWordCleaner'),
    (3, 'TraditionalChineseCleaner'), (3, 'UnicodeSpaceCleaner'),
    -- Category 4: image operators
    (4, 'ImgFormatter'), (4, 'ImgBlurredImagesCleaner'),
    (4, 'ImgBrightness'), (4, 'ImgContrast'),
    (4, 'ImgDenoise'), (4, 'ImgDuplicatedImagesCleaner'),
    (4, 'ImgPerspectiveTransformation'), (4, 'ImgResize'),
    (4, 'ImgSaturation'), (4, 'ImgShadowRemove'),
    (4, 'ImgSharpness'), (4, 'ImgSimilarImagesCleaner'),
    (4, 'ImgTypeUnify'),
    -- Category 8: text operators
    (8, 'FileWithShortOrLongLengthFilter'), (8, 'FileWithHighRepeatPhraseRateFilter'),
    (8, 'FileWithHighRepeatWordRateFilter'), (8, 'FileWithHighSpecialCharRateFilter'),
    (8, 'FileWithManySensitiveWordsFilter'), (8, 'DuplicateFilesFilter'),
    (8, 'DuplicateSentencesFilter'), (8, 'AnonymizedCreditCardNumber'),
    (8, 'AnonymizedIdNumber'), (8, 'AnonymizedIpAddress'),
    (8, 'AnonymizedPhoneNumber'), (8, 'AnonymizedUrlCleaner'),
    (8, 'HtmlTagCleaner'), (8, 'XMLTagCleaner'),
    (8, 'ContentCleaner'), (8, 'EmailNumberCleaner'),
    (8, 'EmojiCleaner'), (8, 'ExtraSpaceCleaner'),
    (8, 'FullWidthCharacterCleaner'), (8, 'GrableCharactersCleaner'),
    (8, 'InvisibleCharactersCleaner'), (8, 'LegendCleaner'),
    (8, 'PoliticalWordCleaner'), (8, 'SexualAndViolentWordCleaner'),
    (8, 'TraditionalChineseCleaner'), (8, 'UnicodeSpaceCleaner'),
    -- Category 11: every built-in operator
    (11, 'TextFormatter'), (11, 'FileExporter'),
    (11, 'FileWithShortOrLongLengthFilter'), (11, 'FileWithHighRepeatPhraseRateFilter'),
    (11, 'FileWithHighRepeatWordRateFilter'), (11, 'FileWithHighSpecialCharRateFilter'),
    (11, 'FileWithManySensitiveWordsFilter'), (11, 'DuplicateFilesFilter'),
    (11, 'DuplicateSentencesFilter'), (11, 'AnonymizedCreditCardNumber'),
    (11, 'AnonymizedIdNumber'), (11, 'AnonymizedIpAddress'),
    (11, 'AnonymizedPhoneNumber'), (11, 'AnonymizedUrlCleaner'),
    (11, 'HtmlTagCleaner'), (11, 'XMLTagCleaner'),
    (11, 'ContentCleaner'), (11, 'EmailNumberCleaner'),
    (11, 'EmojiCleaner'), (11, 'ExtraSpaceCleaner'),
    (11, 'FullWidthCharacterCleaner'), (11, 'GrableCharactersCleaner'),
    (11, 'InvisibleCharactersCleaner'), (11, 'LegendCleaner'),
    (11, 'PoliticalWordCleaner'), (11, 'SexualAndViolentWordCleaner'),
    (11, 'TraditionalChineseCleaner'), (11, 'UnicodeSpaceCleaner'),
    (11, 'ImgFormatter'), (11, 'ImgBlurredImagesCleaner'),
    (11, 'ImgBrightness'), (11, 'ImgContrast'),
    (11, 'ImgDenoise'), (11, 'ImgDuplicatedImagesCleaner'),
    (11, 'ImgPerspectiveTransformation'), (11, 'ImgResize'),
    (11, 'ImgSaturation'), (11, 'ImgShadowRemove'),
    (11, 'ImgSharpness'), (11, 'ImgSimilarImagesCleaner'),
    (11, 'ImgTypeUnify');
|
||||
Reference in New Issue
Block a user