refactor: modify data collection to python implementation (#214)

* feature: LabelStudio jumps without login

* refactor: modify data collection to python implementation

* refactor: modify data collection to python implementation

* refactor: modify data collection to python implementation

* refactor: modify data collection to python implementation

* refactor: modify data collection to python implementation

* refactor: modify data collection to python implementation

* fix: remove terrabase dependency

* feature: add the collection task executions page and the collection template page

* fix: fix the collection task creation

* fix: fix the collection task creation
This commit is contained in:
hefanli
2025-12-30 18:48:43 +08:00
committed by GitHub
parent 80d4dfd285
commit 63f4e3e447
71 changed files with 1861 additions and 2557 deletions

View File

@@ -10,35 +10,7 @@ USE datamate;
-- 删除现有表(支持重复执行 调测阶段使用)
DROP TABLE IF EXISTS t_dc_task_executions;
DROP TABLE IF EXISTS t_dc_collection_tasks;
DROP TABLE IF EXISTS t_dc_datax_templates;
CREATE TABLE t_dc_task_executions (
id VARCHAR(36) PRIMARY KEY COMMENT '执行记录ID(UUID)',
task_id VARCHAR(36) NOT NULL COMMENT '任务ID',
task_name VARCHAR(255) NOT NULL COMMENT '任务名称',
status VARCHAR(20) DEFAULT 'RUNNING' COMMENT '执行状态:RUNNING/SUCCESS/FAILED/STOPPED',
progress DECIMAL(5,2) DEFAULT 0.00 COMMENT '进度百分比',
records_total BIGINT DEFAULT 0 COMMENT '总记录数',
records_processed BIGINT DEFAULT 0 COMMENT '已处理记录数',
records_success BIGINT DEFAULT 0 COMMENT '成功记录数',
records_failed BIGINT DEFAULT 0 COMMENT '失败记录数',
throughput DECIMAL(10,2) DEFAULT 0.00 COMMENT '吞吐量(条/秒)',
data_size_bytes BIGINT DEFAULT 0 COMMENT '数据量(字节)',
started_at TIMESTAMP NULL COMMENT '开始时间',
completed_at TIMESTAMP NULL COMMENT '完成时间',
duration_seconds INT DEFAULT 0 COMMENT '执行时长(秒)',
config JSON COMMENT '执行配置',
error_message TEXT COMMENT '错误信息',
datax_job_id TEXT COMMENT 'datax任务ID',
result TEXT COMMENT '执行结果',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
created_by VARCHAR(255) COMMENT '创建者',
updated_by VARCHAR(255) COMMENT '更新者',
INDEX idx_task_id (task_id),
INDEX idx_status (status),
INDEX idx_started_at (started_at)
) COMMENT='任务执行明细表';
DROP TABLE IF EXISTS t_dc_collection_templates;
-- 数据归集任务表
CREATE TABLE t_dc_collection_tasks (
@@ -46,120 +18,60 @@ CREATE TABLE t_dc_collection_tasks (
name VARCHAR(255) NOT NULL COMMENT '任务名称',
description TEXT COMMENT '任务描述',
sync_mode VARCHAR(20) DEFAULT 'ONCE' COMMENT '同步模式:ONCE/SCHEDULED',
task_type VARCHAR(20) DEFAULT 'NAS' COMMENT '任务类型:NAS/OBS/MYSQL/CUSTOM',
template_id VARCHAR(36) NOT NULL COMMENT '归集模板ID',
template_name VARCHAR(255) NOT NULL COMMENT '归集模板名称',
target_path VARCHAR(1000) DEFAULT '' COMMENT '目标存储路径',
config TEXT NOT NULL COMMENT '归集配置(DataX配置),包含源端和目标端配置信息',
config JSON NOT NULL COMMENT '归集配置(DataX配置),包含源端和目标端配置信息',
schedule_expression VARCHAR(255) COMMENT 'Cron调度表达式',
status VARCHAR(20) DEFAULT 'DRAFT' COMMENT '任务状态:DRAFT/READY/RUNNING/SUCCESS/FAILED/STOPPED',
retry_count INT DEFAULT 3 COMMENT '重试次数',
timeout_seconds INT DEFAULT 3600 COMMENT '超时时间(秒)',
max_records BIGINT COMMENT '最大处理记录数',
sort_field VARCHAR(100) COMMENT '增量字段',
last_execution_id VARCHAR(36) COMMENT '最后执行ID(UUID)',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
created_by VARCHAR(255) COMMENT '创建者',
updated_by VARCHAR(255) COMMENT '更新者',
INDEX idx_status (status),
INDEX idx_created_at (created_at),
INDEX idx_schedule (schedule_expression)
INDEX idx_created_at (created_at)
) COMMENT='数据归集任务表';
-- 任务执行记录表
CREATE TABLE t_dc_task_log (
CREATE TABLE t_dc_task_executions (
id VARCHAR(36) PRIMARY KEY COMMENT '执行记录ID(UUID)',
task_id VARCHAR(36) NOT NULL COMMENT '任务ID',
task_name VARCHAR(255) NOT NULL COMMENT '任务名称',
sync_mode VARCHAR(20) DEFAULT 'FULL' COMMENT '同步模式:FULL/INCREMENTAL',
status VARCHAR(20) DEFAULT 'RUNNING' COMMENT '执行状态:RUNNING/SUCCESS/FAILED/STOPPED',
start_time TIMESTAMP NULL COMMENT '开始时间',
end_time TIMESTAMP NULL COMMENT '结束时间',
duration BIGINT COMMENT '执行时长(毫秒)',
process_id VARCHAR(50) COMMENT '进程ID',
log_path VARCHAR(500) COMMENT '日志文件路径',
error_msg LONGTEXT COMMENT '错误信息',
result LONGTEXT COMMENT '执行结果',
retry_times INT DEFAULT 0 COMMENT '重试次数',
create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间'
) COMMENT='任务执行记录表';
-- DataX模板配置表
CREATE TABLE t_dc_datax_templates (
id VARCHAR(36) PRIMARY KEY COMMENT '模板ID(UUID)',
name VARCHAR(255) NOT NULL UNIQUE COMMENT '模板名称',
source_type VARCHAR(50) NOT NULL COMMENT '源数据源类型',
target_type VARCHAR(50) NOT NULL COMMENT '目标数据源类型',
template_content TEXT NOT NULL COMMENT '模板内容',
description TEXT COMMENT '模板描述',
version VARCHAR(20) DEFAULT '1.0.0' COMMENT '版本号',
is_system BOOLEAN DEFAULT FALSE COMMENT '是否系统模板',
log_path VARCHAR(1000) NOT NULL COMMENT '日志文件路径',
started_at TIMESTAMP NULL COMMENT '开始时间',
completed_at TIMESTAMP NULL COMMENT '完成时间',
duration_seconds INT DEFAULT 0 COMMENT '执行时长(秒)',
error_message TEXT COMMENT '错误信息',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
created_by VARCHAR(255) COMMENT '创建者',
INDEX idx_source_target (source_type, target_type),
INDEX idx_system (is_system)
) COMMENT='DataX模板配置表';
updated_by VARCHAR(255) COMMENT '更新者',
INDEX idx_task_id (task_id),
INDEX idx_status (status),
INDEX idx_started_at (started_at)
) COMMENT='任务执行明细表';
-- =====================================
-- DML语句 - 数据操作
-- =====================================
-- 插入默认的DataX模板
INSERT INTO t_dc_datax_templates (id, name, source_type, target_type, template_content, description, is_system, created_by) VALUES
-- MySQL to MySQL 模板
('e4272e51-d431-4681-a370-1b3d0b036cd0', 'MySQL到MySQL', 'MYSQL', 'MYSQL', JSON_OBJECT(
'job', JSON_OBJECT(
'setting', JSON_OBJECT(
'speed', JSON_OBJECT('channel', 3)
),
'content', JSON_ARRAY(
JSON_OBJECT(
'reader', JSON_OBJECT(
'name', 'mysqlreader',
'parameter', JSON_OBJECT(
'username', '${source.username}',
'password', '${source.password}',
'column', JSON_ARRAY('*'),
'splitPk', '${source.splitPk:id}',
'connection', JSON_ARRAY(
JSON_OBJECT(
'jdbcUrl', JSON_ARRAY('${source.jdbcUrl}'),
'table', JSON_ARRAY('${source.table}')
)
)
)
),
'writer', JSON_OBJECT(
'name', 'mysqlwriter',
'parameter', JSON_OBJECT(
'writeMode', 'insert',
'username', '${target.username}',
'password', '${target.password}',
'column', JSON_ARRAY('*'),
'session', JSON_ARRAY('set session sql_mode="PIPES_AS_CONCAT"'),
'preSql', JSON_ARRAY('${target.preSql:}'),
'connection', JSON_ARRAY(
JSON_OBJECT(
'jdbcUrl', '${target.jdbcUrl}',
'table', JSON_ARRAY('${target.table}')
)
)
)
)
)
)
)
), 'MySQL到MySQL数据同步模板', TRUE, 'system');
-- 插入任务执行记录模拟数据
INSERT INTO t_dc_task_executions (id, task_id, task_name, status, progress, records_total, records_processed, records_success, records_failed, throughput, data_size_bytes, started_at, completed_at, duration_seconds, config) VALUES
-- 成功执行记录
('12128059-a266-4d4f-b647-3cb8c24b8aad', '54cefc4d-3071-43d9-9fbf-baeb87932acd', '用户数据同步', 'SUCCESS', 100.00, 15000, 15000, 15000, 0, 125.50, 2048576,
DATE_SUB(NOW(), INTERVAL 1 DAY), DATE_SUB(NOW(), INTERVAL 1 DAY) + INTERVAL 2 MINUTE, 120,
JSON_OBJECT('batchSize', 1000, 'parallelism', 3)),
('9d418e0c-fa54-4f01-8633-3a5ad57f46a1', '3039a5c8-c894-42ab-ad49-5c2c5eccda31', '订单增量同步', 'SUCCESS', 100.00, 8500, 8500, 8500, 0, 94.44, 1536000,
DATE_SUB(NOW(), INTERVAL 12 HOUR), DATE_SUB(NOW(), INTERVAL 12 HOUR) + INTERVAL 90 SECOND, 90,
JSON_OBJECT('batchSize', 2000, 'parallelism', 2));
-- 数据归集模板配置表
CREATE TABLE t_dc_collection_templates (
id VARCHAR(36) PRIMARY KEY COMMENT '模板ID(UUID)',
name VARCHAR(255) NOT NULL UNIQUE COMMENT '模板名称',
description TEXT COMMENT '模板描述',
source_type VARCHAR(64) NOT NULL COMMENT '源数据源类型',
source_name VARCHAR(64) NOT NULL COMMENT '源数据源名称',
target_type VARCHAR(64) NOT NULL COMMENT '目标数据源类型',
target_name VARCHAR(64) NOT NULL COMMENT '目标数据源名称',
template_content JSON NOT NULL COMMENT '模板内容',
built_in BOOLEAN DEFAULT FALSE COMMENT '是否系统内置',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
created_by VARCHAR(255) COMMENT '创建者',
updated_by VARCHAR(255) COMMENT '更新者',
INDEX idx_source_target (source_type, target_type)
) COMMENT='数据归集模板配置表';
INSERT IGNORE INTO t_dc_collection_templates(id, name, description, source_type, source_name, target_type, target_name, template_content, built_in, created_by, updated_by)
VALUES ('1', 'NAS归集模板', '将NAS存储上的文件归集到DataMate平台上。', 'nfsreader', 'nfsreader', 'nfswriter', 'nfswriter', '{"parameter": {}, "reader": {}, "writer": {}}', True, 'system', 'system'),
('2', 'OBS归集模板', '将OBS存储上的文件归集到DataMate平台上。', 'obsreader', 'obsreader', 'obswriter', 'obswriter', '{"parameter": {"endpoint": {"name": "服务地址","description": "OBS的服务地址。","type": "input"},"bucket": {"name": "存储桶名称","description": "OBS存储桶名称。","type": "input"},"accessKey": {"name": "访问密钥","description": "OBS访问密钥。","type": "input"},"secretKey": {"name": "密钥","description": "OBS密钥。","type": "input"},"prefix": {"name": "匹配前缀","description": "按照匹配前缀去选中OBS中的文件进行归集。","type": "input"}}, "reader": {}, "writer": {}}', True, 'system', 'system');