You've already forked DataMate
refactor: modify data collection to python implementation (#214)
* feature: LabelStudio jumps without login * refactor: modify data collection to python implementation * refactor: modify data collection to python implementation * refactor: modify data collection to python implementation * refactor: modify data collection to python implementation * refactor: modify data collection to python implementation * refactor: modify data collection to python implementation * fix: remove terrabase dependency * feature: add the collection task executions page and the collection template page * fix: fix the collection task creation * fix: fix the collection task creation
This commit is contained in:
@@ -10,35 +10,7 @@ USE datamate;
|
||||
-- Drop existing tables so the script can be re-run (intended for development/debugging use)
|
||||
DROP TABLE IF EXISTS t_dc_task_executions;
|
||||
DROP TABLE IF EXISTS t_dc_collection_tasks;
|
||||
DROP TABLE IF EXISTS t_dc_datax_templates;
|
||||
|
||||
-- Task execution detail table: one row per execution of a collection task.
-- The COMMENT strings are stored in the schema metadata and are kept verbatim.
CREATE TABLE t_dc_task_executions
(
    -- Identity and owning task
    id                VARCHAR(36)    NOT NULL          COMMENT '执行记录ID(UUID)',
    task_id           VARCHAR(36)    NOT NULL          COMMENT '任务ID',
    task_name         VARCHAR(255)   NOT NULL          COMMENT '任务名称',

    -- State and progress counters
    status            VARCHAR(20)    DEFAULT 'RUNNING' COMMENT '执行状态:RUNNING/SUCCESS/FAILED/STOPPED',
    progress          DECIMAL(5,2)   DEFAULT 0.00      COMMENT '进度百分比',
    records_total     BIGINT         DEFAULT 0         COMMENT '总记录数',
    records_processed BIGINT         DEFAULT 0         COMMENT '已处理记录数',
    records_success   BIGINT         DEFAULT 0         COMMENT '成功记录数',
    records_failed    BIGINT         DEFAULT 0         COMMENT '失败记录数',
    throughput        DECIMAL(10,2)  DEFAULT 0.00      COMMENT '吞吐量(条/秒)',
    data_size_bytes   BIGINT         DEFAULT 0         COMMENT '数据量(字节)',

    -- Timing: start/end stay NULL until they are known
    started_at        TIMESTAMP      NULL              COMMENT '开始时间',
    completed_at      TIMESTAMP      NULL              COMMENT '完成时间',
    duration_seconds  INT            DEFAULT 0         COMMENT '执行时长(秒)',

    -- Execution payload and outcome
    config            JSON                             COMMENT '执行配置',
    error_message     TEXT                             COMMENT '错误信息',
    datax_job_id      TEXT                             COMMENT 'datax任务ID',
    result            TEXT                             COMMENT '执行结果',

    -- Audit columns
    created_at        TIMESTAMP      DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at        TIMESTAMP      DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    created_by        VARCHAR(255)                     COMMENT '创建者',
    updated_by        VARCHAR(255)                     COMMENT '更新者',

    PRIMARY KEY (id),
    -- Secondary indexes on the task reference, the status and the start time
    KEY idx_task_id (task_id),
    KEY idx_status (status),
    KEY idx_started_at (started_at)
) COMMENT = '任务执行明细表';
|
||||
DROP TABLE IF EXISTS t_dc_collection_templates;
|
||||
|
||||
-- Data collection task table
|
||||
CREATE TABLE t_dc_collection_tasks (
|
||||
@@ -46,120 +18,60 @@ CREATE TABLE t_dc_collection_tasks (
|
||||
name VARCHAR(255) NOT NULL COMMENT '任务名称',
|
||||
description TEXT COMMENT '任务描述',
|
||||
sync_mode VARCHAR(20) DEFAULT 'ONCE' COMMENT '同步模式:ONCE/SCHEDULED',
|
||||
task_type VARCHAR(20) DEFAULT 'NAS' COMMENT '任务类型:NAS/OBS/MYSQL/CUSTOM',
|
||||
template_id VARCHAR(36) NOT NULL COMMENT '归集模板ID',
|
||||
template_name VARCHAR(255) NOT NULL COMMENT '归集模板名称',
|
||||
target_path VARCHAR(1000) DEFAULT '' COMMENT '目标存储路径',
|
||||
config TEXT NOT NULL COMMENT '归集配置(DataX配置),包含源端和目标端配置信息',
|
||||
config JSON NOT NULL COMMENT '归集配置(DataX配置),包含源端和目标端配置信息',
|
||||
schedule_expression VARCHAR(255) COMMENT 'Cron调度表达式',
|
||||
status VARCHAR(20) DEFAULT 'DRAFT' COMMENT '任务状态:DRAFT/READY/RUNNING/SUCCESS/FAILED/STOPPED',
|
||||
retry_count INT DEFAULT 3 COMMENT '重试次数',
|
||||
timeout_seconds INT DEFAULT 3600 COMMENT '超时时间(秒)',
|
||||
max_records BIGINT COMMENT '最大处理记录数',
|
||||
sort_field VARCHAR(100) COMMENT '增量字段',
|
||||
last_execution_id VARCHAR(36) COMMENT '最后执行ID(UUID)',
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
|
||||
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
|
||||
created_by VARCHAR(255) COMMENT '创建者',
|
||||
updated_by VARCHAR(255) COMMENT '更新者',
|
||||
INDEX idx_status (status),
|
||||
INDEX idx_created_at (created_at),
|
||||
INDEX idx_schedule (schedule_expression)
|
||||
INDEX idx_created_at (created_at)
|
||||
) COMMENT='数据归集任务表';
|
||||
|
||||
-- Task execution record table
|
||||
CREATE TABLE t_dc_task_log (
|
||||
CREATE TABLE t_dc_task_executions (
|
||||
id VARCHAR(36) PRIMARY KEY COMMENT '执行记录ID(UUID)',
|
||||
task_id VARCHAR(36) NOT NULL COMMENT '任务ID',
|
||||
task_name VARCHAR(255) NOT NULL COMMENT '任务名称',
|
||||
sync_mode VARCHAR(20) DEFAULT 'FULL' COMMENT '同步模式:FULL/INCREMENTAL',
|
||||
status VARCHAR(20) DEFAULT 'RUNNING' COMMENT '执行状态:RUNNING/SUCCESS/FAILED/STOPPED',
|
||||
start_time TIMESTAMP NULL COMMENT '开始时间',
|
||||
end_time TIMESTAMP NULL COMMENT '结束时间',
|
||||
duration BIGINT COMMENT '执行时长(毫秒)',
|
||||
process_id VARCHAR(50) COMMENT '进程ID',
|
||||
log_path VARCHAR(500) COMMENT '日志文件路径',
|
||||
error_msg LONGTEXT COMMENT '错误信息',
|
||||
result LONGTEXT COMMENT '执行结果',
|
||||
retry_times INT DEFAULT 0 COMMENT '重试次数',
|
||||
create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间'
|
||||
) COMMENT='任务执行记录表';
|
||||
|
||||
|
||||
-- DataX template configuration table
|
||||
CREATE TABLE t_dc_datax_templates (
|
||||
id VARCHAR(36) PRIMARY KEY COMMENT '模板ID(UUID)',
|
||||
name VARCHAR(255) NOT NULL UNIQUE COMMENT '模板名称',
|
||||
source_type VARCHAR(50) NOT NULL COMMENT '源数据源类型',
|
||||
target_type VARCHAR(50) NOT NULL COMMENT '目标数据源类型',
|
||||
template_content TEXT NOT NULL COMMENT '模板内容',
|
||||
description TEXT COMMENT '模板描述',
|
||||
version VARCHAR(20) DEFAULT '1.0.0' COMMENT '版本号',
|
||||
is_system BOOLEAN DEFAULT FALSE COMMENT '是否系统模板',
|
||||
log_path VARCHAR(1000) NOT NULL COMMENT '日志文件路径',
|
||||
started_at TIMESTAMP NULL COMMENT '开始时间',
|
||||
completed_at TIMESTAMP NULL COMMENT '完成时间',
|
||||
duration_seconds INT DEFAULT 0 COMMENT '执行时长(秒)',
|
||||
error_message TEXT COMMENT '错误信息',
|
||||
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
|
||||
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
|
||||
created_by VARCHAR(255) COMMENT '创建者',
|
||||
INDEX idx_source_target (source_type, target_type),
|
||||
INDEX idx_system (is_system)
|
||||
) COMMENT='DataX模板配置表';
|
||||
updated_by VARCHAR(255) COMMENT '更新者',
|
||||
INDEX idx_task_id (task_id),
|
||||
INDEX idx_status (status),
|
||||
INDEX idx_started_at (started_at)
|
||||
) COMMENT='任务执行明细表';
|
||||
|
||||
-- =====================================
|
||||
-- DML statements - data manipulation
|
||||
-- =====================================
|
||||
|
||||
-- Seed the default DataX template: MySQL -> MySQL, flagged as a system template.
-- The template body is assembled with JSON_OBJECT / JSON_ARRAY.
-- NOTE(review): the ${...} tokens look like placeholders that are substituted
-- outside this script — confirm against the code that consumes the template.
INSERT INTO t_dc_datax_templates
    (id, name, source_type, target_type, template_content, description, is_system, created_by)
VALUES
(
    'e4272e51-d431-4681-a370-1b3d0b036cd0',
    'MySQL到MySQL',
    'MYSQL',
    'MYSQL',
    JSON_OBJECT(
        'job', JSON_OBJECT(
            'setting', JSON_OBJECT(
                'speed', JSON_OBJECT('channel', 3)
            ),
            'content', JSON_ARRAY(
                JSON_OBJECT(
                    -- Source side: note that jdbcUrl is wrapped in an array here ...
                    'reader', JSON_OBJECT(
                        'name', 'mysqlreader',
                        'parameter', JSON_OBJECT(
                            'username', '${source.username}',
                            'password', '${source.password}',
                            'column', JSON_ARRAY('*'),
                            'splitPk', '${source.splitPk:id}',
                            'connection', JSON_ARRAY(
                                JSON_OBJECT(
                                    'jdbcUrl', JSON_ARRAY('${source.jdbcUrl}'),
                                    'table', JSON_ARRAY('${source.table}')
                                )
                            )
                        )
                    ),
                    -- ... while on the target side jdbcUrl is a plain string.
                    -- Presumably this mirrors what the two DataX plugins expect — TODO confirm.
                    'writer', JSON_OBJECT(
                        'name', 'mysqlwriter',
                        'parameter', JSON_OBJECT(
                            'writeMode', 'insert',
                            'username', '${target.username}',
                            'password', '${target.password}',
                            'column', JSON_ARRAY('*'),
                            'session', JSON_ARRAY('set session sql_mode="PIPES_AS_CONCAT"'),
                            'preSql', JSON_ARRAY('${target.preSql:}'),
                            'connection', JSON_ARRAY(
                                JSON_OBJECT(
                                    'jdbcUrl', '${target.jdbcUrl}',
                                    'table', JSON_ARRAY('${target.table}')
                                )
                            )
                        )
                    )
                )
            )
        )
    ),
    'MySQL到MySQL数据同步模板',
    TRUE,
    'system'
);
|
||||
|
||||
-- Seed mock task-execution history: two successful runs.
-- All timestamps are computed relative to NOW(), so the rows are always recent.
INSERT INTO t_dc_task_executions (
    id, task_id, task_name, status, progress,
    records_total, records_processed, records_success, records_failed,
    throughput, data_size_bytes,
    started_at, completed_at, duration_seconds,
    config
)
VALUES
    -- Run started one day ago and completed 2 minutes (120 s) later.
    (
        '12128059-a266-4d4f-b647-3cb8c24b8aad',
        '54cefc4d-3071-43d9-9fbf-baeb87932acd',
        '用户数据同步',
        'SUCCESS',
        100.00,
        15000, 15000, 15000, 0,
        125.50, 2048576,
        NOW() - INTERVAL 1 DAY,
        NOW() - INTERVAL 1 DAY + INTERVAL 2 MINUTE,
        120,
        JSON_OBJECT('batchSize', 1000, 'parallelism', 3)
    ),
    -- Run started twelve hours ago and completed 90 seconds later.
    (
        '9d418e0c-fa54-4f01-8633-3a5ad57f46a1',
        '3039a5c8-c894-42ab-ad49-5c2c5eccda31',
        '订单增量同步',
        'SUCCESS',
        100.00,
        8500, 8500, 8500, 0,
        94.44, 1536000,
        NOW() - INTERVAL 12 HOUR,
        NOW() - INTERVAL 12 HOUR + INTERVAL 90 SECOND,
        90,
        JSON_OBJECT('batchSize', 2000, 'parallelism', 2)
    );
|
||||
-- Data collection template configuration table.
-- Each row describes one template: a source/target pair plus a JSON template body.
-- The COMMENT strings are stored in the schema metadata and are kept verbatim.
CREATE TABLE t_dc_collection_templates
(
    id               VARCHAR(36)   NOT NULL        COMMENT '模板ID(UUID)',
    name             VARCHAR(255)  NOT NULL UNIQUE COMMENT '模板名称',
    description      TEXT                          COMMENT '模板描述',

    -- Source and target endpoints (type + name for each side)
    source_type      VARCHAR(64)   NOT NULL        COMMENT '源数据源类型',
    source_name      VARCHAR(64)   NOT NULL        COMMENT '源数据源名称',
    target_type      VARCHAR(64)   NOT NULL        COMMENT '目标数据源类型',
    target_name      VARCHAR(64)   NOT NULL        COMMENT '目标数据源名称',

    template_content JSON          NOT NULL        COMMENT '模板内容',
    built_in         BOOLEAN       DEFAULT FALSE   COMMENT '是否系统内置',

    -- Audit columns
    created_at       TIMESTAMP     DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at       TIMESTAMP     DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    created_by       VARCHAR(255)                  COMMENT '创建者',
    updated_by       VARCHAR(255)                  COMMENT '更新者',

    PRIMARY KEY (id),
    -- Composite index on the (source_type, target_type) pair
    KEY idx_source_target (source_type, target_type)
) COMMENT = '数据归集模板配置表';
|
||||
|
||||
-- Seed two built-in collection templates (NAS and OBS).
-- INSERT IGNORE skips rows that collide with an existing primary key or unique name.
-- The JSON bodies are kept on a single line each so the stored text is unchanged.
INSERT IGNORE INTO t_dc_collection_templates (
    id, name, description,
    source_type, source_name, target_type, target_name,
    template_content,
    built_in, created_by, updated_by
)
VALUES
    -- NAS template: all three template sections are empty objects.
    (
        '1', 'NAS归集模板', '将NAS存储上的文件归集到DataMate平台上。',
        'nfsreader', 'nfsreader', 'nfswriter', 'nfswriter',
        '{"parameter": {}, "reader": {}, "writer": {}}',
        TRUE, 'system', 'system'
    ),
    -- OBS template: declares endpoint, bucket, accessKey, secretKey and prefix, each of type "input".
    (
        '2', 'OBS归集模板', '将OBS存储上的文件归集到DataMate平台上。',
        'obsreader', 'obsreader', 'obswriter', 'obswriter',
        '{"parameter": {"endpoint": {"name": "服务地址","description": "OBS的服务地址。","type": "input"},"bucket": {"name": "存储桶名称","description": "OBS存储桶名称。","type": "input"},"accessKey": {"name": "访问密钥","description": "OBS访问密钥。","type": "input"},"secretKey": {"name": "密钥","description": "OBS密钥。","type": "input"},"prefix": {"name": "匹配前缀","description": "按照匹配前缀去选中OBS中的文件进行归集。","type": "input"}}, "reader": {}, "writer": {}}',
        TRUE, 'system', 'system'
    );
|
||||
|
||||
@@ -1,9 +1,26 @@
|
||||
# Build stage: compiles DataX from source so a later stage can copy the packaged
# distribution out of /DataX/target.
FROM maven:3-eclipse-temurin-8 AS datax-builder

# Fetch the DataX sources. Only the current tip is built, so a shallow clone
# (--depth 1) avoids downloading the full repository history. The apt package
# lists are removed afterwards because nothing later in this stage needs them.
RUN apt-get update && \
    apt-get install -y git && \
    rm -rf /var/lib/apt/lists/* && \
    git clone --depth 1 https://github.com/alibaba/DataX.git

# Overlay the project's own DataX additions/overrides onto the upstream checkout.
COPY runtime/datax/ DataX/

# Replace the legacy MySQL driver class name with the Connector/J 8 one, then
# package DataX. The dots in the sed pattern are escaped so they match literal
# dots only, rather than any character.
RUN cd DataX && \
    sed -i "s/com\.mysql\.jdbc\.Driver/com.mysql.cj.jdbc.Driver/g" \
        plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/util/DataBaseType.java && \
    mvn -U clean package assembly:assembly -Dmaven.test.skip=true
|
||||
|
||||
FROM python:3.12-slim
|
||||
|
||||
# Single-stage image with build cache optimization using BuildKit cache mounts.
|
||||
# Note: to use the cache mount syntax you must build with BuildKit enabled:
|
||||
# DOCKER_BUILDKIT=1 docker build . -f scripts/images/datamate-python/Dockerfile -t datamate-backend-python
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y --no-install-recommends openjdk-21-jre-headless \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
# Poetry configuration
|
||||
@@ -12,7 +29,9 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
|
||||
POETRY_VIRTUALENVS_CREATE=false \
|
||||
POETRY_CACHE_DIR=/tmp/poetry_cache
|
||||
|
||||
ENV PATH="/root/.local/bin:$PATH"
|
||||
ENV JAVA_HOME=/usr/lib/jvm/java-21-openjdk-amd64
|
||||
|
||||
ENV PATH="/root/.local/bin:$JAVA_HOME/bin:$PATH"
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
@@ -22,6 +41,8 @@ RUN --mount=type=cache,target=/root/.cache/pip \
|
||||
&& pip install --root-user-action=ignore pipx \
|
||||
&& pipx install "poetry==$POETRY_VERSION"
|
||||
|
||||
COPY --from=datax-builder /DataX/target/datax/datax /opt/datax
|
||||
|
||||
# Copy only dependency files first (leverages layer caching when dependencies don't change)
|
||||
COPY runtime/datamate-python/pyproject.toml runtime/datamate-python/poetry.lock* /app/
|
||||
|
||||
|
||||
@@ -1,28 +1,19 @@
|
||||
FROM maven:3-eclipse-temurin-21 AS builder
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git && \
|
||||
git clone https://github.com/ModelEngine-Group/Terrabase.git && \
|
||||
cd Terrabase && \
|
||||
git -c core.quotepath=false -c log.showSignature=false checkout -b pyh/feat_terrabase_develop origin/pyh/feat_terrabase_develop -- && \
|
||||
mvn -U clean package install -Dmaven.test.skip=true
|
||||
|
||||
COPY backend/ /opt/gateway
|
||||
|
||||
RUN cd /opt/gateway/api-gateway && \
|
||||
mvn -U clean package -Dmaven.test.skip=true && \
|
||||
ls /opt/gateway/api-gateway/target
|
||||
mvn -U clean package -Dmaven.test.skip=true
|
||||
|
||||
|
||||
FROM eclipse-temurin:21-jdk
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y vim wget curl python3 python3-pip python-is-python3 dos2unix && \
|
||||
apt-get install -y vim wget curl dos2unix && \
|
||||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY --from=builder /opt/gateway/api-gateway/target/gateway.jar /opt/gateway/gateway.jar
|
||||
COPY --from=builder /Terrabase/enterprise-impl-commercial/target/*.jar /opt/terrabase/
|
||||
|
||||
COPY scripts/images/gateway/start.sh /opt/gateway/start.sh
|
||||
|
||||
|
||||
Reference in New Issue
Block a user