init datamate

This commit is contained in:
Dallas98
2025-10-21 23:00:48 +08:00
commit 1c97afed7d
692 changed files with 135442 additions and 0 deletions

View File

@@ -0,0 +1 @@
-- Create the application database; IF NOT EXISTS makes the script safe to re-run.
CREATE DATABASE IF NOT EXISTS datamate;

View File

@@ -0,0 +1,103 @@
USE datamate;

-- Clean-task templates: a named, reusable ordered set of operators
-- (rows in t_operator_instance reference a template id as instance_id).
CREATE TABLE IF NOT EXISTS t_clean_template
(
    -- fix: PRIMARY KEY already implies NOT NULL and uniqueness; the extra
    -- "not null unique" modifiers were redundant and created a duplicate index.
    id          varchar(64) primary key,
    name        varchar(64),
    description varchar(256),
    created_at  timestamp default current_timestamp,
    -- fix: keep updated_at fresh on modification, consistent with the other
    -- tables in this schema (e.g. t_dc_collection_tasks, t_dm_datasets).
    updated_at  timestamp default current_timestamp on update current_timestamp,
    created_by  varchar(256)
);
-- Clean-task runs: one row per cleaning job, reading from a source dataset
-- and writing a cleaned destination dataset.
CREATE TABLE IF NOT EXISTS t_clean_task
(
id varchar(64) primary key,
name varchar(64),
description varchar(256),
-- free-form status string; allowed values not constrained here — see application code
status varchar(256),
src_dataset_id varchar(64),
src_dataset_name varchar(64),
dest_dataset_id varchar(64),
dest_dataset_name varchar(64),
-- dataset sizes in bytes before/after cleaning (presumably — TODO confirm units)
before_size bigint,
after_size bigint,
created_at timestamp default current_timestamp,
started_at timestamp,
finished_at timestamp,
created_by varchar(256)
);
-- Ordered operator pipeline for a template or task instance.
-- op_index gives the execution position; settings_override holds per-instance
-- JSON overriding the operator's default settings (NULL = use defaults).
CREATE TABLE IF NOT EXISTS t_operator_instance
(
instance_id varchar(256),
operator_id varchar(256),
op_index int,
settings_override text,
-- op_index in the PK allows the same operator to appear twice in one pipeline
PRIMARY KEY (instance_id, operator_id, op_index)
);
-- Per-file outcome of a clean run: maps each source file to its cleaned
-- destination file, with sizes, status and a free-form result payload.
CREATE TABLE IF NOT EXISTS t_clean_result
(
instance_id varchar(64),
src_file_id varchar(64),
dest_file_id varchar(64),
src_name varchar(256),
dest_name varchar(256),
src_type varchar(256),
dest_type varchar(256),
src_size bigint,
dest_size bigint,
status varchar(256),
result TEXT,
-- NOTE(review): PK on dest_file_id assumes every result row produces a
-- destination file id (filtered-out files included?) — confirm with writer code
primary key (instance_id, dest_file_id)
);
-- Seed the built-in templates: empty, text-cleaning, image-cleaning.
-- INSERT IGNORE keeps the script re-runnable without duplicate-key errors.
INSERT IGNORE INTO t_clean_template(id, name, description)
VALUES ('ac2f2582-a990-11f0-9768-00155d09c825', '空模板', '空模板'),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'text文本清洗模板', 'text文本清洗模板'),
('4421504e-c6c9-4760-b55a-509d17429597', '图片清洗模板', '图片清洗模板');
-- Seed the operator pipelines of the two built-in templates.
-- instance_id here is the template id from t_clean_template; op_index is the
-- execution order; settings_override NULL means each operator's defaults apply.
INSERT IGNORE INTO t_operator_instance(instance_id, operator_id, op_index, settings_override)
-- text-cleaning template pipeline (26ae585c-...): extract, filter, clean, anonymize, export
VALUES ('26ae585c-8310-4679-adc0-e53215e6e69b', 'TextFormatter', 1, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithShortOrLongLengthFilter', 2, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatWordRateFilter', 3, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighRepeatPhraseRateFilter', 4, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithHighSpecialCharRateFilter', 5, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileWithManySensitiveWordsFilter', 6, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'UnicodeSpaceCleaner', 7, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'ExtraSpaceCleaner', 8, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FullWidthCharacterCleaner', 9, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'InvisibleCharactersCleaner', 10, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'ContentCleaner', 11, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'LegendCleaner', 12, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmojiCleaner', 13, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'HtmlTagCleaner', 14, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'TraditionalChineseCleaner', 15, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'GrableCharactersCleaner', 16, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'XMLTagCleaner', 17, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateSentencesFilter', 18, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'DuplicateFilesFilter', 19, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'SexualAndViolentWordCleaner', 20, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'PoliticalWordCleaner', 21, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedPhoneNumber', 22, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedCreditCardNumber', 23, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'EmailNumberCleaner', 24, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIpAddress', 25, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedIdNumber', 26, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'AnonymizedUrlCleaner', 27, null),
('26ae585c-8310-4679-adc0-e53215e6e69b', 'FileExporter', 28, null),
-- image-cleaning template pipeline (4421504e-...): read, dedupe, enhance, resize, export
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgFormatter', 1, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBlurredImagesCleaner', 2, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDuplicatedImagesCleaner', 3, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSimilarImagesCleaner', 4, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgBrightness', 5, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgContrast', 6, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSaturation', 7, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgSharpness', 8, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgDenoise', 9, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgShadowRemove', 10, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgPerspectiveTransformation', 11, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgResize', 12, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'ImgTypeUnify', 13, null),
('4421504e-c6c9-4760-b55a-509d17429597', 'FileExporter', 14, null);

View File

@@ -0,0 +1,160 @@
-- Data-collection service schema init script (datamate database)
USE datamate;
-- =====================================
-- DDL - table definitions
-- =====================================
-- Drop existing tables so the script can be re-run (dev/debug phase only —
-- this destroys data; remove the DROPs for production upgrades).
DROP TABLE IF EXISTS t_dc_task_executions;
DROP TABLE IF EXISTS t_dc_collection_tasks;
DROP TABLE IF EXISTS t_dc_datax_templates;
-- fix: t_dc_task_log is created below with a plain CREATE TABLE but was not in
-- this drop list, so re-running the script failed on that statement.
DROP TABLE IF EXISTS t_dc_task_log;
-- Per-execution detail of a collection task (one row per DataX run)
CREATE TABLE t_dc_task_executions (
    id VARCHAR(36) PRIMARY KEY COMMENT '执行记录ID(UUID)',
    task_id VARCHAR(36) NOT NULL COMMENT '任务ID',
    task_name VARCHAR(255) NOT NULL COMMENT '任务名称',
    status VARCHAR(20) DEFAULT 'RUNNING' COMMENT '执行状态:RUNNING/SUCCESS/FAILED/STOPPED',
    progress DECIMAL(5,2) DEFAULT 0.00 COMMENT '进度百分比',
    records_total BIGINT DEFAULT 0 COMMENT '总记录数',
    records_processed BIGINT DEFAULT 0 COMMENT '已处理记录数',
    records_success BIGINT DEFAULT 0 COMMENT '成功记录数',
    records_failed BIGINT DEFAULT 0 COMMENT '失败记录数',
    throughput DECIMAL(10,2) DEFAULT 0.00 COMMENT '吞吐量(条/秒)',
    data_size_bytes BIGINT DEFAULT 0 COMMENT '数据量(字节)',
    started_at TIMESTAMP NULL COMMENT '开始时间',
    completed_at TIMESTAMP NULL COMMENT '完成时间',
    duration_seconds INT DEFAULT 0 COMMENT '执行时长(秒)',
    config JSON COMMENT '执行配置',
    error_message TEXT COMMENT '错误信息',
    datax_job_id TEXT COMMENT 'datax任务ID',
    result TEXT COMMENT '执行结果',
    -- fix: was nullable with no default; every sibling table stamps creation
    -- time automatically, so default it here too (still nullable for old rows)
    created_at TIMESTAMP NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    INDEX idx_task_id (task_id),
    INDEX idx_status (status),
    INDEX idx_started_at (started_at)
) COMMENT='任务执行明细表';
-- Collection task definitions: one row per configured DataX job,
-- either run once or on a cron schedule.
CREATE TABLE t_dc_collection_tasks (
id VARCHAR(36) PRIMARY KEY COMMENT '任务ID(UUID)',
name VARCHAR(255) NOT NULL COMMENT '任务名称',
description TEXT COMMENT '任务描述',
sync_mode VARCHAR(20) DEFAULT 'ONCE' COMMENT '同步模式:ONCE/SCHEDULED',
config TEXT NOT NULL COMMENT '归集配置(DataX配置),包含源端和目标端配置信息',
schedule_expression VARCHAR(255) COMMENT 'Cron调度表达式',
status VARCHAR(20) DEFAULT 'DRAFT' COMMENT '任务状态:DRAFT/READY/RUNNING/SUCCESS/FAILED/STOPPED',
retry_count INT DEFAULT 3 COMMENT '重试次数',
timeout_seconds INT DEFAULT 3600 COMMENT '超时时间(秒)',
max_records BIGINT COMMENT '最大处理记录数',
sort_field VARCHAR(100) COMMENT '增量字段',
-- latest row in t_dc_task_executions for this task (no FK declared; kept loose)
last_execution_id VARCHAR(36) COMMENT '最后执行ID(UUID)',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
created_by VARCHAR(255) COMMENT '创建者',
updated_by VARCHAR(255) COMMENT '更新者',
INDEX idx_status (status),
INDEX idx_created_at (created_at),
INDEX idx_schedule (schedule_expression)
) COMMENT='数据归集任务表';
-- Task execution log: per-run process-level record (pid, log path, errors).
-- fix: was a plain CREATE TABLE, which made the "re-runnable" init script fail
-- on second execution because this table is not in the DROP list above;
-- IF NOT EXISTS makes the statement idempotent on its own.
CREATE TABLE IF NOT EXISTS t_dc_task_log (
    id VARCHAR(36) PRIMARY KEY COMMENT '执行记录ID(UUID)',
    task_id VARCHAR(36) NOT NULL COMMENT '任务ID',
    task_name VARCHAR(255) NOT NULL COMMENT '任务名称',
    sync_mode VARCHAR(20) DEFAULT 'FULL' COMMENT '同步模式:FULL/INCREMENTAL',
    status VARCHAR(20) DEFAULT 'RUNNING' COMMENT '执行状态:RUNNING/SUCCESS/FAILED/STOPPED',
    start_time TIMESTAMP NULL COMMENT '开始时间',
    end_time TIMESTAMP NULL COMMENT '结束时间',
    duration BIGINT COMMENT '执行时长(毫秒)',
    process_id VARCHAR(50) COMMENT '进程ID',
    log_path VARCHAR(500) COMMENT '日志文件路径',
    error_msg LONGTEXT COMMENT '错误信息',
    result LONGTEXT COMMENT '执行结果',
    retry_times INT DEFAULT 0 COMMENT '重试次数',
    create_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间'
) COMMENT='任务执行记录表';
-- DataX job templates: reusable JSON job skeletons keyed by
-- (source_type, target_type), with ${...} placeholders filled in at run time.
CREATE TABLE t_dc_datax_templates (
id VARCHAR(36) PRIMARY KEY COMMENT '模板ID(UUID)',
name VARCHAR(255) NOT NULL UNIQUE COMMENT '模板名称',
source_type VARCHAR(50) NOT NULL COMMENT '源数据源类型',
target_type VARCHAR(50) NOT NULL COMMENT '目标数据源类型',
template_content TEXT NOT NULL COMMENT '模板内容',
description TEXT COMMENT '模板描述',
version VARCHAR(20) DEFAULT '1.0.0' COMMENT '版本号',
is_system BOOLEAN DEFAULT FALSE COMMENT '是否系统模板',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
created_by VARCHAR(255) COMMENT '创建者',
INDEX idx_source_target (source_type, target_type),
INDEX idx_system (is_system)
) COMMENT='DataX模板配置表';
-- =====================================
-- DML - seed data
-- =====================================
-- Insert the default DataX templates. The template body is built with
-- JSON_OBJECT/JSON_ARRAY so it is stored as valid JSON text; ${source.*} and
-- ${target.*} are placeholders substituted by the service (the `:` form, e.g.
-- ${source.splitPk:id}, presumably carries a default value — TODO confirm).
INSERT INTO t_dc_datax_templates (id, name, source_type, target_type, template_content, description, is_system, created_by) VALUES
-- MySQL -> MySQL template
('e4272e51-d431-4681-a370-1b3d0b036cd0', 'MySQL到MySQL', 'MYSQL', 'MYSQL', JSON_OBJECT(
'job', JSON_OBJECT(
'setting', JSON_OBJECT(
'speed', JSON_OBJECT('channel', 3)
),
'content', JSON_ARRAY(
JSON_OBJECT(
'reader', JSON_OBJECT(
'name', 'mysqlreader',
'parameter', JSON_OBJECT(
'username', '${source.username}',
'password', '${source.password}',
'column', JSON_ARRAY('*'),
'splitPk', '${source.splitPk:id}',
'connection', JSON_ARRAY(
JSON_OBJECT(
'jdbcUrl', JSON_ARRAY('${source.jdbcUrl}'),
'table', JSON_ARRAY('${source.table}')
)
)
)
),
'writer', JSON_OBJECT(
'name', 'mysqlwriter',
'parameter', JSON_OBJECT(
'writeMode', 'insert',
'username', '${target.username}',
'password', '${target.password}',
'column', JSON_ARRAY('*'),
'session', JSON_ARRAY('set session sql_mode="PIPES_AS_CONCAT"'),
'preSql', JSON_ARRAY('${target.preSql:}'),
'connection', JSON_ARRAY(
JSON_OBJECT(
-- NOTE(review): reader wraps jdbcUrl in JSON_ARRAY but writer does not;
-- DataX's mysqlwriter expects a plain string here, reader expects a list
'jdbcUrl', '${target.jdbcUrl}',
'table', JSON_ARRAY('${target.table}')
)
)
)
)
)
)
)
), 'MySQL到MySQL数据同步模板', TRUE, 'system');
-- Insert mock execution records (demo/dev data, not needed in production).
INSERT INTO t_dc_task_executions (id, task_id, task_name, status, progress, records_total, records_processed, records_success, records_failed, throughput, data_size_bytes, started_at, completed_at, duration_seconds, config) VALUES
-- successful runs; completed_at is derived from started_at + duration
('12128059-a266-4d4f-b647-3cb8c24b8aad', '54cefc4d-3071-43d9-9fbf-baeb87932acd', '用户数据同步', 'SUCCESS', 100.00, 15000, 15000, 15000, 0, 125.50, 2048576,
DATE_SUB(NOW(), INTERVAL 1 DAY), DATE_SUB(NOW(), INTERVAL 1 DAY) + INTERVAL 2 MINUTE, 120,
JSON_OBJECT('batchSize', 1000, 'parallelism', 3)),
('9d418e0c-fa54-4f01-8633-3a5ad57f46a1', '3039a5c8-c894-42ab-ad49-5c2c5eccda31', '订单增量同步', 'SUCCESS', 100.00, 8500, 8500, 8500, 0, 94.44, 1536000,
DATE_SUB(NOW(), INTERVAL 12 HOUR), DATE_SUB(NOW(), INTERVAL 12 HOUR) + INTERVAL 90 SECOND, 90,
JSON_OBJECT('batchSize', 2000, 'parallelism', 2));

View File

@@ -0,0 +1,15 @@
-- Chunked-upload bookkeeping for the datamate database: tracks a multi-file
-- upload session (total vs. uploaded count) with a timeout and owning service.
USE datamate;
CREATE TABLE IF NOT EXISTS `t_chunk_upload_request`
(
`id` VARCHAR(36) PRIMARY KEY COMMENT 'UUID',
`total_file_num` INT COMMENT '总文件数',
`uploaded_file_num` INT COMMENT '已上传文件数',
`upload_path` VARCHAR(256) COMMENT '文件路径',
-- NOTE(review): named "timeout" but defaults to the request creation time;
-- presumably the app adds a TTL on top — confirm expiry logic in caller
`timeout` TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '上传请求超时时间',
`service_id` VARCHAR(64) COMMENT '上传请求所属服务:DATA-MANAGEMENT(数据管理);',
`check_info` TEXT COMMENT '业务信息'
) COMMENT ='文件切片上传请求表';

View File

@@ -0,0 +1,156 @@
-- DataMate Platform schema init script
-- Targets the existing datamate database environment
USE datamate;
-- New data-management tables are created under the t_dm_ prefix with UUID
-- primary keys; legacy tables are left untouched (migrate data manually).
-- ===========================================
-- Data Management module tables (UUID PKs, t_dm_ prefix)
-- ===========================================
-- Datasets: supports medical imaging, text, Q&A and other modalities
CREATE TABLE IF NOT EXISTS t_dm_datasets (
id VARCHAR(36) PRIMARY KEY COMMENT 'UUID',
name VARCHAR(255) NOT NULL COMMENT '数据集名称',
description TEXT COMMENT '数据集描述',
dataset_type VARCHAR(50) NOT NULL COMMENT '数据集类型:IMAGE/TEXT/QA/MULTIMODAL/OTHER',
category VARCHAR(100) COMMENT '数据集分类:医学影像/问答/文献等',
path VARCHAR(500) COMMENT '数据存储路径',
format VARCHAR(50) COMMENT '数据格式:DCM/JPG/JSON/CSV等',
schema_info JSON COMMENT '数据结构信息',
size_bytes BIGINT DEFAULT 0 COMMENT '数据大小(字节)',
file_count BIGINT DEFAULT 0 COMMENT '文件数量',
record_count BIGINT DEFAULT 0 COMMENT '记录数量',
retention_days INTEGER DEFAULT 0 COMMENT '数据保留天数(0表示长期保留)',
tags JSON COMMENT '标签列表',
metadata JSON COMMENT '元数据信息',
status VARCHAR(50) DEFAULT 'DRAFT' COMMENT '状态:DRAFT/ACTIVE/ARCHIVED',
is_public BOOLEAN DEFAULT FALSE COMMENT '是否公开',
is_featured BOOLEAN DEFAULT FALSE COMMENT '是否推荐',
-- optimistic-lock style counter (presumably bumped by the app — confirm)
version BIGINT NOT NULL DEFAULT 0 COMMENT '版本号',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
created_by VARCHAR(255) COMMENT '创建者',
updated_by VARCHAR(255) COMMENT '更新者',
INDEX idx_dm_dataset_type (dataset_type),
INDEX idx_dm_category (category),
INDEX idx_dm_format (format),
INDEX idx_dm_status (status),
INDEX idx_dm_public (is_public),
INDEX idx_dm_featured (is_featured),
INDEX idx_dm_created_at (created_at)
) COMMENT='数据集表(UUID 主键)';
-- Dataset files: one row per file in a dataset; rows cascade-delete with
-- their dataset via the FK below.
CREATE TABLE IF NOT EXISTS t_dm_dataset_files (
id VARCHAR(36) PRIMARY KEY COMMENT 'UUID',
dataset_id VARCHAR(36) NOT NULL COMMENT '所属数据集ID(UUID)',
file_name VARCHAR(255) NOT NULL COMMENT '文件名',
file_path VARCHAR(1000) NOT NULL COMMENT '文件路径',
file_type VARCHAR(50) COMMENT '文件格式:JPG/PNG/DCM/TXT等',
file_size BIGINT DEFAULT 0 COMMENT '文件大小(字节)',
-- 64 hex chars fits a SHA-256 digest (algorithm not fixed here — confirm)
check_sum VARCHAR(64) COMMENT '文件校验和',
tags JSON COMMENT '文件标签信息',
metadata JSON COMMENT '文件元数据',
status VARCHAR(50) DEFAULT 'ACTIVE' COMMENT '文件状态:ACTIVE/DELETED/PROCESSING',
upload_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '上传时间',
last_access_time TIMESTAMP NULL COMMENT '最后访问时间',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
FOREIGN KEY (dataset_id) REFERENCES t_dm_datasets(id) ON DELETE CASCADE,
INDEX idx_dm_dataset (dataset_id),
INDEX idx_dm_file_type (file_type),
INDEX idx_dm_file_status (status),
INDEX idx_dm_upload_time (upload_time)
) COMMENT='数据集文件表(UUID 主键)';
-- Daily per-dataset statistics; one row per (dataset, day) enforced by the
-- unique key, rows cascade-delete with the dataset.
CREATE TABLE IF NOT EXISTS t_dm_dataset_statistics (
id VARCHAR(36) PRIMARY KEY COMMENT 'UUID',
dataset_id VARCHAR(36) NOT NULL COMMENT '数据集ID(UUID)',
stat_date DATE NOT NULL COMMENT '统计日期',
total_files BIGINT DEFAULT 0 COMMENT '总文件数',
total_size BIGINT DEFAULT 0 COMMENT '总大小(字节)',
processed_files BIGINT DEFAULT 0 COMMENT '已处理文件数',
error_files BIGINT DEFAULT 0 COMMENT '错误文件数',
download_count BIGINT DEFAULT 0 COMMENT '下载次数',
view_count BIGINT DEFAULT 0 COMMENT '查看次数',
quality_metrics JSON COMMENT '质量指标',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
FOREIGN KEY (dataset_id) REFERENCES t_dm_datasets(id) ON DELETE CASCADE,
UNIQUE KEY uk_dm_dataset_date (dataset_id, stat_date),
INDEX idx_dm_stat_date (stat_date)
) COMMENT='数据集统计信息表(UUID 主键)';
-- Tag dictionary; usage_count is a denormalized counter maintained by the
-- application (not by triggers here).
CREATE TABLE IF NOT EXISTS t_dm_tags (
id VARCHAR(36) PRIMARY KEY COMMENT 'UUID',
name VARCHAR(100) NOT NULL UNIQUE COMMENT '标签名称',
description TEXT COMMENT '标签描述',
category VARCHAR(50) COMMENT '标签分类',
-- 7 chars fits '#RRGGBB'
color VARCHAR(7) COMMENT '标签颜色(十六进制)',
usage_count BIGINT DEFAULT 0 COMMENT '使用次数',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
INDEX idx_dm_tag_category (category),
INDEX idx_dm_tag_usage_count (usage_count)
) COMMENT='标签表(UUID 主键)';
-- Dataset <-> tag junction table; composite PK prevents duplicate links and
-- both FKs cascade on delete.
CREATE TABLE IF NOT EXISTS t_dm_dataset_tags (
dataset_id VARCHAR(36) NOT NULL COMMENT '数据集ID(UUID)',
tag_id VARCHAR(36) NOT NULL COMMENT '标签ID(UUID)',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
PRIMARY KEY (dataset_id, tag_id),
FOREIGN KEY (dataset_id) REFERENCES t_dm_datasets(id) ON DELETE CASCADE,
FOREIGN KEY (tag_id) REFERENCES t_dm_tags(id) ON DELETE CASCADE
) COMMENT='数据集标签关联表(UUID 外键)';
-- ===========================================
-- Non-data-management tables (users, t_data_sources, ...) stay as-is
-- ===========================================
-- Users table (created only if absent; uses an auto-increment BIGINT PK,
-- unlike the UUID-keyed t_dm_ tables)
CREATE TABLE IF NOT EXISTS users (
id BIGINT PRIMARY KEY AUTO_INCREMENT,
username VARCHAR(255) NOT NULL UNIQUE COMMENT '用户名',
email VARCHAR(255) NOT NULL UNIQUE COMMENT '邮箱',
-- $2a$... prefix in the seed data indicates bcrypt hashes
password_hash VARCHAR(255) NOT NULL COMMENT '密码哈希',
full_name VARCHAR(255) COMMENT '真实姓名',
avatar_url VARCHAR(500) COMMENT '头像URL',
role VARCHAR(50) NOT NULL DEFAULT 'USER' COMMENT '角色:ADMIN/USER',
organization VARCHAR(255) COMMENT '所属机构',
enabled BOOLEAN NOT NULL DEFAULT TRUE COMMENT '是否启用',
last_login_at TIMESTAMP NULL COMMENT '最后登录时间',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
-- NOTE(review): idx_username / idx_email duplicate the implicit indexes the
-- UNIQUE constraints already create — harmless but redundant
INDEX idx_username (username),
INDEX idx_email (email),
INDEX idx_role (role),
INDEX idx_enabled (enabled)
) COMMENT='用户表';
-- Seed data
-- Default users. NOTE(review): both accounts share one bcrypt hash, i.e. the
-- same well-known default password — these must be rotated before any
-- non-development deployment.
INSERT IGNORE INTO users (username, email, password_hash, full_name, role, organization) VALUES
('admin', 'admin@datamate.com', '$2a$10$N.zmdr9k7uOCQb376NoUnuTJ8iAt6Z5EHsM8lE9lBOsl7q7U3.XUO', '系统管理员', 'ADMIN', 'DataMate'),
('knowledge_user', 'knowledge@datamate.com', '$2a$10$N.zmdr9k7uOCQb376NoUnuTJ8iAt6Z5EHsM8lE9lBOsl7q7U3.XUO', '知识库用户', 'USER', '三甲医院');
-- View: one-row summary of t_dm_datasets (counts by status/visibility,
-- aggregate file/record totals, distinct type and category counts).
CREATE OR REPLACE VIEW v_dm_dataset_summary AS
SELECT
    COUNT(*)                                            AS total_datasets,
    SUM(CASE WHEN status = 'ACTIVE' THEN 1 ELSE 0 END)  AS active_datasets,
    SUM(CASE WHEN is_public = TRUE THEN 1 ELSE 0 END)   AS public_datasets,
    SUM(CASE WHEN is_featured = TRUE THEN 1 ELSE 0 END) AS featured_datasets,
    SUM(file_count)                                     AS total_files,
    SUM(record_count)                                   AS total_records,
    COUNT(DISTINCT dataset_type)                        AS dataset_types,
    COUNT(DISTINCT category)                            AS categories
FROM t_dm_datasets;

View File

@@ -0,0 +1,223 @@
USE datamate;
-- Operator registry: every cleaning operator (built-in or uploaded), with its
-- declared input/output modality and a JSON `settings` schema describing the
-- UI widgets for its tunable parameters.
CREATE TABLE IF NOT EXISTS t_operator
(
id varchar(64) primary key,
name varchar(64),
description varchar(256),
version varchar(256),
-- modality of input/output: 'text', 'image', or 'all' per the seed data below
inputs varchar(256),
outputs varchar(256),
runtime text,
settings text,
file_name text,
-- whether the user has starred (favorited) this operator
is_star bool,
created_at timestamp default current_timestamp,
updated_at timestamp default current_timestamp
);
-- Operator category tree; parent_id = 0 marks a root category,
-- type distinguishes predefined from user-defined categories.
CREATE TABLE IF NOT EXISTS t_operator_category
(
id int primary key auto_increment,
name varchar(64),
type varchar(64),
parent_id int
);
-- Category <-> operator junction table (an operator may be in many categories).
CREATE TABLE IF NOT EXISTS t_operator_category_relation
(
category_id int,
operator_id varchar(64),
primary key (category_id, operator_id)
);
-- View: operators joined with each category they belong to (one row per
-- category membership). Driven from the relation table; LEFT JOINs keep a
-- relation row even if its operator or category record is missing.
CREATE OR REPLACE VIEW v_operator AS
SELECT o.id          AS operator_id,
       o.name        AS operator_name,
       o.description,
       o.version,
       o.inputs,
       o.outputs,
       o.runtime,
       o.settings,
       o.is_star,
       o.created_at,
       o.updated_at,
       toc.id        AS category_id,
       toc.name      AS category_name
FROM t_operator_category_relation tocr
         LEFT JOIN t_operator o ON o.id = tocr.operator_id
         LEFT JOIN t_operator_category toc ON toc.id = tocr.category_id;
-- Seed the predefined category tree. parent_id = 0 marks the root facets
-- (modality, language, source, star state); children reference their parent id.
INSERT IGNORE INTO t_operator_category(id, name, type, parent_id)
VALUES (1, '模态', 'predefined', 0),
(2, '语言', 'predefined', 0),
(3, '文本', 'predefined', 1),
(4, '图片', 'predefined', 1),
(5, '音频', 'predefined', 1),
(6, '视频', 'predefined', 1),
(7, '多模态', 'predefined', 1),
(8, 'Python', 'predefined', 2),
(9, 'Java', 'predefined', 2),
(10, '来源', 'predefined', 0),
(11, '系统预置', 'predefined', 10),
(12, '用户上传', 'predefined', 10),
(13, '收藏状态', 'predefined', 0),
(14, '已收藏', 'predefined', 13);
-- Seed the built-in operators. `settings` holds the JSON UI-widget schema for
-- tunable parameters; `runtime` is left NULL for all built-ins.
-- fix 1: the four configurable image operators (ImgBlurredImagesCleaner,
--        ImgResize, ImgSimilarImagesCleaner, ImgTypeUnify) had their widget
--        JSON in the `runtime` column and NULL in `settings` — swapped
--        relative to every text operator; corrected below.
-- fix 2: is_star was the string 'false' on most rows (relying on MySQL's
--        implicit string->0 coercion); use the boolean literal false
--        consistently, as the first two rows already did.
INSERT IGNORE INTO t_operator
(id, name, description, version, inputs, outputs, runtime, settings, file_name, is_star)
VALUES ('TextFormatter', 'TXT文本抽取', '抽取TXT中的文本。', '1.0.0', 'text', 'text', null, null, '', false),
('FileExporter', '落盘算子', '将文件保存到本地目录。', '1.0.0', 'all', 'all', null, null, '', false),
('FileWithHighRepeatPhraseRateFilter', '文档词重复率检查', '去除重复词过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatPhraseRatio": {"name": "文档词重复率", "description": "某个词的统计数/文档总词数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}, "hitStopwords": {"name": "去除停用词", "description": "统计重复词时,选择是否要去除停用词。", "type": "switch", "defaultVal": false, "required": true, "checkedLabel": "去除", "unCheckedLabel": "不去除"}}', '', false),
('FileWithHighRepeatWordRateFilter', '文档字重复率检查', '去除重复字过多的文档。', '1.0.0', 'text', 'text', null, '{"repeatWordRatio": {"name": "文档字重复率", "description": "某个字的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', false),
('FileWithHighSpecialCharRateFilter', '文档特殊字符率检查', '去除特殊字符过多的文档。', '1.0.0', 'text', 'text', null, '{"specialCharRatio": {"name": "文档特殊字符率", "description": "特殊字符的统计数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.3, "min": 0, "max": 1, "step": 0.1}}', '', false),
('DuplicateFilesFilter', '相似文档去除', '相似文档去除。', '1.0.0', 'text', 'text', null, '{"fileDuplicateThreshold": {"name": "文档相似度", "description": "基于MinHash算法和Jaccard相似度,计算当前文档与数据集中其它文档相似性,超过设定值,该文档被去除。", "type": "slider", "defaultVal": 0.5, "min": 0, "max": 1, "step": 0.1}}', '', false),
('FileWithManySensitiveWordsFilter', '文档敏感词率检查', '去除敏感词过多的文档。', '1.0.0', 'text', 'text', null, '{"sensitiveWordsRate": {"name": "文档敏感词率", "description": "敏感词的字数/文档总字数 > 设定值,该文档被去除。", "type": "slider", "defaultVal": 0.01, "min": 0, "max": 1, "step": 0.01}}', '', false),
('FileWithShortOrLongLengthFilter', '文档字数检查', '字数不在指定范围会被过滤掉。', '1.0.0', 'text', 'text', null, '{"fileLength": {"name": "文档字数", "description": "过滤字数不在指定范围内的文档,如[10,10000000]。若输入为空,则不对字数上/下限做限制。", "type": "range", "defaultVal": [10, 10000000], "min": 0, "max": 10000000000000000, "step": 1}}', '', false),
('ContentCleaner', '文档目录去除', '去除文档中的目录。', '1.0.0', 'text', 'text', null, null, '', false),
('AnonymizedCreditCardNumber', '信用卡号匿名化', '信用卡号匿名化', '1.0.0', 'text', 'text', null, null, '', false),
('EmailNumberCleaner', '邮件地址匿名化', '邮件地址匿名化', '1.0.0', 'text', 'text', null, null, '', false),
('EmojiCleaner', '文档表情去除', '去除文档中表情字符或者emoji符号。', '1.0.0', 'text', 'text', null, null, '', false),
('ExtraSpaceCleaner', '多余空格去除', '移除文档首尾、句中或标点符号附近多余空格和 tab 等。', '1.0.0', 'text', 'text', null, null, '', false),
('FullWidthCharacterCleaner', '全角转半角', '将文档中的所有全角字符转换成半角字符。', '1.0.0', 'text', 'text', null, null, '', false),
('GrableCharactersCleaner', '文档乱码去除', '去除文档中的乱码和无意义的unicode。', '1.0.0', 'text', 'text', null, null, '', false),
('HtmlTagCleaner', 'HTML标签去除', '移除文档中HTML标签,如 <html>、<dev>、<p> 等。', '1.0.0', 'text', 'text', null, null, '', false),
('AnonymizedIdNumber', '身份证号匿名化', '身份证号匿名化。', '1.0.0', 'text', 'text', null, null, '', false),
('InvisibleCharactersCleaner', '不可见字符去除', '去除文档中的不可见字符,例如 0-31 号字符中的部分字符。', '1.0.0', 'text', 'text', null, null, '', false),
('AnonymizedIpAddress', 'IP地址匿名化', 'IP地址匿名化', '1.0.0', 'text', 'text', null, null, '', false),
('LegendCleaner', '图注表注去除', '去除文档中的图注、表注等内容。', '1.0.0', 'text', 'text', null, null, '', false),
('AnonymizedPhoneNumber', '电话号码匿名化', '电话号码匿名化', '1.0.0', 'text', 'text', null, null, '', false),
('PoliticalWordCleaner', '政治文本匿名化', '将政治文本进行匿名化。', '1.0.0', 'text', 'text', null, null, '', false),
('DuplicateSentencesFilter', '文档局部内容去重', '文档局部内容去重。', '1.0.0', 'text', 'text', null, null, '', false),
('SexualAndViolentWordCleaner', '暴力色情文本匿名化', '将暴力、色情文本进行匿名化。', '1.0.0', 'text', 'text', null, null, '', false),
('TraditionalChineseCleaner', '繁体转简体', '将繁体转换为简体。', '1.0.0', 'text', 'text', null, null, '', false),
('UnicodeSpaceCleaner', '空格标准化', '将文档中不同的 unicode 空格,如 u2008,转换为正常空格\\u0020。', '1.0.0', 'text', 'text', null, null, '', false),
('AnonymizedUrlCleaner', 'URL网址匿名化', '将文档中的url网址匿名化。', '1.0.0', 'text', 'text', null, null, '', false),
('XMLTagCleaner', 'XML标签去除', '去除XML中的标签。', '1.0.0', 'text', 'text', null, null, '', false),
('ImgFormatter', '读取图片文件', '读取图片文件。', '1.0.0', 'image', 'image', null, null, '', false),
('ImgBlurredImagesCleaner', '模糊图片过滤', '去除模糊的图片。', '1.0.0', 'image', 'image', null, '{"blurredThreshold": {"name": "梯度函数值", "name_en": "Gradient Value", "description": "梯度函数值取值越小,图片模糊度越高。", "description_en": "A smaller gradient value indicates a higher image blur.", "type": "slider", "defaultVal": 1000, "min": 1, "max": 10000, "step": 1}}', '', false),
('ImgBrightness', '图片亮度增强', '自适应调节图片的亮度。', '1.0.0', 'image', 'image', null, null, '', false),
('ImgContrast', '图片对比度增强', '自适应调节图片的对比度。', '1.0.0', 'image', 'image', null, null, '', false),
('ImgDenoise', '图片噪点去除', '去除图片中的噪点,主要适用于自然场景。', '1.0.0', 'image', 'image', null, null, '', false),
('ImgDuplicatedImagesCleaner', '重复图片去除', '去除重复的图片。', '1.0.0', 'image', 'image', null, null, '', false),
('ImgPerspectiveTransformation', '图片透视变换', '自适应校正图片的视角,主要适用于文档校正场景。', '1.0.0', 'image', 'image', null, null, '', false),
('ImgResize', '图片重采样', '将图片放大或缩小到指定像素。', '1.0.0', 'image', 'image', null, '{"targetSize": {"name": "重采样尺寸", "name_en": "Resample Size", "type": "multiple", "properties": [{"type": "inputNumber", "name": "宽度", "name_en": "Width", "description": "像素", "description_en": "Pixel", "defaultVal": 256, "min": 1, "max": 4096, "step": 1}, {"type": "inputNumber", "name": "高度", "name_en": "Height", "description": "像素", "description_en": "Pixel", "defaultVal": 256, "min": 1, "max": 4096, "step": 1}]}}', '', false),
('ImgSaturation', '图片饱和度增强', '自适应调节图片的饱和度,主要适用于自然场景图片。', '1.0.0', 'image', 'image', null, null, '', false),
('ImgShadowRemove', '图片阴影去除', '去除图片中的阴影,主要适用于文档场景。', '1.0.0', 'image', 'image', null, null, '', false),
('ImgSharpness', '图片锐度增强', '自适应调节图片的锐度,主要适用于自然场景图片。', '1.0.0', 'image', 'image', null, null, '', false),
('ImgSimilarImagesCleaner', '相似图片去除', '去除相似的图片。', '1.0.0', 'image', 'image', null, '{"similarThreshold": {"name": "相似度", "name_en": "Similarity", "description": "相似度取值越大,图片相似度越高。", "description_en": "A larger similarity value indicates a higher image similarity.", "type": "slider", "defaultVal": 0.8, "min": 0, "max": 1, "step": 0.01}}', '', false),
('ImgTypeUnify', '图片格式转换', '将图片编码格式统一为jpg、jpeg、png、bmp格式。', '1.0.0', 'image', 'image', null, '{"imgType": {"name": "图片编码格式", "name_en": "Image Encoding Format", "type": "select", "defaultVal": "jpg", "options": [{"label": "jpg", "label_en": "jpg", "value": "jpg"}, {"label": "png", "label_en": "png", "value": "png"}, {"label": "jpeg", "label_en": "jpeg", "value": "jpeg"}, {"label": "bmp", "label_en": "bmp", "value": "bmp"}]}}', '', false);
-- Link each built-in operator to its categories. Category ids (seeded above):
-- 3 = text, 4 = image, 7 = multimodal, 8 = Python, 11 = system-built-in.
INSERT IGNORE INTO t_operator_category_relation(category_id, operator_id)
VALUES (3, 'TextFormatter'),
(7, 'FileExporter'),
(8, 'TextFormatter'),
(8, 'FileExporter'),
-- text (3) operators
(3, 'FileWithShortOrLongLengthFilter'),
(3, 'FileWithHighRepeatPhraseRateFilter'),
(3, 'FileWithHighRepeatWordRateFilter'),
(3, 'FileWithHighSpecialCharRateFilter'),
(3, 'FileWithManySensitiveWordsFilter'),
(3, 'DuplicateFilesFilter'),
(3, 'DuplicateSentencesFilter'),
(3, 'AnonymizedCreditCardNumber'),
(3, 'AnonymizedIdNumber'),
(3, 'AnonymizedIpAddress'),
(3, 'AnonymizedPhoneNumber'),
(3, 'AnonymizedUrlCleaner'),
(3, 'HtmlTagCleaner'),
(3, 'XMLTagCleaner'),
(3, 'ContentCleaner'),
(3, 'EmailNumberCleaner'),
(3, 'EmojiCleaner'),
(3, 'ExtraSpaceCleaner'),
(3, 'FullWidthCharacterCleaner'),
(3, 'GrableCharactersCleaner'),
(3, 'InvisibleCharactersCleaner'),
(3, 'LegendCleaner'),
(3, 'PoliticalWordCleaner'),
(3, 'SexualAndViolentWordCleaner'),
(3, 'TraditionalChineseCleaner'),
(3, 'UnicodeSpaceCleaner'),
-- image (4) operators
(4, 'ImgFormatter'),
(4, 'ImgBlurredImagesCleaner'),
(4, 'ImgBrightness'),
(4, 'ImgContrast'),
(4, 'ImgDenoise'),
(4, 'ImgDuplicatedImagesCleaner'),
(4, 'ImgPerspectiveTransformation'),
(4, 'ImgResize'),
(4, 'ImgSaturation'),
(4, 'ImgShadowRemove'),
(4, 'ImgSharpness'),
(4, 'ImgSimilarImagesCleaner'),
(4, 'ImgTypeUnify'),
-- Python (8) language tag for text operators
(8, 'FileWithShortOrLongLengthFilter'),
(8, 'FileWithHighRepeatPhraseRateFilter'),
(8, 'FileWithHighRepeatWordRateFilter'),
(8, 'FileWithHighSpecialCharRateFilter'),
(8, 'FileWithManySensitiveWordsFilter'),
(8, 'DuplicateFilesFilter'),
(8, 'DuplicateSentencesFilter'),
(8, 'AnonymizedCreditCardNumber'),
(8, 'AnonymizedIdNumber'),
(8, 'AnonymizedIpAddress'),
(8, 'AnonymizedPhoneNumber'),
(8, 'AnonymizedUrlCleaner'),
(8, 'HtmlTagCleaner'),
(8, 'XMLTagCleaner'),
(8, 'ContentCleaner'),
(8, 'EmailNumberCleaner'),
(8, 'EmojiCleaner'),
(8, 'ExtraSpaceCleaner'),
(8, 'FullWidthCharacterCleaner'),
(8, 'GrableCharactersCleaner'),
(8, 'InvisibleCharactersCleaner'),
(8, 'LegendCleaner'),
(8, 'PoliticalWordCleaner'),
(8, 'SexualAndViolentWordCleaner'),
(8, 'TraditionalChineseCleaner'),
(8, 'UnicodeSpaceCleaner'),
-- system-built-in (11) tag for every operator
(11, 'TextFormatter'),
(11, 'FileExporter'),
(11, 'FileWithShortOrLongLengthFilter'),
(11, 'FileWithHighRepeatPhraseRateFilter'),
(11, 'FileWithHighRepeatWordRateFilter'),
(11, 'FileWithHighSpecialCharRateFilter'),
(11, 'FileWithManySensitiveWordsFilter'),
(11, 'DuplicateFilesFilter'),
(11, 'DuplicateSentencesFilter'),
(11, 'AnonymizedCreditCardNumber'),
(11, 'AnonymizedIdNumber'),
(11, 'AnonymizedIpAddress'),
(11, 'AnonymizedPhoneNumber'),
(11, 'AnonymizedUrlCleaner'),
(11, 'HtmlTagCleaner'),
(11, 'XMLTagCleaner'),
(11, 'ContentCleaner'),
(11, 'EmailNumberCleaner'),
(11, 'EmojiCleaner'),
(11, 'ExtraSpaceCleaner'),
(11, 'FullWidthCharacterCleaner'),
(11, 'GrableCharactersCleaner'),
(11, 'InvisibleCharactersCleaner'),
(11, 'LegendCleaner'),
(11, 'PoliticalWordCleaner'),
(11, 'SexualAndViolentWordCleaner'),
(11, 'TraditionalChineseCleaner'),
(11, 'UnicodeSpaceCleaner'),
(11, 'ImgFormatter'),
(11, 'ImgBlurredImagesCleaner'),
(11, 'ImgBrightness'),
(11, 'ImgContrast'),
(11, 'ImgDenoise'),
(11, 'ImgDuplicatedImagesCleaner'),
(11, 'ImgPerspectiveTransformation'),
(11, 'ImgResize'),
(11, 'ImgSaturation'),
(11, 'ImgShadowRemove'),
(11, 'ImgSharpness'),
(11, 'ImgSimilarImagesCleaner'),
(11, 'ImgTypeUnify');

View File

@@ -0,0 +1,45 @@
# --- Stage 1: build Alibaba DataX (requires JDK 8) ---
FROM maven:3-openjdk-8-slim AS datax-builder
# Use Aliyun apt mirrors to speed up package installation in CN networks
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    apt-get update && \
    apt-get install -y git && \
    git clone https://github.com/alibaba/DataX.git
# Overlay local customizations onto the cloned DataX tree
COPY runtime/datax/ DataX/
# Switch the RDBMS plugin to the MySQL Connector/J 8 driver class, then build
RUN cd DataX && \
    sed -i "s/com.mysql.jdbc.Driver/com.mysql.cj.jdbc.Driver/g" \
    plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/util/DataBaseType.java && \
    mvn -U clean package assembly:assembly -Dmaven.test.skip=true

# --- Stage 2: build the backend (JDK 21) ---
FROM maven:3-amazoncorretto-21-debian AS builder
COPY backend/ /opt/backend
COPY scripts/images/backend/settings.xml /opt/backend
RUN cd /opt/backend && \
    mvn -U clean package -s settings.xml -Dmaven.test.skip=true

# --- Stage 3: runtime image ---
FROM openjdk:21-jdk-slim
# nfs-common/rsync support mounted data volumes; python tooling is used by operators
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources && \
    apt-get update && \
    apt-get install -y vim wget curl nfs-common rsync python3 python3-pip python-is-python3 && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
    # fix: path was misspelled "/var/lib/apy/lists/*", so the apt metadata
    # cache was never actually removed and bloated the image
COPY --from=builder /opt/backend/services/main-application/target/data-mate.jar /opt/backend/data-mate.jar
COPY --from=datax-builder /DataX/target/datax/datax /opt/datax
COPY editions/community/config/application.yml /opt/backend/application.yml
COPY editions/community/config/log4j2.xml /opt/backend/log4j2.xml
COPY scripts/images/backend/start.sh /opt/backend/start.sh
# Set container timezone to Asia/Shanghai to match -Duser.timezone below
RUN chmod +x /opt/backend/start.sh \
    && ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
# start.sh performs setup (e.g. rpcbind) then exec's the CMD
ENTRYPOINT ["/opt/backend/start.sh"]
CMD ["java", "-Duser.timezone=Asia/Shanghai", "-jar", "/opt/backend/data-mate.jar"]

View File

@@ -0,0 +1,68 @@
<?xml version="1.0" encoding="UTF-8"?>
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/SETTINGS/1.0.0 https://maven.apache.org/xsd/settings-1.0.0.xsd">
    <!-- Local repository path (optional; defaults to ~/.m2/repository) -->
    <localRepository>${user.home}/.m2/repository</localRepository>
    <!-- Aliyun mirror configuration -->
    <mirrors>
        <mirror>
            <id>aliyun-maven</id>
            <name>Aliyun Maven Repository</name>
            <url>https://maven.aliyun.com/repository/public</url>
            <mirrorOf>central,jcenter,google,spring,spring-plugin,gradle-plugin</mirrorOf>
        </mirror>
    </mirrors>
    <!-- Java 21 compiler settings (optional but recommended) -->
    <profiles>
        <profile>
            <id>java21</id>
            <activation>
                <activeByDefault>true</activeByDefault>
                <jdk>21</jdk>
            </activation>
            <properties>
                <maven.compiler.source>21</maven.compiler.source>
                <maven.compiler.target>21</maven.compiler.target>
                <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
            </properties>
        </profile>
        <!-- Aliyun repositories (optional; improves dependency resolution) -->
        <profile>
            <id>aliyun-repos</id>
            <repositories>
                <repository>
                    <id>aliyun-public</id>
                    <name>Aliyun Public Repository</name>
                    <url>https://maven.aliyun.com/repository/public</url>
                    <releases>
                        <enabled>true</enabled>
                    </releases>
                    <snapshots>
                        <enabled>false</enabled> <!-- snapshot versions disabled by default -->
                    </snapshots>
                </repository>
            </repositories>
            <pluginRepositories>
                <pluginRepository>
                    <id>aliyun-plugin</id>
                    <name>Aliyun Plugin Repository</name>
                    <url>https://maven.aliyun.com/repository/public</url>
                    <releases>
                        <enabled>true</enabled>
                    </releases>
                    <snapshots>
                        <enabled>false</enabled>
                    </snapshots>
                </pluginRepository>
            </pluginRepositories>
        </profile>
    </profiles>
    <activeProfiles>
        <activeProfile>aliyun-repos</activeProfile> <!-- activate the Aliyun repositories -->
        <activeProfile>java21</activeProfile> <!-- activate the Java 21 profile -->
    </activeProfiles>
</settings>

View File

@@ -0,0 +1,8 @@
#!/bin/bash
# Container entrypoint: start the RPC port mapper, then hand off to the CMD.
set -e
# rpcbind is required for the NFS client tooling installed in the image.
rpcbind
echo "Starting main application..."
# Replace the shell with the container CMD so signals reach the main process.
exec "$@"

View File

@@ -0,0 +1,33 @@
# --- Stage 1: build DataX (upstream requires JDK 8) ---
FROM maven:3-openjdk-8-slim AS builder
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    apt-get update && \
    apt-get install -y git && \
    git clone https://github.com/alibaba/DataX.git
COPY runtime/datax/ DataX/
# Switch DataX to the MySQL Connector/J 8 driver class, then build the assembly.
RUN cd DataX && \
    sed -i "s/com.mysql.jdbc.Driver/com.mysql.cj.jdbc.Driver/g" \
        plugin-rdbms-util/src/main/java/com/alibaba/datax/plugin/rdbms/util/DataBaseType.java && \
    mvn -U clean package assembly:assembly -Dmaven.test.skip=true

# --- Stage 2: JDK 8 runtime plus a small FastAPI wrapper (app.py) ---
# Fix: the cleanup path was "/var/lib/apy/lists/*" (typo), which silently
# left the apt package cache in the final image.
FROM openjdk:8-jdk-slim
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list && \
    apt-get update && \
    apt-get install -y python3 python3-pip python-is-python3 vim wget curl nfs-common rsync && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Use the Aliyun PyPI mirror and install the HTTP service dependencies.
RUN pip config --user set global.index-url https://mirrors.aliyun.com/pypi/simple && \
    pip config --user set global.trusted-host mirrors.aliyun.com && \
    pip install fastapi uvicorn[standard] && \
    pip cache purge
COPY --from=builder /DataX/target/datax/datax /opt/datax
COPY scripts/images/datax/app.py /opt/datax/bin/app.py

View File

@@ -0,0 +1,52 @@
import subprocess
import tempfile
from fastapi import FastAPI
from pydantic import BaseModel
# FastAPI application exposing the DataX job runner over HTTP.
app = FastAPI(title="datax")
class CreateJobParam(BaseModel):
    # Full DataX job description (JSON) passed verbatim to datax.py.
    content: str
@app.post("/process", tags=["run datax.py"])
async def process(job: CreateJobParam):
    """Run a DataX job.

    Writes the submitted job JSON to a temporary file, executes
    /opt/datax/bin/datax.py on it, and returns a dict with at least a
    "status" key (0 on success, 408 on timeout, 500 on failure) plus
    captured stdout/stderr when the run fails.
    """
    output = {
        "status": "failed",
    }
    try:
        # Persist the job description to a temporary file for datax.py;
        # delete=True removes it automatically when the context exits.
        with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=True) as f:
            f.write(job.content)
            # Flush explicitly so the subprocess sees the full content on disk.
            f.flush()
            cmd_args = ["python3", "/opt/datax/bin/datax.py", f.name]
            # check=True raises CalledProcessError on any non-zero exit code,
            # so reaching the line after run() implies returncode == 0.
            # The timeout makes the TimeoutExpired handler below reachable
            # (previously no timeout was set, so that branch was dead code).
            result = subprocess.run(
                cmd_args,
                capture_output=True,
                text=True,
                check=True,
                timeout=86400,  # 24h guard against runaway jobs
            )
            output["status"] = result.returncode
    except subprocess.TimeoutExpired as e:
        output["status"] = 408
        output["stderr"] = f"The script execution timed out: {e.stderr}"
    except subprocess.CalledProcessError as e:
        output["status"] = 500
        # Include both streams: DataX reports errors on stdout and stderr.
        output["stdout"] = e.stdout
        output["stderr"] = f"Script execution failed: {e.stderr}"
    except Exception as e:
        output["status"] = 500
        output["stderr"] = f"Server error: {str(e)}"
    return output
# Dev/standalone entrypoint; serve the app on all interfaces, port 8000.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

View File

@@ -0,0 +1,17 @@
# --- Build stage: compile the frontend bundle with Node 18 ---
FROM node:18-alpine AS builder
WORKDIR /app
COPY frontend ./
# Prefer reproducible installs (npm ci) when a lockfile is present.
RUN if [ -f package-lock.json ]; then npm ci; else npm install; fi && \
    npm run build
# --- Runtime stage: serve the static bundle with nginx ---
FROM nginx:1.29 AS runner
COPY --from=builder /app/dist /opt/frontend
COPY scripts/images/frontend/edm.conf /etc/nginx/conf.d/default.conf
# Set the container timezone to Asia/Shanghai.
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
CMD ["nginx", "-g", "daemon off;"]

View File

@@ -0,0 +1,16 @@
server {
    # Plain HTTP on port 80.
    listen 80;
    server_name 0.0.0.0;
    # Proxy API traffic to the backend service (compose/k8s hostname "backend").
    location /api/ {
        proxy_pass http://backend:8080/api/;
        # Preserve the original host and client address for the backend.
        proxy_set_header Host $host;
        proxy_set_header X-Real-IP $remote_addr;
        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    }
    # Serve the SPA; fall back to index.html for client-side routes.
    location / {
        root /opt/frontend;
        try_files $uri $uri/ /index.html;
    }
}

View File

@@ -0,0 +1,22 @@
# MinerU document-parsing image on Python 3.10.
FROM python:3.10-slim
# System deps: curl/vim/procps for debugging; libgl/glib are required by OpenCV.
RUN apt-get update && \
    apt-get install -y curl vim libgl1 libgl1-mesa-glx libglib2.0-0 procps && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*
# Use the Aliyun PyPI mirror; install MinerU with CPU builds of torch/torchvision
# plus pinned NPU-related packages.
RUN pip config --user set global.index-url https://mirrors.aliyun.com/pypi/simple && \
    pip config --user set global.trusted-host mirrors.aliyun.com && \
    pip install --upgrade setuptools && \
    pip install -U 'mineru[core]==2.1.0' --break-system-packages && \
    pip install torch==2.7.1+cpu -f https://download.pytorch.org/whl/torch/ && \
    pip install torchvision==0.22.1+cpu -f https://download.pytorch.org/whl/torchvision && \
    pip install requests==2.27.1 torch_npu==2.7.1rc1 numpy==1.26.0 decorator==5.2.1 einops==0.8.1 attrs==25.3.0 && \
    pip cache purge
# NOTE(review): disables curl's CA bundle (TLS verification) — presumably a
# mirror/proxy workaround; confirm this is intentional.
ENV CURL_CA_BUNDLE=""
ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
# Pre-download all MinerU models from ModelScope at build time.
RUN /bin/bash -c "mineru-models-download -s modelscope -m all"
# Serve models from the local cache downloaded above.
ENV MINERU_MODEL_SOURCE=local

View File

@@ -0,0 +1,24 @@
# Runtime image for the Python operator executor.
FROM python:3.11
COPY runtime/python-executor /opt/runtime
COPY runtime/ops /opt/runtime/datamate/ops
ENV PYTHONPATH=/opt/runtime/datamate/
# System deps: OpenCV libs, PDF/OCR tooling (poppler, tesseract + Chinese data), libmagic.
RUN sed -i 's/deb.debian.org/mirrors.huaweicloud.com/g' /etc/apt/sources.list.d/debian.sources \
    && apt update \
    && apt install -y libgl1 libglib2.0-0 vim poppler-utils tesseract-ocr tesseract-ocr-chi-sim libmagic1t64 \
    && apt clean \
    && rm -rf /var/lib/apt/lists/*
WORKDIR /opt/runtime
ENV HF_HUB_DISABLE_XET=1
# Install the executor package and operator requirements, then pre-fetch
# NLTK data and the default unstructured-inference model at build time.
RUN pip install -e . -i https://mirrors.huaweicloud.com/repository/pypi/simple \
    && pip install -r /opt/runtime/datamate/ops/requirements.txt -i https://mirrors.huaweicloud.com/repository/pypi/simple \
    && pip cache purge \
    && python -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" \
    && python -c "from unstructured_inference.models.base import get_model; get_model()"
# Set the container timezone to Asia/Shanghai.
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime

View File

@@ -0,0 +1,9 @@
# Wrap the upstream unstructured image with a small FastAPI HTTP service.
FROM downloads.unstructured.io/unstructured-io/unstructured
# Use the Huawei Cloud PyPI mirror and install the web stack.
RUN pip config --user set global.index https://mirrors.huaweicloud.com/repository/pypi && \
    pip config --user set global.index-url https://mirrors.huaweicloud.com/repository/pypi/simple && \
    pip config --user set global.trusted-host mirrors.huaweicloud.com && \
    pip install fastapi uvicorn && \
    pip cache purge
COPY scripts/images/unstructured/app.py /app/app.py

View File

@@ -0,0 +1,61 @@
import asyncio
import os
from typing import Optional
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from unstructured.partition.auto import partition
# FastAPI application exposing `unstructured` partitioning over HTTP.
app = FastAPI(title="unstructured")
class FileProcessingRequest(BaseModel):
    """Request model for file processing."""
    # Path (inside the container) of the file to partition.
    file_path: Optional[str] = None
    # Additional optional fields can be added here.
@app.post("/process", tags=["文件处理"])
async def process_file(request_data: FileProcessingRequest):
    """Process a file and return its extracted text content.

    Returns a dict with the file path, extracted text and a "success"
    status. Raises HTTP 400 when ``file_path`` is missing, 404 when the
    path does not exist, and 500 on any processing error.
    """
    try:
        file_path = request_data.file_path
        if not file_path:
            # Fix: the message previously said "filePath", which does not
            # match the actual request field name "file_path".
            raise HTTPException(status_code=400, detail="缺少必要参数: file_path")
        if not os.path.exists(file_path):
            raise HTTPException(status_code=404, detail=f"文件不存在: {file_path}")
        # Run the potentially slow partitioning off the event loop.
        text_content = await process_file_async(file_path)
        return {
            "filePath": file_path,
            "text": text_content,
            "status": "success"
        }
    except HTTPException:
        # Re-raise unchanged so FastAPI keeps the intended status code.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"process failed: {str(e)}")
async def process_file_async(file_path: str) -> str:
    """Partition the file in a worker thread without blocking the event loop."""
    # get_running_loop() is the supported API inside a coroutine;
    # get_event_loop() is deprecated in that context since Python 3.10.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, partition_file_sync, file_path)
def partition_file_sync(file_path: str) -> str:
    """Partition a file synchronously and join the element texts.

    Called from a worker thread by `process_file_async`.
    """
    parsed_elements = partition(filename=file_path)
    # Separate elements with a blank line, matching document paragraphs.
    return "\n\n".join(str(element) for element in parsed_elements)
# Dev/standalone entrypoint; serve the app on all interfaces, port 8000.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

103
scripts/save_images.sh Normal file
View File

@@ -0,0 +1,103 @@
#!/bin/bash
# ==========================================================
# Step 1: helper functions
# ==========================================================
# Script name, interpolated into the usage text below.
SCRIPT_NAME=$(basename "$0")
# Print the usage/help text. The heredoc content is user-facing runtime
# output and is intentionally left in its original language.
help_message() {
cat << EOF
Usage: $SCRIPT_NAME [-d TARGET_DIR] [-h|--help]
描述:
  将预定义的 Docker 镜像列表保存为 .tar 文件。
选项:
  -d TARGET_DIR    指定保存镜像的目标目录。
                   (绝对路径或相对路径)
                   如果未指定,将使用默认路径: $TARGET_DIR_DEFAULT
  -h, --help       显示此帮助信息并退出。
示例:
  # 使用默认目录 (./dist)
  $SCRIPT_NAME
  # 指定保存到 /tmp/my-archive 目录
  $SCRIPT_NAME -d /tmp/my-archive
EOF
}
# ==========================================================
# Step 2: defaults and argument parsing
# ==========================================================
# Default target directory.
TARGET_DIR_DEFAULT="./dist"
TARGET_DIR="$TARGET_DIR_DEFAULT"
# Parse command-line options with getopts.
# "d:" means -d requires an argument (the target directory).
while getopts "d:h" opt; do
    case ${opt} in
        d )
            # -d given: use its argument as the target directory.
            TARGET_DIR="$OPTARG"
            ;;
        h )
            # -h given: show help and exit.
            help_message
            exit 0
            ;;
        \? )
            # Invalid option.
            echo "错误:无效选项 -$OPTARG" >&2
            help_message
            exit 1
            ;;
    esac
done
# Shift past parsed options (best practice, though this script takes no
# further positional arguments).
shift $((OPTIND -1))
# ==========================================================
# Step 3: core logic
# ==========================================================
# Ensure the target directory exists (creates intermediate dirs as needed).
if ! mkdir -p "$TARGET_DIR"; then
    echo "❌ 致命错误:无法创建目标目录: $TARGET_DIR" >&2
    exit 1
fi
echo "目标目录已确认/创建: $TARGET_DIR"
echo "----------------------------------------"
# Image list
images=("frontend:latest" "backend:latest" "runtime:latest" "mysql:8")
# Track failures so the script can exit non-zero if any save fails
# (previously it always exited 0, even after reporting failures).
failed=0
for image in "${images[@]}"; do
    # Build a filesystem-safe name by replacing ':' with '_'.
    safe_name="${image//[:]/_}"
    # Full output path for this image archive.
    output_path="$TARGET_DIR/$safe_name.tar"
    echo "正在保存镜像 $image"
    echo " -> 到文件 $output_path"
    # Test the command directly instead of inspecting $? afterwards.
    if docker save -o "$output_path" "$image"; then
        echo "✅ 保存成功。"
    else
        echo "❌ 保存失败!"
        failed=$((failed + 1))
    fi
    echo ""
done
# Propagate failure to the caller.
if [ "$failed" -gt 0 ]; then
    exit 1
fi