init datamate

This commit is contained in:
Dallas98
2025-10-21 23:00:48 +08:00
commit 1c97afed7d
692 changed files with 135442 additions and 0 deletions

View File

@@ -0,0 +1,55 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven module descriptor for the API gateway service. -->
<modelVersion>4.0.0</modelVersion>
<!-- Inherits from the platform parent POM located one directory up. -->
<!-- No dependency or plugin below declares a version; presumably they are managed by the parent (TODO confirm). -->
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>api-gateway</artifactId>
<name>API Gateway</name>
<description>API网关服务</description>
<dependencies>
<!-- Spring Cloud Gateway starter: provides the route locator API used to declare routes in Java. -->
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-gateway</artifactId>
</dependency>
<!-- Reactive Redis starter. NOTE(review): presumably intended for gateway rate limiting; no usage is visible here, confirm. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis-reactive</artifactId>
</dependency>
<!-- Spring Security starter. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-security</artifactId>
</dependency>
<!-- Actuator starter (operational endpoints such as health checks). -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-actuator</artifactId>
</dependency>
<!-- Client-side load balancer: needed so that route URIs using the lb:// scheme can be resolved. -->
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-loadbalancer</artifactId>
</dependency>
<!-- Testing starter; test scope only, not part of the runtime artifact. -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Spring Boot Maven plugin; declared without local configuration, so any settings come from the parent. -->
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,77 @@
package com.datamate.gateway;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.cloud.gateway.route.RouteLocator;
import org.springframework.cloud.gateway.route.builder.RouteLocatorBuilder;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ComponentScan;
/**
* API Gateway & Auth Service Application
* 统一的API网关和认证授权微服务
* 提供路由、鉴权、限流等功能
*/
@SpringBootApplication
@ComponentScan(basePackages = {
"com.datamate.gateway",
"com.datamate.shared"
})
public class ApiGatewayApplication {
public static void main(String[] args) {
SpringApplication.run(ApiGatewayApplication.class, args);
}
@Bean
public RouteLocator customRouteLocator(RouteLocatorBuilder builder) {
return builder.routes()
// 数据归集服务路由
.route("data-collection", r -> r.path("/api/data-collection/**")
.uri("lb://data-collection-service"))
// 数据管理服务路由
.route("data-management", r -> r.path("/api/data-management/**")
.uri("lb://data-management-service"))
// 算子市场服务路由
.route("operator-market", r -> r.path("/api/operators/**")
.uri("lb://operator-market-service"))
// 数据清洗服务路由
.route("data-cleaning", r -> r.path("/api/cleaning/**")
.uri("lb://data-cleaning-service"))
// 数据合成服务路由
.route("data-synthesis", r -> r.path("/api/synthesis/**")
.uri("lb://data-synthesis-service"))
// 数据标注服务路由
.route("data-annotation", r -> r.path("/api/annotation/**")
.uri("lb://data-annotation-service"))
// 数据评估服务路由
.route("data-evaluation", r -> r.path("/api/evaluation/**")
.uri("lb://data-evaluation-service"))
// 流程编排服务路由
.route("pipeline-orchestration", r -> r.path("/api/pipelines/**")
.uri("lb://pipeline-orchestration-service"))
// 执行引擎服务路由
.route("execution-engine", r -> r.path("/api/execution/**")
.uri("lb://execution-engine-service"))
// 认证服务路由
.route("auth-service", r -> r.path("/api/auth/**")
.uri("lb://auth-service"))
// RAG服务路由
.route("rag-indexer", r -> r.path("/api/rag/indexer/**")
.uri("lb://rag-indexer-service"))
.route("rag-query", r -> r.path("/api/rag/query/**")
.uri("lb://rag-query-service"))
.build();
}
}

147
backend/openapi/README.md Normal file
View File

@@ -0,0 +1,147 @@
# OpenAPI Code Generation Configuration
基于 OpenAPI YAML 规范生成 API 代码的配置说明。
## Maven Plugin Configuration for Spring Boot
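在各个服务的 pom.xml 中添加以下插件配置。注意:`apiPackage`/`modelPackage` 中的 `${project.name}` 取自 pom 的 `<name>` 元素,若其值包含空格或连字符(例如 `API Gateway`),拼出的结果不是合法的 Java 包名,此时请改为显式填写包名: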
```xml
<plugin>
<groupId>org.openapitools</groupId>
<artifactId>openapi-generator-maven-plugin</artifactId>
<version>6.6.0</version>
<executions>
<execution>
<goals>
<goal>generate</goal>
</goals>
<configuration>
<inputSpec>${project.basedir}/../../openapi/specs/${project.artifactId}.yaml</inputSpec>
<generatorName>spring</generatorName>
<output>${project.build.directory}/generated-sources/openapi</output>
<apiPackage>com.datamate.${project.name}.interfaces.api</apiPackage>
<modelPackage>com.datamate.${project.name}.interfaces.dto</modelPackage>
<configOptions>
<interfaceOnly>true</interfaceOnly>
<useTags>true</useTags>
<skipDefaultInterface>true</skipDefaultInterface>
<hideGenerationTimestamp>true</hideGenerationTimestamp>
<java8>true</java8>
<dateLibrary>java8</dateLibrary>
<useBeanValidation>true</useBeanValidation>
<performBeanValidation>true</performBeanValidation>
<useSpringBoot3>true</useSpringBoot3>
<documentationProvider>springdoc</documentationProvider>
</configOptions>
</configuration>
</execution>
</executions>
</plugin>
```
## Gradle Plugin Configuration (Alternative)
如果使用 Gradle,可以使用以下配置:
```gradle
plugins {
id 'org.openapi.generator' version '6.6.0'
}
openApiGenerate {
generatorName = "spring"
inputSpec = "$rootDir/openapi/specs/${project.name}.yaml"
outputDir = "$buildDir/generated-sources/openapi"
apiPackage = "com.datamate.${project.name}.interfaces.api"
modelPackage = "com.datamate.${project.name}.interfaces.dto"
configOptions = [
interfaceOnly: "true",
useTags: "true",
skipDefaultInterface: "true",
hideGenerationTimestamp: "true",
java8: "true",
dateLibrary: "java8",
useBeanValidation: "true",
performBeanValidation: "true",
useSpringBoot3: "true",
documentationProvider: "springdoc"
]
}
```
## Frontend TypeScript Client Generation
为前端生成 TypeScript 客户端:
```bash
# 安装 OpenAPI Generator CLI
npm install -g @openapitools/openapi-generator-cli
# 生成TypeScript客户端
openapi-generator-cli generate \
-i openapi/specs/data-annotation-service.yaml \
-g typescript-axios \
-o frontend/packages/api-client/src/generated/annotation \
--additional-properties=supportsES6=true,npmName=@datamate/annotation-api,npmVersion=1.0.0
```
## Usage in Services
在各个服务中使用生成的代码:
1. **在 interfaces 层实现生成的API接口**
```java
// REST controller that implements the generated AnnotationTasksApi interface.
// No class-level @RequestMapping is declared: the generated interface already carries
// the request mappings taken from the spec paths (e.g. /api/v1/annotation/tasks), and a
// class-level prefix would be prepended to them, producing a duplicated path.
@RestController
public class AnnotationTaskController implements AnnotationTasksApi {

    private final AnnotationTaskApplicationService annotationTaskService;

    // Constructor injection; the final field must be assigned here for the class to compile.
    public AnnotationTaskController(AnnotationTaskApplicationService annotationTaskService) {
        this.annotationTaskService = annotationTaskService;
    }

    // Returns one page of annotation tasks, optionally filtered by status.
    @Override
    public ResponseEntity<AnnotationTaskPageResponse> getAnnotationTasks(
            Integer page, Integer size, String status) {
        // Business logic lives in the application layer; the controller only delegates.
        return ResponseEntity.ok(annotationTaskService.getTasks(page, size, status));
    }
}
```
2. **在 application 层使用生成的DTO**
```java
@Service
public class AnnotationTaskApplicationService {
public AnnotationTaskPageResponse getTasks(Integer page, Integer size, String status) {
// 业务逻辑实现
// 使用生成的DTO类型
}
}
```
## Build Integration
构建集成脚本位置:`scripts/build/generate-api.sh`
```bash
#!/bin/bash
# Generates the API code for every listed service by running the openapi-generator
# Maven goal against each service's pom.xml.
# The pom paths are relative to the current directory (presumably the repository
# root - confirm before wiring this into CI).

# Abort on the first failing command, unset variable or failed pipeline stage, so the
# success message at the end is only printed when every service was generated.
set -euo pipefail

# Services to process; each entry is a directory name under backend/services/.
SERVICES=(
    "data-annotation-service"
    "data-management-service"
    "operator-market-service"
    "data-cleaning-service"
    "data-synthesis-service"
    "data-evaluation-service"
    "pipeline-orchestration-service"
    "execution-engine-service"
    "rag-indexer-service"
    "rag-query-service"
    "api-gateway"
    "auth-service"
)

for service in "${SERVICES[@]}"; do
    echo "Generating API for $service..."
    # Quoted so an unexpected character in a service name cannot split the argument.
    mvn -f "backend/services/${service}/pom.xml" openapi-generator:generate
done

echo "All APIs generated successfully!"
```

View File

@@ -0,0 +1,298 @@
openapi: 3.0.3
info:
title: Data Annotation Service API
description: 数据标注服务API - 智能预标注、人工平台、主动学习
version: 1.0.0
contact:
name: Data Mate Platform Team
servers:
- url: http://localhost:8080
description: Development server
tags:
- name: annotation-tasks
description: 标注任务管理
- name: annotation-data
description: 标注数据管理
- name: pre-annotation
description: 智能预标注
- name: active-learning
description: 主动学习
paths:
/api/v1/annotation/tasks:
get:
tags:
- annotation-tasks
summary: 获取标注任务列表
description: 分页获取标注任务列表
parameters:
- name: page
in: query
description: 页码
schema:
type: integer
default: 0
- name: size
in: query
description: 每页大小
schema:
type: integer
default: 20
- name: status
in: query
description: 任务状态
schema:
type: string
enum: [PENDING, IN_PROGRESS, COMPLETED, PAUSED]
responses:
'200':
description: 获取成功
content:
application/json:
schema:
$ref: '#/components/schemas/AnnotationTaskPageResponse'
'400':
description: 请求参数错误
'500':
description: 服务器内部错误
post:
tags:
- annotation-tasks
summary: 创建标注任务
description: 创建新的标注任务
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/CreateAnnotationTaskRequest'
responses:
'201':
description: 创建成功
content:
application/json:
schema:
$ref: '#/components/schemas/AnnotationTaskResponse'
'400':
description: 请求参数错误
'500':
description: 服务器内部错误
/api/v1/annotation/tasks/{taskId}:
get:
tags:
- annotation-tasks
summary: 获取标注任务详情
parameters:
- name: taskId
in: path
required: true
schema:
type: string
responses:
'200':
description: 获取成功
content:
application/json:
schema:
$ref: '#/components/schemas/AnnotationTaskResponse'
'404':
description: 任务不存在
put:
tags:
- annotation-tasks
summary: 更新标注任务
parameters:
- name: taskId
in: path
required: true
schema:
type: string
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/UpdateAnnotationTaskRequest'
responses:
'200':
description: 更新成功
content:
application/json:
schema:
$ref: '#/components/schemas/AnnotationTaskResponse'
/api/v1/annotation/pre-annotate:
post:
tags:
- pre-annotation
summary: 智能预标注
description: 使用AI模型进行智能预标注
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/PreAnnotationRequest'
responses:
'200':
description: 预标注成功
content:
application/json:
schema:
$ref: '#/components/schemas/PreAnnotationResponse'
components:
schemas:
AnnotationTaskResponse:
type: object
properties:
id:
type: string
description: 任务ID
name:
type: string
description: 任务名称
description:
type: string
description: 任务描述
type:
type: string
enum: [TEXT_CLASSIFICATION, NAMED_ENTITY_RECOGNITION, OBJECT_DETECTION, SEMANTIC_SEGMENTATION]
description: 标注类型
status:
type: string
enum: [PENDING, IN_PROGRESS, COMPLETED, PAUSED]
description: 任务状态
datasetId:
type: string
description: 数据集ID
progress:
type: number
format: double
description: 进度百分比
createdAt:
type: string
format: date-time
description: 创建时间
updatedAt:
type: string
format: date-time
description: 更新时间
CreateAnnotationTaskRequest:
type: object
required:
- name
- type
- datasetId
properties:
name:
type: string
description: 任务名称
description:
type: string
description: 任务描述
type:
type: string
enum: [TEXT_CLASSIFICATION, NAMED_ENTITY_RECOGNITION, OBJECT_DETECTION, SEMANTIC_SEGMENTATION]
description: 标注类型
datasetId:
type: string
description: 数据集ID
configuration:
type: object
description: 标注配置
UpdateAnnotationTaskRequest:
type: object
properties:
name:
type: string
description: 任务名称
description:
type: string
description: 任务描述
status:
type: string
enum: [PENDING, IN_PROGRESS, COMPLETED, PAUSED]
description: 任务状态
AnnotationTaskPageResponse:
type: object
properties:
content:
type: array
items:
$ref: '#/components/schemas/AnnotationTaskResponse'
totalElements:
type: integer
format: int64
totalPages:
type: integer
size:
type: integer
number:
type: integer
PreAnnotationRequest:
type: object
required:
- taskId
- dataIds
properties:
taskId:
type: string
description: 标注任务ID
dataIds:
type: array
items:
type: string
description: 待预标注的数据ID列表
modelId:
type: string
description: 预标注模型ID
confidence:
type: number
format: double
description: 置信度阈值
PreAnnotationResponse:
type: object
properties:
taskId:
type: string
description: 任务ID
processedCount:
type: integer
description: 已处理数据数量
successCount:
type: integer
description: 成功预标注数量
results:
type: array
items:
type: object
properties:
dataId:
type: string
annotations:
type: array
items:
type: object
confidence:
type: number
format: double
securitySchemes:
BearerAuth:
type: http
scheme: bearer
bearerFormat: JWT
security:
- BearerAuth: []

View File

@@ -0,0 +1,491 @@
openapi: 3.0.3
info:
title: Data Cleaning Service API
description: 数据清洗服务API - 策略/规则、流程编排对接
version: 1.0.0
contact:
name: Data Mate Platform Team
servers:
- url: http://localhost:8084
description: Development server
tags:
- name: CleaningTask
description: 数据清洗任务管理
- name: CleaningTemplate
description: 数据清洗模板管理
paths:
/ray/log:
get:
summary: 获取ray日志文件
deprecated: false
description: ''
tags: [ ]
parameters: [ ]
responses:
'200':
description: ''
content:
application/json:
schema:
type: object
properties: { }
headers: { }
security: [ ]
/cleaning/tasks:
get:
summary: 查询数据清洗任务列表
deprecated: false
description: 获取所有数据清洗任务或根据查询参数筛选任务。
tags:
- CleaningTask
parameters:
- name: status
in: query
description: 根据任务状态筛选 (e.g., pending, running, completed, failed)
required: false
schema:
type: string
- name: keywords
in: query
description: 关键字
required: false
schema:
type: string
- name: page
in: query
description: 页码
required: true
schema:
type: integer
- name: size
in: query
description: 每页大小
required: true
schema:
type: integer
responses:
'200':
description: 成功获取任务列表
content:
application/json:
schema:
type: array
items: &ref_1
$ref: '#/components/schemas/CleaningTask'
headers: { }
security: [ ]
post:
summary: 创建新的数据清洗任务
deprecated: false
description: 可以直接创建任务或基于现有模板创建任务。
tags:
- CleaningTask
parameters: [ ]
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/CreateCleaningTaskRequest'
examples: { }
responses:
'201':
description: 任务创建成功
content:
application/json:
schema: *ref_1
headers: { }
security: [ ]
/cleaning/tasks/{taskId}:
get:
summary: 获取单个数据清洗任务详情
deprecated: false
description: 根据任务ID获取任务的详细信息。
tags:
- CleaningTask
parameters:
- name: taskId
in: path
description: 任务的唯一标识符
required: true
example: ''
schema:
type: string
responses:
'200':
description: 成功获取任务详情
content:
application/json:
schema: *ref_1
headers: { }
security: [ ]
delete:
summary: 删除数据清洗任务
deprecated: false
description: 根据任务ID删除指定的任务。
tags:
- CleaningTask
parameters:
- name: taskId
in: path
description: 任务的唯一标识符
required: true
example: ''
schema:
type: string
responses:
'204':
description: 任务删除成功
headers: { }
security: [ ]
/cleaning/templates:
get:
summary: 查询数据清洗模板列表
deprecated: false
description: 获取所有可用的数据清洗模板。
tags:
- CleaningTemplate
parameters: [ ]
responses:
'200':
description: 成功获取模板列表
content:
application/json:
schema:
type: array
items: &ref_2
$ref: '#/components/schemas/CleaningTemplate'
headers: { }
security: [ ]
post:
summary: 创建新的数据清洗模板
deprecated: false
description: 定义一个新的数据清洗模板。
tags:
- CleaningTemplate
parameters: [ ]
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/CreateCleaningTemplateRequest'
responses:
'201':
description: 模板创建成功
content:
application/json:
schema: *ref_2
headers: { }
security: [ ]
/cleaning/templates/{templateId}:
get:
summary: 获取单个数据清洗模板详情
deprecated: false
description: 根据模板ID获取模板的详细信息。
tags:
- CleaningTemplate
parameters:
- name: templateId
in: path
description: 模板的唯一标识符
required: true
example: ''
schema:
type: string
responses:
'200':
description: 成功获取模板详情
content:
application/json:
schema: *ref_2
headers: { }
security: [ ]
put:
summary: 更新数据清洗模板
deprecated: false
description: 根据模板ID更新模板的全部信息。
tags:
- CleaningTemplate
parameters:
- name: templateId
in: path
description: 模板的唯一标识符
required: true
example: ''
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/UpdateCleaningTemplateRequest'
responses:
'200':
description: 模板更新成功
content:
application/json:
schema: *ref_2
headers: { }
security: [ ]
delete:
summary: 删除数据清洗模板
deprecated: false
description: 根据模板ID删除指定的模板。
tags:
- CleaningTemplate
parameters:
- name: templateId
in: path
description: 模板的唯一标识符
required: true
example: ''
schema:
type: string
responses:
'204':
description: 模板删除成功
headers: { }
security: [ ]
components:
schemas:
OperatorInstance:
type: object
properties:
id:
type: string
overrides:
type: object
properties: { }
additionalProperties:
type: object
properties: { }
required:
- id
- overrides
CleaningProcess:
type: object
properties:
process:
type: number
format: float
description: 进度百分比
totalFileNum:
type: integer
description: 总文件数量
finishedFileNum:
type: integer
description: 已完成文件数量
required:
- process
- totalFileNum
- finishedFileNum
OperatorResponse:
type: object
properties:
id:
type: string
description: 算子ID
name:
type: string
description: 算子名称
description:
type: string
description: 算子描述
version:
type: string
description: 算子版本
inputs:
type: string
description: 输入类型
outputs:
type: string
description: 输出类型
runtime:
type: string
description: 运行时设置
settings:
type: string
description: 算子参数
isStar:
type: boolean
description: 是否收藏
createdAt:
type: string
format: date-time
description: 创建时间
updatedAt:
type: string
format: date-time
description: 更新时间
required:
- inputs
- outputs
- runtime
- settings
- isStar
UpdateCleaningTemplateRequest:
type: object
required:
- name
- instance
- id
properties:
id:
type: string
name:
type: string
description: 模板名称
description:
type: string
description: 模板描述
instance:
type: array
items: &ref_3
$ref: '#/components/schemas/OperatorInstance'
description: 模板定义的清洗规则和配置
CreateCleaningTemplateRequest:
type: object
required:
- name
- instance
properties:
name:
type: string
description: 模板名称
description:
type: string
description: 模板描述
instance:
type: array
items: *ref_3
description: 模板定义的清洗规则和配置
CleaningTemplate:
type: object
required:
- id
- name
- instance
- createdAt
properties:
id:
type: string
description: 模板唯一标识符
name:
type: string
description: 模板名称
description:
type: string
description: 模板描述
instance:
type: array
items: &ref_4
$ref: '#/components/schemas/OperatorResponse'
description: 模板定义的清洗规则和配置
createdAt:
type: string
format: date-time
description: 模板创建时间
updatedAt:
type: string
format: date-time
description: 模板最后更新时间
CreateCleaningTaskRequest:
type: object
required:
- name
- instance
- srcDatasetId
- srcDatasetName
- destDatasetName
- destDatasetType
properties:
name:
type: string
description: 任务名称
description:
type: string
description: 任务描述
srcDatasetId:
type: string
srcDatasetName:
type: string
destDatasetName:
type: string
destDatasetType:
type: string
instance:
type: array
items: *ref_3
description: 任务的具体配置(如果非模板创建,则直接定义)
ErrorResponse:
type: object
properties:
error:
type: string
description: 错误类型
message:
type: string
description: 错误详细信息
CleaningTask:
type: object
required:
- id
- name
- status
- createdAt
- startedAt
properties:
id:
type: string
description: 任务唯一标识符
name:
type: string
description: 任务名称
description:
type: string
description: 任务描述
srcDatasetId:
type: string
description: 源数据集id
srcDatasetName:
type: string
description: 源数据集名称
destDatasetId:
type: string
description: 目标数据集id
destDatasetName:
type: string
description: 目标数据集名称
status:
type: string
description: 任务当前状态
enum:
- pending
- running
- completed
- failed
templateId:
type: string
description: 关联的模板ID(如果基于模板创建)
instance:
type: array
items: *ref_4
description: 任务的具体配置(如果非模板创建,则直接定义)
progress:
$ref: '#/components/schemas/CleaningProcess'
createdAt:
type: string
description: 任务创建时间
format: date-time
startedAt:
type: string
format: date-time
description: 任务开始时间
finishedAt:
type: string
format: date-time
description: 任务完成时间
securitySchemes: { }

View File

@@ -0,0 +1,517 @@
openapi: 3.0.3
info:
title: Data Collection Service API
description: |
数据归集服务API,基于数据归集实现数据采集和归集功能。
主要功能:
- 数据归集任务创建和管理
- 数据同步任务执行
- 任务监控和状态查询
- 执行日志查看
version: 1.0.0
servers:
- url: http://localhost:8090/api/v1/collection
description: Development server
tags:
- name: CollectionTask
description: 数据归集任务管理(包括模板查询)
- name: TaskExecution
description: 任务执行管理
paths:
/data-collection/tasks:
get:
operationId: getTasks
tags: [CollectionTask]
summary: 获取归集任务列表
parameters:
- name: page
in: query
schema:
type: integer
default: 0
- name: size
in: query
schema:
type: integer
default: 20
- name: status
in: query
schema:
$ref: '#/components/schemas/TaskStatus'
- name: name
in: query
description: 任务名称关键字搜索
schema:
type: string
responses:
'200':
description: 归集任务列表
content:
application/json:
schema:
$ref: '#/components/schemas/PagedCollectionTaskSummary'
post:
operationId: createTask
tags: [CollectionTask]
summary: 创建归集任务
description: 创建新的数据归集任务
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/CreateCollectionTaskRequest'
responses:
'201':
description: 归集任务创建成功
content:
application/json:
schema:
$ref: '#/components/schemas/CollectionTaskResponse'
/data-collection/tasks/{id}:
get:
operationId: getTaskDetail
tags: [CollectionTask]
summary: 获取归集任务详情
parameters:
- name: id
in: path
required: true
schema:
type: string
responses:
'200':
description: 归集任务详情
content:
application/json:
schema:
$ref: '#/components/schemas/CollectionTaskResponse'
'404':
description: 归集任务不存在
put:
operationId: updateTask
tags: [CollectionTask]
summary: 更新归集任务
parameters:
- name: id
in: path
required: true
schema:
type: string
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/UpdateCollectionTaskRequest'
responses:
'200':
description: 归集任务更新成功
content:
application/json:
schema:
$ref: '#/components/schemas/CollectionTaskResponse'
delete:
operationId: deleteTask
tags: [CollectionTask]
summary: 删除归集任务
parameters:
- name: id
in: path
required: true
schema:
type: string
responses:
'204':
description: 归集任务删除成功
/tasks/{id}/execute:
post:
tags: [TaskExecution]
summary: 执行归集任务
description: 立即执行指定的归集任务
parameters:
- name: id
in: path
required: true
schema:
type: string
responses:
'201':
description: 任务执行已启动
content:
application/json:
schema:
$ref: '#/components/schemas/TaskExecutionResponse'
/tasks/{id}/executions:
get:
tags: [TaskExecution]
summary: 获取任务执行记录
parameters:
- name: id
in: path
required: true
schema:
type: string
- name: page
in: query
schema:
type: integer
default: 0
- name: size
in: query
schema:
type: integer
default: 20
responses:
'200':
description: 任务执行记录列表
content:
application/json:
schema:
$ref: '#/components/schemas/PagedTaskExecutions'
/executions/{id}:
get:
tags: [TaskExecution]
summary: 获取执行详情
parameters:
- name: id
in: path
required: true
schema:
type: string
responses:
'200':
description: 执行详情
content:
application/json:
schema:
$ref: '#/components/schemas/TaskExecutionDetail'
delete:
tags: [TaskExecution]
summary: 停止任务执行
parameters:
- name: id
in: path
required: true
schema:
type: string
responses:
'204':
description: 任务执行已停止
/templates:
get:
tags: [CollectionTask]
summary: 获取DataX模板列表
description: 获取可用的DataX任务模板列表,用于创建任务时选择
parameters:
- name: sourceType
in: query
description: 源数据源类型过滤
schema:
type: string
- name: targetType
in: query
description: 目标数据源类型过滤
schema:
type: string
- name: page
in: query
schema:
type: integer
default: 0
- name: size
in: query
schema:
type: integer
default: 20
responses:
'200':
description: 归集模板列表
content:
application/json:
schema:
$ref: '#/components/schemas/PagedDataxTemplates'
components:
schemas:
TaskStatus:
type: string
enum:
- DRAFT
- READY
- RUNNING
- SUCCESS
- FAILED
- STOPPED
description: |
任务和执行状态枚举:
- DRAFT: 草稿状态
- READY: 就绪状态
- RUNNING: 运行中
- SUCCESS: 执行成功 (对应原来的COMPLETED/SUCCESS)
- FAILED: 执行失败
- STOPPED: 已停止
SyncMode:
type: string
enum: [ONCE, SCHEDULED]
description: 同步方式:一次性(ONCE) 或 定时(SCHEDULED)
CollectionTaskSummary:
type: object
properties:
id:
type: string
name:
type: string
description:
type: string
status:
$ref: '#/components/schemas/TaskStatus'
syncMode:
$ref: '#/components/schemas/SyncMode'
lastExecutionId:
type: string
description: 最后执行ID
createdAt:
type: string
format: date-time
updatedAt:
type: string
format: date-time
description: 任务列表摘要信息(不包含详细配置与调度表达式)
CollectionTaskResponse:
type: object
properties:
id:
type: string
name:
type: string
description:
type: string
config:
type: object
additionalProperties: true
description: 归集配置,包含源端和目标端配置信息
status:
$ref: '#/components/schemas/TaskStatus'
syncMode:
$ref: '#/components/schemas/SyncMode'
scheduleExpression:
type: string
description: Cron调度表达式 (仅当 syncMode = SCHEDULED 时有效)
lastExecutionId:
type: string
description: 最后执行ID
createdAt:
type: string
format: date-time
updatedAt:
type: string
format: date-time
CreateCollectionTaskRequest:
type: object
required:
- name
- config
- syncMode
properties:
name:
type: string
description: 任务名称
minLength: 1
maxLength: 100
description:
type: string
description: 任务描述
maxLength: 500
config:
type: object
description: 归集配置,包含源端和目标端配置信息
additionalProperties: true
syncMode:
$ref: '#/components/schemas/SyncMode'
scheduleExpression:
type: string
description: Cron调度表达式 (syncMode=SCHEDULED 时必填)
UpdateCollectionTaskRequest:
type: object
properties:
name:
type: string
description: 任务名称
minLength: 1
maxLength: 100
description:
type: string
description: 任务描述
maxLength: 500
config:
type: object
description: 归集配置,包含源端和目标端配置信息
additionalProperties: true
syncMode:
$ref: '#/components/schemas/SyncMode'
scheduleExpression:
type: string
description: Cron调度表达式 (syncMode=SCHEDULED 时必填)
PagedCollectionTaskSummary:
type: object
properties:
content:
type: array
items:
$ref: '#/components/schemas/CollectionTaskSummary'
totalElements:
type: integer
totalPages:
type: integer
number:
type: integer
size:
type: integer
PagedCollectionTasks:
type: object
properties:
content:
type: array
items:
$ref: '#/components/schemas/CollectionTaskResponse'
totalElements:
type: integer
totalPages:
type: integer
number:
type: integer
size:
type: integer
TaskExecutionResponse:
type: object
properties:
id:
type: string
taskId:
type: string
taskName:
type: string
status:
$ref: '#/components/schemas/TaskStatus'
startedAt:
type: string
format: date-time
TaskExecutionDetail:
type: object
properties:
id:
type: string
taskId:
type: string
taskName:
type: string
status:
$ref: '#/components/schemas/TaskStatus'
progress:
type: number
format: double
minimum: 0
maximum: 100
recordsTotal:
type: integer
recordsProcessed:
type: integer
recordsSuccess:
type: integer
recordsFailed:
type: integer
throughput:
type: number
format: double
dataSizeBytes:
type: integer
startedAt:
type: string
format: date-time
completedAt:
type: string
format: date-time
durationSeconds:
type: integer
errorMessage:
type: string
PagedTaskExecutions:
type: object
properties:
content:
type: array
items:
$ref: '#/components/schemas/TaskExecutionDetail'
totalElements:
type: integer
totalPages:
type: integer
number:
type: integer
size:
type: integer
DataxTemplateSummary:
type: object
properties:
id:
type: string
name:
type: string
sourceType:
type: string
description: 源数据源类型
targetType:
type: string
description: 目标数据源类型
description:
type: string
version:
type: string
isSystem:
type: boolean
description: 是否为系统模板
createdAt:
type: string
format: date-time
PagedDataxTemplates:
type: object
properties:
content:
type: array
items:
$ref: '#/components/schemas/DataxTemplateSummary'
totalElements:
type: integer
totalPages:
type: integer
number:
type: integer
size:
type: integer

View File

@@ -0,0 +1,630 @@
openapi: 3.0.3
info:
title: Data Evaluation Service API
description: 数据评估服务API - 质量、适配性、价值评估
version: 1.0.0
contact:
name: Data Mate Platform Team
servers:
- url: http://localhost:8086
description: Development server
tags:
- name: quality-evaluation
description: 数据质量评估
- name: compatibility-evaluation
description: 适配性评估
- name: value-evaluation
description: 价值评估
- name: evaluation-reports
description: 评估报告
paths:
/api/v1/evaluation/quality:
post:
tags:
- quality-evaluation
summary: 数据质量评估
description: 对数据集进行质量评估,包括完整性、准确性、一致性等
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/QualityEvaluationRequest'
responses:
'200':
description: 评估成功
content:
application/json:
schema:
$ref: '#/components/schemas/QualityEvaluationResponse'
/api/v1/evaluation/quality/{evaluationId}:
get:
tags:
- quality-evaluation
summary: 获取质量评估结果
parameters:
- name: evaluationId
in: path
required: true
schema:
type: string
responses:
'200':
description: 获取成功
content:
application/json:
schema:
$ref: '#/components/schemas/QualityEvaluationDetailResponse'
/api/v1/evaluation/compatibility:
post:
tags:
- compatibility-evaluation
summary: 适配性评估
description: 评估数据集与目标模型或任务的适配性
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/CompatibilityEvaluationRequest'
responses:
'200':
description: 评估成功
content:
application/json:
schema:
$ref: '#/components/schemas/CompatibilityEvaluationResponse'
/api/v1/evaluation/value:
post:
tags:
- value-evaluation
summary: 价值评估
description: 评估数据集的商业价值和使用价值
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/ValueEvaluationRequest'
responses:
'200':
description: 评估成功
content:
application/json:
schema:
$ref: '#/components/schemas/ValueEvaluationResponse'
/api/v1/evaluation/reports:
get:
tags:
- evaluation-reports
summary: 获取评估报告列表
parameters:
- name: page
in: query
schema:
type: integer
default: 0
- name: size
in: query
schema:
type: integer
default: 20
- name: type
in: query
schema:
$ref: '#/components/schemas/EvaluationType'
- name: datasetId
in: query
schema:
type: string
responses:
'200':
description: 获取成功
content:
application/json:
schema:
$ref: '#/components/schemas/EvaluationReportPageResponse'
/api/v1/evaluation/reports/{reportId}:
get:
tags:
- evaluation-reports
summary: 获取评估报告详情
parameters:
- name: reportId
in: path
required: true
schema:
type: string
responses:
'200':
description: 获取成功
content:
application/json:
schema:
$ref: '#/components/schemas/EvaluationReportDetailResponse'
/api/v1/evaluation/reports/{reportId}/export:
get:
tags:
- evaluation-reports
summary: 导出评估报告
parameters:
- name: reportId
in: path
required: true
schema:
type: string
- name: format
in: query
schema:
type: string
enum: [PDF, EXCEL, JSON]
default: PDF
responses:
'200':
description: 导出成功
content:
application/octet-stream:
schema:
type: string
format: binary
/api/v1/evaluation/batch:
post:
tags:
- evaluation-reports
summary: 批量评估
description: 对多个数据集进行批量评估
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/BatchEvaluationRequest'
responses:
'202':
description: 批量评估任务已提交
content:
application/json:
schema:
$ref: '#/components/schemas/BatchEvaluationResponse'
components:
schemas:
QualityEvaluationRequest:
type: object
required:
- datasetId
- metrics
properties:
datasetId:
type: string
description: 数据集ID
metrics:
type: array
items:
$ref: '#/components/schemas/QualityMetric'
description: 评估指标
sampleSize:
type: integer
description: 采样大小
parameters:
type: object
description: 评估参数
QualityEvaluationResponse:
type: object
properties:
evaluationId:
type: string
status:
$ref: '#/components/schemas/EvaluationStatus'
overallScore:
type: number
format: double
description: 总体质量分数
metrics:
type: array
items:
$ref: '#/components/schemas/QualityMetricResult'
recommendations:
type: array
items:
type: string
createdAt:
type: string
format: date-time
QualityEvaluationDetailResponse:
allOf:
- $ref: '#/components/schemas/QualityEvaluationResponse'
- type: object
properties:
detailedResults:
$ref: '#/components/schemas/DetailedQualityResults'
visualizations:
type: array
items:
$ref: '#/components/schemas/VisualizationData'
CompatibilityEvaluationRequest:
type: object
required:
- datasetId
- targetType
properties:
datasetId:
type: string
targetType:
$ref: '#/components/schemas/TargetType'
targetConfig:
type: object
description: 目标配置(模型、任务等)
evaluationCriteria:
type: array
items:
$ref: '#/components/schemas/CompatibilityCriterion'
CompatibilityEvaluationResponse:
type: object
properties:
evaluationId:
type: string
compatibilityScore:
type: number
format: double
results:
type: array
items:
$ref: '#/components/schemas/CompatibilityResult'
suggestions:
type: array
items:
type: string
createdAt:
type: string
format: date-time
ValueEvaluationRequest:
type: object
required:
- datasetId
- valueCriteria
properties:
datasetId:
type: string
valueCriteria:
type: array
items:
$ref: '#/components/schemas/ValueCriterion'
marketContext:
type: object
description: 市场环境信息
businessContext:
type: object
description: 业务环境信息
ValueEvaluationResponse:
type: object
properties:
evaluationId:
type: string
valueScore:
type: number
format: double
monetaryValue:
type: number
format: double
description: 货币价值估算
strategicValue:
type: number
format: double
description: 战略价值评分
results:
type: array
items:
$ref: '#/components/schemas/ValueResult'
insights:
type: array
items:
type: string
EvaluationReportResponse:
type: object
properties:
id:
type: string
datasetId:
type: string
type:
$ref: '#/components/schemas/EvaluationType'
status:
$ref: '#/components/schemas/EvaluationStatus'
overallScore:
type: number
format: double
summary:
type: string
createdAt:
type: string
format: date-time
completedAt:
type: string
format: date-time
EvaluationReportPageResponse:
type: object
properties:
content:
type: array
items:
$ref: '#/components/schemas/EvaluationReportResponse'
totalElements:
type: integer
format: int64
totalPages:
type: integer
size:
type: integer
number:
type: integer
EvaluationReportDetailResponse:
allOf:
- $ref: '#/components/schemas/EvaluationReportResponse'
- type: object
properties:
qualityResults:
$ref: '#/components/schemas/QualityEvaluationResponse'
compatibilityResults:
$ref: '#/components/schemas/CompatibilityEvaluationResponse'
valueResults:
$ref: '#/components/schemas/ValueEvaluationResponse'
attachments:
type: array
items:
$ref: '#/components/schemas/ReportAttachment'
BatchEvaluationRequest:
type: object
required:
- datasetIds
- evaluationTypes
properties:
datasetIds:
type: array
items:
type: string
evaluationTypes:
type: array
items:
$ref: '#/components/schemas/EvaluationType'
parameters:
type: object
BatchEvaluationResponse:
type: object
properties:
batchId:
type: string
status:
type: string
totalTasks:
type: integer
submittedAt:
type: string
format: date-time
QualityMetric:
type: string
enum:
- COMPLETENESS
- ACCURACY
- CONSISTENCY
- VALIDITY
- UNIQUENESS
- TIMELINESS
QualityMetricResult:
type: object
properties:
metric:
$ref: '#/components/schemas/QualityMetric'
score:
type: number
format: double
details:
type: object
issues:
type: array
items:
$ref: '#/components/schemas/QualityIssue'
DetailedQualityResults:
type: object
properties:
fieldAnalysis:
type: array
items:
$ref: '#/components/schemas/FieldAnalysis'
distributionAnalysis:
$ref: '#/components/schemas/DistributionAnalysis'
correlationAnalysis:
$ref: '#/components/schemas/CorrelationAnalysis'
TargetType:
type: string
enum:
- LANGUAGE_MODEL
- CLASSIFICATION_MODEL
- RECOMMENDATION_SYSTEM
- CUSTOM_TASK
CompatibilityCriterion:
type: string
enum:
- FORMAT_COMPATIBILITY
- SCHEMA_COMPATIBILITY
- SIZE_ADEQUACY
- DISTRIBUTION_MATCH
- FEATURE_COVERAGE
CompatibilityResult:
type: object
properties:
criterion:
$ref: '#/components/schemas/CompatibilityCriterion'
score:
type: number
format: double
status:
type: string
enum: [PASS, WARN, FAIL]
details:
type: string
ValueCriterion:
type: string
enum:
- RARITY
- DEMAND
- QUALITY
- COMPLETENESS
- TIMELINESS
- STRATEGIC_IMPORTANCE
ValueResult:
type: object
properties:
criterion:
$ref: '#/components/schemas/ValueCriterion'
score:
type: number
format: double
impact:
type: string
enum: [LOW, MEDIUM, HIGH]
explanation:
type: string
EvaluationType:
type: string
enum:
- QUALITY
- COMPATIBILITY
- VALUE
- COMPREHENSIVE
EvaluationStatus:
type: string
enum:
- PENDING
- RUNNING
- COMPLETED
- FAILED
QualityIssue:
type: object
properties:
type:
type: string
severity:
type: string
enum: [LOW, MEDIUM, HIGH, CRITICAL]
description:
type: string
affectedRecords:
type: integer
suggestions:
type: array
items:
type: string
FieldAnalysis:
type: object
properties:
fieldName:
type: string
dataType:
type: string
nullCount:
type: integer
uniqueCount:
type: integer
statistics:
type: object
DistributionAnalysis:
type: object
properties:
distributions:
type: array
items:
type: object
outliers:
type: array
items:
type: object
patterns:
type: array
items:
type: string
CorrelationAnalysis:
type: object
properties:
correlationMatrix:
type: array
items:
type: array
items:
type: number
significantCorrelations:
type: array
items:
type: object
VisualizationData:
type: object
properties:
type:
type: string
enum: [CHART, GRAPH, HISTOGRAM, HEATMAP]
title:
type: string
data:
type: object
config:
type: object
ReportAttachment:
type: object
properties:
id:
type: string
name:
type: string
type:
type: string
size:
type: integer
format: int64
downloadUrl:
type: string
securitySchemes:
BearerAuth:
type: http
scheme: bearer
bearerFormat: JWT
security:
- BearerAuth: []

View File

@@ -0,0 +1,719 @@
openapi: 3.0.3
info:
title: Data Management Service API
description: |
数据管理服务API,提供数据集的创建、管理和文件操作功能。
主要功能:
- 数据集的创建和管理
- 多种数据集类型支持(图像、文本、音频、视频、多模态等)
- 数据集文件管理
- 数据集标签和元数据管理
- 数据集统计信息
version: 1.0.0
# Base URL(s) that every key under `paths` is resolved against.
# NOTE(review): this URL already ends in /data-management and each path in this
# spec also begins with /data-management, so resolved URLs repeat that segment
# (e.g. .../api/v1/data-management/data-management/datasets) — confirm which
# side should carry the prefix.
servers:
- url: http://localhost:8092/api/v1/data-management
description: Development server
tags:
- name: Dataset
description: 数据集管理
- name: DatasetFile
description: 数据集文件管理
- name: DatasetType
description: 数据集类型管理
- name: Tag
description: 标签管理
paths:
/data-management/datasets:
get:
tags: [Dataset]
operationId: getDatasets
summary: 获取数据集列表
description: 分页查询数据集列表,支持按类型、标签等条件筛选
parameters:
- name: page
in: query
schema:
type: integer
default: 0
description: 页码,从0开始
- name: size
in: query
schema:
type: integer
default: 20
description: 每页大小
- name: type
in: query
schema:
type: string
description: 数据集类型过滤
- name: tags
in: query
schema:
type: string
description: 标签过滤,多个标签用逗号分隔
- name: keyword
in: query
schema:
type: string
description: 关键词搜索(名称、描述)
- name: status
in: query
schema:
type: string
enum: [ACTIVE, INACTIVE, PROCESSING]
description: 数据集状态过滤
responses:
'200':
description: 成功
content:
application/json:
schema:
$ref: '#/components/schemas/PagedDatasetResponse'
'400':
description: 请求参数错误
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
post:
tags: [Dataset]
operationId: createDataset
summary: 创建数据集
description: 创建新的数据集
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/CreateDatasetRequest'
responses:
'201':
description: 创建成功
content:
application/json:
schema:
$ref: '#/components/schemas/DatasetResponse'
'400':
description: 请求参数错误
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
/data-management/datasets/{datasetId}:
get:
tags: [Dataset]
operationId: getDatasetById
summary: 获取数据集详情
description: 根据ID获取数据集详细信息
parameters:
- name: datasetId
in: path
required: true
schema:
type: string
description: 数据集ID
responses:
'200':
description: 成功
content:
application/json:
schema:
$ref: '#/components/schemas/DatasetResponse'
'404':
description: 数据集不存在
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
put:
tags: [Dataset]
summary: 更新数据集
operationId: updateDataset
description: 更新数据集信息
parameters:
- name: datasetId
in: path
required: true
schema:
type: string
description: 数据集ID
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/UpdateDatasetRequest'
responses:
'200':
description: 更新成功
content:
application/json:
schema:
$ref: '#/components/schemas/DatasetResponse'
'404':
description: 数据集不存在
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
delete:
tags: [Dataset]
operationId: deleteDataset
summary: 删除数据集
description: 删除指定的数据集
parameters:
- name: datasetId
in: path
required: true
schema:
type: string
description: 数据集ID
responses:
'204':
description: 删除成功
'404':
description: 数据集不存在
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
/data-management/datasets/{datasetId}/files:
get:
tags: [DatasetFile]
summary: 获取数据集文件列表
operationId: getDatasetFiles
description: 分页获取数据集中的文件列表
parameters:
- name: datasetId
in: path
required: true
schema:
type: string
description: 数据集ID
- name: page
in: query
schema:
type: integer
default: 0
description: 页码,从0开始
- name: size
in: query
schema:
type: integer
default: 20
description: 每页大小
- name: fileType
in: query
schema:
type: string
description: 文件类型过滤
- name: status
in: query
schema:
type: string
enum: [UPLOADED, PROCESSING, COMPLETED, ERROR]
description: 文件状态过滤
responses:
'200':
description: 成功
content:
application/json:
schema:
$ref: '#/components/schemas/PagedDatasetFileResponse'
post:
tags: [DatasetFile]
summary: 上传文件到数据集
operationId: uploadDatasetFile
description: 向指定数据集上传文件
parameters:
- name: datasetId
in: path
required: true
schema:
type: string
description: 数据集ID
requestBody:
required: true
content:
multipart/form-data:
schema:
type: object
properties:
file:
type: string
format: binary
description: 要上传的文件
description:
type: string
description: 文件描述
responses:
'201':
description: 上传成功
content:
application/json:
schema:
$ref: '#/components/schemas/DatasetFileResponse'
/data-management/datasets/{datasetId}/files/{fileId}:
  # Operations on a single file inside a dataset; both path parameters are
  # required strings.
  get:
    tags: [DatasetFile]
    summary: 获取文件详情
    description: 获取数据集中指定文件的详细信息
    operationId: getDatasetFileById
    parameters:
      - name: datasetId
        in: path
        required: true
        schema:
          type: string
        description: 数据集ID
      - name: fileId
        in: path
        required: true
        schema:
          type: string
        description: 文件ID
    responses:
      '200':
        description: 成功
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/DatasetFileResponse'
      # Declared for parity with the {datasetId} operations, which already
      # document a 404 carrying ErrorResponse.
      '404':
        description: 数据集或文件不存在
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorResponse'
  delete:
    tags: [DatasetFile]
    summary: 删除文件
    operationId: deleteDatasetFile
    description: 从数据集中删除指定文件
    parameters:
      - name: datasetId
        in: path
        required: true
        schema:
          type: string
        description: 数据集ID
      - name: fileId
        in: path
        required: true
        schema:
          type: string
        description: 文件ID
    responses:
      # Success carries no body.
      '204':
        description: 删除成功
      '404':
        description: 数据集或文件不存在
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorResponse'
/data-management/datasets/{datasetId}/files/{fileId}/download:
  # Returns the raw bytes of one file of a dataset.
  get:
    tags: [DatasetFile]
    operationId: downloadDatasetFile
    summary: 下载文件
    description: 下载数据集中的指定文件
    parameters:
      - name: datasetId
        in: path
        required: true
        schema:
          type: string
        description: 数据集ID
      - name: fileId
        in: path
        required: true
        schema:
          type: string
        description: 文件ID
    responses:
      '200':
        description: 文件内容
        content:
          application/octet-stream:
            schema:
              type: string
              format: binary
      # Errors are JSON even though the success body is binary; mirrors the
      # 404 already documented on the {datasetId} operations.
      '404':
        description: 数据集或文件不存在
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorResponse'
/data-management/dataset-types:
get:
operationId: getDatasetTypes
tags: [DatasetType]
summary: 获取数据集类型列表
description: 获取所有支持的数据集类型
responses:
'200':
description: 成功
content:
application/json:
schema:
type: array
items:
$ref: '#/components/schemas/DatasetTypeResponse'
/data-management/tags:
get:
tags: [Tag]
operationId: getTags
summary: 获取标签列表
description: 获取所有可用的标签
parameters:
- name: keyword
in: query
schema:
type: string
description: 标签名称关键词搜索
responses:
'200':
description: 成功
content:
application/json:
schema:
type: array
items:
$ref: '#/components/schemas/TagResponse'
post:
tags: [Tag]
operationId: createTag
summary: 创建标签
description: 创建新的标签
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/CreateTagRequest'
responses:
'201':
description: 创建成功
content:
application/json:
schema:
$ref: '#/components/schemas/TagResponse'
/data-management/datasets/{datasetId}/statistics:
  # Read-only aggregate view of one dataset (see DatasetStatisticsResponse).
  get:
    tags: [Dataset]
    operationId: getDatasetStatistics
    summary: 获取数据集统计信息
    description: 获取数据集的统计信息(文件数量、大小、完成度等)
    parameters:
      - name: datasetId
        in: path
        required: true
        schema:
          type: string
        description: 数据集ID
    responses:
      '200':
        description: 成功
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/DatasetStatisticsResponse'
      # Same not-found contract that getDatasetById declares for an unknown
      # datasetId.
      '404':
        description: 数据集不存在
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorResponse'
components:
schemas:
PagedDatasetResponse:
type: object
properties:
content:
type: array
items:
$ref: '#/components/schemas/DatasetResponse'
page:
type: integer
description: 当前页码
size:
type: integer
description: 每页大小
totalElements:
type: integer
description: 总元素数
totalPages:
type: integer
description: 总页数
first:
type: boolean
description: 是否为第一页
last:
type: boolean
description: 是否为最后一页
DatasetResponse:
type: object
properties:
id:
type: string
description: 数据集ID
name:
type: string
description: 数据集名称
description:
type: string
description: 数据集描述
type:
$ref: '#/components/schemas/DatasetTypeResponse'
status:
type: string
enum: [ACTIVE, INACTIVE, PROCESSING]
description: 数据集状态
tags:
type: array
items:
$ref: '#/components/schemas/TagResponse'
description: 标签列表
dataSource:
type: string
description: 数据源
targetLocation:
type: string
description: 目标位置
fileCount:
type: integer
description: 文件数量
totalSize:
type: integer
format: int64
description: 总大小(字节)
completionRate:
type: number
format: float
description: 完成率(0-100)
createdAt:
type: string
format: date-time
description: 创建时间
updatedAt:
type: string
format: date-time
description: 更新时间
createdBy:
type: string
description: 创建者
CreateDatasetRequest:
type: object
required:
- name
- type
properties:
name:
type: string
description: 数据集名称
minLength: 1
maxLength: 100
description:
type: string
description: 数据集描述
maxLength: 500
type:
type: string
description: 数据集类型
tags:
type: array
items:
type: string
description: 标签列表
dataSource:
type: string
description: 数据源
targetLocation:
type: string
description: 目标位置
UpdateDatasetRequest:
  # Body of the dataset update operation. There is no `required` list, so
  # every property is optional.
  type: object
  properties:
    name:
      type: string
      description: 数据集名称
      # Match CreateDatasetRequest.name: when a name is supplied it must not
      # be empty, otherwise an update could blank a name that create rejects.
      minLength: 1
      maxLength: 100
    description:
      type: string
      description: 数据集描述
      maxLength: 500
    tags:
      type: array
      items:
        type: string
      description: 标签列表
    status:
      type: string
      # Omits PROCESSING, which DatasetResponse.status can report —
      # presumably that state is system-managed; confirm.
      enum: [ACTIVE, INACTIVE]
      description: 数据集状态
DatasetTypeResponse:
type: object
properties:
code:
type: string
description: 类型编码
name:
type: string
description: 类型名称
description:
type: string
description: 类型描述
supportedFormats:
type: array
items:
type: string
description: 支持的文件格式
icon:
type: string
description: 图标
PagedDatasetFileResponse:
type: object
properties:
content:
type: array
items:
$ref: '#/components/schemas/DatasetFileResponse'
page:
type: integer
description: 当前页码
size:
type: integer
description: 每页大小
totalElements:
type: integer
description: 总元素数
totalPages:
type: integer
description: 总页数
first:
type: boolean
description: 是否为第一页
last:
type: boolean
description: 是否为最后一页
DatasetFileResponse:
type: object
properties:
id:
type: string
description: 文件ID
fileName:
type: string
description: 文件名
originalName:
type: string
description: 原始文件名
fileType:
type: string
description: 文件类型
fileSize:
type: integer
format: int64
description: 文件大小(字节)
status:
type: string
enum: [UPLOADED, PROCESSING, COMPLETED, ERROR]
description: 文件状态
description:
type: string
description: 文件描述
filePath:
type: string
description: 文件路径
uploadTime:
type: string
format: date-time
description: 上传时间
uploadedBy:
type: string
description: 上传者
TagResponse:
type: object
properties:
id:
type: string
description: 标签ID
name:
type: string
description: 标签名称
color:
type: string
description: 标签颜色
description:
type: string
description: 标签描述
usageCount:
type: integer
description: 使用次数
CreateTagRequest:
type: object
required:
- name
properties:
name:
type: string
description: 标签名称
minLength: 1
maxLength: 50
color:
type: string
description: 标签颜色
pattern: '^#[0-9A-Fa-f]{6}$'
description:
type: string
description: 标签描述
maxLength: 200
DatasetStatisticsResponse:
type: object
properties:
totalFiles:
type: integer
description: 总文件数
completedFiles:
type: integer
description: 已完成文件数
totalSize:
type: integer
format: int64
description: 总大小(字节)
completionRate:
type: number
format: float
description: 完成率(0-100)
fileTypeDistribution:
type: object
additionalProperties:
type: integer
description: 文件类型分布
statusDistribution:
type: object
additionalProperties:
type: integer
description: 状态分布
ErrorResponse:
type: object
properties:
error:
type: string
description: 错误代码
message:
type: string
description: 错误消息
timestamp:
type: string
format: date-time
description: 错误时间
path:
type: string
description: 请求路径

View File

@@ -0,0 +1,620 @@
openapi: 3.0.3
info:
title: Data Synthesis Service API
description: 数据合成服务API - 指令、COT蒸馏、多模态合成
version: 1.0.0
contact:
name: Data Mate Platform Team
servers:
- url: http://localhost:8085
description: Development server
tags:
- name: synthesis-templates
description: 合成模板管理
- name: synthesis-jobs
description: 合成任务管理
- name: instruction-tuning
description: 指令调优
- name: cot-distillation
description: COT蒸馏
paths:
/api/v1/synthesis/templates:
get:
tags:
- synthesis-templates
summary: 获取合成模板列表
parameters:
- name: page
in: query
schema:
type: integer
default: 0
- name: size
in: query
schema:
type: integer
default: 20
- name: type
in: query
schema:
$ref: '#/components/schemas/SynthesisType'
responses:
'200':
description: 获取成功
content:
application/json:
schema:
$ref: '#/components/schemas/SynthesisTemplatePageResponse'
post:
tags:
- synthesis-templates
summary: 创建合成模板
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/CreateSynthesisTemplateRequest'
responses:
'201':
description: 创建成功
content:
application/json:
schema:
$ref: '#/components/schemas/SynthesisTemplateResponse'
/api/v1/synthesis/templates/{templateId}:
get:
tags:
- synthesis-templates
summary: 获取合成模板详情
parameters:
- name: templateId
in: path
required: true
schema:
type: string
responses:
'200':
description: 获取成功
content:
application/json:
schema:
$ref: '#/components/schemas/SynthesisTemplateDetailResponse'
put:
tags:
- synthesis-templates
summary: 更新合成模板
parameters:
- name: templateId
in: path
required: true
schema:
type: string
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/UpdateSynthesisTemplateRequest'
responses:
'200':
description: 更新成功
/api/v1/synthesis/jobs:
get:
tags:
- synthesis-jobs
summary: 获取合成任务列表
parameters:
- name: page
in: query
schema:
type: integer
default: 0
- name: size
in: query
schema:
type: integer
default: 20
- name: status
in: query
schema:
$ref: '#/components/schemas/JobStatus'
responses:
'200':
description: 获取成功
content:
application/json:
schema:
$ref: '#/components/schemas/SynthesisJobPageResponse'
post:
tags:
- synthesis-jobs
summary: 创建合成任务
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/CreateSynthesisJobRequest'
responses:
'201':
description: 任务创建成功
content:
application/json:
schema:
$ref: '#/components/schemas/SynthesisJobResponse'
/api/v1/synthesis/jobs/{jobId}:
get:
tags:
- synthesis-jobs
summary: 获取合成任务详情
parameters:
- name: jobId
in: path
required: true
schema:
type: string
responses:
'200':
description: 获取成功
content:
application/json:
schema:
$ref: '#/components/schemas/SynthesisJobDetailResponse'
/api/v1/synthesis/jobs/{jobId}/execute:
post:
tags:
- synthesis-jobs
summary: 执行合成任务
parameters:
- name: jobId
in: path
required: true
schema:
type: string
responses:
'200':
description: 任务开始执行
content:
application/json:
schema:
$ref: '#/components/schemas/JobExecutionResponse'
/api/v1/synthesis/instruction-tuning:
post:
tags:
- instruction-tuning
summary: 指令调优数据合成
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/InstructionTuningRequest'
responses:
'200':
description: 合成成功
content:
application/json:
schema:
$ref: '#/components/schemas/InstructionTuningResponse'
/api/v1/synthesis/cot-distillation:
post:
tags:
- cot-distillation
summary: COT蒸馏数据合成
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/COTDistillationRequest'
responses:
'200':
description: 蒸馏成功
content:
application/json:
schema:
$ref: '#/components/schemas/COTDistillationResponse'
components:
schemas:
SynthesisTemplateResponse:
type: object
properties:
id:
type: string
name:
type: string
description:
type: string
type:
$ref: '#/components/schemas/SynthesisType'
category:
type: string
modelConfig:
$ref: '#/components/schemas/ModelConfig'
enabled:
type: boolean
createdAt:
type: string
format: date-time
SynthesisTemplateDetailResponse:
allOf:
- $ref: '#/components/schemas/SynthesisTemplateResponse'
- type: object
properties:
promptTemplate:
type: string
parameters:
type: object
examples:
type: array
items:
$ref: '#/components/schemas/SynthesisExample'
SynthesisTemplatePageResponse:
type: object
properties:
content:
type: array
items:
$ref: '#/components/schemas/SynthesisTemplateResponse'
totalElements:
type: integer
format: int64
totalPages:
type: integer
size:
type: integer
number:
type: integer
CreateSynthesisTemplateRequest:
type: object
required:
- name
- type
- promptTemplate
properties:
name:
type: string
description:
type: string
type:
$ref: '#/components/schemas/SynthesisType'
category:
type: string
promptTemplate:
type: string
modelConfig:
$ref: '#/components/schemas/ModelConfig'
parameters:
type: object
UpdateSynthesisTemplateRequest:
type: object
properties:
name:
type: string
description:
type: string
promptTemplate:
type: string
enabled:
type: boolean
parameters:
type: object
SynthesisJobResponse:
type: object
properties:
id:
type: string
name:
type: string
description:
type: string
templateId:
type: string
status:
$ref: '#/components/schemas/JobStatus'
progress:
type: number
format: double
targetCount:
type: integer
generatedCount:
type: integer
startTime:
type: string
format: date-time
endTime:
type: string
format: date-time
createdAt:
type: string
format: date-time
SynthesisJobDetailResponse:
allOf:
- $ref: '#/components/schemas/SynthesisJobResponse'
- type: object
properties:
template:
$ref: '#/components/schemas/SynthesisTemplateResponse'
statistics:
$ref: '#/components/schemas/SynthesisStatistics'
samples:
type: array
items:
$ref: '#/components/schemas/GeneratedSample'
SynthesisJobPageResponse:
type: object
properties:
content:
type: array
items:
$ref: '#/components/schemas/SynthesisJobResponse'
totalElements:
type: integer
format: int64
totalPages:
type: integer
size:
type: integer
number:
type: integer
CreateSynthesisJobRequest:
type: object
required:
- name
- templateId
- targetCount
properties:
name:
type: string
description:
type: string
templateId:
type: string
targetCount:
type: integer
parameters:
type: object
seedData:
type: array
items:
type: object
JobExecutionResponse:
type: object
properties:
executionId:
type: string
status:
type: string
message:
type: string
InstructionTuningRequest:
type: object
required:
- baseInstructions
- targetDomain
- count
properties:
baseInstructions:
type: array
items:
type: string
targetDomain:
type: string
count:
type: integer
modelConfig:
$ref: '#/components/schemas/ModelConfig'
parameters:
type: object
InstructionTuningResponse:
type: object
properties:
jobId:
type: string
generatedInstructions:
type: array
items:
$ref: '#/components/schemas/GeneratedInstruction'
statistics:
$ref: '#/components/schemas/GenerationStatistics'
COTDistillationRequest:
type: object
required:
- sourceModel
- targetFormat
- examples
properties:
sourceModel:
type: string
targetFormat:
type: string
enum: [QA, INSTRUCTION, REASONING]
examples:
type: array
items:
$ref: '#/components/schemas/COTExample'
parameters:
type: object
COTDistillationResponse:
type: object
properties:
jobId:
type: string
distilledData:
type: array
items:
$ref: '#/components/schemas/DistilledCOTData'
statistics:
$ref: '#/components/schemas/DistillationStatistics'
SynthesisType:
type: string
enum:
- INSTRUCTION_TUNING
- COT_DISTILLATION
- DIALOGUE_GENERATION
- TEXT_AUGMENTATION
- MULTIMODAL_SYNTHESIS
- CUSTOM
JobStatus:
type: string
enum:
- PENDING
- RUNNING
- COMPLETED
- FAILED
- CANCELLED
ModelConfig:
type: object
properties:
modelName:
type: string
temperature:
type: number
format: double
maxTokens:
type: integer
topP:
type: number
format: double
frequencyPenalty:
type: number
format: double
SynthesisExample:
type: object
properties:
input:
type: string
output:
type: string
explanation:
type: string
SynthesisStatistics:
type: object
properties:
totalGenerated:
type: integer
successfulGenerated:
type: integer
failedGenerated:
type: integer
averageLength:
type: number
format: double
uniqueCount:
type: integer
GeneratedSample:
type: object
properties:
id:
type: string
content:
type: string
score:
type: number
format: double
metadata:
type: object
createdAt:
type: string
format: date-time
GeneratedInstruction:
type: object
properties:
instruction:
type: string
input:
type: string
output:
type: string
quality:
type: number
format: double
GenerationStatistics:
type: object
properties:
totalGenerated:
type: integer
averageQuality:
type: number
format: double
diversityScore:
type: number
format: double
COTExample:
type: object
properties:
question:
type: string
reasoning:
type: string
answer:
type: string
DistilledCOTData:
type: object
properties:
question:
type: string
reasoning:
type: string
answer:
type: string
confidence:
type: number
format: double
DistillationStatistics:
type: object
properties:
totalProcessed:
type: integer
successfulDistilled:
type: integer
averageConfidence:
type: number
format: double
securitySchemes:
BearerAuth:
type: http
scheme: bearer
bearerFormat: JWT
security:
- BearerAuth: []

View File

@@ -0,0 +1,712 @@
openapi: 3.0.3
info:
title: Execution Engine Service API
description: 执行引擎服务API - 与Ray/DataX/Python执行器对接
version: 1.0.0
contact:
name: Data Mate Platform Team
servers:
- url: http://localhost:8088
description: Development server
tags:
- name: jobs
description: 作业管理
- name: executors
description: 执行器管理
- name: resources
description: 资源管理
- name: monitoring
description: 监控管理
paths:
/api/v1/jobs:
get:
tags:
- jobs
summary: 获取作业列表
parameters:
- name: page
in: query
schema:
type: integer
default: 0
- name: size
in: query
schema:
type: integer
default: 20
- name: status
in: query
schema:
$ref: '#/components/schemas/JobStatus'
- name: executor
in: query
schema:
$ref: '#/components/schemas/ExecutorType'
responses:
'200':
description: 获取成功
content:
application/json:
schema:
$ref: '#/components/schemas/JobPageResponse'
post:
tags:
- jobs
summary: 提交作业
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/SubmitJobRequest'
responses:
'201':
description: 作业提交成功
content:
application/json:
schema:
$ref: '#/components/schemas/JobResponse'
/api/v1/jobs/{jobId}:
get:
tags:
- jobs
summary: 获取作业详情
parameters:
- name: jobId
in: path
required: true
schema:
type: string
responses:
'200':
description: 获取成功
content:
application/json:
schema:
$ref: '#/components/schemas/JobDetailResponse'
delete:
tags:
- jobs
summary: 取消作业
parameters:
- name: jobId
in: path
required: true
schema:
type: string
responses:
'200':
description: 取消成功
# Returns log entries (JobLog items) for the job identified by the jobId path
# parameter.
/api/v1/jobs/{jobId}/logs:
get:
tags:
- jobs
summary: 获取作业日志
parameters:
- name: jobId
in: path
required: true
schema:
type: string
# Optional boolean switch; defaults to false when omitted.
- name: follow
in: query
description: 是否实时跟踪日志
schema:
type: boolean
default: false
# NOTE(review): the only declared response is a finite JSON array of JobLog,
# even though follow=true asks for live tailing — confirm how streamed output
# is delivered, or whether the flag is honored at all.
responses:
'200':
description: 获取成功
content:
application/json:
schema:
type: array
items:
$ref: '#/components/schemas/JobLog'
/api/v1/jobs/{jobId}/retry:
post:
tags:
- jobs
summary: 重试作业
parameters:
- name: jobId
in: path
required: true
schema:
type: string
responses:
'200':
description: 重试成功
content:
application/json:
schema:
$ref: '#/components/schemas/JobResponse'
/api/v1/executors:
get:
tags:
- executors
summary: 获取执行器列表
responses:
'200':
description: 获取成功
content:
application/json:
schema:
type: array
items:
$ref: '#/components/schemas/ExecutorResponse'
post:
tags:
- executors
summary: 注册执行器
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/RegisterExecutorRequest'
responses:
'201':
description: 注册成功
/api/v1/executors/{executorId}:
get:
tags:
- executors
summary: 获取执行器详情
parameters:
- name: executorId
in: path
required: true
schema:
type: string
responses:
'200':
description: 获取成功
content:
application/json:
schema:
$ref: '#/components/schemas/ExecutorDetailResponse'
put:
tags:
- executors
summary: 更新执行器
parameters:
- name: executorId
in: path
required: true
schema:
type: string
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/UpdateExecutorRequest'
responses:
'200':
description: 更新成功
/api/v1/resources/clusters:
get:
tags:
- resources
summary: 获取集群信息
responses:
'200':
description: 获取成功
content:
application/json:
schema:
type: array
items:
$ref: '#/components/schemas/ClusterInfo'
/api/v1/resources/nodes:
get:
tags:
- resources
summary: 获取节点信息
parameters:
- name: clusterId
in: query
schema:
type: string
responses:
'200':
description: 获取成功
content:
application/json:
schema:
type: array
items:
$ref: '#/components/schemas/NodeInfo'
/api/v1/monitoring/metrics:
get:
tags:
- monitoring
summary: 获取监控指标
parameters:
- name: metric
in: query
schema:
type: string
- name: start
in: query
schema:
type: string
format: date-time
- name: end
in: query
schema:
type: string
format: date-time
responses:
'200':
description: 获取成功
content:
application/json:
schema:
$ref: '#/components/schemas/MetricsResponse'
components:
schemas:
JobResponse:
type: object
properties:
id:
type: string
name:
type: string
status:
$ref: '#/components/schemas/JobStatus'
executorType:
$ref: '#/components/schemas/ExecutorType'
priority:
type: integer
progress:
type: number
format: double
submittedAt:
type: string
format: date-time
startedAt:
type: string
format: date-time
completedAt:
type: string
format: date-time
submittedBy:
type: string
JobDetailResponse:
allOf:
- $ref: '#/components/schemas/JobResponse'
- type: object
properties:
configuration:
$ref: '#/components/schemas/JobConfiguration'
resources:
$ref: '#/components/schemas/ResourceRequirement'
metrics:
$ref: '#/components/schemas/JobMetrics'
artifacts:
type: array
items:
$ref: '#/components/schemas/JobArtifact'
dependencies:
type: array
items:
type: string
JobPageResponse:
type: object
properties:
content:
type: array
items:
$ref: '#/components/schemas/JobResponse'
totalElements:
type: integer
format: int64
totalPages:
type: integer
size:
type: integer
number:
type: integer
SubmitJobRequest:
type: object
required:
- name
- executorType
- configuration
properties:
name:
type: string
description:
type: string
executorType:
$ref: '#/components/schemas/ExecutorType'
priority:
type: integer
minimum: 1
maximum: 10
default: 5
configuration:
$ref: '#/components/schemas/JobConfiguration'
resources:
$ref: '#/components/schemas/ResourceRequirement'
dependencies:
type: array
items:
type: string
timeoutSeconds:
type: integer
JobConfiguration:
type: object
properties:
script:
type: string
description: 执行脚本或代码
arguments:
type: array
items:
type: string
description: 执行参数
environment:
type: object
description: 环境变量
files:
type: array
items:
$ref: '#/components/schemas/FileReference'
packages:
type: array
items:
type: string
description: 依赖包列表
ResourceRequirement:
type: object
properties:
cpuCores:
type: number
format: double
memoryGB:
type: number
format: double
gpuCount:
type: integer
diskGB:
type: number
format: double
nodeSelector:
type: object
description: 节点选择器
ExecutorResponse:
type: object
properties:
id:
type: string
name:
type: string
type:
$ref: '#/components/schemas/ExecutorType'
status:
$ref: '#/components/schemas/ExecutorStatus'
version:
type: string
capabilities:
type: array
items:
type: string
registeredAt:
type: string
format: date-time
lastHeartbeat:
type: string
format: date-time
ExecutorDetailResponse:
allOf:
- $ref: '#/components/schemas/ExecutorResponse'
- type: object
properties:
configuration:
type: object
resources:
$ref: '#/components/schemas/ExecutorResources'
currentJobs:
type: array
items:
$ref: '#/components/schemas/JobResponse'
statistics:
$ref: '#/components/schemas/ExecutorStatistics'
RegisterExecutorRequest:
type: object
required:
- name
- type
- endpoint
properties:
name:
type: string
type:
$ref: '#/components/schemas/ExecutorType'
endpoint:
type: string
capabilities:
type: array
items:
type: string
configuration:
type: object
UpdateExecutorRequest:
type: object
properties:
status:
$ref: '#/components/schemas/ExecutorStatus'
configuration:
type: object
ClusterInfo:
type: object
properties:
id:
type: string
name:
type: string
type:
type: string
enum: [RAY, KUBERNETES, YARN, STANDALONE]
status:
type: string
enum: [ACTIVE, INACTIVE, ERROR]
nodeCount:
type: integer
totalCpuCores:
type: integer
totalMemoryGB:
type: number
format: double
totalGpuCount:
type: integer
availableResources:
$ref: '#/components/schemas/ResourceInfo'
NodeInfo:
type: object
properties:
id:
type: string
name:
type: string
clusterId:
type: string
status:
type: string
enum: [ACTIVE, INACTIVE, BUSY, ERROR]
resources:
$ref: '#/components/schemas/ResourceInfo'
usage:
$ref: '#/components/schemas/ResourceUsage'
lastUpdate:
type: string
format: date-time
MetricsResponse:
type: object
properties:
metric:
type: string
dataPoints:
type: array
items:
$ref: '#/components/schemas/MetricDataPoint'
aggregation:
type: object
JobLog:
type: object
properties:
timestamp:
type: string
format: date-time
level:
type: string
enum: [DEBUG, INFO, WARN, ERROR]
source:
type: string
message:
type: string
JobMetrics:
type: object
properties:
cpuUsage:
type: number
format: double
memoryUsage:
type: number
format: double
diskUsage:
type: number
format: double
networkIO:
type: object
duration:
type: integer
format: int64
JobArtifact:
type: object
properties:
id:
type: string
name:
type: string
type:
type: string
enum: [LOG, OUTPUT, CHECKPOINT, MODEL]
size:
type: integer
format: int64
path:
type: string
createdAt:
type: string
format: date-time
FileReference:
type: object
properties:
name:
type: string
path:
type: string
type:
type: string
enum: [LOCAL, HDFS, S3, HTTP]
ExecutorResources:
type: object
properties:
total:
$ref: '#/components/schemas/ResourceInfo'
available:
$ref: '#/components/schemas/ResourceInfo'
allocated:
$ref: '#/components/schemas/ResourceInfo'
ExecutorStatistics:
type: object
properties:
totalJobs:
type: integer
successfulJobs:
type: integer
failedJobs:
type: integer
averageExecutionTime:
type: number
format: double
uptime:
type: integer
format: int64
ResourceInfo:
type: object
properties:
cpuCores:
type: number
format: double
memoryGB:
type: number
format: double
gpuCount:
type: integer
diskGB:
type: number
format: double
ResourceUsage:
type: object
properties:
cpuUsagePercent:
type: number
format: double
memoryUsagePercent:
type: number
format: double
diskUsagePercent:
type: number
format: double
MetricDataPoint:
type: object
properties:
timestamp:
type: string
format: date-time
value:
type: number
format: double
tags:
type: object
JobStatus:
type: string
enum:
- SUBMITTED
- PENDING
- RUNNING
- COMPLETED
- FAILED
- CANCELLED
- TIMEOUT
ExecutorType:
type: string
enum:
- RAY
- DATAX
- PYTHON
- SPARK
- FLINK
- CUSTOM
ExecutorStatus:
type: string
enum:
- ACTIVE
- INACTIVE
- BUSY
- ERROR
- MAINTENANCE
securitySchemes:
BearerAuth:
type: http
scheme: bearer
bearerFormat: JWT
security:
- BearerAuth: []

View File

@@ -0,0 +1,547 @@
openapi: 3.0.1
info:
title: Operator Market Service API
description: |
算子市场服务API,提供算子的发布、管理和订阅功能。
主要功能:
- 算子发布和管理
- 算子版本控制
- 算子评分和评论
- 算子分类和标签
- 算子下载和安装
version: 1.0.0
tags:
- name: Operator
- name: Category
- name: Label
paths:
/operators/list:
post:
summary: 获取算子列表
deprecated: false
description: 分页查询算子列表,支持按分类、标签等条件筛选
tags:
- Operator
parameters: []
requestBody:
content:
application/json:
schema:
type: object
properties:
page:
type: integer
description: 页数
size:
type: integer
description: 单页数量
categories:
type: array
items:
type: integer
description: 分类id列表
operatorName:
type: string
description: 算子名称
labelName:
type: string
description: 标签名称
isStar:
type: boolean
description: 是否收藏
required:
- page
- size
- categories
examples: {}
responses:
'200':
description: 成功返回算子列表
content:
application/json:
schema:
type: array
items:
$ref: '#/components/schemas/OperatorResponse'
headers: {}
security: []
/operators/create:
post:
summary: 创建新算子
deprecated: false
description: 创建并发布一个新的算子
tags:
- Operator
parameters: []
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/CreateOperatorRequest'
example: null
responses:
'201':
description: 算子创建成功
content:
application/json:
schema: &ref_0
$ref: '#/components/schemas/OperatorResponse'
headers: {}
security: []
/operators/upload:
post:
summary: 上传新算子
deprecated: false
description: 上传算子文件,创建并发布一个新的算子
tags:
- Operator
parameters: []
requestBody:
content:
multipart/form-data:
schema:
type: object
properties:
file:
type: string
format: binary
example: ''
description:
type: string
example: ''
examples: {}
responses:
'201':
description: 算子创建成功
content:
application/json:
schema: *ref_0
headers: {}
security: []
/operators/{id}:
get:
summary: 获取算子详情
deprecated: false
description: 根据ID获取算子的详细信息
tags:
- Operator
parameters:
- name: id
in: path
description: 算子ID
required: true
example: ''
schema:
type: string
responses:
'200':
description: 成功返回算子详情
content:
application/json:
schema: *ref_0
headers: {}
'404':
description: 算子不存在
content:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
headers: {}
security: []
put:
summary: 更新算子信息
deprecated: false
description: 根据ID更新算子信息
tags:
- Operator
parameters:
- name: id
in: path
description: 算子ID
required: true
example: ''
schema:
type: string
requestBody:
content:
application/json:
schema:
$ref: '#/components/schemas/UpdateOperatorRequest'
example: null
responses:
'200':
description: 算子更新成功
content:
application/json:
schema: *ref_0
headers: {}
security: []
/category:
post:
summary: 创建算子分类
deprecated: false
description: ''
tags:
- Category
parameters: []
requestBody:
content:
application/json:
schema:
type: object
properties:
name:
type: string
description: 名称
parentId:
type: integer
description: 父分类id
required:
- name
- parentId
responses:
'201':
description: ''
headers: {}
security: []
delete:
summary: 删除算子分类
deprecated: false
description: ''
tags:
- Category
parameters: []
requestBody:
content:
application/json:
schema:
type: object
properties:
id:
type: integer
description: ID 编号
required:
- id
responses:
'204':
description: ''
headers: {}
security: []
/categories/tree:
get:
summary: 获取算子分类列表
deprecated: false
description: 获取所有可用的算子分类
tags:
- Category
parameters: []
responses:
'200':
description: 成功返回分类列表
content:
application/json:
schema:
type: array
items:
type: object
properties:
id:
type: integer
name:
type: string
count:
type: integer
categories:
$ref: '#/components/schemas/CategoryResponse'
required:
- id
- name
- count
- categories
headers: {}
security: []
/labels:
get:
summary: 获取算子标签列表
deprecated: false
description: 获取所有算子的标签
tags:
- Label
parameters:
- name: page
in: query
description: 页码,从0开始
required: false
schema:
type: integer
default: 0
- name: size
in: query
description: 每页大小
required: false
schema:
type: integer
default: 20
- name: keyword
in: query
description: 关键词搜索
required: false
schema:
type: string
responses:
'200':
description: 成功返回标签列表
content:
application/json:
schema:
type: array
items:
$ref: '#/components/schemas/LabelResponse'
headers: {}
security: []
post:
summary: 创建标签
deprecated: false
description: 批量创建标签
tags:
- Label
parameters: []
requestBody:
content:
application/json:
schema:
type: object
properties:
name:
type: string
description: 名称
required:
- name
example: veniam
responses:
'201':
description: 创建成功
headers: {}
security: []
delete:
summary: 删除标签
deprecated: false
description: 批量删除标签
tags:
- Label
parameters: []
requestBody:
content:
application/json:
schema:
type: array
items:
type: integer
format: int64
description: 标签id列表
example: null
responses:
'204':
description: 删除成功
headers: {}
security: []
/labels/{id}:
put:
summary: 更新标签
deprecated: false
description: 更新标签
tags:
- Label
parameters:
- name: id
in: path
description: 标签ID
required: true
example: ''
schema:
type: string
requestBody:
content:
application/json:
schema:
type: array
items:
$ref: '#/components/schemas/UpdateLabelRequest'
example: null
responses:
'200':
description: 更新成功
headers: {}
security: []
components:
schemas:
UpdateLabelRequest:
type: object
required:
- id
- name
properties:
id:
type: integer
description: 标签id
name:
type: string
description: 标签名称
Response:
type: object
properties:
code:
type: string
message:
type: string
data:
type: object
properties: {}
required:
- code
- message
- data
LabelResponse:
type: object
properties:
id:
type: string
description: 标签ID
name:
type: string
description: 标签名称
SubCategory:
type: object
properties:
id:
type: integer
description: 分类id
name:
type: string
description: 分类名称
count:
type: integer
type:
type: string
description: 分类类型(0:预置,1:自定义)
parentId:
type: integer
description: 父分类id
required:
- id
- name
- type
- parentId
- count
CategoryResponse:
type: array
items:
$ref: '#/components/schemas/SubCategory'
UpdateOperatorRequest:
type: object
properties:
name:
type: string
description: 算子名称
description:
type: string
description: 算子描述
version:
type: string
description: 算子版本
category:
type: string
description: 算子分类
documentation:
type: string
description: 文档内容
ErrorResponse:
type: object
properties:
error:
type: string
description: 错误代码
message:
type: string
description: 错误信息
timestamp:
type: string
format: date-time
description: 错误时间
OperatorResponse:
type: object
properties:
id:
type: string
description: 算子ID
name:
type: string
description: 算子名称
description:
type: string
description: 算子描述
version:
type: string
description: 算子版本
inputs:
type: string
description: 输入类型
outputs:
type: string
description: 输出类型
categories:
type: array
description: 算子分类列表
items:
type: integer
runtime:
type: string
description: 运行时设置
settings:
type: string
description: 算子参数
isStar:
type: boolean
description: 是否收藏
createdAt:
type: string
format: date-time
description: 创建时间
updatedAt:
type: string
format: date-time
description: 更新时间
required:
- language
- modal
- inputs
- outputs
- runtime
- settings
- isStar
CreateOperatorRequest:
type: object
required:
- name
- description
- version
- category
properties:
name:
type: string
description: 算子名称
description:
type: string
description: 算子描述
version:
type: string
description: 算子版本
category:
type: string
description: 算子分类
documentation:
type: string
description: 文档内容
securitySchemes: {}
servers: []

View File

@@ -0,0 +1,639 @@
openapi: 3.0.3
info:
title: Pipeline Orchestration Service API
description: 流程编排服务API - 可视化、模板、执行计划
version: 1.0.0
contact:
name: Data Mate Platform Team
servers:
- url: http://localhost:8087
description: Development server
tags:
- name: pipelines
description: 流水线管理
- name: pipeline-templates
description: 流水线模板
- name: executions
description: 执行管理
- name: workflows
description: 工作流编排
paths:
/api/v1/pipelines:
get:
tags:
- pipelines
summary: 获取流水线列表
parameters:
- name: page
in: query
schema:
type: integer
default: 0
- name: size
in: query
schema:
type: integer
default: 20
- name: status
in: query
schema:
$ref: '#/components/schemas/PipelineStatus'
responses:
'200':
description: 获取成功
content:
application/json:
schema:
$ref: '#/components/schemas/PipelinePageResponse'
post:
tags:
- pipelines
summary: 创建流水线
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/CreatePipelineRequest'
responses:
'201':
description: 创建成功
content:
application/json:
schema:
$ref: '#/components/schemas/PipelineResponse'
/api/v1/pipelines/{pipelineId}:
get:
tags:
- pipelines
summary: 获取流水线详情
parameters:
- name: pipelineId
in: path
required: true
schema:
type: string
responses:
'200':
description: 获取成功
content:
application/json:
schema:
$ref: '#/components/schemas/PipelineDetailResponse'
put:
tags:
- pipelines
summary: 更新流水线
parameters:
- name: pipelineId
in: path
required: true
schema:
type: string
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/UpdatePipelineRequest'
responses:
'200':
description: 更新成功
/api/v1/pipelines/{pipelineId}/execute:
post:
tags:
- executions
summary: 执行流水线
parameters:
- name: pipelineId
in: path
required: true
schema:
type: string
requestBody:
required: false
content:
application/json:
schema:
$ref: '#/components/schemas/ExecutePipelineRequest'
responses:
'200':
description: 执行开始
content:
application/json:
schema:
$ref: '#/components/schemas/PipelineExecutionResponse'
/api/v1/executions:
get:
tags:
- executions
summary: 获取执行历史
parameters:
- name: pipelineId
in: query
schema:
type: string
- name: status
in: query
schema:
$ref: '#/components/schemas/ExecutionStatus'
- name: page
in: query
schema:
type: integer
default: 0
responses:
'200':
description: 获取成功
content:
application/json:
schema:
$ref: '#/components/schemas/ExecutionPageResponse'
/api/v1/executions/{executionId}:
get:
tags:
- executions
summary: 获取执行详情
parameters:
- name: executionId
in: path
required: true
schema:
type: string
responses:
'200':
description: 获取成功
content:
application/json:
schema:
$ref: '#/components/schemas/ExecutionDetailResponse'
/api/v1/executions/{executionId}/stop:
post:
tags:
- executions
summary: 停止执行
parameters:
- name: executionId
in: path
required: true
schema:
type: string
responses:
'200':
description: 停止成功
/api/v1/templates:
get:
tags:
- pipeline-templates
summary: 获取模板列表
parameters:
- name: category
in: query
schema:
type: string
responses:
'200':
description: 获取成功
content:
application/json:
schema:
type: array
items:
$ref: '#/components/schemas/PipelineTemplateResponse'
post:
tags:
- pipeline-templates
summary: 创建模板
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/CreatePipelineTemplateRequest'
responses:
'201':
description: 创建成功
components:
schemas:
PipelineResponse:
type: object
properties:
id:
type: string
name:
type: string
description:
type: string
status:
$ref: '#/components/schemas/PipelineStatus'
version:
type: string
category:
type: string
tags:
type: array
items:
type: string
createdBy:
type: string
createdAt:
type: string
format: date-time
lastModified:
type: string
format: date-time
PipelineDetailResponse:
allOf:
- $ref: '#/components/schemas/PipelineResponse'
- type: object
properties:
definition:
$ref: '#/components/schemas/PipelineDefinition'
parameters:
type: array
items:
$ref: '#/components/schemas/PipelineParameter'
dependencies:
type: array
items:
type: string
statistics:
$ref: '#/components/schemas/PipelineStatistics'
PipelinePageResponse:
type: object
properties:
content:
type: array
items:
$ref: '#/components/schemas/PipelineResponse'
totalElements:
type: integer
format: int64
totalPages:
type: integer
size:
type: integer
number:
type: integer
CreatePipelineRequest:
type: object
required:
- name
- definition
properties:
name:
type: string
description:
type: string
category:
type: string
definition:
$ref: '#/components/schemas/PipelineDefinition'
parameters:
type: array
items:
$ref: '#/components/schemas/PipelineParameter'
tags:
type: array
items:
type: string
UpdatePipelineRequest:
type: object
properties:
name:
type: string
description:
type: string
definition:
$ref: '#/components/schemas/PipelineDefinition'
status:
$ref: '#/components/schemas/PipelineStatus'
ExecutePipelineRequest:
type: object
properties:
parameters:
type: object
description: 执行参数
environment:
type: string
description: 执行环境
priority:
type: integer
description: 优先级
PipelineExecutionResponse:
type: object
properties:
executionId:
type: string
pipelineId:
type: string
status:
$ref: '#/components/schemas/ExecutionStatus'
startTime:
type: string
format: date-time
message:
type: string
ExecutionResponse:
type: object
properties:
id:
type: string
pipelineId:
type: string
pipelineName:
type: string
status:
$ref: '#/components/schemas/ExecutionStatus'
progress:
type: number
format: double
startTime:
type: string
format: date-time
endTime:
type: string
format: date-time
duration:
type: integer
format: int64
description: 执行时长(毫秒)
ExecutionDetailResponse:
allOf:
- $ref: '#/components/schemas/ExecutionResponse'
- type: object
properties:
steps:
type: array
items:
$ref: '#/components/schemas/ExecutionStep'
logs:
type: array
items:
$ref: '#/components/schemas/ExecutionLog'
metrics:
$ref: '#/components/schemas/ExecutionMetrics'
artifacts:
type: array
items:
$ref: '#/components/schemas/ExecutionArtifact'
ExecutionPageResponse:
type: object
properties:
content:
type: array
items:
$ref: '#/components/schemas/ExecutionResponse'
totalElements:
type: integer
format: int64
totalPages:
type: integer
size:
type: integer
number:
type: integer
PipelineTemplateResponse:
type: object
properties:
id:
type: string
name:
type: string
description:
type: string
category:
type: string
version:
type: string
definition:
$ref: '#/components/schemas/PipelineDefinition'
usageCount:
type: integer
createdAt:
type: string
format: date-time
CreatePipelineTemplateRequest:
type: object
required:
- name
- definition
properties:
name:
type: string
description:
type: string
category:
type: string
definition:
$ref: '#/components/schemas/PipelineDefinition'
PipelineDefinition:
type: object
properties:
nodes:
type: array
items:
$ref: '#/components/schemas/PipelineNode'
edges:
type: array
items:
$ref: '#/components/schemas/PipelineEdge'
settings:
type: object
description: 流水线设置
PipelineNode:
type: object
properties:
id:
type: string
type:
type: string
enum: [OPERATOR, CONDITION, LOOP, PARALLEL]
name:
type: string
operatorId:
type: string
configuration:
type: object
position:
$ref: '#/components/schemas/NodePosition'
PipelineEdge:
type: object
properties:
id:
type: string
source:
type: string
target:
type: string
condition:
type: string
type:
type: string
enum: [SUCCESS, FAILURE, ALWAYS]
NodePosition:
type: object
properties:
x:
type: number
y:
type: number
PipelineParameter:
type: object
properties:
name:
type: string
type:
type: string
required:
type: boolean
defaultValue:
type: string
description:
type: string
PipelineStatistics:
type: object
properties:
totalExecutions:
type: integer
successfulExecutions:
type: integer
failedExecutions:
type: integer
averageDuration:
type: number
format: double
lastExecutionTime:
type: string
format: date-time
ExecutionStep:
type: object
properties:
id:
type: string
nodeId:
type: string
name:
type: string
status:
$ref: '#/components/schemas/ExecutionStatus'
startTime:
type: string
format: date-time
endTime:
type: string
format: date-time
duration:
type: integer
format: int64
message:
type: string
ExecutionLog:
type: object
properties:
timestamp:
type: string
format: date-time
level:
type: string
enum: [DEBUG, INFO, WARN, ERROR]
nodeId:
type: string
message:
type: string
ExecutionMetrics:
type: object
properties:
totalNodes:
type: integer
completedNodes:
type: integer
failedNodes:
type: integer
cpuUsage:
type: number
format: double
memoryUsage:
type: number
format: double
throughput:
type: number
format: double
ExecutionArtifact:
type: object
properties:
id:
type: string
name:
type: string
type:
type: string
size:
type: integer
format: int64
path:
type: string
createdAt:
type: string
format: date-time
PipelineStatus:
type: string
enum:
- DRAFT
- ACTIVE
- INACTIVE
- DEPRECATED
ExecutionStatus:
type: string
enum:
- PENDING
- RUNNING
- SUCCESS
- FAILED
- CANCELLED
- SKIPPED
securitySchemes:
BearerAuth:
type: http
scheme: bearer
bearerFormat: JWT
security:
- BearerAuth: []

212
backend/pom.xml Normal file
View File

@@ -0,0 +1,212 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<version>1.0.0-SNAPSHOT</version>
<packaging>pom</packaging>
<name>DataMatePlatform</name>
<description>一站式数据工作平台,面向模型微调与RAG检索</description>
<properties>
<maven.compiler.source>21</maven.compiler.source>
<maven.compiler.target>21</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<spring-boot.version>3.5.6</spring-boot.version>
<spring-cloud.version>2025.0.0</spring-cloud.version>
<mysql.version>8.0.33</mysql.version>
<postgresql.version>42.6.0</postgresql.version>
<redis.version>3.2.0</redis.version>
<elasticsearch.version>8.11.0</elasticsearch.version>
<junit.version>5.10.0</junit.version>
<springdoc.version>2.2.0</springdoc.version>
<jackson-databind-nullable.version>0.2.6</jackson-databind-nullable.version>
<jakarta-validation.version>3.0.2</jakarta-validation.version>
<jakarta.persistence.version>3.1.0</jakarta.persistence.version>
<maven-assembly-plugin.version>3.3.0</maven-assembly-plugin.version>
<mybatis-plus.version>3.5.14</mybatis-plus.version>
<mapstruct.version>1.6.3</mapstruct.version>
<lombok.version>1.18.32</lombok.version>
<lombok-mapstruct-binding.version>0.2.0</lombok-mapstruct-binding.version>
<poi.version>5.4.0</poi.version>
<log4j2.version>2.21.1</log4j2.version>
</properties>
<modules>
<!-- 共享库 -->
<module>shared/domain-common</module>
<module>shared/security-common</module>
<!-- 核心服务 -->
<module>services/data-management-service</module>
<module>services/data-collection-service</module>
<module>services/operator-market-service</module>
<module>services/data-cleaning-service</module>
<module>services/data-synthesis-service</module>
<module>services/data-annotation-service</module>
<module>services/data-evaluation-service</module>
<module>services/pipeline-orchestration-service</module>
<module>services/execution-engine-service</module>
<!-- RAG服务 -->
<module>services/rag-indexer-service</module>
<module>services/rag-query-service</module>
<!-- 主启动模块 -->
<module>services/main-application</module>
<!-- API Gateway微服务 -->
<module>api-gateway</module>
</modules>
<dependencyManagement>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-dependencies</artifactId>
<version>${spring-boot.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-dependencies</artifactId>
<version>${spring-cloud.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
<!-- OpenAPI相关依赖版本管理 -->
<dependency>
<groupId>org.springdoc</groupId>
<artifactId>springdoc-openapi-starter-webmvc-ui</artifactId>
<version>${springdoc.version}</version>
</dependency>
<dependency>
<groupId>org.openapitools</groupId>
<artifactId>jackson-databind-nullable</artifactId>
<version>${jackson-databind-nullable.version}</version>
</dependency>
<dependency>
<groupId>jakarta.validation</groupId>
<artifactId>jakarta.validation-api</artifactId>
<version>${jakarta-validation.version}</version>
</dependency>
<dependency>
<groupId>jakarta.persistence</groupId>
<artifactId>jakarta.persistence-api</artifactId>
<version>${jakarta.persistence.version}</version>
</dependency>
<!-- MyBatis version alignment -->
<dependency>
<groupId>com.baomidou</groupId>
<artifactId>mybatis-plus-bom</artifactId>
<version>${mybatis-plus.version}</version>
<type>pom</type>
<scope>import</scope>
</dependency>
<dependency>
<groupId>org.mapstruct</groupId>
<artifactId>mapstruct</artifactId>
<version>${mapstruct.version}</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>${lombok.version}</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>${mysql.version}</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>${poi.version}</version>
</dependency>
</dependencies>
</dependencyManagement>
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
<version>${spring-boot.version}</version>
<exclusions>
<exclusion>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-logging</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
<version>${spring-boot.version}</version>
<exclusions>
<exclusion>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-logging</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>com.baomidou</groupId>
<artifactId>mybatis-plus-spring-boot3-starter</artifactId>
<version>${mybatis-plus.version}</version>
</dependency>
<dependency>
<groupId>com.baomidou</groupId>
<artifactId>mybatis-plus-jsqlparser</artifactId>
</dependency>
<!-- Log4j2 API -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-log4j2</artifactId>
<version>${spring-boot.version}</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>${lombok.version}</version>
</dependency>
<dependency>
<groupId>org.mapstruct</groupId>
<artifactId>mapstruct</artifactId>
<version>${mapstruct.version}</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>${poi.version}</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<version>${spring-boot.version}</version>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,101 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<artifactId>data-annotation-service</artifactId>
<name>Data Annotation Service</name>
<description>数据标注服务</description>
<dependencies>
<dependency>
<groupId>com.datamate</groupId>
<artifactId>domain-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-websocket</artifactId>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>${mysql.version}</version>
</dependency>
<!-- OpenAPI Dependencies -->
<dependency>
<groupId>org.springdoc</groupId>
<artifactId>springdoc-openapi-starter-webmvc-ui</artifactId>
<version>2.0.4</version>
</dependency>
<dependency>
<groupId>org.openapitools</groupId>
<artifactId>jackson-databind-nullable</artifactId>
<version>0.2.6</version>
</dependency>
<dependency>
<groupId>jakarta.validation</groupId>
<artifactId>jakarta.validation-api</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
<!-- OpenAPI Generator Plugin -->
<plugin>
<groupId>org.openapitools</groupId>
<artifactId>openapi-generator-maven-plugin</artifactId>
<version>6.6.0</version>
<executions>
<execution>
<goals>
<goal>generate</goal>
</goals>
<configuration>
<inputSpec>${project.basedir}/../../openapi/specs/data-annotation.yaml</inputSpec>
<generatorName>spring</generatorName>
<output>${project.build.directory}/generated-sources/openapi</output>
<apiPackage>com.datamate.annotation.interfaces.api</apiPackage>
<modelPackage>com.datamate.annotation.interfaces.dto</modelPackage>
<configOptions>
<interfaceOnly>true</interfaceOnly>
<useTags>true</useTags>
<skipDefaultInterface>true</skipDefaultInterface>
<hideGenerationTimestamp>true</hideGenerationTimestamp>
<java8>true</java8>
<dateLibrary>java8</dateLibrary>
<useBeanValidation>true</useBeanValidation>
<performBeanValidation>true</performBeanValidation>
<useSpringBoot3>true</useSpringBoot3>
<documentationProvider>springdoc</documentationProvider>
</configOptions>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

Binary file not shown.

After

Width:  |  Height:  |  Size: 134 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 91 KiB

View File

@@ -0,0 +1,87 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<artifactId>data-cleaning-service</artifactId>
<name>Data Cleaning Service</name>
<description>数据清洗服务</description>
<dependencies>
<dependency>
<groupId>com.datamate</groupId>
<artifactId>domain-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springdoc</groupId>
<artifactId>springdoc-openapi-starter-webmvc-ui</artifactId>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</dependency>
<dependency>
<groupId>org.openapitools</groupId>
<artifactId>jackson-databind-nullable</artifactId>
</dependency>
<dependency>
<groupId>com.baomidou</groupId>
<artifactId>mybatis-plus-spring-boot3-starter</artifactId>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-compress</artifactId>
<version>1.26.1</version>
</dependency>
<dependency>
<groupId>org.mapstruct</groupId>
<artifactId>mapstruct</artifactId>
</dependency>
<dependency>
<groupId>org.mapstruct</groupId>
<artifactId>mapstruct-processor</artifactId>
<version>${mapstruct.version}</version>
<scope>provided</scope> <!-- 编译时需要,运行时不需要 -->
</dependency>
<dependency>
<groupId>org.springframework.data</groupId>
<artifactId>spring-data-commons</artifactId>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,22 @@
package com.datamate.cleaning;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.scheduling.annotation.EnableAsync;
import org.springframework.scheduling.annotation.EnableScheduling;
/**
 * Spring configuration entry point for the data cleaning service module.
 *
 * <p>Turns on asynchronous method execution ({@code @EnableAsync}) and scheduled
 * tasks ({@code @EnableScheduling}), and component-scans the
 * {@code com.datamate.cleaning} and {@code com.datamate.shared} packages.
 *
 * <p>NOTE(review): other classes in this module import shared code from
 * {@code com.datamate.common.*}; confirm that {@code com.datamate.shared} is the
 * intended scan root.
 */
@SpringBootApplication
@EnableAsync
@EnableScheduling
@ComponentScan(basePackages = {
"com.datamate.cleaning",
"com.datamate.shared"
})
public class DataCleaningServiceConfiguration {
// Configuration class for JAR packaging - no main method needed
}

View File

@@ -0,0 +1,120 @@
package com.datamate.cleaning.application.httpclient;
import com.datamate.cleaning.domain.model.CreateDatasetRequest;
import com.datamate.cleaning.domain.model.DatasetResponse;
import com.datamate.cleaning.domain.model.PagedDatasetFileResponse;
import com.datamate.common.infrastructure.exception.BusinessException;
import com.datamate.common.infrastructure.exception.ErrorCodeImpl;
import com.datamate.common.infrastructure.exception.SystemErrorCode;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
import lombok.extern.slf4j.Slf4j;
import org.springframework.data.domain.PageRequest;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.text.MessageFormat;
import java.time.Duration;
import java.util.Map;
import java.util.stream.Collectors;
@Slf4j
public class DatasetClient {
private static final String BASE_URL = "http://localhost:8080/api";
private static final String CREATE_DATASET_URL = BASE_URL + "/data-management/datasets";
private static final String GET_DATASET_URL = BASE_URL + "/data-management/datasets/{0}";
private static final String GET_DATASET_FILE_URL = BASE_URL + "/data-management/datasets/{0}/files";
private static final HttpClient CLIENT = HttpClient.newBuilder().connectTimeout(Duration.ofSeconds(10)).build();
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
static {
OBJECT_MAPPER.registerModule(new JavaTimeModule());
}
public static DatasetResponse createDataset(String name, String type) {
CreateDatasetRequest createDatasetRequest = new CreateDatasetRequest();
createDatasetRequest.setName(name);
createDatasetRequest.setDatasetType(type);
String jsonPayload;
try {
jsonPayload = OBJECT_MAPPER.writeValueAsString(createDatasetRequest);
} catch (IOException e) {
log.error("Error occurred while converting the object.", e);
throw BusinessException.of(SystemErrorCode.UNKNOWN_ERROR);
}
HttpRequest request = HttpRequest.newBuilder()
.uri(URI.create(CREATE_DATASET_URL))
.timeout(Duration.ofSeconds(30))
.header("Content-Type", "application/json")
.POST(HttpRequest.BodyPublishers.ofString(jsonPayload))
.build();
return sendAndReturn(request, DatasetResponse.class);
}
public static DatasetResponse getDataset(String datasetId) {
HttpRequest request = HttpRequest.newBuilder()
.uri(URI.create(MessageFormat.format(GET_DATASET_URL, datasetId)))
.timeout(Duration.ofSeconds(30))
.header("Content-Type", "application/json")
.GET()
.build();
return sendAndReturn(request, DatasetResponse.class);
}
public static PagedDatasetFileResponse getDatasetFile(String datasetId, PageRequest page) {
String url = buildQueryParams(MessageFormat.format(GET_DATASET_FILE_URL, datasetId),
Map.of("page", page.getPageNumber(), "size", page.getPageSize()));
HttpRequest request = HttpRequest.newBuilder()
.uri(URI.create(url))
.timeout(Duration.ofSeconds(30))
.header("Content-Type", "application/json")
.GET()
.build();
return sendAndReturn(request, PagedDatasetFileResponse.class);
}
private static <T> T sendAndReturn(HttpRequest request, Class<T> clazz) {
try {
HttpResponse<String> response = CLIENT.send(request, HttpResponse.BodyHandlers.ofString());
int statusCode = response.statusCode();
String responseBody = response.body();
JsonNode jsonNode = OBJECT_MAPPER.readTree(responseBody);
if (statusCode < 200 || statusCode >= 300) {
String code = jsonNode.get("code").asText();
String message = jsonNode.get("message").asText();
throw BusinessException.of(ErrorCodeImpl.of(code, message));
}
return OBJECT_MAPPER.treeToValue(jsonNode.get("data"), clazz);
} catch (IOException | InterruptedException e) {
log.error("Error occurred while making the request.", e);
throw BusinessException.of(SystemErrorCode.UNKNOWN_ERROR);
}
}
/**
 * Appends {@code params} to {@code baseUrl} as a URL query string.
 *
 * <p>Each entry is rendered as {@code key=value} (both parts URL-encoded as UTF-8) and the
 * entries are joined with {@code &}. The query is attached with {@code ?}, or with {@code &}
 * when {@code baseUrl} already contains a {@code ?}.
 *
 * @param baseUrl URL to extend
 * @param params  query parameters; {@code null} or empty returns {@code baseUrl} unchanged
 * @return the URL including the query string
 */
private static String buildQueryParams(String baseUrl, Map<String, Object> params) {
    if (params == null || params.isEmpty()) {
        return baseUrl;
    }
    String queryString = params.entrySet().stream()
            // The "=" separator was missing before, which produced e.g. "page0&size500".
            .map(entry -> java.net.URLEncoder.encode(entry.getKey(), java.nio.charset.StandardCharsets.UTF_8)
                    + "="
                    + java.net.URLEncoder.encode(String.valueOf(entry.getValue()),
                            java.nio.charset.StandardCharsets.UTF_8))
            .collect(Collectors.joining("&"));
    return baseUrl + (baseUrl.contains("?") ? "&" : "?") + queryString;
}
}

View File

@@ -0,0 +1,54 @@
package com.datamate.cleaning.application.httpclient;
import com.datamate.common.infrastructure.exception.BusinessException;
import com.datamate.common.infrastructure.exception.SystemErrorCode;
import lombok.extern.slf4j.Slf4j;
import java.io.IOException;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.text.MessageFormat;
import java.time.Duration;
/**
 * Minimal HTTP client for the runtime service's task endpoints.
 *
 * <p>All calls are body-less POST requests; only the HTTP status code of the answer is inspected.
 */
@Slf4j
public class RuntimeClient {
    private static final String BASE_URL = "http://runtime:8081/api";

    private static final String CREATE_TASK_URL = BASE_URL + "/task/{0}/submit";

    private static final String STOP_TASK_URL = BASE_URL + "/task/{0}/stop";

    private static final HttpClient CLIENT = HttpClient.newBuilder().connectTimeout(Duration.ofSeconds(10)).build();

    /**
     * Asks the runtime to start the given task.
     *
     * @param taskId id of the task to submit
     * @throws BusinessException when the call fails or returns a non-2xx status
     */
    public static void submitTask(String taskId) {
        send(MessageFormat.format(CREATE_TASK_URL, taskId));
    }

    /**
     * Asks the runtime to stop the given task.
     *
     * @param taskId id of the task to stop
     * @throws BusinessException when the call fails or returns a non-2xx status
     */
    public static void stopTask(String taskId) {
        send(MessageFormat.format(STOP_TASK_URL, taskId));
    }

    /**
     * POSTs an empty body to {@code url} and fails unless the status code is 2xx.
     *
     * @param url fully formatted endpoint URL
     */
    private static void send(String url) {
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(url))
                .timeout(Duration.ofSeconds(30))
                .header("Content-Type", "application/json")
                .POST(HttpRequest.BodyPublishers.noBody())
                .build();
        try {
            HttpResponse<String> response = CLIENT.send(request, HttpResponse.BodyHandlers.ofString());
            int statusCode = response.statusCode();
            if (statusCode < 200 || statusCode >= 300) {
                log.error("Request failed with status code: {}", statusCode);
                throw BusinessException.of(SystemErrorCode.SYSTEM_BUSY);
            }
        } catch (IOException e) {
            log.error("Error occurred while making the request.", e);
            throw BusinessException.of(SystemErrorCode.UNKNOWN_ERROR);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the interruption is not silently lost.
            Thread.currentThread().interrupt();
            log.error("Error occurred while making the request.", e);
            throw BusinessException.of(SystemErrorCode.UNKNOWN_ERROR);
        }
    }
}

View File

@@ -0,0 +1,40 @@
package com.datamate.cleaning.application.scheduler;
import com.datamate.cleaning.application.httpclient.RuntimeClient;
import com.datamate.cleaning.infrastructure.persistence.mapper.CleaningTaskMapper;
import com.datamate.cleaning.interfaces.dto.CleaningTask;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
import java.time.LocalDateTime;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
 * Dispatches cleaning tasks to the runtime service and keeps the stored task status in step.
 */
@Service
@RequiredArgsConstructor
public class CleaningTaskScheduler {
    private final CleaningTaskMapper cleaningTaskMapper;

    /** Worker pool that performs the runtime submission off the caller's thread. */
    private final ExecutorService taskExecutor = Executors.newFixedThreadPool(5);

    /**
     * Queues the task for asynchronous submission to the runtime.
     *
     * @param taskId id of the task to execute
     */
    public void executeTask(String taskId) {
        taskExecutor.submit(() -> submitTask(taskId));
    }

    /**
     * Marks the task as RUNNING and submits it to the runtime.
     *
     * <p>This runs on the worker pool and the resulting Future is never inspected, so a failed
     * submission would otherwise leave the task RUNNING forever. On failure the status is
     * therefore set to FAILED before the exception is rethrown.
     */
    private void submitTask(String taskId) {
        CleaningTask task = new CleaningTask();
        task.setId(taskId);
        task.setStatus(CleaningTask.StatusEnum.RUNNING);
        task.setStartedAt(LocalDateTime.now());
        cleaningTaskMapper.updateTask(task);
        try {
            RuntimeClient.submitTask(taskId);
        } catch (RuntimeException e) {
            updateStatus(taskId, CleaningTask.StatusEnum.FAILED);
            throw e;
        }
    }

    /**
     * Asks the runtime to stop the task and, once that succeeded, stores the STOPPED status.
     *
     * @param taskId id of the task to stop
     */
    public void stopTask(String taskId) {
        RuntimeClient.stopTask(taskId);
        updateStatus(taskId, CleaningTask.StatusEnum.STOPPED);
    }

    /** Persists only a status change for the given task. */
    private void updateStatus(String taskId, CleaningTask.StatusEnum status) {
        CleaningTask task = new CleaningTask();
        task.setId(taskId);
        task.setStatus(status);
        cleaningTaskMapper.updateTask(task);
    }
}

View File

@@ -0,0 +1,186 @@
package com.datamate.cleaning.application.service;
import com.datamate.cleaning.application.httpclient.DatasetClient;
import com.datamate.cleaning.application.scheduler.CleaningTaskScheduler;
import com.datamate.cleaning.domain.converter.OperatorInstanceConverter;
import com.datamate.cleaning.domain.model.DatasetResponse;
import com.datamate.cleaning.domain.model.ExecutorType;
import com.datamate.cleaning.domain.model.OperatorInstancePo;
import com.datamate.cleaning.domain.model.PagedDatasetFileResponse;
import com.datamate.cleaning.domain.model.TaskProcess;
import com.datamate.cleaning.infrastructure.persistence.mapper.CleaningResultMapper;
import com.datamate.cleaning.infrastructure.persistence.mapper.CleaningTaskMapper;
import com.datamate.cleaning.infrastructure.persistence.mapper.OperatorInstanceMapper;
import com.datamate.cleaning.interfaces.dto.CleaningTask;
import com.datamate.cleaning.interfaces.dto.CreateCleaningTaskRequest;
import com.datamate.cleaning.interfaces.dto.OperatorInstance;
import com.datamate.common.infrastructure.exception.BusinessException;
import com.datamate.common.infrastructure.exception.SystemErrorCode;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.PropertyNamingStrategies;
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.data.domain.PageRequest;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.yaml.snakeyaml.DumperOptions;
import org.yaml.snakeyaml.Yaml;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.UUID;
/**
 * Application service for cleaning tasks: listing, creation (including preparation of the
 * per-task flow directory), deletion, execution and stopping.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class CleaningTaskService {
    /** Shared JSON serializer for the dataset.jsonl records. */
    private static final ObjectMapper JSON_MAPPER = new ObjectMapper();

    private final CleaningTaskMapper cleaningTaskMapper;

    private final OperatorInstanceMapper operatorInstanceMapper;

    private final CleaningResultMapper cleaningResultMapper;

    private final CleaningTaskScheduler taskScheduler;

    private final String DATASET_PATH = "/dataset";

    private final String FLOW_PATH = "/flow";

    /**
     * Returns one page of tasks matching the optional filters.
     *
     * @param status   status filter, may be {@code null}
     * @param keywords keyword filter, may be {@code null}
     * @param page     zero-based page index
     * @param size     page size
     * @return the tasks of the requested page
     */
    public List<CleaningTask> getTasks(String status, String keywords, Integer page, Integer size) {
        Integer offset = page * size;
        return cleaningTaskMapper.findTasks(status, keywords, size, offset);
    }

    /**
     * Counts the tasks matching the optional filters.
     *
     * <p>NOTE(review): this loads every matching row just to count them; a dedicated count
     * query in the mapper would be cheaper.
     */
    public int countTasks(String status, String keywords) {
        return cleaningTaskMapper.findTasks(status, keywords, null, null).size();
    }

    /**
     * Creates a cleaning task: creates the destination dataset, stores the task and its operator
     * instances, writes process.yaml and dataset.jsonl into the task's flow directory, and
     * finally hands the task to the scheduler.
     *
     * @param request task definition
     * @return the stored task (status PENDING)
     */
    @Transactional
    public CleaningTask createTask(CreateCleaningTaskRequest request) {
        DatasetResponse destDataset = DatasetClient.createDataset(request.getDestDatasetName(),
                request.getDestDatasetType());
        DatasetResponse srcDataset = DatasetClient.getDataset(request.getSrcDatasetId());

        CleaningTask task = new CleaningTask();
        task.setName(request.getName());
        task.setDescription(request.getDescription());
        task.setStatus(CleaningTask.StatusEnum.PENDING);
        String taskId = UUID.randomUUID().toString();
        task.setId(taskId);
        task.setSrcDatasetId(request.getSrcDatasetId());
        task.setSrcDatasetName(request.getSrcDatasetName());
        task.setDestDatasetId(destDataset.getId());
        task.setDestDatasetName(destDataset.getName());
        task.setBeforeSize(srcDataset.getTotalSize());
        cleaningTaskMapper.insertTask(task);

        List<OperatorInstancePo> instancePos = request.getInstance().stream()
                .map(OperatorInstanceConverter.INSTANCE::operatorToDo).toList();
        operatorInstanceMapper.insertInstance(taskId, instancePos);

        // prepareTask creates the flow directory that scanDataset writes into.
        prepareTask(task, request.getInstance());
        scanDataset(taskId, request.getSrcDatasetId());
        executeTask(taskId);
        return task;
    }

    /** Loads a task by id. */
    public CleaningTask getTask(String taskId) {
        return cleaningTaskMapper.findTaskById(taskId);
    }

    /** Deletes a task together with its operator instances and cleaning results. */
    @Transactional
    public void deleteTask(String taskId) {
        cleaningTaskMapper.deleteTask(taskId);
        operatorInstanceMapper.deleteByInstanceId(taskId);
        cleaningResultMapper.deleteByInstanceId(taskId);
    }

    /** Hands the task to the scheduler for execution. */
    public void executeTask(String taskId) {
        taskScheduler.executeTask(taskId);
    }

    /**
     * Writes {@code <FLOW_PATH>/<taskId>/process.yaml}, the snake_case YAML description of the
     * task (paths, executor type and the operator chain with its overrides).
     */
    private void prepareTask(CleaningTask task, List<OperatorInstance> instances) {
        TaskProcess process = new TaskProcess();
        process.setInstanceId(task.getId());
        process.setDatasetId(task.getDestDatasetId());
        process.setDatasetPath(FLOW_PATH + "/" + task.getId() + "/dataset.jsonl");
        process.setExportPath(DATASET_PATH + "/" + task.getDestDatasetId());
        process.setExecutorType(ExecutorType.DATA_PLATFORM.getValue());
        process.setProcess(instances.stream()
                .map(instance -> Map.of(instance.getId(), instance.getOverrides()))
                .toList());

        // Jackson converts the bean into a snake_case tree; SnakeYAML then dumps it in block style.
        ObjectMapper jsonMapper = new ObjectMapper(new YAMLFactory());
        jsonMapper.setPropertyNamingStrategy(PropertyNamingStrategies.SNAKE_CASE);
        JsonNode jsonNode = jsonMapper.valueToTree(process);

        DumperOptions options = new DumperOptions();
        options.setIndent(2);
        options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK);
        Yaml yaml = new Yaml(options);

        File file = new File(FLOW_PATH + "/" + process.getInstanceId() + "/process.yaml");
        file.getParentFile().mkdirs();
        try (FileWriter writer = new FileWriter(file)) {
            yaml.dump(jsonMapper.treeToValue(jsonNode, Map.class), writer);
        } catch (IOException e) {
            log.error("Failed to prepare process.yaml.", e);
            throw BusinessException.of(SystemErrorCode.FILE_SYSTEM_ERROR);
        }
    }

    /**
     * Pages through the source dataset's files and writes one JSON line per file into
     * {@code <FLOW_PATH>/<taskId>/dataset.jsonl}.
     *
     * <p>A fresh {@link PageRequest} is built for every iteration (the previous code reused the
     * page-0 request forever) and pages after the first are appended instead of overwriting
     * the file.
     */
    private void scanDataset(String taskId, String srcDatasetId) {
        String jsonlPath = FLOW_PATH + "/" + taskId + "/dataset.jsonl";
        int pageNumber = 0;
        int pageSize = 500;
        PagedDatasetFileResponse datasetFile;
        do {
            datasetFile = DatasetClient.getDatasetFile(srcDatasetId, PageRequest.of(pageNumber, pageSize));
            // Stop on a missing or empty page; a null content list used to cause an NPE below.
            if (datasetFile == null || datasetFile.getContent() == null || datasetFile.getContent().isEmpty()) {
                break;
            }
            // NOTE(review): Map.of rejects null values, so a file with a null
            // name/size/path/type/id still fails here; confirm these are never null.
            List<Map<String, Object>> files = datasetFile.getContent().stream()
                    .map(content -> Map.of("fileName", (Object) content.getFileName(),
                            "fileSize", content.getFileSize(),
                            "filePath", content.getFilePath(),
                            "fileType", content.getFileType(),
                            "fileId", content.getId()))
                    .toList();
            writeListMapToJsonlFile(files, jsonlPath, pageNumber > 0);
            pageNumber += 1;
        } while (datasetFile.getTotalPages() != null && pageNumber < datasetFile.getTotalPages());
    }

    /**
     * Writes the maps as JSON lines (newline-separated, no trailing newline).
     *
     * @param mapList  records to write, one line each
     * @param fileName target file
     * @param append   {@code false} to start a new file, {@code true} to continue an existing
     *                 one (a line break is emitted before the first appended record)
     */
    private void writeListMapToJsonlFile(List<Map<String, Object>> mapList, String fileName, boolean append) {
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(fileName, append))) {
            boolean needsSeparator = append;
            for (Map<String, Object> entry : mapList) {
                if (needsSeparator) {
                    writer.newLine();
                }
                writer.write(JSON_MAPPER.writeValueAsString(entry));
                needsSeparator = true;
            }
        } catch (IOException e) {
            log.error("Failed to prepare dataset.jsonl.", e);
            throw BusinessException.of(SystemErrorCode.FILE_SYSTEM_ERROR);
        }
    }

    /** Asks the scheduler to stop the task. */
    public void stopTask(String taskId) {
        taskScheduler.stopTask(taskId);
    }
}

View File

@@ -0,0 +1,95 @@
package com.datamate.cleaning.application.service;
import com.datamate.cleaning.domain.converter.OperatorInstanceConverter;
import com.datamate.cleaning.domain.model.OperatorInstancePo;
import com.datamate.cleaning.domain.model.TemplateWithInstance;
import com.datamate.cleaning.infrastructure.persistence.mapper.CleaningTemplateMapper;
import com.datamate.cleaning.infrastructure.persistence.mapper.OperatorInstanceMapper;
import com.datamate.cleaning.interfaces.dto.CleaningTemplate;
import com.datamate.cleaning.interfaces.dto.CreateCleaningTemplateRequest;
import com.datamate.cleaning.interfaces.dto.OperatorResponse;
import com.datamate.cleaning.interfaces.dto.UpdateCleaningTemplateRequest;
import lombok.RequiredArgsConstructor;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
 * Application service for cleaning templates and their ordered operator instances.
 */
@Service
@RequiredArgsConstructor
public class CleaningTemplateService {
    private final CleaningTemplateMapper cleaningTemplateMapper;

    private final OperatorInstanceMapper operatorInstanceMapper;

    /**
     * Lists templates (optionally filtered by keywords) with their operators resolved.
     *
     * <p>Each template receives its own copies of the operators, so a settings override of
     * one template never shows up in another template that uses the same operator.
     *
     * @param keywords keyword filter, may be {@code null}
     * @return the matching templates, in no particular order
     */
    public List<CleaningTemplate> getTemplates(String keywords) {
        Map<String, OperatorResponse> operatorsMap = cleaningTemplateMapper.findAllOperators().stream()
                .collect(Collectors.toMap(OperatorResponse::getId, Function.identity()));
        Map<String, List<TemplateWithInstance>> templatesMap =
                cleaningTemplateMapper.findAllTemplates(keywords).stream()
                        .collect(Collectors.groupingBy(TemplateWithInstance::getId));
        return templatesMap.entrySet().stream()
                .map(entry -> toTemplate(entry.getKey(), entry.getValue(), operatorsMap))
                .toList();
    }

    /** Builds one template from its joined rows (one row per operator, or one row without operator). */
    private static CleaningTemplate toTemplate(String templateId, List<TemplateWithInstance> rows,
            Map<String, OperatorResponse> operatorsMap) {
        TemplateWithInstance first = rows.get(0);
        CleaningTemplate template = new CleaningTemplate();
        template.setId(templateId);
        template.setName(first.getName());
        template.setDescription(first.getDescription());
        template.setInstance(rows.stream()
                .filter(row -> StringUtils.isNotBlank(row.getOperatorId()))
                .sorted(Comparator.comparingInt(TemplateWithInstance::getOpIndex))
                .map(row -> resolveOperator(row, operatorsMap))
                // Rows that reference an operator which no longer exists are skipped.
                .filter(operator -> operator != null)
                .toList());
        template.setCreatedAt(first.getCreatedAt());
        template.setUpdatedAt(first.getUpdatedAt());
        return template;
    }

    /**
     * Returns a private copy of the row's operator with the row's settings override applied,
     * or {@code null} when the operator id is unknown. Copying avoids mutating the instance
     * shared through {@code operatorsMap}.
     */
    private static OperatorResponse resolveOperator(TemplateWithInstance row,
            Map<String, OperatorResponse> operatorsMap) {
        OperatorResponse shared = operatorsMap.get(row.getOperatorId());
        if (shared == null) {
            return null;
        }
        OperatorResponse copy = new OperatorResponse();
        copy.setId(shared.getId());
        copy.setName(shared.getName());
        copy.setDescription(shared.getDescription());
        copy.setVersion(shared.getVersion());
        copy.setInputs(shared.getInputs());
        copy.setOutputs(shared.getOutputs());
        copy.setRuntime(shared.getRuntime());
        copy.setSettings(StringUtils.isNotBlank(row.getSettingsOverride())
                ? row.getSettingsOverride()
                : shared.getSettings());
        copy.setIsStar(shared.getIsStar());
        copy.setCreatedAt(shared.getCreatedAt());
        copy.setUpdatedAt(shared.getUpdatedAt());
        return copy;
    }

    /**
     * Stores a new template together with its operator instances.
     *
     * @param request template definition
     * @return the stored template (id, name and description populated)
     */
    @Transactional
    public CleaningTemplate createTemplate(CreateCleaningTemplateRequest request) {
        CleaningTemplate template = new CleaningTemplate();
        String templateId = UUID.randomUUID().toString();
        template.setId(templateId);
        template.setName(request.getName());
        template.setDescription(request.getDescription());
        cleaningTemplateMapper.insertTemplate(template);

        List<OperatorInstancePo> instancePos = request.getInstance().stream()
                .map(OperatorInstanceConverter.INSTANCE::operatorToDo).toList();
        operatorInstanceMapper.insertInstance(templateId, instancePos);
        return template;
    }

    /** Loads a template by id. */
    public CleaningTemplate getTemplate(String templateId) {
        return cleaningTemplateMapper.findTemplateById(templateId);
    }

    /**
     * Updates name and description of an existing template.
     *
     * <p>NOTE(review): the operator instances carried by the request are not persisted here;
     * confirm whether that is intended.
     *
     * @return the updated template, or {@code null} when the id is unknown
     */
    @Transactional
    public CleaningTemplate updateTemplate(String templateId, UpdateCleaningTemplateRequest request) {
        CleaningTemplate template = cleaningTemplateMapper.findTemplateById(templateId);
        if (template != null) {
            template.setName(request.getName());
            template.setDescription(request.getDescription());
            cleaningTemplateMapper.updateTemplate(template);
        }
        return template;
    }

    /** Deletes a template and its operator instances. */
    @Transactional
    public void deleteTemplate(String templateId) {
        cleaningTemplateMapper.deleteTemplate(templateId);
        operatorInstanceMapper.deleteByInstanceId(templateId);
    }
}

View File

@@ -0,0 +1,33 @@
package com.datamate.cleaning.domain.converter;
import com.datamate.cleaning.domain.model.OperatorInstancePo;
import com.datamate.cleaning.interfaces.dto.OperatorInstance;
import com.datamate.common.infrastructure.exception.BusinessException;
import com.datamate.common.infrastructure.exception.SystemErrorCode;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.mapstruct.Mapper;
import org.mapstruct.Mapping;
import org.mapstruct.Named;
import org.mapstruct.factory.Mappers;
import java.util.Map;
/**
 * MapStruct converter from the API-level {@link OperatorInstance} to its persistence form
 * {@link OperatorInstancePo}; the overrides map is stored as a JSON string.
 */
@Mapper
public interface OperatorInstanceConverter {
    OperatorInstanceConverter INSTANCE = Mappers.getMapper(OperatorInstanceConverter.class);

    /**
     * Converts an operator instance, serializing its overrides through {@code mapToJson}.
     *
     * @param instance API-level operator instance
     * @return the persistence object
     */
    @Mapping(source = "overrides", target = "overrides", qualifiedByName = "mapToJson")
    OperatorInstancePo operatorToDo(OperatorInstance instance);

    /**
     * Serializes the overrides map to JSON.
     *
     * @param objects overrides to serialize
     * @return the JSON text
     * @throws BusinessException with {@code UNKNOWN_ERROR} when serialization fails
     */
    @Named("mapToJson")
    static String mapToJson(Map<String, Object> objects) {
        try {
            return new ObjectMapper().writeValueAsString(objects);
        } catch (JsonProcessingException serializationFailure) {
            throw BusinessException.of(SystemErrorCode.UNKNOWN_ERROR);
        }
    }
}

View File

@@ -0,0 +1,26 @@
package com.datamate.cleaning.domain.model;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import java.util.List;
/**
 * Request payload for creating a dataset.
 */
@Getter
@Setter
@NoArgsConstructor
public class CreateDatasetRequest {
/** Dataset name. */
private String name;
/** Dataset description. */
private String description;
/** Dataset type. */
private String datasetType;
/** List of tags. */
private List<String> tags;
/** Data source. */
private String dataSource;
/** Target location. */
private String targetLocation;
}

View File

@@ -0,0 +1,36 @@
package com.datamate.cleaning.domain.model;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import java.time.LocalDateTime;
/**
 * One file entry of a dataset; the element type of {@code PagedDatasetFileResponse.content}.
 */
@Getter
@Setter
@NoArgsConstructor
public class DatasetFileResponse {
/** File ID. */
private String id;
/** File name. */
private String fileName;
/** Original file name. */
private String originalName;
/** File type. */
private String fileType;
/** File size in bytes. */
private Long fileSize;
/** File status. */
private String status;
/** File description. */
private String description;
/** File path. */
private String filePath;
/** Upload time. */
private LocalDateTime uploadTime;
/** Last access time (the original comment said "last update time"; TODO confirm which is meant). */
private LocalDateTime lastAccessTime;
/** Uploader. */
private String uploadedBy;
}

View File

@@ -0,0 +1,44 @@
package com.datamate.cleaning.domain.model;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import java.time.LocalDateTime;
/**
 * Dataset entity (aligned with database table t_dm_datasets).
 *
 * <p>Unknown JSON properties are ignored during deserialization.
 */
@Getter
@Setter
@NoArgsConstructor
@JsonIgnoreProperties(ignoreUnknown = true)
public class DatasetResponse {
/** Dataset ID. */
private String id;
/** Dataset name. */
private String name;
/** Dataset description. */
private String description;
/** Dataset type. */
private String datasetType;
/** Dataset status. */
private String status;
/** Data source. */
private String dataSource;
/** Target location. */
private String targetLocation;
/** Number of files. */
private Integer fileCount;
/** Total size in bytes. */
private Long totalSize;
/** Completion rate (0-100). */
private Float completionRate;
/** Creation time. */
private LocalDateTime createdAt;
/** Last update time. */
private LocalDateTime updatedAt;
/** Creator. */
private String createdBy;
}

View File

@@ -0,0 +1,23 @@
package com.datamate.cleaning.domain.model;
import lombok.Getter;
import lombok.Setter;
import java.util.List;
/**
 * Dataset type response DTO.
 */
@Getter
@Setter
public class DatasetTypeResponse {
/** Type code. */
private String code;
/** Type name. */
private String name;
/** Type description. */
private String description;
/** Supported file formats. */
private List<String> supportedFormats;
/** Icon. */
private String icon;
}

View File

@@ -0,0 +1,25 @@
package com.datamate.cleaning.domain.model;
import lombok.Getter;
/**
 * Executor back-ends a cleaning process can run on, each with the string value written
 * into the process description.
 */
@Getter
public enum ExecutorType {
    DATA_PLATFORM("data_platform"),
    DATA_JUICER_RAY("ray"),
    DATA_JUICER_DEFAULT("default");

    private final String value;

    ExecutorType(String value) {
        this.value = value;
    }

    /**
     * Resolves the constant whose string value equals {@code value}.
     *
     * @param value string form of the executor type
     * @return the matching constant
     * @throws IllegalArgumentException when no constant carries that value
     */
    public static ExecutorType fromValue(String value) {
        ExecutorType[] candidates = ExecutorType.values();
        for (int i = 0; i < candidates.length; i++) {
            ExecutorType candidate = candidates[i];
            if (candidate.value.equals(value)) {
                return candidate;
            }
        }
        throw new IllegalArgumentException("Unexpected value '" + value + "'");
    }
}

View File

@@ -0,0 +1,13 @@
package com.datamate.cleaning.domain.model;
import lombok.Getter;
import lombok.Setter;
/**
 * Persistence form of an operator instance, produced by
 * {@code OperatorInstanceConverter.operatorToDo}.
 */
@Getter
@Setter
public class OperatorInstancePo {
/** Operator id. */
private String id;
/** Overrides map serialized as a JSON string. */
private String overrides;
}

View File

@@ -0,0 +1,28 @@
package com.datamate.cleaning.domain.model;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import java.util.List;
/**
 * One page of dataset files as returned by the dataset service.
 */
@Getter
@Setter
@NoArgsConstructor
public class PagedDatasetFileResponse {
/** File entries of this page. */
private List<DatasetFileResponse> content;
/** Current page number. */
private Integer page;
/** Page size. */
private Integer size;
/** Total number of elements. */
private Integer totalElements;
/** Total number of pages. */
private Integer totalPages;
/** Whether this is the first page. */
private Boolean first;
/** Whether this is the last page. */
private Boolean last;
}

View File

@@ -0,0 +1,24 @@
package com.datamate.cleaning.domain.model;
import lombok.Getter;
import lombok.Setter;
import java.util.List;
import java.util.Map;
/**
 * Description of a cleaning run; CleaningTaskService.prepareTask serializes it (snake_case)
 * into the task's process.yaml.
 */
@Getter
@Setter
public class TaskProcess {
/** Id of the cleaning task this process belongs to. */
private String instanceId;
/** Id of the destination dataset. */
private String datasetId;
/** Path of the dataset.jsonl file listing the input files. */
private String datasetPath;
/** Directory the results are exported to. */
private String exportPath;
/** Executor type value (see ExecutorType). */
private String executorType;
/** Ordered operator chain; each element is a single-entry map of operator id to its overrides. */
private List<Map<String, Map<String, Object>>> process;
}

View File

@@ -0,0 +1,30 @@
package com.datamate.cleaning.domain.model;
import lombok.Getter;
import lombok.Setter;
import org.springframework.format.annotation.DateTimeFormat;
import java.time.LocalDateTime;
/**
 * Flat row combining a cleaning template with one of its operator instances.
 * CleaningTemplateService.getTemplates groups these rows by template id.
 */
@Getter
@Setter
public class TemplateWithInstance {
/** Template id. */
private String id;
/** Template name. */
private String name;
/** Template description. */
private String description;
/** Template creation time. */
@DateTimeFormat(iso = DateTimeFormat.ISO.DATE_TIME)
private LocalDateTime createdAt;
/** Template last update time. */
@DateTimeFormat(iso = DateTimeFormat.ISO.DATE_TIME)
private LocalDateTime updatedAt;
/** Operator id; rows with a blank value are skipped when the operator list is built. */
private String operatorId;
/** Position of the operator within the template; used as the sort key. */
private Integer opIndex;
/** Settings that replace the operator's own settings when not blank. */
private String settingsOverride;
}

View File

@@ -0,0 +1,19 @@
package com.datamate.cleaning.infrastructure.exception;
import com.datamate.common.infrastructure.exception.ErrorCode;
import lombok.AllArgsConstructor;
import lombok.Getter;
/**
 * Error codes of the cleaning module.
 */
@Getter
@AllArgsConstructor
public enum CleanErrorCode implements ErrorCode {
/**
 * Duplicate cleaning task name.
 */
DUPLICATE_TASK_NAME("clean.0001", "清洗任务名称重复"),
/**
 * Creating the dataset failed.
 */
CREATE_DATASET_FAILED("clean.0002", "创建数据集失败");
private final String code;
private final String message;
}

View File

@@ -0,0 +1,9 @@
package com.datamate.cleaning.infrastructure.persistence.mapper;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
/**
 * MyBatis mapper for cleaning results.
 */
@Mapper
public interface CleaningResultMapper {
/**
 * Deletes the cleaning results stored for the given instance id.
 *
 * @param instanceId id the results are keyed by (callers pass the task id)
 */
void deleteByInstanceId(@Param("instanceId") String instanceId);
}

View File

@@ -0,0 +1,21 @@
package com.datamate.cleaning.infrastructure.persistence.mapper;
import com.datamate.cleaning.interfaces.dto.CleaningTask;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import java.util.List;
/**
 * MyBatis mapper for cleaning task records.
 */
@Mapper
public interface CleaningTaskMapper {
/**
 * Finds tasks by optional status and keywords.
 *
 * @param status   status filter, may be null
 * @param keywords keyword filter, may be null
 * @param size     page size; CleaningTaskService.countTasks passes null to fetch every match
 * @param offset   row offset; null together with a null size
 * @return the matching tasks
 */
List<CleaningTask> findTasks(@Param("status") String status, @Param("keywords") String keywords,
@Param("size") Integer size, @Param("offset") Integer offset);
/** Loads a single task by id. */
CleaningTask findTaskById(@Param("taskId") String taskId);
/** Inserts a new task. */
void insertTask(CleaningTask task);
/**
 * Updates an existing task identified by its id.
 * NOTE(review): callers pass objects with only id/status/startedAt set, so this presumably
 * skips unset fields; confirm against the mapper XML.
 */
void updateTask(CleaningTask task);
/** Deletes the task with the given id. */
void deleteTask(@Param("taskId") String taskId);
}

View File

@@ -0,0 +1,25 @@
package com.datamate.cleaning.infrastructure.persistence.mapper;
import com.datamate.cleaning.domain.model.TemplateWithInstance;
import com.datamate.cleaning.interfaces.dto.CleaningTemplate;
import com.datamate.cleaning.interfaces.dto.OperatorResponse;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import java.util.List;
/**
 * MyBatis mapper for cleaning templates and the operators they reference.
 */
@Mapper
public interface CleaningTemplateMapper {
/**
 * Returns template rows joined with their operator instances, optionally filtered.
 *
 * @param keywords keyword filter, may be null
 * @return flat rows; a template may appear in several rows (the service groups them by id)
 */
List<TemplateWithInstance> findAllTemplates(@Param("keywords") String keywords);
/** Returns all operators. */
List<OperatorResponse> findAllOperators();
/** Loads a single template by id. */
CleaningTemplate findTemplateById(@Param("templateId") String templateId);
/** Inserts a new template. */
void insertTemplate(CleaningTemplate template);
/** Updates an existing template. */
void updateTemplate(CleaningTemplate template);
/** Deletes the template with the given id. */
void deleteTemplate(@Param("templateId") String templateId);
}

View File

@@ -0,0 +1,17 @@
package com.datamate.cleaning.infrastructure.persistence.mapper;
import com.datamate.cleaning.domain.model.OperatorInstancePo;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import java.util.List;
/**
 * MyBatis mapper for operator instances; the owning instance id is a task id or a template id.
 */
@Mapper
public interface OperatorInstanceMapper {
/**
 * Inserts the operator instances belonging to one owner.
 *
 * @param instanceId id of the owning task or template
 * @param instances  operator instances to store
 */
void insertInstance(@Param("instanceId") String instanceId,
@Param("instances") List<OperatorInstancePo> instances);
/**
 * Deletes all operator instances of the given owner.
 *
 * @param instanceId id of the owning task or template
 */
void deleteByInstanceId(@Param("instanceId") String instanceId);
}

View File

@@ -0,0 +1,59 @@
package com.datamate.cleaning.interfaces.api;
import com.datamate.cleaning.application.service.CleaningTaskService;
import com.datamate.cleaning.interfaces.dto.CleaningTask;
import com.datamate.cleaning.interfaces.dto.CreateCleaningTaskRequest;
import com.datamate.common.infrastructure.common.Response;
import com.datamate.common.interfaces.PagedResponse;
import lombok.RequiredArgsConstructor;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;
import java.util.List;
@RestController
@RequestMapping("/cleaning/tasks")
@RequiredArgsConstructor
public class CleaningTaskController {
private final CleaningTaskService cleaningTaskService;
@GetMapping
public ResponseEntity<Response<PagedResponse<CleaningTask>>> cleaningTasksGet(
@RequestParam("page") Integer page,
@RequestParam("size") Integer size, @RequestParam(value = "status", required = false) String status,
@RequestParam(value = "keywords", required = false) String keywords) {
List<CleaningTask> tasks = cleaningTaskService.getTasks(status, keywords, page, size);
int count = cleaningTaskService.countTasks(status, keywords);
int totalPages = (count + size + 1) / size;
return ResponseEntity.ok(Response.ok(PagedResponse.of(tasks, page, count, totalPages)));
}
@PostMapping
public ResponseEntity<Response<CleaningTask>> cleaningTasksPost(@RequestBody CreateCleaningTaskRequest request) {
return ResponseEntity.ok(Response.ok(cleaningTaskService.createTask(request)));
}
@PostMapping("/{taskId}/stop")
public ResponseEntity<Response<Object>> cleaningTasksStop(@PathVariable("taskId") String taskId) {
cleaningTaskService.stopTask(taskId);
return ResponseEntity.ok(Response.ok(null));
}
@PostMapping("/{taskId}/execute")
public ResponseEntity<Response<Object>> cleaningTasksStart(@PathVariable("taskId") String taskId) {
cleaningTaskService.executeTask(taskId);
return ResponseEntity.ok(Response.ok(null));
}
@GetMapping("/{taskId}")
public ResponseEntity<Response<CleaningTask>> cleaningTasksTaskIdGet(@PathVariable("taskId") String taskId) {
return ResponseEntity.ok(Response.ok(cleaningTaskService.getTask(taskId)));
}
@DeleteMapping("/{taskId}")
public ResponseEntity<Response<Object>> cleaningTasksTaskIdDelete(@PathVariable("taskId") String taskId) {
cleaningTaskService.deleteTask(taskId);
return ResponseEntity.ok(Response.ok(null));
}
}

View File

@@ -0,0 +1,74 @@
package com.datamate.cleaning.interfaces.api;
import com.datamate.cleaning.application.service.CleaningTemplateService;
import com.datamate.cleaning.interfaces.dto.CleaningTemplate;
import com.datamate.cleaning.interfaces.dto.CreateCleaningTemplateRequest;
import com.datamate.cleaning.interfaces.dto.UpdateCleaningTemplateRequest;
import com.datamate.common.infrastructure.common.Response;
import com.datamate.common.interfaces.PagedResponse;
import lombok.RequiredArgsConstructor;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.DeleteMapping;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.PutMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import java.util.Comparator;
import java.util.List;
/**
 * REST endpoints for cleaning templates under {@code /cleaning/templates}.
 */
@RestController
@RequestMapping("/cleaning/templates")
@RequiredArgsConstructor
public class CleaningTemplateController {
    private final CleaningTemplateService cleaningTemplateService;

    /**
     * Lists templates, newest first. Without {@code page} or {@code size} the full list is
     * returned; otherwise the list is paged in memory.
     *
     * @param page    optional zero-based page index
     * @param size    optional page size
     * @param keyword optional keyword filter
     * @return all templates, or the requested page plus total count and page count
     */
    @GetMapping
    public ResponseEntity<Response<PagedResponse<CleaningTemplate>>> cleaningTemplatesGet(
            @RequestParam(value = "page", required = false) Integer page,
            @RequestParam(value = "size", required = false) Integer size,
            @RequestParam(value = "keywords", required = false) String keyword) {
        List<CleaningTemplate> templates = cleaningTemplateService.getTemplates(keyword).stream()
                .sorted(Comparator.comparing(CleaningTemplate::getCreatedAt).reversed())
                .toList();
        if (page == null || size == null) {
            return ResponseEntity.ok(Response.ok(PagedResponse.of(templates)));
        }
        int count = templates.size();
        // Ceiling division; the former "+ size + 1" reported one page too many
        // (e.g. 10 items with size 10 gave 2 pages) and divided by zero for size 0.
        int totalPages = size > 0 ? (count + size - 1) / size : 0;
        List<CleaningTemplate> limitTemplates = templates.stream()
                .skip((long) page * size)
                .limit(size).toList();
        return ResponseEntity.ok(Response.ok(PagedResponse.of(limitTemplates, page, count, totalPages)));
    }

    /** Creates a template and returns it. */
    @PostMapping
    public ResponseEntity<Response<CleaningTemplate>> cleaningTemplatesPost(
            @RequestBody CreateCleaningTemplateRequest request) {
        return ResponseEntity.ok(Response.ok(cleaningTemplateService.createTemplate(request)));
    }

    /** Returns the given template. */
    @GetMapping("/{templateId}")
    public ResponseEntity<Response<CleaningTemplate>> cleaningTemplatesTemplateIdGet(
            @PathVariable("templateId") String templateId) {
        return ResponseEntity.ok(Response.ok(cleaningTemplateService.getTemplate(templateId)));
    }

    /** Updates the given template and returns the result. */
    @PutMapping("/{templateId}")
    public ResponseEntity<Response<CleaningTemplate>> cleaningTemplatesTemplateIdPut(
            @PathVariable("templateId") String templateId, @RequestBody UpdateCleaningTemplateRequest request) {
        return ResponseEntity.ok(Response.ok(cleaningTemplateService.updateTemplate(templateId, request)));
    }

    /** Deletes the given template; answers 204 No Content. */
    @DeleteMapping("/{templateId}")
    public ResponseEntity<Response<Object>> cleaningTemplatesTemplateIdDelete(
            @PathVariable("templateId") String templateId) {
        cleaningTemplateService.deleteTemplate(templateId);
        return ResponseEntity.noContent().build();
    }
}

View File

@@ -0,0 +1,20 @@
package com.datamate.cleaning.interfaces.dto;
import lombok.Getter;
import lombok.Setter;
/**
 * Progress information of a cleaning task (the type of {@code CleaningTask.progress}).
 */
@Getter
@Setter
public class CleaningProcess {
/** Progress value. NOTE(review): presumably a ratio or percentage; confirm the scale. */
private Float process;
/** Total number of files. */
private Integer totalFileNum;
/** Number of files already finished. */
private Integer finishedFileNum;
}

View File

@@ -0,0 +1,92 @@
package com.datamate.cleaning.interfaces.dto;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonValue;
import java.time.LocalDateTime;
import java.util.List;
import lombok.Getter;
import lombok.Setter;
import org.springframework.format.annotation.DateTimeFormat;
/**
 * A cleaning task: source and destination dataset, size before and after cleaning, current
 * status, the operators applied, and its lifecycle timestamps.
 */
@Getter
@Setter
public class CleaningTask {
    private String id;

    private String name;

    private String description;

    private String srcDatasetId;

    private String srcDatasetName;

    private String destDatasetId;

    private String destDatasetName;

    private long beforeSize;

    private long afterSize;

    /**
     * Current status of the task.
     */
    public enum StatusEnum {
        PENDING("PENDING"),
        RUNNING("RUNNING"),
        COMPLETED("COMPLETED"),
        STOPPED("STOPPED"),
        FAILED("FAILED");

        private final String value;

        StatusEnum(String value) {
            this.value = value;
        }

        /** Returns the string form used when the status is serialized to JSON. */
        @JsonValue
        public String getValue() {
            return value;
        }

        /**
         * Resolves a status from its string form.
         *
         * @throws IllegalArgumentException when no status carries that value
         */
        @JsonCreator
        public static StatusEnum fromValue(String value) {
            StatusEnum[] statuses = StatusEnum.values();
            int idx = 0;
            while (idx < statuses.length) {
                StatusEnum candidate = statuses[idx++];
                if (candidate.value.equals(value)) {
                    return candidate;
                }
            }
            throw new IllegalArgumentException("Unexpected value '" + value + "'");
        }
    }

    private StatusEnum status;

    private String templateId;

    private List<OperatorResponse> instance;

    private CleaningProcess progress;

    @DateTimeFormat(iso = DateTimeFormat.ISO.DATE_TIME)
    private LocalDateTime createdAt;

    @DateTimeFormat(iso = DateTimeFormat.ISO.DATE_TIME)
    private LocalDateTime startedAt;

    @DateTimeFormat(iso = DateTimeFormat.ISO.DATE_TIME)
    private LocalDateTime finishedAt;
}

View File

@@ -0,0 +1,33 @@
package com.datamate.cleaning.interfaces.dto;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.List;
import lombok.Getter;
import lombok.Setter;
import org.springframework.format.annotation.DateTimeFormat;
/**
 * A cleaning template: a named, ordered list of operators.
 */
@Getter
@Setter
public class CleaningTemplate {
/** Template id. */
private String id;
/** Template name. */
private String name;
/** Template description. */
private String description;
/** Operators of the template; defaults to an empty list. */
private List<OperatorResponse> instance = new ArrayList<>();
/** Creation time. */
@DateTimeFormat(iso = DateTimeFormat.ISO.DATE_TIME)
private LocalDateTime createdAt;
/** Last update time. */
@DateTimeFormat(iso = DateTimeFormat.ISO.DATE_TIME)
private LocalDateTime updatedAt;
}

View File

@@ -0,0 +1,32 @@
package com.datamate.cleaning.interfaces.dto;
import java.util.ArrayList;
import java.util.List;
import lombok.Getter;
import lombok.Setter;
/**
 * Request payload for creating a cleaning task.
 */
@Getter
@Setter
public class CreateCleaningTaskRequest {
/** Task name. */
private String name;
/** Task description. */
private String description;
/** Id of the source dataset. */
private String srcDatasetId;
/** Name of the source dataset. */
private String srcDatasetName;
/** Name of the destination dataset to create. */
private String destDatasetName;
/** Type of the destination dataset to create. */
private String destDatasetType;
/** Operator instances to apply; defaults to an empty list. */
private List<OperatorInstance> instance = new ArrayList<>();
}

View File

@@ -0,0 +1,23 @@
package com.datamate.cleaning.interfaces.dto;
import java.util.ArrayList;
import java.util.List;
import lombok.Getter;
import lombok.Setter;
/**
 * Request payload for creating a cleaning template.
 */
@Getter
@Setter
public class CreateCleaningTemplateRequest {
/** Template name. */
private String name;
/** Template description. */
private String description;
/** Operator instances of the template; defaults to an empty list. */
private List<OperatorInstance> instance = new ArrayList<>();
}

View File

@@ -0,0 +1,22 @@
package com.datamate.cleaning.interfaces.dto;
import java.util.HashMap;
import java.util.Map;
import lombok.Getter;
import lombok.Setter;
/**
 * An operator reference together with per-use overrides, as sent by API clients.
 */
@Getter
@Setter
public class OperatorInstance {
/** Operator id. */
private String id;
/** Override values for this use of the operator; defaults to an empty map. */
private Map<String, Object> overrides = new HashMap<>();
}

View File

@@ -0,0 +1,41 @@
package com.datamate.cleaning.interfaces.dto;
import java.time.LocalDateTime;
import lombok.Getter;
import lombok.Setter;
import org.springframework.format.annotation.DateTimeFormat;
/**
 * OperatorResponse
 *
 * Data transfer object describing an operator. Accessors are generated by
 * Lombok. The inputs/outputs/runtime/settings values are carried as plain
 * strings; their internal format is not defined in this class.
 */
@Getter
@Setter
public class OperatorResponse {
// Operator identifier.
private String id;
// Operator name.
private String name;
// Free-text description of the operator.
private String description;
// Operator version string.
private String version;
// Input descriptor, stored as a string.
private String inputs;
// Output descriptor, stored as a string.
private String outputs;
// Runtime descriptor, stored as a string.
private String runtime;
// Settings descriptor, stored as a string.
private String settings;
// Star flag; nullable because it is a boxed Boolean.
private Boolean isStar;
// Creation timestamp, formatted as ISO date-time by Spring's formatting support.
@DateTimeFormat(iso = DateTimeFormat.ISO.DATE_TIME)
private LocalDateTime createdAt;
// Last-update timestamp, formatted as ISO date-time by Spring's formatting support.
@DateTimeFormat(iso = DateTimeFormat.ISO.DATE_TIME)
private LocalDateTime updatedAt;
}

View File

@@ -0,0 +1,26 @@
package com.datamate.cleaning.interfaces.dto;
import java.util.ArrayList;
import java.util.List;
import lombok.Getter;
import lombok.Setter;
/**
 * UpdateCleaningTemplateRequest
 *
 * Request payload for updating an existing cleaning template: the template
 * id plus the new name, description and operator instances.
 * Accessors are generated by Lombok.
 */
@Getter
@Setter
public class UpdateCleaningTemplateRequest {
// Identifier of the template to update.
private String id;
// New template name.
private String name;
// New free-text description.
private String description;
// Operator instances for the template; starts as an empty list rather than null.
private List<OperatorInstance> instance = new ArrayList<>();
}

View File

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.datamate.cleaning.infrastructure.persistence.mapper.CleaningResultMapper">
<!-- Deletes every row of t_clean_result whose instance_id equals the bound instanceId parameter. -->
<delete id="deleteByInstanceId">
DELETE FROM t_clean_result WHERE instance_id = #{instanceId}
</delete>
</mapper>

View File

@@ -0,0 +1,56 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.datamate.cleaning.infrastructure.persistence.mapper.CleaningTaskMapper">
    <!-- Columns of t_clean_task shared by the task queries below. -->
    <sql id="Base_Column_List">
        id, name, description, src_dataset_id, src_dataset_name, dest_dataset_id, dest_dataset_name, before_size,
        after_size, status, created_at, started_at, finished_at
    </sql>

    <!--
        Lists tasks, newest first, optionally filtered by exact status and by a name substring.
        The keyword filter is guarded by the keywords value itself; it previously tested status,
        so an empty status disabled keyword search. Paging values are bound as prepared-statement
        parameters with #{} rather than spliced into the SQL text, which removes an injection
        vector. Both size and offset must be supplied for paging to apply.
    -->
    <select id="findTasks" resultType="com.datamate.cleaning.interfaces.dto.CleaningTask">
        SELECT <include refid="Base_Column_List"/> FROM t_clean_task
        <where>
            <if test="status != null and status != ''">
                AND status = #{status}
            </if>
            <if test="keywords != null and keywords != ''">
                AND name LIKE CONCAT('%', #{keywords}, '%')
            </if>
        </where>
        ORDER BY created_at DESC
        <if test="size != null and offset != null">
            LIMIT #{size} OFFSET #{offset}
        </if>
    </select>

    <!-- Loads a single task by primary key. -->
    <select id="findTaskById" resultType="com.datamate.cleaning.interfaces.dto.CleaningTask">
        SELECT <include refid="Base_Column_List"/> FROM t_clean_task WHERE id = #{taskId}
    </select>

    <!--
        Inserts a new task; created_at is set by the database clock.
        before_size and after_size are bound with #{} so that a null value becomes SQL NULL
        instead of producing malformed SQL through text substitution.
    -->
    <insert id="insertTask">
        INSERT INTO t_clean_task (id, name, description, status, src_dataset_id, src_dataset_name, dest_dataset_id,
                                  dest_dataset_name, before_size, after_size, created_at)
        VALUES (#{id}, #{name}, #{description}, #{status}, #{srcDatasetId}, #{srcDatasetName}, #{destDatasetId},
                #{destDatasetName}, #{beforeSize}, #{afterSize}, NOW())
    </insert>

    <!-- Updates only the non-null fields among status, started_at and finished_at for the given task id. -->
    <update id="updateTask">
        UPDATE t_clean_task
        <set>
            <if test="status != null">
                status = #{status.value},
            </if>
            <if test="startedAt != null">
                started_at = #{startedAt},
            </if>
            <if test="finishedAt != null">
                finished_at = #{finishedAt},
            </if>
        </set>
        WHERE id = #{id}
    </update>

    <!-- Deletes a task by primary key. -->
    <delete id="deleteTask">
        DELETE FROM t_clean_task WHERE id = #{taskId}
    </delete>
</mapper>

View File

@@ -0,0 +1,38 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.datamate.cleaning.infrastructure.persistence.mapper.CleaningTemplateMapper">
    <!--
        Lists templates joined with their operator instances (one row per template/operator pair,
        or a single row with null operator columns when a template has none), newest first.
        The optional name filter is guarded by the keywords value itself; it previously tested a
        status variable that is not a filter of this query.
    -->
    <select id="findAllTemplates" resultType="com.datamate.cleaning.domain.model.TemplateWithInstance">
        SELECT t.id AS id, name, description, created_at, updated_at, created_by, operator_id, op_index, settings_override
        FROM t_clean_template t LEFT JOIN t_operator_instance o ON t.id = o.instance_id
        <where>
            <if test="keywords != null and keywords != ''">
                AND name LIKE CONCAT('%', #{keywords}, '%')
            </if>
        </where>
        ORDER BY created_at DESC
    </select>

    <!-- Returns every operator in t_operator. -->
    <select id="findAllOperators" resultType="com.datamate.cleaning.interfaces.dto.OperatorResponse">
        SELECT id, name, description, version, inputs, outputs, runtime, settings, is_star, created_at, updated_at
        FROM t_operator
    </select>

    <!-- Loads a single template by primary key. -->
    <select id="findTemplateById" resultType="com.datamate.cleaning.interfaces.dto.CleaningTemplate">
        SELECT * FROM t_clean_template WHERE id = #{templateId}
    </select>

    <!-- Inserts a new template; created_at is set by the database clock. -->
    <insert id="insertTemplate">
        INSERT INTO t_clean_template (id, name, description, created_at)
        VALUES (#{id}, #{name}, #{description}, NOW())
    </insert>

    <!-- Overwrites name and description of a template and stamps updated_at with the database clock. -->
    <update id="updateTemplate">
        UPDATE t_clean_template SET name = #{name}, description = #{description}, updated_at = NOW() WHERE id = #{id}
    </update>

    <!-- Deletes a template by primary key. -->
    <delete id="deleteTemplate">
        DELETE FROM t_clean_template WHERE id = #{templateId}
    </delete>
</mapper>

View File

@@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.datamate.cleaning.infrastructure.persistence.mapper.OperatorInstanceMapper">
<!--
Batch-inserts one row per element of the instances collection, all sharing the same instanceId.
op_index is the foreach index plus one, i.e. 1-based when instances is a list.
NOTE(review): an empty instances collection would render VALUES with no tuples, which is invalid SQL;
presumably callers guard against that. Confirm.
NOTE(review): operator.overrides is bound directly; presumably a type handler converts it for the
settings_override column. Confirm.
-->
<insert id="insertInstance">
INSERT INTO t_operator_instance(instance_id, operator_id, op_index, settings_override)
VALUES
<foreach collection="instances" item="operator" separator="," index="index">
(#{instanceId}, #{operator.id}, #{index} + 1, #{operator.overrides})
</foreach>
</insert>
<!-- Deletes every operator-instance row whose instance_id equals the bound instanceId parameter. -->
<delete id="deleteByInstanceId">
DELETE FROM t_operator_instance
WHERE instance_id = #{instanceId};
</delete>
</mapper>

View File

@@ -0,0 +1,229 @@
# 数据归集服务 (Data Collection Service)
基于DataX的数据归集和同步服务,提供多数据源之间的数据同步功能。
## 功能特性
- 🔗 **多数据源支持**: 支持MySQL、PostgreSQL、Oracle、SQL Server等主流数据库
- 📊 **任务管理**: 创建、配置、执行和监控数据同步任务
- ⏰ **定时调度**: 支持Cron表达式的定时任务
- 📈 **实时监控**: 任务执行进度、状态和性能指标监控
- 📝 **执行日志**: 详细的任务执行日志记录
- 🔌 **插件化**: DataX Reader/Writer插件化集成
## 技术架构
- **框架**: Spring Boot 3.x
- **数据库**: MySQL + MyBatis
- **同步引擎**: DataX
- **API**: OpenAPI 3.0 自动生成
- **架构模式**: DDD (领域驱动设计)
## 项目结构
```
src/main/java/com/datamate/collection/
├── DataCollectionApplication.java # 应用启动类
├── domain/ # 领域层
│ ├── model/ # 领域模型
│ │ ├── DataSource.java # 数据源实体
│ │ ├── CollectionTask.java # 归集任务实体
│ │ ├── TaskExecution.java # 任务执行记录
│ │ └── ExecutionLog.java # 执行日志
│ └── service/ # 领域服务
│ ├── DataSourceService.java
│ ├── CollectionTaskService.java
│ ├── TaskExecutionService.java
│ └── impl/ # 服务实现
├── infrastructure/ # 基础设施层
│ ├── config/ # 配置类
│ ├── datax/ # DataX执行引擎
│ │ └── DataXExecutionEngine.java
│ └── persistence/ # 持久化
│ ├── mapper/ # MyBatis Mapper
│ └── typehandler/ # 类型处理器
└── interfaces/ # 接口层
├── api/ # OpenAPI生成的接口
├── dto/ # OpenAPI生成的DTO
└── rest/ # REST控制器
├── DataSourceController.java
├── CollectionTaskController.java
├── TaskExecutionController.java
└── exception/ # 异常处理
src/main/resources/
├── mappers/ # MyBatis XML映射文件
├── application.properties # 应用配置
└── ...
```
## 环境要求
- Java 17+
- Maven 3.6+
- MySQL 8.0+
- DataX 3.0+
- Redis (可选,用于缓存)
## 配置说明
### 应用配置 (application.properties)
```properties
# 服务端口
server.port=8090
# 数据库配置
spring.datasource.url=jdbc:mysql://localhost:3306/knowledge_base
spring.datasource.username=root
spring.datasource.password=123456
# DataX配置
datax.home=/runtime/datax
datax.python.path=/runtime/datax/bin/datax.py
datax.job.timeout=7200
datax.job.memory=2g
```
### DataX配置
确保DataX已正确安装并配置:
1. 下载DataX到 `/runtime/datax` 目录
2. 配置相关Reader/Writer插件
3. 确保Python环境可用
## 数据库初始化
执行数据库初始化脚本:
```bash
mysql -u root -p knowledge_base < scripts/db/data-collection-init.sql
```
## 构建和运行
### 1. 编译项目
```bash
cd backend/services/data-collection-service
mvn clean compile
```
这将触发OpenAPI代码生成。
### 2. 打包
```bash
mvn clean package -DskipTests
```
### 3. 运行
作为独立服务运行:
```bash
java -jar target/data-collection-service-1.0.0-SNAPSHOT.jar
```
或通过main-application统一启动:
```bash
cd backend/services/main-application
mvn spring-boot:run
```
## API文档
服务启动后,可通过以下地址访问API文档:
- Swagger UI: http://localhost:8090/swagger-ui.html
- OpenAPI JSON: http://localhost:8090/v3/api-docs
## 主要API端点
### 数据源管理
- `GET /api/v1/collection/datasources` - 获取数据源列表
- `POST /api/v1/collection/datasources` - 创建数据源
- `GET /api/v1/collection/datasources/{id}` - 获取数据源详情
- `PUT /api/v1/collection/datasources/{id}` - 更新数据源
- `DELETE /api/v1/collection/datasources/{id}` - 删除数据源
- `POST /api/v1/collection/datasources/{id}/test` - 测试连接
### 归集任务管理
- `GET /api/v1/collection/tasks` - 获取任务列表
- `POST /api/v1/collection/tasks` - 创建任务
- `GET /api/v1/collection/tasks/{id}` - 获取任务详情
- `PUT /api/v1/collection/tasks/{id}` - 更新任务
- `DELETE /api/v1/collection/tasks/{id}` - 删除任务
### 任务执行管理
- `POST /api/v1/collection/tasks/{id}/execute` - 执行任务
- `POST /api/v1/collection/tasks/{id}/stop` - 停止任务
- `GET /api/v1/collection/executions` - 获取执行历史
- `GET /api/v1/collection/executions/{executionId}` - 获取执行详情
- `GET /api/v1/collection/executions/{executionId}/logs` - 获取执行日志
### 监控统计
- `GET /api/v1/collection/monitor/statistics` - 获取统计信息
## 开发指南
### 添加新的数据源类型
1. 在 `DataSource.DataSourceType` 枚举中添加新类型
2. 在 `DataXExecutionEngine` 中添加对应的Reader/Writer映射
3. 更新数据库表结构和初始化数据
### 自定义DataX插件
1. 将插件放置在 `/runtime/datax/plugin` 目录下
2. 在 `DataXExecutionEngine` 中配置插件映射关系
3. 根据插件要求调整配置模板
### 扩展监控指标
1. 在 `StatisticsService` 中添加新的统计逻辑
2. 更新 `CollectionStatistics` DTO
3. 在数据库中添加相应的统计表或字段
## 故障排查
### 常见问题
1. **DataX执行失败**
- 检查DataX安装路径和Python环境
- 确认数据源连接配置正确
- 查看执行日志获取详细错误信息
2. **数据库连接失败**
- 检查数据库配置和网络连通性
- 确认数据库用户权限
3. **API调用失败**
- 检查请求参数格式
- 查看应用日志获取详细错误信息
### 日志查看
```bash
# 应用日志
tail -f logs/data-collection-service.log
# 任务执行日志
curl http://localhost:8090/api/v1/collection/executions/{executionId}/logs
```
## 贡献指南
1. Fork项目
2. 创建特性分支: `git checkout -b feature/new-feature`
3. 提交更改: `git commit -am 'Add new feature'`
4. 推送分支: `git push origin feature/new-feature`
5. 提交Pull Request
## 许可证
MIT License

Binary file not shown.

After

Width:  |  Height:  |  Size: 79 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 52 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 67 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 107 KiB

View File

@@ -0,0 +1,200 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<artifactId>data-collection-service</artifactId>
<packaging>jar</packaging>
<name>Data Collection Service</name>
<description>DataX-based data collection and aggregation service</description>
<dependencies>
<!-- Spring Boot Dependencies -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-validation</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-actuator</artifactId>
</dependency>
<!-- MyBatis Dependencies -->
<dependency>
<groupId>com.baomidou</groupId>
<artifactId>mybatis-plus-spring-boot3-starter</artifactId>
</dependency>
<!-- Database -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.33</version>
<scope>runtime</scope>
</dependency>
<!-- Redis -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>
<!-- DataX Dependencies (集成DataX插件) -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-exec</artifactId>
<version>1.3</version>
</dependency>
<!-- Connection Pool -->
<dependency>
<groupId>com.zaxxer</groupId>
<artifactId>HikariCP</artifactId>
</dependency>
<!-- Oracle JDBC Driver -->
<dependency>
<groupId>com.oracle.database.jdbc</groupId>
<artifactId>ojdbc8</artifactId>
<version>21.5.0.0</version>
</dependency>
<!-- PostgreSQL JDBC Driver -->
<dependency>
<groupId>org.postgresql</groupId>
<artifactId>postgresql</artifactId>
</dependency>
<!-- JSON Processing -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<!-- Shared Domain -->
<dependency>
<groupId>com.datamate</groupId>
<artifactId>domain-common</artifactId>
<version>1.0.0-SNAPSHOT</version>
</dependency>
<!-- OpenAPI Dependencies -->
<dependency>
<groupId>org.springdoc</groupId>
<artifactId>springdoc-openapi-starter-webmvc-ui</artifactId>
</dependency>
<dependency>
<groupId>org.openapitools</groupId>
<artifactId>jackson-databind-nullable</artifactId>
</dependency>
<dependency>
<groupId>jakarta.validation</groupId>
<artifactId>jakarta.validation-api</artifactId>
</dependency>
<!-- Lombok -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<!-- Test Dependencies -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.16.1</version>
<scope>compile</scope>
</dependency>
</dependencies>
<build>
<plugins>
<!-- OpenAPI Generator Plugin -->
<plugin>
<groupId>org.openapitools</groupId>
<artifactId>openapi-generator-maven-plugin</artifactId>
<version>6.6.0</version>
<executions>
<execution>
<goals>
<goal>generate</goal>
</goals>
<configuration>
<inputSpec>${project.basedir}/../../openapi/specs/data-collection.yaml</inputSpec>
<generatorName>spring</generatorName>
<output>${project.build.directory}/generated-sources/openapi</output>
<apiPackage>com.datamate.collection.interfaces.api</apiPackage>
<modelPackage>com.datamate.collection.interfaces.dto</modelPackage>
<configOptions>
<interfaceOnly>true</interfaceOnly>
<useTags>true</useTags>
<useSpringBoot3>true</useSpringBoot3>
<documentationProvider>springdoc</documentationProvider>
<dateLibrary>java8-localdatetime</dateLibrary>
<java8>true</java8>
</configOptions>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<skip>true</skip>
<classifier>exec</classifier>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.11.0</version>
<configuration>
<source>${maven.compiler.source}</source>
<target>${maven.compiler.target}</target>
<annotationProcessorPaths>
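<!-- Order matters: Lombok must come before mapstruct-processor so MapStruct sees the Lombok-generated accessors -->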
<path>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>${lombok.version}</version>
</path>
<path>
<groupId>org.projectlombok</groupId>
<artifactId>lombok-mapstruct-binding</artifactId>
<version>${lombok-mapstruct-binding.version}</version>
</path>
<path>
<groupId>org.mapstruct</groupId>
<artifactId>mapstruct-processor</artifactId>
<version>${mapstruct.version}</version>
</path>
</annotationProcessorPaths>
<compilerArgs>
<arg>-parameters</arg>
<arg>-Amapstruct.defaultComponentModel=spring</arg>
</compilerArgs>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,24 @@
package com.datamate.collection;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.scheduling.annotation.EnableAsync;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.transaction.annotation.EnableTransactionManagement;
/**
 * Configuration class of the data collection service.
 *
 * The service is a DataX-based data collection and synchronisation service
 * that supports collecting and aggregating data from multiple data sources.
 * This class enables async execution, scheduling and transaction management,
 * and component-scans the {@code com.datamate.collection} and
 * {@code com.datamate.shared} packages.
 */
@SpringBootApplication
@EnableAsync
@EnableScheduling
@EnableTransactionManagement
@ComponentScan(basePackages = {
"com.datamate.collection",
"com.datamate.shared"
})
public class DataCollectionServiceConfiguration {
// Configuration class for JAR packaging - no main method needed
}

View File

@@ -0,0 +1,66 @@
package com.datamate.collection.application.scheduler;
import com.datamate.collection.application.service.DataxExecutionService;
import com.datamate.collection.domain.model.CollectionTask;
import com.datamate.collection.domain.model.TaskStatus;
import com.datamate.collection.domain.model.TaskExecution;
import com.datamate.collection.infrastructure.persistence.mapper.CollectionTaskMapper;
import com.datamate.collection.infrastructure.persistence.mapper.TaskExecutionMapper;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.scheduling.support.CronExpression;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import java.time.LocalDateTime;
import java.util.List;
@Slf4j
@Component
@RequiredArgsConstructor
public class TaskSchedulerInitializer {
private final CollectionTaskMapper taskMapper;
private final TaskExecutionMapper executionMapper;
private final DataxExecutionService dataxExecutionService;
// 定期扫描激活的采集任务,根据 Cron 判断是否到期执行
@Scheduled(fixedDelayString = "${datamate.data-collection.scheduler.scan-interval-ms:10000}")
public void scanAndTrigger() {
List<CollectionTask> tasks = taskMapper.selectActiveTasks();
if (tasks == null || tasks.isEmpty()) {
return;
}
LocalDateTime now = LocalDateTime.now();
for (CollectionTask task : tasks) {
String cronExpr = task.getScheduleExpression();
if (!StringUtils.hasText(cronExpr)) {
continue;
}
try {
// 如果最近一次执行仍在运行,则跳过
TaskExecution latest = executionMapper.selectLatestByTaskId(task.getId());
if (latest != null && latest.getStatus() == TaskStatus.RUNNING) {
continue;
}
CronExpression cron = CronExpression.parse(cronExpr);
LocalDateTime base = latest != null && latest.getStartedAt() != null
? latest.getStartedAt()
: now.minusYears(1); // 没有历史记录时,拉长基准时间确保到期判定
LocalDateTime nextTime = cron.next(base);
if (nextTime != null && !nextTime.isAfter(now)) {
// 到期,触发一次执行
TaskExecution exec = dataxExecutionService.createExecution(task);
int timeout = task.getTimeoutSeconds() == null ? 3600 : task.getTimeoutSeconds();
dataxExecutionService.runAsync(task, exec.getId(), timeout);
log.info("Triggered DataX execution for task {} at {}, execId={}", task.getId(), now, exec.getId());
}
} catch (Exception ex) {
log.warn("Skip task {} due to invalid cron or scheduling error: {}", task.getId(), ex.getMessage());
}
}
}
}

View File

@@ -0,0 +1,85 @@
package com.datamate.collection.application.service;
import com.datamate.collection.domain.model.CollectionTask;
import com.datamate.collection.domain.model.TaskExecution;
import com.datamate.collection.domain.model.TaskStatus;
import com.datamate.collection.domain.model.DataxTemplate;
import com.datamate.collection.infrastructure.persistence.mapper.CollectionTaskMapper;
import com.datamate.collection.infrastructure.persistence.mapper.TaskExecutionMapper;
import com.datamate.collection.interfaces.dto.SyncMode;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.time.LocalDateTime;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
/**
 * Application service for collection tasks: create/update/delete, paged listing,
 * starting executions and listing DataX templates.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class CollectionTaskService {

    /** Timeout used when a task does not configure {@code timeoutSeconds}. */
    private static final int DEFAULT_TIMEOUT_SECONDS = 3600;

    private final CollectionTaskMapper taskMapper;
    private final TaskExecutionMapper executionMapper;
    private final DataxExecutionService dataxExecutionService;

    /**
     * Persists a new task in READY status and, for one-shot tasks, starts it immediately.
     *
     * @param task the task to create; its status and timestamps are overwritten
     * @return the same task instance after insertion
     */
    @Transactional
    public CollectionTask create(CollectionTask task) {
        // Use a single timestamp so createdAt and updatedAt are identical on creation.
        LocalDateTime now = LocalDateTime.now();
        task.setStatus(TaskStatus.READY);
        task.setCreatedAt(now);
        task.setUpdatedAt(now);
        taskMapper.insert(task);
        executeTaskNow(task);
        return task;
    }

    /** Starts an asynchronous DataX run when the task's sync mode is ONCE; otherwise does nothing. */
    private void executeTaskNow(CollectionTask task) {
        if (!Objects.equals(task.getSyncMode(), SyncMode.ONCE.getValue())) {
            return;
        }
        TaskExecution exec = dataxExecutionService.createExecution(task);
        int timeout = task.getTimeoutSeconds() == null ? DEFAULT_TIMEOUT_SECONDS : task.getTimeoutSeconds();
        dataxExecutionService.runAsync(task, exec.getId(), timeout);
        log.info("Triggered DataX execution for task {} at {}, execId={}", task.getId(), LocalDateTime.now(), exec.getId());
    }

    /**
     * Updates a task and refreshes its {@code updatedAt} timestamp.
     *
     * @param task the task carrying the new values
     * @return the same task instance
     */
    @Transactional
    public CollectionTask update(CollectionTask task) {
        task.setUpdatedAt(LocalDateTime.now());
        taskMapper.update(task);
        return task;
    }

    /** Deletes the task with the given id. */
    @Transactional
    public void delete(String id) {
        taskMapper.deleteById(id);
    }

    /** Returns the task with the given id, as returned by the mapper. */
    public CollectionTask get(String id) {
        return taskMapper.selectById(id);
    }

    /**
     * Lists tasks filtered by status and name. Paging is applied only when both
     * {@code page} and {@code size} are given; the offset is {@code page * size}.
     */
    public List<CollectionTask> list(Integer page, Integer size, String status, String name) {
        Map<String, Object> params = new HashMap<>();
        params.put("status", status);
        params.put("name", name);
        if (page != null && size != null) {
            params.put("offset", page * size);
            params.put("limit", size);
        }
        return taskMapper.selectAll(params);
    }

    /** Creates a new execution record for the task without running it. */
    @Transactional
    public TaskExecution startExecution(CollectionTask task) {
        return dataxExecutionService.createExecution(task);
    }

    // ---- Template related merged methods ----

    /** Lists DataX templates for the given source/target types; the offset is {@code page * size}. */
    public List<DataxTemplate> listTemplates(String sourceType, String targetType, int page, int size) {
        int offset = page * size;
        return taskMapper.selectList(sourceType, targetType, offset, size);
    }

    /** Counts DataX templates for the given source/target types. */
    public int countTemplates(String sourceType, String targetType) {
        return taskMapper.countTemplates(sourceType, targetType);
    }
}

View File

@@ -0,0 +1,60 @@
package com.datamate.collection.application.service;
import com.datamate.collection.domain.model.CollectionTask;
import com.datamate.collection.domain.model.TaskExecution;
import com.datamate.collection.domain.model.TaskStatus;
import com.datamate.collection.infrastructure.persistence.mapper.CollectionTaskMapper;
import com.datamate.collection.infrastructure.persistence.mapper.TaskExecutionMapper;
import com.datamate.collection.infrastructure.runtime.datax.DataxJobBuilder;
import com.datamate.collection.infrastructure.runtime.datax.DataxProcessRunner;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.nio.file.Path;
import java.time.Duration;
import java.time.LocalDateTime;
/**
 * Creates execution records for collection tasks and runs the corresponding DataX job
 * asynchronously, recording the final status on both the execution and the task.
 */
@Slf4j
@Service
@RequiredArgsConstructor
public class DataxExecutionService {
    private final DataxJobBuilder jobBuilder;
    private final DataxProcessRunner processRunner;
    private final TaskExecutionMapper executionMapper;
    private final CollectionTaskMapper taskMapper;

    /**
     * Inserts a new execution for the task, links it as the task's last execution and
     * marks the task as RUNNING.
     *
     * @param task the task to execute
     * @return the newly created execution record
     */
    @Transactional
    public TaskExecution createExecution(CollectionTask task) {
        TaskExecution exec = TaskExecution.initTaskExecution();
        exec.setTaskId(task.getId());
        exec.setTaskName(task.getName());
        executionMapper.insert(exec);
        taskMapper.updateLastExecution(task.getId(), exec.getId());
        taskMapper.updateStatus(task.getId(), TaskStatus.RUNNING.name());
        return exec;
    }

    /**
     * Builds the DataX job file for the task, runs it and records the outcome.
     * A zero exit code is recorded as SUCCESS; a non-zero exit code or any exception
     * is recorded as FAILED.
     *
     * @param task           the task to run
     * @param executionId    id of the execution record created for this run
     * @param timeoutSeconds maximum run time passed to the process runner
     */
    @Async
    public void runAsync(CollectionTask task, String executionId, int timeoutSeconds) {
        try {
            Path job = jobBuilder.buildJobFile(task);
            int code = processRunner.runJob(job.toFile(), executionId, Duration.ofSeconds(timeoutSeconds));
            log.info("DataX finished with code {} for execution {}", code, executionId);
            if (code == 0) {
                recordOutcome(task.getId(), executionId, TaskStatus.SUCCESS, null);
            } else {
                // A non-zero exit code means the job did not succeed, so it must not be recorded as SUCCESS.
                recordOutcome(task.getId(), executionId, TaskStatus.FAILED, "DataX exited with code " + code);
            }
        } catch (Exception e) {
            log.error("DataX execution failed", e);
            recordOutcome(task.getId(), executionId, TaskStatus.FAILED, e.getMessage());
        }
    }

    /** Persists the final status on the execution (metrics are not collected yet, so zeros) and on the task. */
    private void recordOutcome(String taskId, String executionId, TaskStatus status, String errorMessage) {
        executionMapper.completeExecution(executionId, status.name(), LocalDateTime.now(),
                0, 0L, 0L, 0L, errorMessage, null);
        taskMapper.updateStatus(taskId, status.name());
    }
}

View File

@@ -0,0 +1,83 @@
package com.datamate.collection.application.service;
import com.datamate.collection.domain.model.CollectionTask;
import com.datamate.collection.domain.model.TaskExecution;
import com.datamate.collection.domain.model.TaskStatus;
import com.datamate.collection.infrastructure.persistence.mapper.CollectionTaskMapper;
import com.datamate.collection.infrastructure.persistence.mapper.TaskExecutionMapper;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.time.Duration;
import java.time.LocalDateTime;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Query and lifecycle operations (complete / stop) for {@link TaskExecution} records.
 */
@Service
@RequiredArgsConstructor
public class TaskExecutionService {
    private final TaskExecutionMapper executionMapper;
    private final CollectionTaskMapper taskMapper;

    /**
     * Lists executions matching the given filters. Paging is applied only when both
     * {@code page} and {@code size} are given; the offset is {@code page * size}.
     */
    public List<TaskExecution> list(String taskId, String status, LocalDateTime startDate,
                                    LocalDateTime endDate, Integer page, Integer size) {
        Map<String, Object> params = filterParams(taskId, status, startDate, endDate);
        if (page != null && size != null) {
            params.put("offset", page * size);
            params.put("limit", size);
        }
        return executionMapper.selectAll(params);
    }

    /** Counts executions matching the same filters as {@link #list}. */
    public long count(String taskId, String status, LocalDateTime startDate, LocalDateTime endDate) {
        return executionMapper.count(filterParams(taskId, status, startDate, endDate));
    }

    // --- Added convenience methods ---

    /** Returns the execution with the given id, as returned by the mapper. */
    public TaskExecution get(String id) {
        return executionMapper.selectById(id);
    }

    /** Returns the latest execution of the given task, as returned by the mapper. */
    public TaskExecution getLatestByTaskId(String taskId) {
        return executionMapper.selectLatestByTaskId(taskId);
    }

    /**
     * Marks an execution as SUCCESS or FAILED with its metrics and mirrors the status onto
     * the owning task. Does nothing when the execution does not exist.
     */
    @Transactional
    public void complete(String executionId, boolean success, long successCount, long failedCount,
                         long dataSizeBytes, String errorMessage, String resultJson) {
        LocalDateTime now = LocalDateTime.now();
        TaskExecution exec = executionMapper.selectById(executionId);
        if (exec == null) {
            return;
        }
        String finalStatus = success ? TaskStatus.SUCCESS.name() : TaskStatus.FAILED.name();
        executionMapper.completeExecution(executionId, finalStatus, now, elapsedSeconds(exec.getStartedAt(), now),
                successCount, failedCount, dataSizeBytes, errorMessage, resultJson);
        CollectionTask task = taskMapper.selectById(exec.getTaskId());
        if (task != null) {
            taskMapper.updateStatus(task.getId(), finalStatus);
        }
    }

    /**
     * Marks a RUNNING execution as STOPPED, keeping its recorded metrics, and mirrors the
     * status onto the owning task. Does nothing when the execution is missing or not running.
     */
    @Transactional
    public void stop(String executionId) {
        TaskExecution exec = executionMapper.selectById(executionId);
        if (exec == null || exec.getStatus() != TaskStatus.RUNNING) {
            return;
        }
        LocalDateTime now = LocalDateTime.now();
        // Reuse completeExecution to persist STOPPED status and timing info
        executionMapper.completeExecution(exec.getId(), TaskStatus.STOPPED.name(), now,
                elapsedSeconds(exec.getStartedAt(), now),
                exec.getRecordsSuccess(), exec.getRecordsFailed(), exec.getDataSizeBytes(), null, exec.getResult());
        taskMapper.updateStatus(exec.getTaskId(), TaskStatus.STOPPED.name());
    }

    /** Stops the latest execution of the given task, if there is one. */
    @Transactional
    public void stopLatestByTaskId(String taskId) {
        TaskExecution latest = executionMapper.selectLatestByTaskId(taskId);
        if (latest != null) {
            stop(latest.getId());
        }
    }

    /** Builds the filter map shared by {@link #list} and {@link #count}. */
    private static Map<String, Object> filterParams(String taskId, String status,
                                                    LocalDateTime startDate, LocalDateTime endDate) {
        Map<String, Object> params = new HashMap<>();
        params.put("taskId", taskId);
        params.put("status", status);
        params.put("startDate", startDate);
        params.put("endDate", endDate);
        return params;
    }

    /** Whole seconds between start and end; 0 when the start time was never recorded (avoids an NPE). */
    private static int elapsedSeconds(LocalDateTime startedAt, LocalDateTime end) {
        if (startedAt == null) {
            return 0;
        }
        return (int) Duration.between(startedAt, end).getSeconds();
    }
}

View File

@@ -0,0 +1,45 @@
package com.datamate.collection.domain.model;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.Data;
import java.time.LocalDateTime;
import java.util.Collections;
import java.util.Map;
/**
 * Collection task definition: metadata, scheduling options and the DataX JSON configuration.
 * Accessors, equals/hashCode and toString are generated by Lombok ({@code @Data}).
 */
@Data
public class CollectionTask {
    /** Shared mapper: ObjectMapper is thread-safe once configured and costly to create per call. */
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private String id;
    private String name;
    private String description;
    private String config; // DataX JSON configuration, containing source and target settings
    private TaskStatus status;
    private String syncMode; // ONCE / SCHEDULED
    private String scheduleExpression;
    private Integer retryCount;
    private Integer timeoutSeconds;
    private Long maxRecords;
    private String sortField;
    private String lastExecutionId;
    private LocalDateTime createdAt;
    private LocalDateTime updatedAt;
    private String createdBy;
    private String updatedBy;

    /**
     * Rewrites {@code config} so that it contains {@code destPath} set to
     * {@code /dataset/local/<id>} and {@code filePaths} set to a one-element list with that path.
     *
     * @throws IllegalStateException if {@code config} is not valid JSON or cannot be re-serialised
     */
    public void addPath() {
        try {
            Map<String, Object> parameter = OBJECT_MAPPER.readValue(
                    config,
                    new TypeReference<Map<String, Object>>() {}
            );
            String destPath = "/dataset/local/" + id;
            parameter.put("destPath", destPath);
            parameter.put("filePaths", Collections.singletonList(destPath));
            config = OBJECT_MAPPER.writeValueAsString(parameter);
        } catch (JsonProcessingException e) {
            // IllegalStateException is a RuntimeException, so existing callers are unaffected.
            throw new IllegalStateException("Failed to update paths in config of task " + id, e);
        }
    }
}

View File

@@ -0,0 +1,71 @@
package com.datamate.collection.domain.model;
import lombok.Data;
import lombok.EqualsAndHashCode;
import java.time.LocalDateTime;
// DataX template record: identity, source/target types, JSON content and audit fields.
// Accessors, equals/hashCode and toString are generated by Lombok.
@Data
@EqualsAndHashCode(callSuper = false)
public class DataxTemplate {
/**
 * Template ID (UUID)
 */
private String id;
/**
 * Template name
 */
private String name;
/**
 * Source data source type
 */
private String sourceType;
/**
 * Target data source type
 */
private String targetType;
/**
 * Template content (JSON format)
 */
private String templateContent;
/**
 * Template description
 */
private String description;
/**
 * Version number
 */
private String version;
/**
 * Whether this is a system template
 */
private Boolean isSystem;
/**
 * Creation time
 */
private LocalDateTime createdAt;
/**
 * Last update time
 */
private LocalDateTime updatedAt;
/**
 * Creator
 */
private String createdBy;
/**
 * Last updater
 */
private String updatedBy;
}

View File

@@ -0,0 +1,39 @@
package com.datamate.collection.domain.model;
import lombok.Data;
import java.time.LocalDateTime;
import java.util.UUID;
/**
 * One execution (run) of a collection task, with its status, progress metrics and timing.
 * Accessors, equals/hashCode and toString are generated by Lombok ({@code @Data}).
 */
@Data
public class TaskExecution {
    private String id;
    private String taskId;
    private String taskName;
    private TaskStatus status;
    private Double progress;
    private Long recordsTotal;
    private Long recordsProcessed;
    private Long recordsSuccess;
    private Long recordsFailed;
    private Double throughput;
    private Long dataSizeBytes;
    private LocalDateTime startedAt;
    private LocalDateTime completedAt;
    private Integer durationSeconds;
    private String errorMessage;
    private String dataxJobId;
    private String config;
    private String result;
    private LocalDateTime createdAt;

    /**
     * Creates a new execution with a random UUID id, RUNNING status, zero progress and
     * identical {@code startedAt}/{@code createdAt} timestamps.
     *
     * @return the initialised execution; task id and name are left for the caller to set
     */
    public static TaskExecution initTaskExecution() {
        // Read the clock once so startedAt and createdAt cannot drift apart.
        LocalDateTime now = LocalDateTime.now();
        TaskExecution exec = new TaskExecution();
        exec.setId(UUID.randomUUID().toString());
        exec.setStatus(TaskStatus.RUNNING);
        exec.setProgress(0.0);
        exec.setStartedAt(now);
        exec.setCreatedAt(now);
        return exec;
    }
}

View File

@@ -0,0 +1,21 @@
package com.datamate.collection.domain.model;
/**
 * Unified status enum shared by tasks and task executions.
 *
 * @author Data Mate Platform Team
 */
public enum TaskStatus {
/** Draft */
DRAFT,
/** Ready */
READY,
/** Running */
RUNNING,
/** Finished successfully (corresponds to the former COMPLETED) */
SUCCESS,
/** Failed */
FAILED,
/** Stopped */
STOPPED
}

View File

@@ -0,0 +1,47 @@
package com.datamate.collection.infrastructure.persistence.mapper;
import com.datamate.collection.domain.model.CollectionTask;
import com.datamate.collection.domain.model.DataxTemplate;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import java.util.List;
import java.util.Map;
// MyBatis mapper for collection tasks; it also exposes the DataX template list/count queries.
// The SQL behind each method lives in the mapper XML, which is not shown here.
@Mapper
public interface CollectionTaskMapper {
// Basic CRUD; the int results are whatever the underlying statement reports (presumably affected rows).
int insert(CollectionTask entity);
int update(CollectionTask entity);
int deleteById(@Param("id") String id);
CollectionTask selectById(@Param("id") String id);
CollectionTask selectByName(@Param("name") String name);
List<CollectionTask> selectByStatus(@Param("status") String status);
// Callers in this code base pass the keys "status", "name" and optionally "offset"/"limit".
List<CollectionTask> selectAll(Map<String, Object> params);
// status is passed as a string; callers use TaskStatus.name().
int updateStatus(@Param("id") String id, @Param("status") String status);
int updateLastExecution(@Param("id") String id, @Param("lastExecutionId") String lastExecutionId);
// NOTE(review): what counts as "active" is defined by the XML statement. Confirm there.
List<CollectionTask> selectActiveTasks();
/**
 * Queries the template list.
 *
 * @param sourceType source data source type (optional)
 * @param targetType target data source type (optional)
 * @param offset row offset
 * @param limit maximum number of rows
 * @return the template list
 */
List<DataxTemplate> selectList(@Param("sourceType") String sourceType,
@Param("targetType") String targetType,
@Param("offset") int offset,
@Param("limit") int limit);
/**
 * Counts templates.
 *
 * @param sourceType source data source type (optional)
 * @param targetType target data source type (optional)
 * @return total number of templates
 */
int countTemplates(@Param("sourceType") String sourceType,
@Param("targetType") String targetType);
}

View File

@@ -0,0 +1,38 @@
package com.datamate.collection.infrastructure.persistence.mapper;
import com.datamate.collection.domain.model.TaskExecution;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import java.time.LocalDateTime;
import java.util.List;
import java.util.Map;
/**
* MyBatis mapper for task execution records. The SQL statements are defined in the XML mapper
* whose namespace is this interface's fully qualified name.
*/
@Mapper
public interface TaskExecutionMapper {
/** Inserts one execution row. */
int insert(TaskExecution entity);
/**
* Updates status, progress, counters, completion data, error message and result of the row
* identified by {@code entity.id}. The XML statement leaves the task reference, start time,
* DataX job id, config and creation time untouched.
*/
int update(TaskExecution entity);
/** Deletes the execution row with the given id. */
int deleteById(@Param("id") String id);
/** Loads an execution by primary key. */
TaskExecution selectById(@Param("id") String id);
/**
* Lists executions of one task, most recently started first.
*
* @param limit maximum number of rows, or {@code null} for no limit
*/
List<TaskExecution> selectByTaskId(@Param("taskId") String taskId, @Param("limit") Integer limit);
/** Lists executions in the given status, most recently started first. */
List<TaskExecution> selectByStatus(@Param("status") String status);
/**
* Lists executions, most recently started first.
*
* @param params optional keys read by the XML statement: {@code taskId}, {@code status},
* {@code startDate} / {@code endDate} (inclusive bounds on {@code started_at}), and
* {@code offset} + {@code limit} (paging is applied only when both are present)
*/
List<TaskExecution> selectAll(Map<String, Object> params);
/** Counts executions using the same filter keys as {@link #selectAll} (without paging). */
long count(Map<String, Object> params);
/** Updates status, progress, processed-record count and throughput of one execution. */
int updateProgress(@Param("id") String id,
@Param("status") String status,
@Param("progress") Double progress,
@Param("recordsProcessed") Long recordsProcessed,
@Param("throughput") Double throughput);
/**
* Writes the final state of one execution. The XML statement always sets progress to 100.00,
* whatever {@code status} is passed.
*/
int completeExecution(@Param("id") String id,
@Param("status") String status,
@Param("completedAt") LocalDateTime completedAt,
@Param("durationSeconds") Integer durationSeconds,
@Param("recordsSuccess") Long recordsSuccess,
@Param("recordsFailed") Long recordsFailed,
@Param("dataSizeBytes") Long dataSizeBytes,
@Param("errorMessage") String errorMessage,
@Param("result") String result);
/** Lists executions whose status is RUNNING, oldest start time first. */
List<TaskExecution> selectRunningExecutions();
/** Returns the most recently started execution of the given task. */
TaskExecution selectLatestByTaskId(@Param("taskId") String taskId);
/** Deletes executions that started before {@code beforeDate}. */
int deleteOldExecutions(@Param("beforeDate") LocalDateTime beforeDate);
}

View File

@@ -0,0 +1,83 @@
package com.datamate.collection.infrastructure.runtime.datax;
import com.datamate.collection.domain.model.CollectionTask;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Assembles the DataX job JSON file for a collection task.
 *
 * <p>The task's {@code config} JSON is used as the {@code parameter} object of both an
 * {@code nfsreader} and an {@code nfswriter}; a task without usable config gets a minimal
 * empty job instead.
 */
@Slf4j
@Component
@RequiredArgsConstructor
public class DataxJobBuilder {
    /** Reused across calls; building an ObjectMapper per job is needlessly expensive. */
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private final DataxProperties props;

    /**
     * Writes the job definition for {@code task} into the configured job directory.
     *
     * <p>The JSON is built before the file is touched, so a config that fails to parse no
     * longer leaves an empty job file behind. A {@code null}, empty or whitespace-only config
     * yields the default job (previously a whitespace-only config was written verbatim,
     * producing an unparseable file).
     *
     * @param task task whose id names the file and whose config becomes the job parameters
     * @return path of the written file, {@code datax-job-<taskId>.json}
     * @throws IOException if the directory cannot be created or the file cannot be written
     * @throws RuntimeException if the task config is not a valid JSON object
     */
    public Path buildJobFile(CollectionTask task) throws IOException {
        Path jobDir = Paths.get(props.getJobConfigPath());
        Files.createDirectories(jobDir);
        Path path = jobDir.resolve(String.format("datax-job-%s.json", task.getId()));

        String json = StringUtils.isBlank(task.getConfig()) ? defaultJobJson() : getJobConfig(task);
        // NOTE(review): the task config may carry connection details; confirm it is acceptable
        // to log it at INFO level.
        log.info("Job config: {}", json);
        // Explicit UTF-8: the job file must not depend on the JVM's platform charset.
        Files.writeString(path, json, java.nio.charset.StandardCharsets.UTF_8);
        return path;
    }

    /**
     * Wraps the task config in the DataX job envelope
     * ({@code job.content[0].reader/writer} plus {@code job.setting.speed.channel}).
     */
    private String getJobConfig(CollectionTask task) {
        try {
            Map<String, Object> parameter = OBJECT_MAPPER.readValue(
                    task.getConfig(),
                    new TypeReference<>() {}
            );

            // Reader and writer intentionally share the same parameter object.
            Map<String, Object> reader = new HashMap<>();
            reader.put("name", "nfsreader");
            reader.put("parameter", parameter);

            Map<String, Object> writer = new HashMap<>();
            writer.put("name", "nfswriter");
            writer.put("parameter", parameter);

            Map<String, Object> content = new HashMap<>();
            content.put("reader", reader);
            content.put("writer", writer);

            Map<String, Object> channel = new HashMap<>();
            channel.put("channel", 2);
            Map<String, Object> setting = new HashMap<>();
            setting.put("speed", channel);

            Map<String, Object> job = new HashMap<>();
            job.put("content", List.of(content));
            job.put("setting", setting);

            Map<String, Object> jobConfig = new HashMap<>();
            jobConfig.put("job", job);
            return OBJECT_MAPPER.writeValueAsString(jobConfig);
        } catch (Exception e) {
            log.error("Failed to parse task config", e);
            throw new RuntimeException("Failed to parse task config", e);
        }
    }

    /** Minimal job with no content, used when the task carries no config of its own. */
    private String defaultJobJson() {
        return "{\n  \"job\": {\n    \"setting\": {\n      \"speed\": {\n        \"channel\": 1\n      }\n    },\n    \"content\": []\n  }\n}";
    }
}

View File

@@ -0,0 +1,46 @@
package com.datamate.collection.infrastructure.runtime.datax;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.exec.*;
import org.springframework.stereotype.Component;
import java.io.File;
import java.time.Duration;
/**
 * Launches DataX ({@code <python> <datax home>/bin/datax.py <job file>}) as an external process
 * and mirrors its output into a per-execution log file.
 */
@Slf4j
@Component
@RequiredArgsConstructor
public class DataxProcessRunner {
    private final DataxProperties props;

    /**
     * Runs one DataX job and blocks until the process ends or the watchdog kills it.
     *
     * @param jobFile job definition passed to {@code datax.py}
     * @param executionId used to name the log file, {@code datax-<executionId>.log}
     * @param timeout maximum run time enforced by the watchdog
     * @return the process exit value
     * @throws Exception if the log file cannot be opened or the executor reports a failure
     */
    public int runJob(File jobFile, String executionId, Duration timeout) throws Exception {
        File logFile = new File(props.getLogPath(), String.format("datax-%s.log", executionId));
        String dataxPy = props.getHomePath() + File.separator + "bin" + File.separator + "datax.py";

        // Only the interpreter setting is parsed (so it may still carry flags); the two paths
        // are added as discrete arguments so that spaces in them cannot split the command.
        CommandLine cl = CommandLine.parse(props.getPythonPath());
        cl.addArgument(dataxPy, false);
        cl.addArgument(jobFile.getAbsolutePath(), false);
        log.info("Execute DataX: {}", String.join(" ", cl.toStrings()));

        // getParentFile() is null when the log path has no directory component.
        File parent = logFile.getParentFile();
        if (parent != null) {
            java.nio.file.Files.createDirectories(parent.toPath());
        }

        DefaultExecutor executor = new DefaultExecutor();
        executor.setWatchdog(new ExecuteWatchdog(timeout.toMillis()));

        // Append process output to the log file. Only the file streams are closed here: closing
        // the tee streams would also close System.out / System.err.
        try (java.io.OutputStream fileOut = new java.io.FileOutputStream(logFile, true);
             java.io.OutputStream fileErr = new java.io.FileOutputStream(logFile, true)) {
            ExecuteStreamHandler streamHandler = new PumpStreamHandler(
                    new org.apache.commons.io.output.TeeOutputStream(fileOut, System.out),
                    new org.apache.commons.io.output.TeeOutputStream(fileErr, System.err)
            );
            executor.setStreamHandler(streamHandler);
            return executor.execute(cl);
        }
    }
}

View File

@@ -0,0 +1,17 @@
package com.datamate.collection.infrastructure.runtime.datax;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
/**
* DataX runtime settings bound from the {@code datamate.data-collection.datax} configuration
* prefix.
*/
@Data
@Configuration
@ConfigurationProperties(prefix = "datamate.data-collection.datax")
public class DataxProperties {
private String homePath; // DataX installation directory (DATAX_HOME); bin/datax.py is resolved under it
private String pythonPath; // Python executable used to launch datax.py
private String jobConfigPath; // directory that receives the generated job files
private String logPath; // directory that receives the run logs
// NOTE(review): unit is not stated anywhere in this file, presumably MB; confirm.
private Integer maxMemory = 2048;
// NOTE(review): DataxJobBuilder hard-codes the channel value and does not read this setting.
private Integer channelCount = 5;
}

View File

@@ -0,0 +1,52 @@
package com.datamate.collection.interfaces.converter;
import com.datamate.collection.domain.model.CollectionTask;
import com.datamate.collection.domain.model.DataxTemplate;
import com.datamate.collection.interfaces.dto.*;
import com.datamate.common.infrastructure.exception.BusinessException;
import com.datamate.common.infrastructure.exception.SystemErrorCode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.mapstruct.Mapper;
import org.mapstruct.Mapping;
import org.mapstruct.Named;
import org.mapstruct.factory.Mappers;
import java.util.Map;
@Mapper
public interface CollectionTaskConverter {
CollectionTaskConverter INSTANCE = Mappers.getMapper(CollectionTaskConverter.class);
@Mapping(source = "config", target = "config", qualifiedByName = "parseJsonToMap")
CollectionTaskResponse toResponse(CollectionTask task);
CollectionTaskSummary toSummary(CollectionTask task);
DataxTemplateSummary toTemplateSummary(DataxTemplate template);
@Mapping(source = "config", target = "config", qualifiedByName = "mapToJsonString")
CollectionTask toCollectionTask(CreateCollectionTaskRequest request);
@Mapping(source = "config", target = "config", qualifiedByName = "mapToJsonString")
CollectionTask toCollectionTask(UpdateCollectionTaskRequest request);
@Named("parseJsonToMap")
default Map<String, Object> parseJsonToMap(String json) {
try {
ObjectMapper objectMapper = new ObjectMapper();
return objectMapper.readValue(json, Map.class);
} catch (Exception e) {
throw BusinessException.of(SystemErrorCode.INVALID_PARAMETER);
}
}
@Named("mapToJsonString")
default String mapToJsonString(Map<String, Object> map) {
try {
ObjectMapper objectMapper = new ObjectMapper();
return objectMapper.writeValueAsString(map != null ? map : Map.of());
} catch (Exception e) {
throw BusinessException.of(SystemErrorCode.INVALID_PARAMETER);
}
}
}

View File

@@ -0,0 +1,83 @@
package com.datamate.collection.interfaces.rest;
import com.datamate.collection.application.service.CollectionTaskService;
import com.datamate.collection.domain.model.CollectionTask;
import com.datamate.collection.domain.model.DataxTemplate;
import com.datamate.collection.interfaces.api.CollectionTaskApi;
import com.datamate.collection.interfaces.converter.CollectionTaskConverter;
import com.datamate.collection.interfaces.dto.*;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.http.ResponseEntity;
import org.springframework.validation.annotation.Validated;
import org.springframework.web.bind.annotation.RestController;
import java.util.*;
import java.util.stream.Collectors;
@Slf4j
@RestController
@RequiredArgsConstructor
@Validated
public class CollectionTaskController implements CollectionTaskApi {
private final CollectionTaskService taskService;
@Override
public ResponseEntity<CollectionTaskResponse> createTask(CreateCollectionTaskRequest request) {
CollectionTask task = CollectionTaskConverter.INSTANCE.toCollectionTask(request);
task.setId(UUID.randomUUID().toString());
task.addPath();
return ResponseEntity.ok().body(CollectionTaskConverter.INSTANCE.toResponse(taskService.create(task)));
}
@Override
public ResponseEntity<CollectionTaskResponse> updateTask(String id, UpdateCollectionTaskRequest request) {
if (taskService.get(id) == null) {
return ResponseEntity.notFound().build();
}
CollectionTask task = CollectionTaskConverter.INSTANCE.toCollectionTask(request);
task.setId(id);
return ResponseEntity.ok(CollectionTaskConverter.INSTANCE.toResponse(taskService.update(task)));
}
@Override
public ResponseEntity<Void> deleteTask(String id) {
taskService.delete(id);
return ResponseEntity.ok().build();
}
@Override
public ResponseEntity<CollectionTaskResponse> getTaskDetail(String id) {
CollectionTask task = taskService.get(id);
return task == null ? ResponseEntity.notFound().build() : ResponseEntity.ok(CollectionTaskConverter.INSTANCE.toResponse(task));
}
@Override
public ResponseEntity<PagedCollectionTaskSummary> getTasks(Integer page, Integer size, TaskStatus status, String name) {
var list = taskService.list(page, size, status == null ? null : status.getValue(), name);
PagedCollectionTaskSummary response = new PagedCollectionTaskSummary();
response.setContent(list.stream().map(CollectionTaskConverter.INSTANCE::toSummary).collect(Collectors.toList()));
response.setNumber(page);
response.setSize(size);
response.setTotalElements(list.size()); // 简化处理,实际项目中应该有单独的count查询
response.setTotalPages(size == null || size == 0 ? 1 : (int) Math.ceil(list.size() * 1.0 / size));
return ResponseEntity.ok(response);
}
@Override
public ResponseEntity<PagedDataxTemplates> templatesGet(String sourceType, String targetType,
Integer page, Integer size) {
int pageNum = page != null ? page : 0;
int pageSize = size != null ? size : 20;
List<DataxTemplate> templates = taskService.listTemplates(sourceType, targetType, pageNum, pageSize);
int totalElements = taskService.countTemplates(sourceType, targetType);
PagedDataxTemplates response = new PagedDataxTemplates();
response.setContent(templates.stream().map(CollectionTaskConverter.INSTANCE::toTemplateSummary).collect(Collectors.toList()));
response.setNumber(pageNum);
response.setSize(pageSize);
response.setTotalElements(totalElements);
response.setTotalPages(pageSize > 0 ? (int) Math.ceil(totalElements * 1.0 / pageSize) : 1);
return ResponseEntity.ok(response);
}
}

View File

@@ -0,0 +1,101 @@
package com.datamate.collection.interfaces.rest;
import com.datamate.collection.application.service.CollectionTaskService;
import com.datamate.collection.application.service.TaskExecutionService;
import com.datamate.collection.domain.model.TaskExecution;
import com.datamate.collection.interfaces.api.TaskExecutionApi;
import com.datamate.collection.interfaces.dto.PagedTaskExecutions;
import com.datamate.collection.interfaces.dto.TaskExecutionDetail;
import com.datamate.collection.interfaces.dto.TaskExecutionResponse;
import com.datamate.collection.interfaces.dto.TaskStatus; // DTO enum
import lombok.RequiredArgsConstructor;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.validation.annotation.Validated;
import org.springframework.web.bind.annotation.RestController;
import java.util.stream.Collectors;
/**
 * REST controller implementing {@link TaskExecutionApi}: starting, stopping, reading and
 * listing executions of collection tasks.
 */
@RestController
@RequiredArgsConstructor
@Validated
public class TaskExecutionController implements TaskExecutionApi {
    /** Page size used when the caller sends none or a non-positive one. */
    private static final int DEFAULT_PAGE_SIZE = 20;

    private final TaskExecutionService executionService;
    private final CollectionTaskService taskService;

    /**
     * Narrows a numeric value to {@code Integer}, saturating at the {@code int} bounds.
     * A plain {@code intValue()} wraps around, e.g. a byte count above 2 GiB turns negative.
     */
    private static Integer clampToInt(Number value) {
        if (value == null) {
            return null;
        }
        long v = value.longValue();
        return (int) Math.max(Integer.MIN_VALUE, Math.min(Integer.MAX_VALUE, v));
    }

    /** Converts an execution to the detail DTO; a {@code null} status is left unset. */
    private TaskExecutionDetail toDetail(TaskExecution e) {
        TaskExecutionDetail d = new TaskExecutionDetail();
        d.setId(e.getId());
        d.setTaskId(e.getTaskId());
        d.setTaskName(e.getTaskName());
        if (e.getStatus() != null) {
            d.setStatus(TaskStatus.fromValue(e.getStatus().name()));
        }
        d.setProgress(e.getProgress());
        d.setRecordsTotal(clampToInt(e.getRecordsTotal()));
        d.setRecordsProcessed(clampToInt(e.getRecordsProcessed()));
        d.setRecordsSuccess(clampToInt(e.getRecordsSuccess()));
        d.setRecordsFailed(clampToInt(e.getRecordsFailed()));
        d.setThroughput(e.getThroughput());
        d.setDataSizeBytes(clampToInt(e.getDataSizeBytes()));
        d.setStartedAt(e.getStartedAt());
        d.setCompletedAt(e.getCompletedAt());
        d.setDurationSeconds(e.getDurationSeconds());
        d.setErrorMessage(e.getErrorMessage());
        return d;
    }

    /** Builds the short response returned when an execution is started. */
    private static TaskExecutionResponse toResponse(TaskExecution exec) {
        TaskExecutionResponse r = new TaskExecutionResponse();
        r.setId(exec.getId());
        r.setTaskId(exec.getTaskId());
        r.setTaskName(exec.getTaskName());
        if (exec.getStatus() != null) {
            r.setStatus(TaskStatus.fromValue(exec.getStatus().name()));
        }
        r.setStartedAt(exec.getStartedAt());
        return r;
    }

    // GET /executions/{id}
    @Override
    public ResponseEntity<TaskExecutionDetail> executionsIdGet(String id) {
        var exec = executionService.get(id);
        return exec == null ? ResponseEntity.notFound().build() : ResponseEntity.ok(toDetail(exec));
    }

    // DELETE /executions/{id}
    @Override
    public ResponseEntity<Void> executionsIdDelete(String id) {
        executionService.stop(id); // idempotent; the service decides based on the current status
        return ResponseEntity.noContent().build();
    }

    // POST /tasks/{id}/execute -> 201
    @Override
    public ResponseEntity<TaskExecutionResponse> tasksIdExecutePost(String id) {
        var task = taskService.get(id);
        if (task == null) {
            return ResponseEntity.notFound().build();
        }
        var latestExec = executionService.getLatestByTaskId(id);
        if (latestExec != null
                && latestExec.getStatus() == com.datamate.collection.domain.model.TaskStatus.RUNNING) {
            // A run is already in progress: hand back that instance instead of starting another.
            return ResponseEntity.status(HttpStatus.CREATED).body(toResponse(latestExec));
        }
        var exec = taskService.startExecution(task);
        return ResponseEntity.status(HttpStatus.CREATED).body(toResponse(exec));
    }

    // GET /tasks/{id}/executions -> paged
    @Override
    public ResponseEntity<PagedTaskExecutions> tasksIdExecutionsGet(String id, Integer page, Integer size) {
        int pageNum = (page == null || page < 0) ? 0 : page;
        int pageSize = (size == null || size <= 0) ? DEFAULT_PAGE_SIZE : size;

        var list = executionService.list(id, null, null, null, pageNum, pageSize);
        long total = executionService.count(id, null, null, null);
        // pageSize is always positive here, so integer ceiling division is safe.
        long totalPages = (total + pageSize - 1) / pageSize;

        PagedTaskExecutions p = new PagedTaskExecutions();
        p.setContent(list.stream().map(this::toDetail).collect(Collectors.toList()));
        p.setNumber(pageNum);
        p.setSize(pageSize);
        p.setTotalElements(clampToInt(total));
        p.setTotalPages(clampToInt(totalPages));
        return ResponseEntity.ok(p);
    }
}

View File

@@ -0,0 +1,23 @@
# Data collection settings. Every value can be overridden through the environment variable
# named inside its ${...} placeholder; the part after the colon is the default.
datamate:
data-collection:
# DataX settings
datax:
# NOTE(review): the default is a Windows-style path; set DATAX_HOME on other platforms.
home-path: ${DATAX_HOME:D:/datax}
python-path: ${DATAX_PYTHON_PATH:python3}
job-config-path: ${DATAX_JOB_PATH:./data/temp/datax/jobs}
log-path: ${DATAX_LOG_PATH:./logs/datax}
max-memory: ${DATAX_MAX_MEMORY:2048}
channel-count: ${DATAX_CHANNEL_COUNT:5}
# Execution settings
execution:
max-concurrent-tasks: ${DATA_COLLECTION_MAX_CONCURRENT_TASKS:10}
task-timeout-minutes: ${DATA_COLLECTION_TASK_TIMEOUT:120}
retry-count: ${DATA_COLLECTION_RETRY_COUNT:3}
retry-interval-seconds: ${DATA_COLLECTION_RETRY_INTERVAL:30}
# Monitoring settings
monitoring:
status-check-interval-seconds: ${DATA_COLLECTION_STATUS_CHECK_INTERVAL:30}
log-retention-days: ${DATA_COLLECTION_LOG_RETENTION:30}
enable-metrics: ${DATA_COLLECTION_ENABLE_METRICS:true}

View File

@@ -0,0 +1,188 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.datamate.collection.infrastructure.persistence.mapper.CollectionTaskMapper">
<!-- Result map: t_dc_collection_tasks columns onto CollectionTask properties -->
<resultMap id="CollectionTaskResultMap" type="com.datamate.collection.domain.model.CollectionTask">
<id property="id" column="id"/>
<result property="name" column="name"/>
<result property="description" column="description"/>
<result property="config" column="config"/>
<result property="status" column="status" typeHandler="org.apache.ibatis.type.EnumTypeHandler"/>
<result property="syncMode" column="sync_mode"/>
<result property="scheduleExpression" column="schedule_expression"/>
<result property="retryCount" column="retry_count"/>
<result property="timeoutSeconds" column="timeout_seconds"/>
<result property="maxRecords" column="max_records"/>
<result property="sortField" column="sort_field"/>
<result property="lastExecutionId" column="last_execution_id"/>
<result property="createdAt" column="created_at"/>
<result property="updatedAt" column="updated_at"/>
<result property="createdBy" column="created_by"/>
<result property="updatedBy" column="updated_by"/>
</resultMap>
<!-- Result map (templates): t_dc_datax_templates columns onto DataxTemplate properties -->
<resultMap id="DataxTemplateResultMap" type="com.datamate.collection.domain.model.DataxTemplate">
<id column="id" property="id" jdbcType="VARCHAR"/>
<result column="name" property="name" jdbcType="VARCHAR"/>
<result column="source_type" property="sourceType" jdbcType="VARCHAR"/>
<result column="target_type" property="targetType" jdbcType="VARCHAR"/>
<result column="template_content" property="templateContent" jdbcType="VARCHAR"/>
<result column="description" property="description" jdbcType="VARCHAR"/>
<result column="version" property="version" jdbcType="VARCHAR"/>
<result column="is_system" property="isSystem" jdbcType="BOOLEAN"/>
<result column="created_at" property="createdAt" jdbcType="TIMESTAMP"/>
<result column="updated_at" property="updatedAt" jdbcType="TIMESTAMP"/>
<result column="created_by" property="createdBy" jdbcType="VARCHAR"/>
</resultMap>
<!-- Base Column List (tasks) -->
<sql id="Base_Column_List">
id,
name, description, config, status, sync_mode,
schedule_expression, retry_count, timeout_seconds, max_records, sort_field,
last_execution_id, created_at, updated_at, created_by, updated_by
</sql>
<!-- Template Column List -->
<sql id="Template_Column_List">
id, name, source_type, target_type, template_content, description, version, is_system, created_at, updated_at, created_by
</sql>
<!-- Insert a task row -->
<insert id="insert" parameterType="com.datamate.collection.domain.model.CollectionTask">
INSERT INTO t_dc_collection_tasks (id, name, description, config, status, sync_mode,
schedule_expression, retry_count, timeout_seconds, max_records, sort_field,
last_execution_id, created_at, updated_at, created_by, updated_by)
VALUES (#{id}, #{name}, #{description}, #{config}, #{status}, #{syncMode},
#{scheduleExpression}, #{retryCount}, #{timeoutSeconds}, #{maxRecords}, #{sortField},
#{lastExecutionId}, #{createdAt}, #{updatedAt}, #{createdBy}, #{updatedBy})
</insert>
<!-- Update: overwrites every editable column (created_at and created_by are left as they are) -->
<update id="update" parameterType="com.datamate.collection.domain.model.CollectionTask">
UPDATE t_dc_collection_tasks
SET name = #{name},
description = #{description},
config = #{config},
status = #{status},
sync_mode = #{syncMode},
schedule_expression = #{scheduleExpression},
retry_count = #{retryCount},
timeout_seconds = #{timeoutSeconds},
max_records = #{maxRecords},
sort_field = #{sortField},
last_execution_id = #{lastExecutionId},
updated_at = #{updatedAt},
updated_by = #{updatedBy}
WHERE id = #{id}
</update>
<!-- Delete by ID -->
<delete id="deleteById" parameterType="java.lang.String">
DELETE FROM t_dc_collection_tasks WHERE id = #{id}
</delete>
<!-- Select by ID -->
<select id="selectById" parameterType="java.lang.String" resultMap="CollectionTaskResultMap">
SELECT <include refid="Base_Column_List"/> FROM t_dc_collection_tasks WHERE id = #{id}
</select>
<!-- Select by Name (exact match) -->
<select id="selectByName" parameterType="java.lang.String" resultMap="CollectionTaskResultMap">
SELECT <include refid="Base_Column_List"/> FROM t_dc_collection_tasks WHERE name = #{name}
</select>
<!-- Select by Status, newest first -->
<select id="selectByStatus" parameterType="java.lang.String" resultMap="CollectionTaskResultMap">
SELECT <include refid="Base_Column_List"/> FROM t_dc_collection_tasks WHERE status = #{status} ORDER BY created_at DESC
</select>
<!-- Select All with optional status / name filters; paging applies only when both offset and limit are given -->
<select id="selectAll" resultMap="CollectionTaskResultMap">
SELECT <include refid="Base_Column_List"/> FROM t_dc_collection_tasks
<where>
<if test="status != null and status != ''">
AND status = #{status}
</if>
<if test="name != null and name != ''">
AND name LIKE CONCAT('%', #{name}, '%')
</if>
</where>
ORDER BY created_at DESC
<if test="offset != null and limit != null">
LIMIT #{offset}, #{limit}
</if>
</select>
<!-- Count Total.
NOTE(review): the CollectionTaskMapper interface declares no count method, and the
source_datasource_id / target_datasource_id columns filtered on below are absent from
Base_Column_List and from the insert statement. Confirm the columns exist before using this. -->
<select id="count" resultType="java.lang.Long">
SELECT COUNT(*) FROM t_dc_collection_tasks
<where>
<if test="status != null and status != ''">
AND status = #{status}
</if>
<if test="name != null and name != ''">
AND name LIKE CONCAT('%', #{name}, '%')
</if>
<if test="sourceDataSourceId != null and sourceDataSourceId != ''">
AND source_datasource_id = #{sourceDataSourceId}
</if>
<if test="targetDataSourceId != null and targetDataSourceId != ''">
AND target_datasource_id = #{targetDataSourceId}
</if>
</where>
</select>
<!-- Update Status (updated_at is taken from the database clock) -->
<update id="updateStatus">
UPDATE t_dc_collection_tasks SET status = #{status}, updated_at = NOW() WHERE id = #{id}
</update>
<!-- Update Last Execution (updated_at is taken from the database clock) -->
<update id="updateLastExecution">
UPDATE t_dc_collection_tasks SET last_execution_id = #{lastExecutionId}, updated_at = NOW() WHERE id = #{id}
</update>
<!-- Select Active Tasks for Scheduling: READY or RUNNING tasks that have a schedule expression -->
<select id="selectActiveTasks" resultMap="CollectionTaskResultMap">
SELECT <include refid="Base_Column_List"/> FROM t_dc_collection_tasks
WHERE status IN ('READY', 'RUNNING')
AND schedule_expression IS NOT NULL
ORDER BY created_at DESC
</select>
<!-- List templates: system templates first, then newest first; paging applies only when limit is positive -->
<select id="selectList" resultMap="DataxTemplateResultMap">
SELECT <include refid="Template_Column_List"/> FROM t_dc_datax_templates
<where>
<if test="sourceType != null and sourceType != ''">
AND source_type = #{sourceType}
</if>
<if test="targetType != null and targetType != ''">
AND target_type = #{targetType}
</if>
</where>
ORDER BY is_system DESC, created_at DESC
<if test="limit > 0">
LIMIT #{offset}, #{limit}
</if>
</select>
<!-- Count templates, using the same filters as selectList -->
<select id="countTemplates" resultType="java.lang.Integer">
SELECT COUNT(1) FROM t_dc_datax_templates
<where>
<if test="sourceType != null and sourceType != ''">
AND source_type = #{sourceType}
</if>
<if test="targetType != null and targetType != ''">
AND target_type = #{targetType}
</if>
</where>
</select>
</mapper>

View File

@@ -0,0 +1,191 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.datamate.collection.infrastructure.persistence.mapper.TaskExecutionMapper">
<!-- Result map: t_dc_task_executions columns onto TaskExecution properties -->
<resultMap id="TaskExecutionResultMap" type="com.datamate.collection.domain.model.TaskExecution">
<id property="id" column="id"/>
<result property="taskId" column="task_id"/>
<result property="taskName" column="task_name"/>
<result property="status" column="status" typeHandler="org.apache.ibatis.type.EnumTypeHandler"/>
<result property="progress" column="progress"/>
<result property="recordsTotal" column="records_total"/>
<result property="recordsProcessed" column="records_processed"/>
<result property="recordsSuccess" column="records_success"/>
<result property="recordsFailed" column="records_failed"/>
<result property="throughput" column="throughput"/>
<result property="dataSizeBytes" column="data_size_bytes"/>
<result property="startedAt" column="started_at"/>
<result property="completedAt" column="completed_at"/>
<result property="durationSeconds" column="duration_seconds"/>
<result property="errorMessage" column="error_message"/>
<result property="dataxJobId" column="datax_job_id"/>
<result property="config" column="config"/>
<result property="result" column="result"/>
<result property="createdAt" column="created_at"/>
</resultMap>
<!-- Base Column List -->
<sql id="Base_Column_List">
id, task_id, task_name, status, progress, records_total, records_processed,
records_success, records_failed, throughput, data_size_bytes, started_at,
completed_at, duration_seconds, error_message, datax_job_id, config, result, created_at
</sql>
<!-- Insert an execution row -->
<insert id="insert" parameterType="com.datamate.collection.domain.model.TaskExecution">
INSERT INTO t_dc_task_executions (
id, task_id, task_name, status, progress, records_total, records_processed,
records_success, records_failed, throughput, data_size_bytes, started_at,
completed_at, duration_seconds, error_message, datax_job_id, config, result, created_at
) VALUES (
#{id}, #{taskId}, #{taskName}, #{status}, #{progress}, #{recordsTotal}, #{recordsProcessed},
#{recordsSuccess}, #{recordsFailed}, #{throughput}, #{dataSizeBytes}, #{startedAt},
#{completedAt}, #{durationSeconds}, #{errorMessage}, #{dataxJobId}, #{config}, #{result}, #{createdAt}
)
</insert>
<!-- Update: rewrites status, progress, counters and completion data; task_id, task_name,
started_at, datax_job_id, config and created_at are not touched -->
<update id="update" parameterType="com.datamate.collection.domain.model.TaskExecution">
UPDATE t_dc_task_executions
SET status = #{status},
progress = #{progress},
records_total = #{recordsTotal},
records_processed = #{recordsProcessed},
records_success = #{recordsSuccess},
records_failed = #{recordsFailed},
throughput = #{throughput},
data_size_bytes = #{dataSizeBytes},
completed_at = #{completedAt},
duration_seconds = #{durationSeconds},
error_message = #{errorMessage},
result = #{result}
WHERE id = #{id}
</update>
<!-- Delete by ID -->
<delete id="deleteById" parameterType="java.lang.String">
DELETE FROM t_dc_task_executions WHERE id = #{id}
</delete>
<!-- Select by ID -->
<select id="selectById" parameterType="java.lang.String" resultMap="TaskExecutionResultMap">
SELECT <include refid="Base_Column_List"/>
FROM t_dc_task_executions
WHERE id = #{id}
</select>
<!-- Select by Task ID, most recently started first; limit is optional -->
<select id="selectByTaskId" resultMap="TaskExecutionResultMap">
SELECT <include refid="Base_Column_List"/>
FROM t_dc_task_executions
WHERE task_id = #{taskId}
ORDER BY started_at DESC
<if test="limit != null">
LIMIT #{limit}
</if>
</select>
<!-- Select by Status, most recently started first -->
<select id="selectByStatus" parameterType="java.lang.String" resultMap="TaskExecutionResultMap">
SELECT <include refid="Base_Column_List"/>
FROM t_dc_task_executions
WHERE status = #{status}
ORDER BY started_at DESC
</select>
<!-- Select All with optional task / status / start-time filters; paging applies only when
both offset and limit are given -->
<select id="selectAll" resultMap="TaskExecutionResultMap">
SELECT <include refid="Base_Column_List"/>
FROM t_dc_task_executions
<where>
<if test="taskId != null and taskId != ''">
AND task_id = #{taskId}
</if>
<if test="status != null and status != ''">
AND status = #{status}
</if>
<if test="startDate != null">
AND started_at >= #{startDate}
</if>
<if test="endDate != null">
AND started_at &lt;= #{endDate}
</if>
</where>
ORDER BY started_at DESC
<if test="offset != null and limit != null">
LIMIT #{offset}, #{limit}
</if>
</select>
<!-- Count Total, using the same filters as selectAll -->
<select id="count" resultType="java.lang.Long">
SELECT COUNT(*)
FROM t_dc_task_executions
<where>
<if test="taskId != null and taskId != ''">
AND task_id = #{taskId}
</if>
<if test="status != null and status != ''">
AND status = #{status}
</if>
<if test="startDate != null">
AND started_at >= #{startDate}
</if>
<if test="endDate != null">
AND started_at &lt;= #{endDate}
</if>
</where>
</select>
<!-- Update Status and Progress -->
<update id="updateProgress">
UPDATE t_dc_task_executions
SET status = #{status},
progress = #{progress},
records_processed = #{recordsProcessed},
throughput = #{throughput}
WHERE id = #{id}
</update>
<!-- Complete Execution.
NOTE(review): progress is set to 100.00 for every status passed in, including failures;
confirm that is intended. -->
<update id="completeExecution">
UPDATE t_dc_task_executions
SET status = #{status},
progress = 100.00,
completed_at = #{completedAt},
duration_seconds = #{durationSeconds},
records_success = #{recordsSuccess},
records_failed = #{recordsFailed},
data_size_bytes = #{dataSizeBytes},
error_message = #{errorMessage},
result = #{result}
WHERE id = #{id}
</update>
<!-- Select Running Executions, oldest start first -->
<select id="selectRunningExecutions" resultMap="TaskExecutionResultMap">
SELECT <include refid="Base_Column_List"/>
FROM t_dc_task_executions
WHERE status = 'RUNNING'
ORDER BY started_at ASC
</select>
<!-- Select Latest Execution by Task (highest started_at) -->
<select id="selectLatestByTaskId" parameterType="java.lang.String" resultMap="TaskExecutionResultMap">
SELECT <include refid="Base_Column_List"/>
FROM t_dc_task_executions
WHERE task_id = #{taskId}
ORDER BY started_at DESC
LIMIT 1
</select>
<!-- Delete Old Executions: everything started before the given date -->
<delete id="deleteOldExecutions">
DELETE FROM t_dc_task_executions
WHERE started_at &lt; #{beforeDate}
</delete>
</mapper>

View File

@@ -0,0 +1,92 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<artifactId>data-evaluation-service</artifactId>
<name>Data Evaluation Service</name>
<description>数据评估服务</description>
<dependencies>
<dependency>
<groupId>com.datamate</groupId>
<artifactId>domain-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>${mysql.version}</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-openfeign</artifactId>
</dependency>
<dependency>
<groupId>org.springdoc</groupId>
<artifactId>springdoc-openapi-starter-webmvc-ui</artifactId>
</dependency>
<dependency>
<groupId>org.openapitools</groupId>
<artifactId>jackson-databind-nullable</artifactId>
</dependency>
<dependency>
<groupId>jakarta.validation</groupId>
<artifactId>jakarta.validation-api</artifactId>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
<plugin>
<groupId>org.openapitools</groupId>
<artifactId>openapi-generator-maven-plugin</artifactId>
<version>6.6.0</version>
<executions>
<execution>
<goals>
<goal>generate</goal>
</goals>
<configuration>
<inputSpec>${project.basedir}/../../openapi/specs/data-evaluation.yaml</inputSpec>
<generatorName>spring</generatorName>
<output>${project.build.directory}/generated-sources/openapi</output>
<apiPackage>com.datamate.evaluation.interfaces.api</apiPackage>
<modelPackage>com.datamate.evaluation.interfaces.dto</modelPackage>
<configOptions>
<interfaceOnly>true</interfaceOnly>
<useTags>true</useTags>
<useSpringBoot3>true</useSpringBoot3>
<documentationProvider>springdoc</documentationProvider>
</configOptions>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,113 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>
<artifactId>data-management-service</artifactId>
<name>Data Management Service</name>
<description>数据管理服务</description>
<dependencies>
<dependency>
<groupId>com.datamate</groupId>
<artifactId>domain-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>com.baomidou</groupId>
<artifactId>mybatis-plus-spring-boot3-starter</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>${mysql.version}</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-starter-openfeign</artifactId>
</dependency>
<dependency>
<groupId>org.springdoc</groupId>
<artifactId>springdoc-openapi-starter-webmvc-ui</artifactId>
</dependency>
<dependency>
<groupId>org.openapitools</groupId>
<artifactId>jackson-databind-nullable</artifactId>
</dependency>
<dependency>
<groupId>jakarta.validation</groupId>
<artifactId>jakarta.validation-api</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.data</groupId>
<artifactId>spring-data-commons</artifactId>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
<configuration>
<!-- NOTE(review): "arguments" is normally a list of application arguments; the literal value "true" looks unintended - confirm. -->
<arguments>true</arguments>
<!-- NOTE(review): presumably the "exec" classifier keeps the plain jar as the main artifact so this module can still be used as a library dependency - confirm. -->
<classifier>exec</classifier>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.11.0</version>
<configuration>
<source>${maven.compiler.source}</source>
<target>${maven.compiler.target}</target>
<annotationProcessorPaths>
<!-- Order matters: Lombok first, then lombok-mapstruct-binding, then the MapStruct processor. -->
<path>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>${lombok.version}</version>
</path>
<path>
<groupId>org.projectlombok</groupId>
<artifactId>lombok-mapstruct-binding</artifactId>
<version>${lombok-mapstruct-binding.version}</version>
</path>
<path>
<groupId>org.mapstruct</groupId>
<artifactId>mapstruct-processor</artifactId>
<version>${mapstruct.version}</version>
</path>
</annotationProcessorPaths>
<compilerArgs>
<!-- Keep method parameter names in the class files. -->
<arg>-parameters</arg>
<!-- Generated MapStruct mappers use the "spring" component model by default. -->
<arg>-Amapstruct.defaultComponentModel=spring</arg>
</compilerArgs>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@@ -0,0 +1,22 @@
package com.datamate.datamanagement;
import org.springframework.cloud.openfeign.EnableFeignClients;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableAsync;
/**
 * Data Management Service Configuration.
 * <p>Configuration class of the data management service (multi-source ingestion, metadata, lineage governance).
 * It enables Feign clients located in {@code com.datamate.datamanagement.infrastructure.client}, enables
 * {@code @Async} processing and component-scans the {@code com.datamate.datamanagement} and
 * {@code com.datamate.shared} packages.
 */
@Configuration
@EnableFeignClients(basePackages = "com.datamate.datamanagement.infrastructure.client")
@EnableAsync
@ComponentScan(basePackages = {
"com.datamate.datamanagement",
"com.datamate.shared"
})
public class DataManagementServiceConfiguration {
// Service configuration class for JAR packaging:
// it lets this module be consumed as a jar and wired into a host application.
}

View File

@@ -0,0 +1,288 @@
package com.datamate.datamanagement.application;
import com.baomidou.mybatisplus.core.metadata.IPage;
import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
import com.datamate.datamanagement.interfaces.dto.*;
import com.datamate.common.infrastructure.exception.BusinessAssert;
import com.datamate.common.interfaces.PagedResponse;
import com.datamate.datamanagement.domain.model.dataset.Dataset;
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
import com.datamate.datamanagement.domain.model.dataset.Tag;
import com.datamate.datamanagement.infrastructure.client.CollectionTaskClient;
import com.datamate.datamanagement.infrastructure.client.dto.CollectionTaskDetailResponse;
import com.datamate.datamanagement.infrastructure.client.dto.LocalCollectionConfig;
import com.datamate.datamanagement.infrastructure.exception.DataManagementErrorCode;
import com.datamate.datamanagement.infrastructure.persistence.mapper.TagMapper;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
import com.datamate.datamanagement.interfaces.dto.*;
import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.StringUtils;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
* 数据集应用服务(对齐 DB schema,使用 UUID 字符串主键)
*/
@Slf4j
@Service
@Transactional
@RequiredArgsConstructor
public class DatasetApplicationService {
private final DatasetRepository datasetRepository;
private final TagMapper tagMapper;
private final DatasetFileRepository datasetFileRepository;
private final CollectionTaskClient collectionTaskClient;
private final FileMetadataService fileMetadataService;
private final ObjectMapper objectMapper;
@Value("${dataset.base.path:/dataset}")
private String datasetBasePath;
/**
 * Creates a new dataset.
 * <p>Rejects the request when a dataset with the same name already exists, lets
 * {@code Dataset.initCreateParam} assign the id, storage path and initial status, resolves the requested
 * tag names to {@link Tag} entities and persists the dataset. When the request carries a data source id,
 * the files of that collection task are scanned and registered afterwards.
 *
 * @param createDatasetRequest creation payload
 * @return the persisted dataset
 */
@Transactional
public Dataset createDataset(CreateDatasetRequest createDatasetRequest) {
    Dataset sameNameDataset = datasetRepository.findByName(createDatasetRequest.getName());
    BusinessAssert.isTrue(sameNameDataset == null, DataManagementErrorCode.DATASET_ALREADY_EXISTS);

    // Build the entity and assign id / path / status.
    Dataset dataset = DatasetConverter.INSTANCE.convertToDataset(createDatasetRequest);
    dataset.initCreateParam(datasetBasePath);

    // Resolve tags; a missing or empty tag list results in an empty set.
    List<String> requestedTagNames = createDatasetRequest.getTags();
    Set<Tag> resolvedTags = null;
    if (CollectionUtils.isNotEmpty(requestedTagNames)) {
        resolvedTags = processTagNames(requestedTagNames);
    }
    if (resolvedTags == null) {
        resolvedTags = new HashSet<>();
    }
    dataset.setTags(resolvedTags);
    datasetRepository.save(dataset);

    // TODO: decouple this logic from dataset creation.
    String dataSourceId = createDatasetRequest.getDataSource();
    if (StringUtils.hasText(dataSourceId)) {
        // A data source id is present: scan its files and register them.
        // NOTE(review): this is a self-invocation; with Spring's default proxy-based @Async the call
        // presumably runs synchronously on the caller thread - confirm.
        processDataSourceAsync(dataset.getId(), dataSourceId);
    }
    return dataset;
}
/**
 * Updates an existing dataset.
 * <p>Only the fields that are actually supplied in the request (non-blank name/description, non-empty tags,
 * non-null status) are applied. The dataset row is persisted <em>before</em> the optional data source scan
 * is triggered, mirroring {@link #createDataset}: the scan loads and updates the dataset itself, so writing
 * this (older) in-memory copy afterwards could overwrite the file statistics the scan has just stored.
 *
 * @param datasetId            id of the dataset to update
 * @param updateDatasetRequest fields to change
 * @return the updated dataset as held by this method (file statistics changed by a triggered scan are
 *         not reflected in the returned instance)
 */
public Dataset updateDataset(String datasetId, UpdateDatasetRequest updateDatasetRequest) {
    Dataset dataset = datasetRepository.getById(datasetId);
    BusinessAssert.notNull(dataset, DataManagementErrorCode.DATASET_NOT_FOUND);
    if (StringUtils.hasText(updateDatasetRequest.getName())) {
        dataset.setName(updateDatasetRequest.getName());
    }
    if (StringUtils.hasText(updateDatasetRequest.getDescription())) {
        dataset.setDescription(updateDatasetRequest.getDescription());
    }
    if (CollectionUtils.isNotEmpty(updateDatasetRequest.getTags())) {
        dataset.setTags(processTagNames(updateDatasetRequest.getTags()));
    }
    if (Objects.nonNull(updateDatasetRequest.getStatus())) {
        dataset.setStatus(updateDatasetRequest.getStatus());
    }
    // Persist the basic changes first so the scan below works on (and writes) the latest state.
    datasetRepository.updateById(dataset);
    if (StringUtils.hasText(updateDatasetRequest.getDataSource())) {
        // A data source id is present: scan its files and register them.
        processDataSourceAsync(dataset.getId(), updateDatasetRequest.getDataSource());
    }
    return dataset;
}
/**
 * Deletes a dataset by id.
 * <p>Delegates to {@code datasetRepository.removeById}; no existence check is performed here.
 *
 * @param datasetId id of the dataset to delete
 */
public void deleteDataset(String datasetId) {
datasetRepository.removeById(datasetId);
}
/**
 * Returns the details of a dataset.
 *
 * @param datasetId dataset id
 * @return the dataset loaded from the repository
 */
@Transactional(readOnly = true)
public Dataset getDataset(String datasetId) {
Dataset dataset = datasetRepository.getById(datasetId);
// Asserts that a dataset was found; the failure is reported with DATASET_NOT_FOUND.
BusinessAssert.notNull(dataset, DataManagementErrorCode.DATASET_NOT_FOUND);
return dataset;
}
/**
 * Returns one page of datasets matching the query.
 *
 * @param query paging parameters (page, size) and filter criteria
 * @return the converted records together with current page, total count and page count
 */
@Transactional(readOnly = true)
public PagedResponse<DatasetResponse> getDatasets(DatasetPagingQuery query) {
    IPage<Dataset> pageRequest = new Page<>(query.getPage(), query.getSize());
    IPage<Dataset> resultPage = datasetRepository.findByCriteria(pageRequest, query);
    return PagedResponse.of(
            DatasetConverter.INSTANCE.convertToResponse(resultPage.getRecords()),
            resultPage.getCurrent(),
            resultPage.getTotal(),
            resultPage.getPages());
}
/**
 * Resolves tag names to tag entities, creating tags that do not exist yet.
 * <p>Every processed name increases the usage count of its tag by one (a missing count is treated as
 * zero), and the new count is written back through the mapper.
 *
 * @param tagNames names of the tags to attach
 * @return the resolved tags
 */
private Set<Tag> processTagNames(List<String> tagNames) {
    Set<Tag> resolved = new HashSet<>();
    for (String name : tagNames) {
        Tag tag = tagMapper.findByName(name);
        if (tag == null) {
            // Unknown tag: create it with the default colour and a zero usage count.
            tag = new Tag(name, null, null, "#007bff");
            tag.setUsageCount(0L);
            tag.setId(UUID.randomUUID().toString());
            tagMapper.insert(tag);
        }
        Long previousCount = tag.getUsageCount();
        long updatedCount = previousCount == null ? 1L : previousCount + 1;
        tag.setUsageCount(updatedCount);
        tagMapper.updateUsageCount(tag.getId(), tag.getUsageCount());
        resolved.add(tag);
    }
    return resolved;
}
/**
 * Collects statistics for a single dataset.
 * <p>The returned map contains {@code totalFiles}, {@code completedFiles}, {@code totalSize},
 * {@code completionRate} (completed / total * 100, or 0 when there are no files),
 * {@code fileTypeDistribution} and {@code statusDistribution}. Files without a type or status are
 * counted under {@code "unknown"}.
 *
 * @param datasetId dataset id
 * @return map of statistic name to value
 * @throws IllegalArgumentException when the dataset does not exist
 */
@Transactional(readOnly = true)
public Map<String, Object> getDatasetStatistics(String datasetId) {
    if (datasetRepository.getById(datasetId) == null) {
        throw new IllegalArgumentException("Dataset not found: " + datasetId);
    }

    // Basic counters.
    Long totalFiles = datasetFileRepository.countByDatasetId(datasetId);
    Long completedFiles = datasetFileRepository.countCompletedByDatasetId(datasetId);
    Long totalSize = datasetFileRepository.sumSizeByDatasetId(datasetId);

    Map<String, Object> statistics = new HashMap<>();
    statistics.put("totalFiles", totalFiles == null ? 0 : totalFiles.intValue());
    statistics.put("completedFiles", completedFiles == null ? 0 : completedFiles.intValue());
    statistics.put("totalSize", totalSize == null ? 0L : totalSize);

    // Completion rate in percent.
    float completionRate = 0.0f;
    if (totalFiles != null && totalFiles > 0) {
        float completed = completedFiles == null ? 0.0f : completedFiles.floatValue();
        completionRate = completed / totalFiles.floatValue() * 100.0f;
    }
    statistics.put("completionRate", completionRate);

    // File type and status distributions, gathered in a single pass over the files.
    Map<String, Integer> fileTypeDistribution = new HashMap<>();
    Map<String, Integer> statusDistribution = new HashMap<>();
    List<DatasetFile> allFiles = datasetFileRepository.findAllByDatasetId(datasetId);
    if (allFiles != null) {
        for (DatasetFile file : allFiles) {
            String fileType = file.getFileType() == null ? "unknown" : file.getFileType();
            fileTypeDistribution.merge(fileType, 1, Integer::sum);
            String status = file.getStatus() == null ? "unknown" : file.getStatus();
            statusDistribution.merge(status, 1, Integer::sum);
        }
    }
    statistics.put("fileTypeDistribution", fileTypeDistribution);
    statistics.put("statusDistribution", statusDistribution);
    return statistics;
}
/**
 * Returns the aggregated statistics over all datasets.
 * <p>Delegates directly to {@code datasetRepository.getAllDatasetStatistics()}.
 *
 * @return summary statistics as produced by the repository
 */
public AllDatasetStatisticsResponse getAllDatasetStatistics() {
return datasetRepository.getAllDatasetStatistics();
}
/**
 * Scans the files of a data source (collection task) and registers them in the dataset.
 * <p>Steps: fetch the collection task, parse its configuration, scan the configured file paths, then
 * insert new files or refresh the size of files that are already registered under the same path, and
 * finally persist the updated dataset. Any failure is logged and swallowed so that the caller is never
 * affected.
 * <p>Robustness notes:
 * <ul>
 *   <li>already registered files are indexed by path with "first one wins", so duplicate paths in the
 *       table no longer abort the whole scan;</li>
 *   <li>a path that is scanned more than once in the same run is inserted only once;</li>
 *   <li>a missing dataset or a {@code null} file list is handled explicitly instead of ending in a
 *       {@link NullPointerException}.</li>
 * </ul>
 *
 * @param datasetId    dataset id
 * @param dataSourceId data source id (collection task id)
 */
@Async
public void processDataSourceAsync(String datasetId, String dataSourceId) {
    try {
        log.info("开始处理数据源文件扫描,数据集ID: {}, 数据源ID: {}", datasetId, dataSourceId);
        // 1. Fetch the task details from the data collection service.
        CollectionTaskDetailResponse taskDetail = collectionTaskClient.getTaskDetail(dataSourceId).getData();
        if (taskDetail == null) {
            log.error("获取归集任务详情失败,任务ID: {}", dataSourceId);
            return;
        }
        log.info("获取到归集任务详情: {}", taskDetail);
        // 2. Parse the task configuration.
        LocalCollectionConfig config = parseTaskConfig(taskDetail.getConfig());
        if (config == null) {
            log.error("解析任务配置失败,任务ID: {}", dataSourceId);
            return;
        }
        // 3. Read the list of file paths.
        List<String> filePaths = config.getFilePaths();
        if (CollectionUtils.isEmpty(filePaths)) {
            log.warn("文件路径列表为空,任务ID: {}", dataSourceId);
            return;
        }
        log.info("开始扫描文件,共 {} 个文件路径", filePaths.size());
        // 4. Scan the file metadata.
        List<DatasetFile> datasetFiles = fileMetadataService.scanFiles(filePaths, datasetId);
        // 5. Index the files that are already registered for this dataset by path.
        Map<String, DatasetFile> knownFilesByPath = new HashMap<>();
        List<DatasetFile> existDatasetFileList = datasetFileRepository.findAllByDatasetId(datasetId);
        if (existDatasetFileList != null) {
            for (DatasetFile existing : existDatasetFileList) {
                knownFilesByPath.putIfAbsent(existing.getFilePath(), existing);
            }
        }
        Dataset dataset = datasetRepository.getById(datasetId);
        if (dataset == null) {
            log.error("Dataset not found, skipping data source scan, datasetId: {}, dataSourceId: {}",
                    datasetId, dataSourceId);
            return;
        }
        // 6. Insert new files / refresh known ones.
        if (CollectionUtils.isNotEmpty(datasetFiles)) {
            for (DatasetFile scanned : datasetFiles) {
                DatasetFile known = knownFilesByPath.get(scanned.getFilePath());
                if (known != null) {
                    // Re-add the file so the dataset statistics reflect the new size.
                    dataset.removeFile(known);
                    known.setFileSize(scanned.getFileSize());
                    dataset.addFile(known);
                    datasetFileRepository.updateById(known);
                } else {
                    dataset.addFile(scanned);
                    datasetFileRepository.save(scanned);
                    // Remember the path so a repeated occurrence in this run is not inserted twice.
                    knownFilesByPath.put(scanned.getFilePath(), scanned);
                }
            }
            log.info("文件元数据写入完成,共写入 {} 条记录", datasetFiles.size());
        } else {
            log.warn("未扫描到有效文件");
        }
        datasetRepository.updateById(dataset);
    } catch (Exception e) {
        log.error("处理数据源文件扫描失败,数据集ID: {}, 数据源ID: {}", datasetId, dataSourceId, e);
    }
}
/**
 * Converts the raw task configuration map into a {@link LocalCollectionConfig}.
 *
 * @param configMap raw configuration of the collection task, may be {@code null}
 * @return the parsed configuration, or {@code null} when the map is missing, empty or cannot be converted
 */
private LocalCollectionConfig parseTaskConfig(Map<String, Object> configMap) {
    LocalCollectionConfig parsed = null;
    try {
        boolean hasConfig = configMap != null && !configMap.isEmpty();
        if (hasConfig) {
            parsed = objectMapper.convertValue(configMap, LocalCollectionConfig.class);
        }
    } catch (Exception e) {
        log.error("解析任务配置失败", e);
        parsed = null;
    }
    return parsed;
}
}

View File

@@ -0,0 +1,306 @@
package com.datamate.datamanagement.application;
import com.datamate.common.domain.model.ChunkUploadPreRequest;
import com.datamate.common.domain.model.FileUploadResult;
import com.datamate.common.domain.service.FileService;
import com.datamate.common.domain.utils.AnalyzerUtils;
import com.datamate.common.infrastructure.exception.BusinessException;
import com.datamate.common.infrastructure.exception.SystemErrorCode;
import com.datamate.datamanagement.domain.contants.DatasetConstant;
import com.datamate.datamanagement.domain.model.dataset.Dataset;
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckInfo;
import com.datamate.datamanagement.domain.model.dataset.StatusConstants;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import jakarta.servlet.http.HttpServletResponse;
import lombok.extern.slf4j.Slf4j;
import org.apache.ibatis.session.RowBounds;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
import org.springframework.core.io.UrlResource;
import org.springframework.data.domain.Page;
import org.springframework.data.domain.PageImpl;
import org.springframework.data.domain.Pageable;
import org.springframework.http.HttpHeaders;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.web.multipart.MultipartFile;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.List;
import java.util.Objects;
import java.util.UUID;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
/**
* 数据集文件应用服务
*/
@Slf4j
@Service
@Transactional
public class DatasetFileApplicationService {
private final DatasetFileRepository datasetFileRepository;
private final DatasetRepository datasetRepository;
private final Path fileStorageLocation;
private final FileService fileService;
@Value("${dataset.base.path:/dataset}")
private String datasetBasePath;
/**
 * Creates the service and makes sure the upload directory exists.
 *
 * @param datasetFileRepository repository for dataset files
 * @param datasetRepository     repository for datasets
 * @param fileService           shared (chunked) upload service
 * @param uploadDir             upload root directory, taken from {@code app.file.upload-dir}
 *                              (default {@code ./dataset})
 * @throws RuntimeException when the upload directory cannot be created
 */
@Autowired
public DatasetFileApplicationService(DatasetFileRepository datasetFileRepository,
        DatasetRepository datasetRepository, FileService fileService,
        @Value("${app.file.upload-dir:./dataset}") String uploadDir) {
    Path resolvedUploadDir = Paths.get(uploadDir).toAbsolutePath().normalize();
    this.datasetRepository = datasetRepository;
    this.datasetFileRepository = datasetFileRepository;
    this.fileService = fileService;
    this.fileStorageLocation = resolvedUploadDir;
    try {
        Files.createDirectories(resolvedUploadDir);
    } catch (Exception cause) {
        throw new RuntimeException("Could not create the directory where the uploaded files will be stored.", cause);
    }
}
/**
 * Uploads a single file into a dataset.
 * <p>The file is stored below {@code <upload-dir>/<datasetId>/}, a {@link DatasetFile} row is created
 * and the dataset statistics are updated.
 * <p>Compared with a naive copy this method
 * <ul>
 *   <li>creates the <em>parent</em> directory of the target (not a directory at the target path itself);</li>
 *   <li>rejects file names that would resolve outside the dataset directory (e.g. containing
 *       {@code ../}), because the name is supplied by the client;</li>
 *   <li>closes the upload stream after copying;</li>
 *   <li>falls back to the name {@code "file"} when the client sends no (or an empty) file name.</li>
 * </ul>
 *
 * @param datasetId id of the target dataset
 * @param file      uploaded file
 * @return the stored file record as re-read from the repository
 * @throws IllegalArgumentException when the dataset does not exist or the file name is not acceptable
 * @throws RuntimeException         when the file cannot be written
 */
public DatasetFile uploadFile(String datasetId, MultipartFile file) {
    Dataset dataset = datasetRepository.getById(datasetId);
    if (dataset == null) {
        throw new IllegalArgumentException("Dataset not found: " + datasetId);
    }
    String originalFilename = file.getOriginalFilename();
    String fileName = (originalFilename != null && !originalFilename.isEmpty()) ? originalFilename : "file";

    // Resolve the target and make sure it stays strictly inside the dataset directory.
    Path datasetDirectory = this.fileStorageLocation.resolve(datasetId).normalize();
    Path targetLocation = this.fileStorageLocation.resolve(datasetId + File.separator + fileName).normalize();
    if (!targetLocation.startsWith(datasetDirectory) || targetLocation.equals(datasetDirectory)) {
        throw new IllegalArgumentException("Invalid file name: " + fileName);
    }
    try {
        // Make sure the directory that will contain the file exists.
        Files.createDirectories(targetLocation.getParent());
        // Store the file on disk.
        try (InputStream uploadStream = file.getInputStream()) {
            Files.copy(uploadStream, targetLocation, StandardCopyOption.REPLACE_EXISTING);
        }
        // Create the file entity (UUID primary key).
        DatasetFile datasetFile = new DatasetFile();
        datasetFile.setId(UUID.randomUUID().toString());
        datasetFile.setDatasetId(datasetId);
        datasetFile.setFileName(fileName);
        datasetFile.setFilePath(targetLocation.toString());
        datasetFile.setFileType(getFileExtension(originalFilename));
        datasetFile.setFileSize(file.getSize());
        datasetFile.setUploadTime(LocalDateTime.now());
        datasetFile.setStatus(StatusConstants.DatasetFileStatuses.COMPLETED);
        // Persist the record.
        datasetFileRepository.save(datasetFile);
        // Update the dataset statistics.
        dataset.addFile(datasetFile);
        datasetRepository.updateById(dataset);
        return datasetFileRepository.findByDatasetIdAndFileName(datasetId, fileName);
    } catch (IOException ex) {
        log.error("Could not store file {}", fileName, ex);
        throw new RuntimeException("Could not store file " + fileName, ex);
    }
}
/**
 * Returns one page of the files of a dataset.
 * <p>NOTE(review): the reported total is derived from the current page only (rows on this page plus the
 * rows skipped before it), i.e. it is a lower bound and not the number of all matching rows - confirm
 * whether callers rely on it.
 *
 * @param datasetId dataset id
 * @param fileType  optional file type filter, passed through to the repository
 * @param status    optional status filter, passed through to the repository
 * @param pageable  requested page
 * @return the requested page of files
 */
@Transactional(readOnly = true)
public Page<DatasetFile> getDatasetFiles(String datasetId, String fileType,
        String status, Pageable pageable) {
    int pageSize = pageable.getPageSize();
    int pageNumber = pageable.getPageNumber();
    RowBounds bounds = new RowBounds(pageNumber * pageSize, pageSize);
    List<DatasetFile> content = datasetFileRepository.findByCriteria(datasetId, fileType, status, bounds);
    // On the first page the skipped-rows term is zero, so one formula covers every page.
    long total = content.size() + (long) pageNumber * pageSize;
    return new PageImpl<>(content, pageable, total);
}
/**
 * Returns the details of a file that belongs to the given dataset.
 * <p>The ownership check is null-safe: a file record without a dataset id (or a {@code null}
 * {@code datasetId} argument) is reported as "does not belong" instead of failing with a
 * {@link NullPointerException}.
 *
 * @param datasetId id of the dataset the file must belong to
 * @param fileId    file id
 * @return the file record
 * @throws IllegalArgumentException when the file does not exist or belongs to another dataset
 */
@Transactional(readOnly = true)
public DatasetFile getDatasetFile(String datasetId, String fileId) {
    DatasetFile file = datasetFileRepository.getById(fileId);
    if (file == null) {
        throw new IllegalArgumentException("File not found: " + fileId);
    }
    boolean belongsToDataset = datasetId != null && datasetId.equals(file.getDatasetId());
    if (!belongsToDataset) {
        throw new IllegalArgumentException("File does not belong to the specified dataset");
    }
    return file;
}
/**
 * Deletes a file from a dataset.
 * <p>Removing the file from disk is best effort: a failure is logged (it used to be silently ignored)
 * and the database record is removed regardless. Afterwards the dataset counters are decreased, never
 * below zero; missing counters are treated as zero and a dataset that can no longer be loaded is
 * skipped instead of causing a {@link NullPointerException}.
 *
 * @param datasetId id of the dataset the file belongs to
 * @param fileId    id of the file to delete
 * @throws IllegalArgumentException when the file does not exist or belongs to another dataset
 */
public void deleteDatasetFile(String datasetId, String fileId) {
    DatasetFile file = getDatasetFile(datasetId, fileId);
    String storedPath = file.getFilePath();
    if (storedPath != null) {
        try {
            Files.deleteIfExists(Paths.get(storedPath));
        } catch (IOException ex) {
            // Best effort: keep going so the record does not outlive a file we could not remove.
            log.warn("Failed to delete file from disk, fileId: {}, path: {}", fileId, storedPath, ex);
        }
    }
    datasetFileRepository.removeById(fileId);
    Dataset dataset = datasetRepository.getById(datasetId);
    if (dataset == null) {
        log.warn("Dataset not found while updating statistics after file deletion, datasetId: {}", datasetId);
        return;
    }
    // Simple refresh of the statistics (an exact value could be recomputed from the database).
    long removedBytes = file.getFileSize() != null ? file.getFileSize() : 0L;
    long currentCount = dataset.getFileCount() != null ? dataset.getFileCount() : 0L;
    long currentBytes = dataset.getSizeBytes() != null ? dataset.getSizeBytes() : 0L;
    dataset.setFileCount(Math.max(0L, currentCount - 1));
    dataset.setSizeBytes(Math.max(0L, currentBytes - removedBytes));
    datasetRepository.updateById(dataset);
}
/**
 * Returns a single dataset file as a downloadable resource.
 *
 * @param datasetId id of the dataset the file belongs to
 * @param fileId    file id
 * @return resource pointing at the stored file
 * @throws IllegalArgumentException when the file record does not exist or belongs to another dataset
 * @throws RuntimeException         when the file is missing on disk or its path cannot be turned into a URL
 */
@Transactional(readOnly = true)
public Resource downloadFile(String datasetId, String fileId) {
    DatasetFile file = getDatasetFile(datasetId, fileId);
    Resource resource;
    try {
        Path normalizedPath = Paths.get(file.getFilePath()).normalize();
        resource = new UrlResource(normalizedPath.toUri());
    } catch (MalformedURLException ex) {
        throw new RuntimeException("File not found: " + file.getFileName(), ex);
    }
    if (!resource.exists()) {
        throw new RuntimeException("File not found: " + file.getFileName());
    }
    return resource;
}
/**
 * Streams all files of a dataset to the HTTP response as one zip archive.
 * <p>The archive is named {@code dataset_<yyyyMMddHHmmss>.zip}. Files that are missing on disk are
 * skipped by {@code addToZipFile}.
 *
 * @param datasetId dataset id
 * @param response  HTTP response the archive is written to
 */
@Transactional(readOnly = true)
public void downloadDatasetFileAsZip(String datasetId, HttpServletResponse response) {
    List<DatasetFile> datasetFiles = datasetFileRepository.findAllByDatasetId(datasetId);
    String timestamp = LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMddHHmmss"));
    String zipName = String.format("dataset_%s.zip", timestamp);
    response.setContentType("application/zip");
    response.setHeader(HttpHeaders.CONTENT_DISPOSITION, "attachment; filename=" + zipName);
    try (ZipOutputStream zos = new ZipOutputStream(response.getOutputStream())) {
        for (DatasetFile datasetFile : datasetFiles) {
            addToZipFile(datasetFile, zos);
        }
    } catch (IOException e) {
        log.error("Failed to download files in batches.", e);
        throw BusinessException.of(SystemErrorCode.FILE_SYSTEM_ERROR);
    }
}
/**
 * Appends one dataset file to the zip stream, using the file name as entry name.
 * <p>A file without a path, or whose path does not exist on disk, is skipped with a warning.
 *
 * @param file dataset file to add
 * @param zos  target zip stream
 * @throws IOException when reading the file or writing the entry fails
 */
private void addToZipFile(DatasetFile file, ZipOutputStream zos) throws IOException {
    String storedPath = file.getFilePath();
    boolean missingOnDisk = storedPath == null || !Files.exists(Paths.get(storedPath));
    if (missingOnDisk) {
        log.warn("The file hasn't been found on filesystem, id: {}", file.getId());
        return;
    }
    try (InputStream source = new BufferedInputStream(Files.newInputStream(Paths.get(storedPath)))) {
        zos.putNextEntry(new ZipEntry(file.getFileName()));
        source.transferTo(zos);
        zos.closeEntry();
    }
}
/**
 * Returns the part of the file name after the last dot.
 *
 * @param fileName file name, may be {@code null}
 * @return the extension as written (case is preserved), or {@code null} when the name is {@code null},
 *         empty or contains no dot
 */
private String getFileExtension(String fileName) {
    int dotIndex = (fileName == null || fileName.isEmpty()) ? -1 : fileName.lastIndexOf(".");
    return dotIndex == -1 ? null : fileName.substring(dotIndex + 1);
}
/**
 * Registers an upcoming (chunked) upload.
 * <p>The dataset id and the archive flag are serialized to JSON and attached to the pre-upload request
 * as check info; the upload path is {@code <dataset base path>/<datasetId>}.
 *
 * @param chunkUploadRequest upload announcement (number of files, archive flag)
 * @param datasetId          dataset id
 * @return the request id returned by the file service
 * @throws IllegalArgumentException when the check info cannot be serialized
 */
@Transactional
public String preUpload(UploadFilesPreRequest chunkUploadRequest, String datasetId) {
    DatasetFileUploadCheckInfo checkInfo = new DatasetFileUploadCheckInfo();
    checkInfo.setDatasetId(datasetId);
    checkInfo.setHasArchive(chunkUploadRequest.isHasArchive());
    String checkInfoJson;
    try {
        checkInfoJson = new ObjectMapper().writeValueAsString(checkInfo);
    } catch (JsonProcessingException e) {
        throw new IllegalArgumentException("Failed to serialize checkInfo to JSON", e);
    }

    ChunkUploadPreRequest preRequest = ChunkUploadPreRequest.builder().build();
    preRequest.setUploadPath(datasetBasePath + File.separator + datasetId);
    preRequest.setTotalFileNum(chunkUploadRequest.getTotalFileNum());
    preRequest.setServiceId(DatasetConstant.SERVICE_ID);
    preRequest.setCheckInfo(checkInfoJson);
    return fileService.preUpload(preRequest);
}
/**
 * Uploads one chunk of a file.
 * <p>The chunk is handed to the file service; once the service reports a completely saved file, the
 * file is registered in the database by {@code saveFileInfoToDb}.
 *
 * @param datasetId         id of the target dataset
 * @param uploadFileRequest upload request describing the chunk
 */
@Transactional
public void chunkUpload(String datasetId, UploadFileRequest uploadFileRequest) {
FileUploadResult uploadResult = fileService.chunkUpload(DatasetConverter.INSTANCE.toChunkUploadRequest(uploadFileRequest));
saveFileInfoToDb(uploadResult, uploadFileRequest, datasetId);
if (uploadResult.isAllFilesUploaded()) {
// Placeholder: parse the files here; depending on future requirements, validation of file metadata
// and parsing of semi-structured files may be added.
}
}
/**
 * Registers a completely uploaded file in the database and updates the dataset statistics.
 * <p>Does nothing while the chunked upload of the file is still incomplete (no saved file yet).
 * <p>NOTE(review): unlike {@code uploadFile}, no status is set on the created record - confirm whether
 * that is intended.
 *
 * @param fileUploadResult result of the chunk upload
 * @param uploadFile       upload request the chunk belongs to
 * @param datasetId        id of the target dataset
 */
private void saveFileInfoToDb(FileUploadResult fileUploadResult, UploadFileRequest uploadFile, String datasetId) {
    File savedFile = fileUploadResult.getSavedFile();
    if (savedFile == null) {
        // The chunked upload of this file has not finished yet.
        return;
    }
    Dataset dataset = datasetRepository.getById(datasetId);
    LocalDateTime now = LocalDateTime.now();
    String uploadedName = uploadFile.getFileName();
    DatasetFile datasetFile = DatasetFile.builder()
            .id(UUID.randomUUID().toString())
            .datasetId(datasetId)
            .fileSize(savedFile.length())
            .uploadTime(now)
            .lastAccessTime(now)
            .fileName(uploadedName)
            .filePath(savedFile.getPath())
            .fileType(AnalyzerUtils.getExtension(uploadedName))
            .build();
    datasetFileRepository.save(datasetFile);
    dataset.addFile(datasetFile);
    datasetRepository.updateById(dataset);
}
}

View File

@@ -0,0 +1,127 @@
package com.datamate.datamanagement.application;
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.time.LocalDateTime;
import java.util.ArrayList;
import java.util.List;
import java.util.UUID;
/**
* 文件元数据扫描服务
*/
@Slf4j
@Service
public class FileMetadataService {
/**
 * Scans a list of paths and extracts the metadata of the files found.
 * <p>A path that points to a directory contributes the regular files directly inside it; a path that
 * points to a file contributes that file. Paths that do not exist are skipped, and a failure on one
 * path is logged without aborting the scan of the remaining paths.
 *
 * @param filePaths paths (files or directories) to scan, may be {@code null} or empty
 * @param datasetId id of the dataset the files are assigned to
 * @return the dataset files found, possibly empty
 */
public List<DatasetFile> scanFiles(List<String> filePaths, String datasetId) {
    List<DatasetFile> collected = new ArrayList<>();
    if (filePaths == null || filePaths.isEmpty()) {
        log.warn("文件路径列表为空,跳过扫描");
        return collected;
    }
    for (String candidate : filePaths) {
        try {
            Path candidatePath = Paths.get(candidate);
            if (!Files.exists(candidatePath)) {
                log.warn("路径不存在: {}", candidate);
            } else if (Files.isDirectory(candidatePath)) {
                scanDirectory(datasetId, candidate, candidatePath, collected);
            } else {
                // A plain file: handle it directly.
                DatasetFile metadata = extractFileMetadata(candidate, datasetId);
                if (metadata != null) {
                    collected.add(metadata);
                }
            }
        } catch (Exception e) {
            log.error("扫描路径失败: {}, 错误: {}", candidate, e.getMessage(), e);
        }
    }
    log.info("文件扫描完成,共扫描 {} 个文件", collected.size());
    return collected;
}
/**
 * Scans the regular files directly inside a directory (non-recursive) and appends their metadata.
 * <p>The directory stream returned by {@link Files#list(Path)} holds an open directory handle, so it is
 * opened in a try-with-resources block and closed as soon as the entries have been collected.
 *
 * @param datasetId    id of the dataset the files are assigned to
 * @param filePath     directory path as configured (used for logging)
 * @param path         the same directory as a {@link Path}
 * @param datasetFiles result list the extracted files are appended to
 * @throws IOException when the directory cannot be listed
 */
private void scanDirectory(String datasetId, String filePath, Path path,
        List<DatasetFile> datasetFiles) throws IOException {
    List<Path> filesInDir;
    try (var entries = Files.list(path)) {
        filesInDir = entries.filter(Files::isRegularFile).toList();
    }
    for (Path file : filesInDir) {
        try {
            DatasetFile datasetFile = extractFileMetadata(file.toString(), datasetId);
            if (datasetFile != null) {
                datasetFiles.add(datasetFile);
            }
        } catch (Exception e) {
            // One unreadable file must not abort the rest of the directory.
            log.error("处理目录中的文件失败: {}, 错误: {}", file, e.getMessage(), e);
        }
    }
    log.info("已扫描目录 {} 下的 {} 个文件", filePath, filesInDir.size());
}
/**
 * Builds a {@link DatasetFile} describing a single file on disk.
 * <p>The record gets a fresh UUID, the file name, size and extension, the current time as upload and
 * last-access time and the status {@code "UPLOADED"}.
 *
 * @param filePath  path of the file
 * @param datasetId id of the dataset the file is assigned to
 * @return the dataset file, or {@code null} when the path does not exist or is not a regular file
 * @throws IOException when the file size cannot be read
 */
private DatasetFile extractFileMetadata(String filePath, String datasetId) throws IOException {
    Path location = Paths.get(filePath);
    if (!Files.exists(location)) {
        log.warn("文件不存在: {}", filePath);
        return null;
    }
    if (!Files.isRegularFile(location)) {
        log.warn("路径不是文件: {}", filePath);
        return null;
    }
    String name = location.getFileName().toString();
    long sizeInBytes = Files.size(location);
    String extension = getFileExtension(name);
    return DatasetFile.builder()
            .id(UUID.randomUUID().toString())
            .datasetId(datasetId)
            .fileName(name)
            .filePath(filePath)
            .fileSize(sizeInBytes)
            .fileType(extension)
            .uploadTime(LocalDateTime.now())
            .lastAccessTime(LocalDateTime.now())
            .status("UPLOADED")
            .build();
}
/**
 * Returns the lower-case extension of a file name.
 * <p>The conversion uses {@link java.util.Locale#ROOT} so the result does not depend on the JVM default
 * locale (with a Turkish default locale, for example, {@code "GIF"} would otherwise not become
 * {@code "gif"}).
 *
 * @param fileName file name, must not be {@code null}
 * @return the extension in lower case, or {@code "unknown"} when the name has no dot, starts with its
 *         only dot (e.g. {@code .gitignore}) or ends with a dot
 */
private String getFileExtension(String fileName) {
    int lastDotIndex = fileName.lastIndexOf('.');
    if (lastDotIndex > 0 && lastDotIndex < fileName.length() - 1) {
        return fileName.substring(lastDotIndex + 1).toLowerCase(java.util.Locale.ROOT);
    }
    return "unknown";
}
}

View File

@@ -0,0 +1,116 @@
package com.datamate.datamanagement.application;
import com.datamate.datamanagement.domain.model.dataset.Tag;
import com.datamate.datamanagement.infrastructure.persistence.mapper.TagMapper;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.CollectionUtils;
import java.util.List;
import java.util.UUID;
/**
* 标签应用服务(UUID 主键)
*/
@Service
@Transactional
public class TagApplicationService {
private final TagMapper tagMapper;
/**
 * Creates the service with the mapper used for all tag persistence.
 *
 * @param tagMapper MyBatis mapper for tags
 */
@Autowired
public TagApplicationService(TagMapper tagMapper) {
this.tagMapper = tagMapper;
}
/**
 * Creates a new tag with a fresh UUID and a usage count of zero.
 *
 * @param name        tag name, must not be used by another tag
 * @param color       tag colour
 * @param description tag description
 * @return the created tag as re-read from the database
 * @throws IllegalArgumentException when a tag with the same name already exists
 */
public Tag createTag(String name, String color, String description) {
    // Reject duplicate names.
    Tag sameNameTag = tagMapper.findByName(name);
    if (sameNameTag != null) {
        throw new IllegalArgumentException("Tag with name '" + name + "' already exists");
    }
    Tag created = new Tag(name, description, null, color);
    created.setId(UUID.randomUUID().toString());
    created.setUsageCount(0L);
    tagMapper.insert(created);
    return tagMapper.findById(created.getId());
}
/**
 * Updates name, colour and description of an existing tag.
 *
 * @param tag tag carrying the new values; must contain the id of an existing tag
 * @return the updated tag as re-read from the database
 * @throws IllegalArgumentException when no tag with the given id exists
 */
@Transactional
public Tag updateTag(Tag tag) {
    String tagId = tag.getId();
    Tag stored = tagMapper.findById(tagId);
    if (stored == null) {
        throw new IllegalArgumentException("Tag not found: " + tagId);
    }
    stored.setName(tag.getName());
    stored.setColor(tag.getColor());
    stored.setDescription(tag.getDescription());
    tagMapper.update(stored);
    return tagMapper.findById(stored.getId());
}
/**
 * Deletes the given tags, provided none of them is still in use.
 * <p>Nothing happens when no ids are passed or none of them matches a tag. The emptiness checks run
 * before the tags are inspected, and a tag without a usage count is treated as unused, so neither case
 * ends in a {@link NullPointerException}.
 *
 * @param tagIds ids of the tags to delete, may be {@code null} or empty
 * @throws IllegalArgumentException when at least one of the tags has a usage count above zero
 */
@Transactional
public void deleteTag(List<String> tagIds) {
    if (CollectionUtils.isEmpty(tagIds)) {
        return;
    }
    List<Tag> tags = tagMapper.findByIdIn(tagIds);
    if (CollectionUtils.isEmpty(tags)) {
        return;
    }
    boolean anyInUse = tags.stream()
            .anyMatch(tag -> tag.getUsageCount() != null && tag.getUsageCount() > 0);
    if (anyInUse) {
        throw new IllegalArgumentException("Cannot delete tags that are in use");
    }
    tagMapper.deleteTagsById(tags.stream().map(Tag::getId).toList());
}
/**
 * Returns all tags.
 * <p>Delegates to {@code tagMapper.findAllByOrderByUsageCountDesc()}; judging by the mapper method name
 * the result is presumably ordered by usage count, highest first.
 *
 * @return all tags as returned by the mapper
 */
@Transactional(readOnly = true)
public List<Tag> getAllTags() {
return tagMapper.findAllByOrderByUsageCountDesc();
}
/**
 * Searches tags by keyword.
 *
 * @param keyword search keyword; surrounding whitespace is ignored
 * @return the tags matching the keyword, or all tags when the keyword is {@code null} or blank
 */
@Transactional(readOnly = true)
public List<Tag> searchTags(String keyword) {
    String normalized = keyword == null ? "" : keyword.trim();
    if (normalized.isEmpty()) {
        return getAllTags();
    }
    return tagMapper.findByKeyword(normalized);
}
/**
 * Returns the details of a tag.
 *
 * @param tagId tag id
 * @return the tag
 * @throws IllegalArgumentException when no tag with the given id exists
 */
@Transactional(readOnly = true)
public Tag getTag(String tagId) {
Tag tag = tagMapper.findById(tagId);
if (tag == null) {
throw new IllegalArgumentException("Tag not found: " + tagId);
}
return tag;
}
/**
 * Returns a tag by its name.
 *
 * @param name tag name
 * @return the tag
 * @throws IllegalArgumentException when no tag with the given name exists
 */
@Transactional(readOnly = true)
public Tag getTagByName(String name) {
Tag tag = tagMapper.findByName(name);
if (tag == null) {
throw new IllegalArgumentException("Tag not found: " + name);
}
return tag;
}
}

View File

@@ -0,0 +1,41 @@
package com.datamate.datamanagement.common.enums;
/**
 * Dataset status type.
 * <p>A dataset can be in one of the following states:
 * <p>DRAFT: the dataset is being created and is not complete yet.
 * <p>ACTIVE: the dataset is active; it can be queried and used, and also updated and deleted.
 * <p>PROCESSING: the dataset is being processed, which may take some time; it becomes active once
 * processing has finished.
 * <p>ARCHIVED: the dataset has been archived; its files cannot be updated, and it can be unlocked to
 * become active again.
 * <p>PUBLISHED: the dataset has been published for external use; external users can query and use it.
 * <p>DEPRECATED: the dataset has been deprecated and should no longer be used.
 *
 * @author dallas
 * @since 2025-10-17
 */
public enum DatasetStatusType {
/**
 * Draft
 */
DRAFT,
/**
 * Active
 */
ACTIVE,
/**
 * Processing
 */
PROCESSING,
/**
 * Archived
 */
ARCHIVED,
/**
 * Published
 */
PUBLISHED,
/**
 * Deprecated
 */
DEPRECATED
}

View File

@@ -0,0 +1,28 @@
package com.datamate.datamanagement.common.enums;
import lombok.Getter;
/**
* 数据集类型值对象
*
* @author DataMate
* @since 2025-10-15
*/
public enum DatasetType {
TEXT("text", "文本数据集"),
IMAGE("image", "图像数据集"),
AUDIO("audio", "音频数据集"),
VIDEO("video", "视频数据集"),
OTHER("other", "其他数据集");
@Getter
private final String code;
@Getter
private final String description;
DatasetType(String code, String description) {
this.code = code;
this.description = description;
}
}

View File

@@ -0,0 +1,11 @@
package com.datamate.datamanagement.domain.contants;
/**
 * Dataset constants.
 */
public interface DatasetConstant {
/**
 * Service id of the data management service; it is passed as the service id of pre-upload requests.
 */
String SERVICE_ID = "DATA_MANAGEMENT";
}

View File

@@ -0,0 +1,146 @@
package com.datamate.datamanagement.domain.model.dataset;
import com.baomidou.mybatisplus.annotation.TableField;
import com.baomidou.mybatisplus.annotation.TableName;
import com.baomidou.mybatisplus.extension.handlers.JacksonTypeHandler;
import com.datamate.common.domain.model.base.BaseEntity;
import com.datamate.datamanagement.common.enums.DatasetStatusType;
import com.datamate.datamanagement.common.enums.DatasetType;
import lombok.Getter;
import lombok.Setter;
import java.io.File;
import java.time.LocalDateTime;
import java.util.*;
/**
 * Dataset entity (aligned with the database table t_dm_datasets).
 *
 * <p>Besides the mapped columns, the entity keeps an in-memory list of files
 * ({@code files}, marked as not existing in the table) and maintains the
 * {@code fileCount} and {@code sizeBytes} aggregates whenever a file is added to
 * or removed from that list through {@link #addFile} / {@link #removeFile}.
 */
@Getter
@Setter
@TableName(value = "t_dm_datasets", autoResultMap = true)
public class Dataset extends BaseEntity<String> {
    /**
     * Dataset name.
     */
    private String name;
    /**
     * Dataset description.
     */
    private String description;
    /**
     * Dataset type.
     */
    private DatasetType datasetType;
    /**
     * Dataset category.
     */
    private String category;
    /**
     * Dataset path.
     */
    private String path;
    /**
     * Dataset format.
     */
    private String format;
    /**
     * Dataset schema information in JSON format, used to parse the file structure of this dataset.
     */
    private String schemaInfo;
    /**
     * Dataset size in bytes.
     */
    private Long sizeBytes = 0L;
    /**
     * Number of files.
     */
    private Long fileCount = 0L;
    /**
     * Number of records.
     */
    private Long recordCount = 0L;
    /**
     * Dataset retention period in days.
     */
    private Integer retentionDays = 0;
    /**
     * Tag list, stored in JSON format.
     */
    @TableField(typeHandler = JacksonTypeHandler.class)
    private Collection<Tag> tags = new HashSet<>();
    /**
     * Additional metadata in JSON format.
     */
    private String metadata;
    /**
     * Dataset status.
     */
    private DatasetStatusType status;
    /**
     * Whether this is a public dataset.
     */
    private Boolean isPublic = false;
    /**
     * Whether this is a featured dataset.
     */
    private Boolean isFeatured = false;
    /**
     * Dataset version number.
     */
    private Long version = 0L;
    /**
     * Files of this dataset; not mapped to a column of t_dm_datasets.
     */
    @TableField(exist = false)
    private List<DatasetFile> files = new ArrayList<>();

    /**
     * Creates an empty dataset with the field defaults declared above.
     */
    public Dataset() {
    }

    /**
     * Creates a dataset from its basic attributes and stamps creation and update time.
     *
     * @param name        dataset name
     * @param description dataset description
     * @param datasetType dataset type
     * @param category    dataset category
     * @param path        dataset path
     * @param format      dataset format
     * @param status      initial status
     * @param createdBy   identifier stored as the creator
     */
    public Dataset(String name, String description, DatasetType datasetType, String category, String path,
                   String format, DatasetStatusType status, String createdBy) {
        this.name = name;
        this.description = description;
        this.datasetType = datasetType;
        this.category = category;
        this.path = path;
        this.format = format;
        this.status = status;
        this.createdBy = createdBy;
        // Read the clock once so that createdAt and updatedAt are identical on a new dataset.
        LocalDateTime now = LocalDateTime.now();
        this.createdAt = now;
        this.updatedAt = now;
    }

    /**
     * Initializes the values assigned at creation time: a random UUID as id, a path of
     * the form {@code datasetBasePath + File.separator + id}, and the DRAFT status.
     *
     * @param datasetBasePath base directory under which the dataset directory is placed
     */
    public void initCreateParam(String datasetBasePath) {
        this.id = UUID.randomUUID().toString();
        this.path = datasetBasePath + File.separator + this.id;
        this.status = DatasetStatusType.DRAFT;
    }

    /**
     * Updates name, description and category and refreshes the update time.
     * A {@code null} argument leaves the corresponding field untouched; an empty
     * name is ignored as well, while an empty description or category is applied.
     *
     * @param name        new name, or {@code null}/empty to keep the current one
     * @param description new description, or {@code null} to keep the current one
     * @param category    new category, or {@code null} to keep the current one
     */
    public void updateBasicInfo(String name, String description, String category) {
        if (name != null && !name.isEmpty()) {
            this.name = name;
        }
        if (description != null) {
            this.description = description;
        }
        if (category != null) {
            this.category = category;
        }
        this.updatedAt = LocalDateTime.now();
    }

    /**
     * Sets the status and records who changed it and when.
     *
     * @param status    new status
     * @param updatedBy identifier stored as the last updater
     */
    public void updateStatus(DatasetStatusType status, String updatedBy) {
        this.status = status;
        this.updatedBy = updatedBy;
        this.updatedAt = LocalDateTime.now();
    }

    /**
     * Adds a file to the in-memory file list and updates the aggregates:
     * {@code fileCount} grows by one and {@code sizeBytes} by the file size
     * (a {@code null} file size counts as 0).
     *
     * @param file file to add; must not be {@code null}
     * @throws NullPointerException if {@code file} is {@code null}; the check runs
     *                              before any state is modified
     */
    public void addFile(DatasetFile file) {
        // Validate first: failing after files/fileCount were touched would leave the entity inconsistent.
        Objects.requireNonNull(file, "file must not be null");
        this.files.add(file);
        this.fileCount = valueOrZero(this.fileCount) + 1;
        this.sizeBytes = valueOrZero(this.sizeBytes) + valueOrZero(file.getFileSize());
        this.updatedAt = LocalDateTime.now();
    }

    /**
     * Removes a file from the in-memory file list and, only if it was actually
     * present, decreases {@code fileCount} and {@code sizeBytes} (both floored at 0)
     * and refreshes the update time. Presence is decided by
     * {@link List#remove(Object)}. A {@code null} argument is ignored.
     *
     * @param file file to remove
     */
    public void removeFile(DatasetFile file) {
        if (file == null || !this.files.remove(file)) {
            return;
        }
        this.fileCount = Math.max(0L, valueOrZero(this.fileCount) - 1);
        this.sizeBytes = Math.max(0L, valueOrZero(this.sizeBytes) - valueOrZero(file.getFileSize()));
        this.updatedAt = LocalDateTime.now();
    }

    /**
     * Treats a missing counter as 0; the generated setters allow the boxed
     * counters to be set to {@code null}.
     */
    private static long valueOrZero(Long value) {
        return value != null ? value : 0L;
    }
}

View File

@@ -0,0 +1,35 @@
package com.datamate.datamanagement.domain.model.dataset;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.*;
import java.time.LocalDateTime;
import java.util.List;
/**
 * Dataset file entity (aligned with the database table t_dm_dataset_files).
 * Accessors, a builder, and no-args/all-args constructors are generated by Lombok.
 */
@Getter
@Setter
@Builder
@NoArgsConstructor
@AllArgsConstructor
@TableName("t_dm_dataset_files")
public class DatasetFile {
// Primary key of the row.
@TableId
private String id; // UUID
private String datasetId; // UUID
private String fileName;
private String filePath;
private String fileType; // JPG/PNG/DCM/TXT
private Long fileSize; // bytes
private String checkSum;
// NOTE(review): no type handler is declared for this list here — confirm how it is mapped to the column.
private List<String> tags;
private String metadata;
private String status; // UPLOADED, PROCESSING, COMPLETED, ERROR
private LocalDateTime uploadTime;
private LocalDateTime lastAccessTime;
private LocalDateTime createdAt;
private LocalDateTime updatedAt;
}

View File

@@ -0,0 +1,18 @@
package com.datamate.datamanagement.domain.model.dataset;
import com.datamate.common.domain.model.UploadCheckInfo;
import lombok.Getter;
import lombok.Setter;
/**
 * Upload check information for dataset files.
 * Extends the common UploadCheckInfo with dataset-specific fields.
 */
@Getter
@Setter
public class DatasetFileUploadCheckInfo extends UploadCheckInfo {
/** Dataset id. */
private String datasetId;
/** Whether the upload is an archive (compressed package). */
private boolean hasArchive;
}

View File

@@ -0,0 +1,33 @@
package com.datamate.datamanagement.domain.model.dataset;
/**
 * Status constants: a single place that holds the status names as strings.
 *
 * <p>The class and its nested holders only expose {@code public static final}
 * constants and cannot be instantiated.
 */
public final class StatusConstants {

    private StatusConstants() {
        // constants holder, never instantiated
    }

    /**
     * Dataset file statuses.
     */
    public static final class DatasetFileStatuses {

        private DatasetFileStatuses() {
            // constants holder, never instantiated
        }

        public static final String UPLOADED = "UPLOADED";
        public static final String PROCESSING = "PROCESSING";
        public static final String COMPLETED = "COMPLETED";
        public static final String ERROR = "ERROR";
    }

    /**
     * Dataset statuses.
     */
    public static final class DatasetStatuses {

        private DatasetStatuses() {
            // constants holder, never instantiated
        }

        public static final String DRAFT = "DRAFT";
        public static final String ACTIVE = "ACTIVE";
        public static final String ARCHIVED = "ARCHIVED";
        public static final String PROCESSING = "PROCESSING";
    }
}

View File

@@ -0,0 +1,33 @@
package com.datamate.datamanagement.domain.model.dataset;
import com.datamate.common.domain.model.base.BaseEntity;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
/**
 * Tag entity (aligned with the database table t_dm_tags).
 *
 * <p>Accessors plus a no-args and an all-args constructor are generated by Lombok;
 * the field order below therefore defines the all-args constructor's parameter order.
 */
@Getter
@Setter
@NoArgsConstructor
@AllArgsConstructor
public class Tag extends BaseEntity<String> {
    /** Tag name. */
    private String name;
    /** Tag description. */
    private String description;
    /** Tag category. */
    private String category;
    /** Tag color. */
    private String color;
    /** Usage counter; starts at 0. */
    private Long usageCount = 0L;

    /**
     * Creates a tag from its descriptive attributes; the usage counter keeps its default of 0.
     *
     * @param name        tag name
     * @param description tag description
     * @param category    tag category
     * @param color       tag color
     */
    public Tag(String name, String description, String category, String color) {
        this.color = color;
        this.category = category;
        this.description = description;
        this.name = name;
    }

    /**
     * Decreases the usage counter by one. Does nothing when the counter is
     * {@code null} or not positive, so the counter never drops below zero.
     */
    public void decrementUsage() {
        if (this.usageCount == null || this.usageCount <= 0) {
            return;
        }
        this.usageCount = this.usageCount - 1;
    }
}

View File

@@ -0,0 +1,22 @@
package com.datamate.datamanagement.infrastructure.client;
import com.datamate.common.infrastructure.common.Response;
import com.datamate.datamanagement.infrastructure.client.dto.CollectionTaskDetailResponse;
import org.springframework.cloud.openfeign.FeignClient;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
/**
 * Feign client for the data collection service.
 * The target URL is read from the property {@code collection.service.url} and
 * falls back to {@code http://localhost:8080} when the property is not set.
 */
@FeignClient(name = "collection-service", url = "${collection.service.url:http://localhost:8080}")
public interface CollectionTaskClient {
/**
 * Gets the details of a collection task.
 * Issues a GET request to {@code /api/data-collection/tasks/{id}}.
 *
 * @param taskId task ID, bound to the {@code id} path variable
 * @return task details wrapped in the common Response type
 */
@GetMapping("/api/data-collection/tasks/{id}")
Response<CollectionTaskDetailResponse> getTaskDetail(@PathVariable("id") String taskId);
}

View File

@@ -0,0 +1,23 @@
package com.datamate.datamanagement.infrastructure.client.dto;
import lombok.Data;
import java.time.LocalDateTime;
import java.util.Map;
/**
 * Collection task detail response.
 * Plain data holder; accessors, equals/hashCode and toString are generated by Lombok's {@code @Data}.
 */
@Data
public class CollectionTaskDetailResponse {
private String id;
private String name;
private String description;
// Free-form task configuration keyed by name.
private Map<String, Object> config;
private String status;
private String syncMode;
// NOTE(review): presumably a cron-style expression — confirm against the collection service.
private String scheduleExpression;
private String lastExecutionId;
private LocalDateTime createdAt;
private LocalDateTime updatedAt;
}

View File

@@ -0,0 +1,21 @@
package com.datamate.datamanagement.infrastructure.client.dto;
import lombok.Data;
import java.util.List;
/**
 * Configuration of a local collection task.
 */
@Data
public class LocalCollectionConfig {
/**
 * Collection type.
 */
private String type;
/**
 * List of file paths.
 */
private List<String> filePaths;
}

View File

@@ -0,0 +1,37 @@
package com.datamate.datamanagement.infrastructure.config;
import org.springframework.boot.context.properties.EnableConfigurationProperties;
import org.springframework.cache.CacheManager;
import org.springframework.cache.annotation.EnableCaching;
import org.springframework.cache.concurrent.ConcurrentMapCacheManager;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.transaction.annotation.EnableTransactionManagement;
import org.springframework.web.multipart.support.StandardServletMultipartResolver;
/**
 * Configuration of the data management service.
 *
 * <p>Switches on transaction management and caching, registers
 * {@code DataManagementProperties} for property binding, and contributes the
 * cache manager and multipart resolver beans defined below.
 */
@Configuration
@EnableTransactionManagement
@EnableCaching
@EnableConfigurationProperties(DataManagementProperties.class)
public class DataManagementConfig {

    /** Names of the caches the cache manager is created with. */
    private static final String[] CACHE_NAMES = {"datasets", "datasetFiles", "tags"};

    /**
     * Cache manager.
     *
     * @return a {@link ConcurrentMapCacheManager} restricted to the caches in {@code CACHE_NAMES}
     */
    @Bean
    public CacheManager cacheManager() {
        return new ConcurrentMapCacheManager(CACHE_NAMES);
    }

    /**
     * File upload (multipart) resolver.
     *
     * @return a {@link StandardServletMultipartResolver} with its default settings
     */
    @Bean
    public StandardServletMultipartResolver multipartResolver() {
        return new StandardServletMultipartResolver();
    }
}

View File

@@ -0,0 +1,82 @@
package com.datamate.datamanagement.infrastructure.config;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.context.annotation.Configuration;
/**
* 数据管理服务配置属性
*/
@Configuration
@ConfigurationProperties(prefix = "datamanagement")
public class DataManagementProperties {
private FileStorage fileStorage = new FileStorage();
private Cache cache = new Cache();
public FileStorage getFileStorage() {
return fileStorage;
}
public void setFileStorage(FileStorage fileStorage) {
this.fileStorage = fileStorage;
}
public Cache getCache() {
return cache;
}
public void setCache(Cache cache) {
this.cache = cache;
}
public static class FileStorage {
private String uploadDir = "./uploads";
private long maxFileSize = 10485760; // 10MB
private long maxRequestSize = 52428800; // 50MB
public String getUploadDir() {
return uploadDir;
}
public void setUploadDir(String uploadDir) {
this.uploadDir = uploadDir;
}
public long getMaxFileSize() {
return maxFileSize;
}
public void setMaxFileSize(long maxFileSize) {
this.maxFileSize = maxFileSize;
}
public long getMaxRequestSize() {
return maxRequestSize;
}
public void setMaxRequestSize(long maxRequestSize) {
this.maxRequestSize = maxRequestSize;
}
}
public static class Cache {
private int ttl = 3600; // 1 hour
private int maxSize = 1000;
public int getTtl() {
return ttl;
}
public void setTtl(int ttl) {
this.ttl = ttl;
}
public int getMaxSize() {
return maxSize;
}
public void setMaxSize(int maxSize) {
this.maxSize = maxSize;
}
}
}

Some files were not shown because too many files have changed in this diff Show More