Merge branch 'main' into develop_deer

hhhhsc committed 2025-10-28 11:03:01 +08:00
121 changed files with 1999 additions and 935 deletions

View File

@@ -7,7 +7,7 @@
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<artifactId>datamate</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -39,7 +39,7 @@ paths:
schema:
type: integer
default: 0
description: Page number, starting from 0
description: Page number, starting from 1
- name: size
in: query
schema:
@@ -65,7 +65,7 @@ paths:
in: query
schema:
type: string
enum: [ACTIVE, INACTIVE, PROCESSING]
enum: [DRAFT, ACTIVE, PROCESSING, ARCHIVED, PUBLISHED, DEPRECATED]
description: Filter by dataset status
responses:
'200':
@@ -231,40 +231,6 @@ paths:
schema:
$ref: '#/components/schemas/PagedDatasetFileResponse'
post:
tags: [DatasetFile]
summary: Upload a file to a dataset
operationId: uploadDatasetFile
description: Upload a file to the specified dataset
parameters:
- name: datasetId
in: path
required: true
schema:
type: string
description: Dataset ID
requestBody:
required: true
content:
multipart/form-data:
schema:
type: object
properties:
file:
type: string
format: binary
description: The file to upload
description:
type: string
description: File description
responses:
'201':
description: Upload succeeded
content:
application/json:
schema:
$ref: '#/components/schemas/DatasetFileResponse'
/data-management/datasets/{datasetId}/files/{fileId}:
get:
tags: [DatasetFile]
@@ -342,6 +308,78 @@ paths:
type: string
format: binary
/data-management/datasets/{datasetId}/files/download:
get:
tags: [ DatasetFile ]
operationId: downloadDatasetFileAsZip
summary: Download files
description: Download all files in the dataset
parameters:
- name: datasetId
in: path
required: true
schema:
type: string
description: Dataset ID
responses:
'200':
description: File content
content:
application/octet-stream:
schema:
type: string
format: binary
/data-management/datasets/{datasetId}/files/upload/pre-upload:
post:
tags: [ DatasetFile ]
operationId: preUpload
summary: Chunked upload pre-upload
description: Pre-upload endpoint; returns the request ID required for subsequent chunk uploads
parameters:
- name: datasetId
in: path
required: true
schema:
type: string
description: Dataset ID
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/UploadFilesPreRequest'
responses:
'200':
description: Pre-upload succeeded; returns the request ID
content:
application/json:
schema:
type: string
/data-management/datasets/{datasetId}/files/upload/chunk:
post:
tags: [ DatasetFile ]
operationId: chunkUpload
summary: Chunked upload
description: Upload chunks using the request ID returned by pre-upload
parameters:
- name: datasetId
in: path
required: true
schema:
type: string
description: Dataset ID
requestBody:
required: true
content:
multipart/form-data:
schema:
$ref: '#/components/schemas/UploadFileRequest'
responses:
'200':
description: Upload succeeded
/data-management/dataset-types:
get:
operationId: getDatasetTypes
@@ -548,9 +586,59 @@ components:
description: Tag list
status:
type: string
enum: [ACTIVE, INACTIVE]
enum: [DRAFT, ACTIVE, PROCESSING, ARCHIVED, PUBLISHED, DEPRECATED]
description: Dataset status
UploadFilesPreRequest:
type: object
description: Chunked upload pre-upload request
properties:
hasArchive:
type: boolean
description: Whether the upload is an archive
default: false
totalFileNum:
type: integer
format: int32
minimum: 1
description: Total number of files
totalSize:
type: integer
format: int64
description: Total file size in bytes
required: [ totalFileNum ]
UploadFileRequest:
type: object
description: Chunk upload request
properties:
reqId:
type: string
description: Request ID returned by pre-upload
fileNo:
type: integer
format: int32
description: File number (position within the batch)
fileName:
type: string
description: File name
totalChunkNum:
type: integer
format: int32
description: Total number of chunks for the file
chunkNo:
type: integer
format: int32
description: Current chunk number (starting from 1)
file:
type: string
format: binary
description: Binary content of the chunk
checkSumHex:
type: string
description: Chunk checksum (hex-encoded)
required: [ reqId, fileNo, fileName, totalChunkNum, chunkNo, file ]
DatasetTypeResponse:
type: object
properties:

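Taken together, the pre-upload and chunk endpoints above form a two-step protocol: declare the batch once, then send numbered chunks under the returned request ID. A minimal curl sketch of that flow, assuming a local gateway at http://localhost:8080, a placeholder dataset ID, a file split into two chunks, and that the request ID sits in a `data` field of the standard response envelope (all of these are assumptions):

```bash
DATASET_ID=ds-123   # placeholder dataset ID
BASE=http://localhost:8080/data-management/datasets/$DATASET_ID/files

# 1. Pre-upload: declare the batch and receive the request ID (reqId).
REQ_ID=$(curl -s -X POST "$BASE/upload/pre-upload" \
  -H "Content-Type: application/json" \
  -d '{"hasArchive": false, "totalFileNum": 1, "totalSize": 10485760}' | jq -r '.data')

# 2. Split the file and upload each chunk as multipart/form-data; chunkNo starts at 1.
split -b 5M sample.csv chunk_
n=1
for part in chunk_*; do
  curl -X POST "$BASE/upload/chunk" \
    -F "reqId=$REQ_ID" \
    -F "fileNo=1" \
    -F "fileName=sample.csv" \
    -F "totalChunkNum=2" \
    -F "chunkNo=$n" \
    -F "file=@$part"
  n=$((n + 1))
done
```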
View File

@@ -6,11 +6,11 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<artifactId>datamate</artifactId>
<version>1.0.0-SNAPSHOT</version>
<packaging>pom</packaging>
<name>DataMatePlatform</name>
<name>DataMate</name>
<description>One-stop data platform for model fine-tuning and RAG retrieval</description>
<properties>

View File

@@ -7,7 +7,7 @@
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<artifactId>datamate</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

View File

@@ -7,7 +7,7 @@
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<artifactId>datamate</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

View File

@@ -6,7 +6,7 @@
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<artifactId>datamate</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>
@@ -87,7 +87,12 @@
<dependency>
<groupId>com.datamate</groupId>
<artifactId>domain-common</artifactId>
<version>1.0.0-SNAPSHOT</version>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.datamate</groupId>
<artifactId>data-management-service</artifactId>
<version>${project.version}</version>
</dependency>
<!-- OpenAPI Dependencies -->

View File

@@ -17,6 +17,7 @@ import org.springframework.transaction.annotation.EnableTransactionManagement;
@EnableTransactionManagement
@ComponentScan(basePackages = {
"com.datamate.collection",
"com.datamate.datamanagement",
"com.datamate.shared"
})
public class DataCollectionServiceConfiguration {

View File

@@ -7,12 +7,12 @@ import com.datamate.collection.domain.model.entity.CollectionTask;
import com.datamate.collection.domain.model.entity.TaskExecution;
import com.datamate.collection.common.enums.TaskStatus;
import com.datamate.collection.domain.repository.CollectionTaskRepository;
import com.datamate.collection.interfaces.dto.CollectionTaskPagingQuery;
import com.datamate.collection.common.enums.SyncMode;
import com.datamate.common.domain.utils.ChunksSaver;
import com.datamate.datamanagement.application.DatasetApplicationService;
import com.datamate.datamanagement.domain.model.dataset.Dataset;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
@@ -25,31 +25,32 @@ import java.util.Objects;
@RequiredArgsConstructor
public class CollectionTaskService {
private final TaskExecutionService taskExecutionService;
private final DatasetApplicationService datasetApplicationService;
private final CollectionTaskRepository collectionTaskRepository;
@Transactional
public CollectionTask create(CollectionTask task) {
task.setStatus(TaskStatus.READY);
task.setCreatedAt(LocalDateTime.now());
task.setUpdatedAt(LocalDateTime.now());
public CollectionTask create(CollectionTask task, String datasetId) {
task.initCreateParam();
collectionTaskRepository.save(task);
executeTaskNow(task);
executeTaskNow(task, datasetId);
return task;
}
private void executeTaskNow(CollectionTask task) {
private void executeTaskNow(CollectionTask task, String datasetId) {
if (Objects.equals(task.getSyncMode(), SyncMode.ONCE)) {
TaskExecution exec = taskExecutionService.createExecution(task);
int timeout = task.getTimeoutSeconds() == null ? 3600 : task.getTimeoutSeconds();
taskExecutionService.runAsync(task, exec.getId(), timeout);
taskExecutionService.runAsync(task, exec.getId(), timeout, datasetId);
log.info("Triggered DataX execution for task {} at {}, execId={}", task.getId(), LocalDateTime.now(), exec.getId());
}
}
@Transactional
public CollectionTask update(CollectionTask task) {
public CollectionTask update(CollectionTask task, String datasetId) {
task.setUpdatedAt(LocalDateTime.now());
task.addPath();
collectionTaskRepository.updateById(task);
executeTaskNow(task, datasetId);
return task;
}
@@ -66,11 +67,8 @@ public class CollectionTaskService {
return collectionTaskRepository.getById(id);
}
public IPage<CollectionTask> getTasks(CollectionTaskPagingQuery query) {
LambdaQueryWrapper<CollectionTask> wrapper = new LambdaQueryWrapper<CollectionTask>()
.eq(query.getStatus() != null, CollectionTask::getStatus, query.getStatus())
.like(StringUtils.isNotBlank(query.getName()), CollectionTask::getName, query.getName());
return collectionTaskRepository.page(new Page<>(query.getPage(), query.getSize()), wrapper);
public IPage<CollectionTask> getTasks(Page<CollectionTask> page, LambdaQueryWrapper<CollectionTask> wrapper) {
return collectionTaskRepository.page(page, wrapper);
}
public List<CollectionTask> selectActiveTasks() {

View File

@@ -6,6 +6,7 @@ import com.datamate.collection.common.enums.TaskStatus;
import com.datamate.collection.domain.process.ProcessRunner;
import com.datamate.collection.domain.repository.CollectionTaskRepository;
import com.datamate.collection.domain.repository.TaskExecutionRepository;
import com.datamate.datamanagement.application.DatasetApplicationService;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.Async;
@@ -21,6 +22,7 @@ public class TaskExecutionService {
private final ProcessRunner processRunner;
private final TaskExecutionRepository executionRepository;
private final CollectionTaskRepository collectionTaskRepository;
private final DatasetApplicationService datasetApplicationService;
@Transactional
@@ -39,7 +41,8 @@ public class TaskExecutionService {
}
@Async
public void runAsync(CollectionTask task, String executionId, int timeoutSeconds) {
@Transactional
public void runAsync(CollectionTask task, String executionId, int timeoutSeconds, String datasetId) {
try {
int code = processRunner.runJob(task, executionId, timeoutSeconds);
log.info("DataX finished with code {} for execution {}", code, executionId);
@@ -47,6 +50,7 @@ public class TaskExecutionService {
executionRepository.completeExecution(executionId, TaskStatus.SUCCESS.name(), LocalDateTime.now(),
0, 0L, 0L, 0L, null);
collectionTaskRepository.updateStatus(task.getId(), TaskStatus.SUCCESS.name());
datasetApplicationService.processDataSourceAsync(datasetId, task.getId());
} catch (Exception e) {
log.error("DataX execution failed", e);
executionRepository.completeExecution(executionId, TaskStatus.FAILED.name(), LocalDateTime.now(),

View File

@@ -10,8 +10,10 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.Getter;
import lombok.Setter;
import java.time.LocalDateTime;
import java.util.Collections;
import java.util.Map;
import java.util.UUID;
/**
* Data collection task entity (aligned with database table t_dc_collection_tasks)
@@ -46,4 +48,12 @@ public class CollectionTask extends BaseEntity<String> {
throw new RuntimeException(e);
}
}
public void initCreateParam() {
this.id = UUID.randomUUID().toString();
this.addPath();
this.status = TaskStatus.READY;
this.createdAt = LocalDateTime.now();
this.updatedAt = LocalDateTime.now();
}
}

View File

@@ -6,6 +6,7 @@ import java.util.Map;
import com.datamate.collection.common.enums.TaskStatus;
import com.datamate.collection.common.enums.SyncMode;
import com.datamate.datamanagement.interfaces.dto.DatasetResponse;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
@@ -44,5 +45,7 @@ public class CollectionTaskResponse {
@DateTimeFormat(iso = DateTimeFormat.ISO.DATE_TIME)
private LocalDateTime updatedAt;
private DatasetResponse dataset;
}

View File

@@ -1,6 +1,7 @@
package com.datamate.collection.interfaces.dto;
import com.datamate.collection.common.enums.SyncMode;
import com.datamate.datamanagement.interfaces.dto.CreateDatasetRequest;
import com.fasterxml.jackson.annotation.JsonProperty;
import java.util.HashMap;
@@ -49,5 +50,9 @@ public class CreateCollectionTaskRequest {
@Schema(name = "scheduleExpression", description = "Cron调度表达式 (syncMode=SCHEDULED 时必填)", requiredMode = Schema.RequiredMode.NOT_REQUIRED)
@JsonProperty("scheduleExpression")
private String scheduleExpression;
/** Dataset creation parameters */
@Valid
private CreateDatasetRequest dataset;
}

View File

@@ -46,5 +46,8 @@ public class UpdateCollectionTaskRequest {
@Schema(name = "scheduleExpression", description = "Cron调度表达式 (syncMode=SCHEDULED 时必填)", requiredMode = Schema.RequiredMode.NOT_REQUIRED)
@JsonProperty("scheduleExpression")
private String scheduleExpression;
/** Dataset ID */
private String datasetId;
}

View File

@@ -1,14 +1,21 @@
package com.datamate.collection.interfaces.rest;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
import com.datamate.collection.application.CollectionTaskService;
import com.datamate.collection.domain.model.entity.CollectionTask;
import com.datamate.collection.interfaces.converter.CollectionTaskConverter;
import com.datamate.collection.interfaces.dto.*;
import com.datamate.common.interfaces.PagedResponse;
import com.datamate.datamanagement.application.DatasetApplicationService;
import com.datamate.datamanagement.domain.model.dataset.Dataset;
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
import jakarta.validation.Valid;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.http.ResponseEntity;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.web.bind.annotation.*;
import java.util.*;
@@ -21,12 +28,19 @@ public class CollectionTaskController{
private final CollectionTaskService taskService;
private final DatasetApplicationService datasetService;
@PostMapping
@Transactional
public ResponseEntity<CollectionTaskResponse> createTask(@Valid @RequestBody CreateCollectionTaskRequest request) {
CollectionTask task = CollectionTaskConverter.INSTANCE.toCollectionTask(request);
task.setId(UUID.randomUUID().toString());
task.addPath();
return ResponseEntity.ok().body(CollectionTaskConverter.INSTANCE.toResponse(taskService.create(task)));
String datasetId = null;
if (Objects.nonNull(request.getDataset())) {
datasetId = datasetService.createDataset(request.getDataset()).getId();
}
CollectionTaskResponse response = CollectionTaskConverter.INSTANCE.toResponse(taskService.create(task, datasetId));
// Guard against a request without a dataset: getDataset(null) would otherwise be called
if (Objects.nonNull(datasetId)) {
response.setDataset(DatasetConverter.INSTANCE.convertToResponse(datasetService.getDataset(datasetId)));
}
return ResponseEntity.ok().body(response);
}
@PutMapping("/{id}")
@@ -36,7 +50,7 @@ public class CollectionTaskController{
}
CollectionTask task = CollectionTaskConverter.INSTANCE.toCollectionTask(request);
task.setId(id);
return ResponseEntity.ok(CollectionTaskConverter.INSTANCE.toResponse(taskService.update(task)));
return ResponseEntity.ok(CollectionTaskConverter.INSTANCE.toResponse(taskService.update(task, request.getDatasetId())));
}
@DeleteMapping("/{id}")
@@ -53,6 +67,10 @@ public class CollectionTaskController{
@GetMapping
public ResponseEntity<PagedResponse<CollectionTaskResponse>> getTasks(@Valid CollectionTaskPagingQuery query) {
return ResponseEntity.ok(CollectionTaskConverter.INSTANCE.toResponse(taskService.getTasks(query)));
Page<CollectionTask> page = new Page<>(query.getPage(), query.getSize());
LambdaQueryWrapper<CollectionTask> wrapper = new LambdaQueryWrapper<CollectionTask>()
.eq(query.getStatus() != null, CollectionTask::getStatus, query.getStatus())
.like(StringUtils.isNotBlank(query.getName()), CollectionTask::getName, query.getName());
return ResponseEntity.ok(CollectionTaskConverter.INSTANCE.toResponse(taskService.getTasks(page, wrapper)));
}
}

View File

@@ -53,7 +53,7 @@ public class TaskSchedulerInitializer {
// Due; trigger one execution
TaskExecution exec = taskExecutionService.createExecution(task);
int timeout = task.getTimeoutSeconds() == null ? 3600 : task.getTimeoutSeconds();
taskExecutionService.runAsync(task, exec.getId(), timeout);
taskExecutionService.runAsync(task, exec.getId(), timeout, null);
log.info("Triggered DataX execution for task {} at {}, execId={}", task.getId(), now, exec.getId());
}
} catch (Exception ex) {

View File

@@ -7,7 +7,7 @@
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<artifactId>datamate</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

View File

@@ -7,7 +7,7 @@
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<artifactId>datamate</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

View File

@@ -28,6 +28,8 @@ import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.util.StringUtils;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
@@ -220,63 +222,75 @@ public class DatasetApplicationService {
public void processDataSourceAsync(String datasetId, String dataSourceId) {
try {
log.info("开始处理数据源文件扫描,数据集ID: {}, 数据源ID: {}", datasetId, dataSourceId);
// 1. 调用数据归集服务获取任务详情
CollectionTaskDetailResponse taskDetail = collectionTaskClient.getTaskDetail(dataSourceId).getData();
if (taskDetail == null) {
log.error("获取归集任务详情失败,任务ID: {}", dataSourceId);
return;
}
log.info("获取到归集任务详情: {}", taskDetail);
// 2. 解析任务配置
LocalCollectionConfig config = parseTaskConfig(taskDetail.getConfig());
if (config == null) {
log.error("解析任务配置失败,任务ID: {}", dataSourceId);
return;
}
// 4. Get the file path list
List<String> filePaths = config.getFilePaths();
List<String> filePaths = getFilePaths(dataSourceId);
if (CollectionUtils.isEmpty(filePaths)) {
log.warn("文件路径列表为空,任务ID: {}", dataSourceId);
return;
}
log.info("开始扫描文件,共 {} 个文件路径", filePaths.size());
// 5. 扫描文件元数据
List<DatasetFile> datasetFiles = fileMetadataService.scanFiles(filePaths, datasetId);
// Look up files already present in the dataset
List<DatasetFile> existDatasetFileList = datasetFileRepository.findAllByDatasetId(datasetId);
Map<String, DatasetFile> existDatasetFilePathMap = existDatasetFileList.stream().collect(Collectors.toMap(DatasetFile::getFilePath, Function.identity()));
Dataset dataset = datasetRepository.getById(datasetId);
dataset.setFiles(existDatasetFileList);
// 6. Batch-insert into the dataset file table
if (CollectionUtils.isNotEmpty(datasetFiles)) {
for (DatasetFile datasetFile : datasetFiles) {
if (existDatasetFilePathMap.containsKey(datasetFile.getFilePath())) {
DatasetFile existDatasetFile = existDatasetFilePathMap.get(datasetFile.getFilePath());
dataset.removeFile(existDatasetFile);
existDatasetFile.setFileSize(datasetFile.getFileSize());
dataset.addFile(existDatasetFile);
datasetFileRepository.updateById(existDatasetFile);
} else {
dataset.addFile(datasetFile);
datasetFileRepository.save(datasetFile);
}
}
log.info("文件元数据写入完成,共写入 {} 条记录", datasetFiles.size());
} else {
log.warn("未扫描到有效文件");
}
// Batch-sync the dataset file table
asyncDatasetFile(datasetFiles, existDatasetFilePathMap, dataset, existDatasetFileList, filePaths);
datasetRepository.updateById(dataset);
} catch (Exception e) {
log.error("处理数据源文件扫描失败,数据集ID: {}, 数据源ID: {}", datasetId, dataSourceId, e);
}
}
private void asyncDatasetFile(List<DatasetFile> datasetFiles, Map<String, DatasetFile> existDatasetFilePathMap, Dataset dataset, List<DatasetFile> existDatasetFileList, List<String> filePaths) {
if (CollectionUtils.isNotEmpty(datasetFiles)) {
for (DatasetFile datasetFile : datasetFiles) {
if (existDatasetFilePathMap.containsKey(datasetFile.getFilePath())) {
DatasetFile existDatasetFile = existDatasetFilePathMap.get(datasetFile.getFilePath());
dataset.removeFile(existDatasetFile);
existDatasetFile.setFileSize(datasetFile.getFileSize());
dataset.addFile(existDatasetFile);
dataset.active();
datasetFileRepository.updateById(existDatasetFile);
} else {
dataset.addFile(datasetFile);
dataset.active();
datasetFileRepository.save(datasetFile);
}
}
log.info("文件元数据写入完成,共写入 {} 条记录", datasetFiles.size());
} else {
log.warn("未扫描到有效文件");
}
for (DatasetFile datasetFile : existDatasetFileList) {
String existFilePath = datasetFile.getFilePath();
for (String filePath : filePaths) {
if (existFilePath.equals(filePath) || existFilePath.startsWith(filePath)) {
if (Files.notExists(Paths.get(existFilePath))) {
dataset.removeFile(datasetFile);
datasetFileRepository.removeById(datasetFile.getId());
}
}
}
}
}
private List<String> getFilePaths(String dataSourceId) {
CollectionTaskDetailResponse taskDetail = collectionTaskClient.getTaskDetail(dataSourceId).getData();
if (taskDetail == null) {
log.warn("获取归集任务详情失败,任务ID: {}", dataSourceId);
return Collections.emptyList();
}
log.info("获取到归集任务详情: {}", taskDetail);
LocalCollectionConfig config = parseTaskConfig(taskDetail.getConfig());
if (config == null) {
log.warn("解析任务配置失败,任务ID: {}", dataSourceId);
return Collections.emptyList();
}
return config.getFilePaths();
}
/**
* Parse the task configuration
*/

View File

@@ -10,7 +10,6 @@ import com.datamate.datamanagement.domain.contants.DatasetConstant;
import com.datamate.datamanagement.domain.model.dataset.Dataset;
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckInfo;
import com.datamate.datamanagement.domain.model.dataset.StatusConstants;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
@@ -31,7 +30,6 @@ import org.springframework.data.domain.Pageable;
import org.springframework.http.HttpHeaders;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.web.multipart.MultipartFile;
import java.io.BufferedInputStream;
import java.io.File;
@@ -41,12 +39,9 @@ import java.net.MalformedURLException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.List;
import java.util.Objects;
import java.util.UUID;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
@@ -60,7 +55,6 @@ public class DatasetFileApplicationService {
private final DatasetFileRepository datasetFileRepository;
private final DatasetRepository datasetRepository;
private final Path fileStorageLocation;
private final FileService fileService;
@Value("${dataset.base.path:/dataset}")
@@ -68,61 +62,10 @@ public class DatasetFileApplicationService {
@Autowired
public DatasetFileApplicationService(DatasetFileRepository datasetFileRepository,
DatasetRepository datasetRepository, FileService fileService,
@Value("${app.file.upload-dir:./dataset}") String uploadDir) {
DatasetRepository datasetRepository, FileService fileService) {
this.datasetFileRepository = datasetFileRepository;
this.datasetRepository = datasetRepository;
this.fileStorageLocation = Paths.get(uploadDir).toAbsolutePath().normalize();
this.fileService = fileService;
try {
Files.createDirectories(this.fileStorageLocation);
} catch (Exception ex) {
throw new RuntimeException("Could not create the directory where the uploaded files will be stored.", ex);
}
}
/**
* Upload a file to a dataset
*/
public DatasetFile uploadFile(String datasetId, MultipartFile file) {
Dataset dataset = datasetRepository.getById(datasetId);
if (dataset == null) {
throw new IllegalArgumentException("Dataset not found: " + datasetId);
}
String originalFilename = file.getOriginalFilename();
String fileName = originalFilename != null ? originalFilename : "file";
try {
// Save the file to disk
Path targetLocation = this.fileStorageLocation.resolve(datasetId + File.separator + fileName);
// Ensure the target directory exists
Files.createDirectories(targetLocation);
Files.copy(file.getInputStream(), targetLocation, StandardCopyOption.REPLACE_EXISTING);
// Create the file entity (UUID primary key)
DatasetFile datasetFile = new DatasetFile();
datasetFile.setId(UUID.randomUUID().toString());
datasetFile.setDatasetId(datasetId);
datasetFile.setFileName(fileName);
datasetFile.setFilePath(targetLocation.toString());
datasetFile.setFileType(getFileExtension(originalFilename));
datasetFile.setFileSize(file.getSize());
datasetFile.setUploadTime(LocalDateTime.now());
datasetFile.setStatus(StatusConstants.DatasetFileStatuses.COMPLETED);
// Save to the database
datasetFileRepository.save(datasetFile);
// Update dataset statistics
dataset.addFile(datasetFile);
datasetRepository.updateById(dataset);
return datasetFileRepository.findByDatasetIdAndFileName(datasetId, fileName);
} catch (IOException ex) {
log.error("Could not store file {}", fileName, ex);
throw new RuntimeException("Could not store file " + fileName, ex);
}
}
/**
@@ -155,20 +98,21 @@ public class DatasetFileApplicationService {
/**
* Delete a file
*/
@Transactional
public void deleteDatasetFile(String datasetId, String fileId) {
DatasetFile file = getDatasetFile(datasetId, fileId);
try {
Path filePath = Paths.get(file.getFilePath());
Files.deleteIfExists(filePath);
} catch (IOException ex) {
// ignore
Dataset dataset = datasetRepository.getById(datasetId);
// On delete: files uploaded into the dataset are removed from both the database and the filesystem; files brought in by collection tasks only have their database records removed
if (file.getFilePath().startsWith(dataset.getPath())) {
try {
Path filePath = Paths.get(file.getFilePath());
Files.deleteIfExists(filePath);
} catch (IOException ex) {
throw BusinessException.of(SystemErrorCode.FILE_SYSTEM_ERROR);
}
}
datasetFileRepository.removeById(fileId);
Dataset dataset = datasetRepository.getById(datasetId);
// Simple stats refresh (exact values could be recomputed from the DB)
dataset.setFileCount(Math.max(0, dataset.getFileCount() - 1));
dataset.setSizeBytes(Math.max(0, dataset.getSizeBytes() - (file.getFileSize() != null ? file.getFileSize() : 0)));
dataset.removeFile(file);
datasetRepository.updateById(dataset);
}
@@ -197,6 +141,7 @@ public class DatasetFileApplicationService {
@Transactional(readOnly = true)
public void downloadDatasetFileAsZip(String datasetId, HttpServletResponse response) {
List<DatasetFile> allByDatasetId = datasetFileRepository.findAllByDatasetId(datasetId);
fileRename(allByDatasetId);
response.setContentType("application/zip");
String zipName = String.format("dataset_%s.zip",
LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMddHHmmss")));
@@ -211,6 +156,27 @@ public class DatasetFileApplicationService {
}
}
private void fileRename(List<DatasetFile> files) {
Set<String> uniqueFilenames = new HashSet<>();
for (DatasetFile file : files) {
String originalFilename = file.getFileName();
if (!uniqueFilenames.add(originalFilename)) {
String newFilename;
int counter = 1;
do {
newFilename = generateNewFilename(originalFilename, counter);
counter++;
} while (!uniqueFilenames.add(newFilename));
file.setFileName(newFilename);
}
}
}
private String generateNewFilename(String oldFilename, int counter) {
int dotIndex = oldFilename.lastIndexOf(".");
// A name without an extension would make substring(0, -1) throw, so append the counter directly
if (dotIndex == -1) {
return oldFilename + "-(" + counter + ")";
}
return oldFilename.substring(0, dotIndex) + "-(" + counter + ")" + oldFilename.substring(dotIndex);
}
private void addToZipFile(DatasetFile file, ZipOutputStream zos) throws IOException {
if (file.getFilePath() == null || !Files.exists(Paths.get(file.getFilePath()))) {
log.warn("The file hasn't been found on filesystem, id: {}", file.getId());
@@ -229,17 +195,6 @@ public class DatasetFileApplicationService {
}
}
private String getFileExtension(String fileName) {
if (fileName == null || fileName.isEmpty()) {
return null;
}
int lastDotIndex = fileName.lastIndexOf(".");
if (lastDotIndex == -1) {
return null;
}
return fileName.substring(lastDotIndex + 1);
}
/**
* Pre-upload
*
@@ -275,9 +230,6 @@ public class DatasetFileApplicationService {
public void chunkUpload(String datasetId, UploadFileRequest uploadFileRequest) {
FileUploadResult uploadResult = fileService.chunkUpload(DatasetConverter.INSTANCE.toChunkUploadRequest(uploadFileRequest));
saveFileInfoToDb(uploadResult, uploadFileRequest, datasetId);
if (uploadResult.isAllFilesUploaded()) {
// Parse the files; depending on requirements, validation of file metadata and parsing of semi-structured files may be added later
}
}
private void saveFileInfoToDb(FileUploadResult fileUploadResult, UploadFileRequest uploadFile, String datasetId) {
@@ -301,6 +253,7 @@ public class DatasetFileApplicationService {
datasetFileRepository.save(datasetFile);
dataset.addFile(datasetFile);
dataset.active();
datasetRepository.updateById(dataset);
}
}

View File

@@ -143,4 +143,10 @@ public class Dataset extends BaseEntity<String> {
this.updatedAt = LocalDateTime.now();
}
}
public void active() {
if (this.status == DatasetStatusType.DRAFT) {
this.status = DatasetStatusType.ACTIVE;
}
}
}

View File

@@ -7,7 +7,6 @@ import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
import com.datamate.common.domain.model.ChunkUploadRequest;
import com.datamate.datamanagement.domain.model.dataset.Dataset;
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
import com.datamate.datamanagement.interfaces.dto.*;
import org.mapstruct.Mapper;
import org.mapstruct.Mapping;
import org.mapstruct.factory.Mappers;

View File

@@ -10,11 +10,11 @@ import lombok.Setter;
@Setter
public class AllDatasetStatisticsResponse {
/** Total number of datasets */
private Integer totalDatasets;
private Integer totalDatasets = 0;
/** Total size in bytes */
private Long totalSize;
private Long totalSize = 0L;
/** Total number of files */
private Long totalFiles;
private Long totalFiles = 0L;
}

View File

@@ -3,6 +3,7 @@ package com.datamate.datamanagement.interfaces.dto;
import com.datamate.datamanagement.common.enums.DatasetType;
import jakarta.validation.constraints.NotBlank;
import jakarta.validation.constraints.NotNull;
import jakarta.validation.constraints.Size;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
@@ -19,9 +20,11 @@ import java.util.List;
@AllArgsConstructor
public class CreateDatasetRequest {
/** Dataset name */
@Size(min = 1, max = 100)
@NotBlank(message = "Dataset name must not be blank")
private String name;
/** Dataset description */
@Size(max = 500)
private String description;
/** Dataset type */
@NotNull(message = "Dataset type must not be null")
@@ -30,6 +33,4 @@ public class CreateDatasetRequest {
private List<String> tags;
/** Data source */
private String dataSource;
/** Target location */
private String targetLocation;
}

View File

@@ -24,6 +24,8 @@ public class DatasetResponse {
private String status;
/** Tag list */
private List<TagResponse> tags;
/** Dataset retention period in days */
private Integer retentionDays;
/** Data source */
private String dataSource;
/** Target location */

View File

@@ -1,6 +1,8 @@
package com.datamate.datamanagement.interfaces.dto;
import com.datamate.datamanagement.common.enums.DatasetStatusType;
import jakarta.validation.constraints.NotBlank;
import jakarta.validation.constraints.Size;
import lombok.Getter;
import lombok.Setter;
@@ -13,8 +15,11 @@ import java.util.List;
@Setter
public class UpdateDatasetRequest {
/** Dataset name */
@Size(min = 1, max = 100)
@NotBlank(message = "Dataset name must not be blank")
private String name;
/** Dataset description */
@Size(max = 500)
private String description;
/** Collection task ID */
private String dataSource;

View File

@@ -68,22 +68,6 @@ public class DatasetFileController {
return ResponseEntity.ok(Response.ok(response));
}
@PostMapping(consumes = MediaType.MULTIPART_FORM_DATA_VALUE)
public ResponseEntity<Response<DatasetFileResponse>> uploadDatasetFile(
@PathVariable("datasetId") String datasetId,
@RequestPart(value = "file", required = false) MultipartFile file) {
try {
DatasetFile datasetFile = datasetFileApplicationService.uploadFile(datasetId, file);
return ResponseEntity.status(HttpStatus.CREATED).body(Response.ok(DatasetConverter.INSTANCE.convertToResponse(datasetFile)));
} catch (IllegalArgumentException e) {
return ResponseEntity.badRequest().body(Response.error(SystemErrorCode.UNKNOWN_ERROR, null));
} catch (Exception e) {
log.error("upload fail", e);
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(Response.error(SystemErrorCode.UNKNOWN_ERROR, null));
}
}
@GetMapping("/{fileId}")
public ResponseEntity<Response<DatasetFileResponse>> getDatasetFileById(
@PathVariable("datasetId") String datasetId,
@@ -109,10 +93,9 @@ public class DatasetFileController {
}
@IgnoreResponseWrap
@GetMapping(value = "/{fileId}/download", produces = MediaType.APPLICATION_OCTET_STREAM_VALUE)
public ResponseEntity<Resource> downloadDatasetFileById(
@PathVariable("datasetId") String datasetId,
@PathVariable("fileId") String fileId) {
@GetMapping(value = "/{fileId}/download", produces = MediaType.APPLICATION_OCTET_STREAM_VALUE + ";charset=UTF-8")
public ResponseEntity<Resource> downloadDatasetFileById(@PathVariable("datasetId") String datasetId,
@PathVariable("fileId") String fileId) {
try {
DatasetFile datasetFile = datasetFileApplicationService.getDatasetFile(datasetId, fileId);
Resource resource = datasetFileApplicationService.downloadFile(datasetId, fileId);
@@ -142,8 +125,8 @@ public class DatasetFileController {
* @return the batch upload request ID
*/
@PostMapping("/upload/pre-upload")
public ResponseEntity<Response<String>> preUpload(@PathVariable("datasetId") String datasetId, @RequestBody @Valid UploadFilesPreRequest request) {
public ResponseEntity<Response<String>> preUpload(@PathVariable("datasetId") String datasetId,
@RequestBody @Valid UploadFilesPreRequest request) {
return ResponseEntity.ok(Response.ok(datasetFileApplicationService.preUpload(request, datasetId)));
}
@@ -153,7 +136,7 @@ public class DatasetFileController {
* @param uploadFileRequest the file upload request
*/
@PostMapping("/upload/chunk")
public ResponseEntity<Void> chunkUpload(@PathVariable("datasetId") String datasetId, UploadFileRequest uploadFileRequest) {
public ResponseEntity<Void> chunkUpload(@PathVariable("datasetId") String datasetId, @Valid UploadFileRequest uploadFileRequest) {
log.info("file upload reqId:{}, fileNo:{}, total chunk num:{}, current chunkNo:{}",
uploadFileRequest.getReqId(), uploadFileRequest.getFileNo(), uploadFileRequest.getTotalChunkNum(),
uploadFileRequest.getChunkNo());

View File

@@ -0,0 +1,65 @@
package com.datamate.datamanagement.interfaces.scheduler;
import com.datamate.common.interfaces.PagedResponse;
import com.datamate.datamanagement.application.DatasetApplicationService;
import com.datamate.datamanagement.interfaces.dto.DatasetPagingQuery;
import com.datamate.datamanagement.interfaces.dto.DatasetResponse;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.collections4.CollectionUtils;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.time.LocalDateTime;
/**
* Scheduled tasks for datasets
*
* @since 2025/10/24
*/
@Slf4j
@Component
@RequiredArgsConstructor
public class DatasetScheduler {
private final DatasetApplicationService datasetApplicationService;
/**
* Scan at 00:00 every day and delete datasets past their retention period
*/
@Scheduled(cron = "0 0 0 * * ?")
public void cleanupExpiredDatasets() {
int pageNo = 1;
int pageSize = 500;
while (true) {
DatasetPagingQuery datasetPagingQuery = new DatasetPagingQuery();
datasetPagingQuery.setPage(pageNo);
datasetPagingQuery.setSize(pageSize);
PagedResponse<DatasetResponse> datasets = datasetApplicationService.getDatasets(datasetPagingQuery);
if (CollectionUtils.isEmpty(datasets.getContent())) {
break;
}
datasets.getContent().forEach(dataset -> {
Integer retentionDays = dataset.getRetentionDays();
LocalDateTime createdAt = dataset.getCreatedAt();
if (retentionDays != null && retentionDays > 0 && createdAt != null) {
LocalDateTime expireAt = createdAt.plusDays(retentionDays);
if (expireAt.isBefore(LocalDateTime.now())) {
try {
log.info("Deleting dataset {}, expired at {} (retentionDays={})", dataset.getId(), expireAt, retentionDays);
datasetApplicationService.deleteDataset(dataset.getId());
} catch (Exception e) {
log.warn("Failed to delete expired dataset {}: {}", dataset.getId(), e.getMessage());
}
}
}
});
if (datasets.getPage() >= datasets.getTotalPages()) {
break;
}
pageNo++;
}
}
}

View File

@@ -7,7 +7,7 @@
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<artifactId>datamate</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

View File

@@ -7,7 +7,7 @@
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<artifactId>datamate</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

View File

@@ -7,7 +7,7 @@
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<artifactId>datamate</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

View File

@@ -1,7 +1,7 @@
# Data engine platform - main application configuration
spring:
application:
name: data-mate-platform
name: datamate
# Temporarily exclude Spring Security auto-configuration (development only)
autoconfigure:

View File

@@ -7,7 +7,7 @@
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<artifactId>datamate</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

View File

@@ -7,7 +7,7 @@
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<artifactId>datamate</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

View File

@@ -7,7 +7,7 @@
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<artifactId>datamate</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

View File

@@ -7,7 +7,7 @@
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<artifactId>datamate</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

View File

@@ -7,7 +7,7 @@
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<artifactId>datamate</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>
@@ -33,5 +33,10 @@
<groupId>com.fasterxml.jackson.datatype</groupId>
<artifactId>jackson-datatype-jsr310</artifactId>
</dependency>
<dependency>
<groupId>dev.langchain4j</groupId>
<artifactId>langchain4j-open-ai</artifactId>
<version>1.8.0</version>
</dependency>
</dependencies>
</project>

View File

@@ -0,0 +1,23 @@
package com.datamate.common.infrastructure.exception;
import lombok.AllArgsConstructor;
import lombok.Getter;
/**
* Knowledge base error codes
*
* @author dallas
* @since 2025-10-24
*/
@Getter
@AllArgsConstructor
public enum KnowledgeBaseErrorCode implements ErrorCode {
/**
* Knowledge base not found
*/
KNOWLEDGE_BASE_NOT_FOUND("knowledge.0001", "Knowledge base not found");
private final String code;
private final String message;
}

View File

@@ -0,0 +1,85 @@
package com.datamate.common.models.application;
import com.baomidou.mybatisplus.core.metadata.IPage;
import com.datamate.common.infrastructure.exception.BusinessAssert;
import com.datamate.common.interfaces.PagedResponse;
import com.datamate.common.models.domain.entity.ModelConfig;
import com.datamate.common.models.domain.repository.ModelConfigRepository;
import com.datamate.common.models.infrastructure.client.ModelClient;
import com.datamate.common.models.infrastructure.exception.ModelsErrorCode;
import com.datamate.common.models.interfaces.rest.dto.CreateModelRequest;
import com.datamate.common.models.interfaces.rest.dto.QueryModelRequest;
import dev.langchain4j.model.chat.ChatModel;
import jakarta.validation.Valid;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.List;
/**
* Application service for model configurations
*
* @author dallas
* @since 2025-10-27
*/
@Service
@RequiredArgsConstructor
public class ModelConfigApplicationService {
private final ModelConfigRepository modelConfigRepository;
public List<ModelConfig> getProviders() {
List<ModelConfig> providers = new ArrayList<>();
providers.add(ModelConfig.builder().provider("ModelEngine").baseUrl("http://localhost:9981").build());
providers.add(ModelConfig.builder().provider("Ollama").baseUrl("http://localhost:11434").build());
providers.add(ModelConfig.builder().provider("OpenAI").baseUrl("https://api.openai.com/v1").build());
providers.add(ModelConfig.builder().provider("DeepSeek").baseUrl("https://api.deepseek.cn/v1").build());
providers.add(ModelConfig.builder().provider("火山方舟").baseUrl("https://ark.cn-beijing.volces.com/api/v3").build());
providers.add(ModelConfig.builder().provider("阿里云百炼").baseUrl("https://dashscope.aliyuncs.com/compatible-mode/v1").build());
providers.add(ModelConfig.builder().provider("硅基流动").baseUrl("https://api.siliconflow.cn/v1").build());
providers.add(ModelConfig.builder().provider("智谱AI").baseUrl("https://open.bigmodel.cn/api/paas/v4").build());
return providers;
}
public PagedResponse<ModelConfig> getModels(QueryModelRequest queryModelRequest) {
// Query model configurations from the database
IPage<ModelConfig> page = modelConfigRepository.page(queryModelRequest);
return PagedResponse.of(page.getRecords(), page.getCurrent(), page.getTotal(), page.getPages());
}
public ModelConfig getModelDetail(String modelId) {
return modelConfigRepository.getById(modelId);
}
public ModelConfig createModel(CreateModelRequest modelConfig) {
ModelConfig newConfig = ModelConfig.builder()
.provider(modelConfig.getProvider())
.modelName(modelConfig.getModelName())
.type(modelConfig.getType())
.baseUrl(modelConfig.getBaseUrl())
.apiKey(modelConfig.getApiKey())
.isEnabled(true)
.build();
ModelClient.checkHealth(newConfig);
modelConfigRepository.save(newConfig);
return newConfig;
}
public ModelConfig updateModel(String modelId, @Valid CreateModelRequest updateModelRequest) {
ModelConfig modelConfig = modelConfigRepository.getById(modelId);
BusinessAssert.notNull(modelConfig, ModelsErrorCode.MODEL_CONFIG_NOT_FOUND);
modelConfig.setProvider(updateModelRequest.getProvider());
modelConfig.setModelName(updateModelRequest.getModelName());
modelConfig.setType(updateModelRequest.getType());
modelConfig.setBaseUrl(updateModelRequest.getBaseUrl());
modelConfig.setApiKey(updateModelRequest.getApiKey());
modelConfig.setIsEnabled(true);
ModelClient.checkHealth(modelConfig);
modelConfigRepository.updateById(modelConfig);
return modelConfig;
}
public void deleteModel(String modelId) {
modelConfigRepository.removeById(modelId);
}
}

View File

@@ -0,0 +1,44 @@
package com.datamate.common.models.domain.entity;
import com.baomidou.mybatisplus.annotation.TableName;
import com.datamate.common.domain.model.base.BaseEntity;
import lombok.Builder;
import lombok.Getter;
import lombok.Setter;
/**
* Model configuration entity
*
* @author dallas
* @since 2025-10-27
*/
@Getter
@Setter
@TableName("t_model_config")
@Builder
public class ModelConfig extends BaseEntity<String> {
/**
* Model name (e.g. qwen2)
*/
private String modelName;
/**
* Model provider (e.g. Ollama, OpenAI, DeepSeek)
*/
private String provider;
/**
* API base URL
*/
private String baseUrl;
/**
* API key (empty when no key is required)
*/
private String apiKey;
/**
* Model type (e.g. chat, embedding)
*/
private ModelType type;
/**
* Whether enabled: 1 - enabled, 0 - disabled
*/
private Boolean isEnabled;
}

View File

@@ -0,0 +1,18 @@
package com.datamate.common.models.domain.entity;
/**
* Model type enum
*
* @author dallas
* @since 2025-10-27
*/
public enum ModelType {
/**
* Chat (language) model
*/
CHAT,
/**
* Embedding model
*/
EMBEDDING
}

View File

@@ -0,0 +1,22 @@
package com.datamate.common.models.domain.repository;
import com.baomidou.mybatisplus.core.metadata.IPage;
import com.baomidou.mybatisplus.extension.repository.IRepository;
import com.datamate.common.models.domain.entity.ModelConfig;
import com.datamate.common.models.interfaces.rest.dto.QueryModelRequest;
/**
* Repository interface for model configurations
*
* @author dallas
* @since 2025-10-27
*/
public interface ModelConfigRepository extends IRepository<ModelConfig> {
/**
* Paged query for model configurations
*
* @param queryModelRequest paging query parameters
* @return a page of model configurations
*/
IPage<ModelConfig> page(QueryModelRequest queryModelRequest);
}

View File

@@ -0,0 +1,44 @@
package com.datamate.common.models.infrastructure.client;
import com.datamate.common.models.domain.entity.ModelConfig;
import com.datamate.common.models.domain.entity.ModelType;
import dev.langchain4j.model.chat.ChatModel;
import dev.langchain4j.model.embedding.EmbeddingModel;
import dev.langchain4j.model.openai.OpenAiChatModel;
import dev.langchain4j.model.openai.OpenAiEmbeddingModel;
import java.util.function.Consumer;
/**
* Model client
*
* @author dallas
* @since 2025-10-27
*/
public class ModelClient {
public static <T> T invokeModel(ModelConfig modelConfig, Class<T> modelInterface) {
return switch (modelConfig.getType()) {
case CHAT -> modelInterface.cast(invokeChatModel(modelConfig));
case EMBEDDING -> modelInterface.cast(invokeEmbeddingModel(modelConfig));
};
}
private static EmbeddingModel invokeEmbeddingModel(ModelConfig modelConfig) {
return OpenAiEmbeddingModel.builder()
.baseUrl(modelConfig.getBaseUrl())
.apiKey(modelConfig.getApiKey())
.modelName(modelConfig.getModelName())
.build();
}
private static ChatModel invokeChatModel(ModelConfig modelConfig) {
return OpenAiChatModel.builder()
.baseUrl(modelConfig.getBaseUrl())
.apiKey(modelConfig.getApiKey())
.modelName(modelConfig.getModelName())
.build();
}
public static void checkHealth(ModelConfig modelConfig) {
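// Currently a no-op placeholder; a real health check could issue a lightweight request to the configured endpoint before saving.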
}
}

View File

@@ -0,0 +1,27 @@
package com.datamate.common.models.infrastructure.exception;
import com.datamate.common.infrastructure.exception.ErrorCode;
import lombok.AllArgsConstructor;
import lombok.Getter;
/**
* Error code enum for model configurations
*
* @author dallas
* @since 2025-10-27
*/
@Getter
@AllArgsConstructor
public enum ModelsErrorCode implements ErrorCode {
/**
* Model configuration not found
*/
MODEL_CONFIG_NOT_FOUND("model.0001", "Model configuration not found"),
/**
* Model configuration already exists
*/
MODEL_CONFIG_ALREADY_EXISTS("model.0002", "Model configuration already exists");
private final String code;
private final String message;
}

View File

@@ -0,0 +1,37 @@
package com.datamate.common.models.infrastructure.persistence.impl;
import com.baomidou.mybatisplus.core.conditions.query.LambdaQueryWrapper;
import com.baomidou.mybatisplus.core.metadata.IPage;
import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
import com.baomidou.mybatisplus.extension.repository.CrudRepository;
import com.datamate.common.models.domain.entity.ModelConfig;
import com.datamate.common.models.domain.repository.ModelConfigRepository;
import com.datamate.common.models.infrastructure.persistence.mapper.ModelConfigMapper;
import com.datamate.common.models.interfaces.rest.dto.QueryModelRequest;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Repository;
import org.springframework.util.StringUtils;
import java.util.Objects;
/**
* Repository implementation for model configurations
*
* @author dallas
* @since 2025-10-27
*/
@Repository
@RequiredArgsConstructor
public class ModelConfigRepositoryImpl extends CrudRepository<ModelConfigMapper, ModelConfig> implements ModelConfigRepository {
private final ModelConfigMapper modelConfigMapper;
@Override
public IPage<ModelConfig> page(QueryModelRequest queryModelRequest) {
IPage<ModelConfig> page = new Page<>(queryModelRequest.getPage(), queryModelRequest.getSize());
return this.page(page, new LambdaQueryWrapper<ModelConfig>()
.eq(StringUtils.hasText(queryModelRequest.getProvider()), ModelConfig::getProvider, queryModelRequest.getProvider())
.eq(Objects.nonNull(queryModelRequest.getType()), ModelConfig::getType, queryModelRequest.getType())
.eq(Objects.nonNull(queryModelRequest.getIsEnabled()), ModelConfig::getIsEnabled, queryModelRequest.getIsEnabled()));
}
}

View File

@@ -0,0 +1,15 @@
package com.datamate.common.models.infrastructure.persistence.mapper;
import com.baomidou.mybatisplus.core.mapper.BaseMapper;
import com.datamate.common.models.domain.entity.ModelConfig;
import org.apache.ibatis.annotations.Mapper;
/**
* Mapper interface for model configurations
*
* @author dallas
* @since 2025-10-27
*/
@Mapper
public interface ModelConfigMapper extends BaseMapper<ModelConfig> {
}

View File

@@ -0,0 +1,90 @@
package com.datamate.common.models.interfaces.rest;
import com.datamate.common.interfaces.PagedResponse;
import com.datamate.common.models.application.ModelConfigApplicationService;
import com.datamate.common.models.domain.entity.ModelConfig;
import com.datamate.common.models.interfaces.rest.dto.CreateModelRequest;
import com.datamate.common.models.interfaces.rest.dto.QueryModelRequest;
import jakarta.validation.Valid;
import lombok.RequiredArgsConstructor;
import org.springframework.web.bind.annotation.*;
import java.util.List;
/**
* REST controller for model configurations
*
* @author dallas
* @since 2025-10-27
*/
@RestController
@RequestMapping("/api/models")
@RequiredArgsConstructor
public class ModelConfigController {
private final ModelConfigApplicationService modelConfigApplicationService;
/**
* Get the list of providers
*
* @return the provider list
*/
@GetMapping("/providers")
public List<ModelConfig> getProviders() {
return modelConfigApplicationService.getProviders();
}
/**
* Get the model list
*
* @return a page of model configurations
*/
@GetMapping("/list")
public PagedResponse<ModelConfig> getModels(@ModelAttribute QueryModelRequest queryModelRequest) {
return modelConfigApplicationService.getModels(queryModelRequest);
}
/**
* Get model details
*
* @param modelId the model ID
* @return the model details
*/
@GetMapping("/{modelId}")
public ModelConfig getModelDetail(@PathVariable String modelId) {
return modelConfigApplicationService.getModelDetail(modelId);
}
/**
* Create a model configuration
*
* @param createModelRequest the create request
* @return the created model configuration
*/
@PostMapping("/create")
public ModelConfig createModel(@RequestBody @Valid CreateModelRequest createModelRequest) {
return modelConfigApplicationService.createModel(createModelRequest);
}
/**
* Update a model configuration
*
* @param modelId the model ID
* @param updateModelRequest the update request
* @return the updated model configuration
*/
@PutMapping("/{modelId}")
public ModelConfig updateModel(@PathVariable String modelId, @RequestBody @Valid CreateModelRequest updateModelRequest) {
return modelConfigApplicationService.updateModel(modelId, updateModelRequest);
}
/**
* Delete a model configuration
*
* @param modelId the model ID
*/
@DeleteMapping("/{modelId}")
public void deleteModel(@PathVariable String modelId) {
modelConfigApplicationService.deleteModel(modelId);
}
}
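A hypothetical smoke test for the new endpoints, assuming the service listens locally on port 8080 and that the paging parameters follow the `page`/`size` convention used elsewhere in this commit (host, port, and values are placeholders):

```bash
# Create a chat model configuration, then page through the stored configurations.
curl -X POST http://localhost:8080/api/models/create \
  -H "Content-Type: application/json" \
  -d '{"modelName": "qwen2", "provider": "Ollama", "baseUrl": "http://localhost:11434", "type": "CHAT"}'

curl "http://localhost:8080/api/models/list?page=1&size=10&provider=Ollama"
```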

View File

@@ -0,0 +1,46 @@
package com.datamate.common.models.interfaces.rest.dto;
import com.datamate.common.models.domain.entity.ModelType;
import jakarta.validation.constraints.NotEmpty;
import jakarta.validation.constraints.NotNull;
import lombok.Getter;
import lombok.Setter;
/**
* Request body for creating a model configuration
*
* @author dallas
* @since 2025-10-27
*/
@Setter
@Getter
public class CreateModelRequest {
/**
* Model name (e.g. qwen2)
*/
@NotEmpty(message = "Model name must not be empty")
private String modelName;
/**
* Model provider (e.g. Ollama, OpenAI, DeepSeek)
*/
@NotEmpty(message = "Model provider must not be empty")
private String provider;
/**
* API base URL
*/
@NotEmpty(message = "API base URL must not be empty")
private String baseUrl;
/**
* API key (empty when no key is required)
*/
private String apiKey;
/**
* Model type (e.g. chat, embedding)
*/
@NotNull(message = "Model type must not be null")
private ModelType type;
/**
* Whether enabled: 1 - enabled, 0 - disabled
*/
private Boolean isEnabled;
}

View File

@@ -0,0 +1,27 @@
package com.datamate.common.models.interfaces.rest.dto;
import com.datamate.common.interfaces.PagingQuery;
import com.datamate.common.models.domain.entity.ModelType;
import lombok.Getter;
import lombok.Setter;
/**
* Model query request DTO
*
* @author dallas
* @since 2025-10-27
*/
@Getter
@Setter
public class QueryModelRequest extends PagingQuery {
/**
* Model provider (e.g. Ollama, OpenAI, DeepSeek)
*/
private String provider;
/**
* Model type (e.g. chat, embedding)
*/
private ModelType type;
private Boolean isEnabled;
}

View File

@@ -7,7 +7,7 @@
<parent>
<groupId>com.datamate</groupId>
<artifactId>data-mate-platform</artifactId>
<artifactId>datamate</artifactId>
<version>1.0.0-SNAPSHOT</version>
<relativePath>../../pom.xml</relativePath>
</parent>

View File

@@ -1,7 +1,7 @@
# Data engine platform - main application configuration
spring:
application:
name: data-mate-platform
name: datamate
# Temporarily exclude Spring Security auto-configuration (development only)
autoconfigure:

View File

@@ -1,7 +1,7 @@
# Data engine platform - main application configuration
spring:
application:
name: data-mate-platform
name: datamate
# Temporarily exclude Spring Security auto-configuration (development only)
autoconfigure:

View File

@@ -53,9 +53,6 @@ LS_TASK_PAGE_SIZE=1000
# =========================
# Data Management service configuration
# =========================
# DM service address
DM_SERVICE_BASE_URL=http://data-engine:8080
# DM storage folder prefix (usually matches the Label Studio local-files folder mapping)
DM_FILE_PATH_PREFIX=/

View File

@@ -0,0 +1,86 @@
# Label Studio Adapter (DataMate)
This is the Label Studio Adapter service for DataMate. It keeps DataMate projects in sync with Label Studio and exposes an external HTTP API (built on FastAPI).
## Overview
- Framework: FastAPI
- Async database/ORM: SQLAlchemy (async)
- Database migrations: Alembic
- Server: uvicorn
## Quick start (development)
1. Clone the repository and enter the project directory
2. Create and activate a virtual environment:
```bash
python -m venv .venv
source .venv/bin/activate
```
3. Install dependencies:
```bash
pip install -r requirements.txt
```
4. Prepare environment variables (example)
Create a `.env` file and set the required variables, for example:
- DATABASE_URL (or the specific variables defined by the project configuration)
- LABEL_STUDIO_BASE_URL
- LABEL_STUDIO_USER_TOKEN
(see `.env.example` for the full list)
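A minimal `.env` sketch for local development; the values are placeholders and the exact variable names should be checked against `.env.example`:
```bash
DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5432/ls_adapter
LABEL_STUDIO_BASE_URL=http://localhost:8080
LABEL_STUDIO_USER_TOKEN=<your-label-studio-api-token>
```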
5. Run database migrations (development):
```bash
alembic upgrade head
```
6. Start the development server (examples and common flags):
- Local development (default host/port, with auto-reload):
```bash
uvicorn app.main:app --reload
```
- Specify host and port and enable debug logging:
```bash
uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload --log-level debug
```
- Run multiple workers in production (without --reload):
```bash
uvicorn app.main:app --host 0.0.0.0 --port 8000 --workers 4 --log-level info --proxy-headers
```
- Start with environment variables (example):
```bash
HOST=0.0.0.0 PORT=8000 uvicorn app.main:app --reload
```
Notes:
- `--reload` is for development only; it watches for file changes and restarts the process. Do not use it in production.
- `--workers` adds concurrency but increases memory usage; in production it is usually combined with a process manager or container orchestration (Kubernetes).
- For a full production deployment, prefer an ASGI server setup (e.g. gunicorn with uvicorn workers, or uvicorn in a container behind a process supervisor).
API documentation:
- Swagger UI: http://127.0.0.1:8000/docs
- ReDoc: http://127.0.0.1:8000/redoc (recommended)
## Usage (brief)
- All API routes are registered under the `/api` prefix (see `app.include_router(api_router, prefix="/api")` in `app/main.py`).
- The root path `/` returns service info and documentation links; a quick smoke test is shown below.
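A quick way to verify the service is up, assuming the default development host and port used above:
```bash
# The root path returns service info; /docs serves the Swagger UI (HTML).
curl http://127.0.0.1:8000/
curl -I http://127.0.0.1:8000/docs
```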
For more details, see `doc/usage.md` (API usage) and `doc/development.md` (development notes).

View File

@@ -4,7 +4,7 @@ from typing import Optional
from app.db.database import get_db
from app.services.dataset_mapping_service import DatasetMappingService
from app.clients import get_clients
from app.infrastructure import DatamateClient, LabelStudioClient
from app.schemas.dataset_mapping import (
DatasetMappingCreateRequest,
DatasetMappingCreateResponse,
@@ -30,18 +30,19 @@ async def create_dataset_mapping(
Note: a single dataset can have multiple annotation projects
"""
try:
# Get the global client instances
dm_client_instance, ls_client_instance = get_clients()
dm_client = DatamateClient(db)
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
token=settings.label_studio_user_token)
service = DatasetMappingService(db)
logger.info(f"Create dataset mapping request: {request.source_dataset_id}")
logger.info(f"Create dataset mapping request: {request.dataset_id}")
# Fetch dataset info from the DM service
dataset_info = await dm_client_instance.get_dataset(request.source_dataset_id)
dataset_info = await dm_client.get_dataset(request.dataset_id)
if not dataset_info:
raise HTTPException(
status_code=404,
detail=f"Dataset not found in DM service: {request.source_dataset_id}"
detail=f"Dataset not found in DM service: {request.dataset_id}"
)
# Determine the data type (based on the dataset type)
@@ -55,11 +56,10 @@ async def create_dataset_mapping(
elif "text" in type_code:
data_type = "text"
# Generate the project name
project_name = f"{dataset_info.name}"
# Create the project in Label Studio
project_data = await ls_client_instance.create_project(
project_data = await ls_client.create_project(
title=project_name,
description=dataset_info.description or f"Imported from DM dataset {dataset_info.id}",
data_type=data_type
@@ -74,8 +74,8 @@ async def create_dataset_mapping(
project_id = project_data["id"]
# Configure local storage: dataset/<id>
local_storage_path = f"{settings.label_studio_local_storage_dataset_base_path}/{request.source_dataset_id}"
storage_result = await ls_client_instance.create_local_storage(
local_storage_path = f"{settings.label_studio_local_storage_dataset_base_path}/{request.dataset_id}"
storage_result = await ls_client.create_local_storage(
project_id=project_id,
path=local_storage_path,
title="Dataset_BLOB",
@@ -85,7 +85,7 @@ async def create_dataset_mapping(
# Configure local storage: upload
local_storage_path = f"{settings.label_studio_local_storage_upload_base_path}"
storage_result = await ls_client_instance.create_local_storage(
storage_result = await ls_client.create_local_storage(
project_id=project_id,
path=local_storage_path,
title="Upload_BLOB",
@@ -107,7 +107,7 @@ async def create_dataset_mapping(
)
logger.debug(
f"Dataset mapping created: {mapping.mapping_id} -> S {mapping.source_dataset_id} <> L {mapping.labelling_project_id}"
f"Dataset mapping created: {mapping.mapping_id} -> S {mapping.dataset_id} <> L {mapping.labelling_project_id}"
)
response_data = DatasetMappingCreateResponse(

View File

@@ -1,13 +1,15 @@
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi import Depends, HTTPException, Query
from sqlalchemy.ext.asyncio import AsyncSession
from typing import Optional
from app.db.database import get_db
from app.services.dataset_mapping_service import DatasetMappingService
from app.clients import get_clients
from app.infrastructure import DatamateClient, LabelStudioClient
from app.schemas.dataset_mapping import DeleteDatasetResponse
from app.schemas import StandardResponse
from app.core.logging import get_logger
from app.core.config import settings
from . import project_router
logger = get_logger(__name__)
@@ -37,39 +39,39 @@ async def delete_mapping(
status_code=400,
detail="Either 'm' (mapping UUID) or 'proj' (project ID) must be provided"
)
# Get the global client instances
dm_client_instance, ls_client_instance = get_clients()
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
token=settings.label_studio_user_token)
service = DatasetMappingService(db)
mapping = None
# Prefer lookup by mapping_id
if m:
logger.info(f"Deleting by mapping UUID: {m}")
logger.debug(f"Deleting by mapping UUID: {m}")
mapping = await service.get_mapping_by_uuid(m)
# Fall back to proj when m is not provided
elif proj:
logger.info(f"Deleting by project ID: {proj}")
logger.debug(f"Deleting by project ID: {proj}")
mapping = await service.get_mapping_by_labelling_project_id(proj)
else:
mapping = None
if not mapping:
raise HTTPException(
status_code=404,
detail=f"Mapping not found"
detail=f"Mapping either not found or not specified."
)
mapping_id = mapping.mapping_id
labelling_project_id = mapping.labelling_project_id
labelling_project_name = mapping.labelling_project_name
logger.info(f"Found mapping: {mapping_id}, Label Studio project ID: {labelling_project_id}")
logger.debug(f"Found mapping: {mapping_id}, Label Studio project ID: {labelling_project_id}")
# 1. Delete the Label Studio project
try:
delete_success = await ls_client_instance.delete_project(int(labelling_project_id))
delete_success = await ls_client.delete_project(int(labelling_project_id))
if delete_success:
logger.info(f"Successfully deleted Label Studio project: {labelling_project_id}")
logger.debug(f"Successfully deleted Label Studio project: {labelling_project_id}")
else:
logger.warning(f"Failed to delete Label Studio project or project not found: {labelling_project_id}")
except Exception as e:
@@ -84,19 +86,17 @@ async def delete_mapping(
status_code=500,
detail="Failed to delete mapping record"
)
logger.info(f"Successfully deleted mapping: {mapping_id}")
response_data = DeleteDatasetResponse(
mapping_id=mapping_id,
status="success",
message=f"Successfully deleted mapping and Label Studio project '{labelling_project_name}'"
)
logger.info(f"Successfully deleted mapping: {mapping_id}, Label Studio project: {labelling_project_id}")
return StandardResponse(
code=200,
message="success",
data=response_data
data=DeleteDatasetResponse(
mapping_id=mapping_id,
status="success",
message=f"Successfully deleted mapping and Label Studio project '{labelling_project_name}'"
)
)
except HTTPException:

View File

@@ -98,9 +98,9 @@ async def get_mapping(
raise HTTPException(status_code=500, detail="Internal server error")
@project_router.get("/mappings/by-source/{source_dataset_id}", response_model=StandardResponse[PaginatedData[DatasetMappingResponse]])
@project_router.get("/mappings/by-source/{dataset_id}", response_model=StandardResponse[PaginatedData[DatasetMappingResponse]])
async def get_mappings_by_source(
source_dataset_id: str,
dataset_id: str,
page: int = Query(1, ge=1, description="页码(从1开始)"),
page_size: int = Query(20, ge=1, le=100, description="每页记录数"),
db: AsyncSession = Depends(get_db)
@@ -116,11 +116,11 @@ async def get_mappings_by_source(
# Compute the pagination offset (skip)
skip = (page - 1) * page_size
logger.info(f"Get mappings by source dataset id: {source_dataset_id}, page={page}, page_size={page_size}")
logger.info(f"Get mappings by source dataset id: {dataset_id}, page={page}, page_size={page_size}")
# Fetch the rows and the total count
mappings, total = await service.get_mappings_by_source_with_count(
source_dataset_id=source_dataset_id,
dataset_id=dataset_id,
skip=skip,
limit=page_size
)

View File

@@ -5,7 +5,7 @@ from typing import List, Optional
from app.db.database import get_db
from app.services.dataset_mapping_service import DatasetMappingService
from app.services.sync_service import SyncService
from app.clients import get_clients
from app.infrastructure import DatamateClient, LabelStudioClient
from app.exceptions import NoDatasetInfoFoundError, DatasetMappingNotFoundError
from app.schemas.dataset_mapping import (
DatasetMappingResponse,
@@ -14,6 +14,7 @@ from app.schemas.dataset_mapping import (
)
from app.schemas import StandardResponse
from app.core.logging import get_logger
from app.core.config import settings
from . import project_router
logger = get_logger(__name__)
@@ -30,10 +31,12 @@ async def sync_dataset_content(
Record the update time in the database and return the sync status.
"""
try:
dm_client_instance, ls_client_instance = get_clients()
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
token=settings.label_studio_user_token)
dm_client = DatamateClient(db)
mapping_service = DatasetMappingService(db)
sync_service = SyncService(dm_client_instance, ls_client_instance, mapping_service)
sync_service = SyncService(dm_client, ls_client, mapping_service)
logger.info(f"Sync dataset content request: mapping_id={request.mapping_id}")
# Look up the mapping by mapping_id

View File

@@ -27,7 +27,6 @@ async def get_config():
data={
"app_name": settings.app_name,
"version": settings.app_version,
"dm_service_url": settings.dm_service_base_url,
"label_studio_url": settings.label_studio_base_url,
"debug": settings.debug
}

View File

@@ -73,7 +73,6 @@ class Settings(BaseSettings):
# =========================
# Data Management 服务配置
# =========================
dm_service_base_url: str = "http://data-engine"
dm_file_path_prefix: str = "/" # DM存储文件夹前缀

View File

@@ -0,0 +1,6 @@
# app/infrastructure/__init__.py
from .label_studio import Client as LabelStudioClient
from .datamate import Client as DatamateClient
__all__ = ["LabelStudioClient", "DatamateClient"]

View File

@@ -0,0 +1,159 @@
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy import func
from typing import Optional
from app.core.config import settings
from app.core.logging import get_logger
from app.schemas.dm_service import DatasetResponse, PagedDatasetFileResponse, DatasetFileResponse
from app.models.dm.dataset import Dataset
from app.models.dm.dataset_files import DatasetFiles
logger = get_logger(__name__)
class Client:
"""数据管理服务客户端 - 直接访问数据库"""
def __init__(self, db: AsyncSession):
"""
初始化 DM 客户端
Args:
db: 数据库会话
"""
self.db = db
logger.info("Initialize DM service client (Database mode)")
async def get_dataset(self, dataset_id: str) -> Optional[DatasetResponse]:
"""获取数据集详情"""
try:
logger.info(f"Getting dataset detail: {dataset_id} ...")
result = await self.db.execute(
select(Dataset).where(Dataset.id == dataset_id)
)
dataset = result.scalar_one_or_none()
if not dataset:
logger.error(f"Dataset not found: {dataset_id}")
return None
# Convert the database model into the response model
# type: ignore suppresses SQLAlchemy column typing complaints
return DatasetResponse(
id=dataset.id, # type: ignore
name=dataset.name, # type: ignore
description=dataset.description or "", # type: ignore
datasetType=dataset.dataset_type, # type: ignore
status=dataset.status, # type: ignore
fileCount=dataset.file_count or 0, # type: ignore
totalSize=dataset.size_bytes or 0, # type: ignore
createdAt=dataset.created_at, # type: ignore
updatedAt=dataset.updated_at, # type: ignore
createdBy=dataset.created_by # type: ignore
)
except Exception as e:
logger.error(f"Failed to get dataset {dataset_id}: {e}")
return None
async def get_dataset_files(
self,
dataset_id: str,
page: int = 0,
size: int = 100,
file_type: Optional[str] = None,
status: Optional[str] = None
) -> Optional[PagedDatasetFileResponse]:
"""获取数据集文件列表"""
try:
logger.info(f"Get dataset files: dataset={dataset_id}, page={page}, size={size}")
# Build the query
query = select(DatasetFiles).where(DatasetFiles.dataset_id == dataset_id)
# Apply optional filters
if file_type:
query = query.where(DatasetFiles.file_type == file_type)
if status:
query = query.where(DatasetFiles.status == status)
# Get the total count
count_query = select(func.count()).select_from(DatasetFiles).where(
DatasetFiles.dataset_id == dataset_id
)
if file_type:
count_query = count_query.where(DatasetFiles.file_type == file_type)
if status:
count_query = count_query.where(DatasetFiles.status == status)
count_result = await self.db.execute(count_query)
total = count_result.scalar_one()
# Paginated query
query = query.offset(page * size).limit(size).order_by(DatasetFiles.created_at.desc())
result = await self.db.execute(query)
files = result.scalars().all()
# Convert to response models
# type: ignore suppresses SQLAlchemy column typing complaints
content = [
DatasetFileResponse(
id=f.id, # type: ignore
fileName=f.file_name, # type: ignore
fileType=f.file_type or "", # type: ignore
filePath=f.file_path, # type: ignore
originalName=f.file_name, # type: ignore
size=f.file_size, # type: ignore
status=f.status, # type: ignore
uploadedAt=f.upload_time, # type: ignore
description=None,
uploadedBy=None,
lastAccessTime=f.last_access_time # type: ignore
)
for f in files
]
total_pages = (total + size - 1) // size if size > 0 else 0
return PagedDatasetFileResponse(
content=content,
totalElements=total,
totalPages=total_pages,
page=page,
size=size
)
except Exception as e:
logger.error(f"Failed to get dataset files for {dataset_id}: {e}")
return None
async def download_file(self, dataset_id: str, file_id: str) -> Optional[bytes]:
"""
下载文件内容
注意:此方法保留接口兼容性,但实际文件下载可能需要通过文件系统或对象存储
"""
logger.warning(f"download_file is deprecated when using database mode. Use get_file_download_url instead.")
return None
async def get_file_download_url(self, dataset_id: str, file_id: str) -> Optional[str]:
"""获取文件下载URL(或文件路径)"""
try:
result = await self.db.execute(
select(DatasetFiles).where(
DatasetFiles.id == file_id,
DatasetFiles.dataset_id == dataset_id
)
)
file = result.scalar_one_or_none()
if not file:
logger.error(f"File not found: {file_id} in dataset {dataset_id}")
return None
# Return the file path (a local path or an object-storage URL)
return file.file_path # type: ignore
except Exception as e:
logger.error(f"Failed to get file path for {file_id}: {e}")
return None
async def close(self):
"""关闭客户端连接(数据库模式下无需操作)"""
logger.info("DM service client closed (Database mode)")

View File

@@ -12,7 +12,7 @@ from app.schemas.label_studio import (
logger = get_logger(__name__)
class LabelStudioClient:
class Client:
"""Label Studio服务客户端
使用 HTTP REST API 直接与 Label Studio 交互

View File

@@ -8,7 +8,7 @@ from typing import Dict, Any
from .core.config import settings
from .core.logging import setup_logging, get_logger
from .clients import DMServiceClient, LabelStudioClient, set_clients
from .infrastructure import LabelStudioClient
from .api import api_router
from .schemas import StandardResponse
@@ -23,23 +23,12 @@ async def lifespan(app: FastAPI):
# Initialization on startup
logger.info("Starting Label Studio Adapter...")
# Initialize clients
dm_client = DMServiceClient()
# Initialize the Label Studio client using the HTTP REST API with token auth
ls_client = LabelStudioClient(
base_url=settings.label_studio_base_url,
token=settings.label_studio_user_token
)
# Register the global clients
set_clients(dm_client, ls_client)
# Database initialization is managed by Alembic
# In Docker, entrypoint.sh runs "alembic upgrade head" before startup
# In development, run it manually: alembic upgrade head
logger.info("Database schema managed by Alembic")
logger.info("Label Studio Adapter started")
yield
@@ -155,7 +144,6 @@ async def root():
"message": f"{settings.app_name} is running",
"version": settings.app_version,
"docs_url": "/docs",
"dm_service_url": settings.dm_service_base_url,
"label_studio_url": settings.label_studio_base_url
}
)

View File

@@ -0,0 +1,138 @@
# DataMate Data Model Structure
This document lists all the Python data models created from the SQL files in `scripts/db`.
## Model Organization
```
app/models/
├── __init__.py                        # Main module exports
├── dm/                                # Data Management module
│   ├── __init__.py
│   ├── annotation_template.py         # Annotation templates
│   ├── labeling_project.py            # Labeling projects
│   ├── dataset.py                     # Datasets
│   ├── dataset_files.py               # Dataset files
│   ├── dataset_statistics.py          # Dataset statistics
│   ├── dataset_tag.py                 # Dataset-tag associations
│   ├── tag.py                         # Tags
│   └── user.py                        # Users
├── cleaning/                          # Data Cleaning module
│   ├── __init__.py
│   ├── clean_template.py              # Cleaning templates
│   ├── clean_task.py                  # Cleaning tasks
│   ├── operator_instance.py           # Operator instances
│   └── clean_result.py                # Cleaning results
├── collection/                        # Data Collection module
│   ├── __init__.py
│   ├── task_execution.py              # Task execution details
│   ├── collection_task.py             # Data collection tasks
│   ├── task_log.py                    # Task execution logs
│   └── datax_template.py              # DataX template configs
├── common/                            # Common module
│   ├── __init__.py
│   └── chunk_upload_request.py        # Chunked file upload requests
└── operator/                          # Operator module
    ├── __init__.py
    ├── operator.py                    # Operators
    ├── operator_category.py           # Operator categories
    └── operator_category_relation.py  # Operator-category associations
```
## Module Details
### 1. Data Management (DM) Module
Corresponding SQL: `data-management-init.sql` and `data-annotation-init.sql`
#### Models:
- **AnnotationTemplate** (`t_dm_annotation_templates`) - annotation templates
- **LabelingProject** (`t_dm_labeling_projects`) - labeling projects
- **Dataset** (`t_dm_datasets`) - datasets (supports medical imaging, text, QA, and other types)
- **DatasetFiles** (`t_dm_dataset_files`) - dataset files
- **DatasetStatistics** (`t_dm_dataset_statistics`) - dataset statistics
- **Tag** (`t_dm_tags`) - tags
- **DatasetTag** (`t_dm_dataset_tags`) - dataset-tag associations
- **User** (`users`) - users
### 2. Data Cleaning Module
Corresponding SQL: `data-cleaning-init.sql`
#### Models:
- **CleanTemplate** (`t_clean_template`) - cleaning templates
- **CleanTask** (`t_clean_task`) - cleaning tasks
- **OperatorInstance** (`t_operator_instance`) - operator instances
- **CleanResult** (`t_clean_result`) - cleaning results
### 3. Data Collection (DC) Module
Corresponding SQL: `data-collection-init.sql`
#### Models:
- **TaskExecution** (`t_dc_task_executions`) - task execution details
- **CollectionTask** (`t_dc_collection_tasks`) - data collection tasks
- **TaskLog** (`t_dc_task_log`) - task execution logs
- **DataxTemplate** (`t_dc_datax_templates`) - DataX template configs
### 4. Common Module
Corresponding SQL: `data-common-init.sql`
#### Models:
- **ChunkUploadRequest** (`t_chunk_upload_request`) - chunked file upload requests
### 5. Operator Module
Corresponding SQL: `data-operator-init.sql`
#### Models:
- **Operator** (`t_operator`) - operators
- **OperatorCategory** (`t_operator_category`) - operator categories
- **OperatorCategoryRelation** (`t_operator_category_relation`) - operator-category associations
## Usage
```python
# Import all models
from app.models import (
# DM module
AnnotationTemplate,
LabelingProject,
Dataset,
DatasetFiles,
DatasetStatistics,
DatasetTag,
Tag,
User,
# Cleaning module
CleanTemplate,
CleanTask,
OperatorInstance,
CleanResult,
# Collection module
TaskExecution,
CollectionTask,
TaskLog,
DataxTemplate,
# Common module
ChunkUploadRequest,
# Operator module
Operator,
OperatorCategory,
OperatorCategoryRelation
)
# Or import per module
from app.models.dm import Dataset, DatasetFiles
from app.models.collection import CollectionTask
from app.models.operator import Operator
```
## Notes
1. **UUID primary keys**: most tables use UUID (String(36)) primary keys
2. **Timestamps**: columns use the `TIMESTAMP` type and are configured to update automatically
3. **Soft delete**: some models (e.g. AnnotationTemplate, LabelingProject) support soft delete via a `deleted_at` column and an `is_deleted` property; see the sketch after this list
4. **JSON columns**: configuration, metadata, and similar payloads are stored as JSON
5. **Field consistency**: every model field follows the SQL definitions exactly, keeping the models in sync with the database schema
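As referenced in note 3, a minimal soft-delete-aware query sketch against the models above (the helper itself is illustrative, not an existing function in this repo):

```python
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select

from app.models.dm import LabelingProject


async def list_active_projects(db: AsyncSession) -> list[LabelingProject]:
    """Return labeling projects that have not been soft-deleted."""
    result = await db.execute(
        select(LabelingProject).where(LabelingProject.deleted_at.is_(None))
    )
    return list(result.scalars().all())
```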
## Changelog
- 2025-10-25: created all data models from the SQL files in `scripts/db`
- Updated the existing `annotation_template.py`, `labeling_project.py`, and `dataset_files.py` to match the SQL definitions

View File

@@ -0,0 +1,69 @@
# app/models/__init__.py
# Data Management (DM) module
from .dm import (
AnnotationTemplate,
LabelingProject,
Dataset,
DatasetFiles,
DatasetStatistics,
DatasetTag,
Tag,
User
)
# Data Cleaning module
from .cleaning import (
CleanTemplate,
CleanTask,
OperatorInstance,
CleanResult
)
# Data Collection (DC) module
from .collection import (
TaskExecution,
CollectionTask,
TaskLog,
DataxTemplate
)
# Common module
from .common import (
ChunkUploadRequest
)
# Operator module
from .operator import (
Operator,
OperatorCategory,
OperatorCategoryRelation
)
__all__ = [
# DM module
"AnnotationTemplate",
"LabelingProject",
"Dataset",
"DatasetFiles",
"DatasetStatistics",
"DatasetTag",
"Tag",
"User",
# Cleaning module
"CleanTemplate",
"CleanTask",
"OperatorInstance",
"CleanResult",
# Collection module
"TaskExecution",
"CollectionTask",
"TaskLog",
"DataxTemplate",
# Common module
"ChunkUploadRequest",
# Operator module
"Operator",
"OperatorCategory",
"OperatorCategoryRelation"
]

View File

@@ -0,0 +1,13 @@
# app/models/cleaning/__init__.py
from .clean_template import CleanTemplate
from .clean_task import CleanTask
from .operator_instance import OperatorInstance
from .clean_result import CleanResult
__all__ = [
"CleanTemplate",
"CleanTask",
"OperatorInstance",
"CleanResult"
]

View File

@@ -0,0 +1,22 @@
from sqlalchemy import Column, String, BigInteger, Text
from app.db.database import Base
class CleanResult(Base):
"""清洗结果模型"""
__tablename__ = "t_clean_result"
instance_id = Column(String(64), primary_key=True, comment="实例ID")
src_file_id = Column(String(64), nullable=True, comment="源文件ID")
dest_file_id = Column(String(64), primary_key=True, comment="目标文件ID")
src_name = Column(String(256), nullable=True, comment="源文件名")
dest_name = Column(String(256), nullable=True, comment="目标文件名")
src_type = Column(String(256), nullable=True, comment="源文件类型")
dest_type = Column(String(256), nullable=True, comment="目标文件类型")
src_size = Column(BigInteger, nullable=True, comment="源文件大小")
dest_size = Column(BigInteger, nullable=True, comment="目标文件大小")
status = Column(String(256), nullable=True, comment="处理状态")
result = Column(Text, nullable=True, comment="处理结果")
def __repr__(self):
return f"<CleanResult(instance_id={self.instance_id}, dest_file_id={self.dest_file_id}, status={self.status})>"

View File

@@ -0,0 +1,27 @@
from sqlalchemy import Column, String, BigInteger, Integer, TIMESTAMP
from sqlalchemy.sql import func
from app.db.database import Base
class CleanTask(Base):
"""清洗任务模型"""
__tablename__ = "t_clean_task"
id = Column(String(64), primary_key=True, comment="任务ID")
name = Column(String(64), nullable=True, comment="任务名称")
description = Column(String(256), nullable=True, comment="任务描述")
status = Column(String(256), nullable=True, comment="任务状态")
src_dataset_id = Column(String(64), nullable=True, comment="源数据集ID")
src_dataset_name = Column(String(64), nullable=True, comment="源数据集名称")
dest_dataset_id = Column(String(64), nullable=True, comment="目标数据集ID")
dest_dataset_name = Column(String(64), nullable=True, comment="目标数据集名称")
before_size = Column(BigInteger, nullable=True, comment="清洗前大小")
after_size = Column(BigInteger, nullable=True, comment="清洗后大小")
file_count = Column(Integer, nullable=True, comment="文件数量")
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
started_at = Column(TIMESTAMP, nullable=True, comment="开始时间")
finished_at = Column(TIMESTAMP, nullable=True, comment="完成时间")
created_by = Column(String(256), nullable=True, comment="创建者")
def __repr__(self):
return f"<CleanTask(id={self.id}, name={self.name}, status={self.status})>"

View File

@@ -0,0 +1,18 @@
from sqlalchemy import Column, String, Text, TIMESTAMP
from sqlalchemy.sql import func
from app.db.database import Base
class CleanTemplate(Base):
"""清洗模板模型"""
__tablename__ = "t_clean_template"
id = Column(String(64), primary_key=True, unique=True, comment="模板ID")
name = Column(String(64), nullable=True, comment="模板名称")
description = Column(String(256), nullable=True, comment="模板描述")
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间")
created_by = Column(String(256), nullable=True, comment="创建者")
def __repr__(self):
return f"<CleanTemplate(id={self.id}, name={self.name})>"

View File

@@ -0,0 +1,15 @@
from sqlalchemy import Column, String, Integer, Text
from app.db.database import Base
class OperatorInstance(Base):
"""算子实例模型"""
__tablename__ = "t_operator_instance"
instance_id = Column(String(256), primary_key=True, comment="实例ID")
operator_id = Column(String(256), primary_key=True, comment="算子ID")
op_index = Column(Integer, primary_key=True, comment="算子索引")
settings_override = Column(Text, nullable=True, comment="配置覆盖")
def __repr__(self):
return f"<OperatorInstance(instance_id={self.instance_id}, operator_id={self.operator_id}, index={self.op_index})>"

View File

@@ -0,0 +1,13 @@
# app/models/collection/__init__.py
from .task_execution import TaskExecution
from .collection_task import CollectionTask
from .task_log import TaskLog
from .datax_template import DataxTemplate
__all__ = [
"TaskExecution",
"CollectionTask",
"TaskLog",
"DataxTemplate"
]

View File

@@ -0,0 +1,28 @@
from sqlalchemy import Column, String, Text, Integer, BigInteger, TIMESTAMP
from sqlalchemy.sql import func
from app.db.database import Base
class CollectionTask(Base):
"""数据归集任务模型"""
__tablename__ = "t_dc_collection_tasks"
id = Column(String(36), primary_key=True, comment="任务ID(UUID)")
name = Column(String(255), nullable=False, comment="任务名称")
description = Column(Text, nullable=True, comment="任务描述")
sync_mode = Column(String(20), default='ONCE', comment="同步模式:ONCE/SCHEDULED")
config = Column(Text, nullable=False, comment="归集配置(DataX配置),包含源端和目标端配置信息")
schedule_expression = Column(String(255), nullable=True, comment="Cron调度表达式")
status = Column(String(20), default='DRAFT', comment="任务状态:DRAFT/READY/RUNNING/SUCCESS/FAILED/STOPPED")
retry_count = Column(Integer, default=3, comment="重试次数")
timeout_seconds = Column(Integer, default=3600, comment="超时时间(秒)")
max_records = Column(BigInteger, nullable=True, comment="最大处理记录数")
sort_field = Column(String(100), nullable=True, comment="增量字段")
last_execution_id = Column(String(36), nullable=True, comment="最后执行ID(UUID)")
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间")
created_by = Column(String(255), nullable=True, comment="创建者")
updated_by = Column(String(255), nullable=True, comment="更新者")
def __repr__(self):
return f"<CollectionTask(id={self.id}, name={self.name}, status={self.status})>"

View File

@@ -0,0 +1,23 @@
from sqlalchemy import Column, String, Text, Boolean, TIMESTAMP
from sqlalchemy.sql import func
from app.db.database import Base
class DataxTemplate(Base):
"""DataX模板配置模型"""
__tablename__ = "t_dc_datax_templates"
id = Column(String(36), primary_key=True, comment="模板ID(UUID)")
name = Column(String(255), nullable=False, unique=True, comment="模板名称")
source_type = Column(String(50), nullable=False, comment="源数据源类型")
target_type = Column(String(50), nullable=False, comment="目标数据源类型")
template_content = Column(Text, nullable=False, comment="模板内容")
description = Column(Text, nullable=True, comment="模板描述")
version = Column(String(20), default='1.0.0', comment="版本号")
is_system = Column(Boolean, default=False, comment="是否系统模板")
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间")
created_by = Column(String(255), nullable=True, comment="创建者")
def __repr__(self):
return f"<DataxTemplate(id={self.id}, name={self.name}, source={self.source_type}, target={self.target_type})>"

View File

@@ -0,0 +1,34 @@
from sqlalchemy import Column, String, Text, Integer, BigInteger, DECIMAL, JSON, TIMESTAMP
from sqlalchemy.sql import func
from app.db.database import Base
class TaskExecution(Base):
"""任务执行明细模型"""
__tablename__ = "t_dc_task_executions"
id = Column(String(36), primary_key=True, comment="执行记录ID(UUID)")
task_id = Column(String(36), nullable=False, comment="任务ID")
task_name = Column(String(255), nullable=False, comment="任务名称")
status = Column(String(20), default='RUNNING', comment="执行状态:RUNNING/SUCCESS/FAILED/STOPPED")
progress = Column(DECIMAL(5, 2), default=0.00, comment="进度百分比")
records_total = Column(BigInteger, default=0, comment="总记录数")
records_processed = Column(BigInteger, default=0, comment="已处理记录数")
records_success = Column(BigInteger, default=0, comment="成功记录数")
records_failed = Column(BigInteger, default=0, comment="失败记录数")
throughput = Column(DECIMAL(10, 2), default=0.00, comment="吞吐量(条/秒)")
data_size_bytes = Column(BigInteger, default=0, comment="数据量(字节)")
started_at = Column(TIMESTAMP, nullable=True, comment="开始时间")
completed_at = Column(TIMESTAMP, nullable=True, comment="完成时间")
duration_seconds = Column(Integer, default=0, comment="执行时长(秒)")
config = Column(JSON, nullable=True, comment="执行配置")
error_message = Column(Text, nullable=True, comment="错误信息")
datax_job_id = Column(Text, nullable=True, comment="datax任务ID")
result = Column(Text, nullable=True, comment="执行结果")
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间")
created_by = Column(String(255), nullable=True, comment="创建者")
updated_by = Column(String(255), nullable=True, comment="更新者")
def __repr__(self):
return f"<TaskExecution(id={self.id}, task_id={self.task_id}, status={self.status})>"

View File

@@ -0,0 +1,26 @@
from sqlalchemy import Column, String, Text, Integer, BigInteger, TIMESTAMP
from sqlalchemy.sql import func
from app.db.database import Base
class TaskLog(Base):
"""任务执行记录模型"""
__tablename__ = "t_dc_task_log"
id = Column(String(36), primary_key=True, comment="执行记录ID(UUID)")
task_id = Column(String(36), nullable=False, comment="任务ID")
task_name = Column(String(255), nullable=False, comment="任务名称")
sync_mode = Column(String(20), default='FULL', comment="同步模式:FULL/INCREMENTAL")
status = Column(String(20), default='RUNNING', comment="执行状态:RUNNING/SUCCESS/FAILED/STOPPED")
start_time = Column(TIMESTAMP, nullable=True, comment="开始时间")
end_time = Column(TIMESTAMP, nullable=True, comment="结束时间")
duration = Column(BigInteger, nullable=True, comment="执行时长(毫秒)")
process_id = Column(String(50), nullable=True, comment="进程ID")
log_path = Column(String(500), nullable=True, comment="日志文件路径")
error_msg = Column(Text, nullable=True, comment="错误信息")
result = Column(Text, nullable=True, comment="执行结果")
retry_times = Column(Integer, default=0, comment="重试次数")
create_time = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
def __repr__(self):
return f"<TaskLog(id={self.id}, task_id={self.task_id}, status={self.status})>"

View File

@@ -0,0 +1,7 @@
# app/models/common/__init__.py
from .chunk_upload_request import ChunkUploadRequest
__all__ = [
"ChunkUploadRequest"
]

View File

@@ -0,0 +1,19 @@
from sqlalchemy import Column, String, Integer, Text, TIMESTAMP
from sqlalchemy.sql import func
from app.db.database import Base
class ChunkUploadRequest(Base):
"""文件切片上传请求模型"""
__tablename__ = "t_chunk_upload_request"
id = Column(String(36), primary_key=True, comment="UUID")
total_file_num = Column(Integer, nullable=True, comment="总文件数")
uploaded_file_num = Column(Integer, nullable=True, comment="已上传文件数")
upload_path = Column(String(256), nullable=True, comment="文件路径")
timeout = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="上传请求超时时间")
service_id = Column(String(64), nullable=True, comment="上传请求所属服务:DATA-MANAGEMENT(数据管理)")
check_info = Column(Text, nullable=True, comment="业务信息")
def __repr__(self):
return f"<ChunkUploadRequest(id={self.id}, service_id={self.service_id}, progress={self.uploaded_file_num}/{self.total_file_num})>"

View File

@@ -0,0 +1,21 @@
# app/models/dm/__init__.py
from .annotation_template import AnnotationTemplate
from .labeling_project import LabelingProject
from .dataset import Dataset
from .dataset_files import DatasetFiles
from .dataset_statistics import DatasetStatistics
from .dataset_tag import DatasetTag
from .tag import Tag
from .user import User
__all__ = [
"AnnotationTemplate",
"LabelingProject",
"Dataset",
"DatasetFiles",
"DatasetStatistics",
"DatasetTag",
"Tag",
"User"
]

View File

@@ -0,0 +1,24 @@
from sqlalchemy import Column, String, JSON, TIMESTAMP
from sqlalchemy.sql import func
from app.db.database import Base
import uuid
class AnnotationTemplate(Base):
"""标注模板模型"""
__tablename__ = "t_dm_annotation_templates"
id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID主键ID")
name = Column(String(32), nullable=False, comment="模板名称")
description = Column(String(255), nullable=True, comment="模板描述")
configuration = Column(JSON, nullable=True, comment="配置信息(JSON格式)")
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
deleted_at = Column(TIMESTAMP, nullable=True, comment="删除时间(软删除)")
def __repr__(self):
return f"<AnnotationTemplate(id={self.id}, name={self.name})>"
@property
def is_deleted(self) -> bool:
"""检查是否已被软删除"""
return self.deleted_at is not None

View File

@@ -0,0 +1,35 @@
from sqlalchemy import Column, String, Text, BigInteger, Integer, Boolean, JSON, TIMESTAMP
from sqlalchemy.sql import func
from app.db.database import Base
import uuid
class Dataset(Base):
"""数据集模型(支持医学影像、文本、问答等多种类型)"""
__tablename__ = "t_dm_datasets"
id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID")
name = Column(String(255), nullable=False, comment="数据集名称")
description = Column(Text, nullable=True, comment="数据集描述")
dataset_type = Column(String(50), nullable=False, comment="数据集类型:IMAGE/TEXT/QA/MULTIMODAL/OTHER")
category = Column(String(100), nullable=True, comment="数据集分类:医学影像/问答/文献等")
path = Column(String(500), nullable=True, comment="数据存储路径")
format = Column(String(50), nullable=True, comment="数据格式:DCM/JPG/JSON/CSV等")
schema_info = Column(JSON, nullable=True, comment="数据结构信息")
size_bytes = Column(BigInteger, default=0, comment="数据大小(字节)")
file_count = Column(BigInteger, default=0, comment="文件数量")
record_count = Column(BigInteger, default=0, comment="记录数量")
retention_days = Column(Integer, default=0, comment="数据保留天数(0表示长期保留)")
tags = Column(JSON, nullable=True, comment="标签列表")
metadata = Column(JSON, nullable=True, comment="元数据信息")
status = Column(String(50), default='DRAFT', comment="状态:DRAFT/ACTIVE/ARCHIVED")
is_public = Column(Boolean, default=False, comment="是否公开")
is_featured = Column(Boolean, default=False, comment="是否推荐")
version = Column(BigInteger, nullable=False, default=0, comment="版本号")
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间")
created_by = Column(String(255), nullable=True, comment="创建者")
updated_by = Column(String(255), nullable=True, comment="更新者")
def __repr__(self):
return f"<Dataset(id={self.id}, name={self.name}, type={self.dataset_type})>"

View File

@@ -0,0 +1,27 @@
from sqlalchemy import Column, String, JSON, BigInteger, TIMESTAMP
from sqlalchemy.sql import func
from app.db.database import Base
import uuid
class DatasetFiles(Base):
"""DM数据集文件模型"""
__tablename__ = "t_dm_dataset_files"
id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID")
dataset_id = Column(String(36), nullable=False, comment="所属数据集ID(UUID)")
file_name = Column(String(255), nullable=False, comment="文件名")
file_path = Column(String(1000), nullable=False, comment="文件路径")
file_type = Column(String(50), nullable=True, comment="文件格式:JPG/PNG/DCM/TXT等")
file_size = Column(BigInteger, default=0, comment="文件大小(字节)")
check_sum = Column(String(64), nullable=True, comment="文件校验和")
tags = Column(JSON, nullable=True, comment="文件标签信息")
metadata = Column(JSON, nullable=True, comment="文件元数据")
status = Column(String(50), default='ACTIVE', comment="文件状态:ACTIVE/DELETED/PROCESSING")
upload_time = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="上传时间")
last_access_time = Column(TIMESTAMP, nullable=True, comment="最后访问时间")
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间")
def __repr__(self):
return f"<DatasetFiles(id={self.id}, dataset_id={self.dataset_id}, file_name={self.file_name})>"

View File

@@ -0,0 +1,25 @@
from sqlalchemy import Column, String, Date, BigInteger, JSON, TIMESTAMP
from sqlalchemy.sql import func
from app.db.database import Base
import uuid
class DatasetStatistics(Base):
"""数据集统计信息模型"""
__tablename__ = "t_dm_dataset_statistics"
id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID")
dataset_id = Column(String(36), nullable=False, comment="数据集ID(UUID)")
stat_date = Column(Date, nullable=False, comment="统计日期")
total_files = Column(BigInteger, default=0, comment="总文件数")
total_size = Column(BigInteger, default=0, comment="总大小(字节)")
processed_files = Column(BigInteger, default=0, comment="已处理文件数")
error_files = Column(BigInteger, default=0, comment="错误文件数")
download_count = Column(BigInteger, default=0, comment="下载次数")
view_count = Column(BigInteger, default=0, comment="查看次数")
quality_metrics = Column(JSON, nullable=True, comment="质量指标")
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间")
def __repr__(self):
return f"<DatasetStatistics(id={self.id}, dataset_id={self.dataset_id}, date={self.stat_date})>"

View File

@@ -0,0 +1,15 @@
from sqlalchemy import Column, String, TIMESTAMP
from sqlalchemy.sql import func
from app.db.database import Base
class DatasetTag(Base):
"""数据集标签关联模型"""
__tablename__ = "t_dm_dataset_tags"
dataset_id = Column(String(36), primary_key=True, comment="数据集ID(UUID)")
tag_id = Column(String(36), primary_key=True, comment="标签ID(UUID)")
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
def __repr__(self):
return f"<DatasetTag(dataset_id={self.dataset_id}, tag_id={self.tag_id})>"

View File

@@ -0,0 +1,26 @@
from sqlalchemy import Column, String, Integer, JSON, TIMESTAMP
from sqlalchemy.sql import func
from app.db.database import Base
import uuid
class LabelingProject(Base):
"""DM标注项目模型(原 DatasetMapping)"""
__tablename__ = "t_dm_labeling_projects"
id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID主键ID")
dataset_id = Column(String(36), nullable=False, comment="数据集ID")
name = Column(String(32), nullable=False, comment="项目名称")
labeling_project_id = Column(Integer, nullable=False, comment="Label Studio项目ID")
configuration = Column(JSON, nullable=True, comment="标签配置")
progress = Column(JSON, nullable=True, comment="标注进度统计")
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
deleted_at = Column(TIMESTAMP, nullable=True, comment="删除时间(软删除)")
def __repr__(self):
return f"<LabelingProject(id={self.id}, dataset_id={self.dataset_id}, name={self.name})>"
@property
def is_deleted(self) -> bool:
"""检查是否已被软删除"""
return self.deleted_at is not None

View File

@@ -0,0 +1,21 @@
from sqlalchemy import Column, String, Text, BigInteger, TIMESTAMP
from sqlalchemy.sql import func
from app.db.database import Base
import uuid
class Tag(Base):
"""标签模型"""
__tablename__ = "t_dm_tags"
id = Column(String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID")
name = Column(String(100), nullable=False, unique=True, comment="标签名称")
description = Column(Text, nullable=True, comment="标签描述")
category = Column(String(50), nullable=True, comment="标签分类")
color = Column(String(7), nullable=True, comment="标签颜色(十六进制)")
usage_count = Column(BigInteger, default=0, comment="使用次数")
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间")
def __repr__(self):
return f"<Tag(id={self.id}, name={self.name}, category={self.category})>"

View File

@@ -0,0 +1,24 @@
from sqlalchemy import Column, String, BigInteger, Boolean, TIMESTAMP
from sqlalchemy.sql import func
from app.db.database import Base
class User(Base):
"""用户模型"""
__tablename__ = "users"
id = Column(BigInteger, primary_key=True, autoincrement=True, comment="用户ID")
username = Column(String(255), nullable=False, unique=True, comment="用户名")
email = Column(String(255), nullable=False, unique=True, comment="邮箱")
password_hash = Column(String(255), nullable=False, comment="密码哈希")
full_name = Column(String(255), nullable=True, comment="真实姓名")
avatar_url = Column(String(500), nullable=True, comment="头像URL")
role = Column(String(50), nullable=False, default='USER', comment="角色:ADMIN/USER")
organization = Column(String(255), nullable=True, comment="所属机构")
enabled = Column(Boolean, nullable=False, default=True, comment="是否启用")
last_login_at = Column(TIMESTAMP, nullable=True, comment="最后登录时间")
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间")
def __repr__(self):
return f"<User(id={self.id}, username={self.username}, role={self.role})>"

View File

@@ -0,0 +1,11 @@
# app/models/operator/__init__.py
from .operator import Operator
from .operator_category import OperatorCategory
from .operator_category_relation import OperatorCategoryRelation
__all__ = [
"Operator",
"OperatorCategory",
"OperatorCategoryRelation"
]

View File

@@ -0,0 +1,24 @@
from sqlalchemy import Column, String, Text, Boolean, TIMESTAMP
from sqlalchemy.sql import func
from app.db.database import Base
class Operator(Base):
"""算子模型"""
__tablename__ = "t_operator"
id = Column(String(64), primary_key=True, comment="算子ID")
name = Column(String(64), nullable=True, comment="算子名称")
description = Column(String(256), nullable=True, comment="算子描述")
version = Column(String(256), nullable=True, comment="版本")
inputs = Column(String(256), nullable=True, comment="输入类型")
outputs = Column(String(256), nullable=True, comment="输出类型")
runtime = Column(Text, nullable=True, comment="运行时信息")
settings = Column(Text, nullable=True, comment="配置信息")
file_name = Column(Text, nullable=True, comment="文件名")
is_star = Column(Boolean, nullable=True, comment="是否收藏")
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间")
def __repr__(self):
return f"<Operator(id={self.id}, name={self.name}, version={self.version})>"

View File

@@ -0,0 +1,15 @@
from sqlalchemy import Column, String, Integer
from app.db.database import Base
class OperatorCategory(Base):
"""算子分类模型"""
__tablename__ = "t_operator_category"
id = Column(Integer, primary_key=True, autoincrement=True, comment="分类ID")
name = Column(String(64), nullable=True, comment="分类名称")
type = Column(String(64), nullable=True, comment="分类类型")
parent_id = Column(Integer, nullable=True, comment="父分类ID")
def __repr__(self):
return f"<OperatorCategory(id={self.id}, name={self.name}, type={self.type})>"

View File

@@ -0,0 +1,13 @@
from sqlalchemy import Column, String, Integer
from app.db.database import Base
class OperatorCategoryRelation(Base):
"""算子分类关联模型"""
__tablename__ = "t_operator_category_relation"
category_id = Column(Integer, primary_key=True, comment="分类ID")
operator_id = Column(String(64), primary_key=True, comment="算子ID")
def __repr__(self):
return f"<OperatorCategoryRelation(category_id={self.category_id}, operator_id={self.operator_id})>"

Some files were not shown because too many files have changed in this diff.