feat: add file copying functionality to dataset directory and update base path configuration #80

This commit is contained in:
Dallas98
2025-11-13 16:52:14 +08:00
committed by GitHub
parent 960323f41c
commit 15498f27cf
8 changed files with 122 additions and 45 deletions

View File

@@ -7,7 +7,7 @@
![GitHub Stars](https://img.shields.io/github/stars/ModelEngine-Group/DataMate)
![GitHub Forks](https://img.shields.io/github/forks/ModelEngine-Group/DataMate)
![GitHub Issues](https://img.shields.io/github/issues/ModelEngine-Group/DataMate)
![GitHub License](https://img.shields.io/github/license/ModelEngine-Group/DataMate)
![GitHub License](https://img.shields.io/github/license/ModelEngine-Group/datamate-docs)
**DataMate是面向模型微调与RAG检索的企业级数据处理平台,支持数据归集、数据管理、算子市场、数据清洗、数据合成、数据标注、数据评估、知识生成等核心功能。**

View File

@@ -8,7 +8,6 @@ import com.datamate.collection.interfaces.converter.CollectionTaskConverter;
import com.datamate.collection.interfaces.dto.*;
import com.datamate.common.interfaces.PagedResponse;
import com.datamate.datamanagement.application.DatasetApplicationService;
import com.datamate.datamanagement.domain.model.dataset.Dataset;
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
import com.datamate.datamanagement.interfaces.dto.DatasetResponse;
import jakarta.validation.Valid;

View File

@@ -49,7 +49,7 @@ public class DatasetApplicationService {
private final FileMetadataService fileMetadataService;
private final ObjectMapper objectMapper;
@Value("${dataset.base.path:/dataset}")
@Value("${datamate.data-management.base-path:/dataset}")
private String datasetBasePath;
/**

View File

@@ -4,6 +4,7 @@ import com.datamate.common.domain.model.ChunkUploadPreRequest;
import com.datamate.common.domain.model.FileUploadResult;
import com.datamate.common.domain.service.FileService;
import com.datamate.common.domain.utils.AnalyzerUtils;
import com.datamate.common.infrastructure.exception.BusinessAssert;
import com.datamate.common.infrastructure.exception.BusinessException;
import com.datamate.common.infrastructure.exception.SystemErrorCode;
import com.datamate.datamanagement.domain.contants.DatasetConstant;
@@ -13,12 +14,14 @@ import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckIn
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
import com.datamate.datamanagement.interfaces.dto.CopyFilesRequest;
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import jakarta.servlet.http.HttpServletResponse;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.io.FileUtils;
import org.apache.ibatis.session.RowBounds;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
@@ -57,7 +60,7 @@ public class DatasetFileApplicationService {
private final DatasetRepository datasetRepository;
private final FileService fileService;
@Value("${dataset.base.path:/dataset}")
@Value("${datamate.data-management.base-path:/dataset}")
private String datasetBasePath;
@Autowired
@@ -257,4 +260,51 @@ public class DatasetFileApplicationService {
dataset.active();
datasetRepository.updateById(dataset);
}
/**
 * Copies the given source files into the dataset directory and registers each
 * successfully copied file as a {@link DatasetFile}.
 *
 * <p>Best-effort semantics: a source path that is missing, not a regular file,
 * or fails to copy is logged and skipped; the remaining paths are still
 * processed and only the successfully copied files are returned.
 *
 * <p>NOTE(review): {@code req.sourcePaths()} arrives straight from the HTTP
 * request body and is used as an absolute server-side path, so any file the
 * service process can read could be pulled into a dataset. Confirm the caller
 * is trusted, or restrict sources to an allow-listed base directory.
 *
 * <p>NOTE(review): a source file whose name matches an existing dataset file
 * silently overwrites it (FileUtils.copyFile overwrite semantics) — confirm
 * this is intended.
 *
 * @param datasetId id of the target dataset
 * @param req request carrying the absolute source file paths to copy
 * @return metadata entities for the files that were actually copied
 */
@Transactional
public List<DatasetFile> copyFilesToDatasetDir(String datasetId, CopyFilesRequest req) {
    Dataset dataset = datasetRepository.getById(datasetId);
    BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
    List<DatasetFile> copiedFiles = new ArrayList<>();
    for (String sourceFilePath : req.sourcePaths()) {
        Path sourcePath = Paths.get(sourceFilePath);
        // Files.isRegularFile already implies existence, so one check suffices.
        if (!Files.isRegularFile(sourcePath)) {
            log.warn("Source file does not exist or is not a regular file: {}", sourceFilePath);
            continue;
        }
        String fileName = sourcePath.getFileName().toString();
        File targetFile = new File(dataset.getPath(), fileName);
        try {
            // Copy file-to-file directly instead of opening an InputStream and
            // piping it through copyInputStreamToFile; same overwrite behavior.
            FileUtils.copyFile(sourcePath.toFile(), targetFile);
        } catch (IOException e) {
            log.error("Failed to copy file: {}", sourceFilePath, e);
            continue; // best-effort: keep copying the remaining files
        }
        LocalDateTime currentTime = LocalDateTime.now();
        DatasetFile datasetFile = DatasetFile.builder()
                .id(UUID.randomUUID().toString())
                .datasetId(datasetId)
                .fileName(fileName)
                .fileType(AnalyzerUtils.getExtension(fileName))
                .fileSize(targetFile.length())
                .filePath(targetFile.getPath())
                .uploadTime(currentTime)
                .lastAccessTime(currentTime)
                .build();
        datasetFileRepository.save(datasetFile);
        dataset.addFile(datasetFile);
        copiedFiles.add(datasetFile);
    }
    // NOTE(review): runs even when nothing was copied; the repositories do
    // file-system work inside @Transactional, so a rollback will not undo the
    // already-copied files — confirm this partial-failure behavior is intended.
    dataset.active();
    datasetRepository.updateById(dataset);
    return copiedFiles;
}
}

View File

@@ -1,16 +1,13 @@
package com.datamate.datamanagement.interfaces.converter;
import com.datamate.common.infrastructure.exception.BusinessException;
import com.datamate.common.infrastructure.exception.SystemErrorCode;
import com.datamate.common.domain.model.ChunkUploadRequest;
import com.datamate.datamanagement.domain.model.dataset.Dataset;
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
import com.datamate.datamanagement.domain.model.dataset.FileTag;
import com.datamate.datamanagement.interfaces.dto.CreateDatasetRequest;
import com.datamate.datamanagement.interfaces.dto.DatasetFileResponse;
import com.datamate.datamanagement.interfaces.dto.DatasetResponse;
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
import com.datamate.common.domain.model.ChunkUploadRequest;
import com.datamate.datamanagement.domain.model.dataset.Dataset;
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.collections4.CollectionUtils;
import org.mapstruct.Mapper;
import org.mapstruct.Mapping;
@@ -59,6 +56,13 @@ public interface DatasetConverter {
*/
DatasetFileResponse convertToResponse(DatasetFile datasetFile);
/**
* 将数据集文件列表转换为响应
*/
List<DatasetFileResponse> convertToResponseList(List<DatasetFile> datasetFiles);
/**
* 获取数据文件的标签分布
*

View File

@@ -0,0 +1,14 @@
package com.datamate.datamanagement.interfaces.dto;

import jakarta.validation.constraints.NotEmpty;

import java.util.List;

/**
 * Request DTO for copying server-side files into a dataset directory.
 *
 * @param sourcePaths absolute server-side paths of the files to copy;
 *                    must be non-empty (enforced by bean validation)
 * @author dallas
 * @since 2025-11-13
 */
public record CopyFilesRequest(@NotEmpty List<String> sourcePaths) {
    /**
     * Defensively copies the list so the record is deeply immutable.
     * A null list is passed through untouched so {@code @NotEmpty} bean
     * validation can still reject it instead of an NPE being thrown here.
     * (List.copyOf rejects null elements; such a payload would previously
     * have been accepted at construction time.)
     */
    public CopyFilesRequest {
        sourcePaths = sourcePaths == null ? null : List.copyOf(sourcePaths);
    }
}

View File

@@ -6,10 +6,7 @@ import com.datamate.common.infrastructure.exception.SystemErrorCode;
import com.datamate.datamanagement.application.DatasetFileApplicationService;
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
import com.datamate.datamanagement.interfaces.dto.DatasetFileResponse;
import com.datamate.datamanagement.interfaces.dto.PagedDatasetFileResponse;
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
import com.datamate.datamanagement.interfaces.dto.*;
import jakarta.servlet.http.HttpServletResponse;
import jakarta.validation.Valid;
import lombok.extern.slf4j.Slf4j;
@@ -23,8 +20,8 @@ import org.springframework.http.HttpStatus;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
import java.util.List;
import java.util.stream.Collectors;
/**
@@ -44,20 +41,20 @@ public class DatasetFileController {
@GetMapping
public ResponseEntity<Response<PagedDatasetFileResponse>> getDatasetFiles(
@PathVariable("datasetId") String datasetId,
@RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
@RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
@RequestParam(value = "fileType", required = false) String fileType,
@RequestParam(value = "status", required = false) String status) {
@PathVariable("datasetId") String datasetId,
@RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
@RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
@RequestParam(value = "fileType", required = false) String fileType,
@RequestParam(value = "status", required = false) String status) {
Pageable pageable = PageRequest.of(page != null ? page : 0, size != null ? size : 20);
Page<DatasetFile> filesPage = datasetFileApplicationService.getDatasetFiles(
datasetId, fileType, status, pageable);
datasetId, fileType, status, pageable);
PagedDatasetFileResponse response = new PagedDatasetFileResponse();
response.setContent(filesPage.getContent().stream()
.map(DatasetConverter.INSTANCE::convertToResponse)
.collect(Collectors.toList()));
.map(DatasetConverter.INSTANCE::convertToResponse)
.collect(Collectors.toList()));
response.setPage(filesPage.getNumber());
response.setSize(filesPage.getSize());
response.setTotalElements((int) filesPage.getTotalElements());
@@ -70,8 +67,8 @@ public class DatasetFileController {
@GetMapping("/{fileId}")
public ResponseEntity<Response<DatasetFileResponse>> getDatasetFileById(
@PathVariable("datasetId") String datasetId,
@PathVariable("fileId") String fileId) {
@PathVariable("datasetId") String datasetId,
@PathVariable("fileId") String fileId) {
try {
DatasetFile datasetFile = datasetFileApplicationService.getDatasetFile(datasetId, fileId);
return ResponseEntity.ok(Response.ok(DatasetConverter.INSTANCE.convertToResponse(datasetFile)));
@@ -82,8 +79,8 @@ public class DatasetFileController {
@DeleteMapping("/{fileId}")
public ResponseEntity<Response<Void>> deleteDatasetFile(
@PathVariable("datasetId") String datasetId,
@PathVariable("fileId") String fileId) {
@PathVariable("datasetId") String datasetId,
@PathVariable("fileId") String fileId) {
try {
datasetFileApplicationService.deleteDatasetFile(datasetId, fileId);
return ResponseEntity.ok().build();
@@ -101,10 +98,10 @@ public class DatasetFileController {
Resource resource = datasetFileApplicationService.downloadFile(datasetId, fileId);
return ResponseEntity.ok()
.contentType(MediaType.APPLICATION_OCTET_STREAM)
.header(HttpHeaders.CONTENT_DISPOSITION,
"attachment; filename=\"" + datasetFile.getFileName() + "\"")
.body(resource);
.contentType(MediaType.APPLICATION_OCTET_STREAM)
.header(HttpHeaders.CONTENT_DISPOSITION,
"attachment; filename=\"" + datasetFile.getFileName() + "\"")
.body(resource);
} catch (IllegalArgumentException e) {
return ResponseEntity.status(HttpStatus.NOT_FOUND).build();
} catch (Exception e) {
@@ -136,11 +133,26 @@ public class DatasetFileController {
* @param uploadFileRequest 上传文件请求
*/
@PostMapping("/upload/chunk")
public ResponseEntity<Void> chunkUpload(@PathVariable("datasetId") String datasetId, @Valid UploadFileRequest uploadFileRequest) {
public ResponseEntity<Void> chunkUpload(@PathVariable("datasetId") String datasetId,
@Valid UploadFileRequest uploadFileRequest) {
log.info("file upload reqId:{}, fileNo:{}, total chunk num:{}, current chunkNo:{}",
uploadFileRequest.getReqId(), uploadFileRequest.getFileNo(), uploadFileRequest.getTotalChunkNum(),
uploadFileRequest.getChunkNo());
uploadFileRequest.getReqId(), uploadFileRequest.getFileNo(), uploadFileRequest.getTotalChunkNum(),
uploadFileRequest.getChunkNo());
datasetFileApplicationService.chunkUpload(datasetId, uploadFileRequest);
return ResponseEntity.ok().build();
}
/**
 * Copies the files at the given server-side paths into the dataset directory
 * and returns response DTOs for the files that were actually copied.
 *
 * <p>NOTE(review): unlike the sibling endpoints in this controller, which
 * return {@code ResponseEntity<Response<...>>}, this endpoint returns a bare
 * list — confirm the response envelope is intentional before clients depend
 * on it.
 *
 * <p>NOTE(review): the request body carries raw absolute server-side paths;
 * any file readable by the service process can be pulled into a dataset —
 * verify the caller is trusted or restrict the allowed source directories.
 *
 * @param datasetId id of the target dataset
 * @param req request carrying the source file paths to copy
 * @return response DTOs for the copied dataset files
 */
@PostMapping("/upload/copy")
public List<DatasetFileResponse> copyFilesToDatasetDir(@PathVariable("datasetId") String datasetId,
        @RequestBody @Valid CopyFilesRequest req) {
    List<DatasetFile> datasetFiles = datasetFileApplicationService.copyFilesToDatasetDir(datasetId, req);
    return DatasetConverter.INSTANCE.convertToResponseList(datasetFiles);
}
}

View File

@@ -118,23 +118,15 @@ management:
# 平台配置
datamate:
# JWT配置
jwt:
secret: ${JWT_SECRET:dataMateSecretKey2024ForJWTTokenGeneration}
expiration: ${JWT_EXPIRATION:86400} # 24小时,单位秒
header: Authorization
prefix: "Bearer "
# 通用配置
# 文件存储配置
storage:
type: ${STORAGE_TYPE:local} # local, minio, s3
local:
base-path: ${STORAGE_LOCAL_PATH:./data/storage}
minio:
endpoint: ${MINIO_ENDPOINT:http://localhost:9000}
access-key: ${MINIO_ACCESS_KEY:minioadmin}
secret-key: ${MINIO_SECRET_KEY:minioadmin}
bucket-name: ${MINIO_BUCKET:data-mate}
# Ray执行器配置
ray:
@@ -148,6 +140,12 @@ datamate:
- "numpy"
- "data-juicer"
# 模块配置
# 数据管理服务配置
data-management:
base-path: /dataset
# 数据归集服务配置(可由模块导入叠加)
data-collection: {}