You've already forked DataMate
feat: add file copying functionality to dataset directory and update base path configuration #80
This commit is contained in:
@@ -7,7 +7,7 @@
|
||||

|
||||

|
||||

|
||||

|
||||

|
||||
|
||||
**DataMate是面向模型微调与RAG检索的企业级数据处理平台,支持数据归集、数据管理、算子市场、数据清洗、数据合成、数据标注、数据评估、知识生成等核心功能。**
|
||||
|
||||
|
||||
@@ -8,7 +8,6 @@ import com.datamate.collection.interfaces.converter.CollectionTaskConverter;
|
||||
import com.datamate.collection.interfaces.dto.*;
|
||||
import com.datamate.common.interfaces.PagedResponse;
|
||||
import com.datamate.datamanagement.application.DatasetApplicationService;
|
||||
import com.datamate.datamanagement.domain.model.dataset.Dataset;
|
||||
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
|
||||
import com.datamate.datamanagement.interfaces.dto.DatasetResponse;
|
||||
import jakarta.validation.Valid;
|
||||
|
||||
@@ -49,7 +49,7 @@ public class DatasetApplicationService {
|
||||
private final FileMetadataService fileMetadataService;
|
||||
private final ObjectMapper objectMapper;
|
||||
|
||||
@Value("${dataset.base.path:/dataset}")
|
||||
@Value("${datamate.data-management.base-path:/dataset}")
|
||||
private String datasetBasePath;
|
||||
|
||||
/**
|
||||
|
||||
@@ -4,6 +4,7 @@ import com.datamate.common.domain.model.ChunkUploadPreRequest;
|
||||
import com.datamate.common.domain.model.FileUploadResult;
|
||||
import com.datamate.common.domain.service.FileService;
|
||||
import com.datamate.common.domain.utils.AnalyzerUtils;
|
||||
import com.datamate.common.infrastructure.exception.BusinessAssert;
|
||||
import com.datamate.common.infrastructure.exception.BusinessException;
|
||||
import com.datamate.common.infrastructure.exception.SystemErrorCode;
|
||||
import com.datamate.datamanagement.domain.contants.DatasetConstant;
|
||||
@@ -13,12 +14,14 @@ import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckIn
|
||||
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
|
||||
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
|
||||
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
|
||||
import com.datamate.datamanagement.interfaces.dto.CopyFilesRequest;
|
||||
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
|
||||
import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import jakarta.servlet.http.HttpServletResponse;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.ibatis.session.RowBounds;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.beans.factory.annotation.Value;
|
||||
@@ -57,7 +60,7 @@ public class DatasetFileApplicationService {
|
||||
private final DatasetRepository datasetRepository;
|
||||
private final FileService fileService;
|
||||
|
||||
@Value("${dataset.base.path:/dataset}")
|
||||
@Value("${datamate.data-management.base-path:/dataset}")
|
||||
private String datasetBasePath;
|
||||
|
||||
@Autowired
|
||||
@@ -257,4 +260,51 @@ public class DatasetFileApplicationService {
|
||||
dataset.active();
|
||||
datasetRepository.updateById(dataset);
|
||||
}
|
||||
|
||||
/**
|
||||
* 复制文件到数据集目录
|
||||
*
|
||||
* @param datasetId 数据集id
|
||||
* @param req 复制文件请求
|
||||
* @return 复制的文件列表
|
||||
*/
|
||||
@Transactional
|
||||
public List<DatasetFile> copyFilesToDatasetDir(String datasetId, CopyFilesRequest req) {
|
||||
Dataset dataset = datasetRepository.getById(datasetId);
|
||||
BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
|
||||
List<DatasetFile> copiedFiles = new ArrayList<>();
|
||||
for (String sourceFilePath : req.sourcePaths()) {
|
||||
Path sourcePath = Paths.get(sourceFilePath);
|
||||
if (!Files.exists(sourcePath) || !Files.isRegularFile(sourcePath)) {
|
||||
log.warn("Source file does not exist or is not a regular file: {}", sourceFilePath);
|
||||
continue;
|
||||
}
|
||||
String fileName = sourcePath.getFileName().toString();
|
||||
File targetFile = new File(dataset.getPath(), fileName);
|
||||
try {
|
||||
FileUtils.copyInputStreamToFile(Files.newInputStream(sourcePath), targetFile);
|
||||
} catch (IOException e) {
|
||||
log.error("Failed to copy file: {}", sourceFilePath, e);
|
||||
continue;
|
||||
}
|
||||
|
||||
LocalDateTime currentTime = LocalDateTime.now();
|
||||
DatasetFile datasetFile = DatasetFile.builder()
|
||||
.id(UUID.randomUUID().toString())
|
||||
.datasetId(datasetId)
|
||||
.fileName(fileName)
|
||||
.fileType(AnalyzerUtils.getExtension(fileName))
|
||||
.fileSize(targetFile.length())
|
||||
.filePath(targetFile.getPath())
|
||||
.uploadTime(currentTime)
|
||||
.lastAccessTime(currentTime)
|
||||
.build();
|
||||
datasetFileRepository.save(datasetFile);
|
||||
dataset.addFile(datasetFile);
|
||||
copiedFiles.add(datasetFile);
|
||||
}
|
||||
dataset.active();
|
||||
datasetRepository.updateById(dataset);
|
||||
return copiedFiles;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,16 +1,13 @@
|
||||
package com.datamate.datamanagement.interfaces.converter;
|
||||
|
||||
import com.datamate.common.infrastructure.exception.BusinessException;
|
||||
import com.datamate.common.infrastructure.exception.SystemErrorCode;
|
||||
import com.datamate.common.domain.model.ChunkUploadRequest;
|
||||
import com.datamate.datamanagement.domain.model.dataset.Dataset;
|
||||
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
|
||||
import com.datamate.datamanagement.domain.model.dataset.FileTag;
|
||||
import com.datamate.datamanagement.interfaces.dto.CreateDatasetRequest;
|
||||
import com.datamate.datamanagement.interfaces.dto.DatasetFileResponse;
|
||||
import com.datamate.datamanagement.interfaces.dto.DatasetResponse;
|
||||
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
|
||||
import com.datamate.common.domain.model.ChunkUploadRequest;
|
||||
import com.datamate.datamanagement.domain.model.dataset.Dataset;
|
||||
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import org.apache.commons.collections4.CollectionUtils;
|
||||
import org.mapstruct.Mapper;
|
||||
import org.mapstruct.Mapping;
|
||||
@@ -59,6 +56,13 @@ public interface DatasetConverter {
|
||||
*/
|
||||
DatasetFileResponse convertToResponse(DatasetFile datasetFile);
|
||||
|
||||
|
||||
/**
 * Converts a list of dataset files into a list of dataset file responses.
 *
 * @param datasetFiles dataset files to convert
 * @return the corresponding response objects
 */
|
||||
List<DatasetFileResponse> convertToResponseList(List<DatasetFile> datasetFiles);
|
||||
|
||||
|
||||
/**
|
||||
* 获取数据文件的标签分布
|
||||
*
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
package com.datamate.datamanagement.interfaces.dto;
|
||||
|
||||
import jakarta.validation.constraints.NotEmpty;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* 复制文件请求DTO
|
||||
*
|
||||
* @author dallas
|
||||
* @since 2025-11-13
|
||||
*/
|
||||
public record CopyFilesRequest(@NotEmpty List<String> sourcePaths) {
|
||||
}
|
||||
@@ -6,10 +6,7 @@ import com.datamate.common.infrastructure.exception.SystemErrorCode;
|
||||
import com.datamate.datamanagement.application.DatasetFileApplicationService;
|
||||
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
|
||||
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
|
||||
import com.datamate.datamanagement.interfaces.dto.DatasetFileResponse;
|
||||
import com.datamate.datamanagement.interfaces.dto.PagedDatasetFileResponse;
|
||||
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
|
||||
import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
|
||||
import com.datamate.datamanagement.interfaces.dto.*;
|
||||
import jakarta.servlet.http.HttpServletResponse;
|
||||
import jakarta.validation.Valid;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
@@ -23,8 +20,8 @@ import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.*;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
@@ -136,11 +133,26 @@ public class DatasetFileController {
|
||||
* @param uploadFileRequest 上传文件请求
|
||||
*/
|
||||
@PostMapping("/upload/chunk")
|
||||
public ResponseEntity<Void> chunkUpload(@PathVariable("datasetId") String datasetId, @Valid UploadFileRequest uploadFileRequest) {
|
||||
public ResponseEntity<Void> chunkUpload(@PathVariable("datasetId") String datasetId,
|
||||
@Valid UploadFileRequest uploadFileRequest) {
|
||||
log.info("file upload reqId:{}, fileNo:{}, total chunk num:{}, current chunkNo:{}",
|
||||
uploadFileRequest.getReqId(), uploadFileRequest.getFileNo(), uploadFileRequest.getTotalChunkNum(),
|
||||
uploadFileRequest.getChunkNo());
|
||||
datasetFileApplicationService.chunkUpload(datasetId, uploadFileRequest);
|
||||
return ResponseEntity.ok().build();
|
||||
}
|
||||
|
||||
/**
|
||||
* 将指定路径中的文件拷贝到数据集目录下
|
||||
*
|
||||
* @param datasetId 数据集ID
|
||||
* @param req 源文件路径列表
|
||||
* @return 数据集文件响应DTO列表
|
||||
*/
|
||||
@PostMapping("/upload/copy")
|
||||
public List<DatasetFileResponse> copyFilesToDatasetDir(@PathVariable("datasetId") String datasetId,
|
||||
@RequestBody @Valid CopyFilesRequest req) {
|
||||
List<DatasetFile> datasetFiles = datasetFileApplicationService.copyFilesToDatasetDir(datasetId, req);
|
||||
return DatasetConverter.INSTANCE.convertToResponseList(datasetFiles);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -118,23 +118,15 @@ management:
|
||||
|
||||
# 平台配置
|
||||
datamate:
|
||||
# JWT配置
|
||||
jwt:
|
||||
secret: ${JWT_SECRET:dataMateSecretKey2024ForJWTTokenGeneration}
|
||||
expiration: ${JWT_EXPIRATION:86400} # 24小时,单位秒
|
||||
header: Authorization
|
||||
prefix: "Bearer "
|
||||
# 通用配置
|
||||
|
||||
|
||||
# 文件存储配置
|
||||
storage:
|
||||
type: ${STORAGE_TYPE:local} # local, minio, s3
|
||||
local:
|
||||
base-path: ${STORAGE_LOCAL_PATH:./data/storage}
|
||||
minio:
|
||||
endpoint: ${MINIO_ENDPOINT:http://localhost:9000}
|
||||
access-key: ${MINIO_ACCESS_KEY:minioadmin}
|
||||
secret-key: ${MINIO_SECRET_KEY:minioadmin}
|
||||
bucket-name: ${MINIO_BUCKET:data-mate}
|
||||
|
||||
|
||||
# Ray执行器配置
|
||||
ray:
|
||||
@@ -148,6 +140,12 @@ datamate:
|
||||
- "numpy"
|
||||
- "data-juicer"
|
||||
|
||||
# 模块配置
|
||||
|
||||
# 数据管理服务配置
|
||||
data-management:
|
||||
base-path: /dataset
|
||||
|
||||
# 数据归集服务配置(可由模块导入叠加)
|
||||
data-collection: {}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user