You've already forked DataMate
feat: add file copying functionality to dataset directory and update base path configuration #80
This commit is contained in:
@@ -7,7 +7,7 @@
|
|||||||

|

|
||||||

|

|
||||||

|

|
||||||

|

|
||||||
|
|
||||||
**DataMate是面向模型微调与RAG检索的企业级数据处理平台,支持数据归集、数据管理、算子市场、数据清洗、数据合成、数据标注、数据评估、知识生成等核心功能。**
|
**DataMate是面向模型微调与RAG检索的企业级数据处理平台,支持数据归集、数据管理、算子市场、数据清洗、数据合成、数据标注、数据评估、知识生成等核心功能。**
|
||||||
|
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ import com.datamate.collection.interfaces.converter.CollectionTaskConverter;
|
|||||||
import com.datamate.collection.interfaces.dto.*;
|
import com.datamate.collection.interfaces.dto.*;
|
||||||
import com.datamate.common.interfaces.PagedResponse;
|
import com.datamate.common.interfaces.PagedResponse;
|
||||||
import com.datamate.datamanagement.application.DatasetApplicationService;
|
import com.datamate.datamanagement.application.DatasetApplicationService;
|
||||||
import com.datamate.datamanagement.domain.model.dataset.Dataset;
|
|
||||||
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
|
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
|
||||||
import com.datamate.datamanagement.interfaces.dto.DatasetResponse;
|
import com.datamate.datamanagement.interfaces.dto.DatasetResponse;
|
||||||
import jakarta.validation.Valid;
|
import jakarta.validation.Valid;
|
||||||
|
|||||||
@@ -49,7 +49,7 @@ public class DatasetApplicationService {
|
|||||||
private final FileMetadataService fileMetadataService;
|
private final FileMetadataService fileMetadataService;
|
||||||
private final ObjectMapper objectMapper;
|
private final ObjectMapper objectMapper;
|
||||||
|
|
||||||
@Value("${dataset.base.path:/dataset}")
|
@Value("${datamate.data-management.base-path:/dataset}")
|
||||||
private String datasetBasePath;
|
private String datasetBasePath;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import com.datamate.common.domain.model.ChunkUploadPreRequest;
|
|||||||
import com.datamate.common.domain.model.FileUploadResult;
|
import com.datamate.common.domain.model.FileUploadResult;
|
||||||
import com.datamate.common.domain.service.FileService;
|
import com.datamate.common.domain.service.FileService;
|
||||||
import com.datamate.common.domain.utils.AnalyzerUtils;
|
import com.datamate.common.domain.utils.AnalyzerUtils;
|
||||||
|
import com.datamate.common.infrastructure.exception.BusinessAssert;
|
||||||
import com.datamate.common.infrastructure.exception.BusinessException;
|
import com.datamate.common.infrastructure.exception.BusinessException;
|
||||||
import com.datamate.common.infrastructure.exception.SystemErrorCode;
|
import com.datamate.common.infrastructure.exception.SystemErrorCode;
|
||||||
import com.datamate.datamanagement.domain.contants.DatasetConstant;
|
import com.datamate.datamanagement.domain.contants.DatasetConstant;
|
||||||
@@ -13,12 +14,14 @@ import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckIn
|
|||||||
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
|
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
|
||||||
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
|
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
|
||||||
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
|
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
|
||||||
|
import com.datamate.datamanagement.interfaces.dto.CopyFilesRequest;
|
||||||
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
|
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
|
||||||
import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
|
import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
|
||||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import jakarta.servlet.http.HttpServletResponse;
|
import jakarta.servlet.http.HttpServletResponse;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.ibatis.session.RowBounds;
|
import org.apache.ibatis.session.RowBounds;
|
||||||
import org.springframework.beans.factory.annotation.Autowired;
|
import org.springframework.beans.factory.annotation.Autowired;
|
||||||
import org.springframework.beans.factory.annotation.Value;
|
import org.springframework.beans.factory.annotation.Value;
|
||||||
@@ -57,7 +60,7 @@ public class DatasetFileApplicationService {
|
|||||||
private final DatasetRepository datasetRepository;
|
private final DatasetRepository datasetRepository;
|
||||||
private final FileService fileService;
|
private final FileService fileService;
|
||||||
|
|
||||||
@Value("${dataset.base.path:/dataset}")
|
@Value("${datamate.data-management.base-path:/dataset}")
|
||||||
private String datasetBasePath;
|
private String datasetBasePath;
|
||||||
|
|
||||||
@Autowired
|
@Autowired
|
||||||
@@ -257,4 +260,51 @@ public class DatasetFileApplicationService {
|
|||||||
dataset.active();
|
dataset.active();
|
||||||
datasetRepository.updateById(dataset);
|
datasetRepository.updateById(dataset);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 复制文件到数据集目录
|
||||||
|
*
|
||||||
|
* @param datasetId 数据集id
|
||||||
|
* @param req 复制文件请求
|
||||||
|
* @return 复制的文件列表
|
||||||
|
*/
|
||||||
|
@Transactional
|
||||||
|
public List<DatasetFile> copyFilesToDatasetDir(String datasetId, CopyFilesRequest req) {
|
||||||
|
Dataset dataset = datasetRepository.getById(datasetId);
|
||||||
|
BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
|
||||||
|
List<DatasetFile> copiedFiles = new ArrayList<>();
|
||||||
|
for (String sourceFilePath : req.sourcePaths()) {
|
||||||
|
Path sourcePath = Paths.get(sourceFilePath);
|
||||||
|
if (!Files.exists(sourcePath) || !Files.isRegularFile(sourcePath)) {
|
||||||
|
log.warn("Source file does not exist or is not a regular file: {}", sourceFilePath);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
String fileName = sourcePath.getFileName().toString();
|
||||||
|
File targetFile = new File(dataset.getPath(), fileName);
|
||||||
|
try {
|
||||||
|
FileUtils.copyInputStreamToFile(Files.newInputStream(sourcePath), targetFile);
|
||||||
|
} catch (IOException e) {
|
||||||
|
log.error("Failed to copy file: {}", sourceFilePath, e);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
LocalDateTime currentTime = LocalDateTime.now();
|
||||||
|
DatasetFile datasetFile = DatasetFile.builder()
|
||||||
|
.id(UUID.randomUUID().toString())
|
||||||
|
.datasetId(datasetId)
|
||||||
|
.fileName(fileName)
|
||||||
|
.fileType(AnalyzerUtils.getExtension(fileName))
|
||||||
|
.fileSize(targetFile.length())
|
||||||
|
.filePath(targetFile.getPath())
|
||||||
|
.uploadTime(currentTime)
|
||||||
|
.lastAccessTime(currentTime)
|
||||||
|
.build();
|
||||||
|
datasetFileRepository.save(datasetFile);
|
||||||
|
dataset.addFile(datasetFile);
|
||||||
|
copiedFiles.add(datasetFile);
|
||||||
|
}
|
||||||
|
dataset.active();
|
||||||
|
datasetRepository.updateById(dataset);
|
||||||
|
return copiedFiles;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,16 +1,13 @@
|
|||||||
package com.datamate.datamanagement.interfaces.converter;
|
package com.datamate.datamanagement.interfaces.converter;
|
||||||
|
|
||||||
import com.datamate.common.infrastructure.exception.BusinessException;
|
import com.datamate.common.domain.model.ChunkUploadRequest;
|
||||||
import com.datamate.common.infrastructure.exception.SystemErrorCode;
|
import com.datamate.datamanagement.domain.model.dataset.Dataset;
|
||||||
|
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
|
||||||
import com.datamate.datamanagement.domain.model.dataset.FileTag;
|
import com.datamate.datamanagement.domain.model.dataset.FileTag;
|
||||||
import com.datamate.datamanagement.interfaces.dto.CreateDatasetRequest;
|
import com.datamate.datamanagement.interfaces.dto.CreateDatasetRequest;
|
||||||
import com.datamate.datamanagement.interfaces.dto.DatasetFileResponse;
|
import com.datamate.datamanagement.interfaces.dto.DatasetFileResponse;
|
||||||
import com.datamate.datamanagement.interfaces.dto.DatasetResponse;
|
import com.datamate.datamanagement.interfaces.dto.DatasetResponse;
|
||||||
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
|
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
|
||||||
import com.datamate.common.domain.model.ChunkUploadRequest;
|
|
||||||
import com.datamate.datamanagement.domain.model.dataset.Dataset;
|
|
||||||
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
import org.apache.commons.collections4.CollectionUtils;
|
import org.apache.commons.collections4.CollectionUtils;
|
||||||
import org.mapstruct.Mapper;
|
import org.mapstruct.Mapper;
|
||||||
import org.mapstruct.Mapping;
|
import org.mapstruct.Mapping;
|
||||||
@@ -59,6 +56,13 @@ public interface DatasetConverter {
|
|||||||
*/
|
*/
|
||||||
DatasetFileResponse convertToResponse(DatasetFile datasetFile);
|
DatasetFileResponse convertToResponse(DatasetFile datasetFile);
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Converts a list of dataset files into a list of dataset file responses.
 */
|
||||||
|
List<DatasetFileResponse> convertToResponseList(List<DatasetFile> datasetFiles);
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 获取数据文件的标签分布
|
* 获取数据文件的标签分布
|
||||||
*
|
*
|
||||||
|
|||||||
@@ -0,0 +1,14 @@
|
|||||||
|
package com.datamate.datamanagement.interfaces.dto;
|
||||||
|
|
||||||
|
import jakarta.validation.constraints.NotEmpty;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
 * Request DTO for copying files.
 *
 * @param sourcePaths paths of the source files to copy; annotated {@code @NotEmpty}
 * @author dallas
 * @since 2025-11-13
 */
|
||||||
|
public record CopyFilesRequest(@NotEmpty List<String> sourcePaths) {
|
||||||
|
}
|
||||||
@@ -6,10 +6,7 @@ import com.datamate.common.infrastructure.exception.SystemErrorCode;
|
|||||||
import com.datamate.datamanagement.application.DatasetFileApplicationService;
|
import com.datamate.datamanagement.application.DatasetFileApplicationService;
|
||||||
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
|
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
|
||||||
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
|
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
|
||||||
import com.datamate.datamanagement.interfaces.dto.DatasetFileResponse;
|
import com.datamate.datamanagement.interfaces.dto.*;
|
||||||
import com.datamate.datamanagement.interfaces.dto.PagedDatasetFileResponse;
|
|
||||||
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
|
|
||||||
import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
|
|
||||||
import jakarta.servlet.http.HttpServletResponse;
|
import jakarta.servlet.http.HttpServletResponse;
|
||||||
import jakarta.validation.Valid;
|
import jakarta.validation.Valid;
|
||||||
import lombok.extern.slf4j.Slf4j;
|
import lombok.extern.slf4j.Slf4j;
|
||||||
@@ -23,8 +20,8 @@ import org.springframework.http.HttpStatus;
|
|||||||
import org.springframework.http.MediaType;
|
import org.springframework.http.MediaType;
|
||||||
import org.springframework.http.ResponseEntity;
|
import org.springframework.http.ResponseEntity;
|
||||||
import org.springframework.web.bind.annotation.*;
|
import org.springframework.web.bind.annotation.*;
|
||||||
import org.springframework.web.multipart.MultipartFile;
|
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -44,20 +41,20 @@ public class DatasetFileController {
|
|||||||
|
|
||||||
@GetMapping
|
@GetMapping
|
||||||
public ResponseEntity<Response<PagedDatasetFileResponse>> getDatasetFiles(
|
public ResponseEntity<Response<PagedDatasetFileResponse>> getDatasetFiles(
|
||||||
@PathVariable("datasetId") String datasetId,
|
@PathVariable("datasetId") String datasetId,
|
||||||
@RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
|
@RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
|
||||||
@RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
|
@RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
|
||||||
@RequestParam(value = "fileType", required = false) String fileType,
|
@RequestParam(value = "fileType", required = false) String fileType,
|
||||||
@RequestParam(value = "status", required = false) String status) {
|
@RequestParam(value = "status", required = false) String status) {
|
||||||
Pageable pageable = PageRequest.of(page != null ? page : 0, size != null ? size : 20);
|
Pageable pageable = PageRequest.of(page != null ? page : 0, size != null ? size : 20);
|
||||||
|
|
||||||
Page<DatasetFile> filesPage = datasetFileApplicationService.getDatasetFiles(
|
Page<DatasetFile> filesPage = datasetFileApplicationService.getDatasetFiles(
|
||||||
datasetId, fileType, status, pageable);
|
datasetId, fileType, status, pageable);
|
||||||
|
|
||||||
PagedDatasetFileResponse response = new PagedDatasetFileResponse();
|
PagedDatasetFileResponse response = new PagedDatasetFileResponse();
|
||||||
response.setContent(filesPage.getContent().stream()
|
response.setContent(filesPage.getContent().stream()
|
||||||
.map(DatasetConverter.INSTANCE::convertToResponse)
|
.map(DatasetConverter.INSTANCE::convertToResponse)
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
response.setPage(filesPage.getNumber());
|
response.setPage(filesPage.getNumber());
|
||||||
response.setSize(filesPage.getSize());
|
response.setSize(filesPage.getSize());
|
||||||
response.setTotalElements((int) filesPage.getTotalElements());
|
response.setTotalElements((int) filesPage.getTotalElements());
|
||||||
@@ -70,8 +67,8 @@ public class DatasetFileController {
|
|||||||
|
|
||||||
@GetMapping("/{fileId}")
|
@GetMapping("/{fileId}")
|
||||||
public ResponseEntity<Response<DatasetFileResponse>> getDatasetFileById(
|
public ResponseEntity<Response<DatasetFileResponse>> getDatasetFileById(
|
||||||
@PathVariable("datasetId") String datasetId,
|
@PathVariable("datasetId") String datasetId,
|
||||||
@PathVariable("fileId") String fileId) {
|
@PathVariable("fileId") String fileId) {
|
||||||
try {
|
try {
|
||||||
DatasetFile datasetFile = datasetFileApplicationService.getDatasetFile(datasetId, fileId);
|
DatasetFile datasetFile = datasetFileApplicationService.getDatasetFile(datasetId, fileId);
|
||||||
return ResponseEntity.ok(Response.ok(DatasetConverter.INSTANCE.convertToResponse(datasetFile)));
|
return ResponseEntity.ok(Response.ok(DatasetConverter.INSTANCE.convertToResponse(datasetFile)));
|
||||||
@@ -82,8 +79,8 @@ public class DatasetFileController {
|
|||||||
|
|
||||||
@DeleteMapping("/{fileId}")
|
@DeleteMapping("/{fileId}")
|
||||||
public ResponseEntity<Response<Void>> deleteDatasetFile(
|
public ResponseEntity<Response<Void>> deleteDatasetFile(
|
||||||
@PathVariable("datasetId") String datasetId,
|
@PathVariable("datasetId") String datasetId,
|
||||||
@PathVariable("fileId") String fileId) {
|
@PathVariable("fileId") String fileId) {
|
||||||
try {
|
try {
|
||||||
datasetFileApplicationService.deleteDatasetFile(datasetId, fileId);
|
datasetFileApplicationService.deleteDatasetFile(datasetId, fileId);
|
||||||
return ResponseEntity.ok().build();
|
return ResponseEntity.ok().build();
|
||||||
@@ -101,10 +98,10 @@ public class DatasetFileController {
|
|||||||
Resource resource = datasetFileApplicationService.downloadFile(datasetId, fileId);
|
Resource resource = datasetFileApplicationService.downloadFile(datasetId, fileId);
|
||||||
|
|
||||||
return ResponseEntity.ok()
|
return ResponseEntity.ok()
|
||||||
.contentType(MediaType.APPLICATION_OCTET_STREAM)
|
.contentType(MediaType.APPLICATION_OCTET_STREAM)
|
||||||
.header(HttpHeaders.CONTENT_DISPOSITION,
|
.header(HttpHeaders.CONTENT_DISPOSITION,
|
||||||
"attachment; filename=\"" + datasetFile.getFileName() + "\"")
|
"attachment; filename=\"" + datasetFile.getFileName() + "\"")
|
||||||
.body(resource);
|
.body(resource);
|
||||||
} catch (IllegalArgumentException e) {
|
} catch (IllegalArgumentException e) {
|
||||||
return ResponseEntity.status(HttpStatus.NOT_FOUND).build();
|
return ResponseEntity.status(HttpStatus.NOT_FOUND).build();
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
@@ -136,11 +133,26 @@ public class DatasetFileController {
|
|||||||
* @param uploadFileRequest 上传文件请求
|
* @param uploadFileRequest 上传文件请求
|
||||||
*/
|
*/
|
||||||
@PostMapping("/upload/chunk")
|
@PostMapping("/upload/chunk")
|
||||||
public ResponseEntity<Void> chunkUpload(@PathVariable("datasetId") String datasetId, @Valid UploadFileRequest uploadFileRequest) {
|
public ResponseEntity<Void> chunkUpload(@PathVariable("datasetId") String datasetId,
|
||||||
|
@Valid UploadFileRequest uploadFileRequest) {
|
||||||
log.info("file upload reqId:{}, fileNo:{}, total chunk num:{}, current chunkNo:{}",
|
log.info("file upload reqId:{}, fileNo:{}, total chunk num:{}, current chunkNo:{}",
|
||||||
uploadFileRequest.getReqId(), uploadFileRequest.getFileNo(), uploadFileRequest.getTotalChunkNum(),
|
uploadFileRequest.getReqId(), uploadFileRequest.getFileNo(), uploadFileRequest.getTotalChunkNum(),
|
||||||
uploadFileRequest.getChunkNo());
|
uploadFileRequest.getChunkNo());
|
||||||
datasetFileApplicationService.chunkUpload(datasetId, uploadFileRequest);
|
datasetFileApplicationService.chunkUpload(datasetId, uploadFileRequest);
|
||||||
return ResponseEntity.ok().build();
|
return ResponseEntity.ok().build();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 将指定路径中的文件拷贝到数据集目录下
|
||||||
|
*
|
||||||
|
* @param datasetId 数据集ID
|
||||||
|
* @param req 源文件路径列表
|
||||||
|
* @return 数据集文件响应DTO列表
|
||||||
|
*/
|
||||||
|
@PostMapping("/upload/copy")
|
||||||
|
public List<DatasetFileResponse> copyFilesToDatasetDir(@PathVariable("datasetId") String datasetId,
|
||||||
|
@RequestBody @Valid CopyFilesRequest req) {
|
||||||
|
List<DatasetFile> datasetFiles = datasetFileApplicationService.copyFilesToDatasetDir(datasetId, req);
|
||||||
|
return DatasetConverter.INSTANCE.convertToResponseList(datasetFiles);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -118,23 +118,15 @@ management:
|
|||||||
|
|
||||||
# 平台配置
|
# 平台配置
|
||||||
datamate:
|
datamate:
|
||||||
# JWT配置
|
# 通用配置
|
||||||
jwt:
|
|
||||||
secret: ${JWT_SECRET:dataMateSecretKey2024ForJWTTokenGeneration}
|
|
||||||
expiration: ${JWT_EXPIRATION:86400} # 24小时,单位秒
|
|
||||||
header: Authorization
|
|
||||||
prefix: "Bearer "
|
|
||||||
|
|
||||||
# 文件存储配置
|
# 文件存储配置
|
||||||
storage:
|
storage:
|
||||||
type: ${STORAGE_TYPE:local} # local, minio, s3
|
type: ${STORAGE_TYPE:local} # local, minio, s3
|
||||||
local:
|
local:
|
||||||
base-path: ${STORAGE_LOCAL_PATH:./data/storage}
|
base-path: ${STORAGE_LOCAL_PATH:./data/storage}
|
||||||
minio:
|
|
||||||
endpoint: ${MINIO_ENDPOINT:http://localhost:9000}
|
|
||||||
access-key: ${MINIO_ACCESS_KEY:minioadmin}
|
|
||||||
secret-key: ${MINIO_SECRET_KEY:minioadmin}
|
|
||||||
bucket-name: ${MINIO_BUCKET:data-mate}
|
|
||||||
|
|
||||||
# Ray执行器配置
|
# Ray执行器配置
|
||||||
ray:
|
ray:
|
||||||
@@ -148,6 +140,12 @@ datamate:
|
|||||||
- "numpy"
|
- "numpy"
|
||||||
- "data-juicer"
|
- "data-juicer"
|
||||||
|
|
||||||
|
# 模块配置
|
||||||
|
|
||||||
|
# 数据管理服务配置
|
||||||
|
data-management:
|
||||||
|
base-path: /dataset
|
||||||
|
|
||||||
# 数据归集服务配置(可由模块导入叠加)
|
# 数据归集服务配置(可由模块导入叠加)
|
||||||
data-collection: {}
|
data-collection: {}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user