feature: dataset import supports selecting a collection task as the data source (#92)

* feature: implement OBS collection
* feature: add a configurable policy for duplicate file names in a dataset
* feature: let the frontend dataset import select a collection task as the source

@@ -6,6 +6,7 @@ import com.datamate.collection.domain.model.entity.CollectionTask;
 import com.datamate.collection.domain.process.ProcessRunner;
 import com.datamate.collection.infrastructure.datax.config.MysqlConfig;
 import com.datamate.collection.infrastructure.datax.config.NasConfig;
+import com.datamate.collection.infrastructure.datax.config.ObsConfig;
 import com.datamate.common.infrastructure.exception.BusinessException;
 import com.datamate.common.infrastructure.exception.SystemErrorCode;
 import com.fasterxml.jackson.databind.ObjectMapper;

@@ -92,18 +93,21 @@ public class DataxProcessRunner implements ProcessRunner {
         try {
             ObjectMapper objectMapper = new ObjectMapper();
             TemplateType templateType = task.getTaskType();
-            switch (templateType) {
-                case NAS:
-                    // NAS special handling
-                    NasConfig nasConfig = objectMapper.readValue(task.getConfig(), NasConfig.class);
-                    return nasConfig.toJobConfig(objectMapper, task);
-                case OBS:
-                case MYSQL:
-                    MysqlConfig mysqlConfig = objectMapper.readValue(task.getConfig(), MysqlConfig.class);
-                    return mysqlConfig.toJobConfig(objectMapper, task);
-                default:
-                    throw BusinessException.of(SystemErrorCode.UNKNOWN_ERROR, "Unsupported template type: " + templateType);
-            }
+            return switch (templateType) {
+                case NAS -> {
+                    // NAS special handling
+                    NasConfig nasConfig = objectMapper.readValue(task.getConfig(), NasConfig.class);
+                    yield nasConfig.toJobConfig(objectMapper, task);
+                }
+                case OBS -> {
+                    ObsConfig obsConfig = objectMapper.readValue(task.getConfig(), ObsConfig.class);
+                    yield obsConfig.toJobConfig(objectMapper, task);
+                }
+                case MYSQL -> {
+                    MysqlConfig mysqlConfig = objectMapper.readValue(task.getConfig(), MysqlConfig.class);
+                    yield mysqlConfig.toJobConfig(objectMapper, task);
+                }
+            };
         } catch (Exception e) {
             log.error("Failed to parse task config", e);
             throw new RuntimeException("Failed to parse task config", e);

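Note: this hunk converts a `switch` statement into a `switch` expression (arrow labels plus `yield`, available since Java 14). A switch expression over an enum must be exhaustive, so covering every `TemplateType` constant is what lets the old `default` branch be dropped; a constant added later becomes a compile-time error instead of a runtime `BusinessException`. A minimal standalone sketch of the pattern, using a hypothetical `Shape` enum:

```java
enum Shape { CIRCLE, SQUARE }

public class SwitchExpressionDemo {
    // Exhaustive switch expression: every enum constant is covered, so no
    // default branch is needed and the compiler verifies coverage.
    static double area(Shape shape, double size) {
        return switch (shape) {
            case CIRCLE -> Math.PI * size * size;   // single-expression branch
            case SQUARE -> {
                double a = size * size;             // multi-statement branch
                yield a;                            // 'yield' produces the branch value
            }
        };
    }

    public static void main(String[] args) {
        System.out.println(area(Shape.CIRCLE, 2.0)); // ~12.566
        System.out.println(area(Shape.SQUARE, 2.0)); // 4.0
    }
}
```
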
@@ -0,0 +1,61 @@
+package com.datamate.collection.infrastructure.datax.config;
+
+import com.datamate.collection.domain.model.entity.CollectionTask;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import lombok.Getter;
+import lombok.Setter;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * OBS collection configuration
+ *
+ * @since 2025/11/18
+ */
+@Getter
+@Setter
+public class ObsConfig implements BaseConfig {
+    private String endpoint;
+    private String bucket;
+    private String accessKey;
+    private String secretKey;
+    private String prefix;
+
+    /**
+     * Builds the DataX job JSON string from this OBS configuration.
+     */
+    public String toJobConfig(ObjectMapper objectMapper, CollectionTask task) throws Exception {
+        Map<String, Object> parameter = new HashMap<>();
+        if (endpoint != null) parameter.put("endpoint", endpoint);
+        if (bucket != null) parameter.put("bucket", bucket);
+        if (accessKey != null) parameter.put("accessKey", accessKey);
+        if (secretKey != null) parameter.put("secretKey", secretKey);
+        if (prefix != null) parameter.put("prefix", prefix);
+        parameter.put("destPath", task.getTargetPath());
+
+        Map<String, Object> job = new HashMap<>();
+        Map<String, Object> content = new HashMap<>();
+        Map<String, Object> reader = new HashMap<>();
+        reader.put("name", "obsreader");
+        reader.put("parameter", parameter);
+        content.put("reader", reader);
+
+        Map<String, Object> writer = new HashMap<>();
+        writer.put("name", "obswriter");
+        writer.put("parameter", parameter);
+        content.put("writer", writer);
+
+        job.put("content", List.of(content));
+        Map<String, Object> setting = new HashMap<>();
+        Map<String, Object> channel = new HashMap<>();
+        channel.put("channel", 2);
+        setting.put("speed", channel);
+        job.put("setting", setting);
+
+        Map<String, Object> jobConfig = new HashMap<>();
+        jobConfig.put("job", job);
+        return objectMapper.writeValueAsString(jobConfig);
+    }
+}

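Note: `toJobConfig` above assembles the conventional DataX job envelope, `job.content[].reader/writer` plus `job.setting.speed.channel`, and passes the same `parameter` map to both reader and writer, so the writer also receives the OBS credentials and the `destPath` taken from the task's target path. A self-contained sketch of the resulting JSON shape, with illustrative (hypothetical) endpoint, bucket, and path values:

```java
import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Reproduces the envelope ObsConfig.toJobConfig assembles, outside the project.
public class ObsJobConfigSketch {
    public static void main(String[] args) throws Exception {
        Map<String, Object> parameter = new HashMap<>();
        parameter.put("endpoint", "https://obs.example.com"); // hypothetical values
        parameter.put("bucket", "my-bucket");
        parameter.put("prefix", "raw/");
        parameter.put("destPath", "/dataset/123"); // would come from task.getTargetPath()

        Map<String, Object> content = Map.of(
                "reader", Map.of("name", "obsreader", "parameter", parameter),
                "writer", Map.of("name", "obswriter", "parameter", parameter));
        Map<String, Object> job = Map.of(
                "content", List.of(content),
                "setting", Map.of("speed", Map.of("channel", 2)));

        // Prints the job JSON, e.g. {"job":{"content":[...],"setting":{"speed":{"channel":2}}}}
        System.out.println(new ObjectMapper().writeValueAsString(Map.of("job", job)));
    }
}
```
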
@@ -11,14 +11,12 @@ import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
 import com.datamate.datamanagement.domain.model.dataset.Tag;
 import com.datamate.datamanagement.infrastructure.client.CollectionTaskClient;
 import com.datamate.datamanagement.infrastructure.client.dto.CollectionTaskDetailResponse;
-import com.datamate.datamanagement.infrastructure.client.dto.LocalCollectionConfig;
 import com.datamate.datamanagement.infrastructure.exception.DataManagementErrorCode;
 import com.datamate.datamanagement.infrastructure.persistence.mapper.TagMapper;
 import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
 import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
 import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
 import com.datamate.datamanagement.interfaces.dto.*;
-import com.fasterxml.jackson.databind.ObjectMapper;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.commons.collections4.CollectionUtils;

@@ -28,11 +26,13 @@ import org.springframework.stereotype.Service;
 import org.springframework.transaction.annotation.Transactional;
 import org.springframework.util.StringUtils;
 
+import java.io.IOException;
 import java.nio.file.Files;
+import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.*;
-import java.util.function.Function;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 /**
  * Dataset application service (aligned with the DB schema; uses UUID string primary keys)

@@ -46,8 +46,7 @@ public class DatasetApplicationService {
     private final TagMapper tagMapper;
     private final DatasetFileRepository datasetFileRepository;
     private final CollectionTaskClient collectionTaskClient;
-    private final FileMetadataService fileMetadataService;
-    private final ObjectMapper objectMapper;
+    private final DatasetFileApplicationService datasetFileApplicationService;
 
     @Value("${datamate.data-management.base-path:/dataset}")
     private String datasetBasePath;

@@ -223,68 +222,38 @@ public class DatasetApplicationService {
     @Async
     public void processDataSourceAsync(String datasetId, String dataSourceId) {
         try {
-            log.info("开始处理数据源文件扫描,数据集ID: {}, 数据源ID: {}", datasetId, dataSourceId);
+            log.info("Initiating data source file scanning, dataset ID: {}, collection task ID: {}", datasetId, dataSourceId);
             List<String> filePaths = getFilePaths(dataSourceId);
             if (CollectionUtils.isEmpty(filePaths)) {
                 return;
             }
-            log.info("开始扫描文件,共 {} 个文件路径", filePaths.size());
-            List<DatasetFile> datasetFiles = fileMetadataService.scanFiles(filePaths, datasetId);
-            // query the files already present in the dataset
-            List<DatasetFile> existDatasetFileList = datasetFileRepository.findAllByDatasetId(datasetId);
-            Map<String, DatasetFile> existDatasetFilePathMap = existDatasetFileList.stream().collect(Collectors.toMap(DatasetFile::getFilePath, Function.identity()));
-            Dataset dataset = datasetRepository.getById(datasetId);
-            dataset.setFiles(existDatasetFileList);
-
-            // batch-sync the dataset file table
-            asyncDatasetFile(datasetFiles, existDatasetFilePathMap, dataset, existDatasetFileList, filePaths);
-            datasetRepository.updateById(dataset);
+            log.info("Starting file scan, total files: {}", filePaths.size());
+            datasetFileApplicationService.copyFilesToDatasetDir(datasetId, new CopyFilesRequest(filePaths));
         } catch (Exception e) {
             log.error("处理数据源文件扫描失败,数据集ID: {}, 数据源ID: {}", datasetId, dataSourceId, e);
         }
     }
 
-    private void asyncDatasetFile(List<DatasetFile> datasetFiles, Map<String, DatasetFile> existDatasetFilePathMap, Dataset dataset, List<DatasetFile> existDatasetFileList, List<String> filePaths) {
-        if (CollectionUtils.isNotEmpty(datasetFiles)) {
-            for (DatasetFile datasetFile : datasetFiles) {
-                if (existDatasetFilePathMap.containsKey(datasetFile.getFilePath())) {
-                    DatasetFile existDatasetFile = existDatasetFilePathMap.get(datasetFile.getFilePath());
-                    dataset.removeFile(existDatasetFile);
-                    existDatasetFile.setFileSize(datasetFile.getFileSize());
-                    dataset.addFile(existDatasetFile);
-                    dataset.active();
-                    datasetFileRepository.updateById(existDatasetFile);
-                } else {
-                    dataset.addFile(datasetFile);
-                    dataset.active();
-                    datasetFileRepository.save(datasetFile);
-                }
-            }
-            log.info("文件元数据写入完成,共写入 {} 条记录", datasetFiles.size());
-        } else {
-            log.warn("未扫描到有效文件");
-        }
-        for (DatasetFile datasetFile : existDatasetFileList) {
-            String existFilePath = datasetFile.getFilePath();
-            for (String filePath : filePaths) {
-                if (existFilePath.equals(filePath) || existFilePath.startsWith(filePath)) {
-                    if (Files.notExists(Paths.get(existFilePath))) {
-                        dataset.removeFile(datasetFile);
-                        datasetFileRepository.removeById(datasetFile.getId());
-                    }
-                }
-            }
-        }
-    }
-
     private List<String> getFilePaths(String dataSourceId) {
         CollectionTaskDetailResponse taskDetail = collectionTaskClient.getTaskDetail(dataSourceId).getData();
         if (taskDetail == null) {
-            log.warn("获取归集任务详情失败,任务ID: {}", dataSourceId);
+            log.warn("Failed to get collection task detail, task ID: {}", dataSourceId);
+            return Collections.emptyList();
+        }
+        Path targetPath = Paths.get(taskDetail.getTargetPath());
+        if (!Files.exists(targetPath) || !Files.isDirectory(targetPath)) {
+            log.warn("Target path does not exist or is not a directory: {}", taskDetail.getTargetPath());
+            return Collections.emptyList();
+        }
+
+        try (Stream<Path> paths = Files.walk(targetPath, 1)) {
+            return paths
+                    .filter(Files::isRegularFile) // keep regular files only, excluding directories
+                    .map(Path::toString) // convert to string paths
+                    .collect(Collectors.toList());
+        } catch (IOException e) {
+            log.error("Failed to scan directory: {}", targetPath, e);
             return Collections.emptyList();
         }
-        log.info("获取到归集任务详情: {}", taskDetail);
-        return Collections.singletonList(taskDetail.getTargetPath());
     }
 }

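Note: `Files.walk(targetPath, 1)` bounds the traversal to the target directory and its direct children, so files in nested subdirectories are not imported; this mirrors the non-recursive `Files.list` scan of the `FileMetadataService` that this commit deletes (see the removed file below). A minimal sketch of the same shallow scan:

```java
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.stream.Stream;

public class ShallowScanSketch {
    // maxDepth = 1 means "the directory itself plus its direct children";
    // the isRegularFile filter then drops the directory entry and any subdirectories.
    static List<String> listDirectFiles(Path dir) throws IOException {
        try (Stream<Path> paths = Files.walk(dir, 1)) {
            return paths.filter(Files::isRegularFile)
                        .map(Path::toString)
                        .toList();
        }
    }

    public static void main(String[] args) throws IOException {
        System.out.println(listDirectFiles(Path.of("."))); // hypothetical usage
    }
}
```
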
@@ -7,10 +7,12 @@ import com.datamate.common.domain.utils.AnalyzerUtils;
 import com.datamate.common.infrastructure.exception.BusinessAssert;
 import com.datamate.common.infrastructure.exception.BusinessException;
 import com.datamate.common.infrastructure.exception.SystemErrorCode;
+import com.datamate.datamanagement.common.enums.DuplicateMethod;
 import com.datamate.datamanagement.domain.contants.DatasetConstant;
 import com.datamate.datamanagement.domain.model.dataset.Dataset;
 import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
 import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckInfo;
+import com.datamate.datamanagement.infrastructure.exception.DataManagementErrorCode;
 import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
 import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
 import com.datamate.datamanagement.interfaces.converter.DatasetConverter;

@@ -45,6 +47,8 @@ import java.time.LocalDateTime;
 import java.time.format.DateTimeFormatter;
 import java.util.*;
 import java.util.concurrent.CompletableFuture;
+import java.util.function.Function;
+import java.util.stream.Collectors;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipOutputStream;
 

@@ -63,6 +67,9 @@ public class DatasetFileApplicationService {
     @Value("${datamate.data-management.base-path:/dataset}")
     private String datasetBasePath;
+
+    @Value("${datamate.data-management.file.duplicate:COVER}")
+    private DuplicateMethod duplicateMethod;
 
     @Autowired
     public DatasetFileApplicationService(DatasetFileRepository datasetFileRepository,
                                          DatasetRepository datasetRepository, FileService fileService) {

@@ -254,13 +261,36 @@ public class DatasetFileApplicationService {
                 .filePath(savedFile.getPath())
                 .fileType(AnalyzerUtils.getExtension(uploadFile.getFileName()))
                 .build();
-        datasetFileRepository.save(datasetFile);
+        dataset.setFiles(datasetFileRepository.findAllByDatasetId(datasetId));
+        setDatasetFileId(datasetFile, dataset);
+        datasetFileRepository.saveOrUpdate(datasetFile);
         dataset.addFile(datasetFile);
         dataset.active();
         datasetRepository.updateById(dataset);
     }
 
+    /**
+     * Sets the file id on a dataset file.
+     *
+     * @param datasetFile the file whose id should be set
+     * @param dataset the dataset (with its file list loaded)
+     */
+    private void setDatasetFileId(DatasetFile datasetFile, Dataset dataset) {
+        Map<String, DatasetFile> existDatasetFilMap = dataset.getFiles().stream().collect(Collectors.toMap(DatasetFile::getFilePath, Function.identity()));
+        DatasetFile existDatasetFile = existDatasetFilMap.get(datasetFile.getFilePath());
+        if (Objects.isNull(existDatasetFile)) {
+            return;
+        }
+        if (duplicateMethod == DuplicateMethod.ERROR) {
+            log.error("file {} already exists in dataset {}", datasetFile.getFileName(), datasetFile.getDatasetId());
+            throw BusinessException.of(DataManagementErrorCode.DATASET_FILE_ALREADY_EXISTS);
+        }
+        if (duplicateMethod == DuplicateMethod.COVER) {
+            dataset.removeFile(existDatasetFile);
+            datasetFile.setId(existDatasetFile.getId());
+        }
+    }
+
     /**
      * Copies files into the dataset directory
      *

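Note: `setDatasetFileId` is where the new duplicate-name policy takes effect. On a path collision, `ERROR` fails the import, while `COVER` copies the existing row's id onto the incoming file, so the subsequent `saveOrUpdate`/`saveOrUpdateBatch` updates in place instead of inserting a second row. A condensed sketch of that decision using simplified stand-in types (not the project's real entities):

```java
import java.util.Map;

enum DuplicateMethod { ERROR, COVER }

public class DuplicatePolicySketch {
    record FileRecord(String id, String path) {}

    // Returns the id the incoming file should be saved under, or throws on ERROR.
    static String resolveId(Map<String, FileRecord> existingByPath, String incomingPath,
                            String freshId, DuplicateMethod policy) {
        FileRecord existing = existingByPath.get(incomingPath);
        if (existing == null) {
            return freshId; // no name clash: plain insert
        }
        if (policy == DuplicateMethod.ERROR) {
            throw new IllegalStateException("file already exists: " + incomingPath);
        }
        return existing.id(); // COVER: reuse the id so saveOrUpdate overwrites
    }

    public static void main(String[] args) {
        Map<String, FileRecord> existing =
                Map.of("/dataset/1/a.csv", new FileRecord("id-1", "/dataset/1/a.csv"));
        System.out.println(resolveId(existing, "/dataset/1/a.csv", "id-2", DuplicateMethod.COVER)); // id-1
        System.out.println(resolveId(existing, "/dataset/1/b.csv", "id-2", DuplicateMethod.COVER)); // id-2
    }
}
```
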
@@ -273,6 +303,8 @@ public class DatasetFileApplicationService {
         Dataset dataset = datasetRepository.getById(datasetId);
         BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
         List<DatasetFile> copiedFiles = new ArrayList<>();
+        List<DatasetFile> existDatasetFiles = datasetFileRepository.findAllByDatasetId(datasetId);
+        dataset.setFiles(existDatasetFiles);
         for (String sourceFilePath : req.sourcePaths()) {
             Path sourcePath = Paths.get(sourceFilePath);
             if (!Files.exists(sourcePath) || !Files.isRegularFile(sourcePath)) {

@@ -292,10 +324,11 @@ public class DatasetFileApplicationService {
                     .uploadTime(currentTime)
                     .lastAccessTime(currentTime)
                     .build();
+            setDatasetFileId(datasetFile, dataset);
             dataset.addFile(datasetFile);
             copiedFiles.add(datasetFile);
         }
-        datasetFileRepository.saveBatch(copiedFiles, 100);
+        datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
         dataset.active();
         datasetRepository.updateById(dataset);
         CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset));

@@ -1,127 +0,0 @@
-package com.datamate.datamanagement.application;
-
-import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
-import lombok.extern.slf4j.Slf4j;
-import org.springframework.stereotype.Service;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.time.LocalDateTime;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.UUID;
-
-/**
- * File metadata scanning service
- */
-@Slf4j
-@Service
-public class FileMetadataService {
-
-    /**
-     * Scans a list of file paths and extracts file metadata
-     * @param datasetId dataset ID
-     * @return list of dataset files
-     */
-    public List<DatasetFile> scanFiles(List<String> filePaths, String datasetId) {
-        List<DatasetFile> datasetFiles = new ArrayList<>();
-
-        if (filePaths == null || filePaths.isEmpty()) {
-            log.warn("文件路径列表为空,跳过扫描");
-            return datasetFiles;
-        }
-
-        for (String filePath : filePaths) {
-            try {
-                Path path = Paths.get(filePath);
-
-                if (!Files.exists(path)) {
-                    log.warn("路径不存在: {}", filePath);
-                    continue;
-                }
-
-                if (Files.isDirectory(path)) {
-                    scanDirectory(datasetId, filePath, path, datasetFiles);
-                } else {
-                    // a regular file: process it directly
-                    DatasetFile datasetFile = extractFileMetadata(filePath, datasetId);
-                    if (datasetFile != null) {
-                        datasetFiles.add(datasetFile);
-                    }
-                }
-            } catch (Exception e) {
-                log.error("扫描路径失败: {}, 错误: {}", filePath, e.getMessage(), e);
-            }
-        }
-
-        log.info("文件扫描完成,共扫描 {} 个文件", datasetFiles.size());
-        return datasetFiles;
-    }
-
-    private void scanDirectory(String datasetId, String filePath, Path path,
-                               List<DatasetFile> datasetFiles) throws IOException {
-        // a directory: scan all files directly under it (non-recursive)
-        List<Path> filesInDir = Files.list(path)
-                .filter(Files::isRegularFile)
-                .toList();
-
-        for (Path file : filesInDir) {
-            try {
-                DatasetFile datasetFile = extractFileMetadata(file.toString(), datasetId);
-                if (datasetFile != null) {
-                    datasetFiles.add(datasetFile);
-                }
-            } catch (Exception e) {
-                log.error("处理目录中的文件失败: {}, 错误: {}", file, e.getMessage(), e);
-            }
-        }
-        log.info("已扫描目录 {} 下的 {} 个文件", filePath, filesInDir.size());
-    }
-
-    /**
-     * @param filePath file path
-     * @param datasetId dataset ID
-     * @return dataset file object
-     */
-    private DatasetFile extractFileMetadata(String filePath, String datasetId) throws IOException {
-        Path path = Paths.get(filePath);
-
-        if (!Files.exists(path)) {
-            log.warn("文件不存在: {}", filePath);
-            return null;
-        }
-
-        if (!Files.isRegularFile(path)) {
-            log.warn("路径不是文件: {}", filePath);
-            return null;
-        }
-
-        String fileName = path.getFileName().toString();
-        long fileSize = Files.size(path);
-        String fileType = getFileExtension(fileName);
-
-        return DatasetFile.builder()
-                .id(UUID.randomUUID().toString())
-                .datasetId(datasetId)
-                .fileName(fileName)
-                .filePath(filePath)
-                .fileSize(fileSize)
-                .fileType(fileType)
-                .uploadTime(LocalDateTime.now())
-                .lastAccessTime(LocalDateTime.now())
-                .status("ACTIVE")
-                .build();
-    }
-
-    /**
-     * Gets the file extension
-     */
-    private String getFileExtension(String fileName) {
-        int lastDotIndex = fileName.lastIndexOf('.');
-        if (lastDotIndex > 0 && lastDotIndex < fileName.length() - 1) {
-            return fileName.substring(lastDotIndex + 1).toLowerCase();
-        }
-        return "unknown";
-    }
-}

@@ -0,0 +1,11 @@
+package com.datamate.datamanagement.common.enums;
+
+/**
+ * How duplicate file names are handled
+ *
+ * @since 2025/11/18
+ */
+public enum DuplicateMethod {
+    ERROR,
+    COVER
+}

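Note: this enum is bound from configuration in `DatasetFileApplicationService` via `@Value("${datamate.data-management.file.duplicate:COVER}")`. Spring converts the property string to the enum constant by name, so the valid values are `ERROR` and `COVER`, with `COVER` as the default when the property is unset. A plain-Java sketch of that string-to-enum mapping, independent of Spring:

```java
public class DuplicateMethodBindingSketch {
    enum DuplicateMethod { ERROR, COVER }

    // Mimics the property binding: match by constant name, default to COVER.
    static DuplicateMethod fromProperty(String value) {
        return value == null ? DuplicateMethod.COVER : DuplicateMethod.valueOf(value.trim());
    }

    public static void main(String[] args) {
        System.out.println(fromProperty(null));    // COVER (default)
        System.out.println(fromProperty("ERROR")); // ERROR
    }
}
```
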
@@ -32,7 +32,11 @@ public enum DataManagementErrorCode implements ErrorCode {
     /**
      * Dataset tag already exists
      */
-    DATASET_TAG_ALREADY_EXISTS("data_management.0005", "数据集标签已存在");
+    DATASET_TAG_ALREADY_EXISTS("data_management.0005", "数据集标签已存在"),
+    /**
+     * Dataset file already exists
+     */
+    DATASET_FILE_ALREADY_EXISTS("data_management.0006", "数据集文件已存在");
 
     private final String code;
     private final String message;

@@ -33,4 +33,6 @@ public class CreateDatasetRequest {
     private List<String> tags;
     /** Data source */
     private String dataSource;
+    /** Retention days */
+    private Integer retentionDays;
 }

@@ -192,9 +192,6 @@ export default function CollectionTaskCreate() {
             />
           </Form.Item>
         )}
-        <Form.Item name="maxRetries" label="最大执行次数">
-          <InputNumber min={1} style={{ width: "100%" }} />
-        </Form.Item>
 
         {/* Template configuration */}
         <h2 className="font-medium text-gray-900 pt-6 mb-2 text-lg">

@@ -84,7 +84,7 @@ export default function ImportConfiguration({
     form.resetFields();
     setFileList([]);
     form.setFieldsValue({ files: null });
-    setImportConfig({ source: DataSource.UPLOAD });
+    setImportConfig({ source: importConfig.source ? importConfig.source : DataSource.UPLOAD });
   };
 
   const handleImportData = async () => {

@@ -187,7 +187,7 @@ export const datasetStatusMap = {
 
 export const dataSourceMap: Record<string, { label: string; value: string }> = {
   [DataSource.UPLOAD]: { label: "本地上传", value: DataSource.UPLOAD },
-  // [DataSource.COLLECTION]: { label: "本地归集 ", value: DataSource.COLLECTION },
+  [DataSource.COLLECTION]: { label: "归集任务导入 ", value: DataSource.COLLECTION },
   // [DataSource.DATABASE]: { label: "数据库导入", value: DataSource.DATABASE },
   // [DataSource.NAS]: { label: "NAS导入", value: DataSource.NAS },
   // [DataSource.OBS]: { label: "OBS导入", value: DataSource.OBS },

@@ -1,4 +1,5 @@
 import asyncio
+import uuid
 from typing import Set
 from datetime import datetime
 

@@ -54,11 +55,13 @@ async def create_ratio_task(
     target_type = get_target_dataset_type(source_types)
 
     target_dataset = Dataset(
+        id=str(uuid.uuid4()),
         name=target_dataset_name,
         description=req.description or "",
         dataset_type=target_type,
         status="DRAFT",
     )
+    target_dataset.path = f"/dataset/{target_dataset.id}"
     db.add(target_dataset)
     await db.flush()  # obtain target_dataset.id

@@ -212,16 +215,18 @@ async def delete_ratio_tasks(
         raise HTTPException(status_code=500, detail=f"Fail to delete ratio task: {e}")
 
 
-async def valid_exists(db, req: CreateRatioTaskRequest):
-    # validate that the ratio task name is not duplicated
-    exist_task_q = await db.execute(
-        select(RatioInstance).where(RatioInstance.name == req.name)
-    )
-    try:
-        exist_task_q.scalar_one_or_none()
-    except Exception as e:
-        logger.error(f"create ratio task failed: ratio task {req.name} already exists")
-        raise HTTPException(status_code=400, detail=f"ratio task {req.name} already exists")
+async def valid_exists(db: AsyncSession, req: CreateRatioTaskRequest) -> None:
+    """Validate that the ratio task name is not duplicated (exact match, leading/trailing whitespace stripped)."""
+    name = (req.name or "").strip()
+    if not name:
+        raise HTTPException(status_code=400, detail="ratio task name is required")
+
+    # check whether a task with the same name already exists
+    ratio_task = await db.execute(select(RatioInstance.id).where(RatioInstance.name == name))
+    exists = ratio_task.scalar_one_or_none()
+    if exists is not None:
+        logger.error(f"create ratio task failed: ratio task '{name}' already exists (id={exists})")
+        raise HTTPException(status_code=400, detail=f"ratio task '{name}' already exists")
 
 
 async def get_dataset_types(dm_service: DatasetManagementService, req: CreateRatioTaskRequest) -> Set[str]: