feat(data-management): 实现数据集文件版本管理和内部路径保护

- 将数据集文件查询方法替换为只查询可见文件的版本
- 引入文件状态管理(ACTIVE/ARCHIVED)和内部目录结构
- 实现文件重复处理策略,支持版本控制模式而非覆盖
- 添加内部数据目录保护,防止访问.datamate等系统目录
- 重构文件上传流程,引入暂存目录和事务后清理机制
- 实现文件版本归档功能,保留历史版本到专用存储位置
- 优化文件路径规范化和安全验证逻辑
- 修复文件删除逻辑,确保归档文件不会被错误移除
- 更新数据集压缩下载功能以排除内部系统文件
This commit is contained in:
2026-02-04 23:53:35 +08:00
parent 473f4e717f
commit d0972cbc9d
16 changed files with 1141 additions and 484 deletions

View File

@@ -164,7 +164,7 @@ public class DatasetApplicationService {
public Dataset getDataset(String datasetId) {
Dataset dataset = datasetRepository.getById(datasetId);
BusinessAssert.notNull(dataset, DataManagementErrorCode.DATASET_NOT_FOUND);
List<DatasetFile> datasetFiles = datasetFileRepository.findAllByDatasetId(datasetId);
List<DatasetFile> datasetFiles = datasetFileRepository.findAllVisibleByDatasetId(datasetId);
dataset.setFiles(datasetFiles);
applyVisibleFileCounts(Collections.singletonList(dataset));
return dataset;
@@ -439,7 +439,7 @@ public class DatasetApplicationService {
Map<String, Object> statistics = new HashMap<>();
List<DatasetFile> allFiles = datasetFileRepository.findAllByDatasetId(datasetId);
List<DatasetFile> allFiles = datasetFileRepository.findAllVisibleByDatasetId(datasetId);
List<DatasetFile> visibleFiles = filterVisibleFiles(allFiles);
long totalFiles = visibleFiles.size();
long completedFiles = visibleFiles.stream()

View File

@@ -7,5 +7,6 @@ package com.datamate.datamanagement.common.enums;
*/
public enum DuplicateMethod {
ERROR,
COVER
COVER,
VERSION
}

View File

@@ -152,11 +152,19 @@ public class Dataset extends BaseEntity<String> {
}
public void removeFile(DatasetFile file) {
if (this.files.remove(file)) {
this.fileCount = Math.max(0, this.fileCount - 1);
this.sizeBytes = Math.max(0, this.sizeBytes - (file.getFileSize() != null ? file.getFileSize() : 0L));
this.updatedAt = LocalDateTime.now();
if (file == null) {
return;
}
boolean removed = this.files.remove(file);
if (!removed && file.getId() != null) {
removed = this.files.removeIf(existing -> Objects.equals(existing.getId(), file.getId()));
}
if (!removed) {
return;
}
this.fileCount = Math.max(0, this.fileCount - 1);
this.sizeBytes = Math.max(0, this.sizeBytes - (file.getFileSize() != null ? file.getFileSize() : 0L));
this.updatedAt = LocalDateTime.now();
}
public void active() {

View File

@@ -22,22 +22,26 @@ import java.util.List;
@NoArgsConstructor
@AllArgsConstructor
@TableName("t_dm_dataset_files")
public class DatasetFile {
@TableId
private String id; // UUID
private String datasetId; // UUID
private String fileName;
private String filePath;
private String fileType; // JPG/PNG/DCM/TXT
private Long fileSize; // bytes
private String checkSum;
private String tags;
private String metadata;
private String status; // UPLOADED, PROCESSING, COMPLETED, ERROR
private LocalDateTime uploadTime;
private LocalDateTime lastAccessTime;
private LocalDateTime createdAt;
private LocalDateTime updatedAt;
public class DatasetFile {
@TableId
private String id; // UUID
private String datasetId; // UUID
private String fileName;
private String filePath;
/** 文件逻辑路径(相对数据集根目录,包含子目录) */
private String logicalPath;
/** 文件版本号(同一个 logicalPath 下递增) */
private Long version;
private String fileType; // JPG/PNG/DCM/TXT
private Long fileSize; // bytes
private String checkSum;
private String tags;
private String metadata;
private String status; // ACTIVE/ARCHIVED/DELETED/PROCESSING...
private LocalDateTime uploadTime;
private LocalDateTime lastAccessTime;
private LocalDateTime createdAt;
private LocalDateTime updatedAt;
/** 标记是否为目录(非持久化字段) */
@TableField(exist = false)

View File

@@ -12,13 +12,16 @@ import lombok.Setter;
@Setter
@NoArgsConstructor
@AllArgsConstructor
public class DatasetFileUploadCheckInfo {
/** 数据集id */
private String datasetId;
/** 是否为压缩包上传 */
private boolean hasArchive;
/** 目标子目录前缀,例如 "images/",为空表示数据集根目录 */
private String prefix;
}
public class DatasetFileUploadCheckInfo {
/** 数据集id */
private String datasetId;
/** 是否为压缩包上传 */
private boolean hasArchive;
/** 目标子目录前缀,例如 "images/",为空表示数据集根目录 */
private String prefix;
/** 上传临时落盘目录(仅服务端使用,不对外暴露) */
private String stagingPath;
}

View File

@@ -24,8 +24,19 @@ public interface DatasetFileRepository extends IRepository<DatasetFile> {
List<DatasetFile> findAllByDatasetId(String datasetId);
/**
* 查询数据集内“可见文件”(默认不包含历史归档版本)。
* 约定:status 为 NULL 视为可见;status = ARCHIVED 视为历史版本。
*/
List<DatasetFile> findAllVisibleByDatasetId(String datasetId);
DatasetFile findByDatasetIdAndFileName(String datasetId, String fileName);
/**
* 查询指定逻辑路径的最新版本(ACTIVE/NULL)。
*/
DatasetFile findLatestByDatasetIdAndLogicalPath(String datasetId, String logicalPath);
IPage<DatasetFile> findByCriteria(String datasetId, String fileType, String status, String name,
Boolean hasAnnotation, IPage<DatasetFile> page);

View File

@@ -25,6 +25,8 @@ public class DatasetFileRepositoryImpl extends CrudRepository<DatasetFileMapper,
private final DatasetFileMapper datasetFileMapper;
private static final String ANNOTATION_EXISTS_SQL =
"SELECT 1 FROM t_dm_annotation_results ar WHERE ar.file_id = t_dm_dataset_files.id";
private static final String FILE_STATUS_ARCHIVED = "ARCHIVED";
private static final String FILE_STATUS_ACTIVE = "ACTIVE";
@Override
public Long countByDatasetId(String datasetId) {
@@ -51,19 +53,54 @@ public class DatasetFileRepositoryImpl extends CrudRepository<DatasetFileMapper,
return datasetFileMapper.findAllByDatasetId(datasetId);
}
@Override
public List<DatasetFile> findAllVisibleByDatasetId(String datasetId) {
return datasetFileMapper.selectList(new LambdaQueryWrapper<DatasetFile>()
.eq(DatasetFile::getDatasetId, datasetId)
.and(wrapper -> wrapper.isNull(DatasetFile::getStatus)
.or()
.ne(DatasetFile::getStatus, FILE_STATUS_ARCHIVED))
.orderByDesc(DatasetFile::getUploadTime));
}
@Override
public DatasetFile findByDatasetIdAndFileName(String datasetId, String fileName) {
return datasetFileMapper.findByDatasetIdAndFileName(datasetId, fileName);
}
@Override
public DatasetFile findLatestByDatasetIdAndLogicalPath(String datasetId, String logicalPath) {
if (!StringUtils.hasText(datasetId) || !StringUtils.hasText(logicalPath)) {
return null;
}
return datasetFileMapper.selectOne(new LambdaQueryWrapper<DatasetFile>()
.eq(DatasetFile::getDatasetId, datasetId)
.eq(DatasetFile::getLogicalPath, logicalPath)
.and(wrapper -> wrapper.isNull(DatasetFile::getStatus)
.or()
.eq(DatasetFile::getStatus, FILE_STATUS_ACTIVE))
.orderByDesc(DatasetFile::getVersion)
.orderByDesc(DatasetFile::getUploadTime)
.last("LIMIT 1"));
}
public IPage<DatasetFile> findByCriteria(String datasetId, String fileType, String status, String name,
Boolean hasAnnotation, IPage<DatasetFile> page) {
return datasetFileMapper.selectPage(page, new LambdaQueryWrapper<DatasetFile>()
.eq(DatasetFile::getDatasetId, datasetId)
.eq(StringUtils.hasText(fileType), DatasetFile::getFileType, fileType)
.eq(StringUtils.hasText(status), DatasetFile::getStatus, status)
.like(StringUtils.hasText(name), DatasetFile::getFileName, name)
.exists(Boolean.TRUE.equals(hasAnnotation), ANNOTATION_EXISTS_SQL));
LambdaQueryWrapper<DatasetFile> wrapper = new LambdaQueryWrapper<DatasetFile>()
.eq(DatasetFile::getDatasetId, datasetId)
.eq(StringUtils.hasText(fileType), DatasetFile::getFileType, fileType)
.like(StringUtils.hasText(name), DatasetFile::getFileName, name)
.exists(Boolean.TRUE.equals(hasAnnotation), ANNOTATION_EXISTS_SQL);
if (StringUtils.hasText(status)) {
wrapper.eq(DatasetFile::getStatus, status);
} else {
wrapper.and(visibility -> visibility.isNull(DatasetFile::getStatus)
.or()
.ne(DatasetFile::getStatus, FILE_STATUS_ARCHIVED));
}
return datasetFileMapper.selectPage(page, wrapper);
}
@Override

View File

@@ -3,7 +3,7 @@
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="com.datamate.datamanagement.infrastructure.persistence.mapper.DatasetFileMapper">
<sql id="Base_Column_List">
id, dataset_id, file_name, file_path, file_type, file_size, check_sum, tags, metadata, status,
id, dataset_id, file_name, file_path, logical_path, version, file_type, file_size, check_sum, tags, metadata, status,
upload_time, last_access_time, created_at, updated_at
</sql>
@@ -39,13 +39,17 @@
</select>
<select id="countByDatasetId" parameterType="string" resultType="long">
SELECT COUNT(*) FROM t_dm_dataset_files WHERE dataset_id = #{datasetId}
SELECT COUNT(*)
FROM t_dm_dataset_files
WHERE dataset_id = #{datasetId}
AND (status IS NULL OR status <> 'ARCHIVED')
</select>
<select id="countNonDerivedByDatasetId" parameterType="string" resultType="long">
SELECT COUNT(*)
FROM t_dm_dataset_files
WHERE dataset_id = #{datasetId}
AND (status IS NULL OR status <> 'ARCHIVED')
AND (metadata IS NULL OR JSON_EXTRACT(metadata, '$.derived_from_file_id') IS NULL)
</select>
@@ -54,13 +58,19 @@
</select>
<select id="sumSizeByDatasetId" parameterType="string" resultType="long">
SELECT COALESCE(SUM(file_size), 0) FROM t_dm_dataset_files WHERE dataset_id = #{datasetId}
SELECT COALESCE(SUM(file_size), 0)
FROM t_dm_dataset_files
WHERE dataset_id = #{datasetId}
AND (status IS NULL OR status <> 'ARCHIVED')
</select>
<select id="findByDatasetIdAndFileName" resultType="com.datamate.datamanagement.domain.model.dataset.DatasetFile">
SELECT <include refid="Base_Column_List"/>
FROM t_dm_dataset_files
WHERE dataset_id = #{datasetId} AND file_name = #{fileName}
WHERE dataset_id = #{datasetId}
AND file_name = #{fileName}
AND (status IS NULL OR status <> 'ARCHIVED')
ORDER BY version DESC, upload_time DESC
LIMIT 1
</select>
@@ -91,6 +101,8 @@
UPDATE t_dm_dataset_files
SET file_name = #{fileName},
file_path = #{filePath},
logical_path = #{logicalPath},
version = #{version},
file_type = #{fileType},
file_size = #{fileSize},
upload_time = #{uploadTime},
@@ -126,6 +138,7 @@
<foreach collection="datasetIds" item="datasetId" open="(" separator="," close=")">
#{datasetId}
</foreach>
AND (status IS NULL OR status <> 'ARCHIVED')
AND (metadata IS NULL OR JSON_EXTRACT(metadata, '$.derived_from_file_id') IS NULL)
GROUP BY dataset_id
</select>

View File

@@ -0,0 +1,147 @@
package com.datamate.datamanagement.application;
import com.datamate.common.domain.service.FileService;
import com.datamate.datamanagement.domain.model.dataset.Dataset;
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.ArgumentCaptor;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.util.List;
import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;
@ExtendWith(MockitoExtension.class)
class DatasetFileApplicationServiceVersioningTest {
@TempDir
Path tempDir;
@Mock
DatasetFileRepository datasetFileRepository;
@Mock
DatasetRepository datasetRepository;
@Mock
FileService fileService;
@Mock
PdfTextExtractAsyncService pdfTextExtractAsyncService;
@Mock
DatasetFilePreviewService datasetFilePreviewService;
@Test
void copyFilesToDatasetDirWithSourceRoot_shouldArchiveOldFileAndCreateNewVersionWhenDuplicateLogicalPath()
throws Exception {
String datasetId = "dataset-1";
Path datasetRoot = tempDir.resolve("dataset-root");
Files.createDirectories(datasetRoot);
Path sourceRoot = tempDir.resolve("source-root");
Files.createDirectories(sourceRoot);
Path existingPath = datasetRoot.resolve("a.txt");
Files.writeString(existingPath, "old-content", StandardCharsets.UTF_8);
Path incomingPath = sourceRoot.resolve("a.txt");
Files.writeString(incomingPath, "new-content", StandardCharsets.UTF_8);
Dataset dataset = new Dataset();
dataset.setId(datasetId);
dataset.setPath(datasetRoot.toString());
DatasetFile oldRecord = DatasetFile.builder()
.id("old-file-id")
.datasetId(datasetId)
.fileName("a.txt")
.filePath(existingPath.toString())
.logicalPath(null)
.version(null)
.status(null)
.fileSize(Files.size(existingPath))
.build();
when(datasetRepository.getById(datasetId)).thenReturn(dataset);
when(datasetFileRepository.findAllVisibleByDatasetId(datasetId)).thenReturn(List.of(oldRecord));
when(datasetFileRepository.findLatestByDatasetIdAndLogicalPath(anyString(), anyString())).thenReturn(null);
DatasetFileApplicationService service = new DatasetFileApplicationService(
datasetFileRepository,
datasetRepository,
fileService,
pdfTextExtractAsyncService,
datasetFilePreviewService
);
List<DatasetFile> copied = service.copyFilesToDatasetDirWithSourceRoot(
datasetId,
sourceRoot,
List.of(incomingPath.toString())
);
assertThat(copied).hasSize(1);
assertThat(Files.readString(existingPath, StandardCharsets.UTF_8)).isEqualTo("new-content");
String logicalPathHash = sha256Hex("a.txt");
Path archivedPath = datasetRoot
.resolve(".datamate")
.resolve("versions")
.resolve(logicalPathHash)
.resolve("v1")
.resolve("old-file-id__a.txt")
.toAbsolutePath()
.normalize();
assertThat(Files.exists(archivedPath)).isTrue();
assertThat(Files.readString(archivedPath, StandardCharsets.UTF_8)).isEqualTo("old-content");
ArgumentCaptor<DatasetFile> archivedCaptor = ArgumentCaptor.forClass(DatasetFile.class);
verify(datasetFileRepository).updateById(archivedCaptor.capture());
DatasetFile archivedRecord = archivedCaptor.getValue();
assertThat(archivedRecord.getId()).isEqualTo("old-file-id");
assertThat(archivedRecord.getStatus()).isEqualTo("ARCHIVED");
assertThat(archivedRecord.getLogicalPath()).isEqualTo("a.txt");
assertThat(archivedRecord.getVersion()).isEqualTo(1L);
assertThat(Paths.get(archivedRecord.getFilePath()).toAbsolutePath().normalize()).isEqualTo(archivedPath);
ArgumentCaptor<DatasetFile> createdCaptor = ArgumentCaptor.forClass(DatasetFile.class);
verify(datasetFileRepository).saveOrUpdate(createdCaptor.capture());
DatasetFile newRecord = createdCaptor.getValue();
assertThat(newRecord.getId()).isNotEqualTo("old-file-id");
assertThat(newRecord.getStatus()).isEqualTo("ACTIVE");
assertThat(newRecord.getLogicalPath()).isEqualTo("a.txt");
assertThat(newRecord.getVersion()).isEqualTo(2L);
assertThat(Paths.get(newRecord.getFilePath()).toAbsolutePath().normalize()).isEqualTo(existingPath.toAbsolutePath().normalize());
}
private static String sha256Hex(String value) {
try {
MessageDigest digest = MessageDigest.getInstance("SHA-256");
byte[] hashed = digest.digest((value == null ? "" : value).getBytes(StandardCharsets.UTF_8));
StringBuilder builder = new StringBuilder(hashed.length * 2);
for (byte b : hashed) {
builder.append(String.format("%02x", b));
}
return builder.toString();
} catch (Exception e) {
return Integer.toHexString((value == null ? "" : value).hashCode());
}
}
}