feat(data-management): implement dataset file version management and internal path protection
- Replace dataset file queries with variants that return only visible files
- Introduce file status management (ACTIVE/ARCHIVED) and an internal directory structure
- Implement a duplicate-file handling strategy that supports a versioning mode instead of overwriting
- Add internal data directory protection to block access to system directories such as .datamate
- Rework the file upload flow with a staging directory and post-transaction cleanup
- Implement file version archiving, keeping historical versions in a dedicated storage location
- Improve file path normalization and security validation logic
- Fix file deletion logic so archived files are not removed by mistake
- Update dataset archive download to exclude internal system files
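The DatasetFileApplicationService diff that carries most of this flow is suppressed below as too large. The following is a rough, hypothetical sketch of the archive-then-version step that the new unit test (near the end of this diff) pins down; the class, method, and helper names are illustrative assumptions, not the committed code.

    import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
    import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;

    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.nio.file.Paths;
    import java.nio.file.StandardCopyOption;
    import java.security.MessageDigest;
    import java.util.UUID;

    // Illustrative sketch only, not the committed DatasetFileApplicationService code.
    class FileVersioningSketch {

        // Archives the current head of a logical path and registers the new upload as the next version.
        static DatasetFile archiveAndCreateNewVersion(DatasetFileRepository repo,
                                                      Path datasetRoot,
                                                      DatasetFile current,
                                                      Path newContentPath,
                                                      String logicalPath) throws Exception {
            long oldVersion = current.getVersion() == null ? 1L : current.getVersion();

            // Historical copies go to the protected internal area that listings and downloads skip:
            // <datasetRoot>/.datamate/versions/<sha256(logicalPath)>/v<oldVersion>/<fileId>__<fileName>
            Path archived = datasetRoot.resolve(".datamate").resolve("versions")
                    .resolve(sha256Hex(logicalPath))
                    .resolve("v" + oldVersion)
                    .resolve(current.getId() + "__" + current.getFileName());
            Files.createDirectories(archived.getParent());
            Files.move(Paths.get(current.getFilePath()), archived, StandardCopyOption.REPLACE_EXISTING);

            // The existing record is kept but flagged ARCHIVED and re-pointed at the archive copy.
            repo.updateById(DatasetFile.builder()
                    .id(current.getId())
                    .datasetId(current.getDatasetId())
                    .fileName(current.getFileName())
                    .filePath(archived.toString())
                    .logicalPath(logicalPath)
                    .version(oldVersion)
                    .status("ARCHIVED")
                    .fileSize(current.getFileSize())
                    .build());

            // The new upload becomes the ACTIVE head and keeps the original visible path.
            DatasetFile next = DatasetFile.builder()
                    .id(UUID.randomUUID().toString())
                    .datasetId(current.getDatasetId())
                    .fileName(current.getFileName())
                    .filePath(newContentPath.toString())
                    .logicalPath(logicalPath)
                    .version(oldVersion + 1)
                    .status("ACTIVE")
                    .fileSize(Files.size(newContentPath))
                    .build();
            repo.saveOrUpdate(next);
            return next;
        }

        private static String sha256Hex(String value) throws Exception {
            MessageDigest digest = MessageDigest.getInstance("SHA-256");
            StringBuilder hex = new StringBuilder();
            for (byte b : digest.digest(value.getBytes(StandardCharsets.UTF_8))) {
                hex.append(String.format("%02x", b));
            }
            return hex.toString();
        }
    }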
@@ -164,7 +164,7 @@ public class DatasetApplicationService {
     public Dataset getDataset(String datasetId) {
         Dataset dataset = datasetRepository.getById(datasetId);
         BusinessAssert.notNull(dataset, DataManagementErrorCode.DATASET_NOT_FOUND);
-        List<DatasetFile> datasetFiles = datasetFileRepository.findAllByDatasetId(datasetId);
+        List<DatasetFile> datasetFiles = datasetFileRepository.findAllVisibleByDatasetId(datasetId);
         dataset.setFiles(datasetFiles);
         applyVisibleFileCounts(Collections.singletonList(dataset));
         return dataset;
@@ -439,7 +439,7 @@ public class DatasetApplicationService {

         Map<String, Object> statistics = new HashMap<>();

-        List<DatasetFile> allFiles = datasetFileRepository.findAllByDatasetId(datasetId);
+        List<DatasetFile> allFiles = datasetFileRepository.findAllVisibleByDatasetId(datasetId);
         List<DatasetFile> visibleFiles = filterVisibleFiles(allFiles);
         long totalFiles = visibleFiles.size();
         long completedFiles = visibleFiles.stream()
File diff suppressed because it is too large
@@ -7,5 +7,6 @@ package com.datamate.datamanagement.common.enums;
  */
 public enum DuplicateMethod {
     ERROR,
-    COVER
+    COVER,
+    VERSION
 }
@@ -152,11 +152,19 @@ public class Dataset extends BaseEntity<String> {
     }

     public void removeFile(DatasetFile file) {
-        if (this.files.remove(file)) {
-            this.fileCount = Math.max(0, this.fileCount - 1);
-            this.sizeBytes = Math.max(0, this.sizeBytes - (file.getFileSize() != null ? file.getFileSize() : 0L));
-            this.updatedAt = LocalDateTime.now();
-        }
+        if (file == null) {
+            return;
+        }
+        boolean removed = this.files.remove(file);
+        if (!removed && file.getId() != null) {
+            removed = this.files.removeIf(existing -> Objects.equals(existing.getId(), file.getId()));
+        }
+        if (!removed) {
+            return;
+        }
+        this.fileCount = Math.max(0, this.fileCount - 1);
+        this.sizeBytes = Math.max(0, this.sizeBytes - (file.getFileSize() != null ? file.getFileSize() : 0L));
+        this.updatedAt = LocalDateTime.now();
     }

     public void active() {
@@ -22,22 +22,26 @@ import java.util.List;
 @NoArgsConstructor
 @AllArgsConstructor
 @TableName("t_dm_dataset_files")
-public class DatasetFile {
-    @TableId
-    private String id; // UUID
-    private String datasetId; // UUID
-    private String fileName;
-    private String filePath;
-    private String fileType; // JPG/PNG/DCM/TXT
-    private Long fileSize; // bytes
-    private String checkSum;
-    private String tags;
-    private String metadata;
-    private String status; // UPLOADED, PROCESSING, COMPLETED, ERROR
-    private LocalDateTime uploadTime;
-    private LocalDateTime lastAccessTime;
-    private LocalDateTime createdAt;
-    private LocalDateTime updatedAt;
+public class DatasetFile {
+    @TableId
+    private String id; // UUID
+    private String datasetId; // UUID
+    private String fileName;
+    private String filePath;
+    /** 文件逻辑路径(相对数据集根目录,包含子目录) */
+    private String logicalPath;
+    /** 文件版本号(同一个 logicalPath 下递增) */
+    private Long version;
+    private String fileType; // JPG/PNG/DCM/TXT
+    private Long fileSize; // bytes
+    private String checkSum;
+    private String tags;
+    private String metadata;
+    private String status; // ACTIVE/ARCHIVED/DELETED/PROCESSING...
+    private LocalDateTime uploadTime;
+    private LocalDateTime lastAccessTime;
+    private LocalDateTime createdAt;
+    private LocalDateTime updatedAt;

     /** 标记是否为目录(非持久化字段) */
     @TableField(exist = false)
@@ -12,13 +12,16 @@ import lombok.Setter;
 @Setter
 @NoArgsConstructor
 @AllArgsConstructor
-public class DatasetFileUploadCheckInfo {
-    /** 数据集id */
-    private String datasetId;
-
-    /** 是否为压缩包上传 */
-    private boolean hasArchive;
-
-    /** 目标子目录前缀,例如 "images/",为空表示数据集根目录 */
-    private String prefix;
-}
+public class DatasetFileUploadCheckInfo {
+    /** 数据集id */
+    private String datasetId;
+
+    /** 是否为压缩包上传 */
+    private boolean hasArchive;
+
+    /** 目标子目录前缀,例如 "images/",为空表示数据集根目录 */
+    private String prefix;
+
+    /** 上传临时落盘目录(仅服务端使用,不对外暴露) */
+    private String stagingPath;
+}
@@ -24,8 +24,19 @@ public interface DatasetFileRepository extends IRepository<DatasetFile> {

     List<DatasetFile> findAllByDatasetId(String datasetId);

+    /**
+     * 查询数据集内“可见文件”(默认不包含历史归档版本)。
+     * 约定:status 为 NULL 视为可见;status = ARCHIVED 视为历史版本。
+     */
+    List<DatasetFile> findAllVisibleByDatasetId(String datasetId);
+
     DatasetFile findByDatasetIdAndFileName(String datasetId, String fileName);

+    /**
+     * 查询指定逻辑路径的最新版本(ACTIVE/NULL)。
+     */
+    DatasetFile findLatestByDatasetIdAndLogicalPath(String datasetId, String logicalPath);
+
     IPage<DatasetFile> findByCriteria(String datasetId, String fileType, String status, String name,
                                       Boolean hasAnnotation, IPage<DatasetFile> page);

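A minimal caller-side sketch of the visibility and version-lookup contract above; the variable names and the sample logical path are illustrative, and the version-bump rule mirrors the new unit test near the end of this diff (a legacy NULL-version head is archived as v1, so its replacement becomes v2).

    // Visible listing: NULL status counts as visible, ARCHIVED rows are hidden history.
    List<DatasetFile> visible = datasetFileRepository.findAllVisibleByDatasetId(datasetId);

    // Current head of a logical path; null when no ACTIVE/NULL record exists for it.
    DatasetFile head = datasetFileRepository.findLatestByDatasetIdAndLogicalPath(datasetId, "images/a.txt");
    long headVersion = (head == null) ? 0L : (head.getVersion() == null ? 1L : head.getVersion());
    long nextVersion = headVersion + 1;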
@@ -25,6 +25,8 @@ public class DatasetFileRepositoryImpl extends CrudRepository<DatasetFileMapper,
     private final DatasetFileMapper datasetFileMapper;
     private static final String ANNOTATION_EXISTS_SQL =
             "SELECT 1 FROM t_dm_annotation_results ar WHERE ar.file_id = t_dm_dataset_files.id";
+    private static final String FILE_STATUS_ARCHIVED = "ARCHIVED";
+    private static final String FILE_STATUS_ACTIVE = "ACTIVE";

     @Override
     public Long countByDatasetId(String datasetId) {
@@ -51,19 +53,54 @@ public class DatasetFileRepositoryImpl extends CrudRepository<DatasetFileMapper,
         return datasetFileMapper.findAllByDatasetId(datasetId);
     }

+    @Override
+    public List<DatasetFile> findAllVisibleByDatasetId(String datasetId) {
+        return datasetFileMapper.selectList(new LambdaQueryWrapper<DatasetFile>()
+                .eq(DatasetFile::getDatasetId, datasetId)
+                .and(wrapper -> wrapper.isNull(DatasetFile::getStatus)
+                        .or()
+                        .ne(DatasetFile::getStatus, FILE_STATUS_ARCHIVED))
+                .orderByDesc(DatasetFile::getUploadTime));
+    }
+
     @Override
     public DatasetFile findByDatasetIdAndFileName(String datasetId, String fileName) {
         return datasetFileMapper.findByDatasetIdAndFileName(datasetId, fileName);
     }

+    @Override
+    public DatasetFile findLatestByDatasetIdAndLogicalPath(String datasetId, String logicalPath) {
+        if (!StringUtils.hasText(datasetId) || !StringUtils.hasText(logicalPath)) {
+            return null;
+        }
+        return datasetFileMapper.selectOne(new LambdaQueryWrapper<DatasetFile>()
+                .eq(DatasetFile::getDatasetId, datasetId)
+                .eq(DatasetFile::getLogicalPath, logicalPath)
+                .and(wrapper -> wrapper.isNull(DatasetFile::getStatus)
+                        .or()
+                        .eq(DatasetFile::getStatus, FILE_STATUS_ACTIVE))
+                .orderByDesc(DatasetFile::getVersion)
+                .orderByDesc(DatasetFile::getUploadTime)
+                .last("LIMIT 1"));
+    }
+
     public IPage<DatasetFile> findByCriteria(String datasetId, String fileType, String status, String name,
                                              Boolean hasAnnotation, IPage<DatasetFile> page) {
-        return datasetFileMapper.selectPage(page, new LambdaQueryWrapper<DatasetFile>()
-                .eq(DatasetFile::getDatasetId, datasetId)
-                .eq(StringUtils.hasText(fileType), DatasetFile::getFileType, fileType)
-                .eq(StringUtils.hasText(status), DatasetFile::getStatus, status)
-                .like(StringUtils.hasText(name), DatasetFile::getFileName, name)
-                .exists(Boolean.TRUE.equals(hasAnnotation), ANNOTATION_EXISTS_SQL));
+        LambdaQueryWrapper<DatasetFile> wrapper = new LambdaQueryWrapper<DatasetFile>()
+                .eq(DatasetFile::getDatasetId, datasetId)
+                .eq(StringUtils.hasText(fileType), DatasetFile::getFileType, fileType)
+                .like(StringUtils.hasText(name), DatasetFile::getFileName, name)
+                .exists(Boolean.TRUE.equals(hasAnnotation), ANNOTATION_EXISTS_SQL);
+
+        if (StringUtils.hasText(status)) {
+            wrapper.eq(DatasetFile::getStatus, status);
+        } else {
+            wrapper.and(visibility -> visibility.isNull(DatasetFile::getStatus)
+                    .or()
+                    .ne(DatasetFile::getStatus, FILE_STATUS_ARCHIVED));
+        }
+
+        return datasetFileMapper.selectPage(page, wrapper);
     }

     @Override
@@ -3,7 +3,7 @@
         "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
 <mapper namespace="com.datamate.datamanagement.infrastructure.persistence.mapper.DatasetFileMapper">
     <sql id="Base_Column_List">
-        id, dataset_id, file_name, file_path, file_type, file_size, check_sum, tags, metadata, status,
+        id, dataset_id, file_name, file_path, logical_path, version, file_type, file_size, check_sum, tags, metadata, status,
         upload_time, last_access_time, created_at, updated_at
     </sql>

@@ -39,13 +39,17 @@
     </select>

     <select id="countByDatasetId" parameterType="string" resultType="long">
-        SELECT COUNT(*) FROM t_dm_dataset_files WHERE dataset_id = #{datasetId}
+        SELECT COUNT(*)
+        FROM t_dm_dataset_files
+        WHERE dataset_id = #{datasetId}
+          AND (status IS NULL OR status <> 'ARCHIVED')
     </select>

     <select id="countNonDerivedByDatasetId" parameterType="string" resultType="long">
         SELECT COUNT(*)
         FROM t_dm_dataset_files
         WHERE dataset_id = #{datasetId}
+          AND (status IS NULL OR status <> 'ARCHIVED')
           AND (metadata IS NULL OR JSON_EXTRACT(metadata, '$.derived_from_file_id') IS NULL)
     </select>

@@ -54,13 +58,19 @@
     </select>

     <select id="sumSizeByDatasetId" parameterType="string" resultType="long">
-        SELECT COALESCE(SUM(file_size), 0) FROM t_dm_dataset_files WHERE dataset_id = #{datasetId}
+        SELECT COALESCE(SUM(file_size), 0)
+        FROM t_dm_dataset_files
+        WHERE dataset_id = #{datasetId}
+          AND (status IS NULL OR status <> 'ARCHIVED')
     </select>

     <select id="findByDatasetIdAndFileName" resultType="com.datamate.datamanagement.domain.model.dataset.DatasetFile">
         SELECT <include refid="Base_Column_List"/>
         FROM t_dm_dataset_files
-        WHERE dataset_id = #{datasetId} AND file_name = #{fileName}
+        WHERE dataset_id = #{datasetId}
+          AND file_name = #{fileName}
+          AND (status IS NULL OR status <> 'ARCHIVED')
+        ORDER BY version DESC, upload_time DESC
+        LIMIT 1
     </select>

@@ -91,6 +101,8 @@
         UPDATE t_dm_dataset_files
         SET file_name = #{fileName},
             file_path = #{filePath},
+            logical_path = #{logicalPath},
+            version = #{version},
             file_type = #{fileType},
             file_size = #{fileSize},
             upload_time = #{uploadTime},
@@ -126,6 +138,7 @@
         <foreach collection="datasetIds" item="datasetId" open="(" separator="," close=")">
             #{datasetId}
         </foreach>
+          AND (status IS NULL OR status <> 'ARCHIVED')
           AND (metadata IS NULL OR JSON_EXTRACT(metadata, '$.derived_from_file_id') IS NULL)
         GROUP BY dataset_id
     </select>
@@ -0,0 +1,147 @@
package com.datamate.datamanagement.application;

import com.datamate.common.domain.service.FileService;
import com.datamate.datamanagement.domain.model.dataset.Dataset;
import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.ArgumentCaptor;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;

import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.util.List;

import static org.assertj.core.api.Assertions.assertThat;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

@ExtendWith(MockitoExtension.class)
class DatasetFileApplicationServiceVersioningTest {

    @TempDir
    Path tempDir;

    @Mock
    DatasetFileRepository datasetFileRepository;

    @Mock
    DatasetRepository datasetRepository;

    @Mock
    FileService fileService;

    @Mock
    PdfTextExtractAsyncService pdfTextExtractAsyncService;

    @Mock
    DatasetFilePreviewService datasetFilePreviewService;

    @Test
    void copyFilesToDatasetDirWithSourceRoot_shouldArchiveOldFileAndCreateNewVersionWhenDuplicateLogicalPath()
            throws Exception {
        String datasetId = "dataset-1";

        Path datasetRoot = tempDir.resolve("dataset-root");
        Files.createDirectories(datasetRoot);

        Path sourceRoot = tempDir.resolve("source-root");
        Files.createDirectories(sourceRoot);

        Path existingPath = datasetRoot.resolve("a.txt");
        Files.writeString(existingPath, "old-content", StandardCharsets.UTF_8);

        Path incomingPath = sourceRoot.resolve("a.txt");
        Files.writeString(incomingPath, "new-content", StandardCharsets.UTF_8);

        Dataset dataset = new Dataset();
        dataset.setId(datasetId);
        dataset.setPath(datasetRoot.toString());

        DatasetFile oldRecord = DatasetFile.builder()
                .id("old-file-id")
                .datasetId(datasetId)
                .fileName("a.txt")
                .filePath(existingPath.toString())
                .logicalPath(null)
                .version(null)
                .status(null)
                .fileSize(Files.size(existingPath))
                .build();

        when(datasetRepository.getById(datasetId)).thenReturn(dataset);
        when(datasetFileRepository.findAllVisibleByDatasetId(datasetId)).thenReturn(List.of(oldRecord));
        when(datasetFileRepository.findLatestByDatasetIdAndLogicalPath(anyString(), anyString())).thenReturn(null);

        DatasetFileApplicationService service = new DatasetFileApplicationService(
                datasetFileRepository,
                datasetRepository,
                fileService,
                pdfTextExtractAsyncService,
                datasetFilePreviewService
        );

        List<DatasetFile> copied = service.copyFilesToDatasetDirWithSourceRoot(
                datasetId,
                sourceRoot,
                List.of(incomingPath.toString())
        );

        assertThat(copied).hasSize(1);
        assertThat(Files.readString(existingPath, StandardCharsets.UTF_8)).isEqualTo("new-content");

        String logicalPathHash = sha256Hex("a.txt");
        Path archivedPath = datasetRoot
                .resolve(".datamate")
                .resolve("versions")
                .resolve(logicalPathHash)
                .resolve("v1")
                .resolve("old-file-id__a.txt")
                .toAbsolutePath()
                .normalize();

        assertThat(Files.exists(archivedPath)).isTrue();
        assertThat(Files.readString(archivedPath, StandardCharsets.UTF_8)).isEqualTo("old-content");

        ArgumentCaptor<DatasetFile> archivedCaptor = ArgumentCaptor.forClass(DatasetFile.class);
        verify(datasetFileRepository).updateById(archivedCaptor.capture());
        DatasetFile archivedRecord = archivedCaptor.getValue();
        assertThat(archivedRecord.getId()).isEqualTo("old-file-id");
        assertThat(archivedRecord.getStatus()).isEqualTo("ARCHIVED");
        assertThat(archivedRecord.getLogicalPath()).isEqualTo("a.txt");
        assertThat(archivedRecord.getVersion()).isEqualTo(1L);
        assertThat(Paths.get(archivedRecord.getFilePath()).toAbsolutePath().normalize()).isEqualTo(archivedPath);

        ArgumentCaptor<DatasetFile> createdCaptor = ArgumentCaptor.forClass(DatasetFile.class);
        verify(datasetFileRepository).saveOrUpdate(createdCaptor.capture());
        DatasetFile newRecord = createdCaptor.getValue();
        assertThat(newRecord.getId()).isNotEqualTo("old-file-id");
        assertThat(newRecord.getStatus()).isEqualTo("ACTIVE");
        assertThat(newRecord.getLogicalPath()).isEqualTo("a.txt");
        assertThat(newRecord.getVersion()).isEqualTo(2L);
        assertThat(Paths.get(newRecord.getFilePath()).toAbsolutePath().normalize()).isEqualTo(existingPath.toAbsolutePath().normalize());
    }

    private static String sha256Hex(String value) {
        try {
            MessageDigest digest = MessageDigest.getInstance("SHA-256");
            byte[] hashed = digest.digest((value == null ? "" : value).getBytes(StandardCharsets.UTF_8));
            StringBuilder builder = new StringBuilder(hashed.length * 2);
            for (byte b : hashed) {
                builder.append(String.format("%02x", b));
            }
            return builder.toString();
        } catch (Exception e) {
            return Integer.toHexString((value == null ? "" : value).hashCode());
        }
    }
}
@@ -143,7 +143,20 @@ public class ArchiveAnalyzer {
     private static Optional<FileUploadResult> extractEntity(ArchiveInputStream<?> archiveInputStream, ArchiveEntry archiveEntry, Path archivePath)
         throws IOException {
         byte[] buffer = new byte[DEFAULT_BUFFER_SIZE];
-        Path path = Paths.get(archivePath.getParent().toString(), archiveEntry.getName());
+        Path archiveRoot = archivePath.getParent().toAbsolutePath().normalize();
+        String entryName = archiveEntry.getName();
+        if (entryName == null || entryName.isBlank()) {
+            return Optional.empty();
+        }
+        entryName = entryName.replace("\\", "/");
+        while (entryName.startsWith("/")) {
+            entryName = entryName.substring(1);
+        }
+        Path path = archiveRoot.resolve(entryName).normalize();
+        if (!path.startsWith(archiveRoot)) {
+            log.warn("Skip unsafe archive entry path traversal: {}", archiveEntry.getName());
+            return Optional.empty();
+        }
         File file = path.toFile();
         long fileSize = 0L;
         FileUtils.createParentDirectories(file);
@@ -13,7 +13,10 @@ public class CommonUtils {
      * @return 文件名(带后缀)
      */
    public static String trimFilePath(String filePath) {
-        int lastSlashIndex = filePath.lastIndexOf(File.separator);
+        if (filePath == null || filePath.isBlank()) {
+            return "";
+        }
+        int lastSlashIndex = Math.max(filePath.lastIndexOf('/'), filePath.lastIndexOf('\\'));

         String filename = filePath;
         if (lastSlashIndex != -1) {
@@ -61,13 +61,15 @@ class DatasetFiles(Base):
     dataset_id = Column(String(36), nullable=False, comment="所属数据集ID(UUID)")
     file_name = Column(String(255), nullable=False, comment="文件名")
     file_path = Column(String(1000), nullable=False, comment="文件路径")
+    logical_path = Column(String(1000), nullable=False, comment="文件逻辑路径(相对数据集根目录)")
+    version = Column(BigInteger, nullable=False, default=1, comment="文件版本号(同 logical_path 递增)")
     file_type = Column(String(50), nullable=True, comment="文件格式:JPG/PNG/DCM/TXT等")
     file_size = Column(BigInteger, default=0, comment="文件大小(字节)")
     check_sum = Column(String(64), nullable=True, comment="文件校验和")
     tags = Column(JSON, nullable=True, comment="文件标签信息")
     tags_updated_at = Column(TIMESTAMP, nullable=True, comment="标签最后更新时间")
     dataset_filemetadata = Column("metadata", JSON, nullable=True, comment="文件元数据")
-    status = Column(String(50), default='ACTIVE', comment="文件状态:ACTIVE/DELETED/PROCESSING")
+    status = Column(String(50), default='ACTIVE', comment="文件状态:ACTIVE/ARCHIVED/DELETED/PROCESSING")
     upload_time = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="上传时间")
     last_access_time = Column(TIMESTAMP, nullable=True, comment="最后访问时间")
     created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
@@ -112,4 +114,4 @@ class Tag(Base):
     updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间")

     def __repr__(self):
-        return f"<Tag(id={self.id}, name={self.name}, category={self.category})>"
+        return f"<Tag(id={self.id}, name={self.name}, category={self.category})>"
@@ -372,15 +372,15 @@ def _register_output_dataset(
         )
         return

-    insert_file_sql = text(
-        """
-        INSERT INTO t_dm_dataset_files (
-            id, dataset_id, file_name, file_path, file_type, file_size, status
-        ) VALUES (
-            :id, :dataset_id, :file_name, :file_path, :file_type, :file_size, :status
-        )
-        """
-    )
+    insert_file_sql = text(
+        """
+        INSERT INTO t_dm_dataset_files (
+            id, dataset_id, file_name, file_path, logical_path, version, file_type, file_size, status
+        ) VALUES (
+            :id, :dataset_id, :file_name, :file_path, :logical_path, :version, :file_type, :file_size, :status
+        )
+        """
+    )
     update_dataset_stat_sql = text(
         """
         UPDATE t_dm_datasets
@@ -393,37 +393,43 @@ def _register_output_dataset(
     with SQLManager.create_connect() as conn:
         added_count = 0

-        for file_name, file_path, file_size in image_files:
-            ext = os.path.splitext(file_name)[1].lstrip(".").upper() or None
-            conn.execute(
-                insert_file_sql,
-                {
-                    "id": str(uuid.uuid4()),
-                    "dataset_id": output_dataset_id,
-                    "file_name": file_name,
-                    "file_path": file_path,
-                    "file_type": ext,
-                    "file_size": int(file_size),
-                    "status": "ACTIVE",
-                },
-            )
-            added_count += 1
-
-        for file_name, file_path, file_size in annotation_files:
-            ext = os.path.splitext(file_name)[1].lstrip(".").upper() or None
-            conn.execute(
-                insert_file_sql,
-                {
-                    "id": str(uuid.uuid4()),
-                    "dataset_id": output_dataset_id,
-                    "file_name": file_name,
-                    "file_path": file_path,
-                    "file_type": ext,
-                    "file_size": int(file_size),
-                    "status": "ACTIVE",
-                },
-            )
-            added_count += 1
+        for file_name, file_path, file_size in image_files:
+            ext = os.path.splitext(file_name)[1].lstrip(".").upper() or None
+            logical_path = os.path.relpath(file_path, output_dir).replace("\\", "/")
+            conn.execute(
+                insert_file_sql,
+                {
+                    "id": str(uuid.uuid4()),
+                    "dataset_id": output_dataset_id,
+                    "file_name": file_name,
+                    "file_path": file_path,
+                    "logical_path": logical_path,
+                    "version": 1,
+                    "file_type": ext,
+                    "file_size": int(file_size),
+                    "status": "ACTIVE",
+                },
+            )
+            added_count += 1
+
+        for file_name, file_path, file_size in annotation_files:
+            ext = os.path.splitext(file_name)[1].lstrip(".").upper() or None
+            logical_path = os.path.relpath(file_path, output_dir).replace("\\", "/")
+            conn.execute(
+                insert_file_sql,
+                {
+                    "id": str(uuid.uuid4()),
+                    "dataset_id": output_dataset_id,
+                    "file_name": file_name,
+                    "file_path": file_path,
+                    "logical_path": logical_path,
+                    "version": 1,
+                    "file_type": ext,
+                    "file_size": int(file_size),
+                    "status": "ACTIVE",
+                },
+            )
+            added_count += 1

         if added_count > 0:
             conn.execute(
@@ -1,9 +1,9 @@
 {
     "query_sql": "SELECT * FROM t_task_instance_info WHERE instance_id IN (:instance_id)",
     "insert_sql": "INSERT INTO t_task_instance_info (instance_id, meta_file_name, meta_file_type, meta_file_id, meta_file_size, file_id, file_size, file_type, file_name, file_path, status, operator_id, error_code, incremental, child_id, slice_num) VALUES (:instance_id, :meta_file_name, :meta_file_type, :meta_file_id, :meta_file_size, :file_id, :file_size, :file_type, :file_name, :file_path, :status, :operator_id, :error_code, :incremental, :child_id, :slice_num)",
-    "insert_dataset_file_sql": "INSERT INTO t_dm_dataset_files (id, dataset_id, file_name, file_path, file_type, file_size, status, upload_time, last_access_time, created_at, updated_at) VALUES (:id, :dataset_id, :file_name, :file_path, :file_type, :file_size, :status, :upload_time, :last_access_time, :created_at, :updated_at)",
+    "insert_dataset_file_sql": "INSERT INTO t_dm_dataset_files (id, dataset_id, file_name, file_path, logical_path, version, file_type, file_size, status, upload_time, last_access_time, created_at, updated_at) VALUES (:id, :dataset_id, :file_name, :file_path, :logical_path, :version, :file_type, :file_size, :status, :upload_time, :last_access_time, :created_at, :updated_at)",
     "insert_clean_result_sql": "INSERT INTO t_clean_result (instance_id, src_file_id, dest_file_id, src_name, dest_name, src_type, dest_type, src_size, dest_size, status, result) VALUES (:instance_id, :src_file_id, :dest_file_id, :src_name, :dest_name, :src_type, :dest_type, :src_size, :dest_size, :status, :result)",
-    "query_dataset_sql": "SELECT file_size FROM t_dm_dataset_files WHERE dataset_id = :dataset_id",
+    "query_dataset_sql": "SELECT file_size FROM t_dm_dataset_files WHERE dataset_id = :dataset_id AND (status IS NULL OR status <> 'ARCHIVED')",
     "update_dataset_sql": "UPDATE t_dm_datasets SET size_bytes = :total_size, file_count = :file_count WHERE id = :dataset_id;",
     "update_task_sql": "UPDATE t_clean_task SET status = :status, after_size = :total_size, finished_at = :finished_time WHERE id = :task_id",
     "create_tables_sql": "CREATE TABLE IF NOT EXISTS t_task_instance_info (instance_id VARCHAR(255), meta_file_name TEXT, meta_file_type VARCHAR(100), meta_file_id BIGINT, meta_file_size VARCHAR(100), file_id BIGINT, file_size VARCHAR(100), file_type VARCHAR(100), file_name TEXT, file_path TEXT, status INT, operator_id VARCHAR(255), error_code VARCHAR(100), incremental VARCHAR(50), child_id BIGINT, slice_num INT DEFAULT 0);",
@@ -14,4 +14,4 @@
     "delete_similar_img_tables_sql": "DELETE FROM operator_similar_img_features WHERE flow_id = :flow_id",
     "create_similar_text_tables_sql": "CREATE TABLE IF NOT EXISTS operators_similar_text_features (id INT AUTO_INCREMENT PRIMARY KEY, task_uuid VARCHAR(255),file_feature TEXT,file_name TEXT,timestamp DATETIME);",
     "delete_similar_text_tables_sql": "DELETE FROM operators_similar_text_features WHERE flow_id = :flow_id"
-}
+}
@@ -54,19 +54,22 @@ CREATE TABLE IF NOT EXISTS t_dm_dataset_files (
     dataset_id VARCHAR(36) NOT NULL COMMENT '所属数据集ID(UUID)',
     file_name VARCHAR(255) NOT NULL COMMENT '文件名',
     file_path VARCHAR(1000) NOT NULL COMMENT '文件路径',
+    logical_path VARCHAR(1000) NOT NULL COMMENT '文件逻辑路径(相对数据集根目录)',
+    version BIGINT NOT NULL DEFAULT 1 COMMENT '文件版本号(同 logical_path 递增)',
     file_type VARCHAR(50) COMMENT '文件格式:JPG/PNG/DCM/TXT等',
     file_size BIGINT DEFAULT 0 COMMENT '文件大小(字节)',
     check_sum VARCHAR(64) COMMENT '文件校验和',
     tags JSON COMMENT '文件标签信息',
     tags_updated_at TIMESTAMP NULL COMMENT '标签最后更新时间',
     metadata JSON COMMENT '文件元数据',
-    status VARCHAR(50) DEFAULT 'ACTIVE' COMMENT '文件状态:ACTIVE/DELETED/PROCESSING',
+    status VARCHAR(50) DEFAULT 'ACTIVE' COMMENT '文件状态:ACTIVE/ARCHIVED/DELETED/PROCESSING',
     upload_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '上传时间',
     last_access_time TIMESTAMP NULL COMMENT '最后访问时间',
     created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
     updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
     FOREIGN KEY (dataset_id) REFERENCES t_dm_datasets(id) ON DELETE CASCADE,
     INDEX idx_dm_dataset (dataset_id),
+    INDEX idx_dm_dataset_logical_path (dataset_id, logical_path, version),
     INDEX idx_dm_file_type (file_type),
     INDEX idx_dm_file_status (status),
     INDEX idx_dm_upload_time (upload_time)