feat(repository): add lookup of source file IDs that have derived files

- Add a findSourceFileIdsWithDerivedFiles method definition to the DatasetFileRepository interface
- Implement the method in the DatasetFileRepositoryImpl class
- Add the query logic that returns source file IDs from records whose metadata contains derived_from_file_id
- Provide complete JavaDoc describing the method's purpose and parameters
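A derived TXT file records its source document in the dataset-file metadata under derived_from_file_id, and the new repository call returns those source IDs so callers can hide the already-converted PDF/DOC/DOCX entries. A minimal usage sketch, assuming only the types touched by this commit (DatasetFileRepository, DatasetFile); the helper class itself is hypothetical:

import java.util.List;
import java.util.stream.Collectors;

// Hypothetical helper; findSourceFileIdsWithDerivedFiles and DatasetFile come from this commit.
class DerivedSourceFilter {
    private final DatasetFileRepository datasetFileRepository;

    DerivedSourceFilter(DatasetFileRepository datasetFileRepository) {
        this.datasetFileRepository = datasetFileRepository;
    }

    /** Drops files whose id appears as a derived_from_file_id, i.e. sources that already have a TXT derivative. */
    List<DatasetFile> withoutConvertedSources(String datasetId, List<DatasetFile> files) {
        List<String> sourceFileIds = datasetFileRepository.findSourceFileIdsWithDerivedFiles(datasetId);
        return files.stream()
                .filter(file -> !sourceFileIds.contains(file.getId()))
                .collect(Collectors.toList());
    }
}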

@@ -13,12 +13,12 @@ import com.datamate.common.infrastructure.exception.CommonErrorCode;
 import com.datamate.common.infrastructure.exception.SystemErrorCode;
 import com.datamate.common.interfaces.PagedResponse;
 import com.datamate.common.interfaces.PagingQuery;
 import com.datamate.datamanagement.common.enums.DuplicateMethod;
 import com.datamate.datamanagement.common.enums.DatasetType;
 import com.datamate.datamanagement.domain.contants.DatasetConstant;
 import com.datamate.datamanagement.domain.model.dataset.Dataset;
 import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
 import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckInfo;
 import com.datamate.datamanagement.infrastructure.exception.DataManagementErrorCode;
 import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
 import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
@@ -66,16 +66,16 @@ import java.util.stream.Stream;
 @Slf4j
 @Service
 @Transactional
 public class DatasetFileApplicationService {
     private static final String PDF_FILE_TYPE = "pdf";
     private static final String DOC_FILE_TYPE = "doc";
     private static final String DOCX_FILE_TYPE = "docx";
     private static final Set<String> DOCUMENT_TEXT_FILE_TYPES = Set.of(PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE);

     private final DatasetFileRepository datasetFileRepository;
     private final DatasetRepository datasetRepository;
     private final FileService fileService;
     private final PdfTextExtractAsyncService pdfTextExtractAsyncService;

     @Value("${datamate.data-management.base-path:/dataset}")
     private String datasetBasePath;
@@ -83,27 +83,62 @@ public class DatasetFileApplicationService {
     @Value("${datamate.data-management.file.duplicate:COVER}")
     private DuplicateMethod duplicateMethod;

     @Autowired
     public DatasetFileApplicationService(DatasetFileRepository datasetFileRepository,
                                          DatasetRepository datasetRepository,
                                          FileService fileService,
                                          PdfTextExtractAsyncService pdfTextExtractAsyncService) {
         this.datasetFileRepository = datasetFileRepository;
         this.datasetRepository = datasetRepository;
         this.fileService = fileService;
         this.pdfTextExtractAsyncService = pdfTextExtractAsyncService;
     }

     /**
      * 获取数据集文件列表
      */
     @Transactional(readOnly = true)
     public PagedResponse<DatasetFile> getDatasetFiles(String datasetId, String fileType, String status, String name,
                                                       Boolean hasAnnotation, PagingQuery pagingQuery) {
-        IPage<DatasetFile> page = new Page<>(pagingQuery.getPage(), pagingQuery.getSize());
-        IPage<DatasetFile> files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, hasAnnotation, page);
-        return PagedResponse.of(files);
+        return getDatasetFiles(datasetId, fileType, status, name, hasAnnotation, false, pagingQuery);
     }
+
+    /**
+     * 获取数据集文件列表,支持排除已被转换为TXT的源文档文件
+     *
+     * @param datasetId 数据集ID
+     * @param fileType 文件类型过滤
+     * @param status 状态过滤
+     * @param name 文件名模糊查询
+     * @param hasAnnotation 是否有标注
+     * @param excludeSourceDocuments 是否排除已被转换为TXT的源文档(PDF/DOC/DOCX)
+     * @param pagingQuery 分页参数
+     * @return 分页文件列表
+     */
+    @Transactional(readOnly = true)
+    public PagedResponse<DatasetFile> getDatasetFiles(String datasetId, String fileType, String status, String name,
+                                                      Boolean hasAnnotation, boolean excludeSourceDocuments, PagingQuery pagingQuery) {
+        IPage<DatasetFile> page = new Page<>(pagingQuery.getPage(), pagingQuery.getSize());
+        IPage<DatasetFile> files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, hasAnnotation, page);
+
+        if (excludeSourceDocuments) {
+            // 查询所有作为衍生TXT文件源的文档文件ID
+            List<String> sourceFileIds = datasetFileRepository.findSourceFileIdsWithDerivedFiles(datasetId);
+            if (!sourceFileIds.isEmpty()) {
+                // 过滤掉源文件
+                List<DatasetFile> filteredRecords = files.getRecords().stream()
+                        .filter(file -> !sourceFileIds.contains(file.getId()))
+                        .collect(Collectors.toList());
+
+                // 重新构建分页结果
+                Page<DatasetFile> filteredPage = new Page<>(files.getCurrent(), files.getSize(), files.getTotal());
+                filteredPage.setRecords(filteredRecords);
+                return PagedResponse.of(filteredPage);
+            }
+        }
+
+        return PagedResponse.of(files);
+    }

     /**
      * 获取数据集文件列表
@@ -333,11 +368,11 @@ public class DatasetFileApplicationService {
      * @return 请求id
      */
     @Transactional
     public String preUpload(UploadFilesPreRequest chunkUploadRequest, String datasetId) {
         Dataset dataset = datasetRepository.getById(datasetId);
         if (Objects.isNull(dataset)) {
             throw BusinessException.of(DataManagementErrorCode.DATASET_NOT_FOUND);
         }

         // 构建上传路径,如果有 prefix 则追加到路径中
         String prefix = Optional.ofNullable(chunkUploadRequest.getPrefix()).orElse("").trim();
@@ -346,13 +381,13 @@ public class DatasetFileApplicationService {
             prefix = prefix.substring(1);
         }

         String uploadPath = dataset.getPath();
         if (uploadPath == null || uploadPath.isBlank()) {
             uploadPath = datasetBasePath + File.separator + datasetId;
         }
         if (!prefix.isEmpty()) {
             uploadPath = uploadPath + File.separator + prefix.replace("/", File.separator);
         }

         ChunkUploadPreRequest request = ChunkUploadPreRequest.builder().build();
         request.setUploadPath(uploadPath);
@@ -414,24 +449,24 @@ public class DatasetFileApplicationService {
         for (FileUploadResult file : unpacked) {
             File savedFile = file.getSavedFile();
             LocalDateTime currentTime = LocalDateTime.now();
             DatasetFile datasetFile = DatasetFile.builder()
                     .id(UUID.randomUUID().toString())
                     .datasetId(datasetId)
                     .fileSize(savedFile.length())
                     .uploadTime(currentTime)
                     .lastAccessTime(currentTime)
                     .fileName(file.getFileName())
                     .filePath(savedFile.getPath())
                     .fileType(AnalyzerUtils.getExtension(file.getFileName()))
                     .build();
             setDatasetFileId(datasetFile, dataset);
             datasetFileRepository.saveOrUpdate(datasetFile);
             dataset.addFile(datasetFile);
             triggerPdfTextExtraction(dataset, datasetFile);
         }
         dataset.active();
         datasetRepository.updateById(dataset);
     }

     /**
      * 在数据集下创建子目录
@@ -697,29 +732,29 @@ public class DatasetFileApplicationService {
             dataset.addFile(datasetFile);
             copiedFiles.add(datasetFile);
         }
         datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
         dataset.active();
         datasetRepository.updateById(dataset);
         CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset));
         return copiedFiles;
     }

     private void copyFilesToDatasetDir(List<String> sourcePaths, Dataset dataset) {
         for (String sourcePath : sourcePaths) {
             Path sourceFilePath = Paths.get(sourcePath);
             Path targetFilePath = Paths.get(dataset.getPath(), sourceFilePath.getFileName().toString());
             try {
                 Files.createDirectories(Path.of(dataset.getPath()));
                 Files.copy(sourceFilePath, targetFilePath);
                 DatasetFile datasetFile = datasetFileRepository.findByDatasetIdAndFileName(
                         dataset.getId(),
                         sourceFilePath.getFileName().toString()
                 );
                 triggerPdfTextExtraction(dataset, datasetFile);
             } catch (IOException e) {
                 log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
             }
         }
     }

     /**
@@ -765,30 +800,30 @@ public class DatasetFileApplicationService {
                     .lastAccessTime(currentTime)
                     .metadata(metadata)
                     .build();
             setDatasetFileId(datasetFile, dataset);
             dataset.addFile(datasetFile);
             addedFiles.add(datasetFile);
             triggerPdfTextExtraction(dataset, datasetFile);
         }
         datasetFileRepository.saveOrUpdateBatch(addedFiles, 100);
         dataset.active();
         datasetRepository.updateById(dataset);
         // Note: addFilesToDataset only creates DB records, no file system operations
         // If file copy is needed, use copyFilesToDatasetDir endpoint instead
         return addedFiles;
     }

     private void triggerPdfTextExtraction(Dataset dataset, DatasetFile datasetFile) {
         if (dataset == null || datasetFile == null) {
             return;
         }
         if (dataset.getDatasetType() != DatasetType.TEXT) {
             return;
         }
         String fileType = datasetFile.getFileType();
         if (fileType == null || !DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT))) {
             return;
         }
         pdfTextExtractAsyncService.extractPdfText(dataset.getId(), datasetFile.getId());
     }
 }

@@ -29,4 +29,13 @@ public interface DatasetFileMapper extends BaseMapper<DatasetFile> {
     int updateFilePathPrefix(@Param("datasetId") String datasetId,
                              @Param("oldPrefix") String oldPrefix,
                              @Param("newPrefix") String newPrefix);
+
+    /**
+     * 查询数据集中所有作为衍生文件源文件的ID列表
+     * 通过查询 metadata 中包含 derived_from_file_id 的字段值
+     *
+     * @param datasetId 数据集ID
+     * @return 源文件ID列表
+     */
+    List<String> findSourceFileIdsWithDerivedFiles(@Param("datasetId") String datasetId);
 }

@@ -27,4 +27,13 @@ public interface DatasetFileRepository extends IRepository<DatasetFile> {
                                       Boolean hasAnnotation, IPage<DatasetFile> page);

     int updateFilePathPrefix(String datasetId, String oldPrefix, String newPrefix);
+
+    /**
+     * 查询数据集中所有作为衍生文件源文件的ID列表
+     * 通过查询 metadata 中包含 derived_from_file_id 的记录,返回所有源文件ID
+     *
+     * @param datasetId 数据集ID
+     * @return 源文件ID列表
+     */
+    List<String> findSourceFileIdsWithDerivedFiles(String datasetId);
 }

@@ -64,4 +64,11 @@ public class DatasetFileRepositoryImpl extends CrudRepository<DatasetFileMapper,
     public int updateFilePathPrefix(String datasetId, String oldPrefix, String newPrefix) {
         return datasetFileMapper.updateFilePathPrefix(datasetId, oldPrefix, newPrefix);
     }
+
+    @Override
+    public List<String> findSourceFileIdsWithDerivedFiles(String datasetId) {
+        // 查询 metadata 中包含 derived_from_file_id 的记录的源文件ID
+        // 使用 MyBatis 的 @Select 注解或直接调用 mapper 方法
+        return datasetFileMapper.findSourceFileIdsWithDerivedFiles(datasetId);
+    }
 }
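The comment in the implementation above mentions a MyBatis @Select annotation as an alternative to an XML mapper. The commit itself uses the XML <select> shown further below; for reference only, an annotation-based variant of the same query might look like the following sketch (SQL copied from that <select>):

import java.util.List;

import org.apache.ibatis.annotations.Param;
import org.apache.ibatis.annotations.Select;

// Hypothetical annotation-based variant; not part of this commit.
public interface DatasetFileDerivedLookup {
    @Select("SELECT DISTINCT JSON_UNQUOTE(JSON_EXTRACT(metadata, '$.derived_from_file_id')) "
            + "FROM t_dm_dataset_files "
            + "WHERE dataset_id = #{datasetId} "
            + "AND metadata IS NOT NULL "
            + "AND JSON_EXTRACT(metadata, '$.derived_from_file_id') IS NOT NULL")
    List<String> findSourceFileIdsWithDerivedFiles(@Param("datasetId") String datasetId);
}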

@@ -19,12 +19,12 @@ import jakarta.validation.Valid;
 import lombok.extern.slf4j.Slf4j;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.core.io.Resource;
 import org.springframework.http.HttpHeaders;
 import org.springframework.http.HttpStatus;
 import org.springframework.http.MediaType;
 import org.springframework.http.MediaTypeFactory;
 import org.springframework.http.ResponseEntity;
 import org.springframework.web.bind.annotation.*;

 import java.util.List;

@@ -43,24 +43,26 @@ public class DatasetFileController {
         this.datasetFileApplicationService = datasetFileApplicationService;
     }

     @GetMapping
     public Response<PagedResponse<DatasetFile>> getDatasetFiles(
             @PathVariable("datasetId") String datasetId,
             @RequestParam(value = "isWithDirectory", required = false) boolean isWithDirectory,
             @RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
             @RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
             @RequestParam(value = "prefix", required = false, defaultValue = "") String prefix,
             @RequestParam(value = "status", required = false) String status,
-            @RequestParam(value = "hasAnnotation", required = false) Boolean hasAnnotation) {
+            @RequestParam(value = "hasAnnotation", required = false) Boolean hasAnnotation,
+            @RequestParam(value = "excludeSourceDocuments", required = false, defaultValue = "false") Boolean excludeSourceDocuments) {
         PagingQuery pagingQuery = new PagingQuery(page, size);
         PagedResponse<DatasetFile> filesPage;
         if (isWithDirectory) {
             filesPage = datasetFileApplicationService.getDatasetFilesWithDirectory(datasetId, prefix, pagingQuery);
         } else {
-            filesPage = datasetFileApplicationService.getDatasetFiles(datasetId, null, status, null, hasAnnotation, pagingQuery);
+            filesPage = datasetFileApplicationService.getDatasetFiles(datasetId, null, status, null, hasAnnotation,
+                    Boolean.TRUE.equals(excludeSourceDocuments), pagingQuery);
         }
         return Response.ok(filesPage);
     }

     @GetMapping("/{fileId}")
     public ResponseEntity<Response<DatasetFileResponse>> getDatasetFileById(
@@ -86,10 +88,10 @@ public class DatasetFileController {
         }
     }

     @IgnoreResponseWrap
     @GetMapping(value = "/{fileId}/download", produces = MediaType.APPLICATION_OCTET_STREAM_VALUE + ";charset=UTF-8")
     public ResponseEntity<Resource> downloadDatasetFileById(@PathVariable("datasetId") String datasetId,
                                                             @PathVariable("fileId") String fileId) {
         try {
             DatasetFile datasetFile = datasetFileApplicationService.getDatasetFile(datasetId, fileId);
             Resource resource = datasetFileApplicationService.downloadFile(datasetId, fileId);
@@ -103,34 +105,34 @@ public class DatasetFileController {
             return ResponseEntity.status(HttpStatus.NOT_FOUND).build();
         } catch (Exception e) {
             return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).build();
         }
     }

     @IgnoreResponseWrap
     @GetMapping(value = "/{fileId}/preview", produces = MediaType.ALL_VALUE)
     public ResponseEntity<Resource> previewDatasetFileById(@PathVariable("datasetId") String datasetId,
                                                            @PathVariable("fileId") String fileId) {
         try {
             DatasetFile datasetFile = datasetFileApplicationService.getDatasetFile(datasetId, fileId);
             Resource resource = datasetFileApplicationService.downloadFile(datasetId, fileId);
             MediaType mediaType = MediaTypeFactory.getMediaType(resource)
                     .orElse(MediaType.APPLICATION_OCTET_STREAM);

             return ResponseEntity.ok()
                     .contentType(mediaType)
                     .header(HttpHeaders.CONTENT_DISPOSITION,
                             "inline; filename=\"" + datasetFile.getFileName() + "\"")
                     .body(resource);
         } catch (IllegalArgumentException e) {
             return ResponseEntity.status(HttpStatus.NOT_FOUND).build();
         } catch (Exception e) {
             return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).build();
         }
     }

     @IgnoreResponseWrap
     @GetMapping(value = "/download", produces = MediaType.APPLICATION_OCTET_STREAM_VALUE)
     public void downloadDatasetFileAsZip(@PathVariable("datasetId") String datasetId, HttpServletResponse response) {
         datasetFileApplicationService.downloadDatasetFileAsZip(datasetId, response);
     }

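The new excludeSourceDocuments request parameter is declared with defaultValue = "false" and unwrapped with Boolean.TRUE.equals(...), so omitting it keeps the previous listing behaviour and a null value cannot throw. A small stand-alone illustration of that handling (not part of the commit):

// Illustration only: how the controller maps the query parameter to a primitive flag.
public class ExcludeFlagDemo {
    public static void main(String[] args) {
        Boolean omitted = Boolean.valueOf("false"); // defaultValue = "false" when the parameter is absent
        Boolean enabled = Boolean.valueOf("true");  // ?excludeSourceDocuments=true
        Boolean nullValue = null;                   // defensive case Boolean.TRUE.equals also covers

        System.out.println(Boolean.TRUE.equals(omitted));   // false -> converted sources still listed
        System.out.println(Boolean.TRUE.equals(enabled));   // true  -> converted PDF/DOC/DOCX sources hidden
        System.out.println(Boolean.TRUE.equals(nullValue)); // false -> no NullPointerException
    }
}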

@@ -102,4 +102,12 @@
         WHERE dataset_id = #{datasetId}
           AND file_path LIKE CONCAT(#{oldPrefix}, '%')
     </update>
+
+    <select id="findSourceFileIdsWithDerivedFiles" resultType="java.lang.String">
+        SELECT DISTINCT JSON_UNQUOTE(JSON_EXTRACT(metadata, '$.derived_from_file_id')) AS source_file_id
+        FROM t_dm_dataset_files
+        WHERE dataset_id = #{datasetId}
+          AND metadata IS NOT NULL
+          AND JSON_EXTRACT(metadata, '$.derived_from_file_id') IS NOT NULL
+    </select>
 </mapper>

@@ -14,15 +14,20 @@ import {
 import { formatBytes } from "@/utils/unit";
 import { useDebouncedEffect } from "@/hooks/useDebouncedEffect";

 interface DatasetFileTransferProps
   extends React.HTMLAttributes<HTMLDivElement> {
   open: boolean;
   selectedFilesMap: { [key: string]: DatasetFile };
   onSelectedFilesChange: (filesMap: { [key: string]: DatasetFile }) => void;
   onDatasetSelect?: (dataset: Dataset | null) => void;
   datasetTypeFilter?: DatasetType;
   hasAnnotationFilter?: boolean;
+  /**
+   * 是否排除已被转换为TXT的源文档文件(PDF/DOC/DOCX)
+   * 默认为 true,当 datasetTypeFilter 为 TEXT 时自动启用
+   */
+  excludeSourceDocuments?: boolean;
 }

 const fileCols = [
   {
@@ -47,15 +52,18 @@ const fileCols = [
 ];

 // Customize Table Transfer
 const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
   open,
   selectedFilesMap,
   onSelectedFilesChange,
   onDatasetSelect,
   datasetTypeFilter = DatasetType.TEXT,
   hasAnnotationFilter,
+  excludeSourceDocuments,
   ...props
 }) => {
+  // 当未指定时,根据数据集类型自动决定是否排除源文档
+  const shouldExcludeSourceDocuments = excludeSourceDocuments ?? (datasetTypeFilter === DatasetType.TEXT);
   const [datasets, setDatasets] = React.useState<Dataset[]>([]);
   const [datasetSearch, setDatasetSearch] = React.useState<string>("");
   const [datasetPagination, setDatasetPagination] = React.useState<{
@@ -66,13 +74,13 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({

   const [files, setFiles] = React.useState<DatasetFile[]>([]);
   const [filesSearch, setFilesSearch] = React.useState<string>("");
   const [filesPagination, setFilesPagination] = React.useState<{
     current: number;
     pageSize: number;
     total: number;
   }>({ current: 1, pageSize: 10, total: 0 });
   const filesPage = filesPagination.current;
   const filesPageSize = filesPagination.pageSize;

   const [showFiles, setShowFiles] = React.useState<boolean>(false);
   const [selectedDataset, setSelectedDataset] = React.useState<Dataset | null>(
@@ -108,19 +116,20 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({

   const fetchFiles = useCallback(
     async (
       options?: Partial<{ page: number; pageSize: number; keyword: string }>
     ) => {
       if (!selectedDataset) return;
       const page = options?.page ?? filesPage;
       const pageSize = options?.pageSize ?? filesPageSize;
       const keyword = options?.keyword ?? filesSearch;

       const { data } = await queryDatasetFilesUsingGet(selectedDataset.id, {
         page,
         size: pageSize,
         keyword,
         ...(hasAnnotationFilter ? { hasAnnotation: true } : {}),
+        ...(shouldExcludeSourceDocuments ? { excludeSourceDocuments: true } : {}),
       });
       setFiles(
         (data.content || []).map((item: DatasetFile) => ({
           ...item,
@@ -135,15 +144,16 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
         pageSize,
         total: data.totalElements,
       }));
     },
     [
       selectedDataset,
       filesPage,
       filesPageSize,
       filesSearch,
       hasAnnotationFilter,
+      shouldExcludeSourceDocuments,
     ]
   );

   useEffect(() => {
     // 当数据集变化时,重置文件分页并拉取第一页文件,避免额外的循环请求
@@ -178,11 +188,12 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
     const allFiles: DatasetFile[] = [];

     while (true) {
       const { data } = await queryDatasetFilesUsingGet(selectedDataset.id, {
         page,
         size: pageSize,
         ...(hasAnnotationFilter ? { hasAnnotation: true } : {}),
+        ...(shouldExcludeSourceDocuments ? { excludeSourceDocuments: true } : {}),
       });

       const content: DatasetFile[] = (data.content || []).map(
         (item: DatasetFile) => ({
@@ -229,7 +240,7 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
     } finally {
       setSelectingAll(false);
     }
-  }, [selectedDataset, selectedFilesMap, onSelectedFilesChange, hasAnnotationFilter]);
+  }, [selectedDataset, selectedFilesMap, onSelectedFilesChange, hasAnnotationFilter, shouldExcludeSourceDocuments]);

   const toggleSelectFile = (record: DatasetFile) => {
     if (!selectedFilesMap[record.id]) {
@@ -400,10 +411,10 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
     },

     // 全选 - 改为全选整个数据集而不是当前页
     onSelectAll: (selected) => {
       if (selected) {
         // 点击表头“全选”时,改为一键全选当前数据集的全部文件
         // 而不是只选中当前页
         handleSelectAllInDataset();
       } else {
         // 取消表头“全选”时,清空当前已选文件

@@ -275,7 +275,12 @@ export default function CreateAnnotationTask({
     }
     setDatasetPreviewLoading(true);
     try {
-      const res = await queryDatasetFilesUsingGet(selectedDatasetId, { page: 0, size: 10 });
+      // 对于文本数据集,排除已被转换为TXT的源文档文件(PDF/DOC/DOCX)
+      const params: { page: number; size: number; excludeSourceDocuments?: boolean } = { page: 0, size: 10 };
+      if (isTextDataset) {
+        params.excludeSourceDocuments = true;
+      }
+      const res = await queryDatasetFilesUsingGet(selectedDatasetId, params);
       if (res.code === '0' && res.data) {
         setDatasetPreviewData((res.data.content || []) as DatasetPreviewFile[]);
         setDatasetPreviewVisible(true);