diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java
index 8f3d9bc..9366f48 100644
--- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java
+++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java
@@ -13,12 +13,12 @@ import com.datamate.common.infrastructure.exception.CommonErrorCode;
 import com.datamate.common.infrastructure.exception.SystemErrorCode;
 import com.datamate.common.interfaces.PagedResponse;
 import com.datamate.common.interfaces.PagingQuery;
-import com.datamate.datamanagement.common.enums.DuplicateMethod;
-import com.datamate.datamanagement.common.enums.DatasetType;
-import com.datamate.datamanagement.domain.contants.DatasetConstant;
-import com.datamate.datamanagement.domain.model.dataset.Dataset;
-import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
-import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckInfo;
+import com.datamate.datamanagement.common.enums.DuplicateMethod;
+import com.datamate.datamanagement.common.enums.DatasetType;
+import com.datamate.datamanagement.domain.contants.DatasetConstant;
+import com.datamate.datamanagement.domain.model.dataset.Dataset;
+import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
+import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckInfo;
 import com.datamate.datamanagement.infrastructure.exception.DataManagementErrorCode;
 import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
 import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
@@ -66,16 +66,16 @@ import java.util.stream.Stream;
 @Slf4j
 @Service
 @Transactional
-public class DatasetFileApplicationService {
-    private static final String PDF_FILE_TYPE = "pdf";
-    private static final String DOC_FILE_TYPE = "doc";
-    private static final String DOCX_FILE_TYPE = "docx";
-    private static final Set<String> DOCUMENT_TEXT_FILE_TYPES = Set.of(PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE);
-
-    private final DatasetFileRepository datasetFileRepository;
-    private final DatasetRepository datasetRepository;
-    private final FileService fileService;
-    private final PdfTextExtractAsyncService pdfTextExtractAsyncService;
+public class DatasetFileApplicationService {
+    private static final String PDF_FILE_TYPE = "pdf";
+    private static final String DOC_FILE_TYPE = "doc";
+    private static final String DOCX_FILE_TYPE = "docx";
+    private static final Set<String> DOCUMENT_TEXT_FILE_TYPES = Set.of(PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE);
+
+    private final DatasetFileRepository datasetFileRepository;
+    private final DatasetRepository datasetRepository;
+    private final FileService fileService;
+    private final PdfTextExtractAsyncService pdfTextExtractAsyncService;
 
     @Value("${datamate.data-management.base-path:/dataset}")
     private String datasetBasePath;
@@ -83,27 +83,62 @@ public class DatasetFileApplicationService {
 
     @Value("${datamate.data-management.file.duplicate:COVER}")
     private DuplicateMethod duplicateMethod;
 
-    @Autowired
-    public DatasetFileApplicationService(DatasetFileRepository datasetFileRepository,
-                                         DatasetRepository datasetRepository,
-                                         FileService fileService,
-                                         PdfTextExtractAsyncService pdfTextExtractAsyncService) {
-        this.datasetFileRepository = datasetFileRepository;
-        this.datasetRepository = datasetRepository;
-        this.fileService = fileService;
-        this.pdfTextExtractAsyncService = pdfTextExtractAsyncService;
-    }
+    @Autowired
+    public DatasetFileApplicationService(DatasetFileRepository datasetFileRepository,
+                                         DatasetRepository datasetRepository,
+                                         FileService fileService,
+                                         PdfTextExtractAsyncService pdfTextExtractAsyncService) {
+        this.datasetFileRepository = datasetFileRepository;
+        this.datasetRepository = datasetRepository;
+        this.fileService = fileService;
+        this.pdfTextExtractAsyncService = pdfTextExtractAsyncService;
+    }
 
     /**
      * Get the dataset file list
      */
     @Transactional(readOnly = true)
-    public PagedResponse<DatasetFile> getDatasetFiles(String datasetId, String fileType, String status, String name,
-                                                      Boolean hasAnnotation, PagingQuery pagingQuery) {
-        IPage<DatasetFile> page = new Page<>(pagingQuery.getPage(), pagingQuery.getSize());
-        IPage<DatasetFile> files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, hasAnnotation, page);
-        return PagedResponse.of(files);
-    }
+    public PagedResponse<DatasetFile> getDatasetFiles(String datasetId, String fileType, String status, String name,
+                                                      Boolean hasAnnotation, PagingQuery pagingQuery) {
+        return getDatasetFiles(datasetId, fileType, status, name, hasAnnotation, false, pagingQuery);
+    }
+
+    /**
+     * Get the dataset file list, optionally excluding source documents that have
+     * already been converted to TXT.
+     *
+     * @param datasetId              dataset ID
+     * @param fileType               file type filter
+     * @param status                 status filter
+     * @param name                   fuzzy match on file name
+     * @param hasAnnotation          whether the file has annotations
+     * @param excludeSourceDocuments whether to exclude source documents (PDF/DOC/DOCX) already converted to TXT
+     * @param pagingQuery            paging parameters
+     * @return paged file list
+     */
+    @Transactional(readOnly = true)
+    public PagedResponse<DatasetFile> getDatasetFiles(String datasetId, String fileType, String status, String name,
+                                                      Boolean hasAnnotation, boolean excludeSourceDocuments, PagingQuery pagingQuery) {
+        IPage<DatasetFile> page = new Page<>(pagingQuery.getPage(), pagingQuery.getSize());
+        IPage<DatasetFile> files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, hasAnnotation, page);
+
+        if (excludeSourceDocuments) {
+            // Look up the IDs of all documents that are the source of a derived TXT file
+            List<String> sourceFileIds = datasetFileRepository.findSourceFileIdsWithDerivedFiles(datasetId);
+            if (!sourceFileIds.isEmpty()) {
+                // Drop the source files from the current page
+                List<DatasetFile> filteredRecords = files.getRecords().stream()
+                        .filter(file -> !sourceFileIds.contains(file.getId()))
+                        .collect(Collectors.toList());
+
+                // Rebuild the paged result with the filtered records
+                Page<DatasetFile> filteredPage = new Page<>(files.getCurrent(), files.getSize(), files.getTotal());
+                filteredPage.setRecords(filteredRecords);
+                return PagedResponse.of(filteredPage);
+            }
+        }
+
+        return PagedResponse.of(files);
+    }
 
     /**
      * Get the dataset file list
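One wrinkle in the new overload is worth noting: the exclusion runs in memory after the repository has already paged the query, so a page can come back with fewer than `size` records while `getTotal()` still counts the excluded sources. A minimal standalone sketch of the effect, using the same MyBatis-Plus `Page` API as the patch (the file IDs are made up):

```java
import com.baomidou.mybatisplus.extension.plugins.pagination.Page;

import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

public class ExcludeSourcePagingSketch {
    public static void main(String[] args) {
        // Pretend the repository returned one page of three records, total = 3
        Page<String> page = new Page<>(1, 10, 3);
        page.setRecords(List.of("pdf-1", "txt-derived-from-pdf-1", "txt-2"));

        // "pdf-1" is the source of a derived TXT file, so it gets filtered out
        Set<String> sourceFileIds = Set.of("pdf-1");
        List<String> filtered = page.getRecords().stream()
                .filter(id -> !sourceFileIds.contains(id))
                .collect(Collectors.toList());

        // Rebuilt exactly as the service does: current, size, and the ORIGINAL total
        Page<String> filteredPage = new Page<>(page.getCurrent(), page.getSize(), page.getTotal());
        filteredPage.setRecords(filtered);

        // Prints "2 of 3": the record count shrinks but the total does not
        System.out.println(filteredPage.getRecords().size() + " of " + filteredPage.getTotal());
    }
}
```

Pushing the exclusion into the SQL (for example a `NOT IN` on the derived-source IDs) would keep page sizes and totals exact; the in-memory approach trades that precision for a simpler repository query.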
@@ -333,11 +368,11 @@ public class DatasetFileApplicationService {
      * @return request ID
      */
     @Transactional
-    public String preUpload(UploadFilesPreRequest chunkUploadRequest, String datasetId) {
-        Dataset dataset = datasetRepository.getById(datasetId);
-        if (Objects.isNull(dataset)) {
-            throw BusinessException.of(DataManagementErrorCode.DATASET_NOT_FOUND);
-        }
+    public String preUpload(UploadFilesPreRequest chunkUploadRequest, String datasetId) {
+        Dataset dataset = datasetRepository.getById(datasetId);
+        if (Objects.isNull(dataset)) {
+            throw BusinessException.of(DataManagementErrorCode.DATASET_NOT_FOUND);
+        }
 
         // Build the upload path; append the prefix if one is given
         String prefix = Optional.ofNullable(chunkUploadRequest.getPrefix()).orElse("").trim();
@@ -346,13 +381,13 @@ public class DatasetFileApplicationService {
             prefix = prefix.substring(1);
         }
 
-        String uploadPath = dataset.getPath();
-        if (uploadPath == null || uploadPath.isBlank()) {
-            uploadPath = datasetBasePath + File.separator + datasetId;
-        }
-        if (!prefix.isEmpty()) {
-            uploadPath = uploadPath + File.separator + prefix.replace("/", File.separator);
-        }
+        String uploadPath = dataset.getPath();
+        if (uploadPath == null || uploadPath.isBlank()) {
+            uploadPath = datasetBasePath + File.separator + datasetId;
+        }
+        if (!prefix.isEmpty()) {
+            uploadPath = uploadPath + File.separator + prefix.replace("/", File.separator);
+        }
 
         ChunkUploadPreRequest request = ChunkUploadPreRequest.builder().build();
         request.setUploadPath(uploadPath);
@@ -414,24 +449,24 @@ public class DatasetFileApplicationService {
         for (FileUploadResult file : unpacked) {
             File savedFile = file.getSavedFile();
             LocalDateTime currentTime = LocalDateTime.now();
-            DatasetFile datasetFile = DatasetFile.builder()
-                    .id(UUID.randomUUID().toString())
-                    .datasetId(datasetId)
-                    .fileSize(savedFile.length())
+            DatasetFile datasetFile = DatasetFile.builder()
+                    .id(UUID.randomUUID().toString())
+                    .datasetId(datasetId)
+                    .fileSize(savedFile.length())
                     .uploadTime(currentTime)
                     .lastAccessTime(currentTime)
                     .fileName(file.getFileName())
                     .filePath(savedFile.getPath())
                     .fileType(AnalyzerUtils.getExtension(file.getFileName()))
                     .build();
-            setDatasetFileId(datasetFile, dataset);
-            datasetFileRepository.saveOrUpdate(datasetFile);
-            dataset.addFile(datasetFile);
-            triggerPdfTextExtraction(dataset, datasetFile);
-        }
-        dataset.active();
-        datasetRepository.updateById(dataset);
-    }
+            setDatasetFileId(datasetFile, dataset);
+            datasetFileRepository.saveOrUpdate(datasetFile);
+            dataset.addFile(datasetFile);
+            triggerPdfTextExtraction(dataset, datasetFile);
+        }
+        dataset.active();
+        datasetRepository.updateById(dataset);
+    }
 
     /**
      * Create a subdirectory under the dataset
@@ -697,29 +732,29 @@ public class DatasetFileApplicationService {
             dataset.addFile(datasetFile);
             copiedFiles.add(datasetFile);
         }
-        datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
-        dataset.active();
-        datasetRepository.updateById(dataset);
-        CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset));
-        return copiedFiles;
+        datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
+        dataset.active();
+        datasetRepository.updateById(dataset);
+        CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset));
+        return copiedFiles;
     }
 
     private void copyFilesToDatasetDir(List<String> sourcePaths, Dataset dataset) {
         for (String sourcePath : sourcePaths) {
             Path sourceFilePath = Paths.get(sourcePath);
             Path targetFilePath = Paths.get(dataset.getPath(), sourceFilePath.getFileName().toString());
-            try {
-                Files.createDirectories(Path.of(dataset.getPath()));
-                Files.copy(sourceFilePath, targetFilePath);
-                DatasetFile datasetFile = datasetFileRepository.findByDatasetIdAndFileName(
-                        dataset.getId(),
-                        sourceFilePath.getFileName().toString()
-                );
-                triggerPdfTextExtraction(dataset, datasetFile);
-            } catch (IOException e) {
-                log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
-            }
-        }
+            try {
+                Files.createDirectories(Path.of(dataset.getPath()));
+                Files.copy(sourceFilePath, targetFilePath);
+                DatasetFile datasetFile = datasetFileRepository.findByDatasetIdAndFileName(
+                        dataset.getId(),
+                        sourceFilePath.getFileName().toString()
+                );
+                triggerPdfTextExtraction(dataset, datasetFile);
+            } catch (IOException e) {
+                log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
+            }
+        }
     }
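One behavior worth flagging in `copyFilesToDatasetDir`: the two-argument `Files.copy(source, target)` throws `FileAlreadyExistsException` when the target already exists, so re-copying the same file lands in the `IOException` branch and is only logged. If the configured `DuplicateMethod.COVER` default (presumably "overwrite") is meant to apply here as well, the standard-library form is the three-argument overload. A sketch, under the assumption that overwriting is actually the desired behavior:

```java
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;

public class OverwriteCopySketch {
    public static void main(String[] args) throws IOException {
        Path source = Files.createTempFile("source", ".txt");
        Path target = Files.createTempFile("target", ".txt"); // target already exists

        // Plain Files.copy(source, target) would throw FileAlreadyExistsException here;
        // REPLACE_EXISTING silently overwrites the existing target instead
        Files.copy(source, target, StandardCopyOption.REPLACE_EXISTING);
        System.out.println("copied to " + target);
    }
}
```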
@@ -765,30 +800,30 @@ public class DatasetFileApplicationService {
                     .lastAccessTime(currentTime)
                     .metadata(metadata)
                     .build();
-            setDatasetFileId(datasetFile, dataset);
-            dataset.addFile(datasetFile);
-            addedFiles.add(datasetFile);
-            triggerPdfTextExtraction(dataset, datasetFile);
-        }
-        datasetFileRepository.saveOrUpdateBatch(addedFiles, 100);
-        dataset.active();
-        datasetRepository.updateById(dataset);
-        // Note: addFilesToDataset only creates DB records, no file system operations
-        // If file copy is needed, use copyFilesToDatasetDir endpoint instead
-        return addedFiles;
-    }
-
-    private void triggerPdfTextExtraction(Dataset dataset, DatasetFile datasetFile) {
-        if (dataset == null || datasetFile == null) {
-            return;
-        }
-        if (dataset.getDatasetType() != DatasetType.TEXT) {
-            return;
-        }
-        String fileType = datasetFile.getFileType();
-        if (fileType == null || !DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT))) {
-            return;
-        }
-        pdfTextExtractAsyncService.extractPdfText(dataset.getId(), datasetFile.getId());
-    }
-}
+            setDatasetFileId(datasetFile, dataset);
+            dataset.addFile(datasetFile);
+            addedFiles.add(datasetFile);
+            triggerPdfTextExtraction(dataset, datasetFile);
+        }
+        datasetFileRepository.saveOrUpdateBatch(addedFiles, 100);
+        dataset.active();
+        datasetRepository.updateById(dataset);
+        // Note: addFilesToDataset only creates DB records, no file system operations
+        // If file copy is needed, use copyFilesToDatasetDir endpoint instead
+        return addedFiles;
+    }
+
+    private void triggerPdfTextExtraction(Dataset dataset, DatasetFile datasetFile) {
+        if (dataset == null || datasetFile == null) {
+            return;
+        }
+        if (dataset.getDatasetType() != DatasetType.TEXT) {
+            return;
+        }
+        String fileType = datasetFile.getFileType();
+        if (fileType == null || !DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT))) {
+            return;
+        }
+        pdfTextExtractAsyncService.extractPdfText(dataset.getId(), datasetFile.getId());
+    }
+}
diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/persistence/mapper/DatasetFileMapper.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/persistence/mapper/DatasetFileMapper.java
index 84f6680..ac7eb11 100644
--- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/persistence/mapper/DatasetFileMapper.java
+++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/persistence/mapper/DatasetFileMapper.java
@@ -29,4 +29,13 @@ public interface DatasetFileMapper extends BaseMapper<DatasetFile> {
     int updateFilePathPrefix(@Param("datasetId") String datasetId,
                              @Param("oldPrefix") String oldPrefix,
                              @Param("newPrefix") String newPrefix);
+
+    /**
+     * Find the IDs of all files in a dataset that are the source of a derived file,
+     * by matching metadata entries that contain a derived_from_file_id value.
+     *
+     * @param datasetId dataset ID
+     * @return list of source file IDs
+     */
+    List<String> findSourceFileIdsWithDerivedFiles(@Param("datasetId") String datasetId);
 }
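The query relies on derived TXT files recording their origin under a `derived_from_file_id` key in the `metadata` column. The exact storage shape is not shown in this patch; assuming it is a JSON object serialized to a string, reading the link back out with Jackson would look roughly like this (the metadata literal is invented for illustration):

```java
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class DerivedFromSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical metadata stored on a TXT file derived from a PDF
        String metadata = "{\"derived_from_file_id\":\"a1b2c3\",\"converter\":\"pdf-text-extract\"}";

        JsonNode node = new ObjectMapper().readTree(metadata);
        JsonNode sourceId = node.get("derived_from_file_id");

        // File IDs that show up here are what findSourceFileIdsWithDerivedFiles returns
        System.out.println(sourceId != null ? sourceId.asText() : "not a derived file");
    }
}
```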
diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/persistence/repository/DatasetFileRepository.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/persistence/repository/DatasetFileRepository.java
index 76ccca4..15883ca 100644
--- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/persistence/repository/DatasetFileRepository.java
+++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/persistence/repository/DatasetFileRepository.java
@@ -27,4 +27,13 @@ public interface DatasetFileRepository extends IRepository<DatasetFile> {
                                        Boolean hasAnnotation, IPage<DatasetFile> page);
 
     int updateFilePathPrefix(String datasetId, String oldPrefix, String newPrefix);
+
+    /**
+     * Find the IDs of all files in a dataset that are the source of a derived file,
+     * i.e. the source-file IDs of every record whose metadata contains derived_from_file_id.
+     *
+     * @param datasetId dataset ID
+     * @return list of source file IDs
+     */
+    List<String> findSourceFileIdsWithDerivedFiles(String datasetId);
 }
diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/persistence/repository/impl/DatasetFileRepositoryImpl.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/persistence/repository/impl/DatasetFileRepositoryImpl.java
index 1143059..3409071 100644
--- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/persistence/repository/impl/DatasetFileRepositoryImpl.java
+++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/infrastructure/persistence/repository/impl/DatasetFileRepositoryImpl.java
@@ -64,4 +64,11 @@ public class DatasetFileRepositoryImpl extends CrudRepository<DatasetFileMapper, DatasetFile>
+
+    @Override
+    public List<String> findSourceFileIdsWithDerivedFiles(String datasetId) {
+        // Source-file IDs of records whose metadata contains derived_from_file_id;
+        // delegates to the XML-mapped query on DatasetFileMapper
+        return datasetFileMapper.findSourceFileIdsWithDerivedFiles(datasetId);
+    }
 }
@RequestParam(value = "hasAnnotation", required = false) Boolean hasAnnotation) { - PagingQuery pagingQuery = new PagingQuery(page, size); - PagedResponse filesPage; - if (isWithDirectory) { - filesPage = datasetFileApplicationService.getDatasetFilesWithDirectory(datasetId, prefix, pagingQuery); - } else { - filesPage = datasetFileApplicationService.getDatasetFiles(datasetId, null, status, null, hasAnnotation, pagingQuery); - } - return Response.ok(filesPage); - } + @GetMapping + public Response> getDatasetFiles( + @PathVariable("datasetId") String datasetId, + @RequestParam(value = "isWithDirectory", required = false) boolean isWithDirectory, + @RequestParam(value = "page", required = false, defaultValue = "0") Integer page, + @RequestParam(value = "size", required = false, defaultValue = "20") Integer size, + @RequestParam(value = "prefix", required = false, defaultValue = "") String prefix, + @RequestParam(value = "status", required = false) String status, + @RequestParam(value = "hasAnnotation", required = false) Boolean hasAnnotation, + @RequestParam(value = "excludeSourceDocuments", required = false, defaultValue = "false") Boolean excludeSourceDocuments) { + PagingQuery pagingQuery = new PagingQuery(page, size); + PagedResponse filesPage; + if (isWithDirectory) { + filesPage = datasetFileApplicationService.getDatasetFilesWithDirectory(datasetId, prefix, pagingQuery); + } else { + filesPage = datasetFileApplicationService.getDatasetFiles(datasetId, null, status, null, hasAnnotation, + Boolean.TRUE.equals(excludeSourceDocuments), pagingQuery); + } + return Response.ok(filesPage); + } @GetMapping("/{fileId}") public ResponseEntity> getDatasetFileById( @@ -86,10 +88,10 @@ public class DatasetFileController { } } - @IgnoreResponseWrap - @GetMapping(value = "/{fileId}/download", produces = MediaType.APPLICATION_OCTET_STREAM_VALUE + ";charset=UTF-8") - public ResponseEntity downloadDatasetFileById(@PathVariable("datasetId") String datasetId, - @PathVariable("fileId") String fileId) { + @IgnoreResponseWrap + @GetMapping(value = "/{fileId}/download", produces = MediaType.APPLICATION_OCTET_STREAM_VALUE + ";charset=UTF-8") + public ResponseEntity downloadDatasetFileById(@PathVariable("datasetId") String datasetId, + @PathVariable("fileId") String fileId) { try { DatasetFile datasetFile = datasetFileApplicationService.getDatasetFile(datasetId, fileId); Resource resource = datasetFileApplicationService.downloadFile(datasetId, fileId); @@ -103,34 +105,34 @@ public class DatasetFileController { return ResponseEntity.status(HttpStatus.NOT_FOUND).build(); } catch (Exception e) { return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).build(); - } - } - - @IgnoreResponseWrap - @GetMapping(value = "/{fileId}/preview", produces = MediaType.ALL_VALUE) - public ResponseEntity previewDatasetFileById(@PathVariable("datasetId") String datasetId, - @PathVariable("fileId") String fileId) { - try { - DatasetFile datasetFile = datasetFileApplicationService.getDatasetFile(datasetId, fileId); - Resource resource = datasetFileApplicationService.downloadFile(datasetId, fileId); - MediaType mediaType = MediaTypeFactory.getMediaType(resource) - .orElse(MediaType.APPLICATION_OCTET_STREAM); - - return ResponseEntity.ok() - .contentType(mediaType) - .header(HttpHeaders.CONTENT_DISPOSITION, - "inline; filename=\"" + datasetFile.getFileName() + "\"") - .body(resource); - } catch (IllegalArgumentException e) { - return ResponseEntity.status(HttpStatus.NOT_FOUND).build(); - } catch (Exception e) { - return 
diff --git a/backend/services/data-management-service/src/main/resources/mappers/DatasetFileMapper.xml b/backend/services/data-management-service/src/main/resources/mappers/DatasetFileMapper.xml
index 0458ab1..178a5eb 100644
--- a/backend/services/data-management-service/src/main/resources/mappers/DatasetFileMapper.xml
+++ b/backend/services/data-management-service/src/main/resources/mappers/DatasetFileMapper.xml
@@ -102,4 +102,12 @@
         WHERE dataset_id = #{datasetId}
           AND file_path LIKE CONCAT(#{oldPrefix}, '%')
     </update>
+
+    <select id="findSourceFileIdsWithDerivedFiles" resultType="java.lang.String">
+        ...
+    </select>
 </mapper>
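The body of the added `<select>` did not survive extraction (the XML was stripped as markup), so the `...` above is a placeholder, not the author's SQL. Given the mapper's Javadoc, return the source-file IDs referenced by any `derived_from_file_id` stored in `metadata`, a plausible equivalent written as an annotated MyBatis method might read as follows. The table name `t_dataset_file` and the MySQL JSON functions are assumptions, not taken from the patch:

```java
import java.util.List;

import org.apache.ibatis.annotations.Param;
import org.apache.ibatis.annotations.Select;

public interface DerivedSourceQuerySketch {
    // Assumed schema: metadata is a JSON column on t_dataset_file holding
    // {"derived_from_file_id": "<source file id>"} for converted TXT files
    @Select("""
            SELECT DISTINCT JSON_UNQUOTE(JSON_EXTRACT(metadata, '$.derived_from_file_id'))
            FROM t_dataset_file
            WHERE dataset_id = #{datasetId}
              AND JSON_EXTRACT(metadata, '$.derived_from_file_id') IS NOT NULL
            """)
    List<String> findSourceFileIdsWithDerivedFiles(@Param("datasetId") String datasetId);
}
```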
diff --git a/frontend/src/components/business/DatasetFileTransfer.tsx b/frontend/src/components/business/DatasetFileTransfer.tsx
index 6b73b03..d5eed58 100644
--- a/frontend/src/components/business/DatasetFileTransfer.tsx
+++ b/frontend/src/components/business/DatasetFileTransfer.tsx
@@ -14,15 +14,20 @@ import {
 import { formatBytes } from "@/utils/unit";
 import { useDebouncedEffect } from "@/hooks/useDebouncedEffect";
 
-interface DatasetFileTransferProps
-  extends React.HTMLAttributes<HTMLDivElement> {
-  open: boolean;
-  selectedFilesMap: { [key: string]: DatasetFile };
-  onSelectedFilesChange: (filesMap: { [key: string]: DatasetFile }) => void;
-  onDatasetSelect?: (dataset: Dataset | null) => void;
-  datasetTypeFilter?: DatasetType;
-  hasAnnotationFilter?: boolean;
-}
+interface DatasetFileTransferProps
+  extends React.HTMLAttributes<HTMLDivElement> {
+  open: boolean;
+  selectedFilesMap: { [key: string]: DatasetFile };
+  onSelectedFilesChange: (filesMap: { [key: string]: DatasetFile }) => void;
+  onDatasetSelect?: (dataset: Dataset | null) => void;
+  datasetTypeFilter?: DatasetType;
+  hasAnnotationFilter?: boolean;
+  /**
+   * Whether to exclude source documents (PDF/DOC/DOCX) that have already been
+   * converted to TXT. When unset, it defaults to true for TEXT datasets,
+   * derived automatically from datasetTypeFilter.
+   */
+  excludeSourceDocuments?: boolean;
+}
 
 const fileCols = [
   {
@@ -47,15 +52,18 @@ const fileCols = [
 ];
 
 // Customize Table Transfer
-const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
-  open,
-  selectedFilesMap,
-  onSelectedFilesChange,
-  onDatasetSelect,
-  datasetTypeFilter = DatasetType.TEXT,
-  hasAnnotationFilter,
-  ...props
-}) => {
+const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
+  open,
+  selectedFilesMap,
+  onSelectedFilesChange,
+  onDatasetSelect,
+  datasetTypeFilter = DatasetType.TEXT,
+  hasAnnotationFilter,
+  excludeSourceDocuments,
+  ...props
+}) => {
+  // When unspecified, decide from the dataset type whether to exclude source documents
+  const shouldExcludeSourceDocuments = excludeSourceDocuments ?? (datasetTypeFilter === DatasetType.TEXT);
   const [datasets, setDatasets] = React.useState<Dataset[]>([]);
   const [datasetSearch, setDatasetSearch] = React.useState("");
   const [datasetPagination, setDatasetPagination] = React.useState<{
@@ -66,13 +74,13 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
 
   const [files, setFiles] = React.useState<DatasetFile[]>([]);
   const [filesSearch, setFilesSearch] = React.useState("");
-  const [filesPagination, setFilesPagination] = React.useState<{
-    current: number;
-    pageSize: number;
-    total: number;
-  }>({ current: 1, pageSize: 10, total: 0 });
-  const filesPage = filesPagination.current;
-  const filesPageSize = filesPagination.pageSize;
+  const [filesPagination, setFilesPagination] = React.useState<{
+    current: number;
+    pageSize: number;
+    total: number;
+  }>({ current: 1, pageSize: 10, total: 0 });
+  const filesPage = filesPagination.current;
+  const filesPageSize = filesPagination.pageSize;
 
   const [showFiles, setShowFiles] = React.useState(false);
   const [selectedDataset, setSelectedDataset] = React.useState<Dataset | null>(
@@ -108,19 +116,20 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
 
   const fetchFiles = useCallback(
     async (
-      options?: Partial<{ page: number; pageSize: number; keyword: string }>
-    ) => {
-      if (!selectedDataset) return;
-      const page = options?.page ?? filesPage;
-      const pageSize = options?.pageSize ?? filesPageSize;
-      const keyword = options?.keyword ?? filesSearch;
-
-      const { data } = await queryDatasetFilesUsingGet(selectedDataset.id, {
-        page,
-        size: pageSize,
-        keyword,
-        ...(hasAnnotationFilter ? { hasAnnotation: true } : {}),
-      });
+      options?: Partial<{ page: number; pageSize: number; keyword: string }>
+    ) => {
+      if (!selectedDataset) return;
+      const page = options?.page ?? filesPage;
+      const pageSize = options?.pageSize ?? filesPageSize;
+      const keyword = options?.keyword ?? filesSearch;
+
+      const { data } = await queryDatasetFilesUsingGet(selectedDataset.id, {
+        page,
+        size: pageSize,
+        keyword,
+        ...(hasAnnotationFilter ? { hasAnnotation: true } : {}),
+        ...(shouldExcludeSourceDocuments ? { excludeSourceDocuments: true } : {}),
+      });
       setFiles(
         (data.content || []).map((item: DatasetFile) => ({
           ...item,
@@ -135,15 +144,16 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
         pageSize,
         total: data.totalElements,
       }));
-    },
-    [
-      selectedDataset,
-      filesPage,
-      filesPageSize,
-      filesSearch,
-      hasAnnotationFilter,
-    ]
-  );
+    },
+    [
+      selectedDataset,
+      filesPage,
+      filesPageSize,
+      filesSearch,
+      hasAnnotationFilter,
+      shouldExcludeSourceDocuments,
+    ]
+  );
 
   useEffect(() => {
     // When the dataset changes, reset file paging and fetch the first page to avoid extra request loops
@@ -178,11 +188,12 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
     const allFiles: DatasetFile[] = [];
 
     while (true) {
-      const { data } = await queryDatasetFilesUsingGet(selectedDataset.id, {
-        page,
-        size: pageSize,
-        ...(hasAnnotationFilter ? { hasAnnotation: true } : {}),
-      });
+      const { data } = await queryDatasetFilesUsingGet(selectedDataset.id, {
+        page,
+        size: pageSize,
+        ...(hasAnnotationFilter ? { hasAnnotation: true } : {}),
+        ...(shouldExcludeSourceDocuments ? { excludeSourceDocuments: true } : {}),
+      });
 
       const content: DatasetFile[] = (data.content || []).map(
         (item: DatasetFile) => ({
@@ -229,7 +240,7 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
     } finally {
       setSelectingAll(false);
     }
-  }, [selectedDataset, selectedFilesMap, onSelectedFilesChange, hasAnnotationFilter]);
+  }, [selectedDataset, selectedFilesMap, onSelectedFilesChange, hasAnnotationFilter, shouldExcludeSourceDocuments]);
 
   const toggleSelectFile = (record: DatasetFile) => {
     if (!selectedFilesMap[record.id]) {
@@ -400,10 +411,10 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
     },
 
     // Select all: changed to select the whole dataset rather than the current page
-    onSelectAll: (selected) => {
-      if (selected) {
-        // Clicking the header "select all" now selects every file in the current dataset
-        // in one go, not just the current page
+    onSelectAll: (selected) => {
+      if (selected) {
+        // Clicking the header "select all" now selects every file in the current dataset
+        // in one go, not just the current page
         handleSelectAllInDataset();
       } else {
        // Unchecking the header "select all" clears the current selection
diff --git a/frontend/src/pages/DataAnnotation/Create/components/CreateAnnotationTaskDialog.tsx b/frontend/src/pages/DataAnnotation/Create/components/CreateAnnotationTaskDialog.tsx
index 2107475..451e1e1 100644
--- a/frontend/src/pages/DataAnnotation/Create/components/CreateAnnotationTaskDialog.tsx
+++ b/frontend/src/pages/DataAnnotation/Create/components/CreateAnnotationTaskDialog.tsx
@@ -275,7 +275,12 @@ export default function CreateAnnotationTask({
     }
     setDatasetPreviewLoading(true);
     try {
-      const res = await queryDatasetFilesUsingGet(selectedDatasetId, { page: 0, size: 10 });
+      // For text datasets, exclude source documents (PDF/DOC/DOCX) already converted to TXT
+      const params: { page: number; size: number; excludeSourceDocuments?: boolean } = { page: 0, size: 10 };
+      if (isTextDataset) {
+        params.excludeSourceDocuments = true;
+      }
+      const res = await queryDatasetFilesUsingGet(selectedDatasetId, params);
       if (res.code === '0' && res.data) {
         setDatasetPreviewData((res.data.content || []) as DatasetPreviewFile[]);
         setDatasetPreviewVisible(true);