feat(repository): 添加查询衍生文件源文件ID功能

- 在 DatasetFileRepository 接口中添加 findSourceFileIdsWithDerivedFiles 方法定义
- 在 DatasetFileRepositoryImpl 实现类中实现该方法
- 添加查询 metadata 中包含 derived_from_file_id 记录的源文件ID逻辑
- 提供完整的 JavaDoc 文档注释说明方法用途和参数
This commit is contained in:
2026-01-29 14:38:16 +08:00
parent 0dba604cd3
commit 6eb7507adf
8 changed files with 300 additions and 214 deletions

View File

@@ -13,12 +13,12 @@ import com.datamate.common.infrastructure.exception.CommonErrorCode;
import com.datamate.common.infrastructure.exception.SystemErrorCode; import com.datamate.common.infrastructure.exception.SystemErrorCode;
import com.datamate.common.interfaces.PagedResponse; import com.datamate.common.interfaces.PagedResponse;
import com.datamate.common.interfaces.PagingQuery; import com.datamate.common.interfaces.PagingQuery;
import com.datamate.datamanagement.common.enums.DuplicateMethod; import com.datamate.datamanagement.common.enums.DuplicateMethod;
import com.datamate.datamanagement.common.enums.DatasetType; import com.datamate.datamanagement.common.enums.DatasetType;
import com.datamate.datamanagement.domain.contants.DatasetConstant; import com.datamate.datamanagement.domain.contants.DatasetConstant;
import com.datamate.datamanagement.domain.model.dataset.Dataset; import com.datamate.datamanagement.domain.model.dataset.Dataset;
import com.datamate.datamanagement.domain.model.dataset.DatasetFile; import com.datamate.datamanagement.domain.model.dataset.DatasetFile;
import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckInfo; import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckInfo;
import com.datamate.datamanagement.infrastructure.exception.DataManagementErrorCode; import com.datamate.datamanagement.infrastructure.exception.DataManagementErrorCode;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository; import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository; import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
@@ -66,16 +66,16 @@ import java.util.stream.Stream;
@Slf4j @Slf4j
@Service @Service
@Transactional @Transactional
public class DatasetFileApplicationService { public class DatasetFileApplicationService {
private static final String PDF_FILE_TYPE = "pdf"; private static final String PDF_FILE_TYPE = "pdf";
private static final String DOC_FILE_TYPE = "doc"; private static final String DOC_FILE_TYPE = "doc";
private static final String DOCX_FILE_TYPE = "docx"; private static final String DOCX_FILE_TYPE = "docx";
private static final Set<String> DOCUMENT_TEXT_FILE_TYPES = Set.of(PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE); private static final Set<String> DOCUMENT_TEXT_FILE_TYPES = Set.of(PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE);
private final DatasetFileRepository datasetFileRepository; private final DatasetFileRepository datasetFileRepository;
private final DatasetRepository datasetRepository; private final DatasetRepository datasetRepository;
private final FileService fileService; private final FileService fileService;
private final PdfTextExtractAsyncService pdfTextExtractAsyncService; private final PdfTextExtractAsyncService pdfTextExtractAsyncService;
@Value("${datamate.data-management.base-path:/dataset}") @Value("${datamate.data-management.base-path:/dataset}")
private String datasetBasePath; private String datasetBasePath;
@@ -83,27 +83,62 @@ public class DatasetFileApplicationService {
@Value("${datamate.data-management.file.duplicate:COVER}") @Value("${datamate.data-management.file.duplicate:COVER}")
private DuplicateMethod duplicateMethod; private DuplicateMethod duplicateMethod;
@Autowired @Autowired
public DatasetFileApplicationService(DatasetFileRepository datasetFileRepository, public DatasetFileApplicationService(DatasetFileRepository datasetFileRepository,
DatasetRepository datasetRepository, DatasetRepository datasetRepository,
FileService fileService, FileService fileService,
PdfTextExtractAsyncService pdfTextExtractAsyncService) { PdfTextExtractAsyncService pdfTextExtractAsyncService) {
this.datasetFileRepository = datasetFileRepository; this.datasetFileRepository = datasetFileRepository;
this.datasetRepository = datasetRepository; this.datasetRepository = datasetRepository;
this.fileService = fileService; this.fileService = fileService;
this.pdfTextExtractAsyncService = pdfTextExtractAsyncService; this.pdfTextExtractAsyncService = pdfTextExtractAsyncService;
} }
/** /**
* 获取数据集文件列表 * 获取数据集文件列表
*/ */
@Transactional(readOnly = true) @Transactional(readOnly = true)
public PagedResponse<DatasetFile> getDatasetFiles(String datasetId, String fileType, String status, String name, public PagedResponse<DatasetFile> getDatasetFiles(String datasetId, String fileType, String status, String name,
Boolean hasAnnotation, PagingQuery pagingQuery) { Boolean hasAnnotation, PagingQuery pagingQuery) {
IPage<DatasetFile> page = new Page<>(pagingQuery.getPage(), pagingQuery.getSize()); return getDatasetFiles(datasetId, fileType, status, name, hasAnnotation, false, pagingQuery);
IPage<DatasetFile> files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, hasAnnotation, page); }
return PagedResponse.of(files);
} /**
* 获取数据集文件列表,支持排除已被转换为TXT的源文档文件
*
* @param datasetId 数据集ID
* @param fileType 文件类型过滤
* @param status 状态过滤
* @param name 文件名模糊查询
* @param hasAnnotation 是否有标注
* @param excludeSourceDocuments 是否排除已被转换为TXT的源文档(PDF/DOC/DOCX)
* @param pagingQuery 分页参数
* @return 分页文件列表
*/
@Transactional(readOnly = true)
public PagedResponse<DatasetFile> getDatasetFiles(String datasetId, String fileType, String status, String name,
Boolean hasAnnotation, boolean excludeSourceDocuments, PagingQuery pagingQuery) {
IPage<DatasetFile> page = new Page<>(pagingQuery.getPage(), pagingQuery.getSize());
IPage<DatasetFile> files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, hasAnnotation, page);
if (excludeSourceDocuments) {
// 查询所有作为衍生TXT文件源的文档文件ID
List<String> sourceFileIds = datasetFileRepository.findSourceFileIdsWithDerivedFiles(datasetId);
if (!sourceFileIds.isEmpty()) {
// 过滤掉源文件
List<DatasetFile> filteredRecords = files.getRecords().stream()
.filter(file -> !sourceFileIds.contains(file.getId()))
.collect(Collectors.toList());
// 重新构建分页结果
Page<DatasetFile> filteredPage = new Page<>(files.getCurrent(), files.getSize(), files.getTotal());
filteredPage.setRecords(filteredRecords);
return PagedResponse.of(filteredPage);
}
}
return PagedResponse.of(files);
}
/** /**
* 获取数据集文件列表 * 获取数据集文件列表
@@ -333,11 +368,11 @@ public class DatasetFileApplicationService {
* @return 请求id * @return 请求id
*/ */
@Transactional @Transactional
public String preUpload(UploadFilesPreRequest chunkUploadRequest, String datasetId) { public String preUpload(UploadFilesPreRequest chunkUploadRequest, String datasetId) {
Dataset dataset = datasetRepository.getById(datasetId); Dataset dataset = datasetRepository.getById(datasetId);
if (Objects.isNull(dataset)) { if (Objects.isNull(dataset)) {
throw BusinessException.of(DataManagementErrorCode.DATASET_NOT_FOUND); throw BusinessException.of(DataManagementErrorCode.DATASET_NOT_FOUND);
} }
// 构建上传路径,如果有 prefix 则追加到路径中 // 构建上传路径,如果有 prefix 则追加到路径中
String prefix = Optional.ofNullable(chunkUploadRequest.getPrefix()).orElse("").trim(); String prefix = Optional.ofNullable(chunkUploadRequest.getPrefix()).orElse("").trim();
@@ -346,13 +381,13 @@ public class DatasetFileApplicationService {
prefix = prefix.substring(1); prefix = prefix.substring(1);
} }
String uploadPath = dataset.getPath(); String uploadPath = dataset.getPath();
if (uploadPath == null || uploadPath.isBlank()) { if (uploadPath == null || uploadPath.isBlank()) {
uploadPath = datasetBasePath + File.separator + datasetId; uploadPath = datasetBasePath + File.separator + datasetId;
} }
if (!prefix.isEmpty()) { if (!prefix.isEmpty()) {
uploadPath = uploadPath + File.separator + prefix.replace("/", File.separator); uploadPath = uploadPath + File.separator + prefix.replace("/", File.separator);
} }
ChunkUploadPreRequest request = ChunkUploadPreRequest.builder().build(); ChunkUploadPreRequest request = ChunkUploadPreRequest.builder().build();
request.setUploadPath(uploadPath); request.setUploadPath(uploadPath);
@@ -414,24 +449,24 @@ public class DatasetFileApplicationService {
for (FileUploadResult file : unpacked) { for (FileUploadResult file : unpacked) {
File savedFile = file.getSavedFile(); File savedFile = file.getSavedFile();
LocalDateTime currentTime = LocalDateTime.now(); LocalDateTime currentTime = LocalDateTime.now();
DatasetFile datasetFile = DatasetFile.builder() DatasetFile datasetFile = DatasetFile.builder()
.id(UUID.randomUUID().toString()) .id(UUID.randomUUID().toString())
.datasetId(datasetId) .datasetId(datasetId)
.fileSize(savedFile.length()) .fileSize(savedFile.length())
.uploadTime(currentTime) .uploadTime(currentTime)
.lastAccessTime(currentTime) .lastAccessTime(currentTime)
.fileName(file.getFileName()) .fileName(file.getFileName())
.filePath(savedFile.getPath()) .filePath(savedFile.getPath())
.fileType(AnalyzerUtils.getExtension(file.getFileName())) .fileType(AnalyzerUtils.getExtension(file.getFileName()))
.build(); .build();
setDatasetFileId(datasetFile, dataset); setDatasetFileId(datasetFile, dataset);
datasetFileRepository.saveOrUpdate(datasetFile); datasetFileRepository.saveOrUpdate(datasetFile);
dataset.addFile(datasetFile); dataset.addFile(datasetFile);
triggerPdfTextExtraction(dataset, datasetFile); triggerPdfTextExtraction(dataset, datasetFile);
} }
dataset.active(); dataset.active();
datasetRepository.updateById(dataset); datasetRepository.updateById(dataset);
} }
/** /**
* 在数据集下创建子目录 * 在数据集下创建子目录
@@ -697,29 +732,29 @@ public class DatasetFileApplicationService {
dataset.addFile(datasetFile); dataset.addFile(datasetFile);
copiedFiles.add(datasetFile); copiedFiles.add(datasetFile);
} }
datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100); datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
dataset.active(); dataset.active();
datasetRepository.updateById(dataset); datasetRepository.updateById(dataset);
CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset)); CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset));
return copiedFiles; return copiedFiles;
} }
private void copyFilesToDatasetDir(List<String> sourcePaths, Dataset dataset) { private void copyFilesToDatasetDir(List<String> sourcePaths, Dataset dataset) {
for (String sourcePath : sourcePaths) { for (String sourcePath : sourcePaths) {
Path sourceFilePath = Paths.get(sourcePath); Path sourceFilePath = Paths.get(sourcePath);
Path targetFilePath = Paths.get(dataset.getPath(), sourceFilePath.getFileName().toString()); Path targetFilePath = Paths.get(dataset.getPath(), sourceFilePath.getFileName().toString());
try { try {
Files.createDirectories(Path.of(dataset.getPath())); Files.createDirectories(Path.of(dataset.getPath()));
Files.copy(sourceFilePath, targetFilePath); Files.copy(sourceFilePath, targetFilePath);
DatasetFile datasetFile = datasetFileRepository.findByDatasetIdAndFileName( DatasetFile datasetFile = datasetFileRepository.findByDatasetIdAndFileName(
dataset.getId(), dataset.getId(),
sourceFilePath.getFileName().toString() sourceFilePath.getFileName().toString()
); );
triggerPdfTextExtraction(dataset, datasetFile); triggerPdfTextExtraction(dataset, datasetFile);
} catch (IOException e) { } catch (IOException e) {
log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e); log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
} }
} }
} }
/** /**
@@ -765,30 +800,30 @@ public class DatasetFileApplicationService {
.lastAccessTime(currentTime) .lastAccessTime(currentTime)
.metadata(metadata) .metadata(metadata)
.build(); .build();
setDatasetFileId(datasetFile, dataset); setDatasetFileId(datasetFile, dataset);
dataset.addFile(datasetFile); dataset.addFile(datasetFile);
addedFiles.add(datasetFile); addedFiles.add(datasetFile);
triggerPdfTextExtraction(dataset, datasetFile); triggerPdfTextExtraction(dataset, datasetFile);
} }
datasetFileRepository.saveOrUpdateBatch(addedFiles, 100); datasetFileRepository.saveOrUpdateBatch(addedFiles, 100);
dataset.active(); dataset.active();
datasetRepository.updateById(dataset); datasetRepository.updateById(dataset);
// Note: addFilesToDataset only creates DB records, no file system operations // Note: addFilesToDataset only creates DB records, no file system operations
// If file copy is needed, use copyFilesToDatasetDir endpoint instead // If file copy is needed, use copyFilesToDatasetDir endpoint instead
return addedFiles; return addedFiles;
} }
private void triggerPdfTextExtraction(Dataset dataset, DatasetFile datasetFile) { private void triggerPdfTextExtraction(Dataset dataset, DatasetFile datasetFile) {
if (dataset == null || datasetFile == null) { if (dataset == null || datasetFile == null) {
return; return;
} }
if (dataset.getDatasetType() != DatasetType.TEXT) { if (dataset.getDatasetType() != DatasetType.TEXT) {
return; return;
} }
String fileType = datasetFile.getFileType(); String fileType = datasetFile.getFileType();
if (fileType == null || !DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT))) { if (fileType == null || !DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT))) {
return; return;
} }
pdfTextExtractAsyncService.extractPdfText(dataset.getId(), datasetFile.getId()); pdfTextExtractAsyncService.extractPdfText(dataset.getId(), datasetFile.getId());
} }
} }

View File

@@ -29,4 +29,13 @@ public interface DatasetFileMapper extends BaseMapper<DatasetFile> {
int updateFilePathPrefix(@Param("datasetId") String datasetId, int updateFilePathPrefix(@Param("datasetId") String datasetId,
@Param("oldPrefix") String oldPrefix, @Param("oldPrefix") String oldPrefix,
@Param("newPrefix") String newPrefix); @Param("newPrefix") String newPrefix);
/**
 * Finds the IDs of all files in a dataset that are the source of a derived
 * file, i.e. IDs referenced by another row's metadata
 * {@code derived_from_file_id} field.
 *
 * @param datasetId dataset ID
 * @return distinct source file IDs
 */
List<String> findSourceFileIdsWithDerivedFiles(@Param("datasetId") String datasetId);
} }

View File

@@ -27,4 +27,13 @@ public interface DatasetFileRepository extends IRepository<DatasetFile> {
Boolean hasAnnotation, IPage<DatasetFile> page); Boolean hasAnnotation, IPage<DatasetFile> page);
int updateFilePathPrefix(String datasetId, String oldPrefix, String newPrefix); int updateFilePathPrefix(String datasetId, String oldPrefix, String newPrefix);
/**
 * Finds the IDs of all files in a dataset that serve as the source of a
 * derived file (rows whose metadata contains {@code derived_from_file_id}
 * point back at these IDs).
 *
 * @param datasetId dataset ID
 * @return source file IDs
 */
List<String> findSourceFileIdsWithDerivedFiles(String datasetId);
} }

View File

@@ -64,4 +64,11 @@ public class DatasetFileRepositoryImpl extends CrudRepository<DatasetFileMapper,
public int updateFilePathPrefix(String datasetId, String oldPrefix, String newPrefix) { public int updateFilePathPrefix(String datasetId, String oldPrefix, String newPrefix) {
return datasetFileMapper.updateFilePathPrefix(datasetId, oldPrefix, newPrefix); return datasetFileMapper.updateFilePathPrefix(datasetId, oldPrefix, newPrefix);
} }
@Override
public List<String> findSourceFileIdsWithDerivedFiles(String datasetId) {
    // Delegates to the MyBatis mapper; the XML statement extracts the
    // derived_from_file_id values from each row's JSON metadata column.
    return datasetFileMapper.findSourceFileIdsWithDerivedFiles(datasetId);
}
} }

View File

@@ -19,12 +19,12 @@ import jakarta.validation.Valid;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.Resource; import org.springframework.core.io.Resource;
import org.springframework.http.HttpHeaders; import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpStatus; import org.springframework.http.HttpStatus;
import org.springframework.http.MediaType; import org.springframework.http.MediaType;
import org.springframework.http.MediaTypeFactory; import org.springframework.http.MediaTypeFactory;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.*; import org.springframework.web.bind.annotation.*;
import java.util.List; import java.util.List;
@@ -43,24 +43,26 @@ public class DatasetFileController {
this.datasetFileApplicationService = datasetFileApplicationService; this.datasetFileApplicationService = datasetFileApplicationService;
} }
@GetMapping @GetMapping
public Response<PagedResponse<DatasetFile>> getDatasetFiles( public Response<PagedResponse<DatasetFile>> getDatasetFiles(
@PathVariable("datasetId") String datasetId, @PathVariable("datasetId") String datasetId,
@RequestParam(value = "isWithDirectory", required = false) boolean isWithDirectory, @RequestParam(value = "isWithDirectory", required = false) boolean isWithDirectory,
@RequestParam(value = "page", required = false, defaultValue = "0") Integer page, @RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
@RequestParam(value = "size", required = false, defaultValue = "20") Integer size, @RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
@RequestParam(value = "prefix", required = false, defaultValue = "") String prefix, @RequestParam(value = "prefix", required = false, defaultValue = "") String prefix,
@RequestParam(value = "status", required = false) String status, @RequestParam(value = "status", required = false) String status,
@RequestParam(value = "hasAnnotation", required = false) Boolean hasAnnotation) { @RequestParam(value = "hasAnnotation", required = false) Boolean hasAnnotation,
PagingQuery pagingQuery = new PagingQuery(page, size); @RequestParam(value = "excludeSourceDocuments", required = false, defaultValue = "false") Boolean excludeSourceDocuments) {
PagedResponse<DatasetFile> filesPage; PagingQuery pagingQuery = new PagingQuery(page, size);
if (isWithDirectory) { PagedResponse<DatasetFile> filesPage;
filesPage = datasetFileApplicationService.getDatasetFilesWithDirectory(datasetId, prefix, pagingQuery); if (isWithDirectory) {
} else { filesPage = datasetFileApplicationService.getDatasetFilesWithDirectory(datasetId, prefix, pagingQuery);
filesPage = datasetFileApplicationService.getDatasetFiles(datasetId, null, status, null, hasAnnotation, pagingQuery); } else {
} filesPage = datasetFileApplicationService.getDatasetFiles(datasetId, null, status, null, hasAnnotation,
return Response.ok(filesPage); Boolean.TRUE.equals(excludeSourceDocuments), pagingQuery);
} }
return Response.ok(filesPage);
}
@GetMapping("/{fileId}") @GetMapping("/{fileId}")
public ResponseEntity<Response<DatasetFileResponse>> getDatasetFileById( public ResponseEntity<Response<DatasetFileResponse>> getDatasetFileById(
@@ -86,10 +88,10 @@ public class DatasetFileController {
} }
} }
@IgnoreResponseWrap @IgnoreResponseWrap
@GetMapping(value = "/{fileId}/download", produces = MediaType.APPLICATION_OCTET_STREAM_VALUE + ";charset=UTF-8") @GetMapping(value = "/{fileId}/download", produces = MediaType.APPLICATION_OCTET_STREAM_VALUE + ";charset=UTF-8")
public ResponseEntity<Resource> downloadDatasetFileById(@PathVariable("datasetId") String datasetId, public ResponseEntity<Resource> downloadDatasetFileById(@PathVariable("datasetId") String datasetId,
@PathVariable("fileId") String fileId) { @PathVariable("fileId") String fileId) {
try { try {
DatasetFile datasetFile = datasetFileApplicationService.getDatasetFile(datasetId, fileId); DatasetFile datasetFile = datasetFileApplicationService.getDatasetFile(datasetId, fileId);
Resource resource = datasetFileApplicationService.downloadFile(datasetId, fileId); Resource resource = datasetFileApplicationService.downloadFile(datasetId, fileId);
@@ -103,34 +105,34 @@ public class DatasetFileController {
return ResponseEntity.status(HttpStatus.NOT_FOUND).build(); return ResponseEntity.status(HttpStatus.NOT_FOUND).build();
} catch (Exception e) { } catch (Exception e) {
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).build(); return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).build();
} }
} }
@IgnoreResponseWrap @IgnoreResponseWrap
@GetMapping(value = "/{fileId}/preview", produces = MediaType.ALL_VALUE) @GetMapping(value = "/{fileId}/preview", produces = MediaType.ALL_VALUE)
public ResponseEntity<Resource> previewDatasetFileById(@PathVariable("datasetId") String datasetId, public ResponseEntity<Resource> previewDatasetFileById(@PathVariable("datasetId") String datasetId,
@PathVariable("fileId") String fileId) { @PathVariable("fileId") String fileId) {
try { try {
DatasetFile datasetFile = datasetFileApplicationService.getDatasetFile(datasetId, fileId); DatasetFile datasetFile = datasetFileApplicationService.getDatasetFile(datasetId, fileId);
Resource resource = datasetFileApplicationService.downloadFile(datasetId, fileId); Resource resource = datasetFileApplicationService.downloadFile(datasetId, fileId);
MediaType mediaType = MediaTypeFactory.getMediaType(resource) MediaType mediaType = MediaTypeFactory.getMediaType(resource)
.orElse(MediaType.APPLICATION_OCTET_STREAM); .orElse(MediaType.APPLICATION_OCTET_STREAM);
return ResponseEntity.ok() return ResponseEntity.ok()
.contentType(mediaType) .contentType(mediaType)
.header(HttpHeaders.CONTENT_DISPOSITION, .header(HttpHeaders.CONTENT_DISPOSITION,
"inline; filename=\"" + datasetFile.getFileName() + "\"") "inline; filename=\"" + datasetFile.getFileName() + "\"")
.body(resource); .body(resource);
} catch (IllegalArgumentException e) { } catch (IllegalArgumentException e) {
return ResponseEntity.status(HttpStatus.NOT_FOUND).build(); return ResponseEntity.status(HttpStatus.NOT_FOUND).build();
} catch (Exception e) { } catch (Exception e) {
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).build(); return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).build();
} }
} }
@IgnoreResponseWrap @IgnoreResponseWrap
@GetMapping(value = "/download", produces = MediaType.APPLICATION_OCTET_STREAM_VALUE) @GetMapping(value = "/download", produces = MediaType.APPLICATION_OCTET_STREAM_VALUE)
public void downloadDatasetFileAsZip(@PathVariable("datasetId") String datasetId, HttpServletResponse response) { public void downloadDatasetFileAsZip(@PathVariable("datasetId") String datasetId, HttpServletResponse response) {
datasetFileApplicationService.downloadDatasetFileAsZip(datasetId, response); datasetFileApplicationService.downloadDatasetFileAsZip(datasetId, response);
} }

View File

@@ -102,4 +102,12 @@
WHERE dataset_id = #{datasetId} WHERE dataset_id = #{datasetId}
AND file_path LIKE CONCAT(#{oldPrefix}, '%') AND file_path LIKE CONCAT(#{oldPrefix}, '%')
</update> </update>
<!-- Returns the distinct source-file IDs that derived files point back to:
     rows whose JSON metadata column carries a derived_from_file_id key.
     JSON_UNQUOTE strips the surrounding quotes from the extracted JSON string. -->
<select id="findSourceFileIdsWithDerivedFiles" resultType="java.lang.String">
SELECT DISTINCT JSON_UNQUOTE(JSON_EXTRACT(metadata, '$.derived_from_file_id')) AS source_file_id
FROM t_dm_dataset_files
WHERE dataset_id = #{datasetId}
AND metadata IS NOT NULL
AND JSON_EXTRACT(metadata, '$.derived_from_file_id') IS NOT NULL
</select>
</mapper> </mapper>

View File

@@ -14,15 +14,20 @@ import {
import { formatBytes } from "@/utils/unit"; import { formatBytes } from "@/utils/unit";
import { useDebouncedEffect } from "@/hooks/useDebouncedEffect"; import { useDebouncedEffect } from "@/hooks/useDebouncedEffect";
interface DatasetFileTransferProps interface DatasetFileTransferProps
extends React.HTMLAttributes<HTMLDivElement> { extends React.HTMLAttributes<HTMLDivElement> {
open: boolean; open: boolean;
selectedFilesMap: { [key: string]: DatasetFile }; selectedFilesMap: { [key: string]: DatasetFile };
onSelectedFilesChange: (filesMap: { [key: string]: DatasetFile }) => void; onSelectedFilesChange: (filesMap: { [key: string]: DatasetFile }) => void;
onDatasetSelect?: (dataset: Dataset | null) => void; onDatasetSelect?: (dataset: Dataset | null) => void;
datasetTypeFilter?: DatasetType; datasetTypeFilter?: DatasetType;
hasAnnotationFilter?: boolean; hasAnnotationFilter?: boolean;
} /**
* 是否排除已被转换为TXT的源文档文件(PDF/DOC/DOCX)
* 默认为 true,当 datasetTypeFilter 为 TEXT 时自动启用
*/
excludeSourceDocuments?: boolean;
}
const fileCols = [ const fileCols = [
{ {
@@ -47,15 +52,18 @@ const fileCols = [
]; ];
// Customize Table Transfer // Customize Table Transfer
const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
open, open,
selectedFilesMap, selectedFilesMap,
onSelectedFilesChange, onSelectedFilesChange,
onDatasetSelect, onDatasetSelect,
datasetTypeFilter = DatasetType.TEXT, datasetTypeFilter = DatasetType.TEXT,
hasAnnotationFilter, hasAnnotationFilter,
...props excludeSourceDocuments,
}) => { ...props
}) => {
// 当未指定时,根据数据集类型自动决定是否排除源文档
const shouldExcludeSourceDocuments = excludeSourceDocuments ?? (datasetTypeFilter === DatasetType.TEXT);
const [datasets, setDatasets] = React.useState<Dataset[]>([]); const [datasets, setDatasets] = React.useState<Dataset[]>([]);
const [datasetSearch, setDatasetSearch] = React.useState<string>(""); const [datasetSearch, setDatasetSearch] = React.useState<string>("");
const [datasetPagination, setDatasetPagination] = React.useState<{ const [datasetPagination, setDatasetPagination] = React.useState<{
@@ -66,13 +74,13 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
const [files, setFiles] = React.useState<DatasetFile[]>([]); const [files, setFiles] = React.useState<DatasetFile[]>([]);
const [filesSearch, setFilesSearch] = React.useState<string>(""); const [filesSearch, setFilesSearch] = React.useState<string>("");
const [filesPagination, setFilesPagination] = React.useState<{ const [filesPagination, setFilesPagination] = React.useState<{
current: number; current: number;
pageSize: number; pageSize: number;
total: number; total: number;
}>({ current: 1, pageSize: 10, total: 0 }); }>({ current: 1, pageSize: 10, total: 0 });
const filesPage = filesPagination.current; const filesPage = filesPagination.current;
const filesPageSize = filesPagination.pageSize; const filesPageSize = filesPagination.pageSize;
const [showFiles, setShowFiles] = React.useState<boolean>(false); const [showFiles, setShowFiles] = React.useState<boolean>(false);
const [selectedDataset, setSelectedDataset] = React.useState<Dataset | null>( const [selectedDataset, setSelectedDataset] = React.useState<Dataset | null>(
@@ -108,19 +116,20 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
const fetchFiles = useCallback( const fetchFiles = useCallback(
async ( async (
options?: Partial<{ page: number; pageSize: number; keyword: string }> options?: Partial<{ page: number; pageSize: number; keyword: string }>
) => { ) => {
if (!selectedDataset) return; if (!selectedDataset) return;
const page = options?.page ?? filesPage; const page = options?.page ?? filesPage;
const pageSize = options?.pageSize ?? filesPageSize; const pageSize = options?.pageSize ?? filesPageSize;
const keyword = options?.keyword ?? filesSearch; const keyword = options?.keyword ?? filesSearch;
const { data } = await queryDatasetFilesUsingGet(selectedDataset.id, { const { data } = await queryDatasetFilesUsingGet(selectedDataset.id, {
page, page,
size: pageSize, size: pageSize,
keyword, keyword,
...(hasAnnotationFilter ? { hasAnnotation: true } : {}), ...(hasAnnotationFilter ? { hasAnnotation: true } : {}),
}); ...(shouldExcludeSourceDocuments ? { excludeSourceDocuments: true } : {}),
});
setFiles( setFiles(
(data.content || []).map((item: DatasetFile) => ({ (data.content || []).map((item: DatasetFile) => ({
...item, ...item,
@@ -135,15 +144,16 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
pageSize, pageSize,
total: data.totalElements, total: data.totalElements,
})); }));
}, },
[ [
selectedDataset, selectedDataset,
filesPage, filesPage,
filesPageSize, filesPageSize,
filesSearch, filesSearch,
hasAnnotationFilter, hasAnnotationFilter,
] shouldExcludeSourceDocuments,
); ]
);
useEffect(() => { useEffect(() => {
// 当数据集变化时,重置文件分页并拉取第一页文件,避免额外的循环请求 // 当数据集变化时,重置文件分页并拉取第一页文件,避免额外的循环请求
@@ -178,11 +188,12 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
const allFiles: DatasetFile[] = []; const allFiles: DatasetFile[] = [];
while (true) { while (true) {
const { data } = await queryDatasetFilesUsingGet(selectedDataset.id, { const { data } = await queryDatasetFilesUsingGet(selectedDataset.id, {
page, page,
size: pageSize, size: pageSize,
...(hasAnnotationFilter ? { hasAnnotation: true } : {}), ...(hasAnnotationFilter ? { hasAnnotation: true } : {}),
}); ...(shouldExcludeSourceDocuments ? { excludeSourceDocuments: true } : {}),
});
const content: DatasetFile[] = (data.content || []).map( const content: DatasetFile[] = (data.content || []).map(
(item: DatasetFile) => ({ (item: DatasetFile) => ({
@@ -229,7 +240,7 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
} finally { } finally {
setSelectingAll(false); setSelectingAll(false);
} }
}, [selectedDataset, selectedFilesMap, onSelectedFilesChange, hasAnnotationFilter]); }, [selectedDataset, selectedFilesMap, onSelectedFilesChange, hasAnnotationFilter, shouldExcludeSourceDocuments]);
const toggleSelectFile = (record: DatasetFile) => { const toggleSelectFile = (record: DatasetFile) => {
if (!selectedFilesMap[record.id]) { if (!selectedFilesMap[record.id]) {
@@ -400,10 +411,10 @@ const DatasetFileTransfer: React.FC<DatasetFileTransferProps> = ({
}, },
// 全选 - 改为全选整个数据集而不是当前页 // 全选 - 改为全选整个数据集而不是当前页
onSelectAll: (selected) => { onSelectAll: (selected) => {
if (selected) { if (selected) {
// 点击表头“全选”时,改为一键全选当前数据集的全部文件 // 点击表头“全选”时,改为一键全选当前数据集的全部文件
// 而不是只选中当前页 // 而不是只选中当前页
handleSelectAllInDataset(); handleSelectAllInDataset();
} else { } else {
// 取消表头“全选”时,清空当前已选文件 // 取消表头“全选”时,清空当前已选文件

View File

@@ -275,7 +275,12 @@ export default function CreateAnnotationTask({
} }
setDatasetPreviewLoading(true); setDatasetPreviewLoading(true);
try { try {
const res = await queryDatasetFilesUsingGet(selectedDatasetId, { page: 0, size: 10 }); // 对于文本数据集,排除已被转换为TXT的源文档文件(PDF/DOC/DOCX)
const params: { page: number; size: number; excludeSourceDocuments?: boolean } = { page: 0, size: 10 };
if (isTextDataset) {
params.excludeSourceDocuments = true;
}
const res = await queryDatasetFilesUsingGet(selectedDatasetId, params);
if (res.code === '0' && res.data) { if (res.code === '0' && res.data) {
setDatasetPreviewData((res.data.content || []) as DatasetPreviewFile[]); setDatasetPreviewData((res.data.content || []) as DatasetPreviewFile[]);
setDatasetPreviewVisible(true); setDatasetPreviewVisible(true);