feat(data-management): 扩展源文档排除功能支持Excel文件类型

- 在后端服务中扩展源文档类型检查,新增对XLS和XLSX文件的支持
- 修改DatasetFileApplicationService中的过滤逻辑,统一处理所有源文档类型
- 新增isSourceDocument和isDerivedFile辅助方法进行文件类型判断
- 更新前端DatasetFileTransfer组件中的注释说明
- 在Python运行时依赖中添加openpyxl和xlrd库以支持Excel文件处理
- 修改标注项目接口中源文档类型的集合定义
- 更新文件操作钩子中的派生文件排除逻辑
This commit is contained in:
2026-01-31 11:30:55 +08:00
parent 6c7ea0c25e
commit b5d7c66240
8 changed files with 210 additions and 119 deletions

View File

@@ -22,15 +22,16 @@ import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckIn
import com.datamate.datamanagement.infrastructure.exception.DataManagementErrorCode;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
import com.datamate.datamanagement.interfaces.dto.AddFilesRequest;
import com.datamate.datamanagement.interfaces.dto.CopyFilesRequest;
import com.datamate.datamanagement.interfaces.dto.CreateDirectoryRequest;
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import jakarta.servlet.http.HttpServletResponse;
import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
import com.datamate.datamanagement.interfaces.dto.AddFilesRequest;
import com.datamate.datamanagement.interfaces.dto.CopyFilesRequest;
import com.datamate.datamanagement.interfaces.dto.CreateDirectoryRequest;
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import jakarta.servlet.http.HttpServletResponse;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
@@ -45,16 +46,16 @@ import org.springframework.transaction.annotation.Transactional;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.attribute.BasicFileAttributes;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.attribute.BasicFileAttributes;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.CompletableFuture;
import java.util.function.Function;
import java.util.stream.Collectors;
@@ -79,11 +80,12 @@ public class DatasetFileApplicationService {
XLS_FILE_TYPE,
XLSX_FILE_TYPE
);
private final DatasetFileRepository datasetFileRepository;
private final DatasetRepository datasetRepository;
private final FileService fileService;
private final PdfTextExtractAsyncService pdfTextExtractAsyncService;
private static final String DERIVED_METADATA_KEY = "derived_from_file_id";
private final DatasetFileRepository datasetFileRepository;
private final DatasetRepository datasetRepository;
private final FileService fileService;
private final PdfTextExtractAsyncService pdfTextExtractAsyncService;
@Value("${datamate.data-management.base-path:/dataset}")
private String datasetBasePath;
@@ -119,57 +121,61 @@ public class DatasetFileApplicationService {
* @param status 状态过滤
* @param name 文件名模糊查询
* @param hasAnnotation 是否有标注
* @param excludeSourceDocuments 是否排除已被转换为TXT的源文档(PDF/DOC/DOCX)
* @param excludeSourceDocuments 是否排除源文档(PDF/DOC/DOCX/XLS/XLSX
* @param pagingQuery 分页参数
* @return 分页文件列表
*/
@Transactional(readOnly = true)
public PagedResponse<DatasetFile> getDatasetFiles(String datasetId, String fileType, String status, String name,
Boolean hasAnnotation, boolean excludeSourceDocuments, PagingQuery pagingQuery) {
IPage<DatasetFile> page = new Page<>(pagingQuery.getPage(), pagingQuery.getSize());
IPage<DatasetFile> files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, hasAnnotation, page);
if (excludeSourceDocuments) {
// 查询所有作为衍生TXT文件源的文档文件ID
List<String> sourceFileIds = datasetFileRepository.findSourceFileIdsWithDerivedFiles(datasetId);
if (!sourceFileIds.isEmpty()) {
// 过滤掉源文件
List<DatasetFile> filteredRecords = files.getRecords().stream()
.filter(file -> !sourceFileIds.contains(file.getId()))
.collect(Collectors.toList());
// 重新构建分页结果
Page<DatasetFile> filteredPage = new Page<>(files.getCurrent(), files.getSize(), files.getTotal());
filteredPage.setRecords(filteredRecords);
return PagedResponse.of(filteredPage);
}
}
return PagedResponse.of(files);
}
public PagedResponse<DatasetFile> getDatasetFiles(String datasetId, String fileType, String status, String name,
Boolean hasAnnotation, boolean excludeSourceDocuments, PagingQuery pagingQuery) {
IPage<DatasetFile> page = new Page<>(pagingQuery.getPage(), pagingQuery.getSize());
IPage<DatasetFile> files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, hasAnnotation, page);
if (excludeSourceDocuments) {
// 过滤掉源文档文件(PDF/DOC/DOCX/XLS/XLSX),用于标注场景只展示派生文件
List<DatasetFile> filteredRecords = files.getRecords().stream()
.filter(file -> !isSourceDocument(file))
.collect(Collectors.toList());
// 重新构建分页结果
Page<DatasetFile> filteredPage = new Page<>(files.getCurrent(), files.getSize(), files.getTotal());
filteredPage.setRecords(filteredRecords);
return PagedResponse.of(filteredPage);
}
return PagedResponse.of(files);
}
/**
* 获取数据集文件列表
*/
@Transactional(readOnly = true)
public PagedResponse<DatasetFile> getDatasetFilesWithDirectory(String datasetId, String prefix, PagingQuery pagingQuery) {
Dataset dataset = datasetRepository.getById(datasetId);
int page = Math.max(pagingQuery.getPage(), 1);
int size = pagingQuery.getSize() == null || pagingQuery.getSize() < 0 ? 20 : pagingQuery.getSize();
if (dataset == null) {
return PagedResponse.of(new Page<>(page, size));
}
String datasetPath = dataset.getPath();
Path queryPath = Path.of(dataset.getPath() + File.separator + prefix);
Map<String, DatasetFile> datasetFilesMap = datasetFileRepository.findAllByDatasetId(datasetId)
.stream().collect(Collectors.toMap(DatasetFile::getFilePath, Function.identity()));
try (Stream<Path> pathStream = Files.list(queryPath)) {
List<Path> allFiles = pathStream
.filter(path -> path.toString().startsWith(datasetPath))
.sorted(Comparator
.comparing((Path path) -> !Files.isDirectory(path))
.thenComparing(path -> path.getFileName().toString()))
.collect(Collectors.toList());
public PagedResponse<DatasetFile> getDatasetFilesWithDirectory(String datasetId, String prefix, boolean excludeDerivedFiles, PagingQuery pagingQuery) {
Dataset dataset = datasetRepository.getById(datasetId);
int page = Math.max(pagingQuery.getPage(), 1);
int size = pagingQuery.getSize() == null || pagingQuery.getSize() < 0 ? 20 : pagingQuery.getSize();
if (dataset == null) {
return PagedResponse.of(new Page<>(page, size));
}
String datasetPath = dataset.getPath();
Path queryPath = Path.of(dataset.getPath() + File.separator + prefix);
Map<String, DatasetFile> datasetFilesMap = datasetFileRepository.findAllByDatasetId(datasetId)
.stream().collect(Collectors.toMap(DatasetFile::getFilePath, Function.identity()));
Set<String> derivedFilePaths = excludeDerivedFiles
? datasetFilesMap.values().stream()
.filter(this::isDerivedFile)
.map(DatasetFile::getFilePath)
.filter(Objects::nonNull)
.collect(Collectors.toSet())
: Collections.emptySet();
try (Stream<Path> pathStream = Files.list(queryPath)) {
List<Path> allFiles = pathStream
.filter(path -> path.toString().startsWith(datasetPath))
.filter(path -> !excludeDerivedFiles || Files.isDirectory(path) || !derivedFilePaths.contains(path.toString()))
.sorted(Comparator
.comparing((Path path) -> !Files.isDirectory(path))
.thenComparing(path -> path.getFileName().toString()))
.collect(Collectors.toList());
// 计算分页
int total = allFiles.size();
@@ -187,15 +193,15 @@ public class DatasetFileApplicationService {
List<DatasetFile> datasetFiles = pageData.stream().map(path -> getDatasetFile(path, datasetFilesMap)).toList();
return new PagedResponse<>(page, size, total, totalPages, datasetFiles);
} catch (IOException e) {
log.error("list dataset path error", e);
return PagedResponse.of(new Page<>(page, size));
}
}
} catch (IOException e) {
log.error("list dataset path error", e);
return PagedResponse.of(new Page<>(page, size));
}
}
private DatasetFile getDatasetFile(Path path, Map<String, DatasetFile> datasetFilesMap) {
DatasetFile datasetFile = new DatasetFile();
LocalDateTime localDateTime = LocalDateTime.now();
private DatasetFile getDatasetFile(Path path, Map<String, DatasetFile> datasetFilesMap) {
DatasetFile datasetFile = new DatasetFile();
LocalDateTime localDateTime = LocalDateTime.now();
try {
localDateTime = Files.getLastModifiedTime(path).toInstant().atZone(ZoneId.systemDefault()).toLocalDateTime();
} catch (IOException e) {
@@ -246,8 +252,37 @@ public class DatasetFileApplicationService {
datasetFile = exist;
}
}
return datasetFile;
}
return datasetFile;
}
private boolean isSourceDocument(DatasetFile datasetFile) {
if (datasetFile == null) {
return false;
}
String fileType = datasetFile.getFileType();
if (fileType == null || fileType.isBlank()) {
return false;
}
return DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT));
}
private boolean isDerivedFile(DatasetFile datasetFile) {
if (datasetFile == null) {
return false;
}
String metadata = datasetFile.getMetadata();
if (metadata == null || metadata.isBlank()) {
return false;
}
try {
ObjectMapper mapper = new ObjectMapper();
Map<String, Object> metadataMap = mapper.readValue(metadata, new TypeReference<Map<String, Object>>() {});
return metadataMap.get(DERIVED_METADATA_KEY) != null;
} catch (Exception e) {
log.debug("Failed to parse dataset file metadata for derived detection: {}", datasetFile.getId(), e);
return false;
}
}
/**
* 获取文件详情

View File

@@ -44,24 +44,30 @@ public class DatasetFileController {
}
@GetMapping
public Response<PagedResponse<DatasetFile>> getDatasetFiles(
@PathVariable("datasetId") String datasetId,
@RequestParam(value = "isWithDirectory", required = false) boolean isWithDirectory,
@RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
@RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
@RequestParam(value = "prefix", required = false, defaultValue = "") String prefix,
@RequestParam(value = "status", required = false) String status,
@RequestParam(value = "hasAnnotation", required = false) Boolean hasAnnotation,
@RequestParam(value = "excludeSourceDocuments", required = false, defaultValue = "false") Boolean excludeSourceDocuments) {
PagingQuery pagingQuery = new PagingQuery(page, size);
PagedResponse<DatasetFile> filesPage;
if (isWithDirectory) {
filesPage = datasetFileApplicationService.getDatasetFilesWithDirectory(datasetId, prefix, pagingQuery);
} else {
filesPage = datasetFileApplicationService.getDatasetFiles(datasetId, null, status, null, hasAnnotation,
Boolean.TRUE.equals(excludeSourceDocuments), pagingQuery);
}
return Response.ok(filesPage);
public Response<PagedResponse<DatasetFile>> getDatasetFiles(
@PathVariable("datasetId") String datasetId,
@RequestParam(value = "isWithDirectory", required = false) boolean isWithDirectory,
@RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
@RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
@RequestParam(value = "prefix", required = false, defaultValue = "") String prefix,
@RequestParam(value = "status", required = false) String status,
@RequestParam(value = "hasAnnotation", required = false) Boolean hasAnnotation,
@RequestParam(value = "excludeSourceDocuments", required = false, defaultValue = "false") Boolean excludeSourceDocuments,
@RequestParam(value = "excludeDerivedFiles", required = false, defaultValue = "false") Boolean excludeDerivedFiles) {
PagingQuery pagingQuery = new PagingQuery(page, size);
PagedResponse<DatasetFile> filesPage;
if (isWithDirectory) {
filesPage = datasetFileApplicationService.getDatasetFilesWithDirectory(
datasetId,
prefix,
Boolean.TRUE.equals(excludeDerivedFiles),
pagingQuery
);
} else {
filesPage = datasetFileApplicationService.getDatasetFiles(datasetId, null, status, null, hasAnnotation,
Boolean.TRUE.equals(excludeSourceDocuments), pagingQuery);
}
return Response.ok(filesPage);
}
@GetMapping("/{fileId}")