fix(dataset): 解决数据集文件查询时空目录导致异常的问题

- 添加目录存在性检查,避免文件系统访问异常
- 目录不存在时返回空分页结果而不是抛出异常
- 优化数据集刚创建时的用户体验
This commit is contained in:
2026-01-31 19:10:22 +08:00
parent 150af1a741
commit 2f3a8b38d0

View File

@@ -22,16 +22,16 @@ import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckIn
import com.datamate.datamanagement.infrastructure.exception.DataManagementErrorCode; import com.datamate.datamanagement.infrastructure.exception.DataManagementErrorCode;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository; import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository;
import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository; import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository;
import com.datamate.datamanagement.interfaces.converter.DatasetConverter; import com.datamate.datamanagement.interfaces.converter.DatasetConverter;
import com.datamate.datamanagement.interfaces.dto.AddFilesRequest; import com.datamate.datamanagement.interfaces.dto.AddFilesRequest;
import com.datamate.datamanagement.interfaces.dto.CopyFilesRequest; import com.datamate.datamanagement.interfaces.dto.CopyFilesRequest;
import com.datamate.datamanagement.interfaces.dto.CreateDirectoryRequest; import com.datamate.datamanagement.interfaces.dto.CreateDirectoryRequest;
import com.datamate.datamanagement.interfaces.dto.UploadFileRequest; import com.datamate.datamanagement.interfaces.dto.UploadFileRequest;
import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest; import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest;
import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import jakarta.servlet.http.HttpServletResponse; import jakarta.servlet.http.HttpServletResponse;
import lombok.extern.slf4j.Slf4j; import lombok.extern.slf4j.Slf4j;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
@@ -40,24 +40,24 @@ import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value; import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource; import org.springframework.core.io.Resource;
import org.springframework.core.io.UrlResource; import org.springframework.core.io.UrlResource;
import org.springframework.http.HttpHeaders; import org.springframework.http.HttpHeaders;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional; import org.springframework.transaction.annotation.Transactional;
import org.springframework.transaction.support.TransactionSynchronization; import org.springframework.transaction.support.TransactionSynchronization;
import org.springframework.transaction.support.TransactionSynchronizationManager; import org.springframework.transaction.support.TransactionSynchronizationManager;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.nio.file.attribute.BasicFileAttributes; import java.nio.file.attribute.BasicFileAttributes;
import java.time.LocalDateTime; import java.time.LocalDateTime;
import java.time.ZoneId; import java.time.ZoneId;
import java.time.format.DateTimeFormatter; import java.time.format.DateTimeFormatter;
import java.util.*; import java.util.*;
import java.util.concurrent.CompletableFuture; import java.util.concurrent.CompletableFuture;
import java.util.function.Function; import java.util.function.Function;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@@ -70,24 +70,24 @@ import java.util.stream.Stream;
@Service
@Transactional
public class DatasetFileApplicationService {
    // File-extension constants for the "source document" formats.
    private static final String PDF_FILE_TYPE = "pdf";
    private static final String DOC_FILE_TYPE = "doc";
    private static final String DOCX_FILE_TYPE = "docx";
    private static final String XLS_FILE_TYPE = "xls";
    private static final String XLSX_FILE_TYPE = "xlsx";

    // Extensions treated as source documents (originals that derived text files come from);
    // used to filter annotation views and to decide when to trigger text extraction.
    private static final Set<String> DOCUMENT_TEXT_FILE_TYPES = Set.of(
            PDF_FILE_TYPE,
            DOC_FILE_TYPE,
            DOCX_FILE_TYPE,
            XLS_FILE_TYPE,
            XLSX_FILE_TYPE
    );

    // Metadata JSON key whose presence marks a file as derived from another file.
    private static final String DERIVED_METADATA_KEY = "derived_from_file_id";

    private final DatasetFileRepository datasetFileRepository;
    private final DatasetRepository datasetRepository;
    private final FileService fileService;
    private final PdfTextExtractAsyncService pdfTextExtractAsyncService;

    // Root directory under which dataset directories are created on the local filesystem.
    @Value("${datamate.data-management.base-path:/dataset}")
    private String datasetBasePath;
@@ -123,61 +123,65 @@ public class DatasetFileApplicationService {
* @param status 状态过滤 * @param status 状态过滤
* @param name 文件名模糊查询 * @param name 文件名模糊查询
* @param hasAnnotation 是否有标注 * @param hasAnnotation 是否有标注
* @param excludeSourceDocuments 是否排除源文档(PDF/DOC/DOCX/XLS/XLSX) * @param excludeSourceDocuments 是否排除源文档(PDF/DOC/DOCX/XLS/XLSX)
* @param pagingQuery 分页参数 * @param pagingQuery 分页参数
* @return 分页文件列表 * @return 分页文件列表
*/ */
@Transactional(readOnly = true) @Transactional(readOnly = true)
public PagedResponse<DatasetFile> getDatasetFiles(String datasetId, String fileType, String status, String name, public PagedResponse<DatasetFile> getDatasetFiles(String datasetId, String fileType, String status, String name,
Boolean hasAnnotation, boolean excludeSourceDocuments, PagingQuery pagingQuery) { Boolean hasAnnotation, boolean excludeSourceDocuments, PagingQuery pagingQuery) {
IPage<DatasetFile> page = new Page<>(pagingQuery.getPage(), pagingQuery.getSize()); IPage<DatasetFile> page = new Page<>(pagingQuery.getPage(), pagingQuery.getSize());
IPage<DatasetFile> files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, hasAnnotation, page); IPage<DatasetFile> files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, hasAnnotation, page);
if (excludeSourceDocuments) { if (excludeSourceDocuments) {
// 过滤掉源文档文件(PDF/DOC/DOCX/XLS/XLSX),用于标注场景只展示派生文件 // 过滤掉源文档文件(PDF/DOC/DOCX/XLS/XLSX),用于标注场景只展示派生文件
List<DatasetFile> filteredRecords = files.getRecords().stream() List<DatasetFile> filteredRecords = files.getRecords().stream()
.filter(file -> !isSourceDocument(file)) .filter(file -> !isSourceDocument(file))
.collect(Collectors.toList()); .collect(Collectors.toList());
// 重新构建分页结果 // 重新构建分页结果
Page<DatasetFile> filteredPage = new Page<>(files.getCurrent(), files.getSize(), files.getTotal()); Page<DatasetFile> filteredPage = new Page<>(files.getCurrent(), files.getSize(), files.getTotal());
filteredPage.setRecords(filteredRecords); filteredPage.setRecords(filteredRecords);
return PagedResponse.of(filteredPage); return PagedResponse.of(filteredPage);
} }
return PagedResponse.of(files); return PagedResponse.of(files);
} }
/** /**
* 获取数据集文件列表 * 获取数据集文件列表
*/ */
@Transactional(readOnly = true) @Transactional(readOnly = true)
public PagedResponse<DatasetFile> getDatasetFilesWithDirectory(String datasetId, String prefix, boolean excludeDerivedFiles, PagingQuery pagingQuery) { public PagedResponse<DatasetFile> getDatasetFilesWithDirectory(String datasetId, String prefix, boolean excludeDerivedFiles, PagingQuery pagingQuery) {
Dataset dataset = datasetRepository.getById(datasetId); Dataset dataset = datasetRepository.getById(datasetId);
int page = Math.max(pagingQuery.getPage(), 1); int page = Math.max(pagingQuery.getPage(), 1);
int size = pagingQuery.getSize() == null || pagingQuery.getSize() < 0 ? 20 : pagingQuery.getSize(); int size = pagingQuery.getSize() == null || pagingQuery.getSize() < 0 ? 20 : pagingQuery.getSize();
if (dataset == null) { if (dataset == null) {
return PagedResponse.of(new Page<>(page, size)); return PagedResponse.of(new Page<>(page, size));
} }
String datasetPath = dataset.getPath(); String datasetPath = dataset.getPath();
Path queryPath = Path.of(dataset.getPath() + File.separator + prefix); Path queryPath = Path.of(dataset.getPath() + File.separator + prefix);
Map<String, DatasetFile> datasetFilesMap = datasetFileRepository.findAllByDatasetId(datasetId) Map<String, DatasetFile> datasetFilesMap = datasetFileRepository.findAllByDatasetId(datasetId)
.stream().collect(Collectors.toMap(DatasetFile::getFilePath, Function.identity())); .stream().collect(Collectors.toMap(DatasetFile::getFilePath, Function.identity()));
Set<String> derivedFilePaths = excludeDerivedFiles Set<String> derivedFilePaths = excludeDerivedFiles
? datasetFilesMap.values().stream() ? datasetFilesMap.values().stream()
.filter(this::isDerivedFile) .filter(this::isDerivedFile)
.map(DatasetFile::getFilePath) .map(DatasetFile::getFilePath)
.filter(Objects::nonNull) .filter(Objects::nonNull)
.collect(Collectors.toSet()) .collect(Collectors.toSet())
: Collections.emptySet(); : Collections.emptySet();
try (Stream<Path> pathStream = Files.list(queryPath)) { // 如果目录不存在,直接返回空结果(数据集刚创建时目录可能还未生成)
List<Path> allFiles = pathStream if (!Files.exists(queryPath)) {
.filter(path -> path.toString().startsWith(datasetPath)) return new PagedResponse<>(page, size, 0, 0, Collections.emptyList());
.filter(path -> !excludeDerivedFiles || Files.isDirectory(path) || !derivedFilePaths.contains(path.toString())) }
.sorted(Comparator try (Stream<Path> pathStream = Files.list(queryPath)) {
.comparing((Path path) -> !Files.isDirectory(path)) List<Path> allFiles = pathStream
.thenComparing(path -> path.getFileName().toString())) .filter(path -> path.toString().startsWith(datasetPath))
.collect(Collectors.toList()); .filter(path -> !excludeDerivedFiles || Files.isDirectory(path) || !derivedFilePaths.contains(path.toString()))
.sorted(Comparator
.comparing((Path path) -> !Files.isDirectory(path))
.thenComparing(path -> path.getFileName().toString()))
.collect(Collectors.toList());
// 计算分页 // 计算分页
int total = allFiles.size(); int total = allFiles.size();
@@ -195,15 +199,15 @@ public class DatasetFileApplicationService {
List<DatasetFile> datasetFiles = pageData.stream().map(path -> getDatasetFile(path, datasetFilesMap)).toList(); List<DatasetFile> datasetFiles = pageData.stream().map(path -> getDatasetFile(path, datasetFilesMap)).toList();
return new PagedResponse<>(page, size, total, totalPages, datasetFiles); return new PagedResponse<>(page, size, total, totalPages, datasetFiles);
} catch (IOException e) { } catch (IOException e) {
log.error("list dataset path error", e); log.error("list dataset path error", e);
return PagedResponse.of(new Page<>(page, size)); return PagedResponse.of(new Page<>(page, size));
} }
} }
private DatasetFile getDatasetFile(Path path, Map<String, DatasetFile> datasetFilesMap) { private DatasetFile getDatasetFile(Path path, Map<String, DatasetFile> datasetFilesMap) {
DatasetFile datasetFile = new DatasetFile(); DatasetFile datasetFile = new DatasetFile();
LocalDateTime localDateTime = LocalDateTime.now(); LocalDateTime localDateTime = LocalDateTime.now();
try { try {
localDateTime = Files.getLastModifiedTime(path).toInstant().atZone(ZoneId.systemDefault()).toLocalDateTime(); localDateTime = Files.getLastModifiedTime(path).toInstant().atZone(ZoneId.systemDefault()).toLocalDateTime();
} catch (IOException e) { } catch (IOException e) {
@@ -254,37 +258,37 @@ public class DatasetFileApplicationService {
datasetFile = exist; datasetFile = exist;
} }
} }
return datasetFile; return datasetFile;
} }
private boolean isSourceDocument(DatasetFile datasetFile) { private boolean isSourceDocument(DatasetFile datasetFile) {
if (datasetFile == null) { if (datasetFile == null) {
return false; return false;
} }
String fileType = datasetFile.getFileType(); String fileType = datasetFile.getFileType();
if (fileType == null || fileType.isBlank()) { if (fileType == null || fileType.isBlank()) {
return false; return false;
} }
return DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT)); return DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT));
} }
private boolean isDerivedFile(DatasetFile datasetFile) { private boolean isDerivedFile(DatasetFile datasetFile) {
if (datasetFile == null) { if (datasetFile == null) {
return false; return false;
} }
String metadata = datasetFile.getMetadata(); String metadata = datasetFile.getMetadata();
if (metadata == null || metadata.isBlank()) { if (metadata == null || metadata.isBlank()) {
return false; return false;
} }
try { try {
ObjectMapper mapper = new ObjectMapper(); ObjectMapper mapper = new ObjectMapper();
Map<String, Object> metadataMap = mapper.readValue(metadata, new TypeReference<Map<String, Object>>() {}); Map<String, Object> metadataMap = mapper.readValue(metadata, new TypeReference<Map<String, Object>>() {});
return metadataMap.get(DERIVED_METADATA_KEY) != null; return metadataMap.get(DERIVED_METADATA_KEY) != null;
} catch (Exception e) { } catch (Exception e) {
log.debug("Failed to parse dataset file metadata for derived detection: {}", datasetFile.getId(), e); log.debug("Failed to parse dataset file metadata for derived detection: {}", datasetFile.getId(), e);
return false; return false;
} }
} }
/** /**
* 获取文件详情 * 获取文件详情
@@ -740,17 +744,17 @@ public class DatasetFileApplicationService {
} }
} }
/** /**
* 复制文件到数据集目录 * 复制文件到数据集目录
* *
* @param datasetId 数据集id * @param datasetId 数据集id
* @param req 复制文件请求 * @param req 复制文件请求
* @return 复制的文件列表 * @return 复制的文件列表
*/ */
@Transactional @Transactional
public List<DatasetFile> copyFilesToDatasetDir(String datasetId, CopyFilesRequest req) { public List<DatasetFile> copyFilesToDatasetDir(String datasetId, CopyFilesRequest req) {
Dataset dataset = datasetRepository.getById(datasetId); Dataset dataset = datasetRepository.getById(datasetId);
BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND); BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
List<DatasetFile> copiedFiles = new ArrayList<>(); List<DatasetFile> copiedFiles = new ArrayList<>();
List<DatasetFile> existDatasetFiles = datasetFileRepository.findAllByDatasetId(datasetId); List<DatasetFile> existDatasetFiles = datasetFileRepository.findAllByDatasetId(datasetId);
dataset.setFiles(existDatasetFiles); dataset.setFiles(existDatasetFiles);
@@ -780,80 +784,80 @@ public class DatasetFileApplicationService {
datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100); datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
dataset.active(); dataset.active();
datasetRepository.updateById(dataset); datasetRepository.updateById(dataset);
CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset)); CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset));
return copiedFiles; return copiedFiles;
} }
    /**
     * Copies files into the dataset directory while preserving their paths relative to
     * the data-source root (used for data-source imports). Records are persisted first;
     * the physical copy runs asynchronously after this method returns.
     *
     * @param datasetId   dataset id
     * @param sourceRoot  root directory of the data source
     * @param sourcePaths source file paths to import
     * @return the dataset file records created for the accepted files
     */
    @Transactional
    public List<DatasetFile> copyFilesToDatasetDirWithSourceRoot(String datasetId, Path sourceRoot, List<String> sourcePaths) {
        Dataset dataset = datasetRepository.getById(datasetId);
        BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
        Path normalizedRoot = sourceRoot.toAbsolutePath().normalize();
        List<DatasetFile> copiedFiles = new ArrayList<>();
        List<DatasetFile> existDatasetFiles = datasetFileRepository.findAllByDatasetId(datasetId);
        dataset.setFiles(existDatasetFiles);
        // LinkedHashMap keeps insertion order so files are copied in the order requested.
        Map<String, DatasetFile> copyTargets = new LinkedHashMap<>();
        for (String sourceFilePath : sourcePaths) {
            if (sourceFilePath == null || sourceFilePath.isBlank()) {
                continue;
            }
            Path sourcePath = Paths.get(sourceFilePath).toAbsolutePath().normalize();
            // Path-traversal guard: silently skip anything outside the declared source root.
            if (!sourcePath.startsWith(normalizedRoot)) {
                log.warn("Source file path is out of root: {}", sourceFilePath);
                continue;
            }
            if (!Files.exists(sourcePath) || !Files.isRegularFile(sourcePath)) {
                log.warn("Source file does not exist or is not a regular file: {}", sourceFilePath);
                continue;
            }
            Path relativePath = normalizedRoot.relativize(sourcePath);
            String fileName = sourcePath.getFileName().toString();
            File sourceFile = sourcePath.toFile();
            LocalDateTime currentTime = LocalDateTime.now();
            // Target keeps the source-relative layout under the dataset directory.
            Path targetPath = Paths.get(dataset.getPath(), relativePath.toString());
            DatasetFile datasetFile = DatasetFile.builder()
                    .id(UUID.randomUUID().toString())
                    .datasetId(datasetId)
                    .fileName(fileName)
                    .fileType(AnalyzerUtils.getExtension(fileName))
                    .fileSize(sourceFile.length())
                    .filePath(targetPath.toString())
                    .uploadTime(currentTime)
                    .lastAccessTime(currentTime)
                    .build();
            setDatasetFileId(datasetFile, dataset);
            dataset.addFile(datasetFile);
            copiedFiles.add(datasetFile);
            copyTargets.put(sourceFilePath, datasetFile);
        }
        if (copiedFiles.isEmpty()) {
            return copiedFiles;
        }
        datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
        dataset.active();
        datasetRepository.updateById(dataset);
        // NOTE(review): the async copy runs on the common pool, outside this transaction —
        // the DB records exist even if the copy later fails. Confirm this eventual
        // consistency is intended.
        CompletableFuture.runAsync(() -> copyFilesToDatasetDirWithRelativePath(copyTargets, dataset, normalizedRoot));
        return copiedFiles;
    }
private void copyFilesToDatasetDir(List<String> sourcePaths, Dataset dataset) { private void copyFilesToDatasetDir(List<String> sourcePaths, Dataset dataset) {
for (String sourcePath : sourcePaths) { for (String sourcePath : sourcePaths) {
Path sourceFilePath = Paths.get(sourcePath); Path sourceFilePath = Paths.get(sourcePath);
Path targetFilePath = Paths.get(dataset.getPath(), sourceFilePath.getFileName().toString()); Path targetFilePath = Paths.get(dataset.getPath(), sourceFilePath.getFileName().toString());
try { try {
Files.createDirectories(Path.of(dataset.getPath())); Files.createDirectories(Path.of(dataset.getPath()));
Files.copy(sourceFilePath, targetFilePath); Files.copy(sourceFilePath, targetFilePath);
DatasetFile datasetFile = datasetFileRepository.findByDatasetIdAndFileName( DatasetFile datasetFile = datasetFileRepository.findByDatasetIdAndFileName(
@@ -863,39 +867,39 @@ public class DatasetFileApplicationService {
triggerPdfTextExtraction(dataset, datasetFile); triggerPdfTextExtraction(dataset, datasetFile);
} catch (IOException e) { } catch (IOException e) {
log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e); log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
} }
} }
} }
    /**
     * Asynchronously copies each staged source file to its target under the dataset root,
     * preserving the path relative to {@code sourceRoot}. Failures are logged per file and
     * do not abort the remaining copies.
     *
     * @param copyTargets map of source path -> previously persisted dataset file record
     * @param dataset     target dataset
     * @param sourceRoot  root the relative layout is computed against
     */
    private void copyFilesToDatasetDirWithRelativePath(
            Map<String, DatasetFile> copyTargets,
            Dataset dataset,
            Path sourceRoot
    ) {
        Path datasetRoot = Paths.get(dataset.getPath()).toAbsolutePath().normalize();
        Path normalizedRoot = sourceRoot.toAbsolutePath().normalize();
        for (Map.Entry<String, DatasetFile> entry : copyTargets.entrySet()) {
            Path sourcePath = Paths.get(entry.getKey()).toAbsolutePath().normalize();
            // Containment is re-checked here because this runs asynchronously, detached
            // from the validation done when the records were staged.
            if (!sourcePath.startsWith(normalizedRoot)) {
                log.warn("Source file path is out of root: {}", sourcePath);
                continue;
            }
            Path relativePath = normalizedRoot.relativize(sourcePath);
            Path targetFilePath = datasetRoot.resolve(relativePath).normalize();
            // Path-traversal guard on the target side as well.
            if (!targetFilePath.startsWith(datasetRoot)) {
                log.warn("Target file path is out of dataset path: {}", targetFilePath);
                continue;
            }
            try {
                Files.createDirectories(targetFilePath.getParent());
                // NOTE(review): Files.copy without REPLACE_EXISTING throws if the target
                // already exists, and the DB record was already saved — re-imports of the
                // same file will land in the catch below. Confirm this is intended.
                Files.copy(sourcePath, targetFilePath);
                triggerPdfTextExtraction(dataset, entry.getValue());
            } catch (IOException e) {
                log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
            }
        }
    }
/** /**
* 添加文件到数据集(仅创建数据库记录,不执行文件系统操作) * 添加文件到数据集(仅创建数据库记录,不执行文件系统操作)
* *
@@ -952,31 +956,31 @@ public class DatasetFileApplicationService {
return addedFiles; return addedFiles;
} }
private void triggerPdfTextExtraction(Dataset dataset, DatasetFile datasetFile) { private void triggerPdfTextExtraction(Dataset dataset, DatasetFile datasetFile) {
if (dataset == null || datasetFile == null) { if (dataset == null || datasetFile == null) {
return; return;
} }
if (dataset.getDatasetType() != DatasetType.TEXT) { if (dataset.getDatasetType() != DatasetType.TEXT) {
return; return;
} }
String fileType = datasetFile.getFileType(); String fileType = datasetFile.getFileType();
if (fileType == null || !DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT))) { if (fileType == null || !DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT))) {
return; return;
} }
String datasetId = dataset.getId(); String datasetId = dataset.getId();
String fileId = datasetFile.getId(); String fileId = datasetFile.getId();
if (datasetId == null || fileId == null) { if (datasetId == null || fileId == null) {
return; return;
} }
if (TransactionSynchronizationManager.isSynchronizationActive()) { if (TransactionSynchronizationManager.isSynchronizationActive()) {
TransactionSynchronizationManager.registerSynchronization(new TransactionSynchronization() { TransactionSynchronizationManager.registerSynchronization(new TransactionSynchronization() {
@Override @Override
public void afterCommit() { public void afterCommit() {
pdfTextExtractAsyncService.extractPdfText(datasetId, fileId); pdfTextExtractAsyncService.extractPdfText(datasetId, fileId);
} }
}); });
return; return;
} }
pdfTextExtractAsyncService.extractPdfText(datasetId, fileId); pdfTextExtractAsyncService.extractPdfText(datasetId, fileId);
} }
} }