From 2f3a8b38d010adafc0d7d4439d981dac9c5e5398 Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Sat, 31 Jan 2026 19:10:22 +0800 Subject: [PATCH] =?UTF-8?q?fix(dataset):=20=E8=A7=A3=E5=86=B3=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E9=9B=86=E6=96=87=E4=BB=B6=E6=9F=A5=E8=AF=A2=E6=97=B6?= =?UTF-8?q?=E7=A9=BA=E7=9B=AE=E5=BD=95=E5=AF=BC=E8=87=B4=E5=BC=82=E5=B8=B8?= =?UTF-8?q?=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加目录存在性检查,避免文件系统访问异常 - 目录不存在时返回空分页结果而不是抛出异常 - 优化数据集刚创建时的用户体验 --- .../DatasetFileApplicationService.java | 552 +++++++++--------- 1 file changed, 278 insertions(+), 274 deletions(-) diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java index 040ec6c..7c65d24 100644 --- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java +++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java @@ -22,16 +22,16 @@ import com.datamate.datamanagement.domain.model.dataset.DatasetFileUploadCheckIn import com.datamate.datamanagement.infrastructure.exception.DataManagementErrorCode; import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetFileRepository; import com.datamate.datamanagement.infrastructure.persistence.repository.DatasetRepository; -import com.datamate.datamanagement.interfaces.converter.DatasetConverter; -import com.datamate.datamanagement.interfaces.dto.AddFilesRequest; -import com.datamate.datamanagement.interfaces.dto.CopyFilesRequest; -import com.datamate.datamanagement.interfaces.dto.CreateDirectoryRequest; -import com.datamate.datamanagement.interfaces.dto.UploadFileRequest; -import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest; -import com.fasterxml.jackson.core.type.TypeReference; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import jakarta.servlet.http.HttpServletResponse; +import com.datamate.datamanagement.interfaces.converter.DatasetConverter; +import com.datamate.datamanagement.interfaces.dto.AddFilesRequest; +import com.datamate.datamanagement.interfaces.dto.CopyFilesRequest; +import com.datamate.datamanagement.interfaces.dto.CreateDirectoryRequest; +import com.datamate.datamanagement.interfaces.dto.UploadFileRequest; +import com.datamate.datamanagement.interfaces.dto.UploadFilesPreRequest; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import jakarta.servlet.http.HttpServletResponse; import lombok.extern.slf4j.Slf4j; import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; @@ -40,24 +40,24 @@ import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.core.io.Resource; import org.springframework.core.io.UrlResource; -import org.springframework.http.HttpHeaders; -import org.springframework.stereotype.Service; -import org.springframework.transaction.annotation.Transactional; -import org.springframework.transaction.support.TransactionSynchronization; -import org.springframework.transaction.support.TransactionSynchronizationManager; +import org.springframework.http.HttpHeaders; +import org.springframework.stereotype.Service; +import org.springframework.transaction.annotation.Transactional; +import org.springframework.transaction.support.TransactionSynchronization; +import org.springframework.transaction.support.TransactionSynchronizationManager; import java.io.File; import java.io.IOException; -import java.io.InputStream; -import java.net.MalformedURLException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.nio.file.attribute.BasicFileAttributes; -import java.time.LocalDateTime; -import java.time.ZoneId; -import java.time.format.DateTimeFormatter; -import java.util.*; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.attribute.BasicFileAttributes; +import java.time.LocalDateTime; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import java.util.*; import java.util.concurrent.CompletableFuture; import java.util.function.Function; import java.util.stream.Collectors; @@ -70,24 +70,24 @@ import java.util.stream.Stream; @Service @Transactional public class DatasetFileApplicationService { - private static final String PDF_FILE_TYPE = "pdf"; - private static final String DOC_FILE_TYPE = "doc"; - private static final String DOCX_FILE_TYPE = "docx"; - private static final String XLS_FILE_TYPE = "xls"; - private static final String XLSX_FILE_TYPE = "xlsx"; - private static final Set DOCUMENT_TEXT_FILE_TYPES = Set.of( - PDF_FILE_TYPE, - DOC_FILE_TYPE, - DOCX_FILE_TYPE, - XLS_FILE_TYPE, - XLSX_FILE_TYPE - ); - private static final String DERIVED_METADATA_KEY = "derived_from_file_id"; - - private final DatasetFileRepository datasetFileRepository; - private final DatasetRepository datasetRepository; - private final FileService fileService; - private final PdfTextExtractAsyncService pdfTextExtractAsyncService; + private static final String PDF_FILE_TYPE = "pdf"; + private static final String DOC_FILE_TYPE = "doc"; + private static final String DOCX_FILE_TYPE = "docx"; + private static final String XLS_FILE_TYPE = "xls"; + private static final String XLSX_FILE_TYPE = "xlsx"; + private static final Set DOCUMENT_TEXT_FILE_TYPES = Set.of( + PDF_FILE_TYPE, + DOC_FILE_TYPE, + DOCX_FILE_TYPE, + XLS_FILE_TYPE, + XLSX_FILE_TYPE + ); + private static final String DERIVED_METADATA_KEY = "derived_from_file_id"; + + private final DatasetFileRepository datasetFileRepository; + private final DatasetRepository datasetRepository; + private final FileService fileService; + private final PdfTextExtractAsyncService pdfTextExtractAsyncService; @Value("${datamate.data-management.base-path:/dataset}") private String datasetBasePath; @@ -123,61 +123,65 @@ public class DatasetFileApplicationService { * @param status 状态过滤 * @param name 文件名模糊查询 * @param hasAnnotation 是否有标注 - * @param excludeSourceDocuments 是否排除源文档(PDF/DOC/DOCX/XLS/XLSX) + * @param excludeSourceDocuments 是否排除源文档(PDF/DOC/DOCX/XLS/XLSX) * @param pagingQuery 分页参数 * @return 分页文件列表 */ @Transactional(readOnly = true) - public PagedResponse getDatasetFiles(String datasetId, String fileType, String status, String name, - Boolean hasAnnotation, boolean excludeSourceDocuments, PagingQuery pagingQuery) { - IPage page = new Page<>(pagingQuery.getPage(), pagingQuery.getSize()); - IPage files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, hasAnnotation, page); - - if (excludeSourceDocuments) { - // 过滤掉源文档文件(PDF/DOC/DOCX/XLS/XLSX),用于标注场景只展示派生文件 - List filteredRecords = files.getRecords().stream() - .filter(file -> !isSourceDocument(file)) - .collect(Collectors.toList()); - - // 重新构建分页结果 - Page filteredPage = new Page<>(files.getCurrent(), files.getSize(), files.getTotal()); - filteredPage.setRecords(filteredRecords); - return PagedResponse.of(filteredPage); - } - - return PagedResponse.of(files); - } + public PagedResponse getDatasetFiles(String datasetId, String fileType, String status, String name, + Boolean hasAnnotation, boolean excludeSourceDocuments, PagingQuery pagingQuery) { + IPage page = new Page<>(pagingQuery.getPage(), pagingQuery.getSize()); + IPage files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, hasAnnotation, page); + + if (excludeSourceDocuments) { + // 过滤掉源文档文件(PDF/DOC/DOCX/XLS/XLSX),用于标注场景只展示派生文件 + List filteredRecords = files.getRecords().stream() + .filter(file -> !isSourceDocument(file)) + .collect(Collectors.toList()); + + // 重新构建分页结果 + Page filteredPage = new Page<>(files.getCurrent(), files.getSize(), files.getTotal()); + filteredPage.setRecords(filteredRecords); + return PagedResponse.of(filteredPage); + } + + return PagedResponse.of(files); + } /** * 获取数据集文件列表 */ @Transactional(readOnly = true) - public PagedResponse getDatasetFilesWithDirectory(String datasetId, String prefix, boolean excludeDerivedFiles, PagingQuery pagingQuery) { - Dataset dataset = datasetRepository.getById(datasetId); - int page = Math.max(pagingQuery.getPage(), 1); - int size = pagingQuery.getSize() == null || pagingQuery.getSize() < 0 ? 20 : pagingQuery.getSize(); - if (dataset == null) { - return PagedResponse.of(new Page<>(page, size)); - } - String datasetPath = dataset.getPath(); - Path queryPath = Path.of(dataset.getPath() + File.separator + prefix); - Map datasetFilesMap = datasetFileRepository.findAllByDatasetId(datasetId) - .stream().collect(Collectors.toMap(DatasetFile::getFilePath, Function.identity())); - Set derivedFilePaths = excludeDerivedFiles - ? datasetFilesMap.values().stream() - .filter(this::isDerivedFile) - .map(DatasetFile::getFilePath) - .filter(Objects::nonNull) - .collect(Collectors.toSet()) - : Collections.emptySet(); - try (Stream pathStream = Files.list(queryPath)) { - List allFiles = pathStream - .filter(path -> path.toString().startsWith(datasetPath)) - .filter(path -> !excludeDerivedFiles || Files.isDirectory(path) || !derivedFilePaths.contains(path.toString())) - .sorted(Comparator - .comparing((Path path) -> !Files.isDirectory(path)) - .thenComparing(path -> path.getFileName().toString())) - .collect(Collectors.toList()); + public PagedResponse getDatasetFilesWithDirectory(String datasetId, String prefix, boolean excludeDerivedFiles, PagingQuery pagingQuery) { + Dataset dataset = datasetRepository.getById(datasetId); + int page = Math.max(pagingQuery.getPage(), 1); + int size = pagingQuery.getSize() == null || pagingQuery.getSize() < 0 ? 20 : pagingQuery.getSize(); + if (dataset == null) { + return PagedResponse.of(new Page<>(page, size)); + } + String datasetPath = dataset.getPath(); + Path queryPath = Path.of(dataset.getPath() + File.separator + prefix); + Map datasetFilesMap = datasetFileRepository.findAllByDatasetId(datasetId) + .stream().collect(Collectors.toMap(DatasetFile::getFilePath, Function.identity())); + Set derivedFilePaths = excludeDerivedFiles + ? datasetFilesMap.values().stream() + .filter(this::isDerivedFile) + .map(DatasetFile::getFilePath) + .filter(Objects::nonNull) + .collect(Collectors.toSet()) + : Collections.emptySet(); + // 如果目录不存在,直接返回空结果(数据集刚创建时目录可能还未生成) + if (!Files.exists(queryPath)) { + return new PagedResponse<>(page, size, 0, 0, Collections.emptyList()); + } + try (Stream pathStream = Files.list(queryPath)) { + List allFiles = pathStream + .filter(path -> path.toString().startsWith(datasetPath)) + .filter(path -> !excludeDerivedFiles || Files.isDirectory(path) || !derivedFilePaths.contains(path.toString())) + .sorted(Comparator + .comparing((Path path) -> !Files.isDirectory(path)) + .thenComparing(path -> path.getFileName().toString())) + .collect(Collectors.toList()); // 计算分页 int total = allFiles.size(); @@ -195,15 +199,15 @@ public class DatasetFileApplicationService { List datasetFiles = pageData.stream().map(path -> getDatasetFile(path, datasetFilesMap)).toList(); return new PagedResponse<>(page, size, total, totalPages, datasetFiles); - } catch (IOException e) { - log.error("list dataset path error", e); - return PagedResponse.of(new Page<>(page, size)); - } - } + } catch (IOException e) { + log.error("list dataset path error", e); + return PagedResponse.of(new Page<>(page, size)); + } + } - private DatasetFile getDatasetFile(Path path, Map datasetFilesMap) { - DatasetFile datasetFile = new DatasetFile(); - LocalDateTime localDateTime = LocalDateTime.now(); + private DatasetFile getDatasetFile(Path path, Map datasetFilesMap) { + DatasetFile datasetFile = new DatasetFile(); + LocalDateTime localDateTime = LocalDateTime.now(); try { localDateTime = Files.getLastModifiedTime(path).toInstant().atZone(ZoneId.systemDefault()).toLocalDateTime(); } catch (IOException e) { @@ -254,37 +258,37 @@ public class DatasetFileApplicationService { datasetFile = exist; } } - return datasetFile; - } - - private boolean isSourceDocument(DatasetFile datasetFile) { - if (datasetFile == null) { - return false; - } - String fileType = datasetFile.getFileType(); - if (fileType == null || fileType.isBlank()) { - return false; - } - return DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT)); - } - - private boolean isDerivedFile(DatasetFile datasetFile) { - if (datasetFile == null) { - return false; - } - String metadata = datasetFile.getMetadata(); - if (metadata == null || metadata.isBlank()) { - return false; - } - try { - ObjectMapper mapper = new ObjectMapper(); - Map metadataMap = mapper.readValue(metadata, new TypeReference>() {}); - return metadataMap.get(DERIVED_METADATA_KEY) != null; - } catch (Exception e) { - log.debug("Failed to parse dataset file metadata for derived detection: {}", datasetFile.getId(), e); - return false; - } - } + return datasetFile; + } + + private boolean isSourceDocument(DatasetFile datasetFile) { + if (datasetFile == null) { + return false; + } + String fileType = datasetFile.getFileType(); + if (fileType == null || fileType.isBlank()) { + return false; + } + return DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT)); + } + + private boolean isDerivedFile(DatasetFile datasetFile) { + if (datasetFile == null) { + return false; + } + String metadata = datasetFile.getMetadata(); + if (metadata == null || metadata.isBlank()) { + return false; + } + try { + ObjectMapper mapper = new ObjectMapper(); + Map metadataMap = mapper.readValue(metadata, new TypeReference>() {}); + return metadataMap.get(DERIVED_METADATA_KEY) != null; + } catch (Exception e) { + log.debug("Failed to parse dataset file metadata for derived detection: {}", datasetFile.getId(), e); + return false; + } + } /** * 获取文件详情 @@ -740,17 +744,17 @@ public class DatasetFileApplicationService { } } - /** - * 复制文件到数据集目录 - * - * @param datasetId 数据集id - * @param req 复制文件请求 - * @return 复制的文件列表 - */ - @Transactional - public List copyFilesToDatasetDir(String datasetId, CopyFilesRequest req) { - Dataset dataset = datasetRepository.getById(datasetId); - BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND); + /** + * 复制文件到数据集目录 + * + * @param datasetId 数据集id + * @param req 复制文件请求 + * @return 复制的文件列表 + */ + @Transactional + public List copyFilesToDatasetDir(String datasetId, CopyFilesRequest req) { + Dataset dataset = datasetRepository.getById(datasetId); + BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND); List copiedFiles = new ArrayList<>(); List existDatasetFiles = datasetFileRepository.findAllByDatasetId(datasetId); dataset.setFiles(existDatasetFiles); @@ -780,80 +784,80 @@ public class DatasetFileApplicationService { datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100); dataset.active(); datasetRepository.updateById(dataset); - CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset)); - return copiedFiles; - } - - /** - * 复制文件到数据集目录(保留相对路径,适用于数据源导入) - * - * @param datasetId 数据集id - * @param sourceRoot 数据源根目录 - * @param sourcePaths 源文件路径列表 - * @return 复制的文件列表 - */ - @Transactional - public List copyFilesToDatasetDirWithSourceRoot(String datasetId, Path sourceRoot, List sourcePaths) { - Dataset dataset = datasetRepository.getById(datasetId); - BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND); - - Path normalizedRoot = sourceRoot.toAbsolutePath().normalize(); - List copiedFiles = new ArrayList<>(); - List existDatasetFiles = datasetFileRepository.findAllByDatasetId(datasetId); - dataset.setFiles(existDatasetFiles); - Map copyTargets = new LinkedHashMap<>(); - - for (String sourceFilePath : sourcePaths) { - if (sourceFilePath == null || sourceFilePath.isBlank()) { - continue; - } - Path sourcePath = Paths.get(sourceFilePath).toAbsolutePath().normalize(); - if (!sourcePath.startsWith(normalizedRoot)) { - log.warn("Source file path is out of root: {}", sourceFilePath); - continue; - } - if (!Files.exists(sourcePath) || !Files.isRegularFile(sourcePath)) { - log.warn("Source file does not exist or is not a regular file: {}", sourceFilePath); - continue; - } - - Path relativePath = normalizedRoot.relativize(sourcePath); - String fileName = sourcePath.getFileName().toString(); - File sourceFile = sourcePath.toFile(); - LocalDateTime currentTime = LocalDateTime.now(); - Path targetPath = Paths.get(dataset.getPath(), relativePath.toString()); - - DatasetFile datasetFile = DatasetFile.builder() - .id(UUID.randomUUID().toString()) - .datasetId(datasetId) - .fileName(fileName) - .fileType(AnalyzerUtils.getExtension(fileName)) - .fileSize(sourceFile.length()) - .filePath(targetPath.toString()) - .uploadTime(currentTime) - .lastAccessTime(currentTime) - .build(); - setDatasetFileId(datasetFile, dataset); - dataset.addFile(datasetFile); - copiedFiles.add(datasetFile); - copyTargets.put(sourceFilePath, datasetFile); - } - - if (copiedFiles.isEmpty()) { - return copiedFiles; - } - datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100); - dataset.active(); - datasetRepository.updateById(dataset); - CompletableFuture.runAsync(() -> copyFilesToDatasetDirWithRelativePath(copyTargets, dataset, normalizedRoot)); - return copiedFiles; - } - - private void copyFilesToDatasetDir(List sourcePaths, Dataset dataset) { - for (String sourcePath : sourcePaths) { - Path sourceFilePath = Paths.get(sourcePath); - Path targetFilePath = Paths.get(dataset.getPath(), sourceFilePath.getFileName().toString()); - try { + CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset)); + return copiedFiles; + } + + /** + * 复制文件到数据集目录(保留相对路径,适用于数据源导入) + * + * @param datasetId 数据集id + * @param sourceRoot 数据源根目录 + * @param sourcePaths 源文件路径列表 + * @return 复制的文件列表 + */ + @Transactional + public List copyFilesToDatasetDirWithSourceRoot(String datasetId, Path sourceRoot, List sourcePaths) { + Dataset dataset = datasetRepository.getById(datasetId); + BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND); + + Path normalizedRoot = sourceRoot.toAbsolutePath().normalize(); + List copiedFiles = new ArrayList<>(); + List existDatasetFiles = datasetFileRepository.findAllByDatasetId(datasetId); + dataset.setFiles(existDatasetFiles); + Map copyTargets = new LinkedHashMap<>(); + + for (String sourceFilePath : sourcePaths) { + if (sourceFilePath == null || sourceFilePath.isBlank()) { + continue; + } + Path sourcePath = Paths.get(sourceFilePath).toAbsolutePath().normalize(); + if (!sourcePath.startsWith(normalizedRoot)) { + log.warn("Source file path is out of root: {}", sourceFilePath); + continue; + } + if (!Files.exists(sourcePath) || !Files.isRegularFile(sourcePath)) { + log.warn("Source file does not exist or is not a regular file: {}", sourceFilePath); + continue; + } + + Path relativePath = normalizedRoot.relativize(sourcePath); + String fileName = sourcePath.getFileName().toString(); + File sourceFile = sourcePath.toFile(); + LocalDateTime currentTime = LocalDateTime.now(); + Path targetPath = Paths.get(dataset.getPath(), relativePath.toString()); + + DatasetFile datasetFile = DatasetFile.builder() + .id(UUID.randomUUID().toString()) + .datasetId(datasetId) + .fileName(fileName) + .fileType(AnalyzerUtils.getExtension(fileName)) + .fileSize(sourceFile.length()) + .filePath(targetPath.toString()) + .uploadTime(currentTime) + .lastAccessTime(currentTime) + .build(); + setDatasetFileId(datasetFile, dataset); + dataset.addFile(datasetFile); + copiedFiles.add(datasetFile); + copyTargets.put(sourceFilePath, datasetFile); + } + + if (copiedFiles.isEmpty()) { + return copiedFiles; + } + datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100); + dataset.active(); + datasetRepository.updateById(dataset); + CompletableFuture.runAsync(() -> copyFilesToDatasetDirWithRelativePath(copyTargets, dataset, normalizedRoot)); + return copiedFiles; + } + + private void copyFilesToDatasetDir(List sourcePaths, Dataset dataset) { + for (String sourcePath : sourcePaths) { + Path sourceFilePath = Paths.get(sourcePath); + Path targetFilePath = Paths.get(dataset.getPath(), sourceFilePath.getFileName().toString()); + try { Files.createDirectories(Path.of(dataset.getPath())); Files.copy(sourceFilePath, targetFilePath); DatasetFile datasetFile = datasetFileRepository.findByDatasetIdAndFileName( @@ -863,39 +867,39 @@ public class DatasetFileApplicationService { triggerPdfTextExtraction(dataset, datasetFile); } catch (IOException e) { log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e); - } - } - } - - private void copyFilesToDatasetDirWithRelativePath( - Map copyTargets, - Dataset dataset, - Path sourceRoot - ) { - Path datasetRoot = Paths.get(dataset.getPath()).toAbsolutePath().normalize(); - Path normalizedRoot = sourceRoot.toAbsolutePath().normalize(); - for (Map.Entry entry : copyTargets.entrySet()) { - Path sourcePath = Paths.get(entry.getKey()).toAbsolutePath().normalize(); - if (!sourcePath.startsWith(normalizedRoot)) { - log.warn("Source file path is out of root: {}", sourcePath); - continue; - } - Path relativePath = normalizedRoot.relativize(sourcePath); - Path targetFilePath = datasetRoot.resolve(relativePath).normalize(); - if (!targetFilePath.startsWith(datasetRoot)) { - log.warn("Target file path is out of dataset path: {}", targetFilePath); - continue; - } - try { - Files.createDirectories(targetFilePath.getParent()); - Files.copy(sourcePath, targetFilePath); - triggerPdfTextExtraction(dataset, entry.getValue()); - } catch (IOException e) { - log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e); - } - } - } - + } + } + } + + private void copyFilesToDatasetDirWithRelativePath( + Map copyTargets, + Dataset dataset, + Path sourceRoot + ) { + Path datasetRoot = Paths.get(dataset.getPath()).toAbsolutePath().normalize(); + Path normalizedRoot = sourceRoot.toAbsolutePath().normalize(); + for (Map.Entry entry : copyTargets.entrySet()) { + Path sourcePath = Paths.get(entry.getKey()).toAbsolutePath().normalize(); + if (!sourcePath.startsWith(normalizedRoot)) { + log.warn("Source file path is out of root: {}", sourcePath); + continue; + } + Path relativePath = normalizedRoot.relativize(sourcePath); + Path targetFilePath = datasetRoot.resolve(relativePath).normalize(); + if (!targetFilePath.startsWith(datasetRoot)) { + log.warn("Target file path is out of dataset path: {}", targetFilePath); + continue; + } + try { + Files.createDirectories(targetFilePath.getParent()); + Files.copy(sourcePath, targetFilePath); + triggerPdfTextExtraction(dataset, entry.getValue()); + } catch (IOException e) { + log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e); + } + } + } + /** * 添加文件到数据集(仅创建数据库记录,不执行文件系统操作) * @@ -952,31 +956,31 @@ public class DatasetFileApplicationService { return addedFiles; } - private void triggerPdfTextExtraction(Dataset dataset, DatasetFile datasetFile) { - if (dataset == null || datasetFile == null) { - return; - } - if (dataset.getDatasetType() != DatasetType.TEXT) { - return; - } - String fileType = datasetFile.getFileType(); - if (fileType == null || !DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT))) { - return; - } - String datasetId = dataset.getId(); - String fileId = datasetFile.getId(); - if (datasetId == null || fileId == null) { - return; - } - if (TransactionSynchronizationManager.isSynchronizationActive()) { - TransactionSynchronizationManager.registerSynchronization(new TransactionSynchronization() { - @Override - public void afterCommit() { - pdfTextExtractAsyncService.extractPdfText(datasetId, fileId); - } - }); - return; - } - pdfTextExtractAsyncService.extractPdfText(datasetId, fileId); - } -} + private void triggerPdfTextExtraction(Dataset dataset, DatasetFile datasetFile) { + if (dataset == null || datasetFile == null) { + return; + } + if (dataset.getDatasetType() != DatasetType.TEXT) { + return; + } + String fileType = datasetFile.getFileType(); + if (fileType == null || !DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT))) { + return; + } + String datasetId = dataset.getId(); + String fileId = datasetFile.getId(); + if (datasetId == null || fileId == null) { + return; + } + if (TransactionSynchronizationManager.isSynchronizationActive()) { + TransactionSynchronizationManager.registerSynchronization(new TransactionSynchronization() { + @Override + public void afterCommit() { + pdfTextExtractAsyncService.extractPdfText(datasetId, fileId); + } + }); + return; + } + pdfTextExtractAsyncService.extractPdfText(datasetId, fileId); + } +}