refactor(data-import): 优化数据源文件扫描和复制逻辑

- 修改数据源文件扫描方法，直接在主流程中获取任务详情和路径
- 移除独立的getFilePaths方法，将路径扫描逻辑整合到scanFilePaths方法中
- 新增copyFilesToDatasetDirWithSourceRoot方法支持保留相对路径的文件复制
- 更新数据集文件应用服务中的文件复制逻辑，支持相对路径处理
- 修改Python后端项目接口中的文件查询逻辑，移除注释掉的编辑器服务引用
- 调整文件过滤逻辑，基于元数据中的派生源ID进行文件筛选
- 移除编辑器服务中已废弃的源文档过滤条件
2026-01-30 18:58:34 +08:00
parent 8b2a19f09a
commit 9a205919d7
4 changed files with 156 additions and 58 deletions
--- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetApplicationService.java
+++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetApplicationService.java
@@ -414,33 +414,32 @@ public class DatasetApplicationService {
    public void processDataSourceAsync(String datasetId, String dataSourceId) {
        try {
            log.info("Initiating data source file scanning, dataset ID: {}, collection task ID: {}", datasetId, dataSourceId);
-            List<String> filePaths = getFilePaths(dataSourceId);
+            CollectionTaskDetailResponse taskDetail = collectionTaskClient.getTaskDetail(dataSourceId).getData(); // look up the collection task for this data source
+            if (taskDetail == null) {
+                log.warn("Fail to get collection task detail, task ID: {}", dataSourceId);
+                return;
+            }
+            Path targetPath = Paths.get(taskDetail.getTargetPath()); // the task's target path is the directory that gets scanned
+            if (!Files.exists(targetPath) || !Files.isDirectory(targetPath)) {
+                log.warn("Target path not exists or is not a directory: {}", taskDetail.getTargetPath());
+                return;
+            }
+            List<String> filePaths = scanFilePaths(targetPath); // regular files found by walking targetPath
            if (CollectionUtils.isEmpty(filePaths)) {
                return;
            }
-            datasetFileApplicationService.copyFilesToDatasetDir(datasetId, new CopyFilesRequest(filePaths));
+            datasetFileApplicationService.copyFilesToDatasetDirWithSourceRoot(datasetId, targetPath, filePaths); // targetPath doubles as the source root for relative paths
            log.info("Success file scan, total files: {}", filePaths.size());
        } catch (Exception e) {
            log.error("处理数据源文件扫描失败，数据集ID: {}, 数据源ID: {}", datasetId, dataSourceId, e);
        }
    }

-    private List<String> getFilePaths(String dataSourceId) {
-        CollectionTaskDetailResponse taskDetail = collectionTaskClient.getTaskDetail(dataSourceId).getData();
-        if (taskDetail == null) {
-            log.warn("Fail to get collection task detail, task ID: {}", dataSourceId);
-            return Collections.emptyList();
-        }
-        Path targetPath = Paths.get(taskDetail.getTargetPath());
-        if (!Files.exists(targetPath) || !Files.isDirectory(targetPath)) {
-            log.warn("Target path not exists or is not a directory: {}", taskDetail.getTargetPath());
-            return Collections.emptyList();
-        }
-
-        try (Stream<Path> paths = Files.walk(targetPath, 1)) {
+    private List<String> scanFilePaths(Path targetPath) {
+        try (Stream<Path> paths = Files.walk(targetPath)) {
            return paths
-                .filter(Files::isRegularFile)  // 只保留文件，排除目录
-                .map(Path::toString)           // 转换为字符串路径
+                .filter(Files::isRegularFile)
+                .map(Path::toString)
                .collect(Collectors.toList());
        } catch (IOException e) {
            log.error("Fail to scan directory: {}", targetPath, e);
--- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java
+++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java
@@ -695,17 +695,17 @@ public class DatasetFileApplicationService {
        }
    }

-    /**
-     * 复制文件到数据集目录
-     *
-     * @param datasetId 数据集id
-     * @param req       复制文件请求
-     * @return 复制的文件列表
-     */
-    @Transactional
-    public List<DatasetFile> copyFilesToDatasetDir(String datasetId, CopyFilesRequest req) {
-        Dataset dataset = datasetRepository.getById(datasetId);
-        BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
+    /**
+     * 复制文件到数据集目录
+     *
+     * @param datasetId 数据集id
+     * @param req       复制文件请求
+     * @return 复制的文件列表
+     */
+    @Transactional
+    public List<DatasetFile> copyFilesToDatasetDir(String datasetId, CopyFilesRequest req) {
+        Dataset dataset = datasetRepository.getById(datasetId);
+        BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
        List<DatasetFile> copiedFiles = new ArrayList<>();
        List<DatasetFile> existDatasetFiles = datasetFileRepository.findAllByDatasetId(datasetId);
        dataset.setFiles(existDatasetFiles);
@@ -735,15 +735,80 @@ public class DatasetFileApplicationService {
        datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
        dataset.active();
        datasetRepository.updateById(dataset);
-        CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset));
-        return copiedFiles;
-    }
-
-    private void copyFilesToDatasetDir(List<String> sourcePaths, Dataset dataset) {
-        for (String sourcePath : sourcePaths) {
-            Path sourceFilePath = Paths.get(sourcePath);
-            Path targetFilePath = Paths.get(dataset.getPath(), sourceFilePath.getFileName().toString());
-            try {
+        CompletableFuture.runAsync(() -> copyFilesToDatasetDir(req.sourcePaths(), dataset));
+        return copiedFiles;
+    }
+
+    /**
+     * Registers files located under a source root as dataset files, keeping each path relative to that root (for data source import).
+     * The records are saved first; the file copy itself is then started with CompletableFuture.runAsync.
+     * @param datasetId dataset id
+     * @param sourceRoot root directory of the data source
+     * @param sourcePaths list of source file paths
+     * @return the dataset file records created by this call (empty when no path passes the checks)
+     */
+    @Transactional
+    public List<DatasetFile> copyFilesToDatasetDirWithSourceRoot(String datasetId, Path sourceRoot, List<String> sourcePaths) {
+        Dataset dataset = datasetRepository.getById(datasetId);
+        BusinessAssert.notNull(dataset, SystemErrorCode.RESOURCE_NOT_FOUND);
+
+        Path normalizedRoot = sourceRoot.toAbsolutePath().normalize();
+        List<DatasetFile> copiedFiles = new ArrayList<>();
+        List<DatasetFile> existDatasetFiles = datasetFileRepository.findAllByDatasetId(datasetId);
+        dataset.setFiles(existDatasetFiles);
+        Map<String, DatasetFile> copyTargets = new LinkedHashMap<>();
+
+        for (String sourceFilePath : sourcePaths) {
+            if (sourceFilePath == null || sourceFilePath.isBlank()) { // skip null or blank entries
+                continue;
+            }
+            Path sourcePath = Paths.get(sourceFilePath).toAbsolutePath().normalize();
+            if (!sourcePath.startsWith(normalizedRoot)) { // skip paths that are not under the source root
+                log.warn("Source file path is out of root: {}", sourceFilePath);
+                continue;
+            }
+            if (!Files.exists(sourcePath) || !Files.isRegularFile(sourcePath)) {
+                log.warn("Source file does not exist or is not a regular file: {}", sourceFilePath);
+                continue;
+            }
+
+            Path relativePath = normalizedRoot.relativize(sourcePath);
+            String fileName = sourcePath.getFileName().toString();
+            File sourceFile = sourcePath.toFile();
+            LocalDateTime currentTime = LocalDateTime.now();
+            Path targetPath = Paths.get(dataset.getPath(), relativePath.toString()); // same relative path, rooted at the dataset path
+
+            DatasetFile datasetFile = DatasetFile.builder()
+                .id(UUID.randomUUID().toString())
+                .datasetId(datasetId)
+                .fileName(fileName)
+                .fileType(AnalyzerUtils.getExtension(fileName))
+                .fileSize(sourceFile.length())
+                .filePath(targetPath.toString())
+                .uploadTime(currentTime)
+                .lastAccessTime(currentTime)
+                .build();
+            setDatasetFileId(datasetFile, dataset);
+            dataset.addFile(datasetFile);
+            copiedFiles.add(datasetFile);
+            copyTargets.put(sourceFilePath, datasetFile); // keyed by the original path string; consumed by the async copy below
+        }
+
+        if (copiedFiles.isEmpty()) { // nothing passed the checks: no save, no async copy
+            return copiedFiles;
+        }
+        datasetFileRepository.saveOrUpdateBatch(copiedFiles, 100);
+        dataset.active();
+        datasetRepository.updateById(dataset);
+        CompletableFuture.runAsync(() -> copyFilesToDatasetDirWithRelativePath(copyTargets, dataset, normalizedRoot));
+        return copiedFiles;
+    }
+
+    private void copyFilesToDatasetDir(List<String> sourcePaths, Dataset dataset) {
+        for (String sourcePath : sourcePaths) {
+            Path sourceFilePath = Paths.get(sourcePath);
+            Path targetFilePath = Paths.get(dataset.getPath(), sourceFilePath.getFileName().toString());
+            try {
                Files.createDirectories(Path.of(dataset.getPath()));
                Files.copy(sourceFilePath, targetFilePath);
                DatasetFile datasetFile = datasetFileRepository.findByDatasetIdAndFileName(
@@ -753,10 +818,39 @@ public class DatasetFileApplicationService {
                triggerPdfTextExtraction(dataset, datasetFile);
            } catch (IOException e) {
                log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
-            }
-        }
-    }
-
+            }
+        }
+    }
+
+    private void copyFilesToDatasetDirWithRelativePath( // copies each mapped source file to the same relative path under the dataset directory
+        Map<String, DatasetFile> copyTargets, // source path string -> dataset file record for that source
+        Dataset dataset,
+        Path sourceRoot
+    ) {
+        Path datasetRoot = Paths.get(dataset.getPath()).toAbsolutePath().normalize();
+        Path normalizedRoot = sourceRoot.toAbsolutePath().normalize();
+        for (Map.Entry<String, DatasetFile> entry : copyTargets.entrySet()) {
+            Path sourcePath = Paths.get(entry.getKey()).toAbsolutePath().normalize();
+            if (!sourcePath.startsWith(normalizedRoot)) { // skip sources that are not under the source root
+                log.warn("Source file path is out of root: {}", sourcePath);
+                continue;
+            }
+            Path relativePath = normalizedRoot.relativize(sourcePath);
+            Path targetFilePath = datasetRoot.resolve(relativePath).normalize();
+            if (!targetFilePath.startsWith(datasetRoot)) { // skip targets that would resolve outside the dataset directory
+                log.warn("Target file path is out of dataset path: {}", targetFilePath);
+                continue;
+            }
+            try {
+                Files.createDirectories(targetFilePath.getParent()); // make sure the target's parent directories exist
+                Files.copy(sourcePath, targetFilePath);
+                triggerPdfTextExtraction(dataset, entry.getValue()); // reached only when the copy did not throw
+            } catch (IOException e) { // a failed copy is logged and the loop continues with the next entry
+                log.error("Failed to copy file from {} to {}", sourcePath, targetFilePath, e);
+            }
+        }
+    }
+
    /**
     * 添加文件到数据集（仅创建数据库记录，不执行文件系统操作）
     *