feature: data management supports nested folders (#150)

* fix: mount the required storage for the backend-python service in k8s deployments

* fix: add an API definition for adding dataset files without copying them

* fix: initialize evaluation results with an empty value so the API does not error before evaluation completes

* feature: data management supports nested folders (listing mirrors the filesystem; batch downloads keep relative paths)

* fix: remove redundant file-rename logic

* refactor: remove unused imports
hefanli
2025-12-10 16:42:45 +08:00
committed by GitHub
parent fea7133dee
commit f87060490c
7 changed files with 290 additions and 58 deletions

View File

@@ -330,6 +330,35 @@ paths:
type: string
format: binary
/data-management/datasets/{datasetId}/files/upload/add:
post:
tags: [ DatasetFile ]
operationId: addFilesToDataset
summary: Add files to a dataset (creates database records only)
description: Adds the given list of source file paths to the dataset; only database records are created, with no physical filesystem operations.
parameters:
- name: datasetId
in: path
required: true
schema:
type: string
description: Dataset ID
requestBody:
required: true
content:
application/json:
schema:
$ref: '#/components/schemas/AddFilesRequest'
responses:
'200':
description: Added successfully; returns the list of created file records
content:
application/json:
schema:
type: array
items:
$ref: '#/components/schemas/DatasetFileResponse'
/data-management/datasets/{datasetId}/files/upload/pre-upload:
post:
tags: [ DatasetFile ]
@@ -805,3 +834,19 @@ components:
path:
type: string
description: Request path
AddFilesRequest:
type: object
description: Request to add source file paths to a dataset
properties:
sourcePaths:
type: array
items:
type: string
description: List of source file paths (relative or absolute); each element is a file or directory path to add
softAdd:
type: boolean
description: If true, only database records are created (default false)
default: false
required:
- sourcePaths
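As a usage reference, here is a minimal sketch of calling the new addFilesToDataset endpoint with Java's built-in HTTP client; the base URL, dataset ID, and source paths are illustrative assumptions, not values taken from this commit:

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class AddFilesExample {
    public static void main(String[] args) throws Exception {
        // hypothetical host and dataset ID, for illustration only
        String url = "http://localhost:8080/data-management/datasets/ds-123/files/upload/add";
        // softAdd=true asks the service to create database records without touching the filesystem
        String body = "{\"sourcePaths\": [\"/data/raw/images\", \"/data/raw/labels.json\"], \"softAdd\": true}";
        HttpRequest request = HttpRequest.newBuilder(URI.create(url))
                .header("Content-Type", "application/json")
                .POST(HttpRequest.BodyPublishers.ofString(body))
                .build();
        HttpResponse<String> response = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.body()); // JSON array of DatasetFileResponse records
    }
}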

View File

@@ -1,6 +1,7 @@
package com.datamate.datamanagement.application;
import com.baomidou.mybatisplus.core.metadata.IPage;
import com.baomidou.mybatisplus.extension.plugins.pagination.Page;
import com.datamate.common.domain.model.ChunkUploadPreRequest;
import com.datamate.common.domain.model.FileUploadResult;
import com.datamate.common.domain.service.FileService;
@@ -29,6 +30,9 @@ import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import jakarta.servlet.http.HttpServletResponse;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
import org.apache.commons.io.IOUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.core.io.Resource;
@@ -37,7 +41,6 @@ import org.springframework.http.HttpHeaders;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
@@ -45,14 +48,15 @@ import java.net.MalformedURLException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.attribute.BasicFileAttributes;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.util.*;
import java.util.concurrent.CompletableFuture;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import java.util.stream.Stream;
/**
* Dataset file application service
@@ -85,11 +89,77 @@ public class DatasetFileApplicationService {
*/
@Transactional(readOnly = true)
public PagedResponse<DatasetFile> getDatasetFiles(String datasetId, String fileType, String status, String name, PagingQuery pagingQuery) {
-        IPage<DatasetFile> page = new com.baomidou.mybatisplus.extension.plugins.pagination.Page<>(pagingQuery.getPage(), pagingQuery.getSize());
+        IPage<DatasetFile> page = new Page<>(pagingQuery.getPage(), pagingQuery.getSize());
IPage<DatasetFile> files = datasetFileRepository.findByCriteria(datasetId, fileType, status, name, page);
return PagedResponse.of(files);
}
/**
* List dataset files and directories under the given prefix, mirroring the filesystem
*/
@Transactional(readOnly = true)
public PagedResponse<DatasetFile> getDatasetFilesWithDirectory(String datasetId, String prefix, PagingQuery pagingQuery) {
Dataset dataset = datasetRepository.getById(datasetId);
int page = Math.max(pagingQuery.getPage(), 1);
int size = pagingQuery.getSize() == null || pagingQuery.getSize() < 0 ? 20 : pagingQuery.getSize();
if (dataset == null) {
return PagedResponse.of(new Page<>(page, size));
}
String datasetPath = dataset.getPath();
Path queryPath = Path.of(datasetPath, Objects.requireNonNullElse(prefix, ""));
Map<String, DatasetFile> datasetFilesMap = datasetFileRepository.findAllByDatasetId(datasetId)
.stream().collect(Collectors.toMap(DatasetFile::getFilePath, Function.identity()));
try (Stream<Path> pathStream = Files.list(queryPath)) {
List<Path> allFiles = pathStream
.filter(path -> path.toString().startsWith(datasetPath))
.sorted(Comparator
.comparing((Path path) -> !Files.isDirectory(path))
.thenComparing(path -> path.getFileName().toString()))
.collect(Collectors.toList());
// compute pagination
int total = allFiles.size();
int totalPages = (int) Math.ceil((double) total / size);
// slice out the current page
int fromIndex = (page - 1) * size;
fromIndex = Math.max(fromIndex, 0);
int toIndex = Math.min(fromIndex + size, total);
List<Path> pageData = new ArrayList<>();
if (fromIndex < total) {
pageData = allFiles.subList(fromIndex, toIndex);
}
List<DatasetFile> datasetFiles = pageData.stream().map(path -> getDatasetFile(path, datasetFilesMap)).toList();
return new PagedResponse<>(page, size, total, totalPages, datasetFiles);
} catch (IOException e) {
log.error("list dataset path error", e);
return PagedResponse.of(new Page<>(page, size));
}
}
private DatasetFile getDatasetFile(Path path, Map<String, DatasetFile> datasetFilesMap) {
DatasetFile datasetFile = new DatasetFile();
LocalDateTime localDateTime = LocalDateTime.now();
try {
localDateTime = Files.getLastModifiedTime(path).toInstant().atZone(ZoneId.systemDefault()).toLocalDateTime();
} catch (IOException e) {
log.error("get last modified time error", e);
}
datasetFile.setFileName(path.getFileName().toString());
datasetFile.setUploadTime(localDateTime);
if (Files.isDirectory(path)) {
datasetFile.setId("directory-" + datasetFile.getFileName());
} else if (Objects.isNull(datasetFilesMap.get(path.toString()))) {
datasetFile.setId("file-" + datasetFile.getFileName());
datasetFile.setFileSize(path.toFile().length());
} else {
datasetFile = datasetFilesMap.get(path.toString());
}
return datasetFile;
}
/**
* 获取文件详情
*/
@@ -151,15 +221,26 @@ public class DatasetFileApplicationService {
*/
@Transactional(readOnly = true)
public void downloadDatasetFileAsZip(String datasetId, HttpServletResponse response) {
+        Dataset dataset = datasetRepository.getById(datasetId);
+        if (Objects.isNull(dataset)) {
+            throw BusinessException.of(DataManagementErrorCode.DATASET_NOT_FOUND);
+        }
        List<DatasetFile> allByDatasetId = datasetFileRepository.findAllByDatasetId(datasetId);
-        fileRename(allByDatasetId);
+        Set<String> filePaths = allByDatasetId.stream().map(DatasetFile::getFilePath).collect(Collectors.toSet());
+        String datasetPath = dataset.getPath();
+        Path downloadPath = Path.of(datasetPath);
        response.setContentType("application/zip");
        String zipName = String.format("dataset_%s.zip",
                LocalDateTime.now().format(DateTimeFormatter.ofPattern("yyyyMMddHHmmss")));
        response.setHeader(HttpHeaders.CONTENT_DISPOSITION, "attachment; filename=" + zipName);
-        try (ZipOutputStream zos = new ZipOutputStream(response.getOutputStream())) {
-            for (DatasetFile file : allByDatasetId) {
-                addToZipFile(file, zos);
+        try (ZipArchiveOutputStream zos = new ZipArchiveOutputStream(response.getOutputStream())) {
+            try (Stream<Path> pathStream = Files.walk(downloadPath)) {
+                List<Path> allPaths = pathStream.filter(path -> path.toString().startsWith(datasetPath))
+                        .filter(path -> filePaths.stream().anyMatch(filePath -> filePath.startsWith(path.toString())))
+                        .toList();
+                for (Path path : allPaths) {
+                    addToZipFile(path, downloadPath, zos);
+                }
            }
} catch (IOException e) {
log.error("Failed to download files in batches.", e);
@@ -167,42 +248,34 @@ public class DatasetFileApplicationService {
}
}
-    private void fileRename(List<DatasetFile> files) {
-        Set<String> uniqueFilenames = new HashSet<>();
-        for (DatasetFile file : files) {
-            String originalFilename = file.getFileName();
-            if (!uniqueFilenames.add(originalFilename)) {
-                String newFilename;
-                int counter = 1;
-                do {
-                    newFilename = generateNewFilename(originalFilename, counter);
-                    counter++;
-                } while (!uniqueFilenames.add(newFilename));
-                file.setFileName(newFilename);
-            }
-        }
-    }
-
-    private String generateNewFilename(String oldFilename, int counter) {
-        int dotIndex = oldFilename.lastIndexOf(".");
-        return oldFilename.substring(0, dotIndex) + "-(" + counter + ")" + oldFilename.substring(dotIndex);
-    }
-
-    private void addToZipFile(DatasetFile file, ZipOutputStream zos) throws IOException {
-        if (file.getFilePath() == null || !Files.exists(Paths.get(file.getFilePath()))) {
-            log.warn("The file hasn't been found on filesystem, id: {}", file.getId());
-            return;
-        }
-        try (InputStream fis = Files.newInputStream(Paths.get(file.getFilePath()));
-             BufferedInputStream bis = new BufferedInputStream(fis)) {
-            ZipEntry zipEntry = new ZipEntry(file.getFileName());
-            zos.putNextEntry(zipEntry);
-            byte[] buffer = new byte[8192];
-            int length;
-            while ((length = bis.read(buffer)) >= 0) {
-                zos.write(buffer, 0, length);
-            }
-            zos.closeEntry();
-        }
-    }
+    private void addToZipFile(Path path, Path basePath, ZipArchiveOutputStream zos) throws IOException {
+        String entryName = basePath.relativize(path)
+                .toString()
+                .replace(File.separator, "/");
+        // directories: write an explicit entry so empty folders survive the archive round trip
+        if (Files.isDirectory(path)) {
+            if (!entryName.isEmpty()) {
+                entryName += "/";
+                ZipArchiveEntry dirEntry = new ZipArchiveEntry(entryName);
+                zos.putArchiveEntry(dirEntry);
+                zos.closeArchiveEntry();
+            }
+        } else {
+            // regular files: copy the bytes and preserve size and last-modified time
+            ZipArchiveEntry fileEntry = new ZipArchiveEntry(path.toFile(), entryName);
+            BasicFileAttributes attrs = Files.readAttributes(path, BasicFileAttributes.class);
+            fileEntry.setSize(attrs.size());
+            fileEntry.setLastModifiedTime(attrs.lastModifiedTime());
+            zos.putArchiveEntry(fileEntry);
+            try (InputStream is = Files.newInputStream(path)) {
+                IOUtils.copy(is, zos);
+            }
+            zos.closeArchiveEntry();
+        }
+    }
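The "batch download keeps relative paths" behavior hinges on basePath.relativize(path) above; here is a minimal standalone sketch of the entry-name computation, with hypothetical paths:

import java.io.File;
import java.nio.file.Path;

public class RelativizeSketch {
    public static void main(String[] args) {
        Path datasetRoot = Path.of("/datasets/ds-123");                  // hypothetical dataset path
        Path nested = Path.of("/datasets/ds-123/images/train/cat.png"); // hypothetical file
        // relativize() drops the dataset root; separators are normalized to '/'
        // so the resulting zip entry name is portable across platforms
        String entryName = datasetRoot.relativize(nested).toString().replace(File.separator, "/");
        System.out.println(entryName); // images/train/cat.png on POSIX systems
    }
}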

View File

@@ -46,12 +46,10 @@ public class DatasetFileController {
@PathVariable("datasetId") String datasetId,
@RequestParam(value = "page", required = false, defaultValue = "0") Integer page,
@RequestParam(value = "size", required = false, defaultValue = "20") Integer size,
@RequestParam(value = "fileType", required = false) String fileType,
@RequestParam(value = "status", required = false) String status,
@RequestParam(value = "name", required = false) String name) {
@RequestParam(value = "prefix", required = false) String prefix) {
PagingQuery pagingQuery = new PagingQuery(page, size);
PagedResponse<DatasetFile> filesPage = datasetFileApplicationService.getDatasetFiles(
datasetId, fileType, status, name, pagingQuery);
PagedResponse<DatasetFile> filesPage = datasetFileApplicationService.getDatasetFilesWithDirectory(
datasetId, prefix, pagingQuery);
return Response.ok(filesPage);
}
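For the reworked listing, a hedged sketch of a client call; the host, dataset ID, endpoint path, and prefix value are assumptions for illustration:

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class ListFilesExample {
    public static void main(String[] args) throws Exception {
        // hypothetical host and dataset ID; prefix selects a nested folder to list
        String url = "http://localhost:8080/data-management/datasets/ds-123/files"
                + "?page=1&size=20&prefix=images/train/";
        HttpResponse<String> response = HttpClient.newHttpClient().send(
                HttpRequest.newBuilder(URI.create(url)).GET().build(),
                HttpResponse.BodyHandlers.ofString());
        // directories sort before files (ids prefixed "directory-"), then by name
        System.out.println(response.body());
    }
}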

View File

@@ -12,6 +12,7 @@ import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.FileUtils;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
@@ -145,7 +146,7 @@ public class ArchiveAnalyzer {
Path path = Paths.get(archivePath.getParent().toString(), archiveEntry.getName());
File file = path.toFile();
long fileSize = 0L;
-        String extension = AnalyzerUtils.getExtension(archiveEntry.getName());
+        FileUtils.createParentDirectories(file);
        long supportFileSize = 1024 * 1024 * 1024; // upload size capped at 1 GB for now
try (OutputStream outputStream = new BufferedOutputStream(Files.newOutputStream(file.toPath()))) {
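FileUtils.createParentDirectories is what lets entries in nested archive folders extract without a NoSuchFileException; a minimal sketch assuming commons-io 2.9+ on the classpath and a hypothetical target path:

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import org.apache.commons.io.FileUtils;

public class ParentDirsSketch {
    public static void main(String[] args) throws Exception {
        // hypothetical target mirroring an archive entry such as "a/b/c.txt"
        File target = new File("/tmp/extract/a/b/c.txt");
        // creates /tmp/extract/a/b if missing; without it, newOutputStream would fail
        FileUtils.createParentDirectories(target);
        try (OutputStream out = new BufferedOutputStream(Files.newOutputStream(target.toPath()))) {
            out.write("nested folder content".getBytes(StandardCharsets.UTF_8));
        }
    }
}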

View File

@@ -1,6 +1,6 @@
import { Button, Descriptions, DescriptionsProps, Modal, Table } from "antd";
import { formatBytes, formatDateTime } from "@/utils/unit";
-import { Download, Trash2 } from "lucide-react";
+import { Download, Trash2, Folder, File } from "lucide-react";
import { datasetTypeMap } from "../../dataset.const";
export default function Overview({ dataset, filesOperation, fetchDataset }) {
@@ -102,13 +102,58 @@ export default function Overview({ dataset, filesOperation, fetchDataset }) {
dataIndex: "fileName",
key: "fileName",
fixed: "left",
render: (text: string, record: any) => {
const isDirectory = record.id.startsWith('directory-');
const iconSize = 16;
const content = (
<div className="flex items-center">
{isDirectory ? (
<Folder className="mr-2 text-blue-500" size={iconSize} />
) : (
<File className="mr-2 text-black" size={iconSize} />
)}
<span className="truncate text-black">{text}</span>
</div>
);
if (isDirectory) {
return (
<Button
type="link"
onClick={(e) => {
const currentPath = filesOperation.pagination.prefix || '';
const newPath = `${currentPath}${record.fileName}/`;
filesOperation.fetchFiles(newPath);
}}
>
{content}
</Button>
);
}
return (
<Button
type="link"
onClick={(e) => {}}
>
{content}
</Button>
);
},
},
{
title: "大小",
dataIndex: "fileSize",
key: "fileSize",
width: 150,
-      render: (text) => formatBytes(text),
+      render: (text: number, record: any) => {
+        const isDirectory = record.id.startsWith('directory-');
+        if (isDirectory) {
+          return "-";
+        }
+        return formatBytes(text);
+      },
},
{
title: "上传时间",
@@ -122,7 +167,12 @@ export default function Overview({ dataset, filesOperation, fetchDataset }) {
key: "action",
width: 180,
fixed: "right",
-      render: (_, record) => (
+      render: (_, record) => {
+        const isDirectory = record.id.startsWith('directory-');
+        if (isDirectory) {
+          return <div className="flex"/>;
+        }
+        return (
<div className="flex">
<Button
size="small"
@@ -143,9 +193,10 @@ export default function Overview({ dataset, filesOperation, fetchDataset }) {
</Button>
</div>
-      ),
+      )},
},
];
return (
<>
<div className=" flex flex-col gap-4">
@@ -182,6 +233,43 @@ export default function Overview({ dataset, filesOperation, fetchDataset }) {
</div>
)}
<div className="overflow-x-auto">
<div className="mb-2">
{(filesOperation.pagination.prefix || '') !== '' && (
<Button
type="link"
onClick={() => {
// derive the parent directory
const currentPath = filesOperation.pagination.prefix || '';
const pathParts = currentPath.split('/').filter(Boolean);
pathParts.pop(); // drop the last path segment
const parentPath = pathParts.length > 0 ? `${pathParts.join('/')}/` : '';
filesOperation.fetchFiles(parentPath);
}}
className="p-0"
>
<span className="flex items-center text-blue-500">
<svg
className="w-4 h-4 mr-1"
fill="none"
stroke="currentColor"
viewBox="0 0 24 24"
xmlns="http://www.w3.org/2000/svg"
>
<path
strokeLinecap="round"
strokeLinejoin="round"
strokeWidth={2}
d="M10 19l-7-7m0 0l7-7m-7 7h18"
/>
</svg>
</span>
</Button>
)}
{filesOperation.pagination.prefix && (
<span className="ml-2 text-gray-600">: {filesOperation.pagination.prefix}</span>
)}
</div>
<Table
size="middle"
rowKey="id"
@@ -192,6 +280,14 @@ export default function Overview({ dataset, filesOperation, fetchDataset }) {
pagination={{
...pagination,
showTotal: (total) => `${total}`,
onChange: (page, pageSize) => {
filesOperation.setPagination(prev => ({
...prev,
current: page,
pageSize: pageSize
}));
filesOperation.fetchFiles(pagination.prefix, page, pageSize);
}
}}
/>
</div>

View File

@@ -23,19 +23,35 @@ export function useFilesOperation(dataset: Dataset) {
current: number;
pageSize: number;
total: number;
+    prefix?: string;
-  }>({ current: 1, pageSize: 10, total: 0 });
+  }>({ current: 1, pageSize: 10, total: 0, prefix: '' });
// 文件预览相关状态
const [previewVisible, setPreviewVisible] = useState(false);
const [previewContent, setPreviewContent] = useState("");
const [previewFileName, setPreviewFileName] = useState("");
-  const fetchFiles = async () => {
-    const { data } = await queryDatasetFilesUsingGet(id!, {
-      page: pagination.current - 1,
-      size: pagination.pageSize,
-    });
+  const fetchFiles = async (prefix: string = '', current?: number, pageSize?: number) => {
+    const params: any = {
+      page: current ? current : pagination.current,
+      size: pageSize ? pageSize : pagination.pageSize,
+    };
+    if (prefix !== undefined) {
+      params.prefix = prefix;
+    } else if (pagination.prefix) {
+      params.prefix = pagination.prefix;
+    }
+    const { data } = await queryDatasetFilesUsingGet(id!, params);
setFileList(data.content || []);
// Update pagination with current prefix
setPagination(prev => ({
...prev,
prefix: prefix !== undefined ? prefix : prev.prefix,
total: data.totalElements || 0,
}));
};
const handleBatchDeleteFiles = () => {
@@ -113,6 +129,7 @@ export function useFilesOperation(dataset: Dataset) {
fileList,
selectedFiles,
setSelectedFiles,
pagination,
setPagination,
previewVisible,
setPreviewVisible,

View File

@@ -113,6 +113,7 @@ class DatasetEvaluationExecutor(EvaluationExecutor):
file_id=dataset_file.id,
item_id=item.get("id") if item.get("id") else str(uuid.uuid4()),
eval_content=json.dumps(item, ensure_ascii=False),
eval_result="{}",
status=TaskStatus.PENDING.value,
created_by=self.task.created_by,
updated_by=self.task.updated_by,
@@ -152,6 +153,7 @@ class SynthesisEvaluationExecutor(EvaluationExecutor):
file_id=synthesis_file.id,
item_id=synthesis_data.id,
eval_content=json.dumps(synthesis_data.data),
eval_result="{}",
status=TaskStatus.PENDING.value,
created_by=self.task.created_by,
updated_by=self.task.updated_by,