feat(dataset): 添加PDF提取服务中的逻辑路径构建功能

- 移除重复的csv导入语句
- 添加_build_logical_path方法用于构建文件逻辑路径
- 在_create_text_file_record方法中增加logical_path参数
- 更新记录创建调用以传递逻辑路径参数
- 校验目标路径位于数据集目录内且逻辑路径不为空,否则抛出相应异常
- 将逻辑路径存储到数据集文件记录中
2026-02-06 18:30:44 +08:00
parent 0f1dd9ec8d
commit 05752678cc
1 changed files with 19 additions and 3 deletions
--- a/runtime/datamate-python/app/module/dataset/service/pdf_extract.py
+++ b/runtime/datamate-python/app/module/dataset/service/pdf_extract.py
@@ -1,5 +1,4 @@
 import csv
-import csv
 import datetime
 import os
 from io import StringIO
@@ -76,6 +75,7 @@ class PdfTextExtractService:
        source_path = self._resolve_source_path(file_record)
        dataset_path = self._resolve_dataset_path(dataset)
        target_path = self._resolve_target_path(dataset_path, source_path, file_record, file_id, file_type)
+        logical_path = self._build_logical_path(dataset_path, target_path)

        existing_record = await self._find_existing_text_record(dataset_id, target_path)
        if existing_record:
@@ -85,7 +85,7 @@ class PdfTextExtractService:
            file_size = self._get_file_size(target_path)
            parser_name = PARSER_BY_FILE_TYPE.get(file_type, "")
            record = await self._create_text_file_record(
-                dataset, file_record, target_path, file_size, parser_name, derived_file_type
+                dataset, file_record, target_path, logical_path, file_size, parser_name, derived_file_type
            )
            return self._build_response(dataset_id, file_id, record)

@@ -94,7 +94,7 @@ class PdfTextExtractService:
        self._write_text_file(target_path, text_content)
        file_size = self._get_file_size(target_path)
        record = await self._create_text_file_record(
-            dataset, file_record, target_path, file_size, parser_name, derived_file_type
+            dataset, file_record, target_path, logical_path, file_size, parser_name, derived_file_type
        )
        return self._build_response(dataset_id, file_id, record)

@@ -170,6 +170,19 @@ class PdfTextExtractService:
        target_dir.mkdir(parents=True, exist_ok=True)
        return target_dir / output_name

+    @staticmethod  # uses only its two arguments; no instance state is read
+    def _build_logical_path(dataset_path: Path, target_path: Path) -> str:  # returns target_path expressed relative to dataset_path, "/"-separated
+        normalized_dataset_path = dataset_path.resolve()  # resolve() both paths (absolute, symlinks followed) before comparing them
+        normalized_target_path = target_path.resolve()
+        try:
+            relative_path = normalized_target_path.relative_to(normalized_dataset_path)
+        except ValueError as exc:  # relative_to() raises ValueError when the target is not located under the dataset path
+            raise HTTPException(status_code=400, detail="解析文件路径超出数据集目录") from exc  # detail reads: "parsed file path is outside the dataset directory"
+        logical_path = str(relative_path).replace("\\", "/").strip()  # backslashes become forward slashes; surrounding whitespace is trimmed
+        if not logical_path:  # NOTE(review): if target resolves to the dataset dir itself, str(relative_path) is "." (not ""), so this guard does not fire — confirm intent
+            raise HTTPException(status_code=500, detail="解析文件逻辑路径为空")  # detail reads: "parsed file logical path is empty"
+        return logical_path
+
    async def _find_existing_text_record(self, dataset_id: str, target_path: Path) -> DatasetFiles | None:
        result = await self.db.execute(
            select(DatasetFiles).where(
@@ -259,10 +272,12 @@ class PdfTextExtractService:
        dataset: Dataset,
        source_file: DatasetFiles,
        target_path: Path,
+        logical_path: str,
        file_size: int,
        parser_name: str,
        derived_file_type: str,
    ) -> DatasetFiles:
+        assert logical_path
        assert parser_name
        assert derived_file_type
        metadata = {
@@ -275,6 +290,7 @@ class PdfTextExtractService:
            dataset_id=dataset.id,  # type: ignore[arg-type]
            file_name=target_path.name,
            file_path=str(target_path),
+            logical_path=logical_path,
            file_type=derived_file_type,
            file_size=file_size,
            dataset_filemetadata=metadata,