diff --git a/runtime/datamate-python/app/module/dataset/service/pdf_extract.py b/runtime/datamate-python/app/module/dataset/service/pdf_extract.py index a4e5fb1..a0f2e2a 100644 --- a/runtime/datamate-python/app/module/dataset/service/pdf_extract.py +++ b/runtime/datamate-python/app/module/dataset/service/pdf_extract.py @@ -1,5 +1,4 @@ import csv -import csv import datetime import os from io import StringIO @@ -76,6 +75,7 @@ class PdfTextExtractService: source_path = self._resolve_source_path(file_record) dataset_path = self._resolve_dataset_path(dataset) target_path = self._resolve_target_path(dataset_path, source_path, file_record, file_id, file_type) + logical_path = self._build_logical_path(dataset_path, target_path) existing_record = await self._find_existing_text_record(dataset_id, target_path) if existing_record: @@ -85,7 +85,7 @@ class PdfTextExtractService: file_size = self._get_file_size(target_path) parser_name = PARSER_BY_FILE_TYPE.get(file_type, "") record = await self._create_text_file_record( - dataset, file_record, target_path, file_size, parser_name, derived_file_type + dataset, file_record, target_path, logical_path, file_size, parser_name, derived_file_type ) return self._build_response(dataset_id, file_id, record) @@ -94,7 +94,7 @@ class PdfTextExtractService: self._write_text_file(target_path, text_content) file_size = self._get_file_size(target_path) record = await self._create_text_file_record( - dataset, file_record, target_path, file_size, parser_name, derived_file_type + dataset, file_record, target_path, logical_path, file_size, parser_name, derived_file_type ) return self._build_response(dataset_id, file_id, record) @@ -170,6 +170,19 @@ class PdfTextExtractService: target_dir.mkdir(parents=True, exist_ok=True) return target_dir / output_name + @staticmethod + def _build_logical_path(dataset_path: Path, target_path: Path) -> str: + normalized_dataset_path = dataset_path.resolve() + normalized_target_path = target_path.resolve() + try: + relative_path = normalized_target_path.relative_to(normalized_dataset_path) + except ValueError as exc: + raise HTTPException(status_code=400, detail="解析文件路径超出数据集目录") from exc + logical_path = str(relative_path).replace("\\", "/").strip() + if not logical_path: + raise HTTPException(status_code=500, detail="解析文件逻辑路径为空") + return logical_path + async def _find_existing_text_record(self, dataset_id: str, target_path: Path) -> DatasetFiles | None: result = await self.db.execute( select(DatasetFiles).where( @@ -259,10 +272,12 @@ class PdfTextExtractService: dataset: Dataset, source_file: DatasetFiles, target_path: Path, + logical_path: str, file_size: int, parser_name: str, derived_file_type: str, ) -> DatasetFiles: + assert logical_path assert parser_name assert derived_file_type metadata = { @@ -275,6 +290,7 @@ class PdfTextExtractService: dataset_id=dataset.id, # type: ignore[arg-type] file_name=target_path.name, file_path=str(target_path), + logical_path=logical_path, file_type=derived_file_type, file_size=file_size, dataset_filemetadata=metadata,