feat(dataset): 添加PDF提取服务中的逻辑路径构建功能

- 移除重复的csv导入语句
- 添加_build_logical_path方法用于构建文件逻辑路径
- 在_create_text_file_record方法中增加logical_path参数
- 更新记录创建调用以传递逻辑路径参数
- 验证逻辑路径不为空并抛出相应异常
- 将逻辑路径存储到数据集文件记录中
This commit is contained in:
2026-02-06 18:30:44 +08:00
parent 0f1dd9ec8d
commit 05752678cc

View File

@@ -1,5 +1,4 @@
import csv
import csv
import datetime
import os
from io import StringIO
@@ -76,6 +75,7 @@ class PdfTextExtractService:
source_path = self._resolve_source_path(file_record)
dataset_path = self._resolve_dataset_path(dataset)
target_path = self._resolve_target_path(dataset_path, source_path, file_record, file_id, file_type)
logical_path = self._build_logical_path(dataset_path, target_path)
existing_record = await self._find_existing_text_record(dataset_id, target_path)
if existing_record:
@@ -85,7 +85,7 @@ class PdfTextExtractService:
file_size = self._get_file_size(target_path)
parser_name = PARSER_BY_FILE_TYPE.get(file_type, "")
record = await self._create_text_file_record(
dataset, file_record, target_path, file_size, parser_name, derived_file_type
dataset, file_record, target_path, logical_path, file_size, parser_name, derived_file_type
)
return self._build_response(dataset_id, file_id, record)
@@ -94,7 +94,7 @@ class PdfTextExtractService:
self._write_text_file(target_path, text_content)
file_size = self._get_file_size(target_path)
record = await self._create_text_file_record(
dataset, file_record, target_path, file_size, parser_name, derived_file_type
dataset, file_record, target_path, logical_path, file_size, parser_name, derived_file_type
)
return self._build_response(dataset_id, file_id, record)
@@ -170,6 +170,19 @@ class PdfTextExtractService:
target_dir.mkdir(parents=True, exist_ok=True)
return target_dir / output_name
@staticmethod
def _build_logical_path(dataset_path: Path, target_path: Path) -> str:
    """Return ``target_path`` relative to ``dataset_path`` as a "/"-separated string.

    Both paths are passed through ``Path.resolve()`` before the containment
    check, so the comparison is made on absolute, normalized paths.

    Args:
        dataset_path: Root directory of the dataset.
        target_path: Path of the extracted text file.

    Returns:
        The path of ``target_path`` relative to ``dataset_path``, using
        forward slashes and with surrounding whitespace stripped.

    Raises:
        HTTPException: status 400 when ``target_path`` is not located under
            ``dataset_path``; status 500 when the relative path is empty
            (the target resolves to the dataset directory itself, or the
            stripped result is blank).
    """
    dataset_root = dataset_path.resolve()
    resolved_target = target_path.resolve()
    try:
        relative_path = resolved_target.relative_to(dataset_root)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail="解析文件路径超出数据集目录") from exc
    # as_posix() only converts real path separators; a blanket
    # str.replace("\\", "/") would also rewrite backslashes that are part of
    # a file name on POSIX systems.
    logical_path = relative_path.as_posix().strip()
    # An empty relative path stringifies as ".", never "", so a plain
    # truthiness test on the string cannot detect it; inspect the parts too.
    if not relative_path.parts or not logical_path:
        raise HTTPException(status_code=500, detail="解析文件逻辑路径为空")
    return logical_path
async def _find_existing_text_record(self, dataset_id: str, target_path: Path) -> DatasetFiles | None:
result = await self.db.execute(
select(DatasetFiles).where(
@@ -259,10 +272,12 @@ class PdfTextExtractService:
dataset: Dataset,
source_file: DatasetFiles,
target_path: Path,
logical_path: str,
file_size: int,
parser_name: str,
derived_file_type: str,
) -> DatasetFiles:
assert logical_path
assert parser_name
assert derived_file_type
metadata = {
@@ -275,6 +290,7 @@ class PdfTextExtractService:
dataset_id=dataset.id, # type: ignore[arg-type]
file_name=target_path.name,
file_path=str(target_path),
logical_path=logical_path,
file_type=derived_file_type,
file_size=file_size,
dataset_filemetadata=metadata,