You've already forked DataMate
feat(dataset): 添加PDF提取服务中的逻辑路径构建功能
- 移除重复的csv导入语句
- 添加_build_logical_path方法用于构建文件逻辑路径
- 在_create_text_file_record方法中增加logical_path参数
- 更新记录创建调用以传递逻辑路径参数
- 验证逻辑路径不为空并抛出相应异常
- 将逻辑路径存储到数据集文件记录中
This commit is contained in:
@@ -1,5 +1,4 @@
|
||||
import csv
|
||||
import csv
|
||||
import datetime
|
||||
import os
|
||||
from io import StringIO
|
||||
@@ -76,6 +75,7 @@ class PdfTextExtractService:
|
||||
source_path = self._resolve_source_path(file_record)
|
||||
dataset_path = self._resolve_dataset_path(dataset)
|
||||
target_path = self._resolve_target_path(dataset_path, source_path, file_record, file_id, file_type)
|
||||
logical_path = self._build_logical_path(dataset_path, target_path)
|
||||
|
||||
existing_record = await self._find_existing_text_record(dataset_id, target_path)
|
||||
if existing_record:
|
||||
@@ -85,7 +85,7 @@ class PdfTextExtractService:
|
||||
file_size = self._get_file_size(target_path)
|
||||
parser_name = PARSER_BY_FILE_TYPE.get(file_type, "")
|
||||
record = await self._create_text_file_record(
|
||||
dataset, file_record, target_path, file_size, parser_name, derived_file_type
|
||||
dataset, file_record, target_path, logical_path, file_size, parser_name, derived_file_type
|
||||
)
|
||||
return self._build_response(dataset_id, file_id, record)
|
||||
|
||||
@@ -94,7 +94,7 @@ class PdfTextExtractService:
|
||||
self._write_text_file(target_path, text_content)
|
||||
file_size = self._get_file_size(target_path)
|
||||
record = await self._create_text_file_record(
|
||||
dataset, file_record, target_path, file_size, parser_name, derived_file_type
|
||||
dataset, file_record, target_path, logical_path, file_size, parser_name, derived_file_type
|
||||
)
|
||||
return self._build_response(dataset_id, file_id, record)
|
||||
|
||||
@@ -170,6 +170,19 @@ class PdfTextExtractService:
|
||||
target_dir.mkdir(parents=True, exist_ok=True)
|
||||
return target_dir / output_name
|
||||
|
||||
@staticmethod
def _build_logical_path(dataset_path: Path, target_path: Path) -> str:
    """Return ``target_path`` relative to ``dataset_path`` as a "/"-separated string.

    Both paths are passed through ``Path.resolve()`` before the relative
    path is computed, so the comparison is made on normalized absolute paths.

    Args:
        dataset_path: Root directory of the dataset.
        target_path: Location of the file whose logical path is wanted.

    Returns:
        The non-empty relative path, with backslashes replaced by "/" and
        surrounding whitespace stripped.

    Raises:
        HTTPException: status 400 when ``target_path`` is not located under
            ``dataset_path``; status 500 when the relative path is empty,
            i.e. ``target_path`` resolves to the dataset directory itself.
    """
    normalized_dataset_path = dataset_path.resolve()
    normalized_target_path = target_path.resolve()
    try:
        relative_path = normalized_target_path.relative_to(normalized_dataset_path)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail="解析文件路径超出数据集目录") from exc

    logical_path = str(relative_path).replace("\\", "/").strip()
    # relative_to() yields Path(".") -- whose string form is "." rather than
    # "" -- when both paths are identical, so a plain truthiness test on the
    # string would never fire. An empty ``parts`` tuple identifies that case.
    if not relative_path.parts or not logical_path:
        raise HTTPException(status_code=500, detail="解析文件逻辑路径为空")
    return logical_path
|
||||
|
||||
async def _find_existing_text_record(self, dataset_id: str, target_path: Path) -> DatasetFiles | None:
|
||||
result = await self.db.execute(
|
||||
select(DatasetFiles).where(
|
||||
@@ -259,10 +272,12 @@ class PdfTextExtractService:
|
||||
dataset: Dataset,
|
||||
source_file: DatasetFiles,
|
||||
target_path: Path,
|
||||
logical_path: str,
|
||||
file_size: int,
|
||||
parser_name: str,
|
||||
derived_file_type: str,
|
||||
) -> DatasetFiles:
|
||||
assert logical_path
|
||||
assert parser_name
|
||||
assert derived_file_type
|
||||
metadata = {
|
||||
@@ -275,6 +290,7 @@ class PdfTextExtractService:
|
||||
dataset_id=dataset.id, # type: ignore[arg-type]
|
||||
file_name=target_path.name,
|
||||
file_path=str(target_path),
|
||||
logical_path=logical_path,
|
||||
file_type=derived_file_type,
|
||||
file_size=file_size,
|
||||
dataset_filemetadata=metadata,
|
||||
|
||||
Reference in New Issue
Block a user