You've already forked DataMate
feat(dataset): 添加PDF提取服务中的逻辑路径构建功能
- 移除重复的csv导入语句
- 添加_build_logical_path方法用于构建文件逻辑路径
- 在_create_text_file_record方法中增加logical_path参数
- 更新记录创建调用以传递逻辑路径参数
- 验证逻辑路径不为空并抛出相应异常
- 将逻辑路径存储到数据集文件记录中
This commit is contained in:
@@ -1,5 +1,4 @@
|
|||||||
import csv
|
import csv
|
||||||
import csv
|
|
||||||
import datetime
|
import datetime
|
||||||
import os
|
import os
|
||||||
from io import StringIO
|
from io import StringIO
|
||||||
@@ -76,6 +75,7 @@ class PdfTextExtractService:
|
|||||||
source_path = self._resolve_source_path(file_record)
|
source_path = self._resolve_source_path(file_record)
|
||||||
dataset_path = self._resolve_dataset_path(dataset)
|
dataset_path = self._resolve_dataset_path(dataset)
|
||||||
target_path = self._resolve_target_path(dataset_path, source_path, file_record, file_id, file_type)
|
target_path = self._resolve_target_path(dataset_path, source_path, file_record, file_id, file_type)
|
||||||
|
logical_path = self._build_logical_path(dataset_path, target_path)
|
||||||
|
|
||||||
existing_record = await self._find_existing_text_record(dataset_id, target_path)
|
existing_record = await self._find_existing_text_record(dataset_id, target_path)
|
||||||
if existing_record:
|
if existing_record:
|
||||||
@@ -85,7 +85,7 @@ class PdfTextExtractService:
|
|||||||
file_size = self._get_file_size(target_path)
|
file_size = self._get_file_size(target_path)
|
||||||
parser_name = PARSER_BY_FILE_TYPE.get(file_type, "")
|
parser_name = PARSER_BY_FILE_TYPE.get(file_type, "")
|
||||||
record = await self._create_text_file_record(
|
record = await self._create_text_file_record(
|
||||||
dataset, file_record, target_path, file_size, parser_name, derived_file_type
|
dataset, file_record, target_path, logical_path, file_size, parser_name, derived_file_type
|
||||||
)
|
)
|
||||||
return self._build_response(dataset_id, file_id, record)
|
return self._build_response(dataset_id, file_id, record)
|
||||||
|
|
||||||
@@ -94,7 +94,7 @@ class PdfTextExtractService:
|
|||||||
self._write_text_file(target_path, text_content)
|
self._write_text_file(target_path, text_content)
|
||||||
file_size = self._get_file_size(target_path)
|
file_size = self._get_file_size(target_path)
|
||||||
record = await self._create_text_file_record(
|
record = await self._create_text_file_record(
|
||||||
dataset, file_record, target_path, file_size, parser_name, derived_file_type
|
dataset, file_record, target_path, logical_path, file_size, parser_name, derived_file_type
|
||||||
)
|
)
|
||||||
return self._build_response(dataset_id, file_id, record)
|
return self._build_response(dataset_id, file_id, record)
|
||||||
|
|
||||||
@@ -170,6 +170,19 @@ class PdfTextExtractService:
|
|||||||
target_dir.mkdir(parents=True, exist_ok=True)
|
target_dir.mkdir(parents=True, exist_ok=True)
|
||||||
return target_dir / output_name
|
return target_dir / output_name
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _build_logical_path(dataset_path: Path, target_path: Path) -> str:
|
||||||
|
normalized_dataset_path = dataset_path.resolve()
|
||||||
|
normalized_target_path = target_path.resolve()
|
||||||
|
try:
|
||||||
|
relative_path = normalized_target_path.relative_to(normalized_dataset_path)
|
||||||
|
except ValueError as exc:
|
||||||
|
raise HTTPException(status_code=400, detail="解析文件路径超出数据集目录") from exc
|
||||||
|
logical_path = str(relative_path).replace("\\", "/").strip()
|
||||||
|
if not logical_path:
|
||||||
|
raise HTTPException(status_code=500, detail="解析文件逻辑路径为空")
|
||||||
|
return logical_path
|
||||||
|
|
||||||
async def _find_existing_text_record(self, dataset_id: str, target_path: Path) -> DatasetFiles | None:
|
async def _find_existing_text_record(self, dataset_id: str, target_path: Path) -> DatasetFiles | None:
|
||||||
result = await self.db.execute(
|
result = await self.db.execute(
|
||||||
select(DatasetFiles).where(
|
select(DatasetFiles).where(
|
||||||
@@ -259,10 +272,12 @@ class PdfTextExtractService:
|
|||||||
dataset: Dataset,
|
dataset: Dataset,
|
||||||
source_file: DatasetFiles,
|
source_file: DatasetFiles,
|
||||||
target_path: Path,
|
target_path: Path,
|
||||||
|
logical_path: str,
|
||||||
file_size: int,
|
file_size: int,
|
||||||
parser_name: str,
|
parser_name: str,
|
||||||
derived_file_type: str,
|
derived_file_type: str,
|
||||||
) -> DatasetFiles:
|
) -> DatasetFiles:
|
||||||
|
assert logical_path
|
||||||
assert parser_name
|
assert parser_name
|
||||||
assert derived_file_type
|
assert derived_file_type
|
||||||
metadata = {
|
metadata = {
|
||||||
@@ -275,6 +290,7 @@ class PdfTextExtractService:
|
|||||||
dataset_id=dataset.id, # type: ignore[arg-type]
|
dataset_id=dataset.id, # type: ignore[arg-type]
|
||||||
file_name=target_path.name,
|
file_name=target_path.name,
|
||||||
file_path=str(target_path),
|
file_path=str(target_path),
|
||||||
|
logical_path=logical_path,
|
||||||
file_type=derived_file_type,
|
file_type=derived_file_type,
|
||||||
file_size=file_size,
|
file_size=file_size,
|
||||||
dataset_filemetadata=metadata,
|
dataset_filemetadata=metadata,
|
||||||
|
|||||||
Reference in New Issue
Block a user