From 5eafcf0145df66136862b33a31f1ae844ab34616 Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Thu, 29 Jan 2026 11:39:00 +0800 Subject: [PATCH] =?UTF-8?q?refactor(request):=20=E4=BC=98=E5=8C=96?= =?UTF-8?q?=E8=AF=B7=E6=B1=82=E5=A4=84=E7=90=86=E9=80=BB=E8=BE=91=E5=B9=B6?= =?UTF-8?q?=E6=B7=BB=E5=8A=A0PDF=E6=8F=90=E5=8F=96=E6=8E=A5=E5=8F=A3?= =?UTF-8?q?=E5=AE=9A=E4=B9=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 简化catch语句移除不必要的异常变量 - 删除无用的FormData条件判断代码 - 将变量声明从let改为const提升代码质量 - 移除响应拦截器中的冗余参数传递 - 在数据集模式模块中添加PDF文本提取相关的请求响应模型 - 更新模块导出列表包含新的PDF提取接口类型定义 --- frontend/src/utils/request.ts | 8 +- .../app/module/dataset/interface/__init__.py | 10 + .../app/module/dataset/schema/__init__.py | 8 +- .../app/module/dataset/service/pdf_extract.py | 190 ++++++++++++++++++ 4 files changed, 210 insertions(+), 6 deletions(-) create mode 100644 runtime/datamate-python/app/module/dataset/interface/__init__.py create mode 100644 runtime/datamate-python/app/module/dataset/service/pdf_extract.py diff --git a/frontend/src/utils/request.ts b/frontend/src/utils/request.ts index 760baf8..b3d972b 100644 --- a/frontend/src/utils/request.ts +++ b/frontend/src/utils/request.ts @@ -123,7 +123,7 @@ class Request { } else { response = xhr.responseText; } - } catch (e) { + } catch { response = xhr.responseText; } @@ -270,8 +270,6 @@ class Request { return await this.handleXHRResponse(xhrResponse, processedConfig); } // 否则使用fetch - if (processedConfig.body instanceof FormData) { - } const response = await fetch(url, processedConfig); return await this.handleResponse(response, processedConfig); } @@ -368,7 +366,7 @@ class Request { */ async delete(url, params = null, options = {}) { let fullURL = this.baseURL + url; - let config = { + const config = { method: "DELETE", redirect: "follow", headers: { @@ -525,7 +523,7 @@ request.addRequestInterceptor((config) => { }); // 添加默认响应拦截器 - 错误处理 -request.addResponseInterceptor((response, 
config) => { +request.addResponseInterceptor((response) => { // 可以在这里添加全局错误处理逻辑 // 比如token过期自动跳转登录页等 return response; diff --git a/runtime/datamate-python/app/module/dataset/interface/__init__.py b/runtime/datamate-python/app/module/dataset/interface/__init__.py new file mode 100644 index 0000000..d8fe636 --- /dev/null +++ b/runtime/datamate-python/app/module/dataset/interface/__init__.py @@ -0,0 +1,10 @@ +from fastapi import APIRouter + +from .pdf_extract import router as pdf_extract_router + +router = APIRouter( + prefix="/dataset", + tags=["dataset"], +) + +router.include_router(pdf_extract_router) diff --git a/runtime/datamate-python/app/module/dataset/schema/__init__.py b/runtime/datamate-python/app/module/dataset/schema/__init__.py index 221c43f..301a9f5 100644 --- a/runtime/datamate-python/app/module/dataset/schema/__init__.py +++ b/runtime/datamate-python/app/module/dataset/schema/__init__.py @@ -11,6 +11,10 @@ from .dataset import ( DatasetResponse, DatasetTypeResponse, ) +from .pdf_extract import ( + PdfTextExtractRequest, + PdfTextExtractResponse, +) __all__ = [ "DatasetResponse", @@ -21,4 +25,6 @@ __all__ = [ "BatchUpdateFileTagsResponse", "FileTagUpdateResult", "FileTagUpdate", -] \ No newline at end of file + "PdfTextExtractRequest", + "PdfTextExtractResponse", +] diff --git a/runtime/datamate-python/app/module/dataset/service/pdf_extract.py b/runtime/datamate-python/app/module/dataset/service/pdf_extract.py new file mode 100644 index 0000000..5a709af --- /dev/null +++ b/runtime/datamate-python/app/module/dataset/service/pdf_extract.py @@ -0,0 +1,190 @@ +import datetime +import os +from pathlib import Path + +from fastapi import HTTPException +from langchain_community.document_loaders import PyPDFLoader +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.core.logging import get_logger +from app.db.models import Dataset, DatasetFiles +from app.module.dataset.schema.pdf_extract import PdfTextExtractResponse + 
logger = get_logger(__name__)

# File-type markers as stored in DatasetFiles.file_type.
PDF_FILE_TYPE = "pdf"
TEXT_FILE_TYPE = "txt"
TEXT_FILE_EXTENSION = ".txt"
# Metadata keys written onto the derived text file record so that it can be
# traced back to the PDF it was extracted from.
DERIVED_METADATA_KEY = "derived_from_file_id"
DERIVED_METADATA_NAME_KEY = "derived_from_file_name"
DERIVED_METADATA_TYPE_KEY = "derived_from_file_type"
DERIVED_METADATA_PARSER_KEY = "parser"
DERIVED_METADATA_PARSER_VALUE = "PyPDFLoader"


class PdfTextExtractService:
    """Extract the text of a PDF dataset file into a sibling ``.txt`` file.

    The extracted text is written next to the source PDF (or into the dataset
    root when the PDF lives outside the dataset directory) and registered as a
    new ``DatasetFiles`` row whose metadata points back at the source file.
    """

    def __init__(self, db: AsyncSession):
        """Store the async database session used for all queries and commits."""
        self.db = db

    async def extract_pdf_to_text(self, dataset_id: str, file_id: str) -> PdfTextExtractResponse:
        """Extract text from one PDF file of a dataset and register the result.

        The call reuses earlier work where possible: an existing database
        record for the target path is returned as-is, and a text file that is
        already on disk is registered without re-parsing the PDF.

        Args:
            dataset_id: Identifier of the dataset that owns the file.
            file_id: Identifier of the PDF file record to extract.

        Returns:
            A response describing the derived text file.

        Raises:
            HTTPException: 404 when the dataset, the file record, or the PDF
                on disk is missing; 400 when the dataset is not a TEXT dataset,
                the file is not a PDF, or the target directory falls outside
                the dataset path; 500 when the dataset has no path.
        """
        # Local import keeps this block self-contained; asyncio is only needed
        # to push blocking work off the event loop below.
        import asyncio

        dataset = await self._get_dataset(dataset_id)
        file_record = await self._get_file_record(dataset_id, file_id)
        self._validate_dataset_and_file(dataset, file_record)

        source_path = self._resolve_source_path(file_record)
        dataset_path = self._resolve_dataset_path(dataset)
        target_path = self._resolve_target_path(dataset_path, source_path, file_record, file_id)

        existing_record = await self._find_existing_text_record(dataset_id, target_path)
        if existing_record:
            return self._build_response(dataset_id, file_id, existing_record)

        if not target_path.exists():
            # PDF parsing and file writing are blocking calls; run them in a
            # worker thread so other requests are not stalled meanwhile.
            text_content = await asyncio.to_thread(self._parse_pdf, source_path)
            await asyncio.to_thread(self._write_text_file, target_path, text_content)

        file_size = self._get_file_size(target_path)
        record = await self._create_text_file_record(dataset, file_record, target_path, file_size)
        return self._build_response(dataset_id, file_id, record)

    async def _get_dataset(self, dataset_id: str) -> Dataset:
        """Load the dataset row or raise a 404 ``HTTPException``."""
        result = await self.db.execute(select(Dataset).where(Dataset.id == dataset_id))
        dataset = result.scalar_one_or_none()
        if not dataset:
            raise HTTPException(status_code=404, detail=f"Dataset not found: {dataset_id}")
        return dataset

    async def _get_file_record(self, dataset_id: str, file_id: str) -> DatasetFiles:
        """Load the file row belonging to the dataset or raise a 404 ``HTTPException``."""
        result = await self.db.execute(
            select(DatasetFiles).where(
                DatasetFiles.id == file_id,
                DatasetFiles.dataset_id == dataset_id,
            )
        )
        file_record = result.scalar_one_or_none()
        if not file_record:
            raise HTTPException(status_code=404, detail=f"File not found: {file_id}")
        return file_record

    @staticmethod
    def _validate_dataset_and_file(dataset: Dataset, file_record: DatasetFiles) -> None:
        """Reject anything other than a PDF file inside a TEXT dataset (HTTP 400)."""
        dataset_type = str(getattr(dataset, "dataset_type", "") or "").upper()
        if dataset_type != "TEXT":
            raise HTTPException(status_code=400, detail="Only TEXT datasets are supported")
        file_type = str(getattr(file_record, "file_type", "") or "").lower()
        if file_type != PDF_FILE_TYPE:
            raise HTTPException(status_code=400, detail="Only PDF files are supported")

    @staticmethod
    def _resolve_source_path(file_record: DatasetFiles) -> Path:
        """Return the absolute path of the source PDF, raising 404 if it is not on disk."""
        source_path = Path(str(file_record.file_path)).expanduser().resolve()
        if not source_path.exists():
            raise HTTPException(status_code=404, detail="PDF file not found on disk")
        return source_path

    @staticmethod
    def _resolve_dataset_path(dataset: Dataset) -> Path:
        """Return the absolute dataset directory, creating it when missing.

        Raises:
            HTTPException: 500 when the dataset record has no path.
        """
        dataset_path_value = str(getattr(dataset, "path", "") or "").strip()
        if not dataset_path_value:
            raise HTTPException(status_code=500, detail="Dataset path is empty")
        dataset_path = Path(dataset_path_value).expanduser().resolve()
        dataset_path.mkdir(parents=True, exist_ok=True)
        return dataset_path

    @staticmethod
    def _build_output_filename(file_record: DatasetFiles, file_id: str) -> str:
        """Build the text file name: the source file name plus ``.txt``.

        Only the final path component of the stored file name is used. A
        stored name containing directory parts or an absolute path would
        otherwise let the joined target path leave the target directory.
        Falls back to ``<file_id>.pdf`` when no usable name remains.
        """
        raw_name = str(getattr(file_record, "file_name", "") or "").strip()
        original_name = Path(raw_name).name
        if not original_name:
            original_name = f"{file_id}.pdf"
        return f"{original_name}{TEXT_FILE_EXTENSION}"

    def _resolve_target_path(
        self,
        dataset_path: Path,
        source_path: Path,
        file_record: DatasetFiles,
        file_id: str,
    ) -> Path:
        """Choose where the extracted text file is written.

        The text file goes next to the PDF when the PDF lives under the
        dataset directory, and into the dataset root otherwise.

        Raises:
            HTTPException: 400 when the chosen directory is outside the
                dataset path.
        """
        output_name = self._build_output_filename(file_record, file_id)
        if dataset_path in source_path.parents:
            target_dir = source_path.parent
        else:
            target_dir = dataset_path
        target_dir = target_dir.resolve()
        if target_dir != dataset_path and dataset_path not in target_dir.parents:
            raise HTTPException(status_code=400, detail="Target path is outside dataset path")
        target_dir.mkdir(parents=True, exist_ok=True)
        return target_dir / output_name

    async def _find_existing_text_record(self, dataset_id: str, target_path: Path) -> DatasetFiles | None:
        """Return a file record already registered for ``target_path``, if any.

        The first match is returned rather than requiring uniqueness: if
        duplicate rows exist for the same path, the lookup should still
        succeed instead of raising ``MultipleResultsFound``.
        """
        result = await self.db.execute(
            select(DatasetFiles).where(
                DatasetFiles.dataset_id == dataset_id,
                DatasetFiles.file_path == str(target_path),
            )
        )
        return result.scalars().first()

    @staticmethod
    def _parse_pdf(source_path: Path) -> str:
        """Load the PDF with ``PyPDFLoader`` and join non-empty page texts with blank lines."""
        loader = PyPDFLoader(str(source_path))
        docs = loader.load()
        contents = [doc.page_content for doc in docs if doc.page_content]
        return "\n\n".join(contents)

    @staticmethod
    def _write_text_file(target_path: Path, content: str) -> None:
        """Write ``content`` to ``target_path`` as UTF-8, replacing any previous content.

        If writing fails after the file was opened, the partial file is
        removed before the error is re-raised. A leftover partial file would
        otherwise be picked up by a later extraction as a finished result.
        """
        opened = False
        try:
            with open(target_path, "w", encoding="utf-8") as handle:
                opened = True
                handle.write(content or "")
        except Exception:
            if opened:
                try:
                    target_path.unlink(missing_ok=True)
                except OSError:
                    logger.warning(f"Failed to remove partial text file: {target_path}")
            raise

    @staticmethod
    def _get_file_size(path: Path) -> int:
        """Return the file size in bytes, or 0 (with a warning) when it cannot be read."""
        try:
            return int(os.path.getsize(path))
        except OSError as exc:
            # Best effort: size is bookkeeping only, so do not fail the request.
            logger.warning(f"Failed to read size of {path}: {exc}")
            return 0

    async def _create_text_file_record(
        self,
        dataset: Dataset,
        source_file: DatasetFiles,
        target_path: Path,
        file_size: int,
    ) -> DatasetFiles:
        """Insert the derived text file row and update the dataset counters.

        The dataset's ``file_count`` and ``size_bytes`` are incremented and
        its status is set to ``"ACTIVE"`` in the same commit. The session is
        rolled back if the commit fails, then the error is re-raised.

        Returns:
            The persisted and refreshed ``DatasetFiles`` record.
        """
        metadata = {
            DERIVED_METADATA_KEY: str(getattr(source_file, "id", "")),
            DERIVED_METADATA_NAME_KEY: str(getattr(source_file, "file_name", "")),
            DERIVED_METADATA_TYPE_KEY: str(getattr(source_file, "file_type", "")),
            DERIVED_METADATA_PARSER_KEY: DERIVED_METADATA_PARSER_VALUE,
        }
        record = DatasetFiles(
            dataset_id=dataset.id,  # type: ignore[arg-type]
            file_name=target_path.name,
            file_path=str(target_path),
            file_type=TEXT_FILE_TYPE,
            file_size=file_size,
            dataset_filemetadata=metadata,
            last_access_time=datetime.datetime.now(datetime.UTC),
        )
        self.db.add(record)
        dataset.file_count = (dataset.file_count or 0) + 1
        dataset.size_bytes = (dataset.size_bytes or 0) + file_size
        dataset.status = "ACTIVE"
        try:
            await self.db.commit()
        except Exception:
            # Leave the session usable for the caller's error handling.
            await self.db.rollback()
            raise
        await self.db.refresh(record)
        return record

    @staticmethod
    def _build_response(dataset_id: str, file_id: str, record: DatasetFiles) -> PdfTextExtractResponse:
        """Map a text file record onto the API response model."""
        return PdfTextExtractResponse(
            datasetId=dataset_id,
            sourceFileId=file_id,
            textFileId=str(record.id),
            textFileName=str(record.file_name),
            textFilePath=str(record.file_path),
            textFileSize=int(record.file_size or 0),
        )