From f77fd99c31d5467693997dcd23ae76e4eb7e4ba8 Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Thu, 29 Jan 2026 13:05:58 +0800 Subject: [PATCH] =?UTF-8?q?feat(data-management):=20=E6=89=A9=E5=B1=95?= =?UTF-8?q?=E6=96=87=E6=A1=A3=E8=A7=A3=E6=9E=90=E5=8A=9F=E8=83=BD=E6=94=AF?= =?UTF-8?q?=E6=8C=81DOC=E5=92=8CDOCX=E6=A0=BC=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加对DOC和DOCX文件类型的常量定义和支持 - 将文件类型验证逻辑从仅PDF扩展为PDF/DOC/DOCX - 集成Docx2txtLoader用于处理Word文档解析 - 更新错误消息为中文描述以提升用户体验 - 重构文件解析方法以支持多种文档格式 - 添加解析器元数据记录以追踪使用的解析工具 - 更新文件路径验证和构建逻辑以适配新的文件类型 --- .../DatasetFileApplicationService.java | 5 +- .../app/module/dataset/schema/pdf_extract.py | 16 ++--- .../app/module/dataset/service/pdf_extract.py | 60 +++++++++++++------ 3 files changed, 54 insertions(+), 27 deletions(-) diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java index 03dc0cd..8f3d9bc 100644 --- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java +++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java @@ -68,6 +68,9 @@ import java.util.stream.Stream; @Transactional public class DatasetFileApplicationService { private static final String PDF_FILE_TYPE = "pdf"; + private static final String DOC_FILE_TYPE = "doc"; + private static final String DOCX_FILE_TYPE = "docx"; + private static final Set DOCUMENT_TEXT_FILE_TYPES = Set.of(PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE); private final DatasetFileRepository datasetFileRepository; private final DatasetRepository datasetRepository; @@ -783,7 +786,7 @@ public class DatasetFileApplicationService { return; } String fileType = datasetFile.getFileType(); - if (fileType == null || !fileType.equalsIgnoreCase(PDF_FILE_TYPE)) { + if (fileType == null || !DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT))) { return; } pdfTextExtractAsyncService.extractPdfText(dataset.getId(), datasetFile.getId()); diff --git a/runtime/datamate-python/app/module/dataset/schema/pdf_extract.py b/runtime/datamate-python/app/module/dataset/schema/pdf_extract.py index 2ab5767..13c21d1 100644 --- a/runtime/datamate-python/app/module/dataset/schema/pdf_extract.py +++ b/runtime/datamate-python/app/module/dataset/schema/pdf_extract.py @@ -2,20 +2,20 @@ from pydantic import BaseModel, Field class PdfTextExtractRequest(BaseModel): - dataset_id: str = Field(..., alias="datasetId", description="Dataset ID") - file_id: str = Field(..., alias="fileId", description="PDF file ID") + dataset_id: str = Field(..., alias="datasetId", description="数据集ID") + file_id: str = Field(..., alias="fileId", description="源文件ID") class Config: populate_by_name = True class PdfTextExtractResponse(BaseModel): - dataset_id: str = Field(..., alias="datasetId", description="Dataset ID") - source_file_id: str = Field(..., alias="sourceFileId", description="Source PDF file ID") - text_file_id: str = Field(..., alias="textFileId", description="Generated text file ID") - text_file_name: str = Field(..., alias="textFileName", description="Generated text file name") - text_file_path: str = Field(..., alias="textFilePath", description="Generated text file path") - text_file_size: int = Field(..., alias="textFileSize", description="Generated text file size") + dataset_id: str = Field(..., alias="datasetId", description="数据集ID") + source_file_id: str = Field(..., alias="sourceFileId", description="源文件ID") + text_file_id: str = Field(..., alias="textFileId", description="解析后的文本文件ID") + text_file_name: str = Field(..., alias="textFileName", description="解析后的文本文件名") + text_file_path: str = Field(..., alias="textFilePath", description="解析后的文本文件路径") + text_file_size: int = Field(..., alias="textFileSize", description="解析后的文本文件大小") class Config: populate_by_name = True diff --git a/runtime/datamate-python/app/module/dataset/service/pdf_extract.py b/runtime/datamate-python/app/module/dataset/service/pdf_extract.py index 5a709af..c6438b9 100644 --- a/runtime/datamate-python/app/module/dataset/service/pdf_extract.py +++ b/runtime/datamate-python/app/module/dataset/service/pdf_extract.py @@ -3,7 +3,7 @@ import os from pathlib import Path from fastapi import HTTPException -from langchain_community.document_loaders import PyPDFLoader +from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession @@ -14,13 +14,25 @@ from app.module.dataset.schema.pdf_extract import PdfTextExtractResponse logger = get_logger(__name__) PDF_FILE_TYPE = "pdf" +DOC_FILE_TYPE = "doc" +DOCX_FILE_TYPE = "docx" TEXT_FILE_TYPE = "txt" TEXT_FILE_EXTENSION = ".txt" +SUPPORTED_FILE_TYPES = {PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE} +PARSER_BY_FILE_TYPE = { + PDF_FILE_TYPE: "PyPDFLoader", + DOC_FILE_TYPE: "Docx2txtLoader", + DOCX_FILE_TYPE: "Docx2txtLoader", +} +DEFAULT_EXTENSION_BY_TYPE = { + PDF_FILE_TYPE: ".pdf", + DOC_FILE_TYPE: ".doc", + DOCX_FILE_TYPE: ".docx", +} DERIVED_METADATA_KEY = "derived_from_file_id" DERIVED_METADATA_NAME_KEY = "derived_from_file_name" DERIVED_METADATA_TYPE_KEY = "derived_from_file_type" DERIVED_METADATA_PARSER_KEY = "parser" -DERIVED_METADATA_PARSER_VALUE = "PyPDFLoader" class PdfTextExtractService: @@ -32,6 +44,7 @@ class PdfTextExtractService: file_record = await self._get_file_record(dataset_id, file_id) self._validate_dataset_and_file(dataset, file_record) + file_type = str(getattr(file_record, "file_type", "") or "").lower() source_path = self._resolve_source_path(file_record) dataset_path = self._resolve_dataset_path(dataset) target_path = self._resolve_target_path(dataset_path, source_path, file_record, file_id) @@ -42,20 +55,22 @@ class PdfTextExtractService: if target_path.exists(): file_size = self._get_file_size(target_path) - record = await self._create_text_file_record(dataset, file_record, target_path, file_size) + parser_name = PARSER_BY_FILE_TYPE.get(file_type, "") + record = await self._create_text_file_record(dataset, file_record, target_path, file_size, parser_name) return self._build_response(dataset_id, file_id, record) - text_content = self._parse_pdf(source_path) + text_content, parser_name = self._parse_document(source_path, file_type) + assert isinstance(text_content, str) self._write_text_file(target_path, text_content) file_size = self._get_file_size(target_path) - record = await self._create_text_file_record(dataset, file_record, target_path, file_size) + record = await self._create_text_file_record(dataset, file_record, target_path, file_size, parser_name) return self._build_response(dataset_id, file_id, record) async def _get_dataset(self, dataset_id: str) -> Dataset: result = await self.db.execute(select(Dataset).where(Dataset.id == dataset_id)) dataset = result.scalar_one_or_none() if not dataset: - raise HTTPException(status_code=404, detail=f"Dataset not found: {dataset_id}") + raise HTTPException(status_code=404, detail=f"数据集不存在: {dataset_id}") return dataset async def _get_file_record(self, dataset_id: str, file_id: str) -> DatasetFiles: @@ -67,30 +82,30 @@ class PdfTextExtractService: ) file_record = result.scalar_one_or_none() if not file_record: - raise HTTPException(status_code=404, detail=f"File not found: {file_id}") + raise HTTPException(status_code=404, detail=f"文件不存在: {file_id}") return file_record @staticmethod def _validate_dataset_and_file(dataset: Dataset, file_record: DatasetFiles) -> None: dataset_type = str(getattr(dataset, "dataset_type", "") or "").upper() if dataset_type != "TEXT": - raise HTTPException(status_code=400, detail="Only TEXT datasets are supported") + raise HTTPException(status_code=400, detail="仅支持文本类型数据集") file_type = str(getattr(file_record, "file_type", "") or "").lower() - if file_type != PDF_FILE_TYPE: - raise HTTPException(status_code=400, detail="Only PDF files are supported") + if file_type not in SUPPORTED_FILE_TYPES: + raise HTTPException(status_code=400, detail="仅支持 PDF/DOC/DOCX 文件解析") @staticmethod def _resolve_source_path(file_record: DatasetFiles) -> Path: source_path = Path(str(file_record.file_path)).expanduser().resolve() if not source_path.exists(): - raise HTTPException(status_code=404, detail="PDF file not found on disk") + raise HTTPException(status_code=404, detail="源文件不存在") return source_path @staticmethod def _resolve_dataset_path(dataset: Dataset) -> Path: dataset_path_value = str(getattr(dataset, "path", "") or "").strip() if not dataset_path_value: - raise HTTPException(status_code=500, detail="Dataset path is empty") + raise HTTPException(status_code=500, detail="数据集路径为空") dataset_path = Path(dataset_path_value).expanduser().resolve() dataset_path.mkdir(parents=True, exist_ok=True) return dataset_path @@ -99,7 +114,9 @@ class PdfTextExtractService: def _build_output_filename(file_record: DatasetFiles, file_id: str) -> str: original_name = str(getattr(file_record, "file_name", "") or "").strip() if not original_name: - original_name = f"{file_id}.pdf" + file_type = str(getattr(file_record, "file_type", "") or "").lower() + default_extension = DEFAULT_EXTENSION_BY_TYPE.get(file_type, f".{file_type}") + original_name = f"{file_id}{default_extension}" return f"{original_name}{TEXT_FILE_EXTENSION}" def _resolve_target_path( @@ -116,7 +133,7 @@ class PdfTextExtractService: target_dir = dataset_path target_dir = target_dir.resolve() if target_dir != dataset_path and dataset_path not in target_dir.parents: - raise HTTPException(status_code=400, detail="Target path is outside dataset path") + raise HTTPException(status_code=400, detail="解析文件路径超出数据集目录") target_dir.mkdir(parents=True, exist_ok=True) return target_dir / output_name @@ -130,11 +147,16 @@ class PdfTextExtractService: return result.scalar_one_or_none() @staticmethod - def _parse_pdf(source_path: Path) -> str: - loader = PyPDFLoader(str(source_path)) + def _parse_document(source_path: Path, file_type: str) -> tuple[str, str]: + if file_type == PDF_FILE_TYPE: + loader = PyPDFLoader(str(source_path)) + parser_name = PARSER_BY_FILE_TYPE[PDF_FILE_TYPE] + else: + loader = Docx2txtLoader(str(source_path)) + parser_name = PARSER_BY_FILE_TYPE.get(file_type, "Docx2txtLoader") docs = loader.load() contents = [doc.page_content for doc in docs if doc.page_content] - return "\n\n".join(contents) + return "\n\n".join(contents), parser_name @staticmethod def _write_text_file(target_path: Path, content: str) -> None: @@ -154,12 +176,14 @@ class PdfTextExtractService: source_file: DatasetFiles, target_path: Path, file_size: int, + parser_name: str, ) -> DatasetFiles: + assert parser_name metadata = { DERIVED_METADATA_KEY: str(getattr(source_file, "id", "")), DERIVED_METADATA_NAME_KEY: str(getattr(source_file, "file_name", "")), DERIVED_METADATA_TYPE_KEY: str(getattr(source_file, "file_type", "")), - DERIVED_METADATA_PARSER_KEY: DERIVED_METADATA_PARSER_VALUE, + DERIVED_METADATA_PARSER_KEY: parser_name, } record = DatasetFiles( dataset_id=dataset.id, # type: ignore[arg-type]