From f77fd99c31d5467693997dcd23ae76e4eb7e4ba8 Mon Sep 17 00:00:00 2001
From: Jerry Yan <792602257@qq.com>
Date: Thu, 29 Jan 2026 13:05:58 +0800
Subject: [PATCH] =?UTF-8?q?feat(data-management):=20=E6=89=A9=E5=B1=95?=
 =?UTF-8?q?=E6=96=87=E6=A1=A3=E8=A7=A3=E6=9E=90=E5=8A=9F=E8=83=BD=E6=94=AF?=
 =?UTF-8?q?=E6=8C=81DOC=E5=92=8CDOCX=E6=A0=BC=E5=BC=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 添加对DOC和DOCX文件类型的常量定义和支持
- 将文件类型验证逻辑从仅PDF扩展为PDF/DOC/DOCX
- 集成Docx2txtLoader用于处理Word文档解析
- 更新错误消息为中文描述以提升用户体验
- 重构文件解析方法以支持多种文档格式
- 添加解析器元数据记录以追踪使用的解析工具
- 更新文件路径验证和构建逻辑以适配新的文件类型
---
 .../DatasetFileApplicationService.java        |  5 +-
 .../app/module/dataset/schema/pdf_extract.py  | 16 ++---
 .../app/module/dataset/service/pdf_extract.py | 60 +++++++++++++------
 3 files changed, 54 insertions(+), 27 deletions(-)
diff --git a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java
index 03dc0cd..8f3d9bc 100644
--- a/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java
+++ b/backend/services/data-management-service/src/main/java/com/datamate/datamanagement/application/DatasetFileApplicationService.java
@@ -68,6 +68,9 @@ import java.util.stream.Stream;
 @Transactional
 public class DatasetFileApplicationService {
     private static final String PDF_FILE_TYPE = "pdf";
+    private static final String DOC_FILE_TYPE = "doc";
+    private static final String DOCX_FILE_TYPE = "docx";
+    private static final Set<String> DOCUMENT_TEXT_FILE_TYPES = Set.of(PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE);
 
     private final DatasetFileRepository datasetFileRepository;
     private final DatasetRepository datasetRepository;
@@ -783,7 +786,7 @@ public class DatasetFileApplicationService {
             return;
         }
         String fileType = datasetFile.getFileType();
-        if (fileType == null || !fileType.equalsIgnoreCase(PDF_FILE_TYPE)) {
+        if (fileType == null || !DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT))) {
             return;
         }
         pdfTextExtractAsyncService.extractPdfText(dataset.getId(), datasetFile.getId());
diff --git a/runtime/datamate-python/app/module/dataset/schema/pdf_extract.py b/runtime/datamate-python/app/module/dataset/schema/pdf_extract.py
index 2ab5767..13c21d1 100644
--- a/runtime/datamate-python/app/module/dataset/schema/pdf_extract.py
+++ b/runtime/datamate-python/app/module/dataset/schema/pdf_extract.py
@@ -2,20 +2,20 @@ from pydantic import BaseModel, Field
 
 
 class PdfTextExtractRequest(BaseModel):
-    dataset_id: str = Field(..., alias="datasetId", description="Dataset ID")
-    file_id: str = Field(..., alias="fileId", description="PDF file ID")
+    dataset_id: str = Field(..., alias="datasetId", description="数据集ID")
+    file_id: str = Field(..., alias="fileId", description="源文件ID")
 
     class Config:
         populate_by_name = True
 
 
 class PdfTextExtractResponse(BaseModel):
-    dataset_id: str = Field(..., alias="datasetId", description="Dataset ID")
-    source_file_id: str = Field(..., alias="sourceFileId", description="Source PDF file ID")
-    text_file_id: str = Field(..., alias="textFileId", description="Generated text file ID")
-    text_file_name: str = Field(..., alias="textFileName", description="Generated text file name")
-    text_file_path: str = Field(..., alias="textFilePath", description="Generated text file path")
-    text_file_size: int = Field(..., alias="textFileSize", description="Generated text file size")
+    dataset_id: str = Field(..., alias="datasetId", description="数据集ID")
+    source_file_id: str = Field(..., alias="sourceFileId", description="源文件ID")
+    text_file_id: str = Field(..., alias="textFileId", description="解析后的文本文件ID")
+    text_file_name: str = Field(..., alias="textFileName", description="解析后的文本文件名")
+    text_file_path: str = Field(..., alias="textFilePath", description="解析后的文本文件路径")
+    text_file_size: int = Field(..., alias="textFileSize", description="解析后的文本文件大小")
 
     class Config:
         populate_by_name = True
diff --git a/runtime/datamate-python/app/module/dataset/service/pdf_extract.py b/runtime/datamate-python/app/module/dataset/service/pdf_extract.py
index 5a709af..c6438b9 100644
--- a/runtime/datamate-python/app/module/dataset/service/pdf_extract.py
+++ b/runtime/datamate-python/app/module/dataset/service/pdf_extract.py
@@ -3,7 +3,7 @@ import os
 from pathlib import Path
 
 from fastapi import HTTPException
-from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession
 
@@ -14,13 +14,25 @@ from app.module.dataset.schema.pdf_extract import PdfTextExtractResponse
 logger = get_logger(__name__)
 
 PDF_FILE_TYPE = "pdf"
+DOC_FILE_TYPE = "doc"
+DOCX_FILE_TYPE = "docx"
 TEXT_FILE_TYPE = "txt"
 TEXT_FILE_EXTENSION = ".txt"
+SUPPORTED_FILE_TYPES = {PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE}
+PARSER_BY_FILE_TYPE = {
+    PDF_FILE_TYPE: "PyPDFLoader",
+    DOC_FILE_TYPE: "Docx2txtLoader",
+    DOCX_FILE_TYPE: "Docx2txtLoader",
+}
+DEFAULT_EXTENSION_BY_TYPE = {
+    PDF_FILE_TYPE: ".pdf",
+    DOC_FILE_TYPE: ".doc",
+    DOCX_FILE_TYPE: ".docx",
+}
 DERIVED_METADATA_KEY = "derived_from_file_id"
 DERIVED_METADATA_NAME_KEY = "derived_from_file_name"
 DERIVED_METADATA_TYPE_KEY = "derived_from_file_type"
 DERIVED_METADATA_PARSER_KEY = "parser"
-DERIVED_METADATA_PARSER_VALUE = "PyPDFLoader"
 
 
 class PdfTextExtractService:
@@ -32,6 +44,7 @@ class PdfTextExtractService:
         file_record = await self._get_file_record(dataset_id, file_id)
         self._validate_dataset_and_file(dataset, file_record)
 
+        file_type = str(getattr(file_record, "file_type", "") or "").lower()
         source_path = self._resolve_source_path(file_record)
         dataset_path = self._resolve_dataset_path(dataset)
         target_path = self._resolve_target_path(dataset_path, source_path, file_record, file_id)
@@ -42,20 +55,22 @@ class PdfTextExtractService:
 
         if target_path.exists():
             file_size = self._get_file_size(target_path)
-            record = await self._create_text_file_record(dataset, file_record, target_path, file_size)
+            parser_name = PARSER_BY_FILE_TYPE.get(file_type, "")
+            record = await self._create_text_file_record(dataset, file_record, target_path, file_size, parser_name)
             return self._build_response(dataset_id, file_id, record)
 
-        text_content = self._parse_pdf(source_path)
+        text_content, parser_name = self._parse_document(source_path, file_type)
+        assert isinstance(text_content, str)
         self._write_text_file(target_path, text_content)
         file_size = self._get_file_size(target_path)
-        record = await self._create_text_file_record(dataset, file_record, target_path, file_size)
+        record = await self._create_text_file_record(dataset, file_record, target_path, file_size, parser_name)
         return self._build_response(dataset_id, file_id, record)
 
     async def _get_dataset(self, dataset_id: str) -> Dataset:
         result = await self.db.execute(select(Dataset).where(Dataset.id == dataset_id))
         dataset = result.scalar_one_or_none()
         if not dataset:
-            raise HTTPException(status_code=404, detail=f"Dataset not found: {dataset_id}")
+            raise HTTPException(status_code=404, detail=f"数据集不存在: {dataset_id}")
         return dataset
 
     async def _get_file_record(self, dataset_id: str, file_id: str) -> DatasetFiles:
@@ -67,30 +82,30 @@ class PdfTextExtractService:
         )
         file_record = result.scalar_one_or_none()
         if not file_record:
-            raise HTTPException(status_code=404, detail=f"File not found: {file_id}")
+            raise HTTPException(status_code=404, detail=f"文件不存在: {file_id}")
         return file_record
 
     @staticmethod
     def _validate_dataset_and_file(dataset: Dataset, file_record: DatasetFiles) -> None:
         dataset_type = str(getattr(dataset, "dataset_type", "") or "").upper()
         if dataset_type != "TEXT":
-            raise HTTPException(status_code=400, detail="Only TEXT datasets are supported")
+            raise HTTPException(status_code=400, detail="仅支持文本类型数据集")
         file_type = str(getattr(file_record, "file_type", "") or "").lower()
-        if file_type != PDF_FILE_TYPE:
-            raise HTTPException(status_code=400, detail="Only PDF files are supported")
+        if file_type not in SUPPORTED_FILE_TYPES:
+            raise HTTPException(status_code=400, detail="仅支持 PDF/DOC/DOCX 文件解析")
 
     @staticmethod
     def _resolve_source_path(file_record: DatasetFiles) -> Path:
         source_path = Path(str(file_record.file_path)).expanduser().resolve()
         if not source_path.exists():
-            raise HTTPException(status_code=404, detail="PDF file not found on disk")
+            raise HTTPException(status_code=404, detail="源文件不存在")
         return source_path
 
     @staticmethod
     def _resolve_dataset_path(dataset: Dataset) -> Path:
         dataset_path_value = str(getattr(dataset, "path", "") or "").strip()
         if not dataset_path_value:
-            raise HTTPException(status_code=500, detail="Dataset path is empty")
+            raise HTTPException(status_code=500, detail="数据集路径为空")
         dataset_path = Path(dataset_path_value).expanduser().resolve()
         dataset_path.mkdir(parents=True, exist_ok=True)
         return dataset_path
@@ -99,7 +114,9 @@ class PdfTextExtractService:
     def _build_output_filename(file_record: DatasetFiles, file_id: str) -> str:
         original_name = str(getattr(file_record, "file_name", "") or "").strip()
         if not original_name:
-            original_name = f"{file_id}.pdf"
+            file_type = str(getattr(file_record, "file_type", "") or "").lower()
+            default_extension = DEFAULT_EXTENSION_BY_TYPE.get(file_type, f".{file_type}")
+            original_name = f"{file_id}{default_extension}"
         return f"{original_name}{TEXT_FILE_EXTENSION}"
 
     def _resolve_target_path(
@@ -116,7 +133,7 @@ class PdfTextExtractService:
             target_dir = dataset_path
         target_dir = target_dir.resolve()
         if target_dir != dataset_path and dataset_path not in target_dir.parents:
-            raise HTTPException(status_code=400, detail="Target path is outside dataset path")
+            raise HTTPException(status_code=400, detail="解析文件路径超出数据集目录")
         target_dir.mkdir(parents=True, exist_ok=True)
         return target_dir / output_name
 
@@ -130,11 +147,16 @@ class PdfTextExtractService:
         return result.scalar_one_or_none()
 
     @staticmethod
-    def _parse_pdf(source_path: Path) -> str:
-        loader = PyPDFLoader(str(source_path))
+    def _parse_document(source_path: Path, file_type: str) -> tuple[str, str]:
+        if file_type == PDF_FILE_TYPE:
+            loader = PyPDFLoader(str(source_path))
+            parser_name = PARSER_BY_FILE_TYPE[PDF_FILE_TYPE]
+        else:
+            loader = Docx2txtLoader(str(source_path))
+            parser_name = PARSER_BY_FILE_TYPE.get(file_type, "Docx2txtLoader")
         docs = loader.load()
         contents = [doc.page_content for doc in docs if doc.page_content]
-        return "\n\n".join(contents)
+        return "\n\n".join(contents), parser_name
 
     @staticmethod
     def _write_text_file(target_path: Path, content: str) -> None:
@@ -154,12 +176,14 @@ class PdfTextExtractService:
         source_file: DatasetFiles,
         target_path: Path,
         file_size: int,
+        parser_name: str,
     ) -> DatasetFiles:
+        assert parser_name
         metadata = {
             DERIVED_METADATA_KEY: str(getattr(source_file, "id", "")),
             DERIVED_METADATA_NAME_KEY: str(getattr(source_file, "file_name", "")),
             DERIVED_METADATA_TYPE_KEY: str(getattr(source_file, "file_type", "")),
-            DERIVED_METADATA_PARSER_KEY: DERIVED_METADATA_PARSER_VALUE,
+            DERIVED_METADATA_PARSER_KEY: parser_name,
         }
         record = DatasetFiles(
             dataset_id=dataset.id,  # type: ignore[arg-type]