You've already forked DataMate
feat(data-management): 扩展文档解析功能支持DOC和DOCX格式
- 添加对DOC和DOCX文件类型的常量定义和支持
- 将文件类型验证逻辑从仅PDF扩展为PDF/DOC/DOCX
- 集成Docx2txtLoader用于处理Word文档解析
- 更新错误消息为中文描述以提升用户体验
- 重构文件解析方法以支持多种文档格式
- 添加解析器元数据记录以追踪使用的解析工具
- 更新文件路径验证和构建逻辑以适配新的文件类型
This commit is contained in:
@@ -68,6 +68,9 @@ import java.util.stream.Stream;
|
|||||||
@Transactional
|
@Transactional
|
||||||
public class DatasetFileApplicationService {
|
public class DatasetFileApplicationService {
|
||||||
private static final String PDF_FILE_TYPE = "pdf";
|
private static final String PDF_FILE_TYPE = "pdf";
|
||||||
|
private static final String DOC_FILE_TYPE = "doc";
|
||||||
|
private static final String DOCX_FILE_TYPE = "docx";
|
||||||
|
private static final Set<String> DOCUMENT_TEXT_FILE_TYPES = Set.of(PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE);
|
||||||
|
|
||||||
private final DatasetFileRepository datasetFileRepository;
|
private final DatasetFileRepository datasetFileRepository;
|
||||||
private final DatasetRepository datasetRepository;
|
private final DatasetRepository datasetRepository;
|
||||||
@@ -783,7 +786,7 @@ public class DatasetFileApplicationService {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
String fileType = datasetFile.getFileType();
|
String fileType = datasetFile.getFileType();
|
||||||
if (fileType == null || !fileType.equalsIgnoreCase(PDF_FILE_TYPE)) {
|
if (fileType == null || !DOCUMENT_TEXT_FILE_TYPES.contains(fileType.toLowerCase(Locale.ROOT))) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
pdfTextExtractAsyncService.extractPdfText(dataset.getId(), datasetFile.getId());
|
pdfTextExtractAsyncService.extractPdfText(dataset.getId(), datasetFile.getId());
|
||||||
|
|||||||
@@ -2,20 +2,20 @@ from pydantic import BaseModel, Field
|
|||||||
|
|
||||||
|
|
||||||
class PdfTextExtractRequest(BaseModel):
|
class PdfTextExtractRequest(BaseModel):
|
||||||
dataset_id: str = Field(..., alias="datasetId", description="Dataset ID")
|
dataset_id: str = Field(..., alias="datasetId", description="数据集ID")
|
||||||
file_id: str = Field(..., alias="fileId", description="PDF file ID")
|
file_id: str = Field(..., alias="fileId", description="源文件ID")
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
populate_by_name = True
|
populate_by_name = True
|
||||||
|
|
||||||
|
|
||||||
class PdfTextExtractResponse(BaseModel):
|
class PdfTextExtractResponse(BaseModel):
|
||||||
dataset_id: str = Field(..., alias="datasetId", description="Dataset ID")
|
dataset_id: str = Field(..., alias="datasetId", description="数据集ID")
|
||||||
source_file_id: str = Field(..., alias="sourceFileId", description="Source PDF file ID")
|
source_file_id: str = Field(..., alias="sourceFileId", description="源文件ID")
|
||||||
text_file_id: str = Field(..., alias="textFileId", description="Generated text file ID")
|
text_file_id: str = Field(..., alias="textFileId", description="解析后的文本文件ID")
|
||||||
text_file_name: str = Field(..., alias="textFileName", description="Generated text file name")
|
text_file_name: str = Field(..., alias="textFileName", description="解析后的文本文件名")
|
||||||
text_file_path: str = Field(..., alias="textFilePath", description="Generated text file path")
|
text_file_path: str = Field(..., alias="textFilePath", description="解析后的文本文件路径")
|
||||||
text_file_size: int = Field(..., alias="textFileSize", description="Generated text file size")
|
text_file_size: int = Field(..., alias="textFileSize", description="解析后的文本文件大小")
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
populate_by_name = True
|
populate_by_name = True
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ import os
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from fastapi import HTTPException
|
from fastapi import HTTPException
|
||||||
from langchain_community.document_loaders import PyPDFLoader
|
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader
|
||||||
from sqlalchemy import select
|
from sqlalchemy import select
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
@@ -14,13 +14,25 @@ from app.module.dataset.schema.pdf_extract import PdfTextExtractResponse
|
|||||||
logger = get_logger(__name__)
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
PDF_FILE_TYPE = "pdf"
|
PDF_FILE_TYPE = "pdf"
|
||||||
|
DOC_FILE_TYPE = "doc"
|
||||||
|
DOCX_FILE_TYPE = "docx"
|
||||||
TEXT_FILE_TYPE = "txt"
|
TEXT_FILE_TYPE = "txt"
|
||||||
TEXT_FILE_EXTENSION = ".txt"
|
TEXT_FILE_EXTENSION = ".txt"
|
||||||
|
SUPPORTED_FILE_TYPES = {PDF_FILE_TYPE, DOC_FILE_TYPE, DOCX_FILE_TYPE}
|
||||||
|
PARSER_BY_FILE_TYPE = {
|
||||||
|
PDF_FILE_TYPE: "PyPDFLoader",
|
||||||
|
DOC_FILE_TYPE: "Docx2txtLoader",
|
||||||
|
DOCX_FILE_TYPE: "Docx2txtLoader",
|
||||||
|
}
|
||||||
|
DEFAULT_EXTENSION_BY_TYPE = {
|
||||||
|
PDF_FILE_TYPE: ".pdf",
|
||||||
|
DOC_FILE_TYPE: ".doc",
|
||||||
|
DOCX_FILE_TYPE: ".docx",
|
||||||
|
}
|
||||||
DERIVED_METADATA_KEY = "derived_from_file_id"
|
DERIVED_METADATA_KEY = "derived_from_file_id"
|
||||||
DERIVED_METADATA_NAME_KEY = "derived_from_file_name"
|
DERIVED_METADATA_NAME_KEY = "derived_from_file_name"
|
||||||
DERIVED_METADATA_TYPE_KEY = "derived_from_file_type"
|
DERIVED_METADATA_TYPE_KEY = "derived_from_file_type"
|
||||||
DERIVED_METADATA_PARSER_KEY = "parser"
|
DERIVED_METADATA_PARSER_KEY = "parser"
|
||||||
DERIVED_METADATA_PARSER_VALUE = "PyPDFLoader"
|
|
||||||
|
|
||||||
|
|
||||||
class PdfTextExtractService:
|
class PdfTextExtractService:
|
||||||
@@ -32,6 +44,7 @@ class PdfTextExtractService:
|
|||||||
file_record = await self._get_file_record(dataset_id, file_id)
|
file_record = await self._get_file_record(dataset_id, file_id)
|
||||||
self._validate_dataset_and_file(dataset, file_record)
|
self._validate_dataset_and_file(dataset, file_record)
|
||||||
|
|
||||||
|
file_type = str(getattr(file_record, "file_type", "") or "").lower()
|
||||||
source_path = self._resolve_source_path(file_record)
|
source_path = self._resolve_source_path(file_record)
|
||||||
dataset_path = self._resolve_dataset_path(dataset)
|
dataset_path = self._resolve_dataset_path(dataset)
|
||||||
target_path = self._resolve_target_path(dataset_path, source_path, file_record, file_id)
|
target_path = self._resolve_target_path(dataset_path, source_path, file_record, file_id)
|
||||||
@@ -42,20 +55,22 @@ class PdfTextExtractService:
|
|||||||
|
|
||||||
if target_path.exists():
|
if target_path.exists():
|
||||||
file_size = self._get_file_size(target_path)
|
file_size = self._get_file_size(target_path)
|
||||||
record = await self._create_text_file_record(dataset, file_record, target_path, file_size)
|
parser_name = PARSER_BY_FILE_TYPE.get(file_type, "")
|
||||||
|
record = await self._create_text_file_record(dataset, file_record, target_path, file_size, parser_name)
|
||||||
return self._build_response(dataset_id, file_id, record)
|
return self._build_response(dataset_id, file_id, record)
|
||||||
|
|
||||||
text_content = self._parse_pdf(source_path)
|
text_content, parser_name = self._parse_document(source_path, file_type)
|
||||||
|
assert isinstance(text_content, str)
|
||||||
self._write_text_file(target_path, text_content)
|
self._write_text_file(target_path, text_content)
|
||||||
file_size = self._get_file_size(target_path)
|
file_size = self._get_file_size(target_path)
|
||||||
record = await self._create_text_file_record(dataset, file_record, target_path, file_size)
|
record = await self._create_text_file_record(dataset, file_record, target_path, file_size, parser_name)
|
||||||
return self._build_response(dataset_id, file_id, record)
|
return self._build_response(dataset_id, file_id, record)
|
||||||
|
|
||||||
async def _get_dataset(self, dataset_id: str) -> Dataset:
|
async def _get_dataset(self, dataset_id: str) -> Dataset:
|
||||||
result = await self.db.execute(select(Dataset).where(Dataset.id == dataset_id))
|
result = await self.db.execute(select(Dataset).where(Dataset.id == dataset_id))
|
||||||
dataset = result.scalar_one_or_none()
|
dataset = result.scalar_one_or_none()
|
||||||
if not dataset:
|
if not dataset:
|
||||||
raise HTTPException(status_code=404, detail=f"Dataset not found: {dataset_id}")
|
raise HTTPException(status_code=404, detail=f"数据集不存在: {dataset_id}")
|
||||||
return dataset
|
return dataset
|
||||||
|
|
||||||
async def _get_file_record(self, dataset_id: str, file_id: str) -> DatasetFiles:
|
async def _get_file_record(self, dataset_id: str, file_id: str) -> DatasetFiles:
|
||||||
@@ -67,30 +82,30 @@ class PdfTextExtractService:
|
|||||||
)
|
)
|
||||||
file_record = result.scalar_one_or_none()
|
file_record = result.scalar_one_or_none()
|
||||||
if not file_record:
|
if not file_record:
|
||||||
raise HTTPException(status_code=404, detail=f"File not found: {file_id}")
|
raise HTTPException(status_code=404, detail=f"文件不存在: {file_id}")
|
||||||
return file_record
|
return file_record
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _validate_dataset_and_file(dataset: Dataset, file_record: DatasetFiles) -> None:
|
def _validate_dataset_and_file(dataset: Dataset, file_record: DatasetFiles) -> None:
|
||||||
dataset_type = str(getattr(dataset, "dataset_type", "") or "").upper()
|
dataset_type = str(getattr(dataset, "dataset_type", "") or "").upper()
|
||||||
if dataset_type != "TEXT":
|
if dataset_type != "TEXT":
|
||||||
raise HTTPException(status_code=400, detail="Only TEXT datasets are supported")
|
raise HTTPException(status_code=400, detail="仅支持文本类型数据集")
|
||||||
file_type = str(getattr(file_record, "file_type", "") or "").lower()
|
file_type = str(getattr(file_record, "file_type", "") or "").lower()
|
||||||
if file_type != PDF_FILE_TYPE:
|
if file_type not in SUPPORTED_FILE_TYPES:
|
||||||
raise HTTPException(status_code=400, detail="Only PDF files are supported")
|
raise HTTPException(status_code=400, detail="仅支持 PDF/DOC/DOCX 文件解析")
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _resolve_source_path(file_record: DatasetFiles) -> Path:
|
def _resolve_source_path(file_record: DatasetFiles) -> Path:
|
||||||
source_path = Path(str(file_record.file_path)).expanduser().resolve()
|
source_path = Path(str(file_record.file_path)).expanduser().resolve()
|
||||||
if not source_path.exists():
|
if not source_path.exists():
|
||||||
raise HTTPException(status_code=404, detail="PDF file not found on disk")
|
raise HTTPException(status_code=404, detail="源文件不存在")
|
||||||
return source_path
|
return source_path
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _resolve_dataset_path(dataset: Dataset) -> Path:
|
def _resolve_dataset_path(dataset: Dataset) -> Path:
|
||||||
dataset_path_value = str(getattr(dataset, "path", "") or "").strip()
|
dataset_path_value = str(getattr(dataset, "path", "") or "").strip()
|
||||||
if not dataset_path_value:
|
if not dataset_path_value:
|
||||||
raise HTTPException(status_code=500, detail="Dataset path is empty")
|
raise HTTPException(status_code=500, detail="数据集路径为空")
|
||||||
dataset_path = Path(dataset_path_value).expanduser().resolve()
|
dataset_path = Path(dataset_path_value).expanduser().resolve()
|
||||||
dataset_path.mkdir(parents=True, exist_ok=True)
|
dataset_path.mkdir(parents=True, exist_ok=True)
|
||||||
return dataset_path
|
return dataset_path
|
||||||
@@ -99,7 +114,9 @@ class PdfTextExtractService:
|
|||||||
def _build_output_filename(file_record: DatasetFiles, file_id: str) -> str:
|
def _build_output_filename(file_record: DatasetFiles, file_id: str) -> str:
|
||||||
original_name = str(getattr(file_record, "file_name", "") or "").strip()
|
original_name = str(getattr(file_record, "file_name", "") or "").strip()
|
||||||
if not original_name:
|
if not original_name:
|
||||||
original_name = f"{file_id}.pdf"
|
file_type = str(getattr(file_record, "file_type", "") or "").lower()
|
||||||
|
default_extension = DEFAULT_EXTENSION_BY_TYPE.get(file_type, f".{file_type}")
|
||||||
|
original_name = f"{file_id}{default_extension}"
|
||||||
return f"{original_name}{TEXT_FILE_EXTENSION}"
|
return f"{original_name}{TEXT_FILE_EXTENSION}"
|
||||||
|
|
||||||
def _resolve_target_path(
|
def _resolve_target_path(
|
||||||
@@ -116,7 +133,7 @@ class PdfTextExtractService:
|
|||||||
target_dir = dataset_path
|
target_dir = dataset_path
|
||||||
target_dir = target_dir.resolve()
|
target_dir = target_dir.resolve()
|
||||||
if target_dir != dataset_path and dataset_path not in target_dir.parents:
|
if target_dir != dataset_path and dataset_path not in target_dir.parents:
|
||||||
raise HTTPException(status_code=400, detail="Target path is outside dataset path")
|
raise HTTPException(status_code=400, detail="解析文件路径超出数据集目录")
|
||||||
target_dir.mkdir(parents=True, exist_ok=True)
|
target_dir.mkdir(parents=True, exist_ok=True)
|
||||||
return target_dir / output_name
|
return target_dir / output_name
|
||||||
|
|
||||||
@@ -130,11 +147,16 @@ class PdfTextExtractService:
|
|||||||
return result.scalar_one_or_none()
|
return result.scalar_one_or_none()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _parse_pdf(source_path: Path) -> str:
|
def _parse_document(source_path: Path, file_type: str) -> tuple[str, str]:
|
||||||
loader = PyPDFLoader(str(source_path))
|
if file_type == PDF_FILE_TYPE:
|
||||||
|
loader = PyPDFLoader(str(source_path))
|
||||||
|
parser_name = PARSER_BY_FILE_TYPE[PDF_FILE_TYPE]
|
||||||
|
else:
|
||||||
|
loader = Docx2txtLoader(str(source_path))
|
||||||
|
parser_name = PARSER_BY_FILE_TYPE.get(file_type, "Docx2txtLoader")
|
||||||
docs = loader.load()
|
docs = loader.load()
|
||||||
contents = [doc.page_content for doc in docs if doc.page_content]
|
contents = [doc.page_content for doc in docs if doc.page_content]
|
||||||
return "\n\n".join(contents)
|
return "\n\n".join(contents), parser_name
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _write_text_file(target_path: Path, content: str) -> None:
|
def _write_text_file(target_path: Path, content: str) -> None:
|
||||||
@@ -154,12 +176,14 @@ class PdfTextExtractService:
|
|||||||
source_file: DatasetFiles,
|
source_file: DatasetFiles,
|
||||||
target_path: Path,
|
target_path: Path,
|
||||||
file_size: int,
|
file_size: int,
|
||||||
|
parser_name: str,
|
||||||
) -> DatasetFiles:
|
) -> DatasetFiles:
|
||||||
|
assert parser_name
|
||||||
metadata = {
|
metadata = {
|
||||||
DERIVED_METADATA_KEY: str(getattr(source_file, "id", "")),
|
DERIVED_METADATA_KEY: str(getattr(source_file, "id", "")),
|
||||||
DERIVED_METADATA_NAME_KEY: str(getattr(source_file, "file_name", "")),
|
DERIVED_METADATA_NAME_KEY: str(getattr(source_file, "file_name", "")),
|
||||||
DERIVED_METADATA_TYPE_KEY: str(getattr(source_file, "file_type", "")),
|
DERIVED_METADATA_TYPE_KEY: str(getattr(source_file, "file_type", "")),
|
||||||
DERIVED_METADATA_PARSER_KEY: DERIVED_METADATA_PARSER_VALUE,
|
DERIVED_METADATA_PARSER_KEY: parser_name,
|
||||||
}
|
}
|
||||||
record = DatasetFiles(
|
record = DatasetFiles(
|
||||||
dataset_id=dataset.id, # type: ignore[arg-type]
|
dataset_id=dataset.id, # type: ignore[arg-type]
|
||||||
|
|||||||
Reference in New Issue
Block a user