You've already forked DataMate
* fix(chart): update Helm chart helpers and values for improved configuration * feat(SynthesisTaskTab): enhance task table with tooltip support and improved column widths * feat(CreateTask, SynthFileTask): improve task creation and detail view with enhanced payload handling and UI updates * feat(SynthFileTask): enhance file display with progress tracking and delete action * feat(SynthFileTask): enhance file display with progress tracking and delete action * feat(SynthDataDetail): add delete action for chunks with confirmation prompt * feat(SynthDataDetail): update edit and delete buttons to icon-only format * feat(SynthDataDetail): add confirmation modals for chunk and synthesis data deletion * feat(DocumentSplitter): add enhanced document splitting functionality with CJK support and metadata detection * feat(DataSynthesis): refactor data synthesis models and update task handling logic * feat(DataSynthesis): streamline synthesis task handling and enhance chunk processing logic * feat(DataSynthesis): refactor data synthesis models and update task handling logic * fix(generation_service): ensure processed chunks are incremented regardless of question generation success * feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options * feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options * feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options * feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options
94 lines
3.1 KiB
Python
94 lines
3.1 KiB
Python
from typing import List, Union, Optional
|
|
from pathlib import Path
|
|
|
|
from langchain_core.documents import Document
|
|
from langchain_community.document_loaders import (
|
|
TextLoader,
|
|
JSONLoader,
|
|
CSVLoader,
|
|
UnstructuredMarkdownLoader,
|
|
PyPDFLoader,
|
|
Docx2txtLoader
|
|
)
|
|
|
|
from app.core.logging import get_logger
|
|
|
|
# Module-level logger, created through the project's logging helper and keyed by this module's name.
log = get_logger(__name__)
|
|
|
|
class UniversalDocLoader:
    """
    Universal loader for text-like documents.

    Selects a LangChain document loader from the file suffix (or from an
    explicitly supplied format) and returns the loaded ``Document`` list.

    Registered formats: TXT / JSON / CSV / Markdown / Word (.docx, .doc) / PDF.
    PowerPoint (.pptx) is NOT registered in ``SUPPORTED_FORMATS``.
    """

    # Suffix -> loader class mapping (lightweight loaders preferred).
    SUPPORTED_FORMATS = {
        # Plain-text formats
        ".txt": TextLoader,
        ".json": JSONLoader,
        ".csv": CSVLoader,
        ".md": UnstructuredMarkdownLoader,
        # Office documents
        ".docx": Docx2txtLoader,
        # NOTE(review): Docx2txtLoader is a .docx loader; legacy binary .doc
        # files presumably fail at load time. Mapping kept for backward
        # compatibility -- confirm whether .doc should use another loader.
        ".doc": Docx2txtLoader,
        # PDF
        ".pdf": PyPDFLoader,
    }

    def __init__(self, file_path: Union[str, Path]):
        """
        Resolve the path, record its lower-cased suffix and validate it.

        :param file_path: path of the document to load
        :raises FileNotFoundError: if the resolved path does not exist
        :raises ValueError: if the suffix is not in ``SUPPORTED_FORMATS``
        """
        self.file_path = Path(file_path).resolve()
        self.file_suffix = self.file_path.suffix.lower()
        log.info(f"初始化文档加载器: {self.file_path} (格式: {self.file_suffix})")
        self._validate_file()

    def _validate_file(self) -> None:
        """
        Check that the file exists and that its suffix is supported.

        :raises FileNotFoundError: if ``self.file_path`` does not exist
        :raises ValueError: if ``self.file_suffix`` is not a registered format
        """
        if not self.file_path.exists():
            raise FileNotFoundError(f"文件不存在: {self.file_path}")
        if self.file_suffix not in self.SUPPORTED_FORMATS:
            raise ValueError(
                f"不支持的格式: {self.file_suffix} | 支持格式: {list(self.SUPPORTED_FORMATS.keys())}"
            )

    def load(
        self,
        file_format: Optional[str] = None,
        **loader_kwargs
    ) -> List[Document]:
        """
        Load the document and return a list of LangChain ``Document`` objects.

        :param file_format: explicit format override (e.g. ".pdf" or "pdf");
            when omitted the suffix detected at construction time is used
        :param loader_kwargs: extra keyword arguments forwarded to the concrete
            loader (e.g. ``jq_schema`` for ``JSONLoader``)
        :return: List[Document]
        :raises ValueError: if the requested format is not in
            ``SUPPORTED_FORMATS``
        """
        # Determine the target format. An explicit override is normalized so
        # that "PDF", " .pdf " and "pdf" all resolve to ".pdf".
        if file_format:
            target_format = file_format.strip().lower()
            if not target_format.startswith("."):
                target_format = f".{target_format}"
        else:
            target_format = self.file_suffix

        # Fail with the same descriptive error as _validate_file instead of
        # leaking a bare KeyError from the dict lookup.
        loader_cls = self.SUPPORTED_FORMATS.get(target_format)
        if loader_cls is None:
            raise ValueError(
                f"不支持的格式: {target_format} | 支持格式: {list(self.SUPPORTED_FORMATS.keys())}"
            )

        # Fill in per-loader default arguments.
        loader_kwargs = self._set_default_kwargs(loader_cls, loader_kwargs)

        # Instantiate the loader and load the document.
        loader = loader_cls(str(self.file_path), **loader_kwargs)
        return loader.load()

    @staticmethod
    def _set_default_kwargs(loader_cls, kwargs: dict) -> dict:
        """
        Return ``kwargs`` completed with per-loader defaults.

        The input dict is copied, so the caller's mapping is never mutated.

        :param loader_cls: loader class the arguments are destined for
        :param kwargs: keyword arguments supplied by the caller
        :return: a new dict holding the caller's arguments plus defaults
        """
        merged = dict(kwargs)
        # JSONLoader: defaults are applied only when the caller gave no
        # jq_schema; an explicit text_content is still respected.
        if loader_cls is JSONLoader and "jq_schema" not in merged:
            merged["jq_schema"] = "."
            merged.setdefault("text_content", False)
        # CSVLoader: default to comma-delimited input.
        if loader_cls is CSVLoader and "csv_args" not in merged:
            merged["csv_args"] = {"delimiter": ","}
        return merged
|
|
|
|
|
|
# 文档加载器便捷函数
|
|
def load_documents(
    file_path: Union[str, Path],
    file_format: Optional[str] = None,
    **loader_kwargs
) -> List[Document]:
    """
    Convenience wrapper: build a ``UniversalDocLoader`` and load in one call.

    :param file_path: path of the document to load
    :param file_format: optional explicit format (e.g. ".pdf"); when omitted
        the loader uses the file's own suffix
    :param loader_kwargs: extra keyword arguments forwarded to the loader
    :return: List[Document]
    :raises FileNotFoundError: if the file does not exist (raised while the
        loader is constructed)
    :raises ValueError: if the file suffix is not a supported format (raised
        while the loader is constructed)
    """
    return UniversalDocLoader(file_path).load(
        file_format=file_format,
        **loader_kwargs,
    )
|