feat: enhance dataset file fetching with improved pagination and document loading support (#156)

This commit is contained in:
Dallas98
2025-12-10 22:39:24 +08:00
committed by GitHub
parent e9fd6a3ae1
commit 2f3ae21f8a
7 changed files with 158 additions and 112 deletions

View File

@@ -0,0 +1,93 @@
from typing import List, Union, Optional
from pathlib import Path
from langchain_core.documents import Document
from langchain_community.document_loaders import (
TextLoader,
JSONLoader,
CSVLoader,
UnstructuredMarkdownLoader,
PyPDFLoader,
Docx2txtLoader
)
from app.core.logging import get_logger
# Module-level logger obtained from the project's logging helper, named after this module.
log = get_logger(__name__)
class UniversalDocLoader:
    """Load text-like documents into LangChain ``Document`` objects.

    The concrete loader is chosen from ``SUPPORTED_FORMATS`` by file
    extension: TXT / JSON / CSV / Markdown / Word / PDF.  PowerPoint
    (``.pptx``) has no entry in the mapping and is rejected as unsupported.
    """

    # Extension -> loader class mapping (lightweight loaders preferred).
    SUPPORTED_FORMATS = {
        # Plain-text formats
        ".txt": TextLoader,
        ".json": JSONLoader,
        ".csv": CSVLoader,
        ".md": UnstructuredMarkdownLoader,
        # Office documents
        ".docx": Docx2txtLoader,
        # NOTE(review): Docx2txtLoader presumably only understands the
        # zip-based .docx container; legacy binary .doc files may fail at
        # load time -- confirm, or route .doc to a different loader.
        ".doc": Docx2txtLoader,
        # PDF
        ".pdf": PyPDFLoader,
    }

    def __init__(self, file_path: Union[str, Path]):
        """Resolve *file_path*, record its lower-cased suffix and validate it.

        :param file_path: path of the document to load
        :raises FileNotFoundError: if the resolved path does not exist
        :raises ValueError: if the file suffix is not in ``SUPPORTED_FORMATS``
        """
        self.file_path = Path(file_path).resolve()
        self.file_suffix = self.file_path.suffix.lower()
        log.info(f"初始化文档加载器: {self.file_path} (格式: {self.file_suffix})")
        self._validate_file()

    @classmethod
    def _unsupported_format_error(cls, fmt: str) -> ValueError:
        """Build the error raised whenever *fmt* has no registered loader."""
        return ValueError(
            f"不支持的格式: {fmt} | 支持格式: {list(cls.SUPPORTED_FORMATS.keys())}"
        )

    def _validate_file(self) -> None:
        """Check that the file exists and that its suffix has a loader.

        :raises FileNotFoundError: if ``self.file_path`` does not exist
        :raises ValueError: if ``self.file_suffix`` is not supported
        """
        if not self.file_path.exists():
            raise FileNotFoundError(f"文件不存在: {self.file_path}")
        if self.file_suffix not in self.SUPPORTED_FORMATS:
            raise self._unsupported_format_error(self.file_suffix)

    def load(
        self,
        file_format: Optional[str] = None,
        **loader_kwargs
    ) -> List[Document]:
        """Load the document and return a list of LangChain ``Document``.

        :param file_format: explicit format override such as ``".pdf"``
            (a missing leading dot and surrounding whitespace are tolerated);
            when empty or ``None`` the file's own suffix is used
        :param loader_kwargs: extra arguments forwarded to the concrete
            loader (e.g. ``jq_schema`` for ``JSONLoader``)
        :return: the documents produced by the selected loader
        :raises ValueError: if the requested format has no registered loader
        """
        # Normalise the override so "pdf", " .PDF " and ".pdf" are equivalent.
        requested = (file_format or "").strip().lower()
        if requested and not requested.startswith("."):
            requested = f".{requested}"
        target_format = requested or self.file_suffix

        # Fail with the same descriptive error as _validate_file instead of
        # leaking a bare KeyError for an unknown override.
        loader_cls = self.SUPPORTED_FORMATS.get(target_format)
        if loader_cls is None:
            raise self._unsupported_format_error(target_format)

        # Fill in per-loader defaults, then instantiate and load.
        effective_kwargs = self._set_default_kwargs(loader_cls, loader_kwargs)
        loader = loader_cls(str(self.file_path), **effective_kwargs)
        return loader.load()

    @staticmethod
    def _set_default_kwargs(loader_cls, kwargs: dict) -> dict:
        """Return *kwargs* completed with defaults for the given loader class.

        The input dict is not modified; a new dict is returned.

        :param loader_cls: loader class the arguments are intended for
        :param kwargs: caller-supplied loader arguments
        :return: a copy of *kwargs* with loader-specific defaults added
        """
        merged = dict(kwargs)
        if loader_cls is JSONLoader and "jq_schema" not in merged:
            merged["jq_schema"] = "."
            # text_content is only defaulted together with the default
            # schema; an explicit jq_schema leaves the loader's own default.
            merged.setdefault("text_content", False)
        if loader_cls is CSVLoader and "csv_args" not in merged:
            merged["csv_args"] = {"delimiter": ","}
        return merged
# Convenience wrapper around UniversalDocLoader
def load_documents(
    file_path: Union[str, Path],
    file_format: Optional[str] = None,
    **loader_kwargs
) -> List[Document]:
    """Build a ``UniversalDocLoader`` for *file_path* and load it in one call.

    :param file_path: path of the document to load
    :param file_format: optional explicit format passed through to ``load``
    :param loader_kwargs: extra arguments passed through to ``load``
    :return: the list of documents returned by ``UniversalDocLoader.load``
    """
    return UniversalDocLoader(file_path).load(
        file_format=file_format,
        **loader_kwargs,
    )