You've already forked DataMate
feat: enhance dataset file fetching with improved pagination and document loading support (#156)
This commit is contained in:
3
runtime/datamate-python/app/common/__init__.py
Normal file
3
runtime/datamate-python/app/common/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
"""
|
||||
公共模块
|
||||
"""
|
||||
93
runtime/datamate-python/app/common/document_loaders.py
Normal file
93
runtime/datamate-python/app/common/document_loaders.py
Normal file
@@ -0,0 +1,93 @@
|
||||
from typing import List, Union, Optional
|
||||
from pathlib import Path
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_community.document_loaders import (
|
||||
TextLoader,
|
||||
JSONLoader,
|
||||
CSVLoader,
|
||||
UnstructuredMarkdownLoader,
|
||||
PyPDFLoader,
|
||||
Docx2txtLoader
|
||||
)
|
||||
|
||||
from app.core.logging import get_logger
|
||||
|
||||
log = get_logger(__name__)
|
||||
|
||||
class UniversalDocLoader:
    """
    Universal loader for text-like documents.

    Selects a LangChain document loader class from the file extension (or
    from an explicitly supplied format) and returns the loaded documents.

    Supported formats: TXT / JSON / CSV / Markdown / Word (.docx, .doc) / PDF
    """

    # Extension -> loader class mapping (lightweight loaders preferred)
    SUPPORTED_FORMATS = {
        # Plain-text formats
        ".txt": TextLoader,
        ".json": JSONLoader,
        ".csv": CSVLoader,
        ".md": UnstructuredMarkdownLoader,
        # Office documents
        ".docx": Docx2txtLoader,
        # NOTE(review): docx2txt targets the .docx format; legacy binary
        # .doc files presumably fail to parse with this loader — confirm.
        ".doc": Docx2txtLoader,
        # PDF
        ".pdf": PyPDFLoader
    }

    def __init__(self, file_path: Union[str, Path]):
        """
        Resolve the path, record its lower-cased suffix and validate both.

        :param file_path: path of the document to load
        :raises FileNotFoundError: if the resolved path does not exist
        :raises ValueError: if the suffix is not in ``SUPPORTED_FORMATS``
        """
        self.file_path = Path(file_path).resolve()
        self.file_suffix = self.file_path.suffix.lower()
        log.info(f"初始化文档加载器: {self.file_path} (格式: {self.file_suffix})")
        self._validate_file()

    def _validate_file(self) -> None:
        """Check that the file exists and that its suffix is supported."""
        if not self.file_path.exists():
            raise FileNotFoundError(f"文件不存在: {self.file_path}")
        if self.file_suffix not in self.SUPPORTED_FORMATS:
            raise ValueError(
                f"不支持的格式: {self.file_suffix} | 支持格式: {list(self.SUPPORTED_FORMATS.keys())}"
            )

    @staticmethod
    def _normalize_format(file_format: str) -> str:
        """Return ``file_format`` stripped, lower-cased and with a leading dot."""
        normalized = file_format.strip().lower()
        if normalized and not normalized.startswith("."):
            normalized = f".{normalized}"
        return normalized

    def load(
        self,
        file_format: Optional[str] = None,
        **loader_kwargs
    ) -> List[Document]:
        """
        Load the document and return a list of LangChain ``Document`` objects.

        :param file_format: explicit format such as ".pdf" (a missing leading
            dot is tolerated); when omitted the file suffix is used
        :param loader_kwargs: extra arguments forwarded to the concrete loader
            (for example ``jq_schema`` for ``JSONLoader``)
        :return: List[Document]
        :raises ValueError: if the requested format is not supported
        """
        # Determine the target format
        if file_format:
            target_format = self._normalize_format(file_format)
        else:
            target_format = self.file_suffix

        # Report an unsupported override the same way _validate_file does,
        # instead of leaking a bare KeyError from the mapping lookup.
        loader_cls = self.SUPPORTED_FORMATS.get(target_format)
        if loader_cls is None:
            raise ValueError(
                f"不支持的格式: {target_format} | 支持格式: {list(self.SUPPORTED_FORMATS.keys())}"
            )

        # Fill in per-loader default arguments
        loader_kwargs = self._set_default_kwargs(loader_cls, loader_kwargs)

        # Instantiate the loader and load
        loader = loader_cls(str(self.file_path), **loader_kwargs)
        return loader.load()

    @staticmethod
    def _set_default_kwargs(loader_cls, kwargs: dict) -> dict:
        """
        Return a copy of ``kwargs`` with per-loader defaults filled in.

        The input dict is not modified. Values supplied by the caller always
        take precedence over the defaults.
        """
        merged = dict(kwargs)
        if loader_cls is JSONLoader and "jq_schema" not in merged:
            # Without an explicit schema, select the whole JSON value and
            # allow non-string content unless the caller said otherwise.
            merged["jq_schema"] = "."
            merged.setdefault("text_content", False)
        if loader_cls is CSVLoader and "csv_args" not in merged:
            merged["csv_args"] = {"delimiter": ","}
        if loader_cls is TextLoader:
            # Read text as UTF-8 by default so decoding does not depend on
            # the platform's locale encoding.
            merged.setdefault("encoding", "utf-8")
        return merged
|
||||
|
||||
|
||||
# 文档加载器便捷函数
|
||||
def load_documents(
    file_path: Union[str, Path],
    file_format: Optional[str] = None,
    **loader_kwargs
) -> List[Document]:
    """
    Convenience wrapper: load a document in a single call.

    Builds a ``UniversalDocLoader`` for ``file_path`` and returns the result
    of its ``load`` method.

    :param file_path: path of the document to load
    :param file_format: optional explicit format forwarded to ``load``
    :param loader_kwargs: extra keyword arguments forwarded to ``load``
    :return: the list of documents produced by ``UniversalDocLoader.load``
    """
    return UniversalDocLoader(file_path).load(
        file_format=file_format,
        **loader_kwargs
    )
|
||||
0
runtime/datamate-python/app/common/text_split.py
Normal file
0
runtime/datamate-python/app/common/text_split.py
Normal file
@@ -1,20 +1,8 @@
|
||||
import asyncio
|
||||
import uuid
|
||||
import json
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from langchain_community.document_loaders import (
|
||||
TextLoader,
|
||||
CSVLoader,
|
||||
JSONLoader,
|
||||
UnstructuredMarkdownLoader,
|
||||
UnstructuredHTMLLoader,
|
||||
UnstructuredFileLoader,
|
||||
PyPDFLoader,
|
||||
UnstructuredWordDocumentLoader,
|
||||
UnstructuredPowerPointLoader,
|
||||
UnstructuredExcelLoader,
|
||||
)
|
||||
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
@@ -30,6 +18,7 @@ from app.db.models.model_config import get_model_by_id
|
||||
from app.db.session import logger
|
||||
from app.module.shared.util.model_chat import _extract_json_substring
|
||||
from app.module.system.service.common_service import get_chat_client, chat
|
||||
from app.common.document_loaders import load_documents
|
||||
|
||||
|
||||
class GenerationService:
|
||||
@@ -250,8 +239,7 @@ class GenerationService:
|
||||
|
||||
保留每个 Document 的 metadata,方便后续追加例如文件ID、chunk序号等信息。
|
||||
"""
|
||||
loader = self._build_loader(file_path)
|
||||
docs = loader.load()
|
||||
docs = load_documents(file_path)
|
||||
|
||||
splitter = RecursiveCharacterTextSplitter(
|
||||
chunk_size=chunk_size,
|
||||
@@ -262,67 +250,6 @@ class GenerationService:
|
||||
split_docs = splitter.split_documents(docs)
|
||||
return split_docs
|
||||
|
||||
@staticmethod
def _build_loader(file_path: str):
    """Pick a LangChain loader for ``file_path`` based on its extension.

    Known extensions are dispatched to a dedicated loader. Any other
    extension tries ``UnstructuredFileLoader`` and falls back to a UTF-8
    ``TextLoader`` if constructing it raises.
    """
    source = Path(file_path)
    ext = source.suffix.lower()
    location = str(source)

    def _plain_text():
        # Shared by the plain-text group, YAML and the final fallback.
        return TextLoader(location, encoding="utf-8")

    # Each entry: (extensions, zero-argument factory building the loader).
    # Factories are lazy so only the selected loader is ever constructed.
    dispatch_groups = (
        # Plain text; "" covers files without an extension
        ((".txt", "", ".log"), _plain_text),
        # Markdown
        ((".md", ".markdown"), lambda: UnstructuredMarkdownLoader(location)),
        # HTML / HTM
        ((".html", ".htm"), lambda: UnstructuredHTMLLoader(location)),
        # JSON, loaded with the whole-document jq schema
        ((".json",), lambda: JSONLoader(file_path=location, jq_schema=".")),
        # CSV / TSV
        ((".csv", ".tsv"), lambda: CSVLoader(file_path=location)),
        # YAML is loaded as plain text for now
        ((".yaml", ".yml"), _plain_text),
        # PDF
        ((".pdf",), lambda: PyPDFLoader(location)),
        # Word documents
        ((".docx", ".doc"), lambda: UnstructuredWordDocumentLoader(location)),
        # PowerPoint
        ((".ppt", ".pptx"), lambda: UnstructuredPowerPointLoader(location)),
        # Excel
        ((".xls", ".xlsx"), lambda: UnstructuredExcelLoader(location)),
    )

    factories = {}
    for extensions, factory in dispatch_groups:
        factories.update(dict.fromkeys(extensions, factory))

    chosen = factories.get(ext)
    if chosen is not None:
        return chosen()

    # Fallback for unrecognised extensions
    try:
        return UnstructuredFileLoader(location)
    except Exception:
        return _plain_text()
|
||||
|
||||
@staticmethod
|
||||
def _build_qa_prompt(chunk: str, synthesis_cfg: dict) -> str:
|
||||
|
||||
Reference in New Issue
Block a user