feat: optimize the question generation process and COT data generation (#169)

* fix(chart): update Helm chart helpers and values for improved configuration
* feat(SynthesisTaskTab): enhance task table with tooltip support and improved column widths
* feat(CreateTask, SynthFileTask): improve task creation and detail view with enhanced payload handling and UI updates
* feat(SynthFileTask): enhance file display with progress tracking and delete action
* feat(SynthDataDetail): add delete action for chunks with confirmation prompt
* feat(SynthDataDetail): update edit and delete buttons to icon-only format
* feat(SynthDataDetail): add confirmation modals for chunk and synthesis data deletion
* feat(DocumentSplitter): add enhanced document splitting functionality with CJK support and metadata detection
* feat(DataSynthesis): refactor data synthesis models and update task handling logic
* feat(DataSynthesis): streamline synthesis task handling and enhance chunk processing logic
* fix(generation_service): ensure processed chunks are incremented regardless of question generation success
* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options
@@ -0,0 +1,3 @@
"""
Common module
"""
@@ -0,0 +1,93 @@
from typing import List, Union, Optional
from pathlib import Path

from langchain_core.documents import Document
from langchain_community.document_loaders import (
    TextLoader,
    JSONLoader,
    CSVLoader,
    UnstructuredMarkdownLoader,
    PyPDFLoader,
    Docx2txtLoader
)

from app.core.logging import get_logger

log = get_logger(__name__)


class UniversalDocLoader:
    """
    Universal plain-text document loader.
    Supported formats: TXT / JSON / CSV / Markdown / Word (.docx) / PDF
    """
    # Format-to-loader mapping (lightweight loaders preferred)
    SUPPORTED_FORMATS = {
        # Plain-text formats
        ".txt": TextLoader,
        ".json": JSONLoader,
        ".csv": CSVLoader,
        ".md": UnstructuredMarkdownLoader,
        # Office documents
        ".docx": Docx2txtLoader,
        ".doc": Docx2txtLoader,  # caveat: docx2txt only parses .docx; legacy binary .doc files may fail
        # PDF
        ".pdf": PyPDFLoader
    }

    def __init__(self, file_path: Union[str, Path]):
        self.file_path = Path(file_path).resolve()
        self.file_suffix = self.file_path.suffix.lower()
        log.info(f"Initializing document loader: {self.file_path} (format: {self.file_suffix})")
        self._validate_file()

    def _validate_file(self) -> None:
        """Validate that the file exists and that its format is supported."""
        if not self.file_path.exists():
            raise FileNotFoundError(f"File not found: {self.file_path}")
        if self.file_suffix not in self.SUPPORTED_FORMATS:
            raise ValueError(
                f"Unsupported format: {self.file_suffix} | supported formats: {list(self.SUPPORTED_FORMATS.keys())}"
            )

    def load(
        self,
        file_format: Optional[str] = None,
        **loader_kwargs
    ) -> List[Document]:
        """
        Load the document and return a list of LangChain Documents.
        :param file_format: explicit format override (e.g. ".pdf"); auto-detected by default
        :param loader_kwargs: extra arguments forwarded to the underlying loader (e.g. jq_schema for JSONLoader)
        :return: List[Document]
        """
        # Determine the target format; an explicit override may differ from the validated suffix
        target_format = file_format.lower() if file_format else self.file_suffix
        if target_format not in self.SUPPORTED_FORMATS:
            raise ValueError(f"Unsupported format: {target_format}")
        loader_cls = self.SUPPORTED_FORMATS[target_format]

        # Fill in sensible per-loader default arguments
        loader_kwargs = self._set_default_kwargs(loader_cls, loader_kwargs)

        # Instantiate the loader and load
        loader = loader_cls(str(self.file_path), **loader_kwargs)
        return loader.load()

    @staticmethod
    def _set_default_kwargs(loader_cls, kwargs: dict) -> dict:
        """Set default arguments for the individual loaders to simplify call sites."""
        if loader_cls == JSONLoader:
            kwargs.setdefault("jq_schema", ".")
            kwargs.setdefault("text_content", False)
        if loader_cls == CSVLoader:
            kwargs.setdefault("csv_args", {"delimiter": ","})
        return kwargs


# Convenience function for document loading
def load_documents(
    file_path: Union[str, Path],
    file_format: Optional[str] = None,
    **loader_kwargs
) -> List[Document]:
    """Load a document in a single call."""
    loader = UniversalDocLoader(file_path)
    return loader.load(file_format=file_format, **loader_kwargs)
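A minimal usage sketch for the loader above. The import path is an assumption based on the sibling text_split.py path shown in this diff (the loader file's own path is not shown), and the file names are illustrative:

# Sketch only: module path and file names are assumptions, not part of this commit.
from app.module.shared.common.doc_loader import UniversalDocLoader, load_documents

# One-call convenience path: the format is auto-detected from the extension.
docs = load_documents("data/annual_report.pdf")

# Class-based path with a loader-specific override (jq_schema for JSONLoader).
loader = UniversalDocLoader("data/records.json")
json_docs = loader.load(jq_schema=".items[]")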
runtime/datamate-python/app/module/shared/common/text_split.py (new file, 169 lines)
@@ -0,0 +1,169 @@
import os
from typing import List, Optional, Tuple

from langchain_core.documents import Document
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    MarkdownHeaderTextSplitter
)


class DocumentSplitter:
    """
    Document splitter (enhanced): prefers metadata when detecting the document type.

    Key features:
    1. Detects Markdown primarily from the file extension in metadata["source"]
    2. Falls back to treating the document as plain text when metadata is missing
    3. Optimized handling for CJK (Chinese/Japanese/Korean) text
    """

    def __init__(
        self,
        chunk_size: int = 2000,
        chunk_overlap: int = 200,
        is_cjk_language: bool = True,
        markdown_headers: Optional[List[Tuple[str, str]]] = None
    ):
        """
        Initialize the document splitter.

        Args:
            chunk_size: maximum length of each text chunk (default 2000 characters)
            chunk_overlap: overlap between adjacent chunks (default 200 characters)
            is_cjk_language: whether the text is in a language without word boundaries, such as CJK (default True)
            markdown_headers: Markdown header split rules (default: #/##/###/####)
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.is_cjk_language = is_cjk_language

        # Default Markdown header split rules
        self.markdown_headers = markdown_headers or [
            ("#", "header_1"),
            ("##", "header_2"),
            ("###", "header_3"),
            ("####", "header_4"),
        ]

        # Initialize the base text splitter
        self.text_splitter = self._create_text_splitter()

    def _create_text_splitter(self) -> RecursiveCharacterTextSplitter:
        """Create the recursive character splitter (internal)."""
        # Optimized CJK separator list (syntax error fixed, priorities adjusted)
        if self.is_cjk_language:
            separators = [
                "\n\n", "\n",   # paragraph / line break (highest priority)
                "。", ".",      # full stop (CJK / ASCII)
                "！", "!",      # exclamation mark (CJK / ASCII)
                "？", "?",      # question mark (CJK / ASCII)
                "；", ";",      # semicolon (CJK / ASCII)
                "，", ",",      # comma (CJK / ASCII)
                "、",           # enumeration comma (CJK)
                "：", ":",      # colon (CJK / ASCII)
                " ",            # space
                "\u200b", "",   # zero-width space / last resort
            ]
        else:
            separators = ["\n\n", "\n", " ", ".", "!", "?", ";", ":", ",", ""]

        return RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=separators,
            length_function=len,
            is_separator_regex=False
        )

    @staticmethod
    def _is_markdown(doc: Document) -> bool:
        """
        Detect Markdown from metadata first.
        Rule: check whether the extension of metadata["source"] is .md/.markdown/.mdx etc.
        """
        # Read the source field (case-insensitive)
        source = doc.metadata.get("source", "").lower()
        if not source:
            return False

        # Extract the file extension
        ext = os.path.splitext(source)[-1].lower()
        # Common Markdown extensions
        md_ext = [".md", ".markdown", ".mdx", ".mkd", ".mkdown"]
        return ext in md_ext

    def split(self, documents: List[Document], is_markdown: bool = False) -> List[Document]:
        """
        Core split method.

        Args:
            documents: the Documents to split
            is_markdown: whether the documents are Markdown (default False)

        Returns:
            the split Documents
        """
        if not documents:
            return []

        # Markdown: split by headers first, then by characters
        if is_markdown:
            # Initialize the Markdown header splitter
            md_splitter = MarkdownHeaderTextSplitter(
                headers_to_split_on=self.markdown_headers,
                strip_headers=True,
                return_each_line=False
            )

            # Split by headers, inheriting the source document's metadata
            md_chunks = []
            for doc in documents:
                chunks = md_splitter.split_text(doc.page_content)
                for chunk in chunks:
                    chunk.metadata.update(doc.metadata)
                md_chunks.extend(chunks)

            # Character-split the header-level chunks
            final_chunks = self.text_splitter.split_documents(md_chunks)

        # Plain text: split directly
        else:
            final_chunks = self.text_splitter.split_documents(documents)

        return final_chunks

    # Core auto-split entry point (metadata first)
    @classmethod
    def auto_split(
        cls,
        documents: List[Document],
        chunk_size: int = 2000,
        chunk_overlap: int = 200
    ) -> List[Document]:
        """
        Minimal convenience method: auto-detects the document type and splits (metadata first).
        Takes only three arguments; no need to instantiate the class yourself.

        Args:
            documents: the Documents to split
            chunk_size: maximum length of each text chunk (default 2000 characters)
            chunk_overlap: overlap between adjacent chunks (default 200 characters)

        Returns:
            the split Documents
        """
        if not documents:
            return []

        # Instantiate a splitter (with the CJK-optimized defaults)
        splitter = cls(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            is_cjk_language=True
        )

        # Auto-detect the document type from the first document's metadata
        is_md = splitter._is_markdown(documents[0])

        # Split according to the detection result
        return splitter.split(documents, is_markdown=is_md)
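A minimal end-to-end sketch combining the loader and splitter from this commit. The text_split import follows the path shown above; the doc_loader import path and the file name are assumptions for illustration:

# Sketch only: doc_loader path and input file are assumptions, not part of this commit.
from app.module.shared.common.doc_loader import load_documents
from app.module.shared.common.text_split import DocumentSplitter

docs = load_documents("docs/guide.md")

# One-call path: Markdown is auto-detected from metadata["source"].
chunks = DocumentSplitter.auto_split(docs, chunk_size=1000, chunk_overlap=100)

# Explicit path: reuse one configured splitter across many documents.
splitter = DocumentSplitter(chunk_size=1000, chunk_overlap=100, is_cjk_language=True)
chunks = splitter.split(docs, is_markdown=True)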