feat: optimize the question generation process and CoT data generation (#169)

* fix(chart): update Helm chart helpers and values for improved configuration

* feat(SynthesisTaskTab): enhance task table with tooltip support and improved column widths

* feat(CreateTask, SynthFileTask): improve task creation and detail view with enhanced payload handling and UI updates

* feat(SynthFileTask): enhance file display with progress tracking and delete action

* feat(SynthDataDetail): add delete action for chunks with confirmation prompt

* feat(SynthDataDetail): update edit and delete buttons to icon-only format

* feat(SynthDataDetail): add confirmation modals for chunk and synthesis data deletion

* feat(DocumentSplitter): add enhanced document splitting functionality with CJK support and metadata detection

* feat(DataSynthesis): refactor data synthesis models and update task handling logic

* feat(DataSynthesis): streamline synthesis task handling and enhance chunk processing logic

* fix(generation_service): ensure processed chunks are incremented regardless of question generation success

* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options

Dallas98 · 2025-12-18 16:51:18 +08:00 · committed by GitHub
commit e0e9b1d94d, parent 761f7f6a51
14 changed files with 1362 additions and 571 deletions

View File

@@ -0,0 +1,3 @@
"""
公共模块
"""

View File

@@ -0,0 +1,93 @@
from typing import List, Union, Optional
from pathlib import Path

from langchain_core.documents import Document
from langchain_community.document_loaders import (
    TextLoader,
    JSONLoader,
    CSVLoader,
    UnstructuredMarkdownLoader,
    PyPDFLoader,
    Docx2txtLoader
)

from app.core.logging import get_logger

log = get_logger(__name__)


class UniversalDocLoader:
    """
    Universal plain-text document loader.
    Supported formats: TXT / JSON / CSV / Markdown / Word (.docx, .doc) / PDF
    """

    # Format-to-loader mapping (lightweight loaders preferred)
    SUPPORTED_FORMATS = {
        # Plain-text formats
        ".txt": TextLoader,
        ".json": JSONLoader,
        ".csv": CSVLoader,
        ".md": UnstructuredMarkdownLoader,
        # Office documents
        ".docx": Docx2txtLoader,
        ".doc": Docx2txtLoader,
        # PDF
        ".pdf": PyPDFLoader
    }

    def __init__(self, file_path: Union[str, Path]):
        self.file_path = Path(file_path).resolve()
        self.file_suffix = self.file_path.suffix.lower()
        log.info(f"Initializing document loader: {self.file_path} (format: {self.file_suffix})")
        self._validate_file()

    def _validate_file(self) -> None:
        """Validate that the file exists and that its format is supported."""
        if not self.file_path.exists():
            raise FileNotFoundError(f"File not found: {self.file_path}")
        if self.file_suffix not in self.SUPPORTED_FORMATS:
            raise ValueError(
                f"Unsupported format: {self.file_suffix} | supported formats: {list(self.SUPPORTED_FORMATS.keys())}"
            )

    def load(
        self,
        file_format: Optional[str] = None,
        **loader_kwargs
    ) -> List[Document]:
        """
        Load the document and return a list of LangChain Documents.
        :param file_format: explicitly specified format (e.g. ".pdf"); auto-detected from the extension by default
        :param loader_kwargs: extra arguments passed to the underlying loader (e.g. jq_schema for JSONLoader)
        :return: List[Document]
        """
        # Determine the target format
        target_format = file_format.lower() if file_format else self.file_suffix
        if target_format not in self.SUPPORTED_FORMATS:
            raise ValueError(
                f"Unsupported format: {target_format} | supported formats: {list(self.SUPPORTED_FORMATS.keys())}"
            )
        loader_cls = self.SUPPORTED_FORMATS[target_format]
        # Apply sensible default arguments for the chosen loader
        loader_kwargs = self._set_default_kwargs(loader_cls, loader_kwargs)
        # Instantiate the loader and load the file
        loader = loader_cls(str(self.file_path), **loader_kwargs)
        return loader.load()

    @staticmethod
    def _set_default_kwargs(loader_cls, kwargs: dict) -> dict:
        """Set per-loader default arguments to simplify call sites."""
        if loader_cls == JSONLoader and "jq_schema" not in kwargs:
            kwargs.setdefault("jq_schema", ".")
            kwargs.setdefault("text_content", False)
        if loader_cls == CSVLoader and "csv_args" not in kwargs:
            kwargs["csv_args"] = {"delimiter": ","}
        return kwargs


# Convenience function for the document loader
def load_documents(
    file_path: Union[str, Path],
    file_format: Optional[str] = None,
    **loader_kwargs
) -> List[Document]:
    """Convenience helper for loading a document in one call."""
    loader = UniversalDocLoader(file_path)
    return loader.load(file_format=file_format, **loader_kwargs)

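A minimal usage sketch for the loader above (illustrative only; the import path app.common.doc_loader and the sample file paths are assumptions, not part of this commit):

from app.common.doc_loader import load_documents  # import path is an assumption

# Format is auto-detected from the file extension; returns LangChain Documents
docs = load_documents("./docs/handbook.pdf")
print(len(docs), docs[0].metadata.get("source"))

# Loader-specific options pass straight through, e.g. a jq filter for JSON files
json_docs = load_documents("./data/records.json", jq_schema=".records[]", text_content=False)
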
View File

@@ -0,0 +1,169 @@
import os
from typing import List, Optional, Tuple

from langchain_core.documents import Document
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    MarkdownHeaderTextSplitter
)


class DocumentSplitter:
    """
    Document splitter - enhanced version that identifies the document type from metadata first.
    Core features:
    1. Detect Markdown primarily from the file extension in the metadata "source" field
    2. Fall back to content-based detection when metadata is missing
    3. Optimized splitting for CJK (Chinese/Japanese/Korean) languages
    """

    def __init__(
        self,
        chunk_size: int = 2000,
        chunk_overlap: int = 200,
        is_cjk_language: bool = True,
        markdown_headers: Optional[List[Tuple[str, str]]] = None
    ):
        """
        Initialize the document splitter.
        Args:
            chunk_size: maximum length of each text chunk (default 2000 characters)
            chunk_overlap: overlap between adjacent chunks (default 200 characters)
            is_cjk_language: whether the text is a CJK language without word boundaries (default True)
            markdown_headers: Markdown header split rules (default: #/##/###/####)
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.is_cjk_language = is_cjk_language
        # Default Markdown header split rules
        self.markdown_headers = markdown_headers or [
            ("#", "header_1"),
            ("##", "header_2"),
            ("###", "header_3"),
            ("####", "header_4"),
        ]
        # Initialize the underlying character splitter
        self.text_splitter = self._create_text_splitter()

    def _create_text_splitter(self) -> RecursiveCharacterTextSplitter:
        """Create the recursive character splitter (internal)."""
        # CJK separator list, ordered by priority
        if self.is_cjk_language:
            separators = [
                "\n\n", "\n",    # paragraph / line break (highest priority)
                "。", ".",       # period (Chinese / English)
                "！", "!",       # exclamation mark (Chinese / English)
                "？", "?",       # question mark (Chinese / English)
                "；", ";",       # semicolon (Chinese / English)
                "，", ",",       # comma (Chinese / English)
                "、",            # enumeration comma (Chinese)
                "：", ":",       # colon (Chinese / English)
                " ",             # space
                "\u200b", "",    # zero-width space / character-level fallback
            ]
        else:
            separators = ["\n\n", "\n", " ", ".", "!", "?", ";", ":", ",", ""]
        return RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=separators,
            length_function=len,
            is_separator_regex=False
        )

    @staticmethod
    def _is_markdown(doc: Document) -> bool:
        """
        Decide whether a document is Markdown based on its metadata.
        Rule: check whether the extension of the metadata "source" field is .md/.markdown/.mdx, etc.
        """
        # Read the source field (case-insensitive)
        source = doc.metadata.get("source", "").lower()
        if not source:
            return False
        # Extract the file extension
        ext = os.path.splitext(source)[-1].lower()
        # Common Markdown extensions
        md_ext = [".md", ".markdown", ".mdx", ".mkd", ".mkdown"]
        return ext in md_ext

    def split(self, documents: List[Document], is_markdown: bool = False) -> List[Document]:
        """
        Core split method.
        Args:
            documents: list of Documents to split
            is_markdown: whether the documents are Markdown (default False)
        Returns:
            List of split Documents
        """
        if not documents:
            return []
        # Markdown documents: split by headers first, then by characters
        if is_markdown:
            # Initialize the Markdown header splitter
            md_splitter = MarkdownHeaderTextSplitter(
                headers_to_split_on=self.markdown_headers,
                strip_headers=True,
                return_each_line=False
            )
            # Split by headers and inherit the original metadata
            md_chunks = []
            for doc in documents:
                chunks = md_splitter.split_text(doc.page_content)
                for chunk in chunks:
                    chunk.metadata.update(doc.metadata)
                md_chunks.extend(chunks)
            # Apply character splitting to the header-level chunks
            final_chunks = self.text_splitter.split_documents(md_chunks)
        # Plain text: split directly
        else:
            final_chunks = self.text_splitter.split_documents(documents)
        return final_chunks

    # Core auto-split method (metadata first)
    @classmethod
    def auto_split(
        cls,
        documents: List[Document],
        chunk_size: int = 2000,
        chunk_overlap: int = 200
    ) -> List[Document]:
        """
        Minimal convenience method: detect the document type and split (metadata first).
        Only three parameters are needed; no instance has to be created by the caller.
        Args:
            documents: list of Documents to split
            chunk_size: maximum length of each text chunk (default 2000 characters)
            chunk_overlap: overlap between adjacent chunks (default 200 characters)
        Returns:
            List of split Documents
        """
        if not documents:
            return []
        # Create a splitter instance (CJK defaults enabled)
        splitter = cls(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            is_cjk_language=True
        )
        # Detect the document type (metadata first)
        is_md = splitter._is_markdown(documents[0])
        # Split according to the detection result
        return splitter.split(documents, is_markdown=is_md)

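A short sketch of how the loader and splitter are intended to compose (the import paths and the sample Markdown file are assumptions):

from app.common.doc_loader import load_documents        # import paths are assumptions
from app.common.doc_splitter import DocumentSplitter

# auto_split reads the ".md" extension from each Document's metadata "source"
# field and switches to header-aware Markdown splitting before character splitting
docs = load_documents("./guides/setup.md")
chunks = DocumentSplitter.auto_split(docs, chunk_size=2000, chunk_overlap=200)
print(len(chunks), chunks[0].metadata)
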
View File

@@ -14,7 +14,8 @@ def call_openai_style_model(base_url, api_key, model_name, prompt, **kwargs):
    )
    return response.choices[0].message.content
def _extract_json_substring(raw: str) -> str:
def extract_json_substring(raw: str) -> str:
    """Extract the most likely JSON substring from a raw LLM answer.
    Approach:
@@ -22,11 +23,21 @@ def _extract_json_substring(raw: str) -> str:
    - first look for the first '{' or '[' in the text as the JSON start;
    - then scan backwards for the last '}' or ']' as the JSON end;
    - if no suitable boundaries can be found, fall back to the original string.
    - some models wrap internal reasoning in `<think>...</think>` tags, which should be stripped before parsing.
    This does not guarantee that the extracted snippet is valid JSON, but it significantly improves the success rate of json.loads.
    """
    if not raw:
        return raw
    # First strip all <think>...</think> blocks (including multi-line ones)
    try:
        import re
        raw = re.sub(r"<think>[\s\S]*?</think>", "", raw, flags=re.IGNORECASE)
    except Exception:
        # If the regex fails, ignore it and keep using the original text
        pass
    start = None
    end = None
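
A small usage sketch for the renamed helper, following the behavior described in its docstring (the sample response string and the import path are assumptions, not part of the diff):

import json
from app.core.llm_utils import extract_json_substring  # module path is an assumption

raw_answer = '<think>internal reasoning...</think>Sure, here it is: {"question": "What is CoT?"}'
cleaned = extract_json_substring(raw_answer)
# The <think> block is stripped and the span from the first '{' to the last '}'
# is kept, so the cleaned text should parse with json.loads
data = json.loads(cleaned)
print(data["question"])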