From e0e9b1d94d99800de32f8933b3cc89984d6e527e Mon Sep 17 00:00:00 2001 From: Dallas98 <40557804+Dallas98@users.noreply.github.com> Date: Thu, 18 Dec 2025 16:51:18 +0800 Subject: [PATCH] =?UTF-8?q?feat=EF=BC=9A=E9=97=AE=E9=A2=98=E7=94=9F?= =?UTF-8?q?=E6=88=90=E8=BF=87=E7=A8=8B=E4=BC=98=E5=8C=96=E5=8F=8ACOT?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E7=94=9F=E6=88=90=E4=BC=98=E5=8C=96=20(#169)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(chart): update Helm chart helpers and values for improved configuration * feat(SynthesisTaskTab): enhance task table with tooltip support and improved column widths * feat(CreateTask, SynthFileTask): improve task creation and detail view with enhanced payload handling and UI updates * feat(SynthFileTask): enhance file display with progress tracking and delete action * feat(SynthFileTask): enhance file display with progress tracking and delete action * feat(SynthDataDetail): add delete action for chunks with confirmation prompt * feat(SynthDataDetail): update edit and delete buttons to icon-only format * feat(SynthDataDetail): add confirmation modals for chunk and synthesis data deletion * feat(DocumentSplitter): add enhanced document splitting functionality with CJK support and metadata detection * feat(DataSynthesis): refactor data synthesis models and update task handling logic * feat(DataSynthesis): streamline synthesis task handling and enhance chunk processing logic * feat(DataSynthesis): refactor data synthesis models and update task handling logic * fix(generation_service): ensure processed chunks are incremented regardless of question generation success * feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options * feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options * feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options * feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options --- .../src/pages/SynthesisTask/CreateTask.tsx | 562 ++++++++++--- .../datamate-python/app/common/text_split.py | 0 .../app/db/models/data_synthesis.py | 81 +- .../module/evaluation/service/evaluation.py | 8 +- .../generation/interface/generation_api.py | 127 +-- .../module/generation/schema/generation.py | 41 +- .../generation/service/export_service.py | 4 +- .../generation/service/generation_service.py | 745 +++++++++++------- .../app/module/generation/service/prompt.py | 167 ++-- .../{ => module/shared}/common/__init__.py | 0 .../shared}/common/document_loaders.py | 0 .../app/module/shared/common/text_split.py | 169 ++++ .../app/module/shared/util/model_chat.py | 13 +- scripts/db/data-synthesis-init.sql | 16 +- 14 files changed, 1362 insertions(+), 571 deletions(-) delete mode 100644 runtime/datamate-python/app/common/text_split.py rename runtime/datamate-python/app/{ => module/shared}/common/__init__.py (100%) rename runtime/datamate-python/app/{ => module/shared}/common/document_loaders.py (100%) create mode 100644 runtime/datamate-python/app/module/shared/common/text_split.py diff --git a/frontend/src/pages/SynthesisTask/CreateTask.tsx b/frontend/src/pages/SynthesisTask/CreateTask.tsx index 5c00a2c..db59b2b 100644 --- a/frontend/src/pages/SynthesisTask/CreateTask.tsx +++ b/frontend/src/pages/SynthesisTask/CreateTask.tsx @@ -1,7 +1,7 @@ import { useEffect, useState } from "react"; import type { Dataset, DatasetFile } from "@/pages/DataManagement/dataset.model"; -import { Steps, Card, Select, Input, Checkbox, Button, Form, message } from "antd"; -import { Eye, ArrowLeft, ArrowRight, Play, Search, MoreHorizontal } from "lucide-react"; +import { Steps, Card, Select, Input, Button, Form, message, Tag, Tooltip, InputNumber } from "antd"; +import { Eye, ArrowLeft, ArrowRight, Play, Search, Sparkles, Brain, Layers } from "lucide-react"; import { Link, useNavigate } from "react-router"; import { queryDatasetsUsingGet } from "../DataManagement/dataset.api"; import DatasetFileTransfer from "@/components/business/DatasetFileTransfer"; @@ -31,13 +31,18 @@ export default function SynthesisTaskCreate() { const [selectedFiles, setSelectedFiles] = useState([]); const [selectedMap, setSelectedMap] = useState>({}); const [selectedDataset, setSelectedDataset] = useState(null); + // 当前选中的模板类型(QA / COT),用于高亮展示 const [selectedSynthesisTypes, setSelectedSynthesisTypes] = useState(["qa"]); const [taskType, setTaskType] = useState<"qa" | "cot">("qa"); - const [promptTemplate, setPromptTemplate] = useState(""); + const [questionPrompt, setQuestionPrompt] = useState(""); + const [answerPrompt, setAnswerPrompt] = useState(""); const [submitting, setSubmitting] = useState(false); const [modelOptions, setModelOptions] = useState<{ label: string; value: string }[]>([]); const [modelsLoading, setModelsLoading] = useState(false); - const [selectedModel, setSelectedModel] = useState(undefined); + const [questionModelId, setQuestionModelId] = useState(undefined); + const [answerModelId, setAnswerModelId] = useState(undefined); + + // 文本切片配置 const [sliceConfig, setSliceConfig] = useState({ processType: "DEFAULT_CHUNK" as | "DEFAULT_CHUNK" @@ -45,10 +50,23 @@ export default function SynthesisTaskCreate() { | "PARAGRAPH_CHUNK" | "FIXED_LENGTH_CHUNK" | "CUSTOM_SEPARATOR_CHUNK", - chunkSize: 500, - overlapSize: 50, + chunkSize: 3000, + overlapSize: 100, delimiter: "", }); + + // 问题/答案合成配置(与后端 question_synth_config / answer_synth_config 对齐) + const [questionConfig, setQuestionConfig] = useState({ + number: 1, + temperature: 0.7, + }); + const [answerConfig, setAnswerConfig] = useState({ + // 答案侧不再需要 number,只保留温度 + temperature: 0.7, + }); + // 合成总数上限,默认 5000 + const [maxQaPairs, setMaxQaPairs] = useState(5000); + const sliceOptions = [ { label: "默认分块", value: "DEFAULT_CHUNK" }, { label: "按章节分块", value: "CHAPTER_CHUNK" }, @@ -62,33 +80,43 @@ export default function SynthesisTaskCreate() { return data; }; - const fetchPrompt = async (type: "qa" | "cot") => { + // 问题 Prompt:固定使用 QUESTION 类型获取 + const fetchQuestionPrompt = async () => { try { - const synthTypeParam = type.toUpperCase(); - const res = await getPromptByTypeUsingGet(synthTypeParam); + const res = await getPromptByTypeUsingGet("QUESTION"); const prompt = typeof res === "string" ? res : (res as { data?: string })?.data ?? ""; - setPromptTemplate(prompt || ""); + setQuestionPrompt(prompt || ""); } catch (e) { console.error(e); - message.error("获取提示词模板失败"); - setPromptTemplate(""); + message.error("获取问题 Prompt 模板失败"); + setQuestionPrompt(""); } }; - useEffect(() => { - fetchDatasets(); - }, []); - - useEffect(() => { - fetchPrompt(taskType); - }, [taskType]); + // 答案 Prompt:根据当前任务类型获取 QA/COT 模板 + const fetchAnswerPrompt = async (type: "qa" | "cot") => { + try { + const synthTypeParam = type === "qa" ? "QA" : "COT"; + const res = await getPromptByTypeUsingGet(synthTypeParam); + const prompt = typeof res === "string" ? res : (res as { data?: string })?.data ?? ""; + setAnswerPrompt(prompt || ""); + } catch (e) { + console.error(e); + message.error("获取答案 Prompt 模板失败"); + setAnswerPrompt(""); + } + }; + // 拉取模型列表,仅保留 CHAT 模型 useEffect(() => { const loadModels = async () => { setModelsLoading(true); try { const { data } = await queryModelListUsingGet({ page: 0, size: 1000 }); - const options = (data?.content || []).map((model: ModelI) => ({ + const chatModels: ModelI[] = (data?.content || []).filter( + (model: ModelI) => model.type === "CHAT" + ); + const options = chatModels.map((model) => ({ label: `${model.modelName} (${model.provider})`, value: model.id, })); @@ -102,11 +130,22 @@ export default function SynthesisTaskCreate() { loadModels(); }, []); + // 默认选中第一个 CHAT 模型作为问题/答案模型 useEffect(() => { - if (!selectedModel && modelOptions.length > 0) { - setSelectedModel(modelOptions[0].value); + if (modelOptions.length > 0) { + setQuestionModelId((prev) => prev ?? modelOptions[0].value); + setAnswerModelId((prev) => prev ?? modelOptions[0].value); } - }, [modelOptions, selectedModel]); + }, [modelOptions]); + + useEffect(() => { + fetchDatasets(); + }, []); + + useEffect(() => { + fetchQuestionPrompt(); + fetchAnswerPrompt(taskType); + }, [taskType]); // 表单数据 const [formValues, setFormValues] = useState({ @@ -131,13 +170,12 @@ export default function SynthesisTaskCreate() { const handleCreateTask = async () => { try { const values = (await form.validateFields()) as CreateTaskFormValues; - // precise validation if (!(taskType === "qa" || taskType === "cot")) { message.error("请选择一个合成类型"); return; } - if (!selectedModel) { - message.error("请选择模型"); + if (!questionModelId || !answerModelId) { + message.error("请选择问题和答案使用的模型"); return; } if (selectedFiles.length === 0) { @@ -145,25 +183,42 @@ export default function SynthesisTaskCreate() { return; } - // 构造后端要求的参数格式 - const payload: Record = { - name: values.name || form.getFieldValue("name"), - model_id: selectedModel, - source_file_id: selectedFiles, + const synthConfig: Record = { text_split_config: { chunk_size: sliceConfig.chunkSize, chunk_overlap: sliceConfig.overlapSize, }, - synthesis_config: { - prompt_template: promptTemplate, + question_synth_config: { + model_id: questionModelId, + prompt_template: questionPrompt, + number: questionConfig.number, + temperature: questionConfig.temperature, }, - synthesis_type: taskType === "qa" ? "QA" : "COT", + answer_synth_config: { + model_id: answerModelId, + prompt_template: answerPrompt, + temperature: answerConfig.temperature, + }, + max_qa_pairs: typeof maxQaPairs === "number" && maxQaPairs > 0 ? maxQaPairs : undefined, }; - // 只有在有真实内容时携带 description,避免强制传空字符串 - const desc = values.description ?? form.getFieldValue("description"); - if (typeof desc === "string" && desc.trim().length > 0) { - payload.description = desc.trim(); + const payload: Record = { + name: values.name || form.getFieldValue("name"), + description: values.description ?? form.getFieldValue("description"), + synthesis_type: taskType === "qa" ? "QA" : "COT", + source_file_id: selectedFiles, + synth_config: synthConfig, + }; + + // 清洗 description:空字符串转为 undefined,让后端用 validator 处理为 None + const desc = payload.description; + if (typeof desc === "string" && desc.trim().length === 0) { + delete payload.description; + } + + // 如果未设置 max_qa_pairs,则从 synth_config 中移除该字段,避免传递 undefined + if (synthConfig.max_qa_pairs === undefined) { + delete (synthConfig as { max_qa_pairs?: number }).max_qa_pairs; } setSubmitting(true); @@ -187,25 +242,43 @@ export default function SynthesisTaskCreate() { return; } console.error(error); - message.error((error instanceof Error ? error.message : "合成任务创建失败")); + message.error(error instanceof Error ? error.message : "合成任务创建失败"); } finally { setSubmitting(false); } }; - // 仅两个一级类型,无二级目录 - const synthesisTypes = [ - { id: "qa", name: "生成问答对" }, - { id: "cot", name: "生成COT链式推理" }, - ] as const; + // 仅两个一级类型,无二级目录 -> 扩展为模板配置 + const synthesisTemplates = [ + { + id: "sft-qa", + type: "qa" as const, + title: "SFT 问答数据合成", + subtitle: "从长文档自动生成高质量问答样本", + badge: "推荐", + description: + "适用于构建监督微调(SFT)问答数据集,支持从知识库或长文档中抽取关键问答对。", + colorClass: "from-sky-500/10 via-sky-400/5 to-transparent", + borderClass: "border-sky-100 hover:border-sky-300", + icon: Sparkles, + }, + { + id: "cot-reasoning", + type: "cot" as const, + title: "COT 链式推理合成", + subtitle: "一步步推理过程与最终答案", + badge: "推理增强", + description: + "生成包含模型推理中间过程的 COT 数据,用于提升模型的复杂推理和解释能力。", + colorClass: "from-violet-500/10 via-violet-400/5 to-transparent", + borderClass: "border-violet-100 hover:border-violet-300", + icon: Brain, + }, + ]; - const handleSynthesisTypeSelect = (typeId: "qa" | "cot") => { - setSelectedSynthesisTypes((prev) => { - const next = prev.includes(typeId) ? [] : [typeId]; - if (next[0] === "qa") setTaskType("qa"); - if (next[0] === "cot") setTaskType("cot"); - return next; - }); + const handleTemplateClick = (tpl: (typeof synthesisTemplates)[number]) => { + setTaskType(tpl.type); + setSelectedSynthesisTypes([tpl.type]); }; useEffect(() => { @@ -247,120 +320,374 @@ export default function SynthesisTaskCreate() { if (createStep === 2) { return ( -
-
- {/* 左侧合成指令(仅两个一级类型,单选) */} +
+
+ {/* 左侧合成指令模板区:占 1/3 宽度 */}
- -

合成指令(仅支持单选)

-
-
- - + +
+
+

+ + 合成指令模板 +

+

+ 从左侧选择一个模板,我们会自动为你填充合适的 Prompt 与合成策略。 +

+ + 单选 +
-
- {synthesisTypes.map((type) => ( -
handleSynthesisTypeSelect(type.id)} - > - handleSynthesisTypeSelect(type.id)} - /> - {type.name} - -
- ))} + +
+
+ + +
+ +
+ {synthesisTemplates.map((tpl) => { + const Icon = tpl.icon; + const active = selectedSynthesisTypes.includes(tpl.type); + + return ( +
handleTemplateClick(tpl)} + className={`group relative rounded-xl border p-2.5 text-xs transition-all duration-200 cursor-pointer bg-white/80 hover:bg-white/100 ${ + tpl.borderClass + } ${ + active + ? "ring-1 ring-offset-1 ring-blue-500/60 border-blue-400/70 shadow-sm bg-gradient-to-r " + + tpl.colorClass + : "border-slate-100 hover:shadow-sm" + }`} + > +
+
+ +
+
+
+ + {tpl.title} + + {tpl.badge && ( + + {tpl.badge} + + )} +
+

+ {tpl.subtitle} +

+

+ {tpl.description} +

+
+
+ +
+ +
+ {active ? "✓" : ""} +
+
+
+
+ ); + })} +
- {/* 右侧合成配置 */} + {/* 右侧合成配置:占 2/3 宽度 */}
- -
-

合成配置

+ +
+
+

+ + 合成配置 +

+

+ 根据左侧模板自动带出配置,你也可以在此基础上进行微调。 +

+
- + + +
- {/* 切片配置 */} - + {/* 步骤说明条 */} +
+ 1 + 设置合成总数 + / + 2 + 配置文本切片策略 + / + 3 + 配置问题合成参数 + / + 4 + 配置答案合成参数 +
+ + {/* 1. 合成总数配置 */} +
+
+
+ 1 + 合成总数上限 +
+ 控制整个任务最多生成的 QA 对数量 +
+
+ setMaxQaPairs(typeof v === "number" ? v : undefined)} + /> + 可选项,建议在大规模合成时设置上限 +
+
+ + {/* 2. 文本切片配置 */} +
+
+
+ 2 + 文本切片配置 +
+ 影响上下文长度与召回粒度 +
- 分块策略 + 分块策略 setSliceConfig((p) => ({ ...p, chunkSize: Number(e.target.value) }))} + size="small" />
- 重叠大小 + 重叠大小 setSliceConfig((p) => ({ ...p, overlapSize: Number(e.target.value) }))} + size="small" />
{sliceConfig.processType === "CUSTOM_SEPARATOR_CHUNK" && (
- 自定义分隔符 + 自定义分隔符 setSliceConfig((p) => ({ ...p, delimiter: e.target.value }))} + size="small" />
)} - +
- {/* 模型选择 */} - - 模型选择 - setQuestionModelId(v)} + /> +
+
+ 问题 Prompt 模板 +

+ 用于指导模型如何从切片文本中生成高质量问题,可在保持变量占位符不变的前提下个性化修改。 +