feat: 问题生成过程优化及COT数据生成优化 (#169)

* fix(chart): update Helm chart helpers and values for improved configuration
* feat(SynthesisTaskTab): enhance task table with tooltip support and improved column widths
* feat(CreateTask, SynthFileTask): improve task creation and detail view with enhanced payload handling and UI updates
* feat(SynthFileTask): enhance file display with progress tracking and delete action
* feat(SynthFileTask): enhance file display with progress tracking and delete action
* feat(SynthDataDetail): add delete action for chunks with confirmation prompt
* feat(SynthDataDetail): update edit and delete buttons to icon-only format
* feat(SynthDataDetail): add confirmation modals for chunk and synthesis data deletion
* feat(DocumentSplitter): add enhanced document splitting functionality with CJK support and metadata detection
* feat(DataSynthesis): refactor data synthesis models and update task handling logic
* feat(DataSynthesis): streamline synthesis task handling and enhance chunk processing logic
* feat(DataSynthesis): refactor data synthesis models and update task handling logic
* fix(generation_service): ensure processed chunks are incremented regardless of question generation success
* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options
* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options
* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options
* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options
2025-12-18 16:51:18 +08:00
parent 761f7f6a51
commit e0e9b1d94d
14 changed files with 1362 additions and 571 deletions
--- a/runtime/datamate-python/app/module/generation/schema/generation.py
+++ b/runtime/datamate-python/app/module/generation/schema/generation.py
@@ -11,33 +11,45 @@ class TextSplitConfig(BaseModel):
    chunk_overlap: int = Field(..., description="重叠令牌数")


-class SynthesisConfig(BaseModel):
+class SyntheConfig(BaseModel):
    """合成配置"""
-    prompt_template: str = Field(..., description="合成提示模板")
-    synthesis_count: int = Field(None, description="单个chunk合成的数据数量")
+    model_id: str = Field(..., description="模型ID")
+    prompt_template: str = Field(None, description="合成提示模板")
+    number: Optional[int] = Field(None, description="单个chunk合成的数据数量")
    temperature: Optional[float] = Field(None, description="温度参数")


+class Config(BaseModel):
+    """配置"""
+    text_split_config: TextSplitConfig = Field(None, description="文本切片配置")
+    question_synth_config: SyntheConfig = Field(None, description="问题合成配置")
+    answer_synth_config: SyntheConfig = Field(None, description="答案合成配置")
+    # 新增：整个任务允许生成的 QA 总上限（问题/答案对数量）
+    max_qa_pairs: Optional[int] = Field(
+        default=None,
+        description="整个任务允许生成的 QA 对总量上限；为 None 或 <=0 表示不限制",
+    )
+
+
 class SynthesisType(Enum):
    """合成类型"""
    QA = "QA"
    COT = "COT"
+    QUESTION = "QUESTION"


 class CreateSynthesisTaskRequest(BaseModel):
    """创建数据合成任务请求"""
    name: str = Field(..., description="合成任务名称")
    description: Optional[str] = Field(None, description="合成任务描述")
-    model_id: str = Field(..., description="模型ID")
-    source_file_id: list[str] = Field(..., description="原始文件ID列表")
-    text_split_config: TextSplitConfig = Field(None, description="文本切片配置")
-    synthesis_config: SynthesisConfig = Field(..., description="合成配置")
    synthesis_type: SynthesisType = Field(..., description="合成类型")
+    source_file_id: list[str] = Field(..., description="原始文件ID列表")
+    synth_config: Config = Field(..., description="合成配置")

    @field_validator("description")
    @classmethod
    def empty_string_to_none(cls, v: Optional[str]) -> Optional[str]:
-        """前端如果传入空字符串，将其统一转换为 None，避免存库时看起来像有描述但实际上为空。"""
+        """前端如果传入空字符串，将其统一转化为 None，避免存库时看起来像有描述但实际上为空。"""
        if isinstance(v, str) and v.strip() == "":
            return None
        return v
@@ -50,17 +62,7 @@ class DataSynthesisTaskItem(BaseModel):
    description: Optional[str] = None
    status: Optional[str] = None
    synthesis_type: str
-    model_id: str
-    progress: int
-    result_data_location: Optional[str] = None
-    text_split_config: Dict[str, Any]
-    synthesis_config: Dict[str, Any]
-    source_file_id: list[str]
    total_files: int
-    processed_files: int
-    total_chunks: int
-    processed_chunks: int
-    total_synthesis_data: int
    created_at: Optional[datetime] = None
    updated_at: Optional[datetime] = None
    created_by: Optional[str] = None
@@ -85,7 +87,6 @@ class DataSynthesisFileTaskItem(BaseModel):
    synthesis_instance_id: str
    file_name: str
    source_file_id: str
-    target_file_location: str
    status: Optional[str] = None
    total_chunks: int
    processed_chunks: int
@@ -108,7 +109,7 @@ class PagedDataSynthesisFileTaskResponse(BaseModel):


 class DataSynthesisChunkItem(BaseModel):
-    """数据合成文件下的 chunk 记录"""
+    """数据合成任务下的 chunk 记录"""
    id: str
    synthesis_file_instance_id: str
    chunk_index: Optional[int] = None