feat: optimize the question generation process and CoT data generation (#169)

* fix(chart): update Helm chart helpers and values for improved configuration

* feat(SynthesisTaskTab): enhance task table with tooltip support and improved column widths

* feat(CreateTask, SynthFileTask): improve task creation and detail view with enhanced payload handling and UI updates

* feat(SynthFileTask): enhance file display with progress tracking and delete action

* feat(SynthDataDetail): add delete action for chunks with confirmation prompt

* feat(SynthDataDetail): update edit and delete buttons to icon-only format

* feat(SynthDataDetail): add confirmation modals for chunk and synthesis data deletion

* feat(DocumentSplitter): add enhanced document splitting functionality with CJK support and metadata detection

* feat(DataSynthesis): refactor data synthesis models and update task handling logic

* feat(DataSynthesis): streamline synthesis task handling and enhance chunk processing logic

* feat(DataSynthesis): refactor data synthesis models and update task handling logic

* fix(generation_service): ensure processed chunks are incremented regardless of question generation success (a sketch follows this list)

* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options

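A minimal sketch of the counter fix described in the fix(generation_service) item above, assuming a processing loop of roughly this shape (all names here are illustrative, not the actual service code):

    import logging

    logger = logging.getLogger(__name__)

    async def process_chunks(task, chunks, generate_questions, save_questions):
        # Hypothetical loop: `task` and the two callables stand in for the
        # real GenerationService internals.
        for chunk in chunks:
            try:
                questions = await generate_questions(chunk)
                if questions:
                    await save_questions(questions)
            except Exception:
                logger.exception("question generation failed for chunk %s", chunk["id"])
            finally:
                # Incremented in `finally` so progress counts every attempted
                # chunk, whether or not generation succeeded.
                task["processed_chunks"] += 1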

Author: Dallas98
Date: 2025-12-18 16:51:18 +08:00
Committed by: GitHub
Parent: 761f7f6a51
Commit: e0e9b1d94d
14 changed files with 1362 additions and 571 deletions


@@ -1,4 +1,5 @@
import uuid
from typing import cast
from fastapi import APIRouter, HTTPException, Depends, BackgroundTasks
from sqlalchemy import select, func, delete
@@ -7,13 +8,12 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.core.logging import get_logger
from app.db.models.data_synthesis import (
save_synthesis_task,
DataSynthesisInstance,
DataSynthInstance,
DataSynthesisFileInstance,
DataSynthesisChunkInstance,
SynthesisData,
)
from app.db.models.dataset_management import DatasetFiles
from app.db.models.model_config import get_model_by_id
from app.db.session import get_db
from app.module.generation.schema.generation import (
CreateSynthesisTaskRequest,
@@ -28,9 +28,9 @@ from app.module.generation.schema.generation import (
SynthesisDataUpdateRequest,
BatchDeleteSynthesisDataRequest,
)
from app.module.generation.service.export_service import SynthesisDatasetExporter, SynthesisExportError
from app.module.generation.service.generation_service import GenerationService
from app.module.generation.service.prompt import get_prompt
from app.module.generation.service.export_service import SynthesisDatasetExporter, SynthesisExportError
from app.module.shared.schema import StandardResponse
router = APIRouter(
@@ -47,10 +47,6 @@ async def create_synthesis_task(
db: AsyncSession = Depends(get_db),
):
"""创建数据合成任务"""
result = await get_model_by_id(db, request.model_id)
if not result:
raise HTTPException(status_code=404, detail="Model not found")
# Look up the existing file records in DatasetFiles by source_file_id first
file_ids = request.source_file_id or []
dataset_files = []
@@ -65,32 +61,48 @@ async def create_synthesis_task(
synthesis_task = await save_synthesis_task(db, request)
# Save the existing DatasetFiles records into t_data_synthesis_file_instances
synth_files = []
for f in dataset_files:
file_instance = DataSynthesisFileInstance(
id=str(uuid.uuid4()), # use a new UUID as this file-task record's primary key to avoid clashing with the DatasetFiles primary key
synthesis_instance_id=synthesis_task.id,
file_name=f.file_name,
source_file_id=str(f.id),
target_file_location=synthesis_task.result_data_location or "",
status="pending",
total_chunks=0,
processed_chunks=0,
created_by="system",
updated_by="system",
)
db.add(file_instance)
synth_files.append(file_instance)
if dataset_files:
db.add_all(synth_files)
await db.commit()
generation_service = GenerationService(db)
# Process the task asynchronously: pass only the task ID; the background task reloads the task object with a new DB session
background_tasks.add_task(generation_service.process_task, synthesis_task.id)
# Wrap the ORM object as a DataSynthesisTaskItem; the new fields are restored from synth_config for compatibility
task_item = DataSynthesisTaskItem(
id=synthesis_task.id,
name=synthesis_task.name,
description=synthesis_task.description,
status=synthesis_task.status,
synthesis_type=synthesis_task.synth_type,
total_files=synthesis_task.total_files,
created_at=synthesis_task.created_at,
updated_at=synthesis_task.updated_at,
created_by=synthesis_task.created_by,
updated_by=synthesis_task.updated_by,
)
return StandardResponse(
code=200,
message="success",
data=synthesis_task,
data=task_item,
)
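The create handler above hands only the task ID to BackgroundTasks: the request-scoped session is closed once the response is returned, so the worker must open its own. A minimal sketch of the worker side, assuming an async session factory (the factory name and wiring are assumptions, not this codebase's actual setup):

    from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine

    # Assumed wiring -- the real engine/session factory lives elsewhere in the app.
    engine = create_async_engine("postgresql+asyncpg://user:pass@localhost/app")
    async_session_factory = async_sessionmaker(engine, expire_on_commit=False)

    async def process_task(task_id: str) -> None:
        # Fresh session: the request's session is gone by the time this runs.
        async with async_session_factory() as session:
            task = await session.get(DataSynthInstance, task_id)  # model from app.db.models
            if task is None:
                return  # task was deleted before the worker started
            ...  # splitting / question generation happens here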
@@ -100,14 +112,26 @@ async def get_synthesis_task(
db: AsyncSession = Depends(get_db)
):
"""获取数据合成任务详情"""
result = await db.get(DataSynthesisInstance, task_id)
if not result:
synthesis_task = await db.get(DataSynthInstance, task_id)
if not synthesis_task:
raise HTTPException(status_code=404, detail="Synthesis task not found")
task_item = DataSynthesisTaskItem(
id=synthesis_task.id,
name=synthesis_task.name,
description=synthesis_task.description,
status=synthesis_task.status,
synthesis_type=synthesis_task.synth_type,
total_files=synthesis_task.total_files,
created_at=synthesis_task.created_at,
updated_at=synthesis_task.updated_at,
created_by=synthesis_task.created_by,
updated_by=synthesis_task.updated_by,
)
return StandardResponse(
code=200,
message="success",
data=result,
data=task_item,
)
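The same field-by-field wrapping now appears in the create, get, and list handlers. One way to keep the three in sync would be a small helper such as the hypothetical to_task_item below (not part of this change):

    # Hypothetical helper: centralizes the ORM -> DataSynthesisTaskItem mapping.
    def to_task_item(row: DataSynthInstance) -> DataSynthesisTaskItem:
        return DataSynthesisTaskItem(
            id=row.id,
            name=row.name,
            description=row.description,
            status=row.status,
            synthesis_type=row.synth_type,
            total_files=row.total_files,
            created_at=row.created_at,
            updated_at=row.updated_at,
            created_by=row.created_by,
            updated_by=row.updated_by,
        )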
@@ -121,16 +145,16 @@ async def list_synthesis_tasks(
db: AsyncSession = Depends(get_db)
):
"""分页列出所有数据合成任务,默认按创建时间倒序"""
query = select(DataSynthesisInstance)
query = select(DataSynthInstance)
if synthesis_type:
query = query.filter(DataSynthesisInstance.synthesis_type == synthesis_type)
query = query.filter(DataSynthInstance.synth_type == synthesis_type)
if status:
query = query.filter(DataSynthesisInstance.status == status)
query = query.filter(DataSynthInstance.status == status)
if name:
query = query.filter(DataSynthesisInstance.name.like(f"%{name}%"))
query = query.filter(DataSynthInstance.name.like(f"%{name}%"))
# Order by creation time, descending, by default
query = query.order_by(DataSynthesisInstance.created_at.desc())
query = query.order_by(DataSynthInstance.created_at.desc())
count_q = select(func.count()).select_from(query.subquery())
total = (await db.execute(count_q)).scalar_one()
@@ -143,31 +167,39 @@ async def list_synthesis_tasks(
result = await db.execute(query.offset((page - 1) * page_size).limit(page_size))
rows = result.scalars().all()
task_items = [
DataSynthesisTaskItem(
id=row.id,
name=row.name,
description=row.description,
status=row.status,
synthesis_type=row.synthesis_type,
model_id=row.model_id,
progress=row.progress,
result_data_location=row.result_data_location,
text_split_config=row.text_split_config,
synthesis_config=row.synthesis_config,
source_file_id=row.source_file_id,
total_files=row.total_files,
processed_files=row.processed_files,
total_chunks=row.total_chunks,
processed_chunks=row.processed_chunks,
total_synthesis_data=row.total_synthesis_data,
created_at=row.created_at,
updated_at=row.updated_at,
created_by=row.created_by,
updated_by=row.updated_by,
task_items: list[DataSynthesisTaskItem] = []
for row in rows:
synth_cfg = getattr(row, "synth_config", {}) or {}
text_split_cfg = synth_cfg.get("text_split_config") or {}
synthesis_cfg = synth_cfg.get("synthesis_config") or {}
source_file_ids = synth_cfg.get("source_file_id") or []
model_id = synth_cfg.get("model_id")
result_location = synth_cfg.get("result_data_location")
task_items.append(
DataSynthesisTaskItem(
id=str(row.id),
name=str(row.name),
description=cast(str | None, row.description),
status=cast(str | None, row.status),
synthesis_type=str(row.synth_type),
model_id=model_id or "",
progress=int(cast(int, row.progress)),
result_data_location=result_location,
text_split_config=text_split_cfg,
synthesis_config=synthesis_cfg,
source_file_id=list(source_file_ids),
total_files=int(cast(int, row.total_files)),
processed_files=int(cast(int, row.processed_files)),
total_chunks=int(cast(int, row.total_chunks)),
processed_chunks=int(cast(int, row.processed_chunks)),
total_synthesis_data=int(cast(int, row.total_synth_data)),
created_at=row.created_at,
updated_at=row.updated_at,
created_by=row.created_by,
updated_by=row.updated_by,
)
)
for row in rows
]
paged = PagedDataSynthesisTaskResponse(
content=task_items,
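Note the `synth_cfg.get(key) or default` pattern in the loop above: unlike `get(key, default)`, it also normalizes an explicit null stored in the JSON column, which would otherwise come back as None. A standalone illustration with made-up data:

    # Made-up config dict; the keys mirror the ones read above.
    synth_cfg = {"model_id": "m-123", "synthesis_config": None, "source_file_id": ["f-1"]}

    synthesis_cfg = synth_cfg.get("synthesis_config") or {}    # explicit null -> {}
    text_split_cfg = synth_cfg.get("text_split_config") or {}  # missing key -> {}
    source_file_ids = synth_cfg.get("source_file_id") or []
    assert synthesis_cfg == {} and text_split_cfg == {} and source_file_ids == ["f-1"]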
@@ -190,7 +222,7 @@ async def delete_synthesis_task(
db: AsyncSession = Depends(get_db)
):
"""删除数据合成任务"""
task = await db.get(DataSynthesisInstance, task_id)
task = await db.get(DataSynthInstance, task_id)
if not task:
raise HTTPException(status_code=404, detail="Synthesis task not found")
@@ -241,7 +273,7 @@ async def delete_synthesis_file_task(
):
"""删除数据合成任务中的文件任务,同时刷新任务表中的文件/切片数量"""
# Fetch the task and the file-task record first
task = await db.get(DataSynthesisInstance, task_id)
task = await db.get(DataSynthInstance, task_id)
if not task:
raise HTTPException(status_code=404, detail="Synthesis task not found")
@@ -306,7 +338,7 @@ async def list_synthesis_file_tasks(
):
"""分页获取某个数据合成任务下的文件任务列表"""
# Verify the task exists first
task = await db.get(DataSynthesisInstance, task_id)
task = await db.get(DataSynthInstance, task_id)
if not task:
raise HTTPException(status_code=404, detail="Synthesis task not found")
@@ -333,7 +365,6 @@ async def list_synthesis_file_tasks(
synthesis_instance_id=row.synthesis_instance_id,
file_name=row.file_name,
source_file_id=row.source_file_id,
target_file_location=row.target_file_location,
status=row.status,
total_chunks=row.total_chunks,
processed_chunks=row.processed_chunks,
@@ -523,7 +554,7 @@ async def delete_synthesis_data_by_chunk(
result = await db.execute(
delete(SynthesisData).where(SynthesisData.chunk_instance_id == chunk_id)
)
deleted = result.rowcount or 0
deleted = int(getattr(result, "rowcount", 0) or 0)
await db.commit()
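The guarded read above covers DBAPI drivers that report rowcount as None (and result objects lacking the attribute); `or 0` maps None to 0, though a driver's -1 would still pass through. A compact illustration:

    class FakeResult:
        rowcount = None  # what some drivers report for certain statements

    deleted = int(getattr(FakeResult(), "rowcount", 0) or 0)
    assert deleted == 0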
@@ -542,7 +573,7 @@ async def batch_delete_synthesis_data(
result = await db.execute(
delete(SynthesisData).where(SynthesisData.id.in_(request.ids))
)
deleted = result.rowcount or 0
deleted = int(getattr(result, "rowcount", 0) or 0)
await db.commit()
return StandardResponse(code=200, message="success", data=deleted)