feat:问题生成过程优化及COT数据生成优化 (#169)

* fix(chart): update Helm chart helpers and values for improved configuration

* feat(SynthesisTaskTab): enhance task table with tooltip support and improved column widths

* feat(CreateTask, SynthFileTask): improve task creation and detail view with enhanced payload handling and UI updates

* feat(SynthFileTask): enhance file display with progress tracking and delete action

* feat(SynthFileTask): enhance file display with progress tracking and delete action

* feat(SynthDataDetail): add delete action for chunks with confirmation prompt

* feat(SynthDataDetail): update edit and delete buttons to icon-only format

* feat(SynthDataDetail): add confirmation modals for chunk and synthesis data deletion

* feat(DocumentSplitter): add enhanced document splitting functionality with CJK support and metadata detection

* feat(DataSynthesis): refactor data synthesis models and update task handling logic

* feat(DataSynthesis): streamline synthesis task handling and enhance chunk processing logic

* feat(DataSynthesis): refactor data synthesis models and update task handling logic

* fix(generation_service): ensure processed chunks are incremented regardless of question generation success

* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options

* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options

* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options

* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options
This commit is contained in:
Dallas98
2025-12-18 16:51:18 +08:00
committed by GitHub
parent 761f7f6a51
commit e0e9b1d94d
14 changed files with 1362 additions and 571 deletions

View File

@@ -1,66 +1,65 @@
import uuid
from xml.etree.ElementTree import tostring
from sqlalchemy import Column, String, Text, Integer, JSON, TIMESTAMP, ForeignKey, func
from sqlalchemy.orm import relationship
from sqlalchemy import Column, String, Text, Integer, JSON, TIMESTAMP, func
from app.db.session import Base
from app.module.generation.schema.generation import CreateSynthesisTaskRequest
async def save_synthesis_task(
    db_session, synthesis_task: CreateSynthesisTaskRequest
) -> "DataSynthInstance":
    """Persist a new data-synthesis task and return the stored row.

    Only the fields that exist on the `t_data_synth_instances` table are
    written. That table carries `synth_type` / `synth_config` but has no
    `model_id`, `text_split_config`, `source_file_id` or
    `result_data_location` columns, so those values are left to upper-layer
    logic or to other tables.

    Args:
        db_session: Async database session; it is used for `add`, and its
            `commit` and `refresh` methods are awaited.
        synthesis_task: Incoming task-creation request.

    Returns:
        The `DataSynthInstance` that was added, committed and refreshed.
    """
    gid = str(uuid.uuid4())

    # Map the request onto the table's column names:
    #   - synthesis type: request `synthesis_type` -> column `synth_type`
    #   - synthesis config: request `synth_config` -> column `synth_config`
    # NOTE(review): the earlier comment here said text_split_config and
    # synthesis_config are merged into synth_config, but this code reads
    # `synthesis_task.synth_config` directly - confirm the request schema
    # exposes that attribute.
    synth_task_instance = DataSynthInstance(
        id=gid,
        name=synthesis_task.name,
        description=synthesis_task.description,
        status="pending",
        synth_type=synthesis_task.synthesis_type.value,
        progress=0,
        synth_config=synthesis_task.synth_config.model_dump(),
        # `source_file_id` may be None/empty; count it as zero files then.
        total_files=len(synthesis_task.source_file_id or []),
        processed_files=0,
        total_chunks=0,
        processed_chunks=0,
        total_synth_data=0,
        created_at=func.now(),
        updated_at=func.now(),
        created_by="system",
        updated_by="system",
    )

    db_session.add(synth_task_instance)
    await db_session.commit()
    # Reload the row so the returned object reflects what was stored.
    await db_session.refresh(synth_task_instance)
    return synth_task_instance
class DataSynthesisInstance(Base):
"""数据合成任务表,对应表 t_data_synthesis_instances
class DataSynthInstance(Base):
"""数据合成任务表,对应表 t_data_synth_instances
create table if not exists t_data_synthesis_instances
create table if not exists t_data_synth_instances
(
id VARCHAR(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci PRIMARY KEY COMMENT 'UUID',
name VARCHAR(255) NOT NULL COMMENT '任务名称',
description TEXT COMMENT '任务描述',
status VARCHAR(20) COMMENT '任务状态',
synthesis_type VARCHAR(20) NOT NULL COMMENT '合成类型',
model_id VARCHAR(255) NOT NULL COMMENT '模型ID',
synth_type VARCHAR(20) NOT NULL COMMENT '合成类型',
progress INT DEFAULT 0 COMMENT '任务进度(百分比)',
result_data_location VARCHAR(1000) COMMENT '结果数据存储位',
text_split_config JSON NOT NULL COMMENT '文本切片配置',
synthesis_config JSON NOT NULL COMMENT '合成配置',
source_file_id JSON NOT NULL COMMENT '原始文件ID列表',
synth_config JSON NOT NULL COMMENT '合成配',
total_files INT DEFAULT 0 COMMENT '总文件数',
processed_files INT DEFAULT 0 COMMENT '已处理文件数',
total_chunks INT DEFAULT 0 COMMENT '总文本块数',
processed_chunks INT DEFAULT 0 COMMENT '已处理文本块数',
total_synthesis_data INT DEFAULT 0 COMMENT '总合成数据量',
total_synth_data INT DEFAULT 0 COMMENT '总合成数据量',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
created_by VARCHAR(255) COMMENT '创建者',
@@ -68,27 +67,29 @@ class DataSynthesisInstance(Base):
) COMMENT='数据合成任务表(UUID 主键)';
"""
__tablename__ = "t_data_synthesis_instances"
__tablename__ = "t_data_synth_instances"
id = Column(String(36), primary_key=True, index=True, comment="UUID")
name = Column(String(255), nullable=False, comment="任务名称")
description = Column(Text, nullable=True, comment="任务描述")
status = Column(String(20), nullable=True, comment="任务状态")
synthesis_type = Column(String(20), nullable=False, comment="合成类型")
model_id = Column(String(255), nullable=False, comment="模型ID")
# 与数据库字段保持一致:synth_type / synth_config
synth_type = Column(String(20), nullable=False, comment="合成类型")
progress = Column(Integer, nullable=False, default=0, comment="任务进度(百分比)")
result_data_location = Column(String(1000), nullable=True, comment="结果数据存储位")
text_split_config = Column(JSON, nullable=False, comment="文本切片配置")
synthesis_config = Column(JSON, nullable=False, comment="合成配置")
source_file_id = Column(JSON, nullable=False, comment="原始文件ID列表")
synth_config = Column(JSON, nullable=False, comment="合成配")
total_files = Column(Integer, nullable=False, default=0, comment="总文件数")
processed_files = Column(Integer, nullable=False, default=0, comment="已处理文件数")
total_chunks = Column(Integer, nullable=False, default=0, comment="总文本块数")
processed_chunks = Column(Integer, nullable=False, default=0, comment="已处理文本块数")
total_synthesis_data = Column(Integer, nullable=False, default=0, comment="总合成数据量")
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), nullable=True, comment="创建时间")
updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), nullable=True, comment="更新时间")
total_synth_data = Column(Integer, nullable=False, default=0, comment="总合成数据量")
created_at = Column(TIMESTAMP, nullable=False, default=func.now(), comment="创建时间")
updated_at = Column(
TIMESTAMP,
nullable=False,
default=func.now(),
onupdate=func.now(),
comment="更新时间",
)
created_by = Column(String(255), nullable=True, comment="创建者")
updated_by = Column(String(255), nullable=True, comment="更新者")
@@ -123,7 +124,7 @@ class DataSynthesisFileInstance(Base):
)
file_name = Column(String(255), nullable=False, comment="文件名")
source_file_id = Column(String(255), nullable=False, comment="原始文件ID")
target_file_location = Column(String(1000), nullable=False, comment="目标文件存储位置")
target_file_location = Column(String(1000), nullable=True, comment="目标文件存储位置")
status = Column(String(20), nullable=True, comment="任务状态")
total_chunks = Column(Integer, nullable=False, default=0, comment="总文本块数")
processed_chunks = Column(Integer, nullable=False, default=0, comment="已处理文本块数")