Files
DataMate/runtime/datamate-python/app/db/models/data_synthesis.py
Dallas98 e0e9b1d94d feat:问题生成过程优化及COT数据生成优化 (#169)
* fix(chart): update Helm chart helpers and values for improved configuration

* feat(SynthesisTaskTab): enhance task table with tooltip support and improved column widths

* feat(CreateTask, SynthFileTask): improve task creation and detail view with enhanced payload handling and UI updates

* feat(SynthFileTask): enhance file display with progress tracking and delete action

* feat(SynthFileTask): enhance file display with progress tracking and delete action

* feat(SynthDataDetail): add delete action for chunks with confirmation prompt

* feat(SynthDataDetail): update edit and delete buttons to icon-only format

* feat(SynthDataDetail): add confirmation modals for chunk and synthesis data deletion

* feat(DocumentSplitter): add enhanced document splitting functionality with CJK support and metadata detection

* feat(DataSynthesis): refactor data synthesis models and update task handling logic

* feat(DataSynthesis): streamline synthesis task handling and enhance chunk processing logic

* feat(DataSynthesis): refactor data synthesis models and update task handling logic

* fix(generation_service): ensure processed chunks are incremented regardless of question generation success

* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options

* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options

* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options

* feat(CreateTask): enhance task creation with new synthesis templates and improved configuration options
2025-12-18 16:51:18 +08:00

199 lines
8.7 KiB
Python

import uuid
from sqlalchemy import Column, String, Text, Integer, JSON, TIMESTAMP, func
from app.db.session import Base
from app.module.generation.schema.generation import CreateSynthesisTaskRequest
async def save_synthesis_task(db_session, synthesis_task: CreateSynthesisTaskRequest):
    """Persist a new data synthesis task and return the stored row.

    Only the fields that exist as columns on ``t_data_synth_instances``
    (``synth_type``, ``synth_config`` and the bookkeeping counters) are
    written. The table has no ``model_id``, ``text_split_config``,
    ``source_file_id`` or ``result_data_location`` columns, so that
    information is left to upper-layer logic or to other tables.

    Args:
        db_session: Async database session; ``commit`` and ``refresh`` are
            awaited on it.
        synthesis_task: Incoming request. The attributes read here are
            ``name``, ``description``, ``synthesis_type``, ``synth_config``
            and ``source_file_id``.

    Returns:
        The ``DataSynthInstance`` that was added, committed and refreshed.
    """
    task_id = str(uuid.uuid4())

    # Request field -> column mapping:
    #   synthesis_type (enum-like) -> synth_type, stored as its ``.value``
    #   synth_config (model)       -> synth_config, stored as ``model_dump()``
    # NOTE(review): presumably any text-split settings are already part of
    # ``synth_config`` -- confirm against the request schema.
    column_values = {
        "id": task_id,
        "name": synthesis_task.name,
        "description": synthesis_task.description,
        "status": "pending",
        "synth_type": synthesis_task.synthesis_type.value,
        "progress": 0,
        "synth_config": synthesis_task.synth_config.model_dump(),
        # A missing/empty source_file_id counts as zero files.
        "total_files": len(synthesis_task.source_file_id or []),
        "processed_files": 0,
        "total_chunks": 0,
        "processed_chunks": 0,
        "total_synth_data": 0,
        "created_at": func.now(),
        "updated_at": func.now(),
        "created_by": "system",
        "updated_by": "system",
    }
    record = DataSynthInstance(**column_values)

    db_session.add(record)
    await db_session.commit()
    # Reload the row so database-evaluated values (e.g. timestamps) are populated.
    await db_session.refresh(record)
    return record
class DataSynthInstance(Base):
    """ORM model for data synthesis tasks, mapped to ``t_data_synth_instances``.

    Reference DDL (column comments translated to English)::

        create table if not exists t_data_synth_instances
        (
            id VARCHAR(36) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci PRIMARY KEY COMMENT 'UUID',
            name VARCHAR(255) NOT NULL COMMENT 'task name',
            description TEXT COMMENT 'task description',
            status VARCHAR(20) COMMENT 'task status',
            synth_type VARCHAR(20) NOT NULL COMMENT 'synthesis type',
            progress INT DEFAULT 0 COMMENT 'task progress (percentage)',
            synth_config JSON NOT NULL COMMENT 'synthesis configuration',
            total_files INT DEFAULT 0 COMMENT 'total number of files',
            processed_files INT DEFAULT 0 COMMENT 'number of processed files',
            total_chunks INT DEFAULT 0 COMMENT 'total number of text chunks',
            processed_chunks INT DEFAULT 0 COMMENT 'number of processed text chunks',
            total_synth_data INT DEFAULT 0 COMMENT 'total amount of synthesized data',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT 'creation time',
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
            created_by VARCHAR(255) COMMENT 'creator',
            updated_by VARCHAR(255) COMMENT 'updater'
        ) COMMENT='Data synthesis task table (UUID primary key)';

    NOTE(review): the model marks the counters and timestamps as
    ``nullable=False`` while the DDL above does not declare them NOT NULL --
    confirm which side is authoritative.
    """

    __tablename__ = "t_data_synth_instances"

    # Identity and descriptive fields.
    id = Column(String(36), comment="UUID", primary_key=True, index=True)
    name = Column(String(255), comment="任务名称", nullable=False)
    description = Column(Text, comment="任务描述", nullable=True)
    status = Column(String(20), comment="任务状态", nullable=True)

    # Attribute names follow the database column names: synth_type / synth_config.
    synth_type = Column(String(20), comment="合成类型", nullable=False)
    progress = Column(
        Integer,
        comment="任务进度(百分比)",
        nullable=False,
        default=0,
    )
    synth_config = Column(JSON, comment="合成配置", nullable=False)

    # Progress counters; each one defaults to 0 on insert.
    total_files = Column(Integer, comment="总文件数", nullable=False, default=0)
    processed_files = Column(
        Integer,
        comment="已处理文件数",
        nullable=False,
        default=0,
    )
    total_chunks = Column(Integer, comment="总文本块数", nullable=False, default=0)
    processed_chunks = Column(
        Integer,
        comment="已处理文本块数",
        nullable=False,
        default=0,
    )
    total_synth_data = Column(
        Integer,
        comment="总合成数据量",
        nullable=False,
        default=0,
    )

    # Audit fields. The timestamp defaults are SQL ``now()`` expressions
    # supplied by SQLAlchemy on insert (and on update for ``updated_at``).
    created_at = Column(
        TIMESTAMP,
        comment="创建时间",
        nullable=False,
        default=func.now(),
    )
    updated_at = Column(
        TIMESTAMP,
        comment="更新时间",
        nullable=False,
        default=func.now(),
        onupdate=func.now(),
    )
    created_by = Column(String(255), comment="创建者", nullable=True)
    updated_by = Column(String(255), comment="更新者", nullable=True)
class DataSynthesisFileInstance(Base):
    """ORM model for per-file synthesis tasks, mapped to ``t_data_synthesis_file_instances``.

    Reference DDL (column comments translated to English)::

        create table if not exists t_data_synthesis_file_instances (
            id VARCHAR(36) PRIMARY KEY COMMENT 'UUID',
            synthesis_instance_id VARCHAR(36) COMMENT 'data synthesis task ID',
            file_name VARCHAR(255) NOT NULL COMMENT 'file name',
            source_file_id VARCHAR(255) NOT NULL COMMENT 'original file ID',
            target_file_location VARCHAR(1000) NOT NULL COMMENT 'target file storage location',
            status VARCHAR(20) COMMENT 'task status',
            total_chunks INT DEFAULT 0 COMMENT 'total number of text chunks',
            processed_chunks INT DEFAULT 0 COMMENT 'number of processed text chunks',
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT 'creation time',
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
            created_by VARCHAR(255) COMMENT 'creator',
            updated_by VARCHAR(255) COMMENT 'updater'
        ) COMMENT='Data synthesis file task table (UUID primary key)';

    NOTE(review): nullability differs from the DDL above in two places --
    ``synthesis_instance_id`` is ``nullable=False`` here but has no NOT NULL
    in the DDL, and ``target_file_location`` is ``nullable=True`` here but
    NOT NULL in the DDL. Confirm which side is authoritative.
    """

    __tablename__ = "t_data_synthesis_file_instances"

    id = Column(String(36), comment="UUID", primary_key=True, index=True)

    # ID of the owning synthesis task; indexed, no ForeignKey is declared here.
    synthesis_instance_id = Column(
        String(36),
        comment="数据合成任务ID",
        nullable=False,
        index=True,
    )

    # File identity and output location.
    file_name = Column(String(255), comment="文件名", nullable=False)
    source_file_id = Column(String(255), comment="原始文件ID", nullable=False)
    target_file_location = Column(
        String(1000),
        comment="目标文件存储位置",
        nullable=True,
    )

    # Processing state and chunk counters (counters default to 0 on insert).
    status = Column(String(20), comment="任务状态", nullable=True)
    total_chunks = Column(Integer, comment="总文本块数", nullable=False, default=0)
    processed_chunks = Column(
        Integer,
        comment="已处理文本块数",
        nullable=False,
        default=0,
    )

    # Audit fields. Unlike a Python-side ``default``, ``server_default`` puts
    # the CURRENT_TIMESTAMP default into the column definition itself.
    created_at = Column(
        TIMESTAMP,
        comment="创建时间",
        nullable=True,
        server_default=func.current_timestamp(),
    )
    updated_at = Column(
        TIMESTAMP,
        comment="更新时间",
        nullable=True,
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),
    )
    created_by = Column(String(255), comment="创建者", nullable=True)
    updated_by = Column(String(255), comment="更新者", nullable=True)
class DataSynthesisChunkInstance(Base):
    """ORM model for per-chunk synthesis tasks, mapped to ``t_data_synthesis_chunk_instances``.

    Reference DDL (column comments translated to English)::

        create table if not exists t_data_synthesis_chunk_instances (
            id VARCHAR(36) PRIMARY KEY COMMENT 'UUID',
            synthesis_file_instance_id VARCHAR(36) COMMENT 'data synthesis file task ID',
            chunk_index INT COMMENT 'chunk index',
            chunk_content TEXT COMMENT 'chunk content',
            metadata JSON COMMENT 'chunk metadata'
        ) COMMENT='Data synthesis chunk task table (UUID primary key)';
    """

    __tablename__ = "t_data_synthesis_chunk_instances"

    id = Column(String(36), comment="UUID", primary_key=True, index=True)

    # ID of the owning file task; indexed, no ForeignKey is declared here.
    synthesis_file_instance_id = Column(
        String(36),
        comment="数据合成文件任务ID",
        nullable=False,
        index=True,
    )

    chunk_index = Column(Integer, comment="分块索引", nullable=True)
    chunk_content = Column(Text, comment="分块内容", nullable=True)

    # SQLAlchemy Declarative reserves the attribute name ``metadata``, so the
    # Python attribute is ``chunk_metadata`` while the underlying column is
    # still named 'metadata' to stay compatible with the table schema.
    chunk_metadata = Column(
        "metadata",
        JSON,
        comment="分块元数据",
        nullable=True,
    )
class SynthesisData(Base):
    """ORM model for synthesized result rows, mapped to ``t_data_synthesis_data``.

    Reference DDL (column comments translated to English)::

        create table if not exists t_synthesis_data (
            id VARCHAR(36) PRIMARY KEY COMMENT 'UUID',
            data json COMMENT 'synthesized data',
            synthesis_file_instance_id VARCHAR(36) COMMENT 'data synthesis file task ID',
            chunk_instance_id VARCHAR(36) COMMENT 'chunk task ID'
        ) COMMENT='Data synthesis task queue table (UUID primary key)';

    NOTE(review): the DDL above names the table ``t_synthesis_data`` whereas
    ``__tablename__`` is ``t_data_synthesis_data`` -- confirm which name the
    database actually uses.
    """

    __tablename__ = "t_data_synthesis_data"

    id = Column(String(36), comment="UUID", primary_key=True, index=True)
    data = Column(JSON, comment="合成的数据", nullable=True)

    # IDs of the file task and chunk task this row refers to; both are
    # indexed, and no ForeignKey is declared here.
    synthesis_file_instance_id = Column(
        String(36),
        comment="数据合成文件任务ID",
        nullable=False,
        index=True,
    )
    chunk_instance_id = Column(
        String(36),
        comment="分块任务ID",
        nullable=False,
        index=True,
    )