Files
DataMate/runtime/datamate-python/app/db/models/annotation_management.py
Jerry Yan 8ffa131fad feat(annotation): 自动标注任务支持非图像类型数据集(TEXT/AUDIO/VIDEO)
移除自动标注任务创建流程中的 IMAGE-only 限制,使 TEXT、AUDIO、VIDEO
类型数据集均可用于自动标注任务。

- 新增数据库迁移:t_dm_auto_annotation_tasks 表添加 dataset_type 列
- 后端 schema/API/service 全链路传递 dataset_type
- Worker 动态构建 sample key(image/text/audio/video)和输出目录
- 前端移除数据集类型校验,下拉框显示数据集类型标识
- 输出数据集继承源数据集类型,不再硬编码为 IMAGE
- 保持向后兼容:默认值为 IMAGE,worker 有元数据回退和目录 fallback

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 23:23:05 +08:00

304 lines
11 KiB
Python

"""Tables of Annotation Management Module"""
import uuid
from sqlalchemy import (
Column,
String,
Boolean,
TIMESTAMP,
Text,
Integer,
JSON,
ForeignKey,
UniqueConstraint,
Index,
BigInteger,
)
from sqlalchemy.sql import func
from app.db.session import Base
# Canonical annotation status strings stored in the annotation_status column.
ANNOTATION_STATUS_ANNOTATED = "ANNOTATED"
ANNOTATION_STATUS_NO_ANNOTATION = "NO_ANNOTATION"
ANNOTATION_STATUS_NOT_APPLICABLE = "NOT_APPLICABLE"
ANNOTATION_STATUS_IN_PROGRESS = "IN_PROGRESS"

# Client-facing subset: every status except IN_PROGRESS.
ANNOTATION_STATUS_CLIENT_VALUES = {
    ANNOTATION_STATUS_ANNOTATED,
    ANNOTATION_STATUS_NO_ANNOTATION,
    ANNOTATION_STATUS_NOT_APPLICABLE,
}

# Full set of known statuses; the union builds a new, independent set object.
ANNOTATION_STATUS_VALUES = ANNOTATION_STATUS_CLIENT_VALUES | {
    ANNOTATION_STATUS_IN_PROGRESS,
}
class AnnotationTemplate(Base):
    """Annotation configuration template (table ``t_dm_annotation_templates``).

    ``deleted_at`` acts as the soft-delete marker; see :attr:`is_deleted`.
    """

    __tablename__ = "t_dm_annotation_templates"

    # Primary key; a UUID4 string is generated when no ID is supplied.
    id = Column(
        String(64),
        comment="模板ID(UUID或自定义ID)",
        default=lambda: str(uuid.uuid4()),
        primary_key=True,
    )

    # Descriptive fields.
    name = Column(String(100), comment="模板名称", nullable=False)
    description = Column(String(500), comment="模板描述", nullable=True)
    data_type = Column(
        String(50),
        comment="数据类型: image/text/audio/video/timeseries/pdf/chat/html/table",
        nullable=False,
    )
    labeling_type = Column(
        String(50),
        comment="标注类型: asr/ner/object-detection/等",
        nullable=False,
    )

    # Configuration payloads: per the DB comments, ``label_config`` is the
    # primary (Label Studio XML) config and ``configuration`` is kept for
    # compatibility.
    configuration = Column(
        JSON,
        comment="标注配置(兼容字段,主配置为 label_config)",
        nullable=True,
    )
    label_config = Column(
        Text,
        comment="Label Studio XML配置(模板主配置)",
        nullable=True,
    )

    # Presentation and classification metadata.
    style = Column(
        String(32),
        comment="样式配置: horizontal/vertical",
        nullable=False,
    )
    category = Column(
        String(50),
        comment="模板分类: audio-speech/chat/computer-vision/nlp/等",
        default="custom",
    )
    built_in = Column(Boolean, comment="是否系统内置模板", default=False)
    version = Column(String(20), comment="模板版本", default="1.0")

    # Audit timestamps; creation/update defaults are evaluated by the database.
    created_at = Column(
        TIMESTAMP,
        comment="创建时间",
        server_default=func.current_timestamp(),
    )
    updated_at = Column(
        TIMESTAMP,
        comment="更新时间",
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),
    )
    deleted_at = Column(TIMESTAMP, comment="删除时间(软删除)", nullable=True)

    def __repr__(self):
        """Return a short debug representation with id, name and data type."""
        details = f"id={self.id}, name={self.name}, data_type={self.data_type}"
        return "<AnnotationTemplate(" + details + ")>"

    @property
    def is_deleted(self) -> bool:
        """Whether the row is soft-deleted, i.e. ``deleted_at`` is set."""
        deletion_marker = self.deleted_at
        return deletion_marker is not None
class LabelingProject(Base):
    """Labeling project (table ``t_dm_labeling_projects``).

    ``deleted_at`` acts as the soft-delete marker; see :attr:`is_deleted`.
    """

    __tablename__ = "t_dm_labeling_projects"

    # Primary key; a UUID4 string generated on the Python side.
    id = Column(
        String(36),
        comment="UUID",
        default=lambda: str(uuid.uuid4()),
        primary_key=True,
    )

    # Identity of the project and the dataset it belongs to.
    dataset_id = Column(String(36), comment="数据集ID", nullable=False)
    name = Column(String(100), comment="项目名称", nullable=False)
    labeling_project_id = Column(
        String(8),
        comment="Label Studio项目ID",
        nullable=False,
    )

    # Optional link to the template in use; the foreign key is declared with
    # ON DELETE SET NULL.
    template_id = Column(
        String(64),
        ForeignKey("t_dm_annotation_templates.id", ondelete="SET NULL"),
        comment="使用的模板ID",
        nullable=True,
    )

    # Free-form JSON payloads.
    configuration = Column(
        JSON,
        comment="项目配置(可能包含对模板的自定义修改)",
        nullable=True,
    )
    progress = Column(JSON, comment="项目进度信息", nullable=True)

    # Audit timestamps; creation/update defaults are evaluated by the database.
    created_at = Column(
        TIMESTAMP,
        comment="创建时间",
        server_default=func.current_timestamp(),
    )
    updated_at = Column(
        TIMESTAMP,
        comment="更新时间",
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),
    )
    deleted_at = Column(TIMESTAMP, comment="删除时间(软删除)", nullable=True)

    def __repr__(self):
        """Return a short debug representation with id, name and dataset id."""
        details = f"id={self.id}, name={self.name}, dataset_id={self.dataset_id}"
        return "<LabelingProject(" + details + ")>"

    @property
    def is_deleted(self) -> bool:
        """Whether the row is soft-deleted, i.e. ``deleted_at`` is set."""
        deletion_marker = self.deleted_at
        return deletion_marker is not None
class LabelingProjectFile(Base):
    """Snapshot of a file attached to a labeling project.

    Stored in ``t_dm_labeling_project_files``; each (project_id, file_id)
    pair is unique.
    """

    __tablename__ = "t_dm_labeling_project_files"

    # Uniqueness of the pair plus single-column lookup indexes.
    __table_args__ = (
        UniqueConstraint("project_id", "file_id", name="uk_project_file"),
        Index("idx_project_id", "project_id"),
        Index("idx_file_id", "file_id"),
    )

    # Primary key; a UUID4 string generated on the Python side.
    id = Column(
        String(36),
        comment="UUID",
        default=lambda: str(uuid.uuid4()),
        primary_key=True,
    )
    project_id = Column(String(36), comment="标注项目ID", nullable=False)
    file_id = Column(String(36), comment="文件ID", nullable=False)

    # Creation timestamp; the default is evaluated by the database.
    created_at = Column(
        TIMESTAMP,
        comment="创建时间",
        server_default=func.current_timestamp(),
    )

    def __repr__(self):
        """Return a short debug representation with id, project and file ids."""
        details = (
            f"id={self.id}, project_id={self.project_id}, file_id={self.file_id}"
        )
        return "<LabelingProjectFile(" + details + ")>"
class AnnotationResult(Base):
    """Annotation result (table ``t_dm_annotation_results``).

    Holds the raw Label Studio annotation JSON; per the original description
    this is a single final label set from one annotator.
    """

    __tablename__ = "t_dm_annotation_results"

    # Primary key; a UUID4 string generated on the Python side.
    id = Column(
        String(36),
        comment="UUID",
        default=lambda: str(uuid.uuid4()),
        primary_key=True,
    )

    # References to the project and file; these are plain string columns,
    # no ForeignKey is declared on them.
    project_id = Column(
        String(36),
        comment="标注项目ID(t_dm_labeling_projects.id)",
        nullable=False,
    )
    file_id = Column(
        String(36),
        comment="文件ID(t_dm_dataset_files.id)",
        nullable=False,
    )

    # Annotation payload and its status (defaults to ANNOTATED on insert).
    annotation = Column(
        JSON,
        comment="Label Studio annotation 原始JSON(单人单份最终结果)",
        nullable=False,
    )
    annotation_status = Column(
        String(32),
        comment="标注状态: ANNOTATED/NO_ANNOTATION/NOT_APPLICABLE/IN_PROGRESS",
        default=ANNOTATION_STATUS_ANNOTATED,
        nullable=False,
    )
    file_version = Column(BigInteger, comment="标注时的文件版本号", nullable=True)

    # Audit timestamps; creation/update defaults are evaluated by the database.
    created_at = Column(
        TIMESTAMP,
        comment="创建时间",
        server_default=func.current_timestamp(),
    )
    updated_at = Column(
        TIMESTAMP,
        comment="更新时间",
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),
    )

    def __repr__(self):
        """Return a short debug representation with id, project and file ids."""
        details = (
            f"id={self.id}, project_id={self.project_id}, file_id={self.file_id}"
        )
        return "<AnnotationResult(" + details + ")>"
class AutoAnnotationTask(Base):
    """Auto-annotation task (table ``t_dm_auto_annotation_tasks``).

    ``deleted_at`` acts as the soft-delete marker; see :attr:`is_deleted`.
    """

    __tablename__ = "t_dm_auto_annotation_tasks"

    # Primary key; a UUID4 string generated on the Python side.
    id = Column(
        String(36),
        comment="UUID",
        default=lambda: str(uuid.uuid4()),
        primary_key=True,
    )

    # Task identity and source dataset. ``dataset_name`` is a denormalised
    # copy kept for convenient querying (per its DB comment).
    name = Column(String(255), comment="任务名称", nullable=False)
    dataset_id = Column(String(36), comment="数据集ID", nullable=False)
    dataset_name = Column(
        String(255),
        comment="数据集名称(冗余字段,方便查询)",
        nullable=True,
    )
    # Dataset modality; the Python-side default is "IMAGE".
    dataset_type = Column(
        String(50),
        comment="数据集类型: IMAGE/TEXT/AUDIO/VIDEO",
        default="IMAGE",
        nullable=False,
    )
    created_by = Column(String(255), comment="任务创建人", nullable=True)

    # Task configuration and optional file subset.
    config = Column(JSON, comment="任务配置(模型规模、置信度等)", nullable=False)
    # NOTE(review): the DB comment below still says "all images" although
    # dataset_type also lists TEXT/AUDIO/VIDEO — confirm and align via migration.
    file_ids = Column(
        JSON,
        comment="要处理的文件ID列表,为空则处理数据集所有图像",
        nullable=True,
    )

    # Execution state and routing.
    status = Column(
        String(50),
        comment="任务状态: pending/running/completed/failed/stopped",
        default="pending",
        nullable=False,
    )
    task_mode = Column(
        String(32),
        comment="任务模式: legacy_yolo/pipeline",
        default="legacy_yolo",
        nullable=False,
    )
    executor_type = Column(
        String(32),
        comment="执行器类型",
        default="annotation_local",
        nullable=False,
    )
    pipeline = Column(JSON, comment="算子编排定义", nullable=True)
    progress = Column(Integer, comment="任务进度 0-100", default=0)
    stop_requested = Column(Boolean, comment="是否请求停止", default=False)

    # Counters.
    # NOTE(review): names and DB comments refer to images; presumably they
    # count generic samples for non-image datasets — confirm with the worker.
    total_images = Column(Integer, comment="总图片数", default=0)
    processed_images = Column(Integer, comment="已处理图片数", default=0)
    detected_objects = Column(Integer, comment="检测到的对象总数", default=0)

    # Outputs and error reporting.
    output_path = Column(String(500), comment="输出路径", nullable=True)
    output_dataset_id = Column(String(36), comment="输出数据集ID", nullable=True)
    error_message = Column(Text, comment="错误信息", nullable=True)

    # Audit timestamps; creation/update defaults are evaluated by the database.
    created_at = Column(
        TIMESTAMP,
        comment="创建时间",
        server_default=func.current_timestamp(),
    )
    updated_at = Column(
        TIMESTAMP,
        comment="更新时间",
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),
    )

    # Run lifecycle bookkeeping (start, worker heartbeat, run token, finish).
    started_at = Column(TIMESTAMP, comment="任务启动时间", nullable=True)
    heartbeat_at = Column(TIMESTAMP, comment="worker心跳时间", nullable=True)
    run_token = Column(String(64), comment="运行令牌", nullable=True)
    completed_at = Column(TIMESTAMP, comment="完成时间", nullable=True)
    deleted_at = Column(TIMESTAMP, comment="删除时间(软删除)", nullable=True)

    def __repr__(self) -> str:  # pragma: no cover - trivial repr
        """Return a short debug representation with id, name and status."""
        details = f"id={self.id}, name={self.name}, status={self.status}"
        return "<AutoAnnotationTask(" + details + ")>"

    @property
    def is_deleted(self) -> bool:
        """Whether the row is soft-deleted, i.e. ``deleted_at`` is set."""
        deletion_marker = self.deleted_at
        return deletion_marker is not None
class AnnotationTaskOperatorInstance(Base):
    """Operator instance inside an auto-annotation task.

    Stored in ``t_dm_annotation_task_operator_instance``; each
    (task_id, op_index) pair is unique.
    """

    __tablename__ = "t_dm_annotation_task_operator_instance"

    # Auto-incrementing surrogate key.
    id = Column(BigInteger, primary_key=True, autoincrement=True, comment="自增主键")

    # Owning task and position of the operator in the task; the DB comment
    # states that op_index starts at 1.
    task_id = Column(String(36), nullable=False, comment="自动标注任务ID")
    op_index = Column(Integer, nullable=False, comment="算子顺序(从1开始)")
    operator_id = Column(String(64), nullable=False, comment="算子ID(raw_id)")

    # Task-level overrides of operator parameters and declared I/O modalities.
    settings_override = Column(JSON, nullable=True, comment="任务级算子参数覆盖")
    inputs = Column(String(64), nullable=True, comment="输入模态")
    outputs = Column(String(64), nullable=True, comment="输出模态")

    # Audit timestamps; creation/update defaults are evaluated by the database.
    created_at = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        nullable=False,
        comment="创建时间",
    )
    updated_at = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),
        nullable=False,
        comment="更新时间",
    )

    __table_args__ = (
        UniqueConstraint("task_id", "op_index", name="uk_task_op_index"),
        Index("idx_task_id", "task_id"),
        Index("idx_operator_id", "operator_id"),
    )

    def __repr__(self) -> str:
        """Return a short debug representation, matching the sibling models."""
        return (
            f"<AnnotationTaskOperatorInstance(id={self.id}, "
            f"task_id={self.task_id}, op_index={self.op_index}, "
            f"operator_id={self.operator_id})>"
        )