You've already forked DataMate
## 功能概述
将数据标注模块从固定 YOLO 算子改造为支持通用算子编排,实现与数据清洗模块类似的灵活算子组合能力。
## 改动内容
### 第 1 步:数据库改造(DDL)
- 新增 SQL migration 脚本:scripts/db/data-annotation-operator-pipeline-migration.sql
- 修改 `t_dm_auto_annotation_tasks` 表:
  - 新增字段:task_mode, executor_type, pipeline, output_dataset_id, created_by, stop_requested, started_at, heartbeat_at, run_token
  - 新增索引:idx_status_created, idx_created_by
- 创建 t_dm_annotation_task_operator_instance 表:用于存储算子实例详情
### 第 2 步:API 层改造
- 扩展请求模型(schema/auto.py):
  - 新增 OperatorPipelineStep 模型
  - 支持 pipeline 字段,保留旧 YOLO 字段向后兼容
  - 实现多写法归一(operatorId/operator_id/id, overrides/settingsOverride/settings_override)
- 修改任务创建服务(service/auto.py):
  - 新增 `validate_file_ids()` 校验方法
  - 新增 `_to_pipeline()` 兼容映射方法
  - 写入新字段并集成算子实例表
  - 修复 fileIds 去重准确性问题
- 新增 API 路由(interface/auto.py):
  - 新增 `/operator-tasks` 系列接口
  - 新增 stop API 接口(`/auto/{id}/stop` 和 `/operator-tasks/{id}/stop`)
  - 保留旧 `/auto` 接口向后兼容
- ORM 模型对齐(annotation_management.py):
  - AutoAnnotationTask 新增所有 DDL 字段
  - 新增 AnnotationTaskOperatorInstance 模型
  - 状态定义补充 stopped
### 第 3 步:Runtime 层改造
- 修改 worker 执行逻辑(auto_annotation_worker.py):
  - 实现原子任务抢占机制(run_token)
  - 从硬编码 YOLO 改为通用 pipeline 执行
  - 新增算子解析和实例化能力
  - 支持 stop_requested 检查
  - 保留 legacy_yolo 模式向后兼容
  - 支持多种算子调用方式(`execute` 和 `__call__`)
### 第 4 步:灰度发布
- 完善 YOLO 算子元数据(metadata.yml):
  - 补齐 raw_id, language, modal, inputs, outputs, settings 字段
- 注册标注算子(`__init__.py`):
  - 将 YOLO 算子注册到 OPERATORS 注册表
  - 确保 annotation 包被正确加载
- 新增白名单控制:
  - 支持环境变量 `AUTO_ANNOTATION_OPERATOR_WHITELIST`
  - 灰度发布时可限制可用算子
## 关键特性
### 向后兼容
- 旧 /auto 接口完全保留
- 旧请求参数自动映射到 pipeline
- legacy_yolo 模式确保旧逻辑正常运行
### 新功能
- 支持通用 pipeline 编排
- 支持多算子组合
- 支持任务停止控制
- 支持白名单灰度发布
### 可靠性
- 原子任务抢占(防止重复执行)
- 完整的错误处理和状态管理
- 详细的审计追踪(算子实例表)
## 部署说明
1. 执行 DDL:mysql < scripts/db/data-annotation-operator-pipeline-migration.sql
2. 配置环境变量:AUTO_ANNOTATION_OPERATOR_WHITELIST=ImageObjectDetectionBoundingBox
3. 重启服务:datamate-runtime 和 datamate-backend-python
## 验证步骤
1. 兼容模式验证:使用旧 /auto 接口创建任务
2. 通用编排验证:使用新 /operator-tasks 接口创建 pipeline 任务
3. 原子 claim 验证:检查 run_token 机制
4. 停止验证:测试 stop API
5. 白名单验证:测试算子白名单拦截
## 相关文件
- DDL: scripts/db/data-annotation-operator-pipeline-migration.sql
- API: runtime/datamate-python/app/module/annotation/
- Worker: runtime/python-executor/datamate/auto_annotation_worker.py
- 算子: runtime/ops/annotation/image_object_detection_bounding_box/
300 lines
11 KiB
Python
300 lines
11 KiB
Python
"""Tables of Annotation Management Module"""
|
|
|
|
import uuid
|
|
from sqlalchemy import (
|
|
Column,
|
|
String,
|
|
Boolean,
|
|
TIMESTAMP,
|
|
Text,
|
|
Integer,
|
|
JSON,
|
|
ForeignKey,
|
|
UniqueConstraint,
|
|
Index,
|
|
BigInteger,
|
|
)
|
|
from sqlalchemy.sql import func
|
|
|
|
from app.db.session import Base
|
|
|
|
# Annotation status literals. The same four values are listed in the column
# comment of AnnotationResult.annotation_status.
ANNOTATION_STATUS_ANNOTATED = "ANNOTATED"
ANNOTATION_STATUS_NO_ANNOTATION = "NO_ANNOTATION"
ANNOTATION_STATUS_NOT_APPLICABLE = "NOT_APPLICABLE"
ANNOTATION_STATUS_IN_PROGRESS = "IN_PROGRESS"

# Every status literal defined above.
ANNOTATION_STATUS_VALUES = {
    ANNOTATION_STATUS_ANNOTATED,
    ANNOTATION_STATUS_NO_ANNOTATION,
    ANNOTATION_STATUS_NOT_APPLICABLE,
    ANNOTATION_STATUS_IN_PROGRESS,
}

# All statuses except IN_PROGRESS.
# NOTE(review): the name suggests these are the values a client may submit —
# confirm against the callers that validate incoming requests.
ANNOTATION_STATUS_CLIENT_VALUES = {
    ANNOTATION_STATUS_ANNOTATED,
    ANNOTATION_STATUS_NO_ANNOTATION,
    ANNOTATION_STATUS_NOT_APPLICABLE,
}
|
|
|
|
|
|
class AnnotationTemplate(Base):
    """Annotation configuration template, mapped to ``t_dm_annotation_templates``.

    Rows carry a nullable ``deleted_at`` timestamp for soft deletion; use
    ``is_deleted`` to test it.
    """

    __tablename__ = "t_dm_annotation_templates"

    # Defaults to a random UUID4 string; the column comment notes that a
    # custom ID may be stored instead.
    id = Column(
        String(64),
        primary_key=True,
        default=lambda: str(uuid.uuid4()),
        comment="模板ID(UUID或自定义ID)",
    )
    name = Column(String(100), nullable=False, comment="模板名称")
    description = Column(String(500), nullable=True, comment="模板描述")
    data_type = Column(
        String(50),
        nullable=False,
        comment="数据类型: image/text/audio/video/timeseries/pdf/chat/html/table",
    )
    labeling_type = Column(
        String(50), nullable=False, comment="标注类型: asr/ner/object-detection/等"
    )
    # Compatibility field: per the column comments, label_config below is the
    # template's primary configuration.
    configuration = Column(
        JSON, nullable=True, comment="标注配置(兼容字段,主配置为 label_config)"
    )
    label_config = Column(
        Text, nullable=True, comment="Label Studio XML配置(模板主配置)"
    )
    style = Column(String(32), nullable=False, comment="样式配置: horizontal/vertical")
    category = Column(
        String(50),
        default="custom",
        comment="模板分类: audio-speech/chat/computer-vision/nlp/等",
    )
    built_in = Column(Boolean, default=False, comment="是否系统内置模板")
    version = Column(String(20), default="1.0", comment="模板版本")
    created_at = Column(
        TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间"
    )
    # server_default fills the value on INSERT; onupdate refreshes it on UPDATE.
    updated_at = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),
        comment="更新时间",
    )
    # NULL means the row is live; a timestamp marks it as soft-deleted.
    deleted_at = Column(TIMESTAMP, nullable=True, comment="删除时间(软删除)")

    def __repr__(self) -> str:
        return f"<AnnotationTemplate(id={self.id}, name={self.name}, data_type={self.data_type})>"

    @property
    def is_deleted(self) -> bool:
        """Return True when the row has been soft-deleted (``deleted_at`` is set)."""
        return self.deleted_at is not None
|
|
|
|
|
|
class LabelingProject(Base):
    """Labeling project, mapped to ``t_dm_labeling_projects``.

    Links a dataset to a Label Studio project and, optionally, to the
    annotation template it was created from. Rows carry a nullable
    ``deleted_at`` timestamp for soft deletion.
    """

    __tablename__ = "t_dm_labeling_projects"

    id = Column(
        String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID"
    )
    dataset_id = Column(String(36), nullable=False, comment="数据集ID")
    name = Column(String(100), nullable=False, comment="项目名称")
    # Label Studio's own project identifier, stored as a short string.
    labeling_project_id = Column(
        String(8), nullable=False, comment="Label Studio项目ID"
    )
    # ondelete="SET NULL": deleting the template row clears this reference
    # rather than removing the project.
    template_id = Column(
        String(64),
        ForeignKey("t_dm_annotation_templates.id", ondelete="SET NULL"),
        nullable=True,
        comment="使用的模板ID",
    )
    # Per the column comment, may contain project-specific changes to the template.
    configuration = Column(
        JSON, nullable=True, comment="项目配置(可能包含对模板的自定义修改)"
    )
    progress = Column(JSON, nullable=True, comment="项目进度信息")
    created_at = Column(
        TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间"
    )
    # server_default fills the value on INSERT; onupdate refreshes it on UPDATE.
    updated_at = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),
        comment="更新时间",
    )
    # NULL means the row is live; a timestamp marks it as soft-deleted.
    deleted_at = Column(TIMESTAMP, nullable=True, comment="删除时间(软删除)")

    def __repr__(self) -> str:
        return f"<LabelingProject(id={self.id}, name={self.name}, dataset_id={self.dataset_id})>"

    @property
    def is_deleted(self) -> bool:
        """Return True when the row has been soft-deleted (``deleted_at`` is set)."""
        return self.deleted_at is not None
|
|
|
|
|
|
class LabelingProjectFile(Base):
    """Snapshot of a file attached to a labeling project.

    Mapped to ``t_dm_labeling_project_files``. A (project_id, file_id) pair
    may appear at most once.
    """

    __tablename__ = "t_dm_labeling_project_files"

    id = Column(
        String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID"
    )
    # Plain string references: no ForeignKey is declared at the ORM level.
    project_id = Column(String(36), nullable=False, comment="标注项目ID")
    file_id = Column(String(36), nullable=False, comment="文件ID")
    created_at = Column(
        TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间"
    )

    __table_args__ = (
        # One row per file per project.
        UniqueConstraint("project_id", "file_id", name="uk_project_file"),
        Index("idx_project_id", "project_id"),
        Index("idx_file_id", "file_id"),
    )

    def __repr__(self) -> str:
        return f"<LabelingProjectFile(id={self.id}, project_id={self.project_id}, file_id={self.file_id})>"
|
|
|
|
|
|
class AnnotationResult(Base):
    """Annotation result, mapped to ``t_dm_annotation_results``.

    Stores the raw Label Studio annotation JSON for a file within a project
    (described by the column comment as one final result per annotator).
    """

    __tablename__ = "t_dm_annotation_results"

    id = Column(
        String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID"
    )
    # Plain string references to t_dm_labeling_projects.id and
    # t_dm_dataset_files.id (per the column comments); no ForeignKey is
    # declared at the ORM level.
    project_id = Column(
        String(36), nullable=False, comment="标注项目ID(t_dm_labeling_projects.id)"
    )
    file_id = Column(
        String(36), nullable=False, comment="文件ID(t_dm_dataset_files.id)"
    )
    annotation = Column(
        JSON,
        nullable=False,
        comment="Label Studio annotation 原始JSON(单人单份最终结果)",
    )
    # Defaults to ANNOTATED; the column comment lists the four accepted values.
    annotation_status = Column(
        String(32),
        nullable=False,
        default=ANNOTATION_STATUS_ANNOTATED,
        comment="标注状态: ANNOTATED/NO_ANNOTATION/NOT_APPLICABLE/IN_PROGRESS",
    )
    # Version number of the file at annotation time (per the column comment).
    file_version = Column(BigInteger, nullable=True, comment="标注时的文件版本号")
    created_at = Column(
        TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间"
    )
    # server_default fills the value on INSERT; onupdate refreshes it on UPDATE.
    updated_at = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),
        comment="更新时间",
    )

    def __repr__(self) -> str:
        return f"<AnnotationResult(id={self.id}, project_id={self.project_id}, file_id={self.file_id})>"
|
|
|
|
|
|
class AutoAnnotationTask(Base):
    """Auto-annotation task, mapped to ``t_dm_auto_annotation_tasks``.

    Holds the task definition (dataset, config, optional operator pipeline),
    its execution state (status, progress counters, stop flag, start and
    heartbeat timestamps, run token) and its output location. Rows carry a
    nullable ``deleted_at`` timestamp for soft deletion.
    """

    __tablename__ = "t_dm_auto_annotation_tasks"

    id = Column(
        String(36), primary_key=True, default=lambda: str(uuid.uuid4()), comment="UUID"
    )
    name = Column(String(255), nullable=False, comment="任务名称")
    dataset_id = Column(String(36), nullable=False, comment="数据集ID")
    # Denormalised copy of the dataset name, kept for query convenience.
    dataset_name = Column(
        String(255), nullable=True, comment="数据集名称(冗余字段,方便查询)"
    )
    created_by = Column(String(255), nullable=True, comment="任务创建人")
    config = Column(JSON, nullable=False, comment="任务配置(模型规模、置信度等)")
    # Per the column comment: when empty, every image in the dataset is processed.
    file_ids = Column(
        JSON, nullable=True, comment="要处理的文件ID列表,为空则处理数据集所有图像"
    )
    status = Column(
        String(50),
        nullable=False,
        default="pending",
        comment="任务状态: pending/running/completed/failed/stopped",
    )
    # Defaults to "legacy_yolo"; the column comment lists "pipeline" as the
    # other mode.
    task_mode = Column(
        String(32),
        nullable=False,
        default="legacy_yolo",
        comment="任务模式: legacy_yolo/pipeline",
    )
    executor_type = Column(
        String(32),
        nullable=False,
        default="annotation_local",
        comment="执行器类型",
    )
    # Operator orchestration definition (JSON).
    pipeline = Column(JSON, nullable=True, comment="算子编排定义")
    progress = Column(Integer, default=0, comment="任务进度 0-100")
    # Flag recording that a stop was requested for this task.
    stop_requested = Column(Boolean, default=False, comment="是否请求停止")
    total_images = Column(Integer, default=0, comment="总图片数")
    processed_images = Column(Integer, default=0, comment="已处理图片数")
    detected_objects = Column(Integer, default=0, comment="检测到的对象总数")
    output_path = Column(String(500), nullable=True, comment="输出路径")
    output_dataset_id = Column(String(36), nullable=True, comment="输出数据集ID")
    error_message = Column(Text, nullable=True, comment="错误信息")
    created_at = Column(
        TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间"
    )
    # server_default fills the value on INSERT; onupdate refreshes it on UPDATE.
    updated_at = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),
        comment="更新时间",
    )
    started_at = Column(TIMESTAMP, nullable=True, comment="任务启动时间")
    # Worker heartbeat timestamp.
    heartbeat_at = Column(TIMESTAMP, nullable=True, comment="worker心跳时间")
    # Run token.
    # NOTE(review): presumably written by the worker when it claims the task —
    # confirm against the worker implementation.
    run_token = Column(String(64), nullable=True, comment="运行令牌")
    completed_at = Column(TIMESTAMP, nullable=True, comment="完成时间")
    # NULL means the row is live; a timestamp marks it as soft-deleted.
    deleted_at = Column(TIMESTAMP, nullable=True, comment="删除时间(软删除)")

    def __repr__(self) -> str:  # pragma: no cover - trivial repr
        return f"<AutoAnnotationTask(id={self.id}, name={self.name}, status={self.status})>"

    @property
    def is_deleted(self) -> bool:
        """Return True when the row has been soft-deleted (``deleted_at`` is set)."""
        return self.deleted_at is not None
|
|
|
|
|
|
class AnnotationTaskOperatorInstance(Base):
    """Operator instance inside an auto-annotation task.

    Mapped to ``t_dm_annotation_task_operator_instance``. Each row records one
    operator step of a task; a (task_id, op_index) pair may appear at most
    once.
    """

    __tablename__ = "t_dm_annotation_task_operator_instance"

    id = Column(BigInteger, primary_key=True, autoincrement=True, comment="自增主键")
    # Plain string reference to the owning task; no ForeignKey is declared at
    # the ORM level.
    task_id = Column(String(36), nullable=False, comment="自动标注任务ID")
    # Position of the operator within the task; numbering starts at 1 per the
    # column comment.
    op_index = Column(Integer, nullable=False, comment="算子顺序(从1开始)")
    operator_id = Column(String(64), nullable=False, comment="算子ID(raw_id)")
    # Task-level overrides of the operator's settings.
    settings_override = Column(JSON, nullable=True, comment="任务级算子参数覆盖")
    # Input / output modality of the operator.
    inputs = Column(String(64), nullable=True, comment="输入模态")
    outputs = Column(String(64), nullable=True, comment="输出模态")
    created_at = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        nullable=False,
        comment="创建时间",
    )
    # server_default fills the value on INSERT; onupdate refreshes it on UPDATE.
    updated_at = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),
        nullable=False,
        comment="更新时间",
    )

    __table_args__ = (
        # One operator per position within a task.
        UniqueConstraint("task_id", "op_index", name="uk_task_op_index"),
        Index("idx_task_id", "task_id"),
        Index("idx_operator_id", "operator_id"),
    )

    def __repr__(self) -> str:  # pragma: no cover - trivial repr
        return (
            f"<AnnotationTaskOperatorInstance(id={self.id}, task_id={self.task_id}, "
            f"op_index={self.op_index}, operator_id={self.operator_id})>"
        )
|