feat(annotation): 支持通用算子编排的数据标注功能

## 功能概述
将数据标注模块从固定 YOLO 算子改造为支持通用算子编排,实现与数据清洗模块类似的灵活算子组合能力。

## 改动内容

### 第 1 步:数据库改造(DDL)
- 新增 SQL migration 脚本:scripts/db/data-annotation-operator-pipeline-migration.sql
- 修改 t_dm_auto_annotation_tasks 表:
  - 新增字段:task_mode, executor_type, pipeline, output_dataset_id, created_by, stop_requested, started_at, heartbeat_at, run_token
  - 新增索引:idx_status_created(status, created_at)、idx_created_by(created_by)
  - 更新 status 字段注释,补充 stopped 状态
- 创建 t_dm_annotation_task_operator_instance 表:用于存储算子实例详情

### 第 2 步:API 层改造
- 扩展请求模型(schema/auto.py):
  - 新增 OperatorPipelineStep 模型
  - 支持 pipeline 字段,保留旧 YOLO 字段向后兼容
  - 实现多写法归一(operatorId/operator_id/id, overrides/settingsOverride/settings_override)
- 修改任务创建服务(service/auto.py):
  - 新增 validate_file_ids() 校验方法
  - 新增 _to_pipeline() 兼容映射方法
  - 写入新字段并集成算子实例表
  - 修复 fileIds 去重准确性问题
- 新增 API 路由(interface/auto.py):
  - 新增 /operator-tasks 系列接口
  - 新增 stop API 接口(/auto/{id}/stop 和 /operator-tasks/{id}/stop)
  - 保留旧 /auto 接口向后兼容
- ORM 模型对齐(annotation_management.py):
  - AutoAnnotationTask 新增所有 DDL 字段
  - 新增 AnnotationTaskOperatorInstance 模型
  - 状态定义补充 stopped

### 第 3 步:Runtime 层改造
- 修改 worker 执行逻辑(auto_annotation_worker.py):
  - 实现原子任务抢占机制(run_token)
  - 从硬编码 YOLO 改为通用 pipeline 执行
  - 新增算子解析和实例化能力
  - 支持 stop_requested 检查
  - 保留 legacy_yolo 模式向后兼容
  - 支持多种算子调用方式(execute 和 __call__)

### 第 4 步:灰度发布
- 完善 YOLO 算子元数据(metadata.yml):
  - 补齐 raw_id, language, modal, inputs, outputs, settings 字段
- 注册标注算子(__init__.py):
  - 将 YOLO 算子注册到 OPERATORS 注册表
  - 确保 annotation 包被正确加载
- 新增白名单控制:
  - 支持环境变量 AUTO_ANNOTATION_OPERATOR_WHITELIST
  - 灰度发布时可限制可用算子

## 关键特性

### 向后兼容
- 旧 /auto 接口完全保留
- 旧请求参数自动映射到 pipeline
- legacy_yolo 模式确保旧逻辑正常运行

### 新功能
- 支持通用 pipeline 编排
- 支持多算子组合
- 支持任务停止控制
- 支持白名单灰度发布

### 可靠性
- 原子任务抢占(防止重复执行)
- 完整的错误处理和状态管理
- 详细的审计追踪(算子实例表)

## 部署说明

1. 执行 DDL:mysql < scripts/db/data-annotation-operator-pipeline-migration.sql
2. 配置环境变量:AUTO_ANNOTATION_OPERATOR_WHITELIST=ImageObjectDetectionBoundingBox
3. 重启服务:datamate-runtime 和 datamate-backend-python

## 验证步骤

1. 兼容模式验证:使用旧 /auto 接口创建任务
2. 通用编排验证:使用新 /operator-tasks 接口创建 pipeline 任务
3. 原子 claim 验证:检查 run_token 机制
4. 停止验证:测试 stop API
5. 白名单验证:测试算子白名单拦截

## 相关文件

- DDL: scripts/db/data-annotation-operator-pipeline-migration.sql
- API: runtime/datamate-python/app/module/annotation/
- Worker: runtime/python-executor/datamate/auto_annotation_worker.py
- 算子: runtime/ops/annotation/image_object_detection_bounding_box/
This commit is contained in:
2026-02-07 22:35:33 +08:00
parent 9efc07935f
commit 2f49fc4199
9 changed files with 1606 additions and 480 deletions

View File

@@ -0,0 +1,249 @@
-- DataMate data annotation module - generic operator pipeline migration (step 1: DDL)
-- Overview:
--   1) Alter t_dm_auto_annotation_tasks: add the pipeline-related columns and indexes.
--   2) Create t_dm_annotation_task_operator_instance, which records the operator
--      instances that belong to a task.
--   3) The script is written to be idempotent and can be executed repeatedly.
USE datamate;

-- Keep the active schema name in a session variable; the information_schema
-- lookups in the following steps filter on it.
SELECT DATABASE() INTO @db_name;
-- =====================================================
-- 1) Alter table t_dm_auto_annotation_tasks
-- =====================================================
-- task_mode: task mode (legacy_yolo / pipeline).
-- Choose the statement text: the ADD COLUMN when the column is absent,
-- otherwise a plain SELECT that only reports the skip.
SET @ddl = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.COLUMNS AS col
            WHERE col.TABLE_SCHEMA = @db_name
              AND col.TABLE_NAME = 't_dm_auto_annotation_tasks'
              AND col.COLUMN_NAME = 'task_mode'
        )
            THEN 'ALTER TABLE t_dm_auto_annotation_tasks ADD COLUMN task_mode VARCHAR(32) NOT NULL DEFAULT ''legacy_yolo'' COMMENT ''任务模式: legacy_yolo/pipeline'' AFTER status'
        ELSE 'SELECT ''skip: column task_mode exists'''
    END
);
-- Run the chosen text through a prepared statement, then release it.
PREPARE stmt FROM @ddl;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
-- executor_type: executor type.
-- Add the column (placed after task_mode) only when it is not there yet;
-- otherwise execute a SELECT that just reports the skip.
SET @ddl = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.COLUMNS AS col
            WHERE col.TABLE_SCHEMA = @db_name
              AND col.TABLE_NAME = 't_dm_auto_annotation_tasks'
              AND col.COLUMN_NAME = 'executor_type'
        )
            THEN 'ALTER TABLE t_dm_auto_annotation_tasks ADD COLUMN executor_type VARCHAR(32) NOT NULL DEFAULT ''annotation_local'' COMMENT ''执行器类型'' AFTER task_mode'
        ELSE 'SELECT ''skip: column executor_type exists'''
    END
);
-- Run the chosen text through a prepared statement, then release it.
PREPARE stmt FROM @ddl;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
-- pipeline: operator pipeline definition (JSON).
-- Add the nullable JSON column (placed after executor_type) only when it is
-- not there yet; otherwise execute a SELECT that just reports the skip.
SET @ddl = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.COLUMNS AS col
            WHERE col.TABLE_SCHEMA = @db_name
              AND col.TABLE_NAME = 't_dm_auto_annotation_tasks'
              AND col.COLUMN_NAME = 'pipeline'
        )
            THEN 'ALTER TABLE t_dm_auto_annotation_tasks ADD COLUMN pipeline JSON NULL COMMENT ''算子编排定义'' AFTER executor_type'
        ELSE 'SELECT ''skip: column pipeline exists'''
    END
);
-- Run the chosen text through a prepared statement, then release it.
PREPARE stmt FROM @ddl;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
-- output_dataset_id: ID of the output dataset.
-- Add the nullable column (placed after output_path) only when it is not
-- there yet; otherwise execute a SELECT that just reports the skip.
SET @ddl = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.COLUMNS AS col
            WHERE col.TABLE_SCHEMA = @db_name
              AND col.TABLE_NAME = 't_dm_auto_annotation_tasks'
              AND col.COLUMN_NAME = 'output_dataset_id'
        )
            THEN 'ALTER TABLE t_dm_auto_annotation_tasks ADD COLUMN output_dataset_id VARCHAR(36) NULL COMMENT ''输出数据集ID'' AFTER output_path'
        ELSE 'SELECT ''skip: column output_dataset_id exists'''
    END
);
-- Run the chosen text through a prepared statement, then release it.
PREPARE stmt FROM @ddl;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
-- created_by: creator of the task.
-- Add the nullable column (placed after dataset_name) only when it is not
-- there yet; otherwise execute a SELECT that just reports the skip.
SET @ddl = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.COLUMNS AS col
            WHERE col.TABLE_SCHEMA = @db_name
              AND col.TABLE_NAME = 't_dm_auto_annotation_tasks'
              AND col.COLUMN_NAME = 'created_by'
        )
            THEN 'ALTER TABLE t_dm_auto_annotation_tasks ADD COLUMN created_by VARCHAR(255) NULL COMMENT ''任务创建人'' AFTER dataset_name'
        ELSE 'SELECT ''skip: column created_by exists'''
    END
);
-- Run the chosen text through a prepared statement, then release it.
PREPARE stmt FROM @ddl;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
-- stop_requested: stop-request flag (0 = no, 1 = yes; defaults to 0).
-- Add the column (placed after progress) only when it is not there yet;
-- otherwise execute a SELECT that just reports the skip.
SET @ddl = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.COLUMNS AS col
            WHERE col.TABLE_SCHEMA = @db_name
              AND col.TABLE_NAME = 't_dm_auto_annotation_tasks'
              AND col.COLUMN_NAME = 'stop_requested'
        )
            THEN 'ALTER TABLE t_dm_auto_annotation_tasks ADD COLUMN stop_requested TINYINT(1) NOT NULL DEFAULT 0 COMMENT ''是否请求停止: 0否/1是'' AFTER progress'
        ELSE 'SELECT ''skip: column stop_requested exists'''
    END
);
-- Run the chosen text through a prepared statement, then release it.
PREPARE stmt FROM @ddl;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
-- started_at: time the task was started.
-- Add the nullable TIMESTAMP column (placed after updated_at) only when it
-- is not there yet; otherwise execute a SELECT that just reports the skip.
SET @ddl = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.COLUMNS AS col
            WHERE col.TABLE_SCHEMA = @db_name
              AND col.TABLE_NAME = 't_dm_auto_annotation_tasks'
              AND col.COLUMN_NAME = 'started_at'
        )
            THEN 'ALTER TABLE t_dm_auto_annotation_tasks ADD COLUMN started_at TIMESTAMP NULL COMMENT ''任务启动时间'' AFTER updated_at'
        ELSE 'SELECT ''skip: column started_at exists'''
    END
);
-- Run the chosen text through a prepared statement, then release it.
PREPARE stmt FROM @ddl;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
-- heartbeat_at: worker heartbeat time.
-- Add the nullable TIMESTAMP column (placed after started_at) only when it
-- is not there yet; otherwise execute a SELECT that just reports the skip.
SET @ddl = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.COLUMNS AS col
            WHERE col.TABLE_SCHEMA = @db_name
              AND col.TABLE_NAME = 't_dm_auto_annotation_tasks'
              AND col.COLUMN_NAME = 'heartbeat_at'
        )
            THEN 'ALTER TABLE t_dm_auto_annotation_tasks ADD COLUMN heartbeat_at TIMESTAMP NULL COMMENT ''worker心跳时间'' AFTER started_at'
        ELSE 'SELECT ''skip: column heartbeat_at exists'''
    END
);
-- Run the chosen text through a prepared statement, then release it.
PREPARE stmt FROM @ddl;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
-- run_token: run token (used when a task is claimed).
-- Add the nullable column (placed after heartbeat_at) only when it is not
-- there yet; otherwise execute a SELECT that just reports the skip.
SET @ddl = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.COLUMNS AS col
            WHERE col.TABLE_SCHEMA = @db_name
              AND col.TABLE_NAME = 't_dm_auto_annotation_tasks'
              AND col.COLUMN_NAME = 'run_token'
        )
            THEN 'ALTER TABLE t_dm_auto_annotation_tasks ADD COLUMN run_token VARCHAR(64) NULL COMMENT ''运行令牌'' AFTER heartbeat_at'
        ELSE 'SELECT ''skip: column run_token exists'''
    END
);
-- Run the chosen text through a prepared statement, then release it.
PREPARE stmt FROM @ddl;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
-- status: extend the column comment so that it also lists the "stopped" state.
-- Three outcomes:
--   * column missing                -> report the skip, change nothing;
--   * comment already at the target -> report the skip, so a re-run does not
--                                      issue the same MODIFY again (matches the
--                                      skip-when-applied behaviour of the other steps);
--   * otherwise                     -> MODIFY the column with the new comment.
-- NOTE(review): MODIFY COLUMN restates the whole definition
-- (VARCHAR(50) NOT NULL DEFAULT 'pending'); confirm this matches the live column.
SET @ddl = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.COLUMNS AS col
            WHERE col.TABLE_SCHEMA = @db_name
              AND col.TABLE_NAME = 't_dm_auto_annotation_tasks'
              AND col.COLUMN_NAME = 'status'
        )
            THEN 'SELECT ''skip: column status not found'''
        WHEN EXISTS (
            SELECT 1
            FROM information_schema.COLUMNS AS col
            WHERE col.TABLE_SCHEMA = @db_name
              AND col.TABLE_NAME = 't_dm_auto_annotation_tasks'
              AND col.COLUMN_NAME = 'status'
              AND col.COLUMN_COMMENT = '任务状态: pending/running/completed/failed/stopped'
        )
            THEN 'SELECT ''skip: column status comment already up to date'''
        ELSE 'ALTER TABLE t_dm_auto_annotation_tasks MODIFY COLUMN status VARCHAR(50) NOT NULL DEFAULT ''pending'' COMMENT ''任务状态: pending/running/completed/failed/stopped'''
    END
);
-- Run the chosen text through a prepared statement, then release it.
PREPARE stmt FROM @ddl;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
-- Index: look up tasks by status + creation time.
-- Create idx_status_created on (status, created_at) only when no index of
-- that name exists on the table; otherwise execute a SELECT that just
-- reports the skip.
SET @ddl = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.STATISTICS AS idx
            WHERE idx.TABLE_SCHEMA = @db_name
              AND idx.TABLE_NAME = 't_dm_auto_annotation_tasks'
              AND idx.INDEX_NAME = 'idx_status_created'
        )
            THEN 'CREATE INDEX idx_status_created ON t_dm_auto_annotation_tasks (status, created_at)'
        ELSE 'SELECT ''skip: index idx_status_created exists'''
    END
);
-- Run the chosen text through a prepared statement, then release it.
PREPARE stmt FROM @ddl;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
-- Index: filter tasks by creator.
-- Create idx_created_by on (created_by) only when no index of that name
-- exists on the table; otherwise execute a SELECT that just reports the skip.
SET @ddl = (
    SELECT CASE
        WHEN NOT EXISTS (
            SELECT 1
            FROM information_schema.STATISTICS AS idx
            WHERE idx.TABLE_SCHEMA = @db_name
              AND idx.TABLE_NAME = 't_dm_auto_annotation_tasks'
              AND idx.INDEX_NAME = 'idx_created_by'
        )
            THEN 'CREATE INDEX idx_created_by ON t_dm_auto_annotation_tasks (created_by)'
        ELSE 'SELECT ''skip: index idx_created_by exists'''
    END
);
-- Run the chosen text through a prepared statement, then release it.
PREPARE stmt FROM @ddl;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
-- =====================================================
-- 2) Create table t_dm_annotation_task_operator_instance
-- =====================================================
-- Stores one row per operator instance of an annotation task; (task_id, op_index)
-- is unique. CREATE TABLE IF NOT EXISTS makes this step a no-op when the table
-- is already present.
-- No separate index on task_id is declared: task_id is the leftmost column of
-- uk_task_op_index, so lookups by task_id alone can already use that index and
-- a dedicated single-column index would only add write and storage overhead.
CREATE TABLE IF NOT EXISTS t_dm_annotation_task_operator_instance (
    id BIGINT AUTO_INCREMENT PRIMARY KEY COMMENT '自增主键',
    task_id VARCHAR(36) NOT NULL COMMENT '自动标注任务ID',
    op_index INT NOT NULL COMMENT '算子顺序(从1开始)',
    operator_id VARCHAR(64) NOT NULL COMMENT '算子ID(raw_id)',
    settings_override JSON NULL COMMENT '任务级算子参数覆盖',
    inputs VARCHAR(64) NULL COMMENT '输入模态',
    outputs VARCHAR(64) NULL COMMENT '输出模态',
    created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    -- Each position in a task's operator sequence may be used only once.
    UNIQUE KEY uk_task_op_index (task_id, op_index),
    -- Supports lookups by operator across tasks.
    KEY idx_operator_id (operator_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COMMENT='标注任务算子实例表';