Files
DataMate/runtime/datamate-python/app/module/annotation/schema/auto.py
Jerry Yan 8ffa131fad feat(annotation): 自动标注任务支持非图像类型数据集(TEXT/AUDIO/VIDEO)
移除自动标注任务创建流程中的 IMAGE-only 限制,使 TEXT、AUDIO、VIDEO
类型数据集均可用于自动标注任务。

- 新增数据库迁移:t_dm_auto_annotation_tasks 表添加 dataset_type 列
- 后端 schema/API/service 全链路传递 dataset_type
- Worker 动态构建 sample key(image/text/audio/video)和输出目录
- 前端移除数据集类型校验,下拉框显示数据集类型标识
- 输出数据集继承源数据集类型,不再硬编码为 IMAGE
- 保持向后兼容:默认值为 IMAGE,worker 有元数据回退和目录 fallback

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 23:23:05 +08:00

172 lines
6.8 KiB
Python

"""Schemas for Auto Annotation tasks"""
from __future__ import annotations
import json
from typing import List, Optional, Dict, Any
from datetime import datetime
from pydantic import BaseModel, Field, ConfigDict, model_validator
class AutoAnnotationConfig(BaseModel):
    """Auto-annotation task configuration (aligned with the frontend payload)."""

    # Inputs may use either the camelCase alias or the snake_case field name.
    model_config = ConfigDict(populate_by_name=True)

    # Model size: n/s/m/l/x (the value is not restricted by this schema).
    model_size: str = Field(
        alias="modelSize",
        description="模型规模: n/s/m/l/x",
    )

    # Confidence threshold, documented as 0-1; the range is not enforced here.
    conf_threshold: float = Field(
        alias="confThreshold",
        description="置信度阈值 0-1",
    )

    # Target class IDs; an empty list means "all classes".
    target_classes: List[int] = Field(
        alias="targetClasses",
        default_factory=list,
        description="目标类别ID列表,空表示全部类别",
    )

    # Optional name of the new dataset that receives the annotation results.
    output_dataset_name: Optional[str] = Field(
        alias="outputDatasetName",
        default=None,
        description="自动标注结果要写入的新数据集名称(可选)",
    )
class OperatorPipelineStep(BaseModel):
    """Definition of a single operator node in a generic operator pipeline."""

    # Inputs may use either the alias or the snake_case field name.
    model_config = ConfigDict(populate_by_name=True)

    # Operator ID (raw_id).
    operator_id: str = Field(
        alias="operatorId",
        description="算子ID(raw_id)",
    )

    # Operator parameter overrides (corresponds to the settings override).
    overrides: Dict[str, Any] = Field(
        alias="overrides",
        default_factory=dict,
        description="算子参数覆盖(对应 settings override)",
    )

    @model_validator(mode="before")
    @classmethod
    def normalize_compatible_fields(cls, value: Any):
        """Map alternative input keys onto ``operatorId`` / ``overrides``.

        Non-dict input is returned untouched. For dict input a shallow copy
        is returned in which:

        * ``operatorId`` is filled from the first truthy value of
          ``operator_id`` or ``id`` when ``operatorId`` is absent;
        * ``overrides`` is filled from ``settingsOverride`` or
          ``settings_override`` when ``overrides`` is absent. A string value
          is parsed as JSON first; only a dict result is accepted.
        """
        if not isinstance(value, dict):
            return value

        # Work on a copy so the caller's dict is never mutated.
        payload = {**value}

        if "operatorId" not in payload:
            fallback_id = next(
                (
                    payload[name]
                    for name in ("operator_id", "id")
                    if payload.get(name)
                ),
                None,
            )
            if fallback_id is not None:
                payload["operatorId"] = fallback_id

        if "overrides" not in payload:
            for legacy_key in ("settingsOverride", "settings_override"):
                raw = payload.get(legacy_key)
                if isinstance(raw, str):
                    try:
                        raw = json.loads(raw)
                    except Exception:
                        # Best effort: an unparsable string is skipped and
                        # the next legacy key is tried.
                        continue
                if isinstance(raw, dict):
                    payload["overrides"] = raw
                    break

        return payload
class CreateAutoAnnotationTaskRequest(BaseModel):
    """Request body for creating an auto-annotation task.

    Matches the structure sent by the frontend CreateAutoAnnotationDialog.
    At least one of ``config`` or a non-empty ``pipeline`` is required.
    """

    # Inputs may use either the camelCase alias or the snake_case field name.
    model_config = ConfigDict(populate_by_name=True)

    # Task name, 1-255 characters.
    name: str = Field(
        ...,
        min_length=1,
        max_length=255,
        description="任务名称",
    )

    # ID of the source dataset.
    dataset_id: str = Field(
        ...,
        alias="datasetId",
        description="数据集ID",
    )

    # Dataset type: IMAGE/TEXT/AUDIO/VIDEO. Per the description, the backend
    # determines it automatically when it is not supplied.
    dataset_type: Optional[str] = Field(
        alias="datasetType",
        default=None,
        description="数据集类型: IMAGE/TEXT/AUDIO/VIDEO(不传时由后端自动获取)",
    )

    # Legacy YOLO task configuration, kept for compatibility.
    config: Optional[AutoAnnotationConfig] = Field(
        description="兼容旧版 YOLO 任务配置",
        default=None,
    )

    # Generic operator pipeline definition.
    pipeline: Optional[List[OperatorPipelineStep]] = Field(
        description="通用算子编排定义",
        default=None,
    )

    # Task mode: legacy_yolo/pipeline.
    task_mode: str = Field(
        alias="taskMode",
        default="legacy_yolo",
        description="任务模式: legacy_yolo/pipeline",
    )

    # Executor type.
    executor_type: str = Field(
        alias="executorType",
        default="annotation_local",
        description="执行器类型",
    )

    # Output dataset name; takes priority over config.outputDatasetName.
    output_dataset_name: Optional[str] = Field(
        alias="outputDatasetName",
        default=None,
        description="输出数据集名称(优先级高于 config.outputDatasetName)",
    )

    # IDs of the files to process; empty means the whole dataset.
    # NOTE(review): the description text still says "all images" although
    # dataset_type also allows TEXT/AUDIO/VIDEO - confirm the wording.
    file_ids: Optional[List[str]] = Field(
        alias="fileIds",
        default=None,
        description="要处理的文件ID列表,为空则处理数据集中所有图像",
    )

    @model_validator(mode="after")
    def validate_config_or_pipeline(self):
        """Require ``config`` or a non-empty ``pipeline``.

        Raises:
            ValueError: if ``config`` is None and ``pipeline`` is None/empty.
        """
        if self.config is not None or self.pipeline:
            return self
        raise ValueError("Either config or pipeline must be provided")
class AutoAnnotationTaskResponse(BaseModel):
    """Auto-annotation task response model (reusable for list and detail)."""

    # Inputs may use alias or field name; from_attributes additionally lets
    # the model be validated from an object's attributes.
    model_config = ConfigDict(populate_by_name=True, from_attributes=True)

    # --- Identity and source dataset ---
    id: str = Field(..., description="任务ID")  # task ID
    name: str = Field(..., description="任务名称")  # task name
    dataset_id: str = Field(
        ...,
        alias="datasetId",
        description="数据集ID",
    )
    dataset_name: Optional[str] = Field(
        default=None,
        alias="datasetName",
        description="数据集名称",
    )
    # Dataset type: IMAGE/TEXT/AUDIO/VIDEO.
    dataset_type: Optional[str] = Field(
        default=None,
        alias="datasetType",
        description="数据集类型: IMAGE/TEXT/AUDIO/VIDEO",
    )

    # --- Execution definition ---
    task_mode: Optional[str] = Field(
        default=None,
        alias="taskMode",
        description="任务模式",
    )
    executor_type: Optional[str] = Field(
        default=None,
        alias="executorType",
        description="执行器类型",
    )
    # Operator pipeline definition, exposed as plain dicts.
    pipeline: Optional[List[Dict[str, Any]]] = Field(
        default=None,
        description="算子编排定义",
    )
    # Names of all datasets actually involved in processing this task.
    source_datasets: Optional[List[str]] = Field(
        alias="sourceDatasets",
        default=None,
        description="本任务实际处理涉及到的所有数据集名称列表",
    )
    # Task configuration.
    config: Dict[str, Any] = Field(..., description="任务配置")

    # --- Status and counters ---
    status: str = Field(..., description="任务状态")
    # Progress, documented as 0-100; the range is not enforced here.
    progress: int = Field(..., description="任务进度 0-100")
    total_images: int = Field(
        ...,
        alias="totalImages",
        description="总图片数",
    )
    processed_images: int = Field(
        ...,
        alias="processedImages",
        description="已处理图片数",
    )
    detected_objects: int = Field(
        ...,
        alias="detectedObjects",
        description="检测到的对象总数",
    )

    # --- Output ---
    output_path: Optional[str] = Field(
        default=None,
        alias="outputPath",
        description="输出路径",
    )
    output_dataset_id: Optional[str] = Field(
        alias="outputDatasetId",
        default=None,
        description="输出数据集ID",
    )

    # --- Control and diagnostics ---
    # Whether a stop has been requested.
    stop_requested: Optional[bool] = Field(
        alias="stopRequested",
        default=None,
        description="是否请求停止",
    )
    error_message: Optional[str] = Field(
        default=None,
        alias="errorMessage",
        description="错误信息",
    )
    created_by: Optional[str] = Field(
        default=None,
        alias="createdBy",
        description="创建人",
    )

    # --- Timestamps ---
    started_at: Optional[datetime] = Field(
        default=None,
        alias="startedAt",
        description="启动时间",
    )
    heartbeat_at: Optional[datetime] = Field(
        default=None,
        alias="heartbeatAt",
        description="心跳时间",
    )
    created_at: datetime = Field(
        ...,
        alias="createdAt",
        description="创建时间",
    )
    updated_at: Optional[datetime] = Field(
        default=None,
        alias="updatedAt",
        description="更新时间",
    )
    completed_at: Optional[datetime] = Field(
        default=None,
        alias="completedAt",
        description="完成时间",
    )
class AutoAnnotationTaskListResponse(BaseModel):
    """Auto-annotation task list response.

    Per the original note, the frontend currently consumes a plain array;
    this model reserves a paginated structure.
    """

    # Inputs may use either the alias or the field name.
    model_config = ConfigDict(populate_by_name=True)

    # Task list.
    content: List[AutoAnnotationTaskResponse] = Field(
        ...,
        description="任务列表",
    )

    # Total count.
    total: int = Field(
        ...,
        description="总数",
    )