Files
DataMate/runtime/datamate-python/app/module/annotation/service/auto.py
Jerry Yan 8ffa131fad feat(annotation): 自动标注任务支持非图像类型数据集(TEXT/AUDIO/VIDEO)
移除自动标注任务创建流程中的 IMAGE-only 限制,使 TEXT、AUDIO、VIDEO
类型数据集均可用于自动标注任务。

- 新增数据库迁移:t_dm_auto_annotation_tasks 表添加 dataset_type 列
- 后端 schema/API/service 全链路传递 dataset_type
- Worker 动态构建 sample key(image/text/audio/video)和输出目录
- 前端移除数据集类型校验,下拉框显示数据集类型标识
- 输出数据集继承源数据集类型,不再硬编码为 IMAGE
- 保持向后兼容:默认值为 IMAGE,worker 有元数据回退和目录 fallback

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 23:23:05 +08:00

356 lines
12 KiB
Python

"""Service layer for Auto Annotation tasks"""
from __future__ import annotations
from typing import List, Optional, Dict, Any
from datetime import datetime
from uuid import uuid4
from fastapi import HTTPException
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.db.models.annotation_management import (
AutoAnnotationTask,
AnnotationTaskOperatorInstance,
)
from app.db.models.dataset_management import Dataset, DatasetFiles
from app.module.annotation.security import RequestUserContext
from ..schema.auto import (
CreateAutoAnnotationTaskRequest,
AutoAnnotationTaskResponse,
)
class AutoAnnotationTaskService:
    """Service layer for auto-annotation tasks.

    This service only manages task metadata (create/list/get/stop/delete);
    the actual annotation work is performed by the runtime worker, which
    polls these records and updates progress.
    """

    # ------------------------------------------------------------------
    # Request normalization helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _normalize_file_ids(file_ids: Optional[List[str]]) -> List[str]:
        """Return *file_ids* deduplicated (order preserved) with falsy entries dropped."""
        if not file_ids:
            return []
        # dict.fromkeys keeps first-seen order while removing duplicates.
        return [fid for fid in dict.fromkeys(file_ids) if fid]

    @staticmethod
    def _extract_operator_id(step: Dict[str, Any]) -> Optional[str]:
        """Extract the operator id from a pipeline step.

        Accepts the aliases ``operatorId`` / ``operator_id`` / ``id``.
        Returns None when the id is missing or blank after stripping.
        """
        operator_id = step.get("operatorId") or step.get("operator_id") or step.get("id")
        if operator_id is None:
            return None
        operator_id = str(operator_id).strip()
        return operator_id or None

    @classmethod
    def _to_operator_instances(
        cls,
        task_id: str,
        pipeline: List[Dict[str, Any]],
    ) -> List[AnnotationTaskOperatorInstance]:
        """Materialize ORM operator instances (1-based op_index) from a pipeline.

        Steps that are not dicts or carry no usable operator id are skipped,
        so op_index stays dense over the accepted steps only.
        """
        instances: List[AnnotationTaskOperatorInstance] = []
        for step in pipeline:
            if not isinstance(step, dict):
                continue
            operator_id = cls._extract_operator_id(step)
            if not operator_id:
                continue
            settings_override = (
                step.get("overrides")
                or step.get("settingsOverride")
                or step.get("settings_override")
                or {}
            )
            if not isinstance(settings_override, dict):
                settings_override = {}
            instances.append(
                AnnotationTaskOperatorInstance(
                    task_id=task_id,
                    op_index=len(instances) + 1,
                    operator_id=operator_id,
                    settings_override=settings_override,
                    inputs=step.get("inputs"),
                    outputs=step.get("outputs"),
                )
            )
        return instances

    @staticmethod
    def _to_pipeline(request: CreateAutoAnnotationTaskRequest) -> Optional[List[Dict[str, Any]]]:
        """Normalize the request into a pipeline structure.

        A modern request already carries ``pipeline``; a legacy YOLO request
        only carries ``config`` and is converted into a single-step pipeline.
        Returns None when neither is present.
        """
        if request.pipeline:
            return [step.model_dump(by_alias=True) for step in request.pipeline]
        if request.config is None:
            return None
        # Legacy YOLO request -> single-step pipeline for backward compat.
        config = request.config.model_dump(by_alias=True)
        step_overrides: Dict[str, Any] = {
            "modelSize": config.get("modelSize"),
            "confThreshold": config.get("confThreshold"),
            "targetClasses": config.get("targetClasses") or [],
        }
        output_dataset_name = request.output_dataset_name or config.get("outputDatasetName")
        if output_dataset_name:
            step_overrides["outputDatasetName"] = output_dataset_name
        return [
            {
                "operatorId": "ImageObjectDetectionBoundingBox",
                "overrides": step_overrides,
            }
        ]

    # ------------------------------------------------------------------
    # Validation
    # ------------------------------------------------------------------

    async def validate_file_ids(
        self,
        db: AsyncSession,
        dataset_id: str,
        file_ids: Optional[List[str]],
    ) -> List[str]:
        """Validate that every file id belongs to *dataset_id* and is ACTIVE.

        Returns the normalized (deduplicated, non-empty) id list; an absent
        selection yields []. Raises HTTP 400 when the caller sent only
        blank ids, or when any id is missing, inactive, or owned by another
        dataset.
        """
        normalized_ids = self._normalize_file_ids(file_ids)
        if not normalized_ids:
            if file_ids:
                # Caller sent a list, but nothing survived normalization.
                raise HTTPException(status_code=400, detail="fileIds 不能为空列表")
            return []
        stmt = select(DatasetFiles.id).where(
            DatasetFiles.id.in_(normalized_ids),
            DatasetFiles.dataset_id == dataset_id,
            DatasetFiles.status == "ACTIVE",
        )
        result = await db.execute(stmt)
        found_ids = {row[0] for row in result.fetchall()}
        missing = [fid for fid in normalized_ids if fid not in found_ids]
        if missing:
            raise HTTPException(
                status_code=400,
                detail=f"部分 fileIds 不存在、不可用或不属于数据集: {missing[:10]}",
            )
        return normalized_ids

    # ------------------------------------------------------------------
    # Task lifecycle
    # ------------------------------------------------------------------

    async def create_task(
        self,
        db: AsyncSession,
        request: CreateAutoAnnotationTaskRequest,
        user_context: RequestUserContext,
        dataset_name: Optional[str] = None,
        total_images: int = 0,
        dataset_type: str = "IMAGE",
    ) -> AutoAnnotationTaskResponse:
        """Create an auto-annotation task in ``pending`` state.

        Only the task record (plus its operator instances) is inserted here;
        the scheduler/worker later picks it up for execution and progress
        updates. ``dataset_type`` defaults to IMAGE for backward
        compatibility; TEXT/AUDIO/VIDEO are also accepted.
        """
        # Naive local time, consistent with the rest of this module.
        now = datetime.now()
        validated_file_ids = await self.validate_file_ids(
            db,
            request.dataset_id,
            request.file_ids,
        )
        if validated_file_ids:
            # An explicit file selection overrides the caller-supplied total.
            total_images = len(validated_file_ids)
        normalized_pipeline = self._to_pipeline(request)
        if not normalized_pipeline:
            raise HTTPException(status_code=400, detail="pipeline 不能为空")
        normalized_config = request.config.model_dump(by_alias=True) if request.config else {}
        task_id = str(uuid4())
        task = AutoAnnotationTask(
            id=task_id,
            name=request.name,
            dataset_id=request.dataset_id,
            dataset_name=dataset_name,
            dataset_type=dataset_type,
            created_by=user_context.user_id,
            config=normalized_config,
            task_mode=request.task_mode,
            executor_type=request.executor_type,
            pipeline=normalized_pipeline,
            file_ids=validated_file_ids or None,
            status="pending",
            progress=0,
            total_images=total_images,
            processed_images=0,
            detected_objects=0,
            stop_requested=False,
            created_at=now,
            updated_at=now,
        )
        operator_instances = self._to_operator_instances(task_id, normalized_pipeline)
        db.add(task)
        if operator_instances:
            db.add_all(operator_instances)
        await db.commit()
        await db.refresh(task)
        # Attach sourceDatasets (usually just the single origin dataset).
        return await self._build_response(db, task)

    # ------------------------------------------------------------------
    # Query helpers
    # ------------------------------------------------------------------

    def _apply_dataset_scope(self, query, user_context: RequestUserContext):
        """Restrict *query* to datasets owned by the caller (admins see all)."""
        if user_context.is_admin:
            return query
        return query.join(
            Dataset,
            AutoAnnotationTask.dataset_id == Dataset.id,
        ).where(Dataset.created_by == user_context.user_id)

    async def _get_owned_task(
        self,
        db: AsyncSession,
        task_id: str,
        user_context: RequestUserContext,
    ) -> Optional[AutoAnnotationTask]:
        """Fetch one non-deleted task visible to the caller, or None."""
        query = select(AutoAnnotationTask).where(
            AutoAnnotationTask.id == task_id,
            AutoAnnotationTask.deleted_at.is_(None),
        )
        query = self._apply_dataset_scope(query, user_context)
        result = await db.execute(query)
        return result.scalar_one_or_none()

    async def _build_response(
        self,
        db: AsyncSession,
        task: AutoAnnotationTask,
    ) -> AutoAnnotationTaskResponse:
        """Convert an ORM task into a response with sourceDatasets attached.

        Source-dataset resolution is best-effort: on any error we degrade to
        the redundant dataset_name (or dataset_id) stored on the task itself,
        so listing/reading never fails on a lookup problem.
        """
        resp = AutoAnnotationTaskResponse.model_validate(task)
        try:
            resp.source_datasets = await self._compute_source_datasets(db, task)
        except Exception:
            # Deliberate broad catch: degrade instead of surfacing errors.
            fallback_name = getattr(task, "dataset_name", None)
            fallback_id = getattr(task, "dataset_id", "")
            resp.source_datasets = [fallback_name] if fallback_name else [fallback_id]
        return resp

    async def _compute_source_datasets(
        self,
        db: AsyncSession,
        task: AutoAnnotationTask,
    ) -> List[str]:
        """Resolve the names of all datasets actually referenced by the task.

        - With file_ids: reverse-look up dataset ids via t_dm_dataset_files
          and join t_dm_datasets for the distinct names.
        - Without file_ids (or when the lookup yields nothing): fall back to
          the redundant dataset_name/dataset_id stored on the task.
        """
        file_ids = task.file_ids or []
        if file_ids:
            stmt = (
                select(Dataset.name)
                .join(DatasetFiles, Dataset.id == DatasetFiles.dataset_id)
                .where(DatasetFiles.id.in_(file_ids))
                .distinct()
            )
            result = await db.execute(stmt)
            names = [row[0] for row in result.fetchall() if row[0]]
            if names:
                return names
        # Fallback: surface a single dataset identifier.
        if task.dataset_name:
            return [task.dataset_name]
        if task.dataset_id:
            return [task.dataset_id]
        return []

    # ------------------------------------------------------------------
    # Read / mutate endpoints
    # ------------------------------------------------------------------

    async def list_tasks(
        self,
        db: AsyncSession,
        user_context: RequestUserContext,
    ) -> List[AutoAnnotationTaskResponse]:
        """List non-deleted tasks visible to the caller, newest first."""
        query = select(AutoAnnotationTask).where(AutoAnnotationTask.deleted_at.is_(None))
        query = self._apply_dataset_scope(query, user_context)
        result = await db.execute(
            query.order_by(AutoAnnotationTask.created_at.desc())
        )
        tasks: List[AutoAnnotationTask] = list(result.scalars().all())
        return [await self._build_response(db, task) for task in tasks]

    async def get_task(
        self,
        db: AsyncSession,
        task_id: str,
        user_context: RequestUserContext,
    ) -> Optional[AutoAnnotationTaskResponse]:
        """Return one task by id, or None when missing/deleted/out of scope."""
        task = await self._get_owned_task(db, task_id, user_context)
        if not task:
            return None
        return await self._build_response(db, task)

    async def request_stop_task(
        self,
        db: AsyncSession,
        task_id: str,
        user_context: RequestUserContext,
    ) -> Optional[AutoAnnotationTaskResponse]:
        """Flag a running task for stopping; pending tasks stop immediately.

        Terminal tasks (completed/failed/stopped) are left untouched except
        for the updated_at timestamp. Returns None when the task is not
        visible to the caller.
        """
        task = await self._get_owned_task(db, task_id, user_context)
        if not task:
            return None
        now = datetime.now()
        terminal_states = {"completed", "failed", "stopped"}
        if task.status not in terminal_states:
            task.stop_requested = True
            task.error_message = "Task stop requested"
            if task.status == "pending":
                # Never started: finalize right away, no worker involved.
                task.status = "stopped"
                task.progress = task.progress or 0
                task.completed_at = now
                task.run_token = None
        task.updated_at = now
        await db.commit()
        await db.refresh(task)
        return await self._build_response(db, task)

    async def soft_delete_task(
        self,
        db: AsyncSession,
        task_id: str,
        user_context: RequestUserContext,
    ) -> bool:
        """Soft-delete a task by stamping deleted_at; True when it existed."""
        task = await self._get_owned_task(db, task_id, user_context)
        if not task:
            return False
        task.deleted_at = datetime.now()
        await db.commit()
        return True