feat(annotation): 自动标注任务支持非图像类型数据集（TEXT/AUDIO/VIDEO）

移除自动标注任务创建流程中的 IMAGE-only 限制，使 TEXT、AUDIO、VIDEO 类型数据集均可用于自动标注任务。

- 新增数据库迁移：t_dm_auto_annotation_tasks 表添加 dataset_type 列
- 后端 schema/API/service 全链路传递 dataset_type
- Worker 动态构建 sample key（image/text/audio/video）和输出目录
- 前端移除数据集类型校验，下拉框显示数据集类型标识
- 输出数据集继承源数据集类型，不再硬编码为 IMAGE
- 保持向后兼容：默认值为 IMAGE，worker 有元数据回退和目录 fallback

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 23:23:05 +08:00
parent 807c2289e2
commit 8ffa131fad
7 changed files with 1161 additions and 1082 deletions
@@ -6,7 +6,7 @@ import { ArrowLeft } from "lucide-react";
 import { Link, useNavigate } from "react-router";
 import { queryDatasetsUsingGet } from "@/pages/DataManagement/dataset.api";
 import { mapDataset } from "@/pages/DataManagement/dataset.const";
-import { Dataset, DatasetType } from "@/pages/DataManagement/dataset.model";
+import { Dataset } from "@/pages/DataManagement/dataset.model";
 import { createAnnotationOperatorTaskUsingPost } from "../annotation.api";
 import { useCreateStepTwo } from "./hooks/useCreateStepTwo";
 import PipelinePreview from "./components/PipelinePreview";
@@ -85,11 +85,6 @@ export default function AnnotationOperatorTaskCreate() {
    try {
      if (currentStep === 1) {
        await form.validateFields();
        if (selectedDataset?.datasetType !== DatasetType.IMAGE) {
          message.error("自动标注算子编排当前仅支持图片数据集");
          return;
        }
      }
      setCurrentStep((prev) => Math.min(prev + 1, 2));
    } catch {
@@ -109,11 +104,6 @@ export default function AnnotationOperatorTaskCreate() {
        return;
      }
      if (selectedDataset?.datasetType !== DatasetType.IMAGE) {
        message.error("自动标注算子编排当前仅支持图片数据集");
        return;
      }
      const outputDatasetName = values.outputDatasetName?.trim();
      const pipeline = selectedOperators.map((operator, index) => {
        const overrides = {
@@ -200,10 +190,10 @@ export default function AnnotationOperatorTaskCreate() {
                label="选择数据集"
                name="datasetId"
                rules={[{ required: true, message: "请选择数据集" }]}
-                extra="自动标注算子编排当前仅支持图片数据集"
+                extra="请选择用于自动标注的数据集"
              >
                <Select
-                  placeholder="请选择图片数据集"
+                  placeholder="请选择数据集"
                  optionFilterProp="label"
                  options={datasets.map((dataset) => ({
                    label: (
@@ -215,12 +205,11 @@ export default function AnnotationOperatorTaskCreate() {
                          {dataset.name}
                        </div>
                        <div className="text-xs text-gray-500">
-                          {dataset?.fileCount} 文件 • {dataset.size}
+                          {dataset.datasetType} &bull; {dataset?.fileCount} 文件 &bull; {dataset.size}
                        </div>
                      </div>
                    ),
                    value: dataset.id,
                    disabled: dataset.datasetType !== DatasetType.IMAGE,
                  }))}
                />
              </Form.Item>
@@ -210,6 +210,10 @@ class AutoAnnotationTask(Base):
    dataset_name = Column(
        String(255), nullable=True, comment="数据集名称（冗余字段，方便查询）"
    )
    dataset_type = Column(
        String(50), nullable=False, default="IMAGE",
        comment="数据集类型: IMAGE/TEXT/AUDIO/VIDEO",
    )
    created_by = Column(String(255), nullable=True, comment="任务创建人")
    config = Column(JSON, nullable=False, comment="任务配置（模型规模、置信度等）")
    file_ids = Column(
@@ -85,23 +85,28 @@ async def _create_task_internal(
    await assert_dataset_access(db, normalized_request.dataset_id, user_context)
    # 尝试获取数据集名称和总量用于冗余字段
    dataset_name = None
    dataset_type = "IMAGE"
    total_images = len(normalized_request.file_ids) if normalized_request.file_ids else 0
    try:
        dm_client = DatasetManagementService(db)
        dataset = await dm_client.get_dataset(normalized_request.dataset_id)
        if dataset is not None:
            dataset_name = dataset.name
            dataset_type = getattr(dataset, "datasetType", None) or "IMAGE"
            if not normalized_request.file_ids:
                total_images = getattr(dataset, "fileCount", 0) or 0
    except Exception as e:  # pragma: no cover - 容错
        logger.warning("Failed to fetch dataset summary for annotation task: %s", e)
    resolved_dataset_type = normalized_request.dataset_type or dataset_type
    return await service.create_task(
        db,
        normalized_request,
        user_context=user_context,
        dataset_name=dataset_name,
        total_images=total_images,
        dataset_type=resolved_dataset_type,
    )
@@ -75,6 +75,11 @@ class CreateAutoAnnotationTaskRequest(BaseModel):
    name: str = Field(..., min_length=1, max_length=255, description="任务名称")
    dataset_id: str = Field(..., alias="datasetId", description="数据集ID")
    dataset_type: Optional[str] = Field(
        default=None,
        alias="datasetType",
        description="数据集类型: IMAGE/TEXT/AUDIO/VIDEO（不传时由后端自动获取）",
    )
    config: Optional[AutoAnnotationConfig] = Field(
        default=None,
        description="兼容旧版 YOLO 任务配置",
@@ -120,6 +125,7 @@ class AutoAnnotationTaskResponse(BaseModel):
    name: str = Field(..., description="任务名称")
    dataset_id: str = Field(..., alias="datasetId", description="数据集ID")
    dataset_name: Optional[str] = Field(None, alias="datasetName", description="数据集名称")
    dataset_type: Optional[str] = Field(None, alias="datasetType", description="数据集类型: IMAGE/TEXT/AUDIO/VIDEO")
    task_mode: Optional[str] = Field(None, alias="taskMode", description="任务模式")
    executor_type: Optional[str] = Field(None, alias="executorType", description="执行器类型")
    pipeline: Optional[List[Dict[str, Any]]] = Field(None, description="算子编排定义")
@@ -141,6 +141,7 @@ class AutoAnnotationTaskService:
        user_context: RequestUserContext,
        dataset_name: Optional[str] = None,
        total_images: int = 0,
        dataset_type: str = "IMAGE",
    ) -> AutoAnnotationTaskResponse:
        """创建自动标注任务，初始状态为 pending。
@@ -170,6 +171,7 @@ class AutoAnnotationTaskService:
            name=request.name,
            dataset_id=request.dataset_id,
            dataset_name=dataset_name,
            dataset_type=dataset_type,
            created_by=user_context.user_id,
            config=normalized_config,
            task_mode=request.task_mode,
@@ -175,7 +175,7 @@ def _fetch_pending_task() -> Optional[Dict[str, Any]]:
    query_sql = text(
        """
-        SELECT id, name, dataset_id, dataset_name, created_by,
+        SELECT id, name, dataset_id, dataset_name, dataset_type, created_by,
               config, file_ids, pipeline,
               task_mode, executor_type,
               status, stop_requested, run_token,
@@ -521,6 +521,35 @@ def _count_detections(sample: Dict[str, Any]) -> int:
        return 0
 # ---------------------------------------------------------------------------
 # 数据集类型 → sample key / 输出子目录 映射
 # ---------------------------------------------------------------------------
 DATASET_TYPE_SAMPLE_KEY: Dict[str, str] = {
    "IMAGE": "image",
    "TEXT": "text",
    "AUDIO": "audio",
    "VIDEO": "video",
 }
 DATASET_TYPE_DATA_DIR: Dict[str, str] = {
    "IMAGE": "images",
    "TEXT": "data",
    "AUDIO": "data",
    "VIDEO": "data",
 }
 def _get_sample_key(dataset_type: str) -> str:
    """根据数据集类型返回 sample dict 中主数据对应的 key。"""
    return DATASET_TYPE_SAMPLE_KEY.get(dataset_type.upper(), "image")
 def _get_data_dir_name(dataset_type: str) -> str:
    """根据数据集类型返回输出子目录名。"""
    return DATASET_TYPE_DATA_DIR.get(dataset_type.upper(), "images")
 def _get_operator_whitelist() -> Optional[set[str]]:
    """获取灰度白名单；返回 None 表示放开全部。"""
@@ -584,7 +613,7 @@ def _load_dataset_meta(dataset_id: str) -> Optional[Dict[str, Any]]:
    sql = text(
        """
-        SELECT id, name, parent_dataset_id, path
+        SELECT id, name, parent_dataset_id, path, dataset_type
        FROM t_dm_datasets
        WHERE id = :dataset_id
        """
@@ -638,11 +667,12 @@ def _load_files_by_ids(file_ids: List[str]) -> List[Tuple[str, str, str]]:
        return [(str(r[0]), str(r[1]), str(r[2])) for r in rows]
-def _ensure_output_dir(output_dir: str) -> str:
+def _ensure_output_dir(output_dir: str, dataset_type: str = "IMAGE") -> str:
-    """确保输出目录及其 images/、annotations/ 子目录存在。"""
+    """确保输出目录及其数据/annotations 子目录存在。"""
    os.makedirs(output_dir, exist_ok=True)
-    os.makedirs(os.path.join(output_dir, "images"), exist_ok=True)
+    data_dir_name = _get_data_dir_name(dataset_type)
    os.makedirs(os.path.join(output_dir, data_dir_name), exist_ok=True)
    os.makedirs(os.path.join(output_dir, "annotations"), exist_ok=True)
    return output_dir
@@ -651,6 +681,7 @@ def _create_output_dataset(
    source_dataset_id: str,
    source_dataset_name: str,
    output_dataset_name: str,
    dataset_type: str = "IMAGE",
 ) -> Tuple[str, str]:
    """为自动标注结果创建一个新的数据集并返回 (dataset_id, path)。"""
@@ -673,7 +704,7 @@ def _create_output_dataset(
        "parent_dataset_id": parent_dataset_id,
        "name": output_dataset_name,
        "description": description,
-        "dataset_type": "IMAGE",
+        "dataset_type": dataset_type,
        "path": output_dir,
        "status": "ACTIVE",
    }
@@ -690,31 +721,38 @@ def _register_output_dataset(
    output_dir: str,
    output_dataset_name: str,
    total_images: int,
    dataset_type: str = "IMAGE",
 ) -> None:
    """将自动标注结果注册到新建的数据集。"""
-    images_dir = os.path.join(output_dir, "images")
+    data_dir_name = _get_data_dir_name(dataset_type)
-    if not os.path.isdir(images_dir):
+    data_dir = os.path.join(output_dir, data_dir_name)
    # 兼容旧任务和 IMAGE 算子（它们写入 images/ 目录）
    if not os.path.isdir(data_dir):
        fallback_dir = os.path.join(output_dir, "images")
        if os.path.isdir(fallback_dir):
            data_dir = fallback_dir
        else:
            logger.warning(
-            "Auto-annotation images directory not found for task {}: {}",
+                "Auto-annotation data directory not found for task {}: {}",
                task_id,
-            images_dir,
+                data_dir,
            )
            return
-    image_files: List[Tuple[str, str, int]] = []
+    data_files: List[Tuple[str, str, int]] = []
    annotation_files: List[Tuple[str, str, int]] = []
    total_size = 0
-    for file_name in sorted(os.listdir(images_dir)):
+    for file_name in sorted(os.listdir(data_dir)):
-        file_path = os.path.join(images_dir, file_name)
+        file_path = os.path.join(data_dir, file_name)
        if not os.path.isfile(file_path):
            continue
        try:
            file_size = os.path.getsize(file_path)
        except OSError:
            file_size = 0
-        image_files.append((file_name, file_path, int(file_size)))
+        data_files.append((file_name, file_path, int(file_size)))
        total_size += int(file_size)
    annotations_dir = os.path.join(output_dir, "annotations")
@@ -730,11 +768,11 @@ def _register_output_dataset(
            annotation_files.append((file_name, file_path, int(file_size)))
            total_size += int(file_size)
-    if not image_files:
+    if not data_files:
        logger.warning(
-            "No image files found in auto-annotation output for task {}: {}",
+            "No data files found in auto-annotation output for task {}: {}",
            task_id,
-            images_dir,
+            data_dir,
        )
        return
@@ -759,7 +797,7 @@ def _register_output_dataset(
    with SQLManager.create_connect() as conn:
        added_count = 0
-        for file_name, file_path, file_size in image_files:
+        for file_name, file_path, file_size in data_files:
            ext = os.path.splitext(file_name)[1].lstrip(".").upper() or None
            logical_path = os.path.relpath(file_path, output_dir).replace("\\", "/")
            conn.execute(
@@ -811,7 +849,7 @@ def _register_output_dataset(
        "Registered auto-annotation output into dataset: dataset_id={}, name={}, added_files={}, added_size_bytes={}, task_id={}, output_dir={}",
        output_dataset_id,
        output_dataset_name,
-        len(image_files) + len(annotation_files),
+        len(data_files) + len(annotation_files),
        total_size,
        task_id,
        output_dir,
@@ -832,6 +870,13 @@ def _process_single_task(task: Dict[str, Any]) -> None:
    pipeline_raw = task.get("pipeline")
    selected_file_ids: Optional[List[str]] = task.get("file_ids") or None
    # 解析数据集类型，兜底从数据集元数据获取
    dataset_type = str(task.get("dataset_type") or "").upper() or "IMAGE"
    if dataset_type == "IMAGE" and not task.get("dataset_type"):
        source_meta = _load_dataset_meta(dataset_id)
        if source_meta and source_meta.get("dataset_type"):
            dataset_type = str(source_meta["dataset_type"]).upper()
    output_dataset_name = _get_output_dataset_name(
        task_id=task_id,
        dataset_id=dataset_id,
@@ -892,8 +937,9 @@ def _process_single_task(task: Dict[str, Any]) -> None:
        source_dataset_id=dataset_id,
        source_dataset_name=source_dataset_name,
        output_dataset_name=output_dataset_name,
        dataset_type=dataset_type,
    )
-    output_dir = _ensure_output_dir(output_dir)
+    output_dir = _ensure_output_dir(output_dir, dataset_type=dataset_type)
    _update_task_status(
        task_id,
@@ -959,8 +1005,9 @@ def _process_single_task(task: Dict[str, Any]) -> None:
                return
            try:
                sample_key = _get_sample_key(dataset_type)
                sample = {
-                    "image": file_path,
+                    sample_key: file_path,
                    "filename": file_name,
                }
@@ -983,7 +1030,7 @@ def _process_single_task(task: Dict[str, Any]) -> None:
                )
            except Exception as e:
                logger.error(
-                    "Failed to process image for task {}: file_path={}, error={}",
+                    "Failed to process file for task {}: file_path={}, error={}",
                    task_id,
                    file_path,
                    e,
@@ -1021,6 +1068,7 @@ def _process_single_task(task: Dict[str, Any]) -> None:
                    output_dir=output_dir,
                    output_dataset_name=output_dataset_name,
                    total_images=total_images,
                    dataset_type=dataset_type,
                )
            except Exception as e:  # pragma: no cover - 防御性日志
                logger.error(
@@ -0,0 +1,25 @@
 -- =============================================
 -- Migration: auto-annotation tasks support multiple dataset types
 -- Adds the dataset_type column to t_dm_auto_annotation_tasks
 -- =============================================
 USE datamate;
 -- Remember the current schema name (selected by USE above) for the
 -- information_schema lookup below.
 SET @db_name = DATABASE();
 -- Add the dataset_type column (IMAGE/TEXT/AUDIO/VIDEO); existing rows default to IMAGE.
 -- The statement text is chosen at runtime so the script can be re-run safely:
 -- if the column already exists a harmless SELECT is executed instead of the ALTER TABLE.
 SET @ddl = (
    SELECT IF(
        EXISTS(
            SELECT 1
            FROM information_schema.COLUMNS
            WHERE TABLE_SCHEMA = @db_name
              AND TABLE_NAME = 't_dm_auto_annotation_tasks'
              AND COLUMN_NAME = 'dataset_type'
        ),
        'SELECT ''skip: column dataset_type already exists''',
        'ALTER TABLE t_dm_auto_annotation_tasks ADD COLUMN dataset_type VARCHAR(50) NOT NULL DEFAULT ''IMAGE'' COMMENT ''数据集类型: IMAGE/TEXT/AUDIO/VIDEO'' AFTER dataset_name'
    )
 );
 -- Run whichever statement was selected above, then release the prepared statement.
 PREPARE stmt FROM @ddl;
 EXECUTE stmt;
 DEALLOCATE PREPARE stmt;