feat(annotation): 自动标注任务支持非图像类型数据集(TEXT/AUDIO/VIDEO)

移除自动标注任务创建流程中的 IMAGE-only 限制,使 TEXT、AUDIO、VIDEO
类型数据集均可用于自动标注任务。

- 新增数据库迁移:t_dm_auto_annotation_tasks 表添加 dataset_type 列
- 后端 schema/API/service 全链路传递 dataset_type
- Worker 动态构建 sample key(image/text/audio/video)和输出目录
- 前端移除数据集类型校验,下拉框显示数据集类型标识
- 输出数据集继承源数据集类型,不再硬编码为 IMAGE
- 保持向后兼容:默认值为 IMAGE,worker 有元数据回退和目录 fallback

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-09 23:23:05 +08:00
parent 807c2289e2
commit 8ffa131fad
7 changed files with 1161 additions and 1082 deletions

View File

@@ -6,7 +6,7 @@ import { ArrowLeft } from "lucide-react";
import { Link, useNavigate } from "react-router";
import { queryDatasetsUsingGet } from "@/pages/DataManagement/dataset.api";
import { mapDataset } from "@/pages/DataManagement/dataset.const";
import { Dataset, DatasetType } from "@/pages/DataManagement/dataset.model";
import { Dataset } from "@/pages/DataManagement/dataset.model";
import { createAnnotationOperatorTaskUsingPost } from "../annotation.api";
import { useCreateStepTwo } from "./hooks/useCreateStepTwo";
import PipelinePreview from "./components/PipelinePreview";
@@ -85,11 +85,6 @@ export default function AnnotationOperatorTaskCreate() {
try {
if (currentStep === 1) {
await form.validateFields();
if (selectedDataset?.datasetType !== DatasetType.IMAGE) {
message.error("自动标注算子编排当前仅支持图片数据集");
return;
}
}
setCurrentStep((prev) => Math.min(prev + 1, 2));
} catch {
@@ -109,11 +104,6 @@ export default function AnnotationOperatorTaskCreate() {
return;
}
if (selectedDataset?.datasetType !== DatasetType.IMAGE) {
message.error("自动标注算子编排当前仅支持图片数据集");
return;
}
const outputDatasetName = values.outputDatasetName?.trim();
const pipeline = selectedOperators.map((operator, index) => {
const overrides = {
@@ -200,10 +190,10 @@ export default function AnnotationOperatorTaskCreate() {
label="选择数据集"
name="datasetId"
rules={[{ required: true, message: "请选择数据集" }]}
extra="自动标注算子编排当前仅支持图片数据集"
extra="请选择用于自动标注的数据集"
>
<Select
placeholder="请选择图片数据集"
placeholder="请选择数据集"
optionFilterProp="label"
options={datasets.map((dataset) => ({
label: (
@@ -215,12 +205,11 @@ export default function AnnotationOperatorTaskCreate() {
{dataset.name}
</div>
<div className="text-xs text-gray-500">
{dataset?.fileCount} {dataset.size}
{dataset.datasetType} &bull; {dataset?.fileCount} &bull; {dataset.size}
</div>
</div>
),
value: dataset.id,
disabled: dataset.datasetType !== DatasetType.IMAGE,
}))}
/>
</Form.Item>

View File

@@ -210,6 +210,10 @@ class AutoAnnotationTask(Base):
dataset_name = Column(
String(255), nullable=True, comment="数据集名称(冗余字段,方便查询)"
)
dataset_type = Column(
String(50), nullable=False, default="IMAGE",
comment="数据集类型: IMAGE/TEXT/AUDIO/VIDEO",
)
created_by = Column(String(255), nullable=True, comment="任务创建人")
config = Column(JSON, nullable=False, comment="任务配置(模型规模、置信度等)")
file_ids = Column(

View File

@@ -85,23 +85,28 @@ async def _create_task_internal(
await assert_dataset_access(db, normalized_request.dataset_id, user_context)
# 尝试获取数据集名称和总量用于冗余字段
dataset_name = None
dataset_type = "IMAGE"
total_images = len(normalized_request.file_ids) if normalized_request.file_ids else 0
try:
dm_client = DatasetManagementService(db)
dataset = await dm_client.get_dataset(normalized_request.dataset_id)
if dataset is not None:
dataset_name = dataset.name
dataset_type = getattr(dataset, "datasetType", None) or "IMAGE"
if not normalized_request.file_ids:
total_images = getattr(dataset, "fileCount", 0) or 0
except Exception as e: # pragma: no cover - 容错
logger.warning("Failed to fetch dataset summary for annotation task: %s", e)
resolved_dataset_type = normalized_request.dataset_type or dataset_type
return await service.create_task(
db,
normalized_request,
user_context=user_context,
dataset_name=dataset_name,
total_images=total_images,
dataset_type=resolved_dataset_type,
)

View File

@@ -75,6 +75,11 @@ class CreateAutoAnnotationTaskRequest(BaseModel):
name: str = Field(..., min_length=1, max_length=255, description="任务名称")
dataset_id: str = Field(..., alias="datasetId", description="数据集ID")
dataset_type: Optional[str] = Field(
default=None,
alias="datasetType",
description="数据集类型: IMAGE/TEXT/AUDIO/VIDEO(不传时由后端自动获取)",
)
config: Optional[AutoAnnotationConfig] = Field(
default=None,
description="兼容旧版 YOLO 任务配置",
@@ -120,6 +125,7 @@ class AutoAnnotationTaskResponse(BaseModel):
name: str = Field(..., description="任务名称")
dataset_id: str = Field(..., alias="datasetId", description="数据集ID")
dataset_name: Optional[str] = Field(None, alias="datasetName", description="数据集名称")
dataset_type: Optional[str] = Field(None, alias="datasetType", description="数据集类型: IMAGE/TEXT/AUDIO/VIDEO")
task_mode: Optional[str] = Field(None, alias="taskMode", description="任务模式")
executor_type: Optional[str] = Field(None, alias="executorType", description="执行器类型")
pipeline: Optional[List[Dict[str, Any]]] = Field(None, description="算子编排定义")

View File

@@ -141,6 +141,7 @@ class AutoAnnotationTaskService:
user_context: RequestUserContext,
dataset_name: Optional[str] = None,
total_images: int = 0,
dataset_type: str = "IMAGE",
) -> AutoAnnotationTaskResponse:
"""创建自动标注任务,初始状态为 pending。
@@ -170,6 +171,7 @@ class AutoAnnotationTaskService:
name=request.name,
dataset_id=request.dataset_id,
dataset_name=dataset_name,
dataset_type=dataset_type,
created_by=user_context.user_id,
config=normalized_config,
task_mode=request.task_mode,

View File

@@ -175,7 +175,7 @@ def _fetch_pending_task() -> Optional[Dict[str, Any]]:
query_sql = text(
"""
SELECT id, name, dataset_id, dataset_name, created_by,
SELECT id, name, dataset_id, dataset_name, dataset_type, created_by,
config, file_ids, pipeline,
task_mode, executor_type,
status, stop_requested, run_token,
@@ -521,6 +521,35 @@ def _count_detections(sample: Dict[str, Any]) -> int:
return 0
# ---------------------------------------------------------------------------
# 数据集类型 → sample key / 输出子目录 映射
# ---------------------------------------------------------------------------
DATASET_TYPE_SAMPLE_KEY: Dict[str, str] = {
"IMAGE": "image",
"TEXT": "text",
"AUDIO": "audio",
"VIDEO": "video",
}
DATASET_TYPE_DATA_DIR: Dict[str, str] = {
"IMAGE": "images",
"TEXT": "data",
"AUDIO": "data",
"VIDEO": "data",
}
def _get_sample_key(dataset_type: str) -> str:
"""根据数据集类型返回 sample dict 中主数据对应的 key。"""
return DATASET_TYPE_SAMPLE_KEY.get(dataset_type.upper(), "image")
def _get_data_dir_name(dataset_type: str) -> str:
"""根据数据集类型返回输出子目录名。"""
return DATASET_TYPE_DATA_DIR.get(dataset_type.upper(), "images")
def _get_operator_whitelist() -> Optional[set[str]]:
"""获取灰度白名单;返回 None 表示放开全部。"""
@@ -584,7 +613,7 @@ def _load_dataset_meta(dataset_id: str) -> Optional[Dict[str, Any]]:
sql = text(
"""
SELECT id, name, parent_dataset_id, path
SELECT id, name, parent_dataset_id, path, dataset_type
FROM t_dm_datasets
WHERE id = :dataset_id
"""
@@ -638,11 +667,12 @@ def _load_files_by_ids(file_ids: List[str]) -> List[Tuple[str, str, str]]:
return [(str(r[0]), str(r[1]), str(r[2])) for r in rows]
def _ensure_output_dir(output_dir: str) -> str:
"""确保输出目录及其 images/、annotations/ 子目录存在。"""
def _ensure_output_dir(output_dir: str, dataset_type: str = "IMAGE") -> str:
"""确保输出目录及其数据/annotations 子目录存在。"""
os.makedirs(output_dir, exist_ok=True)
os.makedirs(os.path.join(output_dir, "images"), exist_ok=True)
data_dir_name = _get_data_dir_name(dataset_type)
os.makedirs(os.path.join(output_dir, data_dir_name), exist_ok=True)
os.makedirs(os.path.join(output_dir, "annotations"), exist_ok=True)
return output_dir
@@ -651,6 +681,7 @@ def _create_output_dataset(
source_dataset_id: str,
source_dataset_name: str,
output_dataset_name: str,
dataset_type: str = "IMAGE",
) -> Tuple[str, str]:
"""为自动标注结果创建一个新的数据集并返回 (dataset_id, path)。"""
@@ -673,7 +704,7 @@ def _create_output_dataset(
"parent_dataset_id": parent_dataset_id,
"name": output_dataset_name,
"description": description,
"dataset_type": "IMAGE",
"dataset_type": dataset_type,
"path": output_dir,
"status": "ACTIVE",
}
@@ -690,31 +721,38 @@ def _register_output_dataset(
output_dir: str,
output_dataset_name: str,
total_images: int,
dataset_type: str = "IMAGE",
) -> None:
"""将自动标注结果注册到新建的数据集。"""
images_dir = os.path.join(output_dir, "images")
if not os.path.isdir(images_dir):
logger.warning(
"Auto-annotation images directory not found for task {}: {}",
task_id,
images_dir,
)
return
data_dir_name = _get_data_dir_name(dataset_type)
data_dir = os.path.join(output_dir, data_dir_name)
# 兼容旧任务和 IMAGE 算子(它们写入 images/ 目录)
if not os.path.isdir(data_dir):
fallback_dir = os.path.join(output_dir, "images")
if os.path.isdir(fallback_dir):
data_dir = fallback_dir
else:
logger.warning(
"Auto-annotation data directory not found for task {}: {}",
task_id,
data_dir,
)
return
image_files: List[Tuple[str, str, int]] = []
data_files: List[Tuple[str, str, int]] = []
annotation_files: List[Tuple[str, str, int]] = []
total_size = 0
for file_name in sorted(os.listdir(images_dir)):
file_path = os.path.join(images_dir, file_name)
for file_name in sorted(os.listdir(data_dir)):
file_path = os.path.join(data_dir, file_name)
if not os.path.isfile(file_path):
continue
try:
file_size = os.path.getsize(file_path)
except OSError:
file_size = 0
image_files.append((file_name, file_path, int(file_size)))
data_files.append((file_name, file_path, int(file_size)))
total_size += int(file_size)
annotations_dir = os.path.join(output_dir, "annotations")
@@ -730,11 +768,11 @@ def _register_output_dataset(
annotation_files.append((file_name, file_path, int(file_size)))
total_size += int(file_size)
if not image_files:
if not data_files:
logger.warning(
"No image files found in auto-annotation output for task {}: {}",
"No data files found in auto-annotation output for task {}: {}",
task_id,
images_dir,
data_dir,
)
return
@@ -759,7 +797,7 @@ def _register_output_dataset(
with SQLManager.create_connect() as conn:
added_count = 0
for file_name, file_path, file_size in image_files:
for file_name, file_path, file_size in data_files:
ext = os.path.splitext(file_name)[1].lstrip(".").upper() or None
logical_path = os.path.relpath(file_path, output_dir).replace("\\", "/")
conn.execute(
@@ -811,7 +849,7 @@ def _register_output_dataset(
"Registered auto-annotation output into dataset: dataset_id={}, name={}, added_files={}, added_size_bytes={}, task_id={}, output_dir={}",
output_dataset_id,
output_dataset_name,
len(image_files) + len(annotation_files),
len(data_files) + len(annotation_files),
total_size,
task_id,
output_dir,
@@ -832,6 +870,13 @@ def _process_single_task(task: Dict[str, Any]) -> None:
pipeline_raw = task.get("pipeline")
selected_file_ids: Optional[List[str]] = task.get("file_ids") or None
# 解析数据集类型,兜底从数据集元数据获取
dataset_type = str(task.get("dataset_type") or "").upper() or "IMAGE"
if dataset_type == "IMAGE" and not task.get("dataset_type"):
source_meta = _load_dataset_meta(dataset_id)
if source_meta and source_meta.get("dataset_type"):
dataset_type = str(source_meta["dataset_type"]).upper()
output_dataset_name = _get_output_dataset_name(
task_id=task_id,
dataset_id=dataset_id,
@@ -892,8 +937,9 @@ def _process_single_task(task: Dict[str, Any]) -> None:
source_dataset_id=dataset_id,
source_dataset_name=source_dataset_name,
output_dataset_name=output_dataset_name,
dataset_type=dataset_type,
)
output_dir = _ensure_output_dir(output_dir)
output_dir = _ensure_output_dir(output_dir, dataset_type=dataset_type)
_update_task_status(
task_id,
@@ -959,8 +1005,9 @@ def _process_single_task(task: Dict[str, Any]) -> None:
return
try:
sample_key = _get_sample_key(dataset_type)
sample = {
"image": file_path,
sample_key: file_path,
"filename": file_name,
}
@@ -983,7 +1030,7 @@ def _process_single_task(task: Dict[str, Any]) -> None:
)
except Exception as e:
logger.error(
"Failed to process image for task {}: file_path={}, error={}",
"Failed to process file for task {}: file_path={}, error={}",
task_id,
file_path,
e,
@@ -1021,6 +1068,7 @@ def _process_single_task(task: Dict[str, Any]) -> None:
output_dir=output_dir,
output_dataset_name=output_dataset_name,
total_images=total_images,
dataset_type=dataset_type,
)
except Exception as e: # pragma: no cover - 防御性日志
logger.error(

View File

@@ -0,0 +1,25 @@
-- =============================================
-- Migration: multi-dataset-type support for auto-annotation tasks
-- Adds the dataset_type column to t_dm_auto_annotation_tasks.
-- The script is written to be re-runnable: the ALTER is only issued
-- when the column is not already present.
-- =============================================
USE datamate;
-- Capture the current schema name so the information_schema lookup below
-- is scoped to this database only.
SET @db_name = DATABASE();
-- Add the dataset_type column (IMAGE/TEXT/AUDIO/VIDEO); existing rows default to IMAGE.
-- @ddl receives one of two statements as text:
--   * a harmless SELECT that reports "skip" when the column already exists, or
--   * the ALTER TABLE that adds the column right after dataset_name.
SET @ddl = (
SELECT IF(
EXISTS(
SELECT 1
FROM information_schema.COLUMNS
WHERE TABLE_SCHEMA = @db_name
AND TABLE_NAME = 't_dm_auto_annotation_tasks'
AND COLUMN_NAME = 'dataset_type'
),
'SELECT ''skip: column dataset_type already exists''',
'ALTER TABLE t_dm_auto_annotation_tasks ADD COLUMN dataset_type VARCHAR(50) NOT NULL DEFAULT ''IMAGE'' COMMENT ''数据集类型: IMAGE/TEXT/AUDIO/VIDEO'' AFTER dataset_name'
)
);
-- Run whichever statement was selected, then release the prepared handle.
PREPARE stmt FROM @ddl;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;