You've already forked DataMate
feat(annotation): 自动标注任务支持非图像类型数据集(TEXT/AUDIO/VIDEO)
移除自动标注任务创建流程中的 IMAGE-only 限制,使 TEXT、AUDIO、VIDEO 类型数据集均可用于自动标注任务。 - 新增数据库迁移:t_dm_auto_annotation_tasks 表添加 dataset_type 列 - 后端 schema/API/service 全链路传递 dataset_type - Worker 动态构建 sample key(image/text/audio/video)和输出目录 - 前端移除数据集类型校验,下拉框显示数据集类型标识 - 输出数据集继承源数据集类型,不再硬编码为 IMAGE - 保持向后兼容:默认值为 IMAGE,worker 有元数据回退和目录 fallback Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -6,7 +6,7 @@ import { ArrowLeft } from "lucide-react";
|
||||
import { Link, useNavigate } from "react-router";
|
||||
import { queryDatasetsUsingGet } from "@/pages/DataManagement/dataset.api";
|
||||
import { mapDataset } from "@/pages/DataManagement/dataset.const";
|
||||
import { Dataset, DatasetType } from "@/pages/DataManagement/dataset.model";
|
||||
import { Dataset } from "@/pages/DataManagement/dataset.model";
|
||||
import { createAnnotationOperatorTaskUsingPost } from "../annotation.api";
|
||||
import { useCreateStepTwo } from "./hooks/useCreateStepTwo";
|
||||
import PipelinePreview from "./components/PipelinePreview";
|
||||
@@ -85,11 +85,6 @@ export default function AnnotationOperatorTaskCreate() {
|
||||
try {
|
||||
if (currentStep === 1) {
|
||||
await form.validateFields();
|
||||
|
||||
if (selectedDataset?.datasetType !== DatasetType.IMAGE) {
|
||||
message.error("自动标注算子编排当前仅支持图片数据集");
|
||||
return;
|
||||
}
|
||||
}
|
||||
setCurrentStep((prev) => Math.min(prev + 1, 2));
|
||||
} catch {
|
||||
@@ -109,11 +104,6 @@ export default function AnnotationOperatorTaskCreate() {
|
||||
return;
|
||||
}
|
||||
|
||||
if (selectedDataset?.datasetType !== DatasetType.IMAGE) {
|
||||
message.error("自动标注算子编排当前仅支持图片数据集");
|
||||
return;
|
||||
}
|
||||
|
||||
const outputDatasetName = values.outputDatasetName?.trim();
|
||||
const pipeline = selectedOperators.map((operator, index) => {
|
||||
const overrides = {
|
||||
@@ -200,10 +190,10 @@ export default function AnnotationOperatorTaskCreate() {
|
||||
label="选择数据集"
|
||||
name="datasetId"
|
||||
rules={[{ required: true, message: "请选择数据集" }]}
|
||||
extra="自动标注算子编排当前仅支持图片数据集"
|
||||
extra="请选择用于自动标注的数据集"
|
||||
>
|
||||
<Select
|
||||
placeholder="请选择图片数据集"
|
||||
placeholder="请选择数据集"
|
||||
optionFilterProp="label"
|
||||
options={datasets.map((dataset) => ({
|
||||
label: (
|
||||
@@ -215,12 +205,11 @@ export default function AnnotationOperatorTaskCreate() {
|
||||
{dataset.name}
|
||||
</div>
|
||||
<div className="text-xs text-gray-500">
|
||||
{dataset?.fileCount} 文件 • {dataset.size}
|
||||
{dataset.datasetType} • {dataset?.fileCount} 文件 • {dataset.size}
|
||||
</div>
|
||||
</div>
|
||||
),
|
||||
value: dataset.id,
|
||||
disabled: dataset.datasetType !== DatasetType.IMAGE,
|
||||
}))}
|
||||
/>
|
||||
</Form.Item>
|
||||
|
||||
@@ -210,6 +210,10 @@ class AutoAnnotationTask(Base):
|
||||
dataset_name = Column(
|
||||
String(255), nullable=True, comment="数据集名称(冗余字段,方便查询)"
|
||||
)
|
||||
dataset_type = Column(
|
||||
String(50), nullable=False, default="IMAGE",
|
||||
comment="数据集类型: IMAGE/TEXT/AUDIO/VIDEO",
|
||||
)
|
||||
created_by = Column(String(255), nullable=True, comment="任务创建人")
|
||||
config = Column(JSON, nullable=False, comment="任务配置(模型规模、置信度等)")
|
||||
file_ids = Column(
|
||||
|
||||
@@ -85,23 +85,28 @@ async def _create_task_internal(
|
||||
await assert_dataset_access(db, normalized_request.dataset_id, user_context)
|
||||
# 尝试获取数据集名称和总量用于冗余字段
|
||||
dataset_name = None
|
||||
dataset_type = "IMAGE"
|
||||
total_images = len(normalized_request.file_ids) if normalized_request.file_ids else 0
|
||||
try:
|
||||
dm_client = DatasetManagementService(db)
|
||||
dataset = await dm_client.get_dataset(normalized_request.dataset_id)
|
||||
if dataset is not None:
|
||||
dataset_name = dataset.name
|
||||
dataset_type = getattr(dataset, "datasetType", None) or "IMAGE"
|
||||
if not normalized_request.file_ids:
|
||||
total_images = getattr(dataset, "fileCount", 0) or 0
|
||||
except Exception as e: # pragma: no cover - 容错
|
||||
logger.warning("Failed to fetch dataset summary for annotation task: %s", e)
|
||||
|
||||
resolved_dataset_type = normalized_request.dataset_type or dataset_type
|
||||
|
||||
return await service.create_task(
|
||||
db,
|
||||
normalized_request,
|
||||
user_context=user_context,
|
||||
dataset_name=dataset_name,
|
||||
total_images=total_images,
|
||||
dataset_type=resolved_dataset_type,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -75,6 +75,11 @@ class CreateAutoAnnotationTaskRequest(BaseModel):
|
||||
|
||||
name: str = Field(..., min_length=1, max_length=255, description="任务名称")
|
||||
dataset_id: str = Field(..., alias="datasetId", description="数据集ID")
|
||||
dataset_type: Optional[str] = Field(
|
||||
default=None,
|
||||
alias="datasetType",
|
||||
description="数据集类型: IMAGE/TEXT/AUDIO/VIDEO(不传时由后端自动获取)",
|
||||
)
|
||||
config: Optional[AutoAnnotationConfig] = Field(
|
||||
default=None,
|
||||
description="兼容旧版 YOLO 任务配置",
|
||||
@@ -120,6 +125,7 @@ class AutoAnnotationTaskResponse(BaseModel):
|
||||
name: str = Field(..., description="任务名称")
|
||||
dataset_id: str = Field(..., alias="datasetId", description="数据集ID")
|
||||
dataset_name: Optional[str] = Field(None, alias="datasetName", description="数据集名称")
|
||||
dataset_type: Optional[str] = Field(None, alias="datasetType", description="数据集类型: IMAGE/TEXT/AUDIO/VIDEO")
|
||||
task_mode: Optional[str] = Field(None, alias="taskMode", description="任务模式")
|
||||
executor_type: Optional[str] = Field(None, alias="executorType", description="执行器类型")
|
||||
pipeline: Optional[List[Dict[str, Any]]] = Field(None, description="算子编排定义")
|
||||
|
||||
@@ -141,6 +141,7 @@ class AutoAnnotationTaskService:
|
||||
user_context: RequestUserContext,
|
||||
dataset_name: Optional[str] = None,
|
||||
total_images: int = 0,
|
||||
dataset_type: str = "IMAGE",
|
||||
) -> AutoAnnotationTaskResponse:
|
||||
"""创建自动标注任务,初始状态为 pending。
|
||||
|
||||
@@ -170,6 +171,7 @@ class AutoAnnotationTaskService:
|
||||
name=request.name,
|
||||
dataset_id=request.dataset_id,
|
||||
dataset_name=dataset_name,
|
||||
dataset_type=dataset_type,
|
||||
created_by=user_context.user_id,
|
||||
config=normalized_config,
|
||||
task_mode=request.task_mode,
|
||||
|
||||
@@ -175,7 +175,7 @@ def _fetch_pending_task() -> Optional[Dict[str, Any]]:
|
||||
|
||||
query_sql = text(
|
||||
"""
|
||||
SELECT id, name, dataset_id, dataset_name, created_by,
|
||||
SELECT id, name, dataset_id, dataset_name, dataset_type, created_by,
|
||||
config, file_ids, pipeline,
|
||||
task_mode, executor_type,
|
||||
status, stop_requested, run_token,
|
||||
@@ -521,6 +521,35 @@ def _count_detections(sample: Dict[str, Any]) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 数据集类型 → sample key / 输出子目录 映射
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
DATASET_TYPE_SAMPLE_KEY: Dict[str, str] = {
|
||||
"IMAGE": "image",
|
||||
"TEXT": "text",
|
||||
"AUDIO": "audio",
|
||||
"VIDEO": "video",
|
||||
}
|
||||
|
||||
DATASET_TYPE_DATA_DIR: Dict[str, str] = {
|
||||
"IMAGE": "images",
|
||||
"TEXT": "data",
|
||||
"AUDIO": "data",
|
||||
"VIDEO": "data",
|
||||
}
|
||||
|
||||
|
||||
def _get_sample_key(dataset_type: str) -> str:
|
||||
"""根据数据集类型返回 sample dict 中主数据对应的 key。"""
|
||||
return DATASET_TYPE_SAMPLE_KEY.get(dataset_type.upper(), "image")
|
||||
|
||||
|
||||
def _get_data_dir_name(dataset_type: str) -> str:
|
||||
"""根据数据集类型返回输出子目录名。"""
|
||||
return DATASET_TYPE_DATA_DIR.get(dataset_type.upper(), "images")
|
||||
|
||||
|
||||
def _get_operator_whitelist() -> Optional[set[str]]:
|
||||
"""获取灰度白名单;返回 None 表示放开全部。"""
|
||||
|
||||
@@ -584,7 +613,7 @@ def _load_dataset_meta(dataset_id: str) -> Optional[Dict[str, Any]]:
|
||||
|
||||
sql = text(
|
||||
"""
|
||||
SELECT id, name, parent_dataset_id, path
|
||||
SELECT id, name, parent_dataset_id, path, dataset_type
|
||||
FROM t_dm_datasets
|
||||
WHERE id = :dataset_id
|
||||
"""
|
||||
@@ -638,11 +667,12 @@ def _load_files_by_ids(file_ids: List[str]) -> List[Tuple[str, str, str]]:
|
||||
return [(str(r[0]), str(r[1]), str(r[2])) for r in rows]
|
||||
|
||||
|
||||
def _ensure_output_dir(output_dir: str) -> str:
|
||||
"""确保输出目录及其 images/、annotations/ 子目录存在。"""
|
||||
def _ensure_output_dir(output_dir: str, dataset_type: str = "IMAGE") -> str:
|
||||
"""确保输出目录及其数据/annotations 子目录存在。"""
|
||||
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
os.makedirs(os.path.join(output_dir, "images"), exist_ok=True)
|
||||
data_dir_name = _get_data_dir_name(dataset_type)
|
||||
os.makedirs(os.path.join(output_dir, data_dir_name), exist_ok=True)
|
||||
os.makedirs(os.path.join(output_dir, "annotations"), exist_ok=True)
|
||||
return output_dir
|
||||
|
||||
@@ -651,6 +681,7 @@ def _create_output_dataset(
|
||||
source_dataset_id: str,
|
||||
source_dataset_name: str,
|
||||
output_dataset_name: str,
|
||||
dataset_type: str = "IMAGE",
|
||||
) -> Tuple[str, str]:
|
||||
"""为自动标注结果创建一个新的数据集并返回 (dataset_id, path)。"""
|
||||
|
||||
@@ -673,7 +704,7 @@ def _create_output_dataset(
|
||||
"parent_dataset_id": parent_dataset_id,
|
||||
"name": output_dataset_name,
|
||||
"description": description,
|
||||
"dataset_type": "IMAGE",
|
||||
"dataset_type": dataset_type,
|
||||
"path": output_dir,
|
||||
"status": "ACTIVE",
|
||||
}
|
||||
@@ -690,31 +721,38 @@ def _register_output_dataset(
|
||||
output_dir: str,
|
||||
output_dataset_name: str,
|
||||
total_images: int,
|
||||
dataset_type: str = "IMAGE",
|
||||
) -> None:
|
||||
"""将自动标注结果注册到新建的数据集。"""
|
||||
|
||||
images_dir = os.path.join(output_dir, "images")
|
||||
if not os.path.isdir(images_dir):
|
||||
data_dir_name = _get_data_dir_name(dataset_type)
|
||||
data_dir = os.path.join(output_dir, data_dir_name)
|
||||
# 兼容旧任务和 IMAGE 算子(它们写入 images/ 目录)
|
||||
if not os.path.isdir(data_dir):
|
||||
fallback_dir = os.path.join(output_dir, "images")
|
||||
if os.path.isdir(fallback_dir):
|
||||
data_dir = fallback_dir
|
||||
else:
|
||||
logger.warning(
|
||||
"Auto-annotation images directory not found for task {}: {}",
|
||||
"Auto-annotation data directory not found for task {}: {}",
|
||||
task_id,
|
||||
images_dir,
|
||||
data_dir,
|
||||
)
|
||||
return
|
||||
|
||||
image_files: List[Tuple[str, str, int]] = []
|
||||
data_files: List[Tuple[str, str, int]] = []
|
||||
annotation_files: List[Tuple[str, str, int]] = []
|
||||
total_size = 0
|
||||
|
||||
for file_name in sorted(os.listdir(images_dir)):
|
||||
file_path = os.path.join(images_dir, file_name)
|
||||
for file_name in sorted(os.listdir(data_dir)):
|
||||
file_path = os.path.join(data_dir, file_name)
|
||||
if not os.path.isfile(file_path):
|
||||
continue
|
||||
try:
|
||||
file_size = os.path.getsize(file_path)
|
||||
except OSError:
|
||||
file_size = 0
|
||||
image_files.append((file_name, file_path, int(file_size)))
|
||||
data_files.append((file_name, file_path, int(file_size)))
|
||||
total_size += int(file_size)
|
||||
|
||||
annotations_dir = os.path.join(output_dir, "annotations")
|
||||
@@ -730,11 +768,11 @@ def _register_output_dataset(
|
||||
annotation_files.append((file_name, file_path, int(file_size)))
|
||||
total_size += int(file_size)
|
||||
|
||||
if not image_files:
|
||||
if not data_files:
|
||||
logger.warning(
|
||||
"No image files found in auto-annotation output for task {}: {}",
|
||||
"No data files found in auto-annotation output for task {}: {}",
|
||||
task_id,
|
||||
images_dir,
|
||||
data_dir,
|
||||
)
|
||||
return
|
||||
|
||||
@@ -759,7 +797,7 @@ def _register_output_dataset(
|
||||
with SQLManager.create_connect() as conn:
|
||||
added_count = 0
|
||||
|
||||
for file_name, file_path, file_size in image_files:
|
||||
for file_name, file_path, file_size in data_files:
|
||||
ext = os.path.splitext(file_name)[1].lstrip(".").upper() or None
|
||||
logical_path = os.path.relpath(file_path, output_dir).replace("\\", "/")
|
||||
conn.execute(
|
||||
@@ -811,7 +849,7 @@ def _register_output_dataset(
|
||||
"Registered auto-annotation output into dataset: dataset_id={}, name={}, added_files={}, added_size_bytes={}, task_id={}, output_dir={}",
|
||||
output_dataset_id,
|
||||
output_dataset_name,
|
||||
len(image_files) + len(annotation_files),
|
||||
len(data_files) + len(annotation_files),
|
||||
total_size,
|
||||
task_id,
|
||||
output_dir,
|
||||
@@ -832,6 +870,13 @@ def _process_single_task(task: Dict[str, Any]) -> None:
|
||||
pipeline_raw = task.get("pipeline")
|
||||
selected_file_ids: Optional[List[str]] = task.get("file_ids") or None
|
||||
|
||||
# 解析数据集类型,兜底从数据集元数据获取
|
||||
dataset_type = str(task.get("dataset_type") or "").upper() or "IMAGE"
|
||||
if dataset_type == "IMAGE" and not task.get("dataset_type"):
|
||||
source_meta = _load_dataset_meta(dataset_id)
|
||||
if source_meta and source_meta.get("dataset_type"):
|
||||
dataset_type = str(source_meta["dataset_type"]).upper()
|
||||
|
||||
output_dataset_name = _get_output_dataset_name(
|
||||
task_id=task_id,
|
||||
dataset_id=dataset_id,
|
||||
@@ -892,8 +937,9 @@ def _process_single_task(task: Dict[str, Any]) -> None:
|
||||
source_dataset_id=dataset_id,
|
||||
source_dataset_name=source_dataset_name,
|
||||
output_dataset_name=output_dataset_name,
|
||||
dataset_type=dataset_type,
|
||||
)
|
||||
output_dir = _ensure_output_dir(output_dir)
|
||||
output_dir = _ensure_output_dir(output_dir, dataset_type=dataset_type)
|
||||
|
||||
_update_task_status(
|
||||
task_id,
|
||||
@@ -959,8 +1005,9 @@ def _process_single_task(task: Dict[str, Any]) -> None:
|
||||
return
|
||||
|
||||
try:
|
||||
sample_key = _get_sample_key(dataset_type)
|
||||
sample = {
|
||||
"image": file_path,
|
||||
sample_key: file_path,
|
||||
"filename": file_name,
|
||||
}
|
||||
|
||||
@@ -983,7 +1030,7 @@ def _process_single_task(task: Dict[str, Any]) -> None:
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
"Failed to process image for task {}: file_path={}, error={}",
|
||||
"Failed to process file for task {}: file_path={}, error={}",
|
||||
task_id,
|
||||
file_path,
|
||||
e,
|
||||
@@ -1021,6 +1068,7 @@ def _process_single_task(task: Dict[str, Any]) -> None:
|
||||
output_dir=output_dir,
|
||||
output_dataset_name=output_dataset_name,
|
||||
total_images=total_images,
|
||||
dataset_type=dataset_type,
|
||||
)
|
||||
except Exception as e: # pragma: no cover - 防御性日志
|
||||
logger.error(
|
||||
|
||||
25
scripts/db/data-annotation-multitype-migration.sql
Normal file
25
scripts/db/data-annotation-multitype-migration.sql
Normal file
@@ -0,0 +1,25 @@
|
||||
-- =============================================
|
||||
-- Migration: auto-annotation tasks support multiple dataset types
|
||||
-- Adds the dataset_type column to the t_dm_auto_annotation_tasks table
|
||||
-- =============================================
|
||||
|
||||
USE datamate;
|
||||
SET @db_name = DATABASE();
|
||||
|
||||
-- Add the dataset_type column (IMAGE/TEXT/AUDIO/VIDEO); existing rows default to IMAGE.
-- The statement text is chosen at run time: when information_schema already lists the
-- column, a no-op SELECT is prepared instead of the ALTER TABLE, so the script can be re-run.
|
||||
SET @ddl = (
|
||||
SELECT IF(
|
||||
EXISTS(
|
||||
SELECT 1
|
||||
FROM information_schema.COLUMNS
|
||||
WHERE TABLE_SCHEMA = @db_name
|
||||
AND TABLE_NAME = 't_dm_auto_annotation_tasks'
|
||||
AND COLUMN_NAME = 'dataset_type'
|
||||
),
|
||||
'SELECT ''skip: column dataset_type already exists''',
|
||||
'ALTER TABLE t_dm_auto_annotation_tasks ADD COLUMN dataset_type VARCHAR(50) NOT NULL DEFAULT ''IMAGE'' COMMENT ''数据集类型: IMAGE/TEXT/AUDIO/VIDEO'' AFTER dataset_name'
|
||||
)
|
||||
|
||||
);
|
||||
PREPARE stmt FROM @ddl;
|
||||
EXECUTE stmt;
|
||||
DEALLOCATE PREPARE stmt;
|
||||
Reference in New Issue
Block a user