Files
DataMate/runtime/datamate-python/app/module/annotation/service/builtin_templates.py
Jerry Yan 977a930c97 feat(annotation): 支持音频和视频数据类型的标注任务
- 添加了音频和视频数据类型常量定义
- 实现了音频和视频标注模板的内置配置
- 扩展前端组件以支持按数据类型过滤标注模板
- 重构后端编辑器服务以处理音频和视频任务构建
- 更新数据库初始化脚本包含音频和视频标注模板
- 添加音频和视频数据类型的预览URL映射逻辑
2026-01-26 23:54:40 +08:00

397 lines
15 KiB
Python

from dataclasses import dataclass
from datetime import datetime
from typing import List
from sqlalchemy import select
from sqlalchemy.exc import IntegrityError
from sqlalchemy.ext.asyncio import AsyncSession
from app.core.logging import get_logger
from app.db.models.annotation_management import AnnotationTemplate
from app.module.annotation.utils.config_validator import LabelStudioConfigValidator
# Module-level logger.
logger = get_logger(__name__)

# Values used for the data_type field of the built-in templates.
DATA_TYPE_IMAGE = "image"
DATA_TYPE_AUDIO = "audio"
DATA_TYPE_VIDEO = "video"

# Values used for the category field of the built-in templates.
CATEGORY_COMPUTER_VISION = "computer-vision"
CATEGORY_AUDIO_SPEECH = "audio-speech"
CATEGORY_VIDEO = "video"

# Style and version shared by every built-in template defined in this module.
STYLE_HORIZONTAL = "horizontal"
VERSION_DEFAULT = "1.0.0"
# --- Image label configs ----------------------------------------------------
# Double quotes need no escaping inside a triple-quoted string, so the XML is
# written verbatim; the resulting string values are unchanged.

# Whole-image classification: a single Choices control bound to the image.
IMAGE_CLASSIFICATION_LABEL_CONFIG = """<View>
<Image name="image" value="$image"/>
<Choices name="choice" toName="image">
<Choice value="Adult content"/>
<Choice value="Weapons" />
<Choice value="Violence" />
</Choices>
</View>"""

# Object detection: rectangle (bounding-box) labels drawn on the image.
OBJECT_DETECTION_LABEL_CONFIG = """<View>
<Image name="image" value="$image"/>
<RectangleLabels name="label" toName="image">
<Label value="Airplane" background="green"/>
<Label value="Car" background="blue"/>
</RectangleLabels>
</View>"""

# Semantic segmentation with brush masks on a zoomable image.
SEMANTIC_SEGMENTATION_MASK_LABEL_CONFIG = """<View>
<Image name="image" value="$image" zoom="true"/>
<BrushLabels name="tag" toName="image">
<Label value="Airplane" background="rgba(255, 0, 0, 0.7)"/>
<Label value="Car" background="rgba(0, 0, 255, 0.7)"/>
</BrushLabels>
</View>"""

# Semantic segmentation with polygons on a zoomable image.
SEMANTIC_SEGMENTATION_POLYGON_LABEL_CONFIG = """<View>
<Header value="选择标签并点击图像开始"/>
<Image name="image" value="$image" zoom="true"/>
<PolygonLabels name="label" toName="image" strokeWidth="3" pointSize="small" opacity="0.9">
<Label value="Airplane" background="red"/>
<Label value="Car" background="blue"/>
</PolygonLabels>
</View>"""
# --- Audio label configs ----------------------------------------------------
# Double quotes need no escaping inside a triple-quoted string, so the XML is
# written verbatim; the resulting string values are unchanged.

# Segment-level ASR: label audio regions, then transcribe each region
# (the TextArea is perRegion and required).
ASR_SEGMENTS_LABEL_CONFIG = """<View>
<Labels name="labels" toName="audio">
<Label value="Speech" />
<Label value="Noise" />
</Labels>
<Audio name="audio" value="$audio"/>
<TextArea name="transcription" toName="audio"
rows="2" editable="true"
perRegion="true" required="true" />
</View>"""

# Whole-clip ASR: one transcription text area for the entire audio.
ASR_LABEL_CONFIG = """<View>
<Audio name="audio" value="$audio" zoom="true" hotkey="ctrl+enter" />
<Header value="转录音频内容" />
<TextArea name="transcription" toName="audio"
rows="4" editable="true" maxSubmissions="1" />
</View>"""

# Conversation analysis: audio synced with a dialogue transcript; the task
# data must provide both $audio and $text.
CONVERSATION_ANALYSIS_LABEL_CONFIG = """<View>
<Audio name="audio" value="$audio" hotkey="space" sync="text"/>
<Header value="对话记录"/>
<Paragraphs audioUrl="$audio" sync="audio" name="text" value="$text"
layout="dialogue" textKey="text" nameKey="author"
granularity="paragraph" contextscroll="true" />
<View style="position: sticky">
<Header value="情感标签"/>
<ParagraphLabels name="label" toName="text">
<Label value="Positive" background="#00ff00"/>
<Label value="Negative" background="#ff0000"/>
</ParagraphLabels>
</View>
</View>"""

# Intent classification: mark segments, then pick one intent per region.
INTENT_CLASSIFICATION_LABEL_CONFIG = """<View>
<Labels name="labels" toName="audio">
<Label value="Segment" />
</Labels>
<Audio name="audio" value="$audio"/>
<Choices name="intent" toName="audio" perRegion="true" required="true">
<Choice value="Question" />
<Choice value="Request" />
<Choice value="Satisfied" />
<Choice value="Interested" />
<Choice value="Unsatisfied" />
</Choices>
</View>"""

# Signal quality: a 10-step star rating for the audio clip.
SIGNAL_QUALITY_LABEL_CONFIG = """<View>
<Rating name="rating" toName="audio" maxRating="10" icon="star" size="medium" />
<Audio name="audio" value="$audio"/>
</View>"""

# Sound event detection: label audio regions with an event class.
SOUND_EVENT_DETECTION_LABEL_CONFIG = """<View>
<Labels name="label" toName="audio" zoom="true" hotkey="ctrl+enter">
<Label value="Event A" background="red"/>
<Label value="Event B" background="green"/>
</Labels>
<Audio name="audio" value="$audio"/>
</View>"""

# Speaker segmentation: label audio regions with the active speaker.
SPEAKER_SEGMENTATION_LABEL_CONFIG = """<View>
<Labels name="label" toName="audio" zoom="true" hotkey="ctrl+enter">
<Label value="Speaker one" background="#00FF00"/>
<Label value="Speaker two" background="#12ad59"/>
</Labels>
<Audio name="audio" value="$audio" />
</View>"""
# --- Video label configs ----------------------------------------------------
# Double quotes need no escaping inside a triple-quoted string, so the XML is
# written verbatim; the resulting string values are unchanged.

# Whole-video classification: inline Choices bound to the video.
VIDEO_CLASSIFICATION_LABEL_CONFIG = """<View>
<Video name="video" value="$video"/>
<Choices name="choice" toName="video" showInline="true">
<Choice value="Sports" />
<Choice value="News" />
<Choice value="Entertainment" />
<Choice value="Education" />
</Choices>
</View>"""

# Object tracking: labels plus a VideoRectangle control on a 25 fps video.
VIDEO_OBJECT_TRACKING_LABEL_CONFIG = """<View>
<Labels name="videoLabels" toName="video" allowEmpty="true">
<Label value="Man" background="blue"/>
<Label value="Woman" background="red"/>
<Label value="Other" background="green"/>
</Labels>
<Video name="video" value="$video" framerate="25.0"/>
<VideoRectangle name="box" toName="video" />
</View>"""

# Timeline segmentation: a video synced with an audio track of the same
# source; the segment labels are attached to the audio track.
# NOTE(review): this config reads $video_url, whereas the two video configs
# above read $video -- confirm the task builder supplies a video_url key.
VIDEO_TIMELINE_SEGMENTATION_LABEL_CONFIG = """<View>
<Header value="视频时间线分割"/>
<Video name="video" value="$video_url" sync="audio"/>
<Labels name="tricks" toName="audio" choice="multiple">
<Label value="Intro" background="#358EF3"/>
<Label value="Content" background="#1BB500"/>
<Label value="Outro" background="#FFA91D"/>
</Labels>
<Audio name="audio" value="$video_url" sync="video" speed="false"/>
</View>"""
@dataclass(frozen=True)
class BuiltInTemplateDefinition:
    """Immutable description of one built-in annotation template.

    Instances are declared statically in this module and copied field by
    field into ``AnnotationTemplate`` rows when the templates are seeded.

    Attributes:
        id: Identifier of the template; used as the database row id.
        name: Display name of the template.
        description: Free-text description of the template.
        data_type: Kind of data annotated (one of the ``DATA_TYPE_*`` values).
        labeling_type: Slug naming the annotation task,
            e.g. ``"image-classification"``.
        label_config: Label Studio XML configuration of the labeling UI.
        style: Layout style of the template, e.g. ``"horizontal"``.
        category: Category the template is filed under
            (one of the ``CATEGORY_*`` values).
        version: Version string of the template definition.
    """

    id: str
    name: str
    description: str
    data_type: str
    labeling_type: str
    label_config: str
    style: str
    category: str
    version: str
# Static catalogue of the built-in annotation templates, grouped by data type
# (image, audio, video). Every entry uses STYLE_HORIZONTAL and VERSION_DEFAULT.
BUILT_IN_TEMPLATES: List[BuiltInTemplateDefinition] = [
    # ----- Image templates (category: computer-vision) -----
    BuiltInTemplateDefinition(
        id="tpl-image-classification-001",
        name="图像分类",
        description=(
            "对图像进行分类,适用于内容审核、安全检测、社交媒体审核等场景。"
            "关联模型:ResNet、EfficientNet、Vision Transformer"
        ),
        data_type=DATA_TYPE_IMAGE,
        labeling_type="image-classification",
        label_config=IMAGE_CLASSIFICATION_LABEL_CONFIG,
        style=STYLE_HORIZONTAL,
        category=CATEGORY_COMPUTER_VISION,
        version=VERSION_DEFAULT,
    ),
    BuiltInTemplateDefinition(
        id="tpl-object-detection-001",
        name="目标检测(边界框)",
        description=(
            "在目标周围绘制边界框,适用于自动驾驶、交通监控、安防监控、零售分析等场景。"
            "关联模型:YOLO、R-CNN、SSD"
        ),
        data_type=DATA_TYPE_IMAGE,
        labeling_type="object-detection",
        label_config=OBJECT_DETECTION_LABEL_CONFIG,
        style=STYLE_HORIZONTAL,
        category=CATEGORY_COMPUTER_VISION,
        version=VERSION_DEFAULT,
    ),
    BuiltInTemplateDefinition(
        id="tpl-semantic-segmentation-mask-001",
        name="语义分割(掩码)",
        description=(
            "使用画笔工具在目标周围绘制掩码,适用于自动驾驶、医学图像分析、卫星图像分析等场景。"
            "关联模型:U-Net、DeepLab、Mask R-CNN"
        ),
        data_type=DATA_TYPE_IMAGE,
        labeling_type="semantic-segmentation-mask",
        label_config=SEMANTIC_SEGMENTATION_MASK_LABEL_CONFIG,
        style=STYLE_HORIZONTAL,
        category=CATEGORY_COMPUTER_VISION,
        version=VERSION_DEFAULT,
    ),
    BuiltInTemplateDefinition(
        id="tpl-semantic-segmentation-polygon-001",
        name="语义分割(多边形)",
        description=(
            "在目标周围绘制多边形,适用于自动驾驶、医学图像、卫星图像、精准农业等场景。"
            "关联模型:DeepLab、PSPNet、U-Net"
        ),
        data_type=DATA_TYPE_IMAGE,
        labeling_type="semantic-segmentation-polygon",
        label_config=SEMANTIC_SEGMENTATION_POLYGON_LABEL_CONFIG,
        style=STYLE_HORIZONTAL,
        category=CATEGORY_COMPUTER_VISION,
        version=VERSION_DEFAULT,
    ),
    # ----- Audio templates (category: audio-speech) -----
    BuiltInTemplateDefinition(
        id="tpl-asr-segments-001",
        name="语音识别(分段)",
        description=(
            "对音频进行语音活动分段并转录文本,适用于呼叫中心转录、会议记录、播客转录等场景。"
            "关联模型:Whisper、Wav2Vec2、DeepSpeech"
        ),
        data_type=DATA_TYPE_AUDIO,
        labeling_type="asr-segments",
        label_config=ASR_SEGMENTS_LABEL_CONFIG,
        style=STYLE_HORIZONTAL,
        category=CATEGORY_AUDIO_SPEECH,
        version=VERSION_DEFAULT,
    ),
    BuiltInTemplateDefinition(
        id="tpl-asr-001",
        name="语音识别",
        description=(
            "转录音频内容,适用于播客转录、会议记录、客服通话、字幕生成等场景。"
            "关联模型:Whisper、Wav2Vec、DeepSpeech"
        ),
        data_type=DATA_TYPE_AUDIO,
        labeling_type="asr",
        label_config=ASR_LABEL_CONFIG,
        style=STYLE_HORIZONTAL,
        category=CATEGORY_AUDIO_SPEECH,
        version=VERSION_DEFAULT,
    ),
    BuiltInTemplateDefinition(
        id="tpl-conversation-analysis-001",
        name="对话分析",
        description="分析对话语句并标注事实和情感方面,适用于呼叫中心质检、客服分析、会议分析等场景",
        data_type=DATA_TYPE_AUDIO,
        labeling_type="conversation-analysis",
        label_config=CONVERSATION_ANALYSIS_LABEL_CONFIG,
        style=STYLE_HORIZONTAL,
        category=CATEGORY_AUDIO_SPEECH,
        version=VERSION_DEFAULT,
    ),
    BuiltInTemplateDefinition(
        id="tpl-intent-classification-001",
        name="意图分类",
        description="进行语音活动分段并选择语音意图,适用于语音助手、智能音箱、IVR系统等场景",
        data_type=DATA_TYPE_AUDIO,
        labeling_type="intent-classification",
        label_config=INTENT_CLASSIFICATION_LABEL_CONFIG,
        style=STYLE_HORIZONTAL,
        category=CATEGORY_AUDIO_SPEECH,
        version=VERSION_DEFAULT,
    ),
    BuiltInTemplateDefinition(
        id="tpl-signal-quality-001",
        name="信号质量检测",
        description="评估音频信号质量,适用于电信、呼叫中心质检、音频制作、VoIP质量评估等场景",
        data_type=DATA_TYPE_AUDIO,
        labeling_type="signal-quality",
        label_config=SIGNAL_QUALITY_LABEL_CONFIG,
        style=STYLE_HORIZONTAL,
        category=CATEGORY_AUDIO_SPEECH,
        version=VERSION_DEFAULT,
    ),
    BuiltInTemplateDefinition(
        id="tpl-sound-event-001",
        name="声音事件检测",
        description="选择音频片段并分类声音事件,适用于安防监控、智慧城市、环境监测、工业监测等场景",
        data_type=DATA_TYPE_AUDIO,
        labeling_type="sound-event-detection",
        label_config=SOUND_EVENT_DETECTION_LABEL_CONFIG,
        style=STYLE_HORIZONTAL,
        category=CATEGORY_AUDIO_SPEECH,
        version=VERSION_DEFAULT,
    ),
    BuiltInTemplateDefinition(
        id="tpl-speaker-segmentation-001",
        name="说话人分割",
        description="执行说话人分割/话者分离任务,适用于会议转录、播客制作、呼叫中心分析等场景",
        data_type=DATA_TYPE_AUDIO,
        labeling_type="speaker-segmentation",
        label_config=SPEAKER_SEGMENTATION_LABEL_CONFIG,
        style=STYLE_HORIZONTAL,
        category=CATEGORY_AUDIO_SPEECH,
        version=VERSION_DEFAULT,
    ),
    # ----- Video templates (category: video) -----
    BuiltInTemplateDefinition(
        id="tpl-video-classification-001",
        name="视频分类",
        description="对视频进行整体分类,适用于内容审核、媒体分析、质检等场景",
        data_type=DATA_TYPE_VIDEO,
        labeling_type="video-classification",
        label_config=VIDEO_CLASSIFICATION_LABEL_CONFIG,
        style=STYLE_HORIZONTAL,
        category=CATEGORY_VIDEO,
        version=VERSION_DEFAULT,
    ),
    BuiltInTemplateDefinition(
        id="tpl-video-object-tracking-001",
        name="视频目标追踪",
        description="在视频中追踪目标对象,适用于安防监控、交通分析、行为分析等场景",
        data_type=DATA_TYPE_VIDEO,
        labeling_type="video-object-tracking",
        label_config=VIDEO_OBJECT_TRACKING_LABEL_CONFIG,
        style=STYLE_HORIZONTAL,
        category=CATEGORY_VIDEO,
        version=VERSION_DEFAULT,
    ),
    BuiltInTemplateDefinition(
        id="tpl-video-timeline-segmentation-001",
        name="视频时间线分割",
        description="对视频时间线进行分段标注,适用于视频剪辑、内容索引等场景",
        data_type=DATA_TYPE_VIDEO,
        labeling_type="video-timeline-segmentation",
        label_config=VIDEO_TIMELINE_SEGMENTATION_LABEL_CONFIG,
        style=STYLE_HORIZONTAL,
        category=CATEGORY_VIDEO,
        version=VERSION_DEFAULT,
    ),
]
# Import-time sanity check on the static catalogue: the template id is used as
# the database row id when seeding, so two definitions must never share one.
assert len(BUILT_IN_TEMPLATES) == len(
    {definition.id for definition in BUILT_IN_TEMPLATES}
), "内置模板ID不能重复"
async def ensure_builtin_annotation_templates(db: AsyncSession) -> int:
    """Insert every built-in annotation template that is missing from the database.

    Each entry of ``BUILT_IN_TEMPLATES`` is validated, looked up by id and,
    when absent, inserted and committed on its own. Because every insert has
    its own commit, a failure on one template does not undo templates that
    were committed earlier in the same call.

    Args:
        db: Async SQLAlchemy session used for the lookups and the inserts.

    Returns:
        The number of templates inserted by this call.

    Raises:
        ValueError: If a built-in ``label_config`` is empty after stripping,
            or is rejected by ``LabelStudioConfigValidator.validate_xml``.
        Exception: Any commit failure other than ``IntegrityError`` is rolled
            back, logged and re-raised unchanged.
    """
    inserted = 0
    for template in BUILT_IN_TEMPLATES:
        label_config = template.label_config.strip()
        # Raise explicitly instead of using ``assert``: assertions are removed
        # under ``python -O``, which would silently disable this check.
        if not label_config:
            raise ValueError(f"内置模板 {template.id} 的 label_config 不能为空")

        valid, error = LabelStudioConfigValidator.validate_xml(label_config)
        if not valid:
            raise ValueError(f"内置模板 {template.id} 的 label_config 无效: {error}")

        # Only existence matters here, so fetch the primary key alone rather
        # than loading the whole row.
        result = await db.execute(
            select(AnnotationTemplate.id).where(AnnotationTemplate.id == template.id)
        )
        if result.scalar_one_or_none() is not None:
            continue

        record = AnnotationTemplate(
            id=template.id,
            name=template.name,
            description=template.description,
            data_type=template.data_type,
            labeling_type=template.labeling_type,
            configuration=None,
            label_config=label_config,
            style=template.style,
            category=template.category,
            built_in=True,
            version=template.version,
            created_at=datetime.now(),
        )
        db.add(record)
        try:
            await db.commit()
        except IntegrityError:
            # Presumably the row was inserted by someone else between the
            # lookup and the commit; treat it as already present and move on.
            await db.rollback()
            logger.warning(f"内置模板已存在,跳过插入: {template.id}")
        except Exception:
            await db.rollback()
            logger.exception(f"写入内置模板失败: {template.id}")
            raise
        else:
            inserted += 1
    return inserted