feat(annotation): 支持音频和视频数据类型的标注任务

- 添加了音频和视频数据类型常量定义
- 实现了音频和视频标注模板的内置配置
- 扩展前端组件以支持按数据类型过滤标注模板
- 重构后端编辑器服务以处理音频和视频任务构建
- 更新数据库初始化脚本包含音频和视频标注模板
- 添加音频和视频数据类型的预览URL映射逻辑
This commit is contained in:
2026-01-26 23:54:40 +08:00
parent 47295e8cdf
commit 977a930c97
5 changed files with 461 additions and 59 deletions

View File

@@ -13,7 +13,11 @@ from app.module.annotation.utils.config_validator import LabelStudioConfigValida
logger = get_logger(__name__)
# Data-type tags carried by built-in templates (used to filter templates per dataset type).
DATA_TYPE_IMAGE = "image"
DATA_TYPE_AUDIO = "audio"
DATA_TYPE_VIDEO = "video"
# Category identifiers used to group templates.
CATEGORY_COMPUTER_VISION = "computer-vision"
CATEGORY_AUDIO_SPEECH = "audio-speech"
CATEGORY_VIDEO = "video"
# Default layout style and template version string.
STYLE_HORIZONTAL = "horizontal"
VERSION_DEFAULT = "1.0.0"
@@ -51,6 +55,105 @@ SEMANTIC_SEGMENTATION_POLYGON_LABEL_CONFIG = """<View>
</PolygonLabels>
</View>"""
# ASR with segmentation: label Speech/Noise regions, then transcribe each region
# (TextArea is perRegion and required).
ASR_SEGMENTS_LABEL_CONFIG = """<View>
<Labels name=\"labels\" toName=\"audio\">
<Label value=\"Speech\" />
<Label value=\"Noise\" />
</Labels>
<Audio name=\"audio\" value=\"$audio\"/>
<TextArea name=\"transcription\" toName=\"audio\"
rows=\"2\" editable=\"true\"
perRegion=\"true\" required=\"true\" />
</View>"""
# Whole-clip ASR: a single transcription text area for the entire audio file.
ASR_LABEL_CONFIG = """<View>
<Audio name=\"audio\" value=\"$audio\" zoom=\"true\" hotkey=\"ctrl+enter\" />
<Header value=\"转录音频内容\" />
<TextArea name=\"transcription\" toName=\"audio\"
rows=\"4\" editable=\"true\" maxSubmissions=\"1\" />
</View>"""
# Conversation analysis: audio synced with a dialogue transcript ($text);
# sentiment labels are applied per paragraph.
CONVERSATION_ANALYSIS_LABEL_CONFIG = """<View>
<Audio name=\"audio\" value=\"$audio\" hotkey=\"space\" sync=\"text\"/>
<Header value=\"对话记录\"/>
<Paragraphs audioUrl=\"$audio\" sync=\"audio\" name=\"text\" value=\"$text\"
layout=\"dialogue\" textKey=\"text\" nameKey=\"author\"
granularity=\"paragraph\" contextscroll=\"true\" />
<View style=\"position: sticky\">
<Header value=\"情感标签\"/>
<ParagraphLabels name=\"label\" toName=\"text\">
<Label value=\"Positive\" background=\"#00ff00\"/>
<Label value=\"Negative\" background=\"#ff0000\"/>
</ParagraphLabels>
</View>
</View>"""
# Intent classification: segment the audio, then pick one intent choice per region.
INTENT_CLASSIFICATION_LABEL_CONFIG = """<View>
<Labels name=\"labels\" toName=\"audio\">
<Label value=\"Segment\" />
</Labels>
<Audio name=\"audio\" value=\"$audio\"/>
<Choices name=\"intent\" toName=\"audio\" perRegion=\"true\" required=\"true\">
<Choice value=\"Question\" />
<Choice value=\"Request\" />
<Choice value=\"Satisfied\" />
<Choice value=\"Interested\" />
<Choice value=\"Unsatisfied\" />
</Choices>
</View>"""
# Signal quality: a 1-10 star rating over the whole clip.
SIGNAL_QUALITY_LABEL_CONFIG = """<View>
<Rating name=\"rating\" toName=\"audio\" maxRating=\"10\" icon=\"star\" size=\"medium\" />
<Audio name=\"audio\" value=\"$audio\"/>
</View>"""
# Sound event detection: mark time regions and tag them with an event class.
SOUND_EVENT_DETECTION_LABEL_CONFIG = """<View>
<Labels name=\"label\" toName=\"audio\" zoom=\"true\" hotkey=\"ctrl+enter\">
<Label value=\"Event A\" background=\"red\"/>
<Label value=\"Event B\" background=\"green\"/>
</Labels>
<Audio name=\"audio\" value=\"$audio\"/>
</View>"""
# Speaker segmentation / diarization: label audio regions per speaker.
SPEAKER_SEGMENTATION_LABEL_CONFIG = """<View>
<Labels name=\"label\" toName=\"audio\" zoom=\"true\" hotkey=\"ctrl+enter\">
<Label value=\"Speaker one\" background=\"#00FF00\"/>
<Label value=\"Speaker two\" background=\"#12ad59\"/>
</Labels>
<Audio name=\"audio\" value=\"$audio\" />
</View>"""
# Whole-video classification with inline choices.
VIDEO_CLASSIFICATION_LABEL_CONFIG = """<View>
<Video name=\"video\" value=\"$video\"/>
<Choices name=\"choice\" toName=\"video\" showInline=\"true\">
<Choice value=\"Sports\" />
<Choice value=\"News\" />
<Choice value=\"Entertainment\" />
<Choice value=\"Education\" />
</Choices>
</View>"""
# Object tracking: labelled rectangles drawn on the video; config hard-codes 25 fps.
VIDEO_OBJECT_TRACKING_LABEL_CONFIG = """<View>
<Labels name=\"videoLabels\" toName=\"video\" allowEmpty=\"true\">
<Label value=\"Man\" background=\"blue\"/>
<Label value=\"Woman\" background=\"red\"/>
<Label value=\"Other\" background=\"green\"/>
</Labels>
<Video name=\"video\" value=\"$video\" framerate=\"25.0\"/>
<VideoRectangle name=\"box\" toName=\"video\" />
</View>"""
# Timeline segmentation: video and audio tags synced; labels attach to the audio timeline.
# NOTE(review): this template reads $video_url while the other video templates use $video —
# confirm the editor's task payload supplies a "video_url" key for this labeling type.
VIDEO_TIMELINE_SEGMENTATION_LABEL_CONFIG = """<View>
<Header value=\"视频时间线分割\"/>
<Video name=\"video\" value=\"$video_url\" sync=\"audio\"/>
<Labels name=\"tricks\" toName=\"audio\" choice=\"multiple\">
<Label value=\"Intro\" background=\"#358EF3\"/>
<Label value=\"Content\" background=\"#1BB500\"/>
<Label value=\"Outro\" background=\"#FFA91D\"/>
</Labels>
<Audio name=\"audio\" value=\"$video_url\" sync=\"video\" speed=\"false\"/>
</View>"""
@dataclass(frozen=True)
class BuiltInTemplateDefinition:
@@ -122,6 +225,122 @@ BUILT_IN_TEMPLATES: List[BuiltInTemplateDefinition] = [
category=CATEGORY_COMPUTER_VISION,
version=VERSION_DEFAULT,
),
# --- Audio templates (CATEGORY_AUDIO_SPEECH) ---
# ASR with VAD segmentation and per-region transcription.
BuiltInTemplateDefinition(
id="tpl-asr-segments-001",
name="语音识别(分段)",
description=(
"对音频进行语音活动分段并转录文本,适用于呼叫中心转录、会议记录、播客转录等场景。"
"关联模型:Whisper、Wav2Vec2、DeepSpeech"
),
data_type=DATA_TYPE_AUDIO,
labeling_type="asr-segments",
label_config=ASR_SEGMENTS_LABEL_CONFIG,
style=STYLE_HORIZONTAL,
category=CATEGORY_AUDIO_SPEECH,
version=VERSION_DEFAULT,
),
# Whole-clip ASR transcription.
BuiltInTemplateDefinition(
id="tpl-asr-001",
name="语音识别",
description=(
"转录音频内容,适用于播客转录、会议记录、客服通话、字幕生成等场景。"
"关联模型:Whisper、Wav2Vec、DeepSpeech"
),
data_type=DATA_TYPE_AUDIO,
labeling_type="asr",
label_config=ASR_LABEL_CONFIG,
style=STYLE_HORIZONTAL,
category=CATEGORY_AUDIO_SPEECH,
version=VERSION_DEFAULT,
),
# Dialogue analysis: synced transcript with per-paragraph sentiment labels.
BuiltInTemplateDefinition(
id="tpl-conversation-analysis-001",
name="对话分析",
description="分析对话语句并标注事实和情感方面,适用于呼叫中心质检、客服分析、会议分析等场景",
data_type=DATA_TYPE_AUDIO,
labeling_type="conversation-analysis",
label_config=CONVERSATION_ANALYSIS_LABEL_CONFIG,
style=STYLE_HORIZONTAL,
category=CATEGORY_AUDIO_SPEECH,
version=VERSION_DEFAULT,
),
# Per-region speech intent classification.
BuiltInTemplateDefinition(
id="tpl-intent-classification-001",
name="意图分类",
description="进行语音活动分段并选择语音意图,适用于语音助手、智能音箱、IVR系统等场景",
data_type=DATA_TYPE_AUDIO,
labeling_type="intent-classification",
label_config=INTENT_CLASSIFICATION_LABEL_CONFIG,
style=STYLE_HORIZONTAL,
category=CATEGORY_AUDIO_SPEECH,
version=VERSION_DEFAULT,
),
# Star-rating of audio signal quality.
BuiltInTemplateDefinition(
id="tpl-signal-quality-001",
name="信号质量检测",
description="评估音频信号质量,适用于电信、呼叫中心质检、音频制作、VoIP质量评估等场景",
data_type=DATA_TYPE_AUDIO,
labeling_type="signal-quality",
label_config=SIGNAL_QUALITY_LABEL_CONFIG,
style=STYLE_HORIZONTAL,
category=CATEGORY_AUDIO_SPEECH,
version=VERSION_DEFAULT,
),
# Region-based sound event detection.
BuiltInTemplateDefinition(
id="tpl-sound-event-001",
name="声音事件检测",
description="选择音频片段并分类声音事件,适用于安防监控、智慧城市、环境监测、工业监测等场景",
data_type=DATA_TYPE_AUDIO,
labeling_type="sound-event-detection",
label_config=SOUND_EVENT_DETECTION_LABEL_CONFIG,
style=STYLE_HORIZONTAL,
category=CATEGORY_AUDIO_SPEECH,
version=VERSION_DEFAULT,
),
# Speaker segmentation / diarization.
BuiltInTemplateDefinition(
id="tpl-speaker-segmentation-001",
name="说话人分割",
description="执行说话人分割/话者分离任务,适用于会议转录、播客制作、呼叫中心分析等场景",
data_type=DATA_TYPE_AUDIO,
labeling_type="speaker-segmentation",
label_config=SPEAKER_SEGMENTATION_LABEL_CONFIG,
style=STYLE_HORIZONTAL,
category=CATEGORY_AUDIO_SPEECH,
version=VERSION_DEFAULT,
),
# --- Video templates (CATEGORY_VIDEO) ---
# Whole-video classification.
BuiltInTemplateDefinition(
id="tpl-video-classification-001",
name="视频分类",
description="对视频进行整体分类,适用于内容审核、媒体分析、质检等场景",
data_type=DATA_TYPE_VIDEO,
labeling_type="video-classification",
label_config=VIDEO_CLASSIFICATION_LABEL_CONFIG,
style=STYLE_HORIZONTAL,
category=CATEGORY_VIDEO,
version=VERSION_DEFAULT,
),
# Video object tracking with labelled rectangles.
BuiltInTemplateDefinition(
id="tpl-video-object-tracking-001",
name="视频目标追踪",
description="在视频中追踪目标对象,适用于安防监控、交通分析、行为分析等场景",
data_type=DATA_TYPE_VIDEO,
labeling_type="video-object-tracking",
label_config=VIDEO_OBJECT_TRACKING_LABEL_CONFIG,
style=STYLE_HORIZONTAL,
category=CATEGORY_VIDEO,
version=VERSION_DEFAULT,
),
# Timeline segmentation of a video (synced audio track).
BuiltInTemplateDefinition(
id="tpl-video-timeline-segmentation-001",
name="视频时间线分割",
description="对视频时间线进行分段标注,适用于视频剪辑、内容索引等场景",
data_type=DATA_TYPE_VIDEO,
labeling_type="video-timeline-segmentation",
label_config=VIDEO_TIMELINE_SEGMENTATION_LABEL_CONFIG,
style=STYLE_HORIZONTAL,
category=CATEGORY_VIDEO,
version=VERSION_DEFAULT,
),
]
assert len({template.id for template in BUILT_IN_TEMPLATES}) == len(BUILT_IN_TEMPLATES), (

View File

@@ -43,6 +43,8 @@ logger = get_logger(__name__)
# Task-payload keys for each supported media type.
TEXT_DATA_KEY = "text"
IMAGE_DATA_KEY = "image"
AUDIO_DATA_KEY = "audio"
VIDEO_DATA_KEY = "video"
# Metadata keys attached to editor task payloads.
DATASET_ID_KEY = "dataset_id"
FILE_ID_KEY = "file_id"
FILE_NAME_KEY = "file_name"
SEGMENT_INDEX_KEY = "segment_index"
SEGMENT_INDEX_CAMEL_KEY = "segmentIndex"
JSONL_EXTENSION = ".jsonl"
# Label Studio object-tag categories used when extracting value keys from a label config.
TEXTUAL_OBJECT_CATEGORIES = {"text", "document"}
IMAGE_OBJECT_CATEGORIES = {"image"}
MEDIA_OBJECT_CATEGORIES = {"media"}
OBJECT_NAME_HEADER_PREFIX = "dm_object_header_"
# Dataset types the embedded editor can build tasks for.
DATASET_TYPE_TEXT = "TEXT"
DATASET_TYPE_IMAGE = "IMAGE"
DATASET_TYPE_AUDIO = "AUDIO"
DATASET_TYPE_VIDEO = "VIDEO"
SUPPORTED_EDITOR_DATASET_TYPES = (
    DATASET_TYPE_TEXT,
    DATASET_TYPE_IMAGE,
    DATASET_TYPE_AUDIO,
    DATASET_TYPE_VIDEO,
)
SEGMENTATION_ENABLED_KEY = "segmentation_enabled"
@@ -174,21 +186,19 @@ class AnnotationEditorService:
return keys[0]
@classmethod
def _resolve_primary_media_key(
def _resolve_media_value_keys(
cls,
label_config: Optional[str],
default_key: str,
categories: Optional[set[str]] = None,
) -> str:
) -> List[str]:
if not label_config:
return default_key
return [default_key]
target_categories = categories or set()
keys = cls._extract_object_value_keys_by_category(label_config, target_categories)
if not keys:
return default_key
if default_key in keys:
return default_key
return keys[0]
return [default_key]
return keys
@staticmethod
def _try_parse_json_payload(text_content: str) -> Optional[Dict[str, Any]]:
@@ -467,7 +477,10 @@ class AnnotationEditorService:
dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id))
if dataset_type not in SUPPORTED_EDITOR_DATASET_TYPES:
raise HTTPException(status_code=400, detail="当前仅支持 TEXT/IMAGE 项目的内嵌编辑器")
raise HTTPException(
status_code=400,
detail="当前仅支持 TEXT/IMAGE/AUDIO/VIDEO 项目的内嵌编辑器",
)
file_result = await self.db.execute(
select(DatasetFiles).where(
@@ -479,9 +492,15 @@ class AnnotationEditorService:
if not file_record:
raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}")
if dataset_type == "IMAGE":
if dataset_type == DATASET_TYPE_IMAGE:
return await self._build_image_task(project, file_record, file_id)
if dataset_type == DATASET_TYPE_AUDIO:
return await self._build_audio_task(project, file_record, file_id)
if dataset_type == DATASET_TYPE_VIDEO:
return await self._build_video_task(project, file_record, file_id)
return await self._build_text_task(project, file_record, file_id, segment_index)
async def _build_text_task(
@@ -668,23 +687,20 @@ class AnnotationEditorService:
currentSegmentIndex=current_segment_index,
)
async def _build_image_task(
async def _build_media_task(
self,
project: LabelingProject,
file_record: DatasetFiles,
file_id: str,
default_key: str,
categories: set[str],
) -> EditorTaskResponse:
label_config = await self._resolve_project_label_config(project)
image_key = self._resolve_primary_media_key(
label_config,
IMAGE_DATA_KEY,
MEDIA_OBJECT_CATEGORIES,
)
media_keys = self._resolve_media_value_keys(label_config, default_key, categories)
preview_url = self._build_file_preview_url(project.dataset_id, file_id)
file_name = str(getattr(file_record, "file_name", ""))
task_data: Dict[str, Any] = {
image_key: preview_url,
FILE_ID_KEY: file_id,
FILE_ID_CAMEL_KEY: file_id,
DATASET_ID_KEY: project.dataset_id,
@@ -692,6 +708,9 @@ class AnnotationEditorService:
FILE_NAME_KEY: file_name,
FILE_NAME_CAMEL_KEY: file_name,
}
for key in media_keys:
task_data[key] = preview_url
self._apply_text_placeholders(task_data, label_config)
# 获取现有标注
ann_result = await self.db.execute(
@@ -738,6 +757,48 @@ class AnnotationEditorService:
currentSegmentIndex=0,
)
async def _build_image_task(
    self,
    project: LabelingProject,
    file_record: DatasetFiles,
    file_id: str,
) -> EditorTaskResponse:
    """Build an embedded-editor task for an image file.

    Thin wrapper delegating to the shared media-task builder with the
    image data key and image object-tag categories.
    """
    return await self._build_media_task(
        project,
        file_record,
        file_id,
        IMAGE_DATA_KEY,
        IMAGE_OBJECT_CATEGORIES,
    )
async def _build_audio_task(
    self,
    project: LabelingProject,
    file_record: DatasetFiles,
    file_id: str,
) -> EditorTaskResponse:
    """Build an embedded-editor task for an audio file.

    Thin wrapper delegating to the shared media-task builder with the
    audio data key and media object-tag categories.
    """
    return await self._build_media_task(
        project,
        file_record,
        file_id,
        AUDIO_DATA_KEY,
        MEDIA_OBJECT_CATEGORIES,
    )
async def _build_video_task(
    self,
    project: LabelingProject,
    file_record: DatasetFiles,
    file_id: str,
) -> EditorTaskResponse:
    """Build an embedded-editor task for a video file.

    Thin wrapper delegating to the shared media-task builder with the
    video data key and media object-tag categories.
    """
    return await self._build_media_task(
        project,
        file_record,
        file_id,
        VIDEO_DATA_KEY,
        MEDIA_OBJECT_CATEGORIES,
    )
async def upsert_annotation(self, project_id: str, file_id: str, request: UpsertAnnotationRequest) -> UpsertAnnotationResponse:
project = await self._get_project_or_404(project_id)