Files
DataMate/runtime/datamate-python/app/module/annotation/schema/editor.py
Jerry Yan 719f54bf2e feat(annotation): 完善文件版本管理和标注同步功能
- 将 useNewVersionUsingPost 重命名为 applyNewVersionUsingPost
- 添加 fileVersionCheckSeqRef 避免版本检查竞态条件
- 移除 checkingFileVersion 状态变量的渲染依赖
- 在文件版本信息中添加 annotationVersionUnknown 字段
- 修复前端文件版本比较显示的 JSX 语法
- 添加历史标注缺少版本信息的提示显示
- 配置 Alembic 异步数据库迁移环境支持 aiomysql
- 添加文件版本未知状态的后端判断逻辑
- 实现标注清除时的段落注释清理功能
- 添加知识库同步钩子到版本更新流程
2026-02-05 23:22:49 +08:00

233 lines
8.0 KiB
Python

"""
标注编辑器(Label Studio Editor)接口模型
设计目标:
- 单人单份最终标签:每个 project_id + file_id 只维护 1 条最终标注结果
- 完全兼容 Label Studio:标注结果以 annotation 原始 JSON 形式存储与返回
"""
from __future__ import annotations
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Field, ConfigDict
from app.db.models.annotation_management import (
ANNOTATION_STATUS_ANNOTATED,
ANNOTATION_STATUS_IN_PROGRESS,
ANNOTATION_STATUS_NO_ANNOTATION,
ANNOTATION_STATUS_NOT_APPLICABLE,
)
class AnnotationStatus(str, Enum):
    """Annotation status values exposed by the editor API.

    Each member's value is the matching ``ANNOTATION_STATUS_*`` constant
    imported from ``app.db.models.annotation_management``. Mixing in ``str``
    makes every member a ``str`` instance as well as an enum member.
    """

    # Values come from the imported constants rather than being repeated here.
    ANNOTATED = ANNOTATION_STATUS_ANNOTATED
    IN_PROGRESS = ANNOTATION_STATUS_IN_PROGRESS
    NO_ANNOTATION = ANNOTATION_STATUS_NO_ANNOTATION
    NOT_APPLICABLE = ANNOTATION_STATUS_NOT_APPLICABLE
class EditorProjectInfo(BaseModel):
    """Editor project metadata.

    Describes the annotation project opened in the embedded editor: the
    project and dataset identifiers, the optional template / label
    configuration, and whether the embedded editor supports the data type.
    """

    # Allow population by the snake_case field name as well as the camelCase alias.
    model_config = ConfigDict(populate_by_name=True)

    # --- identifiers -------------------------------------------------------
    project_id: str = Field(
        ...,
        alias="projectId",
        description="DataMate 标注项目ID(t_dm_labeling_projects.id)",
    )
    dataset_id: str = Field(
        ...,
        alias="datasetId",
        description="数据集ID(t_dm_datasets.id)",
    )

    # --- optional descriptive fields ---------------------------------------
    dataset_type: Optional[str] = Field(
        default=None,
        alias="datasetType",
        description="数据集类型(TEXT/IMAGE/AUDIO/VIDEO 等)",
    )
    template_id: Optional[str] = Field(
        default=None,
        alias="templateId",
        description="模板ID(t_dm_annotation_templates.id)",
    )
    label_config: Optional[str] = Field(
        default=None,
        alias="labelConfig",
        description="Label Studio XML 配置",
    )

    # --- editor support ----------------------------------------------------
    supported: bool = Field(
        ...,
        description="当前数据类型是否支持内嵌编辑器",
    )
    unsupported_reason: Optional[str] = Field(
        default=None,
        alias="unsupportedReason",
        description="不支持原因(当 supported=false)",
    )
class EditorTaskListItem(BaseModel):
    """One entry of the editor task list (corresponds to one dataset file)."""

    # populate_by_name: accept field names as well as aliases on input.
    # use_enum_values: keep the enum's underlying value on the model rather
    # than the AnnotationStatus member itself.
    model_config = ConfigDict(populate_by_name=True, use_enum_values=True)

    # --- file identity -----------------------------------------------------
    file_id: str = Field(..., alias="fileId", description="文件ID")
    file_name: str = Field(..., alias="fileName", description="文件名")
    file_type: Optional[str] = Field(
        default=None,
        alias="fileType",
        description="文件类型",
    )

    # --- annotation state --------------------------------------------------
    has_annotation: bool = Field(
        ...,
        alias="hasAnnotation",
        description="是否已有最终标注",
    )
    annotation_updated_at: Optional[datetime] = Field(
        default=None,
        alias="annotationUpdatedAt",
        description="标注更新时间",
    )
    annotation_status: Optional[AnnotationStatus] = Field(
        default=None,
        alias="annotationStatus",
        description="标注状态",
    )
class EditorTaskListResponse(BaseModel):
    """Paged response for the editor task list."""

    # Allow population by field name in addition to the camelCase aliases.
    model_config = ConfigDict(populate_by_name=True)

    # Items of the current page.
    content: List[EditorTaskListItem] = Field(
        ...,
        description="任务列表",
    )

    # Totals across all pages.
    total_elements: int = Field(
        ...,
        alias="totalElements",
        description="总条数",
    )
    total_pages: int = Field(
        ...,
        alias="totalPages",
        description="总页数",
    )

    # Paging parameters; per the field description the page number is 0-based.
    page: int = Field(..., description="页码(从0开始)")
    size: int = Field(..., description="每页大小")
class SegmentInfo(BaseModel):
    """Segment summary (used for segmented text annotation).

    Only ``idx`` is required; the remaining fields default to ``False`` / ``0``.
    """

    # Allow population by field name in addition to the camelCase aliases.
    model_config = ConfigDict(populate_by_name=True)

    idx: int = Field(..., description="段落索引")
    has_annotation: bool = Field(
        default=False,
        alias="hasAnnotation",
        description="该段落是否已有标注",
    )
    # Position of the segment inside the source: JSONL line, then chunk within
    # that line (both 0-based per the field descriptions).
    line_index: int = Field(
        default=0,
        alias="lineIndex",
        description="JSONL 行索引(从0开始)",
    )
    chunk_index: int = Field(
        default=0,
        alias="chunkIndex",
        description="行内分片索引(从0开始)",
    )
class EditorTaskResponse(BaseModel):
    """Editor task detail (a task object that can be fed to Label Studio Editor)."""

    # Allow population by field name in addition to the camelCase aliases.
    model_config = ConfigDict(populate_by_name=True)

    # The task payload is kept as a free-form mapping; this model does not
    # validate its inner structure.
    task: Dict[str, Any] = Field(
        ...,
        description="Label Studio task 对象",
    )
    annotation_updated_at: Optional[datetime] = Field(
        default=None,
        alias="annotationUpdatedAt",
        description="标注更新时间",
    )

    # Segmentation-related fields; the defaults describe a non-segmented task.
    segmented: bool = Field(
        default=False,
        description="是否启用分段模式",
    )
    total_segments: int = Field(
        default=0,
        alias="totalSegments",
        description="总段落数",
    )
    current_segment_index: int = Field(
        default=0,
        alias="currentSegmentIndex",
        description="当前段落索引",
    )
class SegmentDetail(BaseModel):
    """Segment content: the segment's text plus its index and position fields."""

    # Allow population by field name in addition to the camelCase aliases.
    model_config = ConfigDict(populate_by_name=True)

    # Required: which segment this is and what it contains.
    idx: int = Field(..., description="段落索引")
    text: str = Field(..., description="段落文本")

    has_annotation: bool = Field(
        default=False,
        alias="hasAnnotation",
        description="该段落是否已有标注",
    )
    # Position of the segment inside the source: JSONL line, then chunk within
    # that line (both 0-based per the field descriptions).
    line_index: int = Field(
        default=0,
        alias="lineIndex",
        description="JSONL 行索引(从0开始)",
    )
    chunk_index: int = Field(
        default=0,
        alias="chunkIndex",
        description="行内分片索引(从0开始)",
    )
class EditorTaskSegmentResponse(BaseModel):
    """Response carrying the content of a single segment for the editor."""

    # Allow population by field name in addition to the camelCase aliases.
    model_config = ConfigDict(populate_by_name=True)

    segmented: bool = Field(
        default=False,
        description="是否启用分段模式",
    )
    # Optional: defaults to None when no segment content is attached.
    segment: Optional[SegmentDetail] = Field(
        default=None,
        description="段落内容",
    )
    total_segments: int = Field(
        default=0,
        alias="totalSegments",
        description="总段落数",
    )
    current_segment_index: int = Field(
        default=0,
        alias="currentSegmentIndex",
        description="当前段落索引",
    )
class UpsertAnnotationRequest(BaseModel):
    """Request to save/overwrite the final annotation.

    The annotation is carried as the raw Label Studio annotation object; only
    ``annotation`` is required, every other field defaults to ``None``.
    """

    # populate_by_name: accept field names as well as aliases on input.
    # use_enum_values: keep the enum's underlying value on the model rather
    # than the AnnotationStatus member itself.
    model_config = ConfigDict(populate_by_name=True, use_enum_values=True)

    # Free-form mapping; its inner structure is not validated by this model.
    annotation: Dict[str, Any] = Field(
        ...,
        description="Label Studio annotation 对象(包含 result 等)",
    )
    annotation_status: Optional[AnnotationStatus] = Field(
        default=None,
        alias="annotationStatus",
        description="标注状态(无标注传 NO_ANNOTATION,不适用传 NOT_APPLICABLE,IN_PROGRESS 由后端维护)",
    )
    # Optimistic-lock token: per the field description, a mismatch with the
    # stored updated_at is answered with HTTP 409 (enforced outside this model).
    expected_updated_at: Optional[datetime] = Field(
        default=None,
        alias="expectedUpdatedAt",
        description="乐观锁:若提供则要求与当前记录 updated_at 一致,否则返回 409",
    )

    # Support for saving a single segment. The description marks the field as
    # required in segmented mode, but the model itself leaves it optional.
    segment_index: Optional[int] = Field(
        default=None,
        alias="segmentIndex",
        description="段落索引(分段模式下必填)",
    )
class UpsertAnnotationResponse(BaseModel):
    """Response returned after saving/overwriting the final annotation."""

    # Allow population by field name in addition to the camelCase aliases.
    model_config = ConfigDict(populate_by_name=True)

    # Both fields are required.
    annotation_id: str = Field(
        ...,
        alias="annotationId",
        description="标注结果ID(t_dm_annotation_results.id)",
    )
    updated_at: datetime = Field(
        ...,
        alias="updatedAt",
        description="标注更新时间",
    )
class FileVersionCheckResponse(BaseModel):
    """Response of the file version check.

    Reports the file's current version next to the version recorded when the
    annotation was made, plus flags for "version unknown" and "new version
    available".
    """

    # Allow population by field name in addition to the camelCase aliases.
    model_config = ConfigDict(populate_by_name=True)

    file_id: str = Field(..., alias="fileId", description="文件ID")

    # --- versions ----------------------------------------------------------
    current_file_version: int = Field(
        ...,
        alias="currentFileVersion",
        description="当前文件版本",
    )
    annotation_file_version: Optional[int] = Field(
        default=None,
        alias="annotationFileVersion",
        description="标注时的文件版本",
    )

    # --- flags -------------------------------------------------------------
    # Per the description: set when the annotation-time file version is
    # missing (historical data).
    annotation_version_unknown: bool = Field(
        default=False,
        alias="annotationVersionUnknown",
        description="是否缺少标注时的文件版本(历史数据)",
    )
    has_new_version: bool = Field(
        ...,
        alias="hasNewVersion",
        description="是否有新版本",
    )
class UseNewVersionResponse(BaseModel):
    """Response of the "use new version" operation."""

    # Allow population by field name in addition to the camelCase aliases.
    model_config = ConfigDict(populate_by_name=True)

    file_id: str = Field(..., alias="fileId", description="文件ID")

    # Version the annotation was previously made against; defaults to None.
    previous_file_version: Optional[int] = Field(
        default=None,
        alias="previousFileVersion",
        description="之前标注的文件版本",
    )
    current_file_version: int = Field(
        ...,
        alias="currentFileVersion",
        description="当前文件版本",
    )

    # Message describing the outcome of the operation.
    message: str = Field(..., description="操作结果消息")