You've already forked DataMate
- 新增 getEditorTaskSegmentsUsingGet 接口用于获取任务分段信息 - 移除 SegmentInfo 中的 text、start、end 字段,精简数据结构 - 添加 EditorTaskSegmentsResponse 类型定义用于分段摘要响应 - 实现服务端 get_task_segments 方法,支持分段信息查询 - 重构前端组件缓存机制,使用 segmentSummaryFileRef 管理分段状态 - 优化分段构建逻辑,提取 _build_segment_contexts 公共方法 - 调整后端 _build_text_task 方法中的分段处理流程 - 更新 API 类型定义,统一 RequestParams 和 RequestPayload 类型
148 lines
5.9 KiB
Python
148 lines
5.9 KiB
Python
"""
|
|
标注编辑器(Label Studio Editor)接口模型
|
|
|
|
设计目标:
|
|
- 单人单份最终标签:每个 project_id + file_id 只维护 1 条最终标注结果
|
|
- 完全兼容 Label Studio:标注结果以 annotation 原始 JSON 形式存储与返回
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from datetime import datetime
|
|
from enum import Enum
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from pydantic import BaseModel, Field, ConfigDict
|
|
|
|
from app.db.models.annotation_management import (
|
|
ANNOTATION_STATUS_ANNOTATED,
|
|
ANNOTATION_STATUS_IN_PROGRESS,
|
|
ANNOTATION_STATUS_NO_ANNOTATION,
|
|
ANNOTATION_STATUS_NOT_APPLICABLE,
|
|
)
|
|
|
|
|
|
class AnnotationStatus(str, Enum):
    """Annotation status enumeration.

    Each member's value is the matching ``ANNOTATION_STATUS_*`` constant
    imported from ``app.db.models.annotation_management``. Mixing in ``str``
    makes every member an instance of ``str`` as well as of the enum.
    """

    ANNOTATED = ANNOTATION_STATUS_ANNOTATED
    IN_PROGRESS = ANNOTATION_STATUS_IN_PROGRESS
    NO_ANNOTATION = ANNOTATION_STATUS_NO_ANNOTATION
    NOT_APPLICABLE = ANNOTATION_STATUS_NOT_APPLICABLE
|
|
|
|
|
|
class EditorProjectInfo(BaseModel):
    """Editor project metadata.

    Fields are declared in snake_case and exposed under camelCase aliases.
    Only ``project_id``, ``dataset_id`` and ``supported`` are required; the
    remaining fields default to ``None``.
    """

    project_id: str = Field(
        ...,
        alias="projectId",
        description="DataMate 标注项目ID(t_dm_labeling_projects.id)",
    )
    dataset_id: str = Field(
        ...,
        alias="datasetId",
        description="数据集ID(t_dm_datasets.id)",
    )
    dataset_type: Optional[str] = Field(
        None,
        alias="datasetType",
        description="数据集类型(TEXT/IMAGE/AUDIO/VIDEO 等)",
    )
    template_id: Optional[str] = Field(
        None,
        alias="templateId",
        description="模板ID(t_dm_annotation_templates.id)",
    )
    label_config: Optional[str] = Field(
        None,
        alias="labelConfig",
        description="Label Studio XML 配置",
    )
    supported: bool = Field(..., description="当前数据类型是否支持内嵌编辑器")
    unsupported_reason: Optional[str] = Field(
        None,
        alias="unsupportedReason",
        description="不支持原因(当 supported=false)",
    )

    # Accept either the Python field name or the camelCase alias on input.
    model_config = ConfigDict(populate_by_name=True)
|
|
|
|
|
|
class EditorTaskListItem(BaseModel):
    """Editor task list entry (corresponds to one dataset file).

    ``file_id``, ``file_name`` and ``has_annotation`` are required; the
    remaining fields default to ``None``.
    """

    file_id: str = Field(..., alias="fileId", description="文件ID")
    file_name: str = Field(..., alias="fileName", description="文件名")
    file_type: Optional[str] = Field(None, alias="fileType", description="文件类型")
    has_annotation: bool = Field(
        ...,
        alias="hasAnnotation",
        description="是否已有最终标注",
    )
    annotation_updated_at: Optional[datetime] = Field(
        None,
        alias="annotationUpdatedAt",
        description="标注更新时间",
    )
    annotation_status: Optional[AnnotationStatus] = Field(
        None,
        alias="annotationStatus",
        description="标注状态",
    )

    # populate_by_name: accept the field name or the alias on input.
    # use_enum_values: store the enum's value rather than the enum member.
    model_config = ConfigDict(populate_by_name=True, use_enum_values=True)
|
|
|
|
|
|
class EditorTaskListResponse(BaseModel):
    """Paged editor task list response.

    All fields are required. Per its field description, ``page`` is
    zero-based.
    """

    content: List[EditorTaskListItem] = Field(..., description="任务列表")
    total_elements: int = Field(..., alias="totalElements", description="总条数")
    total_pages: int = Field(..., alias="totalPages", description="总页数")
    page: int = Field(..., description="页码(从0开始)")
    size: int = Field(..., description="每页大小")

    # Accept either the Python field name or the camelCase alias on input.
    model_config = ConfigDict(populate_by_name=True)
|
|
|
|
|
|
class SegmentInfo(BaseModel):
    """Segment summary (used for segmented text annotation).

    Carries only positional and status data for a segment: its index,
    whether it has an annotation, and the line/chunk indices described by
    the field descriptions. Only ``idx`` is required.
    """

    idx: int = Field(..., description="段落索引")
    has_annotation: bool = Field(
        False,
        alias="hasAnnotation",
        description="该段落是否已有标注",
    )
    line_index: int = Field(
        0,
        alias="lineIndex",
        description="JSONL 行索引(从0开始)",
    )
    chunk_index: int = Field(
        0,
        alias="chunkIndex",
        description="行内分片索引(从0开始)",
    )

    # Accept either the Python field name or the camelCase alias on input.
    model_config = ConfigDict(populate_by_name=True)
|
|
|
|
|
|
class EditorTaskResponse(BaseModel):
    """Editor task detail (a task object that can be fed directly to Label Studio Editor).

    Only ``task`` is required. The segmentation fields default to the
    non-segmented state (``segmented=False``, zero segments, index 0).
    """

    task: Dict[str, Any] = Field(..., description="Label Studio task 对象")
    annotation_updated_at: Optional[datetime] = Field(
        None,
        alias="annotationUpdatedAt",
        description="标注更新时间",
    )

    # Segmentation-related fields
    segmented: bool = Field(False, description="是否启用分段模式")
    total_segments: int = Field(0, alias="totalSegments", description="总段落数")
    current_segment_index: int = Field(
        0,
        alias="currentSegmentIndex",
        description="当前段落索引",
    )

    # Accept either the Python field name or the camelCase alias on input.
    model_config = ConfigDict(populate_by_name=True)
|
|
|
|
|
|
class EditorTaskSegmentsResponse(BaseModel):
    """Editor segment summary response.

    Every field has a default, so an instance built with no arguments
    describes a non-segmented task with an empty segment list.
    """

    segmented: bool = Field(False, description="是否启用分段模式")
    # default_factory gives each instance its own fresh list.
    segments: List[SegmentInfo] = Field(
        default_factory=list,
        description="段落摘要列表",
    )
    total_segments: int = Field(0, alias="totalSegments", description="总段落数")

    # Accept either the Python field name or the camelCase alias on input.
    model_config = ConfigDict(populate_by_name=True)
|
|
|
|
|
|
class UpsertAnnotationRequest(BaseModel):
    """Request to save/overwrite the final annotation (raw Label Studio annotation object).

    Only ``annotation`` is required. The optional fields carry the
    annotation status, an expected ``updated_at`` value that the field
    description calls an optimistic lock, and the segment index that the
    field description says is required in segmented mode. This model does
    not enforce either rule itself; it only declares the fields.
    """

    annotation: Dict[str, Any] = Field(
        ...,
        description="Label Studio annotation 对象(包含 result 等)",
    )
    annotation_status: Optional[AnnotationStatus] = Field(
        None,
        alias="annotationStatus",
        description="标注状态(无标注传 NO_ANNOTATION,不适用传 NOT_APPLICABLE,IN_PROGRESS 由后端维护)",
    )
    expected_updated_at: Optional[datetime] = Field(
        None,
        alias="expectedUpdatedAt",
        description="乐观锁:若提供则要求与当前记录 updated_at 一致,否则返回 409",
    )
    # Segment-level save support
    segment_index: Optional[int] = Field(
        None,
        alias="segmentIndex",
        description="段落索引(分段模式下必填)",
    )

    # populate_by_name: accept the field name or the alias on input.
    # use_enum_values: store the enum's value rather than the enum member.
    model_config = ConfigDict(populate_by_name=True, use_enum_values=True)
|
|
|
|
|
|
class UpsertAnnotationResponse(BaseModel):
    """Response for saving/overwriting the final annotation.

    Both fields are required: the annotation result ID and the annotation
    update timestamp.
    """

    annotation_id: str = Field(
        ...,
        alias="annotationId",
        description="标注结果ID(t_dm_annotation_results.id)",
    )
    updated_at: datetime = Field(..., alias="updatedAt", description="标注更新时间")

    # Accept either the Python field name or the camelCase alias on input.
    model_config = ConfigDict(populate_by_name=True)
|
|
|