feat(annotation): 添加分段标注统计和进度跟踪功能

- 新增 SegmentStats 类型定义用于分段统计
- 实现分段标注进度计算和缓存机制
- 添加标注任务状态判断逻辑支持分段模式
- 集成分段统计数据显示到任务列表界面
- 实现分段总数自动计算和验证功能
- 扩展标注状态枚举支持进行中标注状态
- 优化任务选择逻辑基于分段完成状态
- 添加分段统计数据预加载和同步机制
This commit is contained in:
2026-01-31 15:42:04 +08:00
parent 3e0a15ac8e
commit 33cf65c9f8
5 changed files with 265 additions and 31 deletions

View File

@@ -9,10 +9,17 @@ from app.db.session import Base
# String values used for the annotation status of a result.
ANNOTATION_STATUS_ANNOTATED = "ANNOTATED"
ANNOTATION_STATUS_NO_ANNOTATION = "NO_ANNOTATION"
ANNOTATION_STATUS_NOT_APPLICABLE = "NOT_APPLICABLE"
ANNOTATION_STATUS_IN_PROGRESS = "IN_PROGRESS"

# Subset of statuses without IN_PROGRESS (the name suggests these are the
# values a client request may carry).
ANNOTATION_STATUS_CLIENT_VALUES = {
    ANNOTATION_STATUS_ANNOTATED,
    ANNOTATION_STATUS_NO_ANNOTATION,
    ANNOTATION_STATUS_NOT_APPLICABLE,
}

# Every known status: the client subset plus IN_PROGRESS. The union builds a
# new set object, so the two sets stay independent of each other.
ANNOTATION_STATUS_VALUES = ANNOTATION_STATUS_CLIENT_VALUES | {
    ANNOTATION_STATUS_IN_PROGRESS,
}
class AnnotationTemplate(Base):
@@ -101,7 +108,7 @@ class AnnotationResult(Base):
String(32),
nullable=False,
default=ANNOTATION_STATUS_ANNOTATED,
comment="标注状态: ANNOTATED/NO_ANNOTATION/NOT_APPLICABLE",
comment="标注状态: ANNOTATED/NO_ANNOTATION/NOT_APPLICABLE/IN_PROGRESS",
)
created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间")

View File

@@ -16,6 +16,7 @@ from pydantic import BaseModel, Field, ConfigDict
from app.db.models.annotation_management import (
ANNOTATION_STATUS_ANNOTATED,
ANNOTATION_STATUS_IN_PROGRESS,
ANNOTATION_STATUS_NO_ANNOTATION,
ANNOTATION_STATUS_NOT_APPLICABLE,
)
@@ -25,6 +26,7 @@ class AnnotationStatus(str, Enum):
"""标注状态枚举"""
ANNOTATED = ANNOTATION_STATUS_ANNOTATED
IN_PROGRESS = ANNOTATION_STATUS_IN_PROGRESS
NO_ANNOTATION = ANNOTATION_STATUS_NO_ANNOTATION
NOT_APPLICABLE = ANNOTATION_STATUS_NOT_APPLICABLE
@@ -112,7 +114,7 @@ class UpsertAnnotationRequest(BaseModel):
annotation_status: Optional[AnnotationStatus] = Field(
None,
alias="annotationStatus",
description="标注状态(无标注传 NO_ANNOTATION,不适用传 NOT_APPLICABLE)",
description="标注状态(无标注传 NO_ANNOTATION,不适用传 NOT_APPLICABLE,IN_PROGRESS 由后端维护",
)
expected_updated_at: Optional[datetime] = Field(
None,

View File

@@ -26,9 +26,10 @@ from app.core.logging import get_logger
from app.db.models import AnnotationResult, Dataset, DatasetFiles, LabelingProject, LabelingProjectFile
from app.db.models.annotation_management import (
ANNOTATION_STATUS_ANNOTATED,
ANNOTATION_STATUS_IN_PROGRESS,
ANNOTATION_STATUS_CLIENT_VALUES,
ANNOTATION_STATUS_NO_ANNOTATION,
ANNOTATION_STATUS_NOT_APPLICABLE,
ANNOTATION_STATUS_VALUES,
)
from app.module.annotation.config import LabelStudioTagConfig
from app.module.annotation.schema.editor import (
@@ -61,6 +62,7 @@ SEGMENT_INDEX_KEY = "segment_index"
# Dictionary keys for segment-related fields of an annotation payload.
# NOTE(review): the full payload layout is not visible in this view — confirm
# against the code that reads and writes these keys.
SEGMENT_INDEX_CAMEL_KEY = "segmentIndex"
SEGMENTED_KEY = "segmented"
SEGMENTS_KEY = "segments"
# Payload key under which the total number of segments of a file is stored.
SEGMENT_TOTAL_KEY = "total_segments"
# Payload key whose value is checked for a non-empty list of results.
SEGMENT_RESULT_KEY = "result"
SEGMENT_CREATED_AT_KEY = "created_at"
SEGMENT_UPDATED_AT_KEY = "updated_at"
@@ -416,6 +418,76 @@ class AnnotationEditorService:
result = payload.get(SEGMENT_RESULT_KEY)
return isinstance(result, list) and len(result) > 0
@staticmethod
def _resolve_segment_total(payload: Optional[Dict[str, Any]]) -> Optional[int]:
    """Read a positive segment count from ``payload[SEGMENT_TOTAL_KEY]``.

    The stored value may be an int, an integral float, or a string made only
    of decimal digits.

    Args:
        payload: Annotation payload dictionary, or ``None``.

    Returns:
        The segment count when it is a strictly positive whole number;
        ``None`` when the payload is empty or not a dict, the key is absent,
        or the value is not a positive whole number.
    """
    if not payload or not isinstance(payload, dict):
        return None
    value = payload.get(SEGMENT_TOTAL_KEY)
    # bool is a subclass of int; a stored ``True`` must not count as 1 segment.
    if isinstance(value, bool):
        return None
    if isinstance(value, float):
        # inf and nan report is_integer() == False and are rejected here.
        if not value.is_integer():
            return None
        value = int(value)
    elif isinstance(value, str):
        # isdecimal() only admits characters int() can parse; isdigit() also
        # accepts e.g. superscript digits, on which int() raises ValueError.
        if not value.isdecimal():
            return None
        value = int(value)
    if isinstance(value, int) and value > 0:
        return value
    return None
async def _compute_segment_total(
self,
project: LabelingProject,
file_record: DatasetFiles,
file_id: str,
) -> Optional[int]:
"""Compute how many annotation segments the given file is split into.

Returns None when the project's dataset type is not text, when
segmentation is not enabled for the project, when the fetched content is
not a string, or when the content does not need segmentation (a single
record whose text does not exceed SEGMENT_THRESHOLD). Otherwise returns
the number of segments summed over all records.
"""
# Only text datasets with segmentation enabled are counted.
dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id))
if dataset_type != DATASET_TYPE_TEXT:
return None
if not self._resolve_segmentation_enabled(project):
return None
text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)
if not isinstance(text_content, str):
return None
label_config = await self._resolve_project_label_config(project)
primary_text_key = self._resolve_primary_text_key(label_config)
# A missing file_name attribute is treated as an empty name.
file_name = str(getattr(file_record, "file_name", "")).lower()
# Each record is a (parsed payload or None, raw text) pair.
records: List[Tuple[Optional[Dict[str, Any]], str]] = []
if file_name.endswith(JSONL_EXTENSION):
records = self._parse_jsonl_records(text_content)
else:
parsed_payload = self._try_parse_json_payload(text_content)
if parsed_payload:
records = [(parsed_payload, text_content)]
# Fall back to treating the whole content as one record without a payload.
# NOTE(review): indentation is not visible in this view; this fallback
# presumably applies after both branches above — confirm.
if not records:
records = [(None, text_content)]
record_texts = [
self._resolve_primary_text_value(payload, raw_text, primary_text_key)
for payload, raw_text in records
]
# NOTE(review): looks like a defensive fallback; it cannot trigger if the
# records fallback above always runs — confirm.
if not record_texts:
record_texts = [text_content]
# Segmentation applies when there are several records or any text is
# longer than the threshold.
needs_segmentation = len(records) > 1 or any(
len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts
)
if not needs_segmentation:
return None
splitter = AnnotationTextSplitter(max_chars=self.SEGMENT_THRESHOLD)
total_segments = 0
for record_text in record_texts:
normalized_text = record_text or ""
if len(normalized_text) > self.SEGMENT_THRESHOLD:
raw_segments = splitter.split(normalized_text)
# An empty split result still counts as one segment.
total_segments += len(raw_segments) if raw_segments else 1
else:
# Text within the threshold counts as a single segment.
total_segments += 1
return total_segments if total_segments > 0 else 1
@classmethod
def _build_source_document_filter(cls):
file_type_lower = func.lower(DatasetFiles.file_type)
@@ -946,19 +1018,36 @@ class AnnotationEditorService:
final_payload = annotation_payload
requested_status = request.annotation_status
if requested_status is not None and requested_status not in ANNOTATION_STATUS_VALUES:
if requested_status is not None and requested_status not in ANNOTATION_STATUS_CLIENT_VALUES:
raise HTTPException(status_code=400, detail="annotationStatus 不合法")
has_result = self._has_annotation_result(final_payload)
if has_result:
final_status = ANNOTATION_STATUS_ANNOTATED
segment_total = None
segment_done = None
if request.segment_index is not None:
segment_total = self._resolve_segment_total(final_payload)
if segment_total is None:
segment_total = await self._compute_segment_total(project, file_record, file_id)
if segment_total and segment_total > 0:
final_payload[SEGMENT_TOTAL_KEY] = segment_total
segment_done = len(self._extract_segment_annotations(final_payload))
if (
segment_total is not None
and segment_done is not None
and segment_done < segment_total
):
final_status = ANNOTATION_STATUS_IN_PROGRESS
else:
if requested_status == ANNOTATION_STATUS_NO_ANNOTATION:
final_status = ANNOTATION_STATUS_NO_ANNOTATION
elif requested_status == ANNOTATION_STATUS_NOT_APPLICABLE:
final_status = ANNOTATION_STATUS_NOT_APPLICABLE
has_result = self._has_annotation_result(final_payload)
if has_result:
final_status = ANNOTATION_STATUS_ANNOTATED
else:
raise HTTPException(status_code=400, detail="未发现标注内容,请确认无标注/不适用后再保存")
if requested_status == ANNOTATION_STATUS_NO_ANNOTATION:
final_status = ANNOTATION_STATUS_NO_ANNOTATION
elif requested_status == ANNOTATION_STATUS_NOT_APPLICABLE:
final_status = ANNOTATION_STATUS_NOT_APPLICABLE
else:
raise HTTPException(status_code=400, detail="未发现标注内容,请确认无标注/不适用后再保存")
if existing:
if request.expected_updated_at and existing.updated_at: