feat(annotation): 添加分段标注统计和进度跟踪功能

- 新增 SegmentStats 类型定义用于分段统计
- 实现分段标注进度计算和缓存机制
- 添加标注任务状态判断逻辑支持分段模式
- 集成分段统计数据显示到任务列表界面
- 实现分段总数自动计算和验证功能
- 扩展标注状态枚举支持进行中标注状态
- 优化任务选择逻辑基于分段完成状态
- 添加分段统计数据预加载和同步机制
2026-01-31 15:42:04 +08:00
parent 3e0a15ac8e
commit 33cf65c9f8
5 changed files with 265 additions and 31 deletions
--- a/runtime/datamate-python/app/module/annotation/service/editor.py
+++ b/runtime/datamate-python/app/module/annotation/service/editor.py
@@ -26,9 +26,10 @@ from app.core.logging import get_logger
 from app.db.models import AnnotationResult, Dataset, DatasetFiles, LabelingProject, LabelingProjectFile
 from app.db.models.annotation_management import (
    ANNOTATION_STATUS_ANNOTATED,
+    ANNOTATION_STATUS_IN_PROGRESS,
+    ANNOTATION_STATUS_CLIENT_VALUES,
    ANNOTATION_STATUS_NO_ANNOTATION,
    ANNOTATION_STATUS_NOT_APPLICABLE,
-    ANNOTATION_STATUS_VALUES,
 )
 from app.module.annotation.config import LabelStudioTagConfig
 from app.module.annotation.schema.editor import (
@@ -61,6 +62,7 @@ SEGMENT_INDEX_KEY = "segment_index"
 SEGMENT_INDEX_CAMEL_KEY = "segmentIndex"
 SEGMENTED_KEY = "segmented"
 SEGMENTS_KEY = "segments"
+SEGMENT_TOTAL_KEY = "total_segments"
 SEGMENT_RESULT_KEY = "result"
 SEGMENT_CREATED_AT_KEY = "created_at"
 SEGMENT_UPDATED_AT_KEY = "updated_at"
@@ -416,6 +418,76 @@ class AnnotationEditorService:
        result = payload.get(SEGMENT_RESULT_KEY)
        return isinstance(result, list) and len(result) > 0

+    @staticmethod
+    def _resolve_segment_total(payload: Optional[Dict[str, Any]]) -> Optional[int]:
+        """Return the payload's positive segment total, or ``None`` if missing or invalid."""
+        if not payload or not isinstance(payload, dict):
+            return None
+        value = payload.get(SEGMENT_TOTAL_KEY)
+        if isinstance(value, str):
+            # isdecimal(), unlike isdigit(), rejects characters such as "²" that make int() raise.
+            value = int(value) if value.isdecimal() else None
+        # bool is a subclass of int, but True/False is not a meaningful segment count.
+        is_whole = isinstance(value, int) and not isinstance(value, bool)
+        is_whole = is_whole or (isinstance(value, float) and value.is_integer())
+        return int(value) if is_whole and value > 0 else None
+
+    async def _compute_segment_total(
+        self,
+        project: LabelingProject,
+        file_record: DatasetFiles,
+        file_id: str,
+    ) -> Optional[int]:
+        """Compute the file's total segment count, or ``None`` if it is not segmented.
+
+        ``None`` is returned when the dataset type is not text, segmentation is disabled for
+        the project, the fetched content is not a ``str``, or there is a single record whose
+        text does not exceed ``SEGMENT_THRESHOLD``.
+        """
+        raw_type = await self._get_dataset_type(project.dataset_id)
+        if self._normalize_dataset_type(raw_type) != DATASET_TYPE_TEXT:
+            return None
+        if not self._resolve_segmentation_enabled(project):
+            return None
+
+        content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)
+        if not isinstance(content, str):
+            return None
+
+        label_config = await self._resolve_project_label_config(project)
+        text_key = self._resolve_primary_text_key(label_config)
+        lowered_name = str(getattr(file_record, "file_name", "")).lower()
+
+        # JSONL files go through the JSONL parser; anything else is tried as one JSON payload.
+        entries: List[Tuple[Optional[Dict[str, Any]], str]] = []
+        if lowered_name.endswith(JSONL_EXTENSION):
+            entries = self._parse_jsonl_records(content)
+        else:
+            json_payload = self._try_parse_json_payload(content)
+            if json_payload:
+                entries = [(json_payload, content)]
+        # Nothing parsed: treat the raw content as a single record without a payload.
+        entries = entries or [(None, content)]
+
+        texts = [
+            self._resolve_primary_text_value(entry, raw, text_key) for entry, raw in entries
+        ] or [content]
+
+        limit = self.SEGMENT_THRESHOLD
+        # Segmentation applies with several records or when any record text exceeds the limit.
+        if len(entries) <= 1 and not any(len(text or "") > limit for text in texts):
+            return None
+
+        splitter = AnnotationTextSplitter(max_chars=limit)
+        segment_count = 0
+        for text in texts:
+            body = text or ""
+            # Only over-limit texts are split; every record counts for at least one segment.
+            pieces = splitter.split(body) if len(body) > limit else None
+            segment_count += len(pieces) if pieces else 1
+
+        return max(segment_count, 1)
+
    @classmethod
    def _build_source_document_filter(cls):
        file_type_lower = func.lower(DatasetFiles.file_type)
@@ -946,19 +1018,36 @@ class AnnotationEditorService:
            final_payload = annotation_payload

        requested_status = request.annotation_status
-        if requested_status is not None and requested_status not in ANNOTATION_STATUS_VALUES:
+        if requested_status is not None and requested_status not in ANNOTATION_STATUS_CLIENT_VALUES:
            raise HTTPException(status_code=400, detail="annotationStatus 不合法")

-        has_result = self._has_annotation_result(final_payload)
-        if has_result:
-            final_status = ANNOTATION_STATUS_ANNOTATED
+        segment_total = None
+        segment_done = None
+        if request.segment_index is not None:
+            segment_total = self._resolve_segment_total(final_payload)
+            if segment_total is None:
+                segment_total = await self._compute_segment_total(project, file_record, file_id)
+            if segment_total and segment_total > 0:
+                final_payload[SEGMENT_TOTAL_KEY] = segment_total
+                segment_done = len(self._extract_segment_annotations(final_payload))
+
+        if (
+            segment_total is not None
+            and segment_done is not None
+            and segment_done < segment_total
+        ):
+            final_status = ANNOTATION_STATUS_IN_PROGRESS
        else:
-            if requested_status == ANNOTATION_STATUS_NO_ANNOTATION:
-                final_status = ANNOTATION_STATUS_NO_ANNOTATION
-            elif requested_status == ANNOTATION_STATUS_NOT_APPLICABLE:
-                final_status = ANNOTATION_STATUS_NOT_APPLICABLE
+            has_result = self._has_annotation_result(final_payload)
+            if has_result:
+                final_status = ANNOTATION_STATUS_ANNOTATED
            else:
-                raise HTTPException(status_code=400, detail="未发现标注内容，请确认无标注/不适用后再保存")
+                if requested_status == ANNOTATION_STATUS_NO_ANNOTATION:
+                    final_status = ANNOTATION_STATUS_NO_ANNOTATION
+                elif requested_status == ANNOTATION_STATUS_NOT_APPLICABLE:
+                    final_status = ANNOTATION_STATUS_NOT_APPLICABLE
+                else:
+                    raise HTTPException(status_code=400, detail="未发现标注内容，请确认无标注/不适用后再保存")

        if existing:
            if request.expected_updated_at and existing.updated_at: