feat(annotation): 添加分段标注统计和进度跟踪功能

- 新增 SegmentStats 类型定义用于分段统计
- 实现分段标注进度计算和缓存机制
- 添加标注任务状态判断逻辑支持分段模式
- 集成分段统计数据显示到任务列表界面
- 实现分段总数自动计算和验证功能
- 扩展标注状态枚举支持进行中标注状态
- 优化任务选择逻辑基于分段完成状态
- 添加分段统计数据预加载和同步机制
2026-01-31 15:42:04 +08:00
parent 3e0a15ac8e
commit 33cf65c9f8
5 changed files with 265 additions and 31 deletions
@@ -28,6 +28,7 @@ type EditorTaskListItem = {
  hasAnnotation: boolean;
  annotationUpdatedAt?: string | null;
  annotationStatus?: AnnotationResultStatus | null;
+  segmentStats?: SegmentStats;
 };

 type LsfMessage = {
@@ -45,6 +46,11 @@ type SegmentInfo = {
  chunkIndex: number;
 };

+type SegmentStats = { // Annotation progress of one segmented file.
+  done: number; // number of segments that already have an annotation
+  total: number; // total number of segments in the file
+};
+
 type ApiResponse<T> = {
  code?: number;
  message?: string;
@@ -136,6 +142,16 @@ const isAnnotationResultEmpty = (annotation?: Record<string, unknown>) => {
 };

 const resolveTaskStatusMeta = (item: EditorTaskListItem) => {
+  const segmentSummary = resolveSegmentSummary(item);
+  if (segmentSummary) {
+    if (segmentSummary.done >= segmentSummary.total) {
+      return { text: "已标注", type: "success" as const };
+    }
+    if (segmentSummary.done > 0) {
+      return { text: "标注中", type: "warning" as const };
+    }
+    return { text: "未标注", type: "secondary" as const };
+  }
  if (!item.hasAnnotation) {
    return { text: "未标注", type: "secondary" as const };
  }
@@ -145,6 +161,9 @@ const resolveTaskStatusMeta = (item: EditorTaskListItem) => {
  if (item.annotationStatus === AnnotationResultStatus.NOT_APPLICABLE) {
    return { text: NOT_APPLICABLE_LABEL, type: "warning" as const };
  }
+  if (item.annotationStatus === AnnotationResultStatus.IN_PROGRESS) {
+    return { text: "标注中", type: "warning" as const };
+  }
  return { text: "已标注", type: "success" as const };
 };

@@ -184,6 +203,25 @@ const buildAnnotationSnapshot = (annotation?: Record<string, unknown>) => {
 const buildSnapshotKey = (fileId: string, segmentIndex?: number) =>
  `${fileId}::${segmentIndex ?? "full"}`;

+/**
+ * Derives segment progress from a file's segment list.
+ *
+ * @param segmentList Segments of one file; may be missing or empty.
+ * @returns `total` = number of segments and `done` = how many of them have
+ *   `hasAnnotation` set, or `null` when no non-empty array was given.
+ */
+const buildSegmentStats = (segmentList?: SegmentInfo[] | null): SegmentStats | null => {
+  if (!Array.isArray(segmentList)) return null;
+  if (segmentList.length === 0) return null;
+  const annotated = segmentList.filter((segment) => segment.hasAnnotation).length;
+  return { done: annotated, total: segmentList.length };
+};
+
+/**
+ * Sanitizes segment stats before they are cached or rendered.
+ *
+ * @param stats Raw stats; both fields are coerced with `Number()`.
+ * @returns `null` when `stats` is missing or `total` is not a finite number
+ *   greater than 0; otherwise `done` clamped into `[0, total]`. A `done`
+ *   that coerces to `NaN` counts as 0, so `NaN` never reaches the status
+ *   comparison or the "done/total" label.
+ */
+const normalizeSegmentStats = (stats?: SegmentStats | null): SegmentStats | null => {
+  if (!stats) return null;
+  const total = Number(stats.total);
+  if (!Number.isFinite(total) || total <= 0) return null;
+  const done = Number(stats.done);
+  // Math.min/Math.max propagate NaN, so it has to be handled before clamping.
+  const safeDone = Number.isNaN(done) ? 0 : Math.min(Math.max(done, 0), total);
+  return { done: safeDone, total };
+};
+
+/** Returns the sanitized segment stats of a task-list item, or `null` when they are missing or invalid. */
+const resolveSegmentSummary = (item: EditorTaskListItem) => {
+  const { segmentStats } = item;
+  return normalizeSegmentStats(segmentStats);
+};
+
 const mergeTaskItems = (base: EditorTaskListItem[], next: EditorTaskListItem[]) => {
  if (next.length === 0) return base;
  const seen = new Set(base.map((item) => item.fileId));
@@ -234,6 +272,9 @@ export default function LabelStudioTextEditor() {
  const exportCheckSeqRef = useRef(0);
  const savedSnapshotsRef = useRef<Record<string, string>>({});
  const pendingAutoAdvanceRef = useRef(false);
+  const segmentStatsCacheRef = useRef<Record<string, SegmentStats>>({}); // segment stats already known, keyed by fileId
+  const segmentStatsSeqRef = useRef(0); // bumped on reset so responses belonging to an older task list are ignored
+  const segmentStatsLoadingRef = useRef<Set<string>>(new Set()); // fileIds whose stats request is currently in flight

  const [loadingProject, setLoadingProject] = useState(true);
  const [loadingTasks, setLoadingTasks] = useState(false);
@@ -276,6 +317,70 @@ export default function LabelStudioTextEditor() {
    win.postMessage({ type, payload }, origin);
  }, [origin]);

+  /**
+   * Writes (or clears) the segment stats of one task-list item.
+   *
+   * The previous `tasks` array and item objects are returned untouched when the
+   * stats did not change, so React can bail out of the update and effects that
+   * depend on `tasks` are not re-triggered by no-op writes.
+   *
+   * @param fileId Task to update; ignored when empty.
+   * @param stats New stats; `null` or invalid stats clear the item's `segmentStats`.
+   */
+  const applySegmentStats = useCallback((fileId: string, stats: SegmentStats | null) => {
+    if (!fileId) return;
+    const normalized = normalizeSegmentStats(stats) ?? undefined;
+    setTasks((prev) => {
+      let changed = false;
+      const next = prev.map((item) => {
+        if (item.fileId !== fileId) return item;
+        const current = item.segmentStats;
+        const unchanged =
+          current?.done === normalized?.done && current?.total === normalized?.total;
+        if (unchanged) return item;
+        changed = true;
+        return { ...item, segmentStats: normalized };
+      });
+      return changed ? next : prev;
+    });
+  }, []);
+
+  /**
+   * Records the latest stats for a file in the ref cache and mirrors them into
+   * the task list. Stats that do not survive normalization evict the cache entry.
+   */
+  const updateSegmentStatsCache = useCallback((fileId: string, stats: SegmentStats | null) => {
+    if (!fileId) return;
+    const sanitized = normalizeSegmentStats(stats);
+    const cache = segmentStatsCacheRef.current;
+    if (sanitized === null) {
+      delete cache[fileId];
+    } else {
+      cache[fileId] = sanitized;
+    }
+    applySegmentStats(fileId, sanitized);
+  }, [applySegmentStats]);
+
+  /**
+   * Loads segment stats for one file by requesting its editor task at segment 0.
+   *
+   * Files that are already cached or already being requested are skipped. A
+   * response is discarded when `segmentStatsSeqRef` no longer equals `seq`.
+   * Only segmented files with a non-empty segment list are cached. Errors are
+   * logged and not rethrown.
+   *
+   * @param fileId File whose stats should be loaded.
+   * @param seq Value of `segmentStatsSeqRef` when the prefetch pass started.
+   */
+  const fetchSegmentStatsForFile = useCallback(async (fileId: string, seq: number) => {
+    if (!projectId || !fileId) return;
+    // Pin the in-flight set used by this request: a reset swaps in a new Set, and a
+    // late `finally` must not clear the marker of a newer request for the same file.
+    const loading = segmentStatsLoadingRef.current;
+    if (segmentStatsCacheRef.current[fileId] || loading.has(fileId)) return;
+    loading.add(fileId);
+    try {
+      const resp = (await getEditorTaskUsingGet(projectId, fileId, {
+        segmentIndex: 0,
+      })) as ApiResponse<EditorTaskResponse>;
+      if (segmentStatsSeqRef.current !== seq) return;
+      const data = resp?.data;
+      // NOTE(review): non-segmented files are never cached, so every later prefetch
+      // pass requests them again — consider remembering negative results as well.
+      if (!data?.segmented) return;
+      const stats = buildSegmentStats(data.segments);
+      if (!stats) return;
+      segmentStatsCacheRef.current[fileId] = stats;
+      applySegmentStats(fileId, stats);
+    } catch (e) {
+      console.error(e);
+    } finally {
+      loading.delete(fileId);
+    }
+  }, [applySegmentStats, projectId]);
+
+  /**
+   * Starts a background pass that loads segment stats for every listed file that
+   * is not cached yet, using at most three concurrent workers. The pass stops
+   * picking up new files once `segmentStatsSeqRef` changes. Fire-and-forget.
+   */
+  const prefetchSegmentStats = useCallback((items: EditorTaskListItem[]) => {
+    if (!projectId) return;
+    const pending: Array<EditorTaskListItem["fileId"]> = [];
+    items.forEach(({ fileId }) => {
+      if (fileId && !segmentStatsCacheRef.current[fileId]) pending.push(fileId);
+    });
+    if (pending.length === 0) return;
+    const startedSeq = segmentStatsSeqRef.current;
+    // Shared position in `pending`; each worker claims the next unclaimed file.
+    let nextIndex = 0;
+    const drain = async () => {
+      for (;;) {
+        if (nextIndex >= pending.length) return;
+        if (segmentStatsSeqRef.current !== startedSeq) return;
+        const fileId = pending[nextIndex];
+        nextIndex += 1;
+        await fetchSegmentStatsForFile(fileId, startedSeq);
+      }
+    };
+    const workers: Promise<void>[] = [];
+    const workerLimit = Math.min(3, pending.length);
+    for (let i = 0; i < workerLimit; i += 1) {
+      workers.push(drain());
+    }
+    void Promise.all(workers);
+  }, [fetchSegmentStatsForFile, projectId]);
+
  const confirmEmptyAnnotationStatus = useCallback(() => {
    return new Promise<AnnotationResultStatus | null>((resolve) => {
      let resolved = false;
@@ -327,8 +432,13 @@ export default function LabelStudioTextEditor() {
  }, [message, projectId]);

  const updateTaskSelection = useCallback((items: EditorTaskListItem[]) => {
+    const isCompleted = (item: EditorTaskListItem) => {
+      const summary = resolveSegmentSummary(item);
+      if (summary) return summary.done >= summary.total;
+      return item.hasAnnotation;
+    };
    const defaultFileId =
-      items.find((item) => !item.hasAnnotation)?.fileId || items[0]?.fileId || "";
+      items.find((item) => !isCompleted(item))?.fileId || items[0]?.fileId || "";
    setSelectedFileId((prev) => {
      if (prev && items.some((item) => item.fileId === prev)) return prev;
      return defaultFileId;
@@ -385,6 +495,9 @@ export default function LabelStudioTextEditor() {
    if (mode === "reset") {
      prefetchSeqRef.current += 1;
      setPrefetching(false);
+      segmentStatsSeqRef.current += 1;
+      segmentStatsCacheRef.current = {};
+      segmentStatsLoadingRef.current = new Set();
    }
    if (mode === "append") {
      setLoadingMore(true);
@@ -469,13 +582,16 @@ export default function LabelStudioTextEditor() {
        ? resolveSegmentIndex(data.currentSegmentIndex) ?? 0
        : undefined;
      if (data?.segmented) {
+        const stats = buildSegmentStats(data.segments);
        setSegmented(true);
        setSegments(data.segments || []);
        setCurrentSegmentIndex(segmentIndex ?? 0);
+        updateSegmentStatsCache(fileId, stats);
      } else {
        setSegmented(false);
        setSegments([]);
        setCurrentSegmentIndex(0);
+        updateSegmentStatsCache(fileId, null);
      }

      const taskData = {
@@ -535,7 +651,7 @@ export default function LabelStudioTextEditor() {
    } finally {
      if (seq === initSeqRef.current) setLoadingTaskDetail(false);
    }
-  }, [iframeReady, message, postToIframe, project, projectId]);
+  }, [iframeReady, message, postToIframe, project, projectId, updateSegmentStatsCache]);

  const advanceAfterSave = useCallback(async (fileId: string, segmentIndex?: number) => {
    if (!fileId) return;
@@ -643,13 +759,13 @@ export default function LabelStudioTextEditor() {

      // 分段模式下更新当前段落的标注状态
      if (segmented && segmentIndex !== undefined) {
-        setSegments((prev) =>
-          prev.map((seg) =>
-            seg.idx === segmentIndex
-              ? { ...seg, hasAnnotation: true }
-              : seg
-          )
+        const nextSegments = segments.map((seg) =>
+          seg.idx === segmentIndex
+            ? { ...seg, hasAnnotation: true }
+            : seg
        );
+        setSegments(nextSegments);
+        updateSegmentStatsCache(String(fileId), buildSegmentStats(nextSegments));
      }
      if (options?.autoAdvance) {
        await advanceAfterSave(String(fileId), segmentIndex);
@@ -669,8 +785,10 @@ export default function LabelStudioTextEditor() {
    message,
    projectId,
    segmented,
+    segments,
    selectedFileId,
    tasks,
+    updateSegmentStatsCache,
  ]);

  const requestExportForCheck = useCallback(() => {
@@ -834,6 +952,9 @@ export default function LabelStudioTextEditor() {
    setSegments([]);
    setCurrentSegmentIndex(0);
    savedSnapshotsRef.current = {};
+    segmentStatsSeqRef.current += 1;
+    segmentStatsCacheRef.current = {};
+    segmentStatsLoadingRef.current = new Set();
    if (exportCheckRef.current?.timer) {
      window.clearTimeout(exportCheckRef.current.timer);
    }
@@ -847,6 +968,12 @@ export default function LabelStudioTextEditor() {
    loadTasks({ mode: "reset" });
  }, [project?.supported, loadTasks]);

+  // While the current file is segmented, load segment stats for the task list.
+  useEffect(() => {
+    const shouldPrefetch = segmented && tasks.length > 0;
+    if (shouldPrefetch) prefetchSegmentStats(tasks);
+  }, [prefetchSegmentStats, segmented, tasks]);
+
  useEffect(() => {
    if (!selectedFileId) return;
    initEditorForFile(selectedFileId);
@@ -1097,6 +1224,7 @@ export default function LabelStudioTextEditor() {
              dataSource={tasks}
              loadMore={loadMoreNode}
              renderItem={(item) => {
+                const segmentSummary = resolveSegmentSummary(item);
                const statusMeta = resolveTaskStatusMeta(item);
                return (
                  <List.Item
@@ -1110,18 +1238,25 @@ export default function LabelStudioTextEditor() {
                    onClick={() => setSelectedFileId(item.fileId)}
                  >
                    <div className="flex flex-col w-full gap-1">
-                      <Typography.Text ellipsis style={{ fontSize: 13 }}>
-                        {item.fileName}
-                      </Typography.Text>
-                      <div className="flex items-center justify-between">
-                        <Typography.Text type={statusMeta.type} style={{ fontSize: 11 }}>
-                          {statusMeta.text}
+                        <Typography.Text ellipsis style={{ fontSize: 13 }}>
+                          {item.fileName}
                        </Typography.Text>
-                        {item.annotationUpdatedAt && (
-                          <Typography.Text type="secondary" style={{ fontSize: 10 }}>
-                            {item.annotationUpdatedAt}
-                          </Typography.Text>
-                        )}
+                        <div className="flex items-center justify-between">
+                          <div className="flex items-center gap-2">
+                            <Typography.Text type={statusMeta.type} style={{ fontSize: 11 }}>
+                              {statusMeta.text}
+                            </Typography.Text>
+                            {segmentSummary && (
+                              <Typography.Text type="secondary" style={{ fontSize: 10 }}>
+                                已标注 {segmentSummary.done}/{segmentSummary.total}
+                              </Typography.Text>
+                            )}
+                          </div>
+                          {item.annotationUpdatedAt && (
+                            <Typography.Text type="secondary" style={{ fontSize: 10 }}>
+                              {item.annotationUpdatedAt}
+                            </Typography.Text>
+                          )}
                      </div>
                    </div>
                  </List.Item>
@@ -10,6 +10,7 @@ export enum AnnotationTaskStatus {

 export enum AnnotationResultStatus {
  ANNOTATED = "ANNOTATED",
+  IN_PROGRESS = "IN_PROGRESS",
  NO_ANNOTATION = "NO_ANNOTATION",
  NOT_APPLICABLE = "NOT_APPLICABLE",
 }
@@ -9,10 +9,17 @@ from app.db.session import Base
 ANNOTATION_STATUS_ANNOTATED = "ANNOTATED"
 ANNOTATION_STATUS_NO_ANNOTATION = "NO_ANNOTATION"
 ANNOTATION_STATUS_NOT_APPLICABLE = "NOT_APPLICABLE"
+ANNOTATION_STATUS_IN_PROGRESS = "IN_PROGRESS"
 ANNOTATION_STATUS_VALUES = {
    ANNOTATION_STATUS_ANNOTATED,
    ANNOTATION_STATUS_NO_ANNOTATION,
    ANNOTATION_STATUS_NOT_APPLICABLE,
+    ANNOTATION_STATUS_IN_PROGRESS,
+}
+ANNOTATION_STATUS_CLIENT_VALUES = {
+    ANNOTATION_STATUS_ANNOTATED,
+    ANNOTATION_STATUS_NO_ANNOTATION,
+    ANNOTATION_STATUS_NOT_APPLICABLE,
 }

 class AnnotationTemplate(Base):
@@ -101,7 +108,7 @@ class AnnotationResult(Base):
        String(32),
        nullable=False,
        default=ANNOTATION_STATUS_ANNOTATED,
-        comment="标注状态: ANNOTATED/NO_ANNOTATION/NOT_APPLICABLE",
+        comment="标注状态: ANNOTATED/NO_ANNOTATION/NOT_APPLICABLE/IN_PROGRESS",
    )
    created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间")
    updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间")
@@ -16,6 +16,7 @@ from pydantic import BaseModel, Field, ConfigDict

 from app.db.models.annotation_management import (
    ANNOTATION_STATUS_ANNOTATED,
+    ANNOTATION_STATUS_IN_PROGRESS,
    ANNOTATION_STATUS_NO_ANNOTATION,
    ANNOTATION_STATUS_NOT_APPLICABLE,
 )
@@ -25,6 +26,7 @@ class AnnotationStatus(str, Enum):
    """标注状态枚举"""

    ANNOTATED = ANNOTATION_STATUS_ANNOTATED
+    IN_PROGRESS = ANNOTATION_STATUS_IN_PROGRESS
    NO_ANNOTATION = ANNOTATION_STATUS_NO_ANNOTATION
    NOT_APPLICABLE = ANNOTATION_STATUS_NOT_APPLICABLE

@@ -112,7 +114,7 @@ class UpsertAnnotationRequest(BaseModel):
    annotation_status: Optional[AnnotationStatus] = Field(
        None,
        alias="annotationStatus",
-        description="标注状态（无标注传 NO_ANNOTATION，不适用传 NOT_APPLICABLE）",
+        description="标注状态（无标注传 NO_ANNOTATION，不适用传 NOT_APPLICABLE，IN_PROGRESS 由后端维护）",
    )
    expected_updated_at: Optional[datetime] = Field(
        None,
@@ -26,9 +26,10 @@ from app.core.logging import get_logger
 from app.db.models import AnnotationResult, Dataset, DatasetFiles, LabelingProject, LabelingProjectFile
 from app.db.models.annotation_management import (
    ANNOTATION_STATUS_ANNOTATED,
+    ANNOTATION_STATUS_IN_PROGRESS,
+    ANNOTATION_STATUS_CLIENT_VALUES,
    ANNOTATION_STATUS_NO_ANNOTATION,
    ANNOTATION_STATUS_NOT_APPLICABLE,
-    ANNOTATION_STATUS_VALUES,
 )
 from app.module.annotation.config import LabelStudioTagConfig
 from app.module.annotation.schema.editor import (
@@ -61,6 +62,7 @@ SEGMENT_INDEX_KEY = "segment_index"
 SEGMENT_INDEX_CAMEL_KEY = "segmentIndex"
 SEGMENTED_KEY = "segmented"
 SEGMENTS_KEY = "segments"
+SEGMENT_TOTAL_KEY = "total_segments"
 SEGMENT_RESULT_KEY = "result"
 SEGMENT_CREATED_AT_KEY = "created_at"
 SEGMENT_UPDATED_AT_KEY = "updated_at"
@@ -416,6 +418,76 @@ class AnnotationEditorService:
        result = payload.get(SEGMENT_RESULT_KEY)
        return isinstance(result, list) and len(result) > 0

+    @staticmethod
+    def _resolve_segment_total(payload: Optional[Dict[str, Any]]) -> Optional[int]:
+        if not payload or not isinstance(payload, dict):
+            return None
+        value = payload.get(SEGMENT_TOTAL_KEY)
+        if isinstance(value, int):
+            return value if value > 0 else None
+        if isinstance(value, float) and value.is_integer():
+            return int(value) if value > 0 else None
+        if isinstance(value, str) and value.isdigit():
+            parsed = int(value)
+            return parsed if parsed > 0 else None
+        return None
+
+    async def _compute_segment_total(  # Count the segments this file's text splits into; None when segmentation does not apply.
+        self,
+        project: LabelingProject,
+        file_record: DatasetFiles,
+        file_id: str,
+    ) -> Optional[int]:
+        dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id))
+        if dataset_type != DATASET_TYPE_TEXT:  # only text datasets are counted
+            return None
+        if not self._resolve_segmentation_enabled(project):  # segmentation not enabled according to _resolve_segmentation_enabled
+            return None

+        text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)
+        if not isinstance(text_content, str):  # no usable text content was returned
+            return None

+        label_config = await self._resolve_project_label_config(project)
+        primary_text_key = self._resolve_primary_text_key(label_config)
+        file_name = str(getattr(file_record, "file_name", "")).lower()

+        records: List[Tuple[Optional[Dict[str, Any]], str]] = []  # (parsed payload or None, raw text) pairs
+        if file_name.endswith(JSONL_EXTENSION):  # presumably one record per line; verify against _parse_jsonl_records
+            records = self._parse_jsonl_records(text_content)
+        else:
+            parsed_payload = self._try_parse_json_payload(text_content)
+            if parsed_payload:
+                records = [(parsed_payload, text_content)]

+        if not records:  # nothing parsed: treat the whole file as one raw-text record
+            records = [(None, text_content)]

+        record_texts = [
+            self._resolve_primary_text_value(payload, raw_text, primary_text_key)
+            for payload, raw_text in records
+        ]
+        if not record_texts:  # NOTE(review): records is non-empty here, so this fallback looks unreachable
+            record_texts = [text_content]

+        needs_segmentation = len(records) > 1 or any(  # several records, or any record text longer than SEGMENT_THRESHOLD
+            len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts
+        )
+        if not needs_segmentation:
+            return None

+        splitter = AnnotationTextSplitter(max_chars=self.SEGMENT_THRESHOLD)
+        total_segments = 0
+        for record_text in record_texts:
+            normalized_text = record_text or ""  # a missing record text counts as empty
+            if len(normalized_text) > self.SEGMENT_THRESHOLD:
+                raw_segments = splitter.split(normalized_text)
+                total_segments += len(raw_segments) if raw_segments else 1  # one per chunk, at least 1
+            else:
+                total_segments += 1  # a short record is a single segment

+        return total_segments if total_segments > 0 else 1  # NOTE(review): every record adds at least 1, so `else 1` looks unreachable
+
    @classmethod
    def _build_source_document_filter(cls):
        file_type_lower = func.lower(DatasetFiles.file_type)
@@ -946,19 +1018,36 @@ class AnnotationEditorService:
            final_payload = annotation_payload

        requested_status = request.annotation_status
-        if requested_status is not None and requested_status not in ANNOTATION_STATUS_VALUES:
+        if requested_status is not None and requested_status not in ANNOTATION_STATUS_CLIENT_VALUES:
            raise HTTPException(status_code=400, detail="annotationStatus 不合法")

-        has_result = self._has_annotation_result(final_payload)
-        if has_result:
-            final_status = ANNOTATION_STATUS_ANNOTATED
+        segment_total = None
+        segment_done = None
+        if request.segment_index is not None:
+            segment_total = self._resolve_segment_total(final_payload)
+            if segment_total is None:
+                segment_total = await self._compute_segment_total(project, file_record, file_id)
+            if segment_total and segment_total > 0:
+                final_payload[SEGMENT_TOTAL_KEY] = segment_total
+                segment_done = len(self._extract_segment_annotations(final_payload))
+
+        if (
+            segment_total is not None
+            and segment_done is not None
+            and segment_done < segment_total
+        ):
+            final_status = ANNOTATION_STATUS_IN_PROGRESS
        else:
-            if requested_status == ANNOTATION_STATUS_NO_ANNOTATION:
-                final_status = ANNOTATION_STATUS_NO_ANNOTATION
-            elif requested_status == ANNOTATION_STATUS_NOT_APPLICABLE:
-                final_status = ANNOTATION_STATUS_NOT_APPLICABLE
+            has_result = self._has_annotation_result(final_payload)
+            if has_result:
+                final_status = ANNOTATION_STATUS_ANNOTATED
            else:
-                raise HTTPException(status_code=400, detail="未发现标注内容，请确认无标注/不适用后再保存")
+                if requested_status == ANNOTATION_STATUS_NO_ANNOTATION:
+                    final_status = ANNOTATION_STATUS_NO_ANNOTATION
+                elif requested_status == ANNOTATION_STATUS_NOT_APPLICABLE:
+                    final_status = ANNOTATION_STATUS_NOT_APPLICABLE
+                else:
+                    raise HTTPException(status_code=400, detail="未发现标注内容，请确认无标注/不适用后再保存")

        if existing:
            if request.expected_updated_at and existing.updated_at: