From 33cf65c9f8ca7a7219dd4de16677e313572371ca Mon Sep 17 00:00:00 2001 From: Jerry Yan <792602257@qq.com> Date: Sat, 31 Jan 2026 15:42:04 +0800 Subject: [PATCH] =?UTF-8?q?feat(annotation):=20=E6=B7=BB=E5=8A=A0=E5=88=86?= =?UTF-8?q?=E6=AE=B5=E6=A0=87=E6=B3=A8=E7=BB=9F=E8=AE=A1=E5=92=8C=E8=BF=9B?= =?UTF-8?q?=E5=BA=A6=E8=B7=9F=E8=B8=AA=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 新增 SegmentStats 类型定义用于分段统计 - 实现分段标注进度计算和缓存机制 - 添加标注任务状态判断逻辑支持分段模式 - 集成分段统计数据显示到任务列表界面 - 实现分段总数自动计算和验证功能 - 扩展标注状态枚举支持进行中标注状态 - 优化任务选择逻辑基于分段完成状态 - 添加分段统计数据预加载和同步机制 --- .../Annotate/LabelStudioTextEditor.tsx | 173 ++++++++++++++++-- .../pages/DataAnnotation/annotation.model.ts | 1 + .../app/db/models/annotation_management.py | 9 +- .../app/module/annotation/schema/editor.py | 4 +- .../app/module/annotation/service/editor.py | 109 ++++++++++- 5 files changed, 265 insertions(+), 31 deletions(-) diff --git a/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx b/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx index 4498dee..fe28229 100644 --- a/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx +++ b/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx @@ -28,6 +28,7 @@ type EditorTaskListItem = { hasAnnotation: boolean; annotationUpdatedAt?: string | null; annotationStatus?: AnnotationResultStatus | null; + segmentStats?: SegmentStats; }; type LsfMessage = { @@ -45,6 +46,11 @@ type SegmentInfo = { chunkIndex: number; }; +type SegmentStats = { + done: number; + total: number; +}; + type ApiResponse = { code?: number; message?: string; @@ -136,6 +142,16 @@ const isAnnotationResultEmpty = (annotation?: Record) => { }; const resolveTaskStatusMeta = (item: EditorTaskListItem) => { + const segmentSummary = resolveSegmentSummary(item); + if (segmentSummary) { + if (segmentSummary.done >= segmentSummary.total) { + return { text: "已标注", type: "success" as const }; + } + if (segmentSummary.done > 0) { + return { text: "标注中", type: "warning" as const }; + } + return { text: "未标注", type: "secondary" as const }; + } if (!item.hasAnnotation) { return { text: "未标注", type: "secondary" as const }; } @@ -145,6 +161,9 @@ const resolveTaskStatusMeta = (item: EditorTaskListItem) => { if (item.annotationStatus === AnnotationResultStatus.NOT_APPLICABLE) { return { text: NOT_APPLICABLE_LABEL, type: "warning" as const }; } + if (item.annotationStatus === AnnotationResultStatus.IN_PROGRESS) { + return { text: "标注中", type: "warning" as const }; + } return { text: "已标注", type: "success" as const }; }; @@ -184,6 +203,25 @@ const buildAnnotationSnapshot = (annotation?: Record) => { const buildSnapshotKey = (fileId: string, segmentIndex?: number) => `${fileId}::${segmentIndex ?? "full"}`; +const buildSegmentStats = (segmentList?: SegmentInfo[] | null): SegmentStats | null => { + if (!Array.isArray(segmentList) || segmentList.length === 0) return null; + const total = segmentList.length; + const done = segmentList.reduce((count, seg) => count + (seg.hasAnnotation ? 1 : 0), 0); + return { done, total }; +}; + +const normalizeSegmentStats = (stats?: SegmentStats | null): SegmentStats | null => { + if (!stats) return null; + const total = Number(stats.total); + const done = Number(stats.done); + if (!Number.isFinite(total) || total <= 0) return null; + const safeDone = Math.min(Math.max(done, 0), total); + return { done: safeDone, total }; +}; + +const resolveSegmentSummary = (item: EditorTaskListItem) => + normalizeSegmentStats(item.segmentStats); + const mergeTaskItems = (base: EditorTaskListItem[], next: EditorTaskListItem[]) => { if (next.length === 0) return base; const seen = new Set(base.map((item) => item.fileId)); @@ -234,6 +272,9 @@ export default function LabelStudioTextEditor() { const exportCheckSeqRef = useRef(0); const savedSnapshotsRef = useRef>({}); const pendingAutoAdvanceRef = useRef(false); + const segmentStatsCacheRef = useRef>({}); + const segmentStatsSeqRef = useRef(0); + const segmentStatsLoadingRef = useRef>(new Set()); const [loadingProject, setLoadingProject] = useState(true); const [loadingTasks, setLoadingTasks] = useState(false); @@ -276,6 +317,70 @@ export default function LabelStudioTextEditor() { win.postMessage({ type, payload }, origin); }, [origin]); + const applySegmentStats = useCallback((fileId: string, stats: SegmentStats | null) => { + if (!fileId) return; + const normalized = normalizeSegmentStats(stats); + setTasks((prev) => + prev.map((item) => + item.fileId === fileId + ? { ...item, segmentStats: normalized || undefined } + : item + ) + ); + }, []); + + const updateSegmentStatsCache = useCallback((fileId: string, stats: SegmentStats | null) => { + if (!fileId) return; + const normalized = normalizeSegmentStats(stats); + if (normalized) { + segmentStatsCacheRef.current[fileId] = normalized; + } else { + delete segmentStatsCacheRef.current[fileId]; + } + applySegmentStats(fileId, normalized); + }, [applySegmentStats]); + + const fetchSegmentStatsForFile = useCallback(async (fileId: string, seq: number) => { + if (!projectId || !fileId) return; + if (segmentStatsCacheRef.current[fileId] || segmentStatsLoadingRef.current.has(fileId)) return; + segmentStatsLoadingRef.current.add(fileId); + try { + const resp = (await getEditorTaskUsingGet(projectId, fileId, { + segmentIndex: 0, + })) as ApiResponse; + if (segmentStatsSeqRef.current !== seq) return; + const data = resp?.data; + if (!data?.segmented) return; + const stats = buildSegmentStats(data.segments); + if (!stats) return; + segmentStatsCacheRef.current[fileId] = stats; + applySegmentStats(fileId, stats); + } catch (e) { + console.error(e); + } finally { + segmentStatsLoadingRef.current.delete(fileId); + } + }, [applySegmentStats, projectId]); + + const prefetchSegmentStats = useCallback((items: EditorTaskListItem[]) => { + if (!projectId) return; + const fileIds = items + .map((item) => item.fileId) + .filter((fileId) => fileId && !segmentStatsCacheRef.current[fileId]); + if (fileIds.length === 0) return; + const seq = segmentStatsSeqRef.current; + let cursor = 0; + const workerCount = Math.min(3, fileIds.length); + const runWorker = async () => { + while (cursor < fileIds.length && segmentStatsSeqRef.current === seq) { + const fileId = fileIds[cursor]; + cursor += 1; + await fetchSegmentStatsForFile(fileId, seq); + } + }; + void Promise.all(Array.from({ length: workerCount }, () => runWorker())); + }, [fetchSegmentStatsForFile, projectId]); + const confirmEmptyAnnotationStatus = useCallback(() => { return new Promise((resolve) => { let resolved = false; @@ -327,8 +432,13 @@ export default function LabelStudioTextEditor() { }, [message, projectId]); const updateTaskSelection = useCallback((items: EditorTaskListItem[]) => { + const isCompleted = (item: EditorTaskListItem) => { + const summary = resolveSegmentSummary(item); + if (summary) return summary.done >= summary.total; + return item.hasAnnotation; + }; const defaultFileId = - items.find((item) => !item.hasAnnotation)?.fileId || items[0]?.fileId || ""; + items.find((item) => !isCompleted(item))?.fileId || items[0]?.fileId || ""; setSelectedFileId((prev) => { if (prev && items.some((item) => item.fileId === prev)) return prev; return defaultFileId; @@ -385,6 +495,9 @@ export default function LabelStudioTextEditor() { if (mode === "reset") { prefetchSeqRef.current += 1; setPrefetching(false); + segmentStatsSeqRef.current += 1; + segmentStatsCacheRef.current = {}; + segmentStatsLoadingRef.current = new Set(); } if (mode === "append") { setLoadingMore(true); @@ -469,13 +582,16 @@ export default function LabelStudioTextEditor() { ? resolveSegmentIndex(data.currentSegmentIndex) ?? 0 : undefined; if (data?.segmented) { + const stats = buildSegmentStats(data.segments); setSegmented(true); setSegments(data.segments || []); setCurrentSegmentIndex(segmentIndex ?? 0); + updateSegmentStatsCache(fileId, stats); } else { setSegmented(false); setSegments([]); setCurrentSegmentIndex(0); + updateSegmentStatsCache(fileId, null); } const taskData = { @@ -535,7 +651,7 @@ export default function LabelStudioTextEditor() { } finally { if (seq === initSeqRef.current) setLoadingTaskDetail(false); } - }, [iframeReady, message, postToIframe, project, projectId]); + }, [iframeReady, message, postToIframe, project, projectId, updateSegmentStatsCache]); const advanceAfterSave = useCallback(async (fileId: string, segmentIndex?: number) => { if (!fileId) return; @@ -643,13 +759,13 @@ export default function LabelStudioTextEditor() { // 分段模式下更新当前段落的标注状态 if (segmented && segmentIndex !== undefined) { - setSegments((prev) => - prev.map((seg) => - seg.idx === segmentIndex - ? { ...seg, hasAnnotation: true } - : seg - ) + const nextSegments = segments.map((seg) => + seg.idx === segmentIndex + ? { ...seg, hasAnnotation: true } + : seg ); + setSegments(nextSegments); + updateSegmentStatsCache(String(fileId), buildSegmentStats(nextSegments)); } if (options?.autoAdvance) { await advanceAfterSave(String(fileId), segmentIndex); @@ -669,8 +785,10 @@ export default function LabelStudioTextEditor() { message, projectId, segmented, + segments, selectedFileId, tasks, + updateSegmentStatsCache, ]); const requestExportForCheck = useCallback(() => { @@ -834,6 +952,9 @@ export default function LabelStudioTextEditor() { setSegments([]); setCurrentSegmentIndex(0); savedSnapshotsRef.current = {}; + segmentStatsSeqRef.current += 1; + segmentStatsCacheRef.current = {}; + segmentStatsLoadingRef.current = new Set(); if (exportCheckRef.current?.timer) { window.clearTimeout(exportCheckRef.current.timer); } @@ -847,6 +968,12 @@ export default function LabelStudioTextEditor() { loadTasks({ mode: "reset" }); }, [project?.supported, loadTasks]); + useEffect(() => { + if (!segmented) return; + if (tasks.length === 0) return; + prefetchSegmentStats(tasks); + }, [prefetchSegmentStats, segmented, tasks]); + useEffect(() => { if (!selectedFileId) return; initEditorForFile(selectedFileId); @@ -1097,6 +1224,7 @@ export default function LabelStudioTextEditor() { dataSource={tasks} loadMore={loadMoreNode} renderItem={(item) => { + const segmentSummary = resolveSegmentSummary(item); const statusMeta = resolveTaskStatusMeta(item); return ( setSelectedFileId(item.fileId)} >
- - {item.fileName} - -
- - {statusMeta.text} + + {item.fileName} - {item.annotationUpdatedAt && ( - - {item.annotationUpdatedAt} - - )} +
+
+ + {statusMeta.text} + + {segmentSummary && ( + + 已标注 {segmentSummary.done}/{segmentSummary.total} + + )} +
+ {item.annotationUpdatedAt && ( + + {item.annotationUpdatedAt} + + )}
diff --git a/frontend/src/pages/DataAnnotation/annotation.model.ts b/frontend/src/pages/DataAnnotation/annotation.model.ts index a63f30c..309a4a0 100644 --- a/frontend/src/pages/DataAnnotation/annotation.model.ts +++ b/frontend/src/pages/DataAnnotation/annotation.model.ts @@ -10,6 +10,7 @@ export enum AnnotationTaskStatus { export enum AnnotationResultStatus { ANNOTATED = "ANNOTATED", + IN_PROGRESS = "IN_PROGRESS", NO_ANNOTATION = "NO_ANNOTATION", NOT_APPLICABLE = "NOT_APPLICABLE", } diff --git a/runtime/datamate-python/app/db/models/annotation_management.py b/runtime/datamate-python/app/db/models/annotation_management.py index b5f2444..c10cb8d 100644 --- a/runtime/datamate-python/app/db/models/annotation_management.py +++ b/runtime/datamate-python/app/db/models/annotation_management.py @@ -9,10 +9,17 @@ from app.db.session import Base ANNOTATION_STATUS_ANNOTATED = "ANNOTATED" ANNOTATION_STATUS_NO_ANNOTATION = "NO_ANNOTATION" ANNOTATION_STATUS_NOT_APPLICABLE = "NOT_APPLICABLE" +ANNOTATION_STATUS_IN_PROGRESS = "IN_PROGRESS" ANNOTATION_STATUS_VALUES = { ANNOTATION_STATUS_ANNOTATED, ANNOTATION_STATUS_NO_ANNOTATION, ANNOTATION_STATUS_NOT_APPLICABLE, + ANNOTATION_STATUS_IN_PROGRESS, +} +ANNOTATION_STATUS_CLIENT_VALUES = { + ANNOTATION_STATUS_ANNOTATED, + ANNOTATION_STATUS_NO_ANNOTATION, + ANNOTATION_STATUS_NOT_APPLICABLE, } class AnnotationTemplate(Base): @@ -101,7 +108,7 @@ class AnnotationResult(Base): String(32), nullable=False, default=ANNOTATION_STATUS_ANNOTATED, - comment="标注状态: ANNOTATED/NO_ANNOTATION/NOT_APPLICABLE", + comment="标注状态: ANNOTATED/NO_ANNOTATION/NOT_APPLICABLE/IN_PROGRESS", ) created_at = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="创建时间") updated_at = Column(TIMESTAMP, server_default=func.current_timestamp(), onupdate=func.current_timestamp(), comment="更新时间") diff --git a/runtime/datamate-python/app/module/annotation/schema/editor.py b/runtime/datamate-python/app/module/annotation/schema/editor.py index 36711b3..06ced10 100644 --- a/runtime/datamate-python/app/module/annotation/schema/editor.py +++ b/runtime/datamate-python/app/module/annotation/schema/editor.py @@ -16,6 +16,7 @@ from pydantic import BaseModel, Field, ConfigDict from app.db.models.annotation_management import ( ANNOTATION_STATUS_ANNOTATED, + ANNOTATION_STATUS_IN_PROGRESS, ANNOTATION_STATUS_NO_ANNOTATION, ANNOTATION_STATUS_NOT_APPLICABLE, ) @@ -25,6 +26,7 @@ class AnnotationStatus(str, Enum): """标注状态枚举""" ANNOTATED = ANNOTATION_STATUS_ANNOTATED + IN_PROGRESS = ANNOTATION_STATUS_IN_PROGRESS NO_ANNOTATION = ANNOTATION_STATUS_NO_ANNOTATION NOT_APPLICABLE = ANNOTATION_STATUS_NOT_APPLICABLE @@ -112,7 +114,7 @@ class UpsertAnnotationRequest(BaseModel): annotation_status: Optional[AnnotationStatus] = Field( None, alias="annotationStatus", - description="标注状态(无标注传 NO_ANNOTATION,不适用传 NOT_APPLICABLE)", + description="标注状态(无标注传 NO_ANNOTATION,不适用传 NOT_APPLICABLE,IN_PROGRESS 由后端维护)", ) expected_updated_at: Optional[datetime] = Field( None, diff --git a/runtime/datamate-python/app/module/annotation/service/editor.py b/runtime/datamate-python/app/module/annotation/service/editor.py index 10953a3..1cd5587 100644 --- a/runtime/datamate-python/app/module/annotation/service/editor.py +++ b/runtime/datamate-python/app/module/annotation/service/editor.py @@ -26,9 +26,10 @@ from app.core.logging import get_logger from app.db.models import AnnotationResult, Dataset, DatasetFiles, LabelingProject, LabelingProjectFile from app.db.models.annotation_management import ( ANNOTATION_STATUS_ANNOTATED, + ANNOTATION_STATUS_IN_PROGRESS, + ANNOTATION_STATUS_CLIENT_VALUES, ANNOTATION_STATUS_NO_ANNOTATION, ANNOTATION_STATUS_NOT_APPLICABLE, - ANNOTATION_STATUS_VALUES, ) from app.module.annotation.config import LabelStudioTagConfig from app.module.annotation.schema.editor import ( @@ -61,6 +62,7 @@ SEGMENT_INDEX_KEY = "segment_index" SEGMENT_INDEX_CAMEL_KEY = "segmentIndex" SEGMENTED_KEY = "segmented" SEGMENTS_KEY = "segments" +SEGMENT_TOTAL_KEY = "total_segments" SEGMENT_RESULT_KEY = "result" SEGMENT_CREATED_AT_KEY = "created_at" SEGMENT_UPDATED_AT_KEY = "updated_at" @@ -416,6 +418,76 @@ class AnnotationEditorService: result = payload.get(SEGMENT_RESULT_KEY) return isinstance(result, list) and len(result) > 0 + @staticmethod + def _resolve_segment_total(payload: Optional[Dict[str, Any]]) -> Optional[int]: + if not payload or not isinstance(payload, dict): + return None + value = payload.get(SEGMENT_TOTAL_KEY) + if isinstance(value, int): + return value if value > 0 else None + if isinstance(value, float) and value.is_integer(): + return int(value) if value > 0 else None + if isinstance(value, str) and value.isdigit(): + parsed = int(value) + return parsed if parsed > 0 else None + return None + + async def _compute_segment_total( + self, + project: LabelingProject, + file_record: DatasetFiles, + file_id: str, + ) -> Optional[int]: + dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id)) + if dataset_type != DATASET_TYPE_TEXT: + return None + if not self._resolve_segmentation_enabled(project): + return None + + text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id) + if not isinstance(text_content, str): + return None + + label_config = await self._resolve_project_label_config(project) + primary_text_key = self._resolve_primary_text_key(label_config) + file_name = str(getattr(file_record, "file_name", "")).lower() + + records: List[Tuple[Optional[Dict[str, Any]], str]] = [] + if file_name.endswith(JSONL_EXTENSION): + records = self._parse_jsonl_records(text_content) + else: + parsed_payload = self._try_parse_json_payload(text_content) + if parsed_payload: + records = [(parsed_payload, text_content)] + + if not records: + records = [(None, text_content)] + + record_texts = [ + self._resolve_primary_text_value(payload, raw_text, primary_text_key) + for payload, raw_text in records + ] + if not record_texts: + record_texts = [text_content] + + needs_segmentation = len(records) > 1 or any( + len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts + ) + if not needs_segmentation: + return None + + splitter = AnnotationTextSplitter(max_chars=self.SEGMENT_THRESHOLD) + total_segments = 0 + for record_text in record_texts: + normalized_text = record_text or "" + if len(normalized_text) > self.SEGMENT_THRESHOLD: + raw_segments = splitter.split(normalized_text) + total_segments += len(raw_segments) if raw_segments else 1 + else: + total_segments += 1 + + return total_segments if total_segments > 0 else 1 + @classmethod def _build_source_document_filter(cls): file_type_lower = func.lower(DatasetFiles.file_type) @@ -946,19 +1018,36 @@ class AnnotationEditorService: final_payload = annotation_payload requested_status = request.annotation_status - if requested_status is not None and requested_status not in ANNOTATION_STATUS_VALUES: + if requested_status is not None and requested_status not in ANNOTATION_STATUS_CLIENT_VALUES: raise HTTPException(status_code=400, detail="annotationStatus 不合法") - has_result = self._has_annotation_result(final_payload) - if has_result: - final_status = ANNOTATION_STATUS_ANNOTATED + segment_total = None + segment_done = None + if request.segment_index is not None: + segment_total = self._resolve_segment_total(final_payload) + if segment_total is None: + segment_total = await self._compute_segment_total(project, file_record, file_id) + if segment_total and segment_total > 0: + final_payload[SEGMENT_TOTAL_KEY] = segment_total + segment_done = len(self._extract_segment_annotations(final_payload)) + + if ( + segment_total is not None + and segment_done is not None + and segment_done < segment_total + ): + final_status = ANNOTATION_STATUS_IN_PROGRESS else: - if requested_status == ANNOTATION_STATUS_NO_ANNOTATION: - final_status = ANNOTATION_STATUS_NO_ANNOTATION - elif requested_status == ANNOTATION_STATUS_NOT_APPLICABLE: - final_status = ANNOTATION_STATUS_NOT_APPLICABLE + has_result = self._has_annotation_result(final_payload) + if has_result: + final_status = ANNOTATION_STATUS_ANNOTATED else: - raise HTTPException(status_code=400, detail="未发现标注内容,请确认无标注/不适用后再保存") + if requested_status == ANNOTATION_STATUS_NO_ANNOTATION: + final_status = ANNOTATION_STATUS_NO_ANNOTATION + elif requested_status == ANNOTATION_STATUS_NOT_APPLICABLE: + final_status = ANNOTATION_STATUS_NOT_APPLICABLE + else: + raise HTTPException(status_code=400, detail="未发现标注内容,请确认无标注/不适用后再保存") if existing: if request.expected_updated_at and existing.updated_at: