feat(annotation): 优化文本标注分段功能实现

- 新增 getEditorTaskSegmentsUsingGet 接口用于获取任务分段信息
- 移除 SegmentInfo 中的 text、start、end 字段，精简数据结构
- 添加 EditorTaskSegmentsResponse 类型定义用于分段摘要响应
- 实现服务端 get_task_segments 方法，支持分段信息查询
- 重构前端组件缓存机制，使用 segmentSummaryFileRef 管理分段状态
- 优化分段构建逻辑，提取 _build_segment_contexts 公共方法
- 调整后端 _build_text_task 方法中的分段处理流程
- 更新 API 类型定义，统一 RequestParams 和 RequestPayload 类型
2026-02-04 16:59:04 +08:00
parent 394e2bda18
commit cda22a720c
5 changed files with 250 additions and 108 deletions
--- a/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx
+++ b/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx
@@ -6,6 +6,7 @@ import { useNavigate, useParams } from "react-router";
 import {
  getEditorProjectInfoUsingGet,
  getEditorTaskUsingGet,
  getEditorTaskSegmentsUsingGet,
  listEditorTasksUsingGet,
  upsertEditorAnnotationUsingPut,
 } from "../annotation.api";
@@ -38,9 +39,6 @@ type LsfMessage = {
 type SegmentInfo = {
  idx: number;
  text: string;
  start: number;
  end: number;
  hasAnnotation: boolean;
  lineIndex: number;
  chunkIndex: number;
@@ -66,10 +64,16 @@ type EditorTaskPayload = {
 type EditorTaskResponse = {
  task?: EditorTaskPayload;
  segmented?: boolean;
-  segments?: SegmentInfo[];
+  totalSegments?: number;
  currentSegmentIndex?: number;
 };
 /**
  * Payload of the task-segments endpoint (the result of
  * `getEditorTaskSegmentsUsingGet` is cast to `ApiResponse` of this type).
  * Every field is optional, so consumers must handle missing values.
  */
 type EditorTaskSegmentsResponse = {
  /** Whether segmented mode is enabled for the file. */
  segmented?: boolean;
  /** Per-segment summaries. */
  segments?: SegmentInfo[];
  /** Total number of segments. */
  totalSegments?: number;
 };
 type EditorTaskListResponse = {
  content?: EditorTaskListItem[];
  totalElements?: number;
@@ -288,6 +292,7 @@ export default function LabelStudioTextEditor() {
  const segmentStatsCacheRef = useRef<Record<string, SegmentStats>>({});
  const segmentStatsSeqRef = useRef(0);
  const segmentStatsLoadingRef = useRef<Set<string>>(new Set());
  const segmentSummaryFileRef = useRef<string>("");
  const [loadingProject, setLoadingProject] = useState(true);
  const [loadingTasks, setLoadingTasks] = useState(false);
@@ -358,9 +363,7 @@ export default function LabelStudioTextEditor() {
    if (segmentStatsCacheRef.current[fileId] || segmentStatsLoadingRef.current.has(fileId)) return;
    segmentStatsLoadingRef.current.add(fileId);
    try {
-      const resp = (await getEditorTaskUsingGet(projectId, fileId, {
+      const resp = (await getEditorTaskSegmentsUsingGet(projectId, fileId)) as ApiResponse<EditorTaskSegmentsResponse>;
        segmentIndex: 0,
      })) as ApiResponse<EditorTaskResponse>;
      if (segmentStatsSeqRef.current !== seq) return;
      const data = resp?.data;
      if (!data?.segmented) return;
@@ -591,20 +594,38 @@ export default function LabelStudioTextEditor() {
      if (seq !== initSeqRef.current) return;
      // 更新分段状态
-      const segmentIndex = data?.segmented
+      const isSegmented = !!data?.segmented;
      const segmentIndex = isSegmented
        ? resolveSegmentIndex(data.currentSegmentIndex) ?? 0
        : undefined;
-      if (data?.segmented) {
+      if (isSegmented) {
-        const stats = buildSegmentStats(data.segments);
+        let nextSegments: SegmentInfo[] = [];
        if (segmentSummaryFileRef.current === fileId && segments.length > 0) {
          nextSegments = segments;
        } else {
          try {
            const segmentResp = (await getEditorTaskSegmentsUsingGet(projectId, fileId)) as ApiResponse<EditorTaskSegmentsResponse>;
            if (seq !== initSeqRef.current) return;
            const segmentData = segmentResp?.data;
            if (segmentData?.segmented) {
              nextSegments = Array.isArray(segmentData.segments) ? segmentData.segments : [];
            }
          } catch (e) {
            console.error(e);
          }
        }
        const stats = buildSegmentStats(nextSegments);
        setSegmented(true);
-        setSegments(data.segments || []);
+        setSegments(nextSegments);
        setCurrentSegmentIndex(segmentIndex ?? 0);
        updateSegmentStatsCache(fileId, stats);
        segmentSummaryFileRef.current = fileId;
      } else {
        setSegmented(false);
        setSegments([]);
        setCurrentSegmentIndex(0);
        updateSegmentStatsCache(fileId, null);
        segmentSummaryFileRef.current = fileId;
      }
      const taskData = {
@@ -664,7 +685,7 @@ export default function LabelStudioTextEditor() {
    } finally {
      if (seq === initSeqRef.current) setLoadingTaskDetail(false);
    }
-  }, [iframeReady, message, postToIframe, project, projectId, updateSegmentStatsCache]);
+  }, [iframeReady, message, postToIframe, project, projectId, segments, updateSegmentStatsCache]);
  const advanceAfterSave = useCallback(async (fileId: string, segmentIndex?: number) => {
    if (!fileId) return;
@@ -979,6 +1000,7 @@ export default function LabelStudioTextEditor() {
    setSegmented(false);
    setSegments([]);
    setCurrentSegmentIndex(0);
    segmentSummaryFileRef.current = "";
    savedSnapshotsRef.current = {};
    segmentStatsSeqRef.current += 1;
    segmentStatsCacheRef.current = {};
--- a/frontend/src/pages/DataAnnotation/annotation.api.ts
+++ b/frontend/src/pages/DataAnnotation/annotation.api.ts
@@ -3,16 +3,19 @@ import { get, post, put, del, download } from "@/utils/request";
 // 导出格式类型
 export type ExportFormat = "json" | "jsonl" | "csv" | "coco" | "yolo";
 // Loosely typed parameter bag passed as `params` to the GET wrappers below.
 type RequestParams = Record<string, unknown>;
 // Loosely typed request body passed as `data` to the POST/PUT wrappers below.
 type RequestPayload = Record<string, unknown>;
 // 标注任务管理相关接口
-export function queryAnnotationTasksUsingGet(params?: any) {
+export function queryAnnotationTasksUsingGet(params?: RequestParams) {
  return get("/api/annotation/project", params);
 }
-export function createAnnotationTaskUsingPost(data: any) {
+export function createAnnotationTaskUsingPost(data: RequestPayload) {
  return post("/api/annotation/project", data);
 }
-export function syncAnnotationTaskUsingPost(data: any) {
+export function syncAnnotationTaskUsingPost(data: RequestPayload) {
  return post(`/api/annotation/task/sync`, data);
 }
@@ -25,7 +28,7 @@ export function getAnnotationTaskByIdUsingGet(taskId: string) {
  return get(`/api/annotation/project/${taskId}`);
 }
-export function updateAnnotationTaskByIdUsingPut(taskId: string, data: any) {
+export function updateAnnotationTaskByIdUsingPut(taskId: string, data: RequestPayload) {
  return put(`/api/annotation/project/${taskId}`, data);
 }
@@ -35,17 +38,17 @@ export function getTagConfigUsingGet() {
 }
 // 标注模板管理
-export function queryAnnotationTemplatesUsingGet(params?: any) {
+export function queryAnnotationTemplatesUsingGet(params?: RequestParams) {
  return get("/api/annotation/template", params);
 }
-export function createAnnotationTemplateUsingPost(data: any) {
+export function createAnnotationTemplateUsingPost(data: RequestPayload) {
  return post("/api/annotation/template", data);
 }
 export function updateAnnotationTemplateByIdUsingPut(
  templateId: string | number,
-  data: any
+  data: RequestPayload
 ) {
  return put(`/api/annotation/template/${templateId}`, data);
 }
@@ -65,7 +68,7 @@ export function getEditorProjectInfoUsingGet(projectId: string) {
  return get(`/api/annotation/editor/projects/${projectId}`);
 }
-export function listEditorTasksUsingGet(projectId: string, params?: any) {
+export function listEditorTasksUsingGet(projectId: string, params?: RequestParams) {
  return get(`/api/annotation/editor/projects/${projectId}/tasks`, params);
 }
@@ -77,11 +80,15 @@ export function getEditorTaskUsingGet(
  return get(`/api/annotation/editor/projects/${projectId}/tasks/${fileId}`, params);
 }
 /**
  * Requests the segment summary for one file of an annotation project.
  *
  * @param projectId - ID of the annotation project.
  * @param fileId - ID of the file inside that project.
  * @returns Whatever `get` returns for the file's `/segments` endpoint.
  */
 export function getEditorTaskSegmentsUsingGet(projectId: string, fileId: string) {
  const segmentsUrl = `/api/annotation/editor/projects/${projectId}/tasks/${fileId}/segments`;
  return get(segmentsUrl);
 }
 export function upsertEditorAnnotationUsingPut(
  projectId: string,
  fileId: string,
  data: {
-    annotation: any;
+    annotation: Record<string, unknown>;
    expectedUpdatedAt?: string;
    segmentIndex?: number;
  }
--- a/runtime/datamate-python/app/module/annotation/interface/editor.py
+++ b/runtime/datamate-python/app/module/annotation/interface/editor.py
@@ -20,6 +20,7 @@ from app.module.annotation.schema.editor import (
    EditorProjectInfo,
    EditorTaskListResponse,
    EditorTaskResponse,
    EditorTaskSegmentsResponse,
    UpsertAnnotationRequest,
    UpsertAnnotationResponse,
 )
@@ -87,6 +88,20 @@ async def get_editor_task(
    return StandardResponse(code=200, message="success", data=task)
@router.get(
    "/projects/{project_id}/tasks/{file_id}/segments",
    response_model=StandardResponse[EditorTaskSegmentsResponse],
 )
 async def list_editor_task_segments(
    project_id: str = Path(..., description="标注项目ID（t_dm_labeling_projects.id）"),
    file_id: str = Path(..., description="文件ID（t_dm_dataset_files.id）"),
    db: AsyncSession = Depends(get_db),
 ):
    """Return the segment summary for one file of an annotation project.

    Delegates to ``AnnotationEditorService.get_task_segments`` and wraps its
    result in a ``StandardResponse`` with ``code=200`` and ``message="success"``.
    Errors raised by the service are not handled here and propagate to the
    framework.
    """
    service = AnnotationEditorService(db)
    result = await service.get_task_segments(project_id, file_id)
    return StandardResponse(code=200, message="success", data=result)
@router.put(
    "/projects/{project_id}/tasks/{file_id}/annotation",
    response_model=StandardResponse[UpsertAnnotationResponse],
--- a/runtime/datamate-python/app/module/annotation/schema/editor.py
+++ b/runtime/datamate-python/app/module/annotation/schema/editor.py
@@ -79,12 +79,9 @@ class EditorTaskListResponse(BaseModel):
 class SegmentInfo(BaseModel):
-    """段落信息（用于文本分段标注）"""
+    """段落摘要（用于文本分段标注）"""
    idx: int = Field(..., description="段落索引")
    text: str = Field(..., description="段落文本")
    start: int = Field(..., description="在原文中的起始位置")
    end: int = Field(..., description="在原文中的结束位置")
    has_annotation: bool = Field(False, alias="hasAnnotation", description="该段落是否已有标注")
    line_index: int = Field(0, alias="lineIndex", description="JSONL 行索引（从0开始）")
    chunk_index: int = Field(0, alias="chunkIndex", description="行内分片索引（从0开始）")
@@ -100,13 +97,22 @@ class EditorTaskResponse(BaseModel):
    # 分段相关字段
    segmented: bool = Field(False, description="是否启用分段模式")
    segments: Optional[List[SegmentInfo]] = Field(None, description="段落列表")
    total_segments: int = Field(0, alias="totalSegments", description="总段落数")
    current_segment_index: int = Field(0, alias="currentSegmentIndex", description="当前段落索引")
    model_config = ConfigDict(populate_by_name=True)
 class EditorTaskSegmentsResponse(BaseModel):
    """Editor segment-summary response."""
    # Whether segmented mode is enabled; defaults to False.
    segmented: bool = Field(False, description="是否启用分段模式")
    # Segment summaries; defaults to a fresh empty list per instance.
    segments: List[SegmentInfo] = Field(default_factory=list, description="段落摘要列表")
    # Total number of segments (alias "totalSegments"); defaults to 0.
    total_segments: int = Field(0, alias="totalSegments", description="总段落数")
    # Accept the Python field names as well as the camelCase aliases on input.
    model_config = ConfigDict(populate_by_name=True)
 class UpsertAnnotationRequest(BaseModel):
    """保存/覆盖最终标注（Label Studio annotation 原始对象）"""
--- a/runtime/datamate-python/app/module/annotation/service/editor.py
+++ b/runtime/datamate-python/app/module/annotation/service/editor.py
@@ -37,6 +37,7 @@ from app.module.annotation.schema.editor import (
    EditorTaskListItem,
    EditorTaskListResponse,
    EditorTaskResponse,
    EditorTaskSegmentsResponse,
    SegmentInfo,
    UpsertAnnotationRequest,
    UpsertAnnotationResponse,
@@ -538,6 +539,49 @@ class AnnotationEditorService:
                return value
        return raw_text
    def _build_segment_contexts(
        self,
        records: List[Tuple[Optional[Dict[str, Any]], str]],
        record_texts: List[str],
        segment_annotation_keys: set[str],
    ) -> Tuple[List[SegmentInfo], List[Tuple[Optional[Dict[str, Any]], str, str, int, int]]]:
        """Build segment summaries and their matching contexts for a file.

        Each record contributes one segment, unless its text is longer than
        ``SEGMENT_THRESHOLD``; in that case the text is split with
        ``AnnotationTextSplitter`` and every chunk becomes its own segment.
        Segment indices are assigned sequentially across all records.

        Args:
            records: ``(payload, raw_text)`` pairs, one per record.
            record_texts: The text of each record, aligned with ``records``.
                ``None``/empty entries are treated as ``""``.
            segment_annotation_keys: Stringified segment indices that already
                have an annotation.

        Returns:
            ``(segments, segment_contexts)`` of equal length. Each context is
            ``(payload, raw_text, segment_text, record_index, chunk_index)``.
            When no segment is produced, a single placeholder segment and an
            empty context are returned so the result is never empty.
        """
        splitter = AnnotationTextSplitter(max_chars=self.SEGMENT_THRESHOLD)
        # ``segments`` is assigned further down, which makes it a local name;
        # it must exist before the first append or UnboundLocalError is raised.
        segments: List[SegmentInfo] = []
        segment_contexts: List[Tuple[Optional[Dict[str, Any]], str, str, int, int]] = []
        segment_cursor = 0

        for record_index, ((payload, raw_text), record_text) in enumerate(zip(records, record_texts)):
            normalized_text = record_text or ""
            if len(normalized_text) > self.SEGMENT_THRESHOLD:
                chunk_texts = [seg["text"] for seg in splitter.split(normalized_text)]
            else:
                # Short records are kept whole as a single chunk.
                chunk_texts = [normalized_text]

            for chunk_index, chunk_text in enumerate(chunk_texts):
                segments.append(
                    SegmentInfo(
                        idx=segment_cursor,
                        hasAnnotation=str(segment_cursor) in segment_annotation_keys,
                        lineIndex=record_index,
                        chunkIndex=chunk_index,
                    )
                )
                segment_contexts.append((payload, raw_text, chunk_text, record_index, chunk_index))
                segment_cursor += 1

        if not segments:
            # Guarantee at least one segment so callers can always index [0].
            segments = [SegmentInfo(idx=0, hasAnnotation=False, lineIndex=0, chunkIndex=0)]
            segment_contexts = [(None, "", "", 0, 0)]
        return segments, segment_contexts
    async def get_project_info(self, project_id: str) -> EditorProjectInfo:
        project = await self._get_project_or_404(project_id)
@@ -668,6 +712,87 @@ class AnnotationEditorService:
        return await self._build_text_task(project, file_record, file_id, segment_index)
    async def get_task_segments(
        self,
        project_id: str,
        file_id: str,
    ) -> EditorTaskSegmentsResponse:
        """Return the segment summary of one file in a TEXT annotation project.

        Args:
            project_id: ID of the annotation project.
            file_id: ID of the file; it must belong to the project's dataset.

        Returns:
            A response with ``segmented=True`` and the segment list when the
            file needs segmentation, otherwise an unsegmented, empty response
            (segmentation disabled for the project, or a single record whose
            text does not exceed ``SEGMENT_THRESHOLD``).

        Raises:
            HTTPException: 400 when the project's dataset is not a TEXT
                dataset; 404 when the file is missing or belongs to another
                dataset.
        """
        project = await self._get_project_or_404(project_id)

        raw_dataset_type = await self._get_dataset_type(project.dataset_id)
        if self._normalize_dataset_type(raw_dataset_type) != DATASET_TYPE_TEXT:
            raise HTTPException(
                status_code=400,
                detail="当前仅支持 TEXT 项目的段落摘要",
            )

        file_query = select(DatasetFiles).where(
            DatasetFiles.id == file_id,
            DatasetFiles.dataset_id == project.dataset_id,
        )
        file_record = (await self.db.execute(file_query)).scalar_one_or_none()
        if not file_record:
            raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}")

        # Segmentation switched off for the project: nothing to summarise.
        if not self._resolve_segmentation_enabled(project):
            return EditorTaskSegmentsResponse(segmented=False, segments=[], totalSegments=0)

        text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)
        assert isinstance(text_content, str)

        label_config = await self._resolve_project_label_config(project)
        primary_text_key = self._resolve_primary_text_key(label_config)

        # JSONL files yield one record per line; anything else is tried as a
        # single JSON payload and otherwise treated as plain text.
        lowered_name = str(getattr(file_record, "file_name", "")).lower()
        records: List[Tuple[Optional[Dict[str, Any]], str]] = []
        if lowered_name.endswith(JSONL_EXTENSION):
            records = self._parse_jsonl_records(text_content)
        else:
            json_payload = self._try_parse_json_payload(text_content)
            if json_payload:
                records = [(json_payload, text_content)]
        records = records or [(None, text_content)]

        record_texts = []
        for payload, raw_text in records:
            record_texts.append(
                self._resolve_primary_text_value(payload, raw_text, primary_text_key)
            )
        if not record_texts:
            record_texts = [text_content]

        # A single record that fits within the threshold is not segmented.
        fits_in_one_segment = len(records) <= 1 and all(
            len(text or "") <= self.SEGMENT_THRESHOLD for text in record_texts
        )
        if fits_in_one_segment:
            return EditorTaskSegmentsResponse(segmented=False, segments=[], totalSegments=0)

        annotation_query = select(AnnotationResult).where(
            AnnotationResult.project_id == project.id,
            AnnotationResult.file_id == file_id,
        )
        annotation_row = (await self.db.execute(annotation_query)).scalar_one_or_none()

        # Segment indices (as strings) that already carry an annotation.
        annotated_keys: set[str] = set()
        if annotation_row and isinstance(annotation_row.annotation, dict):
            annotated_keys = set(
                self._extract_segment_annotations(annotation_row.annotation).keys()
            )

        segment_summaries, _ = self._build_segment_contexts(
            records,
            record_texts,
            annotated_keys,
        )
        return EditorTaskSegmentsResponse(
            segmented=True,
            segments=segment_summaries,
            totalSegments=len(segment_summaries),
        )
    async def _build_text_task(
        self,
        project: LabelingProject,
@@ -723,7 +848,8 @@ class AnnotationEditorService:
        needs_segmentation = segmentation_enabled and (
            len(records) > 1 or any(len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts)
        )
-        segments: Optional[List[SegmentInfo]] = None
+        segments: List[SegmentInfo] = []
        segment_contexts: List[Tuple[Optional[Dict[str, Any]], str, str, int, int]] = []
        current_segment_index = 0
        display_text = record_texts[0] if record_texts else text_content
        selected_payload = records[0][0] if records else None
@@ -732,46 +858,13 @@ class AnnotationEditorService:
            display_text = "\n".join(record_texts) if record_texts else text_content
        if needs_segmentation:
-            splitter = AnnotationTextSplitter(max_chars=self.SEGMENT_THRESHOLD)
+            _, segment_contexts = self._build_segment_contexts(
-            segment_contexts: List[Tuple[Optional[Dict[str, Any]], str, str, int, int]] = []
+                records,
-            segments = []
+                record_texts,
-            segment_cursor = 0
+                segment_annotation_keys,
-
+            )
            for record_index, ((payload, raw_text), record_text) in enumerate(zip(records, record_texts)):
                normalized_text = record_text or ""
                if len(normalized_text) > self.SEGMENT_THRESHOLD:
                    raw_segments = splitter.split(normalized_text)
                    for chunk_index, seg in enumerate(raw_segments):
                        segments.append(SegmentInfo(
                            idx=segment_cursor,
                            text=seg["text"],
                            start=seg["start"],
                            end=seg["end"],
                            hasAnnotation=str(segment_cursor) in segment_annotation_keys,
                            lineIndex=record_index,
                            chunkIndex=chunk_index,
                        ))
                        segment_contexts.append((payload, raw_text, seg["text"], record_index, chunk_index))
                        segment_cursor += 1
                else:
                    segments.append(SegmentInfo(
                        idx=segment_cursor,
                        text=normalized_text,
                        start=0,
                        end=len(normalized_text),
                        hasAnnotation=str(segment_cursor) in segment_annotation_keys,
                        lineIndex=record_index,
                        chunkIndex=0,
                    ))
                    segment_contexts.append((payload, raw_text, normalized_text, record_index, 0))
                    segment_cursor += 1
            if not segments:
                segments = [SegmentInfo(idx=0, text="", start=0, end=0, hasAnnotation=False, lineIndex=0, chunkIndex=0)]
                segment_contexts = [(None, "", "", 0, 0)]
            current_segment_index = segment_index if segment_index is not None else 0
-            if current_segment_index < 0 or current_segment_index >= len(segments):
+            if current_segment_index < 0 or current_segment_index >= len(segment_contexts):
                current_segment_index = 0
            selected_payload, _, display_text, _, _ = segment_contexts[current_segment_index]
@@ -849,8 +942,7 @@ class AnnotationEditorService:
            task=task,
            annotationUpdatedAt=annotation_updated_at,
            segmented=needs_segmentation,
-            segments=segments,
+            totalSegments=len(segment_contexts) if needs_segmentation else 1,
            totalSegments=len(segments) if segments else 1,
            currentSegmentIndex=current_segment_index,
        )