feat(annotation): 文件版本更新时支持保留标注记录（位置偏移+文字匹配迁移）

新增 AnnotationMigrator 迁移算法：在 TEXT 类型数据集的文件版本更新时，可选择先通过 difflib 位置偏移映射、再通过文字二次匹配，将旧版本标注迁移到新版本上。前端版本切换对话框增加"保留标注"复选框（仅 TEXT 类型显示），后端 API 增加 preserveAnnotations 参数，完全向后兼容。

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 19:42:59 +08:00
parent 7d5a809772
commit 807c2289e2
6 changed files with 499 additions and 82 deletions
--- a/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx
+++ b/frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx
@@ -1,5 +1,5 @@
 import { useCallback, useEffect, useMemo, useRef, useState } from "react";
-import { App, Button, Card, List, Spin, Typography, Tag, Empty } from "antd";
+import { App, Button, Card, Checkbox, List, Spin, Typography, Tag, Empty } from "antd";
 import { LeftOutlined, ReloadOutlined, SaveOutlined, MenuFoldOutlined, MenuUnfoldOutlined } from "@ant-design/icons";
 import { useNavigate, useParams } from "react-router";

@@ -11,6 +11,7 @@ import {
  checkFileVersionUsingGet,
  applyNewVersionUsingPost,
  type FileVersionCheckResponse,
+  type UseNewVersionResponse,
 } from "../annotation.api";
 import { AnnotationResultStatus } from "../annotation.model";

@@ -242,6 +243,7 @@ export default function LabelStudioTextEditor() {
  } | null>(null);
  const savedSnapshotsRef = useRef<Record<string, string>>({});
  const pendingAutoAdvanceRef = useRef(false);
+  const preserveAnnotationsRef = useRef(true);

  const [loadingProject, setLoadingProject] = useState(true);
  const [loadingTasks, setLoadingTasks] = useState(false);
@@ -594,18 +596,31 @@ export default function LabelStudioTextEditor() {
  const handleUseNewVersion = useCallback(async () => {
    if (!selectedFileId) return;

+    // Reset ref to default before opening dialog
+    preserveAnnotationsRef.current = true;
+
    modal.confirm({
      title: "确认使用新版本",
      content: (
        <div className="flex flex-col gap-2">
          <Typography.Text>
-            确认使用新版本？这将清空当前标注并使用最新版本的文件内容。
+            确认使用新版本？这将使用最新版本的文件内容。
          </Typography.Text>
          {fileVersionInfo && (
            <Typography.Text type="secondary">
              当前标注版本: {fileVersionInfo.annotationFileVersion}，最新文件版本: {fileVersionInfo.currentFileVersion}
            </Typography.Text>
          )}
+          {isTextProject && (
+            <Checkbox
+              defaultChecked={true}
+              onChange={(e) => {
+                preserveAnnotationsRef.current = e.target.checked;
+              }}
+            >
+              尝试保留已有标注（根据文字匹配迁移）
+            </Checkbox>
+          )}
        </div>
      ),
      okText: "确认",
@@ -615,8 +630,19 @@ export default function LabelStudioTextEditor() {
        if (!projectId || !selectedFileId) return;
        setUsingNewVersion(true);
        try {
-          await applyNewVersionUsingPost(projectId, selectedFileId);
-          message.success("已使用新版本并清空标注");
+          const resp = (await applyNewVersionUsingPost(
+            projectId,
+            selectedFileId,
+            preserveAnnotationsRef.current,
+          )) as ApiResponse<UseNewVersionResponse>;
+          const data = resp?.data;
+          if (data?.migratedCount != null) {
+            message.success(
+              `已切换到新版本，${data.migratedCount} 条标注已迁移，${data.failedCount ?? 0} 条无法迁移`,
+            );
+          } else {
+            message.success("已使用新版本并清空标注");
+          }
          setFileVersionInfo(null);
          await loadTasks({ mode: "reset" });
          await initEditorForFile(selectedFileId);
@@ -628,7 +654,7 @@ export default function LabelStudioTextEditor() {
        }
      },
    });
-  }, [modal, message, projectId, selectedFileId, fileVersionInfo, loadTasks, initEditorForFile]);
+  }, [modal, message, projectId, selectedFileId, fileVersionInfo, isTextProject, loadTasks, initEditorForFile]);

  const advanceAfterSave = useCallback(async (fileId: string, segmentIndex?: number) => {
    if (!fileId) return;
--- a/frontend/src/pages/DataAnnotation/annotation.api.ts
+++ b/frontend/src/pages/DataAnnotation/annotation.api.ts
@@ -141,10 +141,19 @@ export interface UseNewVersionResponse {
  previousFileVersion: number | null;
  currentFileVersion: number;
  message: string;
+  migratedCount?: number;
+  failedCount?: number;
 }

-export function applyNewVersionUsingPost(projectId: string, fileId: string) {
-  return post(`/api/annotation/editor/projects/${projectId}/files/${fileId}/use-new-version`, {});
+export function applyNewVersionUsingPost(
+  projectId: string,
+  fileId: string,
+  preserveAnnotations: boolean = false, // optional; callers that omit it send preserveAnnotations: false
+) {
+  return post(
+    `/api/annotation/editor/projects/${projectId}/files/${fileId}/use-new-version`,
+    { preserveAnnotations }, // request body; key matches the backend model's "preserveAnnotations" alias
+  );
+}


--- a/runtime/datamate-python/app/module/annotation/interface/editor.py
+++ b/runtime/datamate-python/app/module/annotation/interface/editor.py
@@ -22,6 +22,7 @@ from app.module.annotation.schema.editor import (
    EditorTaskSegmentResponse,
    EditorTaskResponse,
    FileVersionCheckResponse,
+    UseNewVersionRequest,
    UseNewVersionResponse,
    UpsertAnnotationRequest,
    UpsertAnnotationResponse,
@@ -158,12 +159,14 @@ async def check_file_version(
 async def use_new_version(
    project_id: str = Path(..., description="标注项目ID（t_dm_labeling_projects.id）"),
    file_id: str = Path(..., description="文件ID（t_dm_dataset_files.id）"),
+    request: Optional[UseNewVersionRequest] = None,
    db: AsyncSession = Depends(get_db),
    user_context: RequestUserContext = Depends(get_request_user_context),
 ):
    """
-    使用文件新版本并清空标注
+    使用文件新版本（可选保留标注）
    """
+    preserve = request.preserve_annotations if request else False
    service = AnnotationEditorService(db, user_context)
-    result = await service.use_new_version(project_id, file_id)
+    result = await service.use_new_version(project_id, file_id, preserve_annotations=preserve)
    return StandardResponse(code=200, message="success", data=result)
--- a/runtime/datamate-python/app/module/annotation/schema/editor.py
+++ b/runtime/datamate-python/app/module/annotation/schema/editor.py
@@ -220,6 +220,18 @@ class FileVersionCheckResponse(BaseModel):
    model_config = ConfigDict(populate_by_name=True)


+class UseNewVersionRequest(BaseModel):
+    """使用新版本请求"""  # Request body of the use-new-version endpoint (docstring left as-is: Pydantic publishes it in the schema)
+
+    preserve_annotations: bool = Field(
+        False,  # default: do not try to preserve annotations
+        alias="preserveAnnotations",  # camelCase name used on the wire
+        description="是否尝试保留标注（基于文字匹配迁移）",
+    )
+
+    model_config = ConfigDict(populate_by_name=True)  # accept the field name as well as its alias
+
+
 class UseNewVersionResponse(BaseModel):
    """使用新版本响应"""

@@ -231,5 +243,11 @@ class UseNewVersionResponse(BaseModel):
        ..., alias="currentFileVersion", description="当前文件版本"
    )
    message: str = Field(..., description="操作结果消息")
+    migrated_count: Optional[int] = Field(
+        None, alias="migratedCount", description="成功迁移的标注数量"
+    )
+    failed_count: Optional[int] = Field(
+        None, alias="failedCount", description="无法迁移的标注数量"
+    )

    model_config = ConfigDict(populate_by_name=True)
--- a/runtime/datamate-python/app/module/annotation/service/annotation_migrator.py
+++ b/runtime/datamate-python/app/module/annotation/service/annotation_migrator.py
@@ -0,0 +1,215 @@
+"""
+标注迁移器
+
+在文件版本更新时，将旧版本的标注结果迁移到新版本文本上。
+仅适用于 TEXT 类型数据集（标注含有 start/end 字符位置和 text 文本片段）。
+
+迁移算法：
+1. 对没有 value.start/value.end 的标注项（如 choices），直接保留不变
+2. 对有位置信息的标注项：
+   a. 用 SequenceMatcher 计算旧位置 -> 新位置的偏移映射
+   b. 验证映射后的文本是否匹配
+   c. 若不匹配，全文搜索最近的匹配位置
+   d. 若仍找不到，记入失败列表
+"""
+
+from __future__ import annotations
+
+import difflib
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional
+
+
+@dataclass
+class MigrationResult:
+    """Outcome of migrating one list of annotation result items onto a new text."""
+
+    migrated: List[Dict[str, Any]] = field(default_factory=list)  # items kept (relocated, or carried over unchanged)
+    failed: List[Dict[str, Any]] = field(default_factory=list)  # original items that could not be relocated
+    total: int = 0  # number of input items
+    migrated_count: int = 0  # set to len(migrated) by migrate_annotation_results
+    failed_count: int = 0  # set to len(failed) by migrate_annotation_results
+
+
+class AnnotationMigrator:
+    """Core algorithm for relocating text-span annotations when a file's text changes."""
+
+    @staticmethod
+    def migrate_annotation_results(
+        old_text: str,
+        new_text: str,
+        results: List[Dict[str, Any]],
+    ) -> MigrationResult:
+        """
+        Migrate the items of an annotation result list that carry position information.
+
+        Args:
+            old_text: text of the old file version
+            new_text: text of the new file version
+            results: Label Studio annotation result array
+
+        Returns:
+            MigrationResult holding the migrated and the failed items
+        """
+        if not results:
+            return MigrationResult()
+
+        offset_map = AnnotationMigrator._build_offset_map(old_text, new_text)
+
+        migrated: List[Dict[str, Any]] = []
+        failed: List[Dict[str, Any]] = []
+
+        for item in results:
+            value = item.get("value") if isinstance(item, dict) else None
+            if not isinstance(value, dict):
+                # No value dict: keep the item unchanged (it is counted as migrated)
+                migrated.append(item)
+                continue
+
+            old_start = value.get("start")
+            old_end = value.get("end")
+
+            if old_start is None or old_end is None:
+                # No position info (e.g. choices results): keep the item unchanged
+                migrated.append(item)
+                continue
+
+            if not isinstance(old_start, (int, float)) or not isinstance(
+                old_end, (int, float)
+            ):
+                migrated.append(item)  # non-numeric start/end: keep the item unchanged
+                continue
+
+            old_start = int(old_start)
+            old_end = int(old_end)
+            target_text = value.get("text", "")  # a missing "text" becomes "", which can only match an empty span
+
+            # First attempt: translate both ends through the offset map
+            new_start = offset_map(old_start)
+            new_end = offset_map(old_end)
+
+            if new_start is not None and new_end is not None:
+                new_start = int(new_start)
+                new_end = int(new_end)
+                if (
+                    0 <= new_start <= new_end <= len(new_text)
+                    and new_text[new_start:new_end] == target_text
+                ):
+                    # Mapped span is in range and its text matches exactly
+                    new_item = _deep_copy_item(item)
+                    new_item["value"] = dict(value)
+                    new_item["value"]["start"] = new_start
+                    new_item["value"]["end"] = new_end
+                    migrated.append(new_item)
+                    continue
+
+            # Offset mapping failed or the text differs: search the whole new text
+            if target_text:
+                hint_pos = new_start if new_start is not None else old_start
+                found_pos = AnnotationMigrator._find_nearest_occurrence(
+                    new_text, target_text, hint_pos
+                )
+                if found_pos is not None:
+                    new_item = _deep_copy_item(item)
+                    new_item["value"] = dict(value)
+                    new_item["value"]["start"] = found_pos
+                    new_item["value"]["end"] = found_pos + len(target_text)
+                    migrated.append(new_item)
+                    continue
+
+            # Could not be relocated
+            failed.append(item)
+
+        total = len(results)
+        return MigrationResult(
+            migrated=migrated,
+            failed=failed,
+            total=total,
+            migrated_count=len(migrated),
+            failed_count=len(failed),
+        )
+
+    @staticmethod
+    def _build_offset_map(
+        old_text: str, new_text: str
+    ) -> Callable[[int], Optional[int]]:
+        """
+        Build an old-position -> new-position mapping function with difflib.SequenceMatcher.
+
+        The returned callable resolves an old character position via the matching blocks; it may return None.
+        """
+        matcher = difflib.SequenceMatcher(None, old_text, new_text, autojunk=False)
+        matching_blocks = matcher.get_matching_blocks()
+
+        # For each matching block, an old position i inside it maps to b + (i - a).
+        # Each element of matching_blocks is (a, b, size), meaning
+        # old_text[a:a+size] == new_text[b:b+size]
+        blocks = [
+            (a, b, size) for a, b, size in matching_blocks if size > 0  # drops difflib's zero-length sentinel block
+        ]
+
+        def map_position(old_pos: int) -> Optional[int]:
+            for a, b, size in blocks:
+                if a <= old_pos < a + size:
+                    return b + (old_pos - a)
+            # Position is outside every matching block: extrapolate from the nearest block
+            if not blocks:
+                return None
+
+            # Pick the block whose start or end is closest to old_pos
+            best_block = None
+            best_distance = float("inf")
+            for a, b, size in blocks:
+                # Distance to the block's start and to its (exclusive) end
+                dist_start = abs(old_pos - a)
+                dist_end = abs(old_pos - (a + size))
+                dist = min(dist_start, dist_end)
+                if dist < best_distance:
+                    best_distance = dist
+                    best_block = (a, b, size)
+
+            if best_block is None:
+                return None
+
+            a, b, size = best_block
+            # Extrapolate using that block's old -> new offset
+            offset = old_pos - a
+            new_pos = b + offset
+            if 0 <= new_pos <= len(new_text):
+                return new_pos
+            return None
+
+        return map_position
+
+    @staticmethod
+    def _find_nearest_occurrence(
+        text: str, target: str, hint_pos: int
+    ) -> Optional[int]:
+        """
+        Find target in text and return the start index of the occurrence closest to hint_pos, or None.
+        """
+        if not target:
+            return None
+
+        positions: List[int] = []
+        start = 0
+        while True:
+            idx = text.find(target, start)
+            if idx < 0:
+                break
+            positions.append(idx)
+            start = idx + 1  # advance by one so overlapping occurrences are collected too
+
+        if not positions:
+            return None
+
+        # Return the occurrence closest to hint_pos (min() keeps the earliest one on ties)
+        return min(positions, key=lambda pos: abs(pos - hint_pos))
+
+
+def _deep_copy_item(item: Dict[str, Any]) -> Dict[str, Any]:
+    """Copy an annotation item: a new top-level dict plus a new "value" dict, each copied one level deep (not a true deep copy)."""
+    new_item = dict(item)
+    if "value" in new_item and isinstance(new_item["value"], dict):
+        new_item["value"] = dict(new_item["value"])
+    return new_item
--- a/runtime/datamate-python/app/module/annotation/service/editor.py
+++ b/runtime/datamate-python/app/module/annotation/service/editor.py
@@ -61,6 +61,7 @@ from app.module.annotation.security import (
 from app.module.annotation.service.text_fetcher import (
    fetch_text_content_via_download_api,
 )
+from app.module.annotation.service.annotation_migrator import AnnotationMigrator

 logger = get_logger(__name__)

@@ -1734,16 +1735,21 @@ class AnnotationEditorService:
            "latestFileId": latest_file.id if latest_file else file_id,
        }

-    async def use_new_version(self, project_id: str, file_id: str) -> Dict[str, Any]:
+    async def use_new_version(
+        self, project_id: str, file_id: str,
+        preserve_annotations: bool = False,
+    ) -> Dict[str, Any]:
        """
-        使用文件新版本并清空标注
+        使用文件新版本

        如果文件有多个版本（通过 logical_path 关联），将标注切换到最新版本。
-        如果存在标注记录，会清空标注内容。
+        当 preserve_annotations=True 且数据集类型为 TEXT 时，尝试通过位置偏移 +
+        文字匹配将旧版本的标注迁移到新版本上；否则清空标注内容。

        Args:
            project_id: 标注项目ID
            file_id: 文件ID（当前关联的文件ID）
+            preserve_annotations: 是否尝试保留标注

        Returns:
            操作结果
@@ -1819,9 +1825,61 @@ class AnnotationEditorService:
        now = datetime.utcnow()

        if annotation:
-            # 存在标注记录：清空标注并更新文件版本
            previous_file_version = annotation.file_version

+            # 判断是否可以尝试迁移标注
+            dataset_type = self._normalize_dataset_type(
+                await self._get_dataset_type(project.dataset_id)
+            )
+            can_migrate = (
+                preserve_annotations
+                and dataset_type == DATASET_TYPE_TEXT
+                and self._has_annotation_result(annotation.annotation)
+            )
+
+            if can_migrate:
+                migrated_payload, migrated_count, failed_count = (
+                    await self._migrate_annotations_to_new_version(
+                        project=project,
+                        annotation=annotation,
+                        old_file_id=file_id,
+                        new_file_id=str(latest_file.id),
+                    )
+                )
+
+                has_result = self._has_annotation_result(migrated_payload)
+                final_status = (
+                    ANNOTATION_STATUS_ANNOTATED
+                    if has_result
+                    else ANNOTATION_STATUS_NO_ANNOTATION
+                )
+
+                annotation.file_id = str(latest_file.id)
+                annotation.annotation = migrated_payload
+                annotation.annotation_status = final_status
+                annotation.file_version = latest_file.version
+                annotation.updated_at = now
+
+                await self.db.commit()
+                await self.db.refresh(annotation)
+
+                await self._sync_annotation_to_knowledge(
+                    project,
+                    latest_file,
+                    migrated_payload,
+                    annotation.updated_at or now,
+                )
+
+                return {
+                    "fileId": str(latest_file.id),
+                    "previousFileVersion": previous_file_version,
+                    "currentFileVersion": latest_file.version,
+                    "message": f"已切换到新版本，{migrated_count} 条标注已迁移，{failed_count} 条无法迁移",
+                    "migratedCount": migrated_count,
+                    "failedCount": failed_count,
+                }
+
+            # 不迁移：清空标注
            cleared_payload: Dict[str, Any] = {}
            if isinstance(annotation.annotation, dict) and self._is_segmented_annotation(
                annotation.annotation
@@ -1879,3 +1937,91 @@ class AnnotationEditorService:
                "currentFileVersion": latest_file.version,
                "message": "已切换到新版本",
            }
+
+    async def _migrate_annotations_to_new_version(
+        self,
+        project: LabelingProject,
+        annotation: AnnotationResult,
+        old_file_id: str,
+        new_file_id: str,
+    ) -> tuple:
+        """
+        Migrate this annotation's results onto the text of the new file version.
+
+        Returns:
+            (migrated_payload, migrated_count, failed_count)
+        """
+        old_text = await self._fetch_text_content_via_download_api(
+            project.dataset_id, old_file_id
+        )
+        new_text = await self._fetch_text_content_via_download_api(
+            project.dataset_id, new_file_id
+        )
+
+        ann_data = annotation.annotation
+        if not isinstance(ann_data, dict):
+            return {}, 0, 0
+
+        total_migrated = 0
+        total_failed = 0
+
+        if self._is_segmented_annotation(ann_data):
+            # Segmented annotation: migrate segment by segment. NOTE(review): each segment is matched against the full-file texts; confirm segment offsets are file-relative
+            segments = self._extract_segment_annotations(ann_data)
+            migrated_segments: Dict[str, Dict[str, Any]] = {}
+
+            for seg_key, seg_data in segments.items():
+                if not isinstance(seg_data, dict):
+                    continue  # non-dict segments are left out of the migrated payload
+                seg_results = seg_data.get(SEGMENT_RESULT_KEY, [])
+                if not isinstance(seg_results, list) or not seg_results:
+                    # Segment has no results: keep its structure with an empty result list
+                    migrated_segments[seg_key] = dict(seg_data)
+                    migrated_segments[seg_key][SEGMENT_RESULT_KEY] = []
+                    continue
+
+                migration = AnnotationMigrator.migrate_annotation_results(
+                    old_text, new_text, seg_results
+                )
+                total_migrated += migration.migrated_count
+                total_failed += migration.failed_count
+
+                new_seg = dict(seg_data)
+                new_seg[SEGMENT_RESULT_KEY] = migration.migrated
+                migrated_segments[seg_key] = new_seg
+
+            seg_total = self._resolve_segment_total(ann_data)
+            if seg_total is None:
+                seg_total = len(migrated_segments)
+
+            migrated_payload: Dict[str, Any] = {
+                SEGMENTED_KEY: True,
+                "version": ann_data.get("version", 1),
+                SEGMENTS_KEY: migrated_segments,
+                SEGMENT_TOTAL_KEY: seg_total,
+            }
+        else:
+            # Non-segmented annotation: migrate the result list directly
+            results = ann_data.get(SEGMENT_RESULT_KEY, [])
+            if not isinstance(results, list):
+                results = []
+
+            migration = AnnotationMigrator.migrate_annotation_results(
+                old_text, new_text, results
+            )
+            total_migrated = migration.migrated_count
+            total_failed = migration.failed_count
+
+            migrated_payload = dict(ann_data)
+            migrated_payload[SEGMENT_RESULT_KEY] = migration.migrated
+
+        logger.info(
+            "标注迁移完成：project_id=%s old_file=%s new_file=%s migrated=%d failed=%d",
+            project.id,
+            old_file_id,
+            new_file_id,
+            total_migrated,
+            total_failed,
+        )
+
+        return migrated_payload, total_migrated, total_failed