feat(annotation): 文件版本更新时支持保留标注记录（位置偏移+文字匹配迁移）

新增 AnnotationMigrator 迁移算法，在 TEXT 类型数据集的文件版本更新时，可选通过 difflib 位置偏移映射和文字二次匹配将旧版本标注迁移到新版本上。前端版本切换对话框增加"保留标注"复选框（仅 TEXT 类型显示），后端 API 增加 preserveAnnotations 参数，完全向后兼容。 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-09 19:42:59 +08:00
parent 7d5a809772
commit 807c2289e2
6 changed files with 499 additions and 82 deletions
--- a/runtime/datamate-python/app/module/annotation/service/editor.py
+++ b/runtime/datamate-python/app/module/annotation/service/editor.py
@@ -61,6 +61,7 @@ from app.module.annotation.security import (
 from app.module.annotation.service.text_fetcher import (
    fetch_text_content_via_download_api,
 )
+from app.module.annotation.service.annotation_migrator import AnnotationMigrator

 logger = get_logger(__name__)

@@ -1734,16 +1735,21 @@ class AnnotationEditorService:
            "latestFileId": latest_file.id if latest_file else file_id,
        }

-    async def use_new_version(self, project_id: str, file_id: str) -> Dict[str, Any]:
+    async def use_new_version(
+        self, project_id: str, file_id: str,
+        preserve_annotations: bool = False,
+    ) -> Dict[str, Any]:
        """
-        使用文件新版本并清空标注
+        使用文件新版本

        如果文件有多个版本（通过 logical_path 关联），将标注切换到最新版本。
-        如果存在标注记录，会清空标注内容。
+        当 preserve_annotations=True 且数据集类型为 TEXT 时，尝试通过位置偏移 +
+        文字匹配将旧版本的标注迁移到新版本上；否则清空标注内容。

        Args:
            project_id: 标注项目ID
            file_id: 文件ID（当前关联的文件ID）
+            preserve_annotations: 是否尝试保留标注

        Returns:
            操作结果
@@ -1819,9 +1825,61 @@ class AnnotationEditorService:
        now = datetime.utcnow()

        if annotation:
-            # 存在标注记录：清空标注并更新文件版本
            previous_file_version = annotation.file_version

+            # 判断是否可以尝试迁移标注
+            dataset_type = self._normalize_dataset_type(
+                await self._get_dataset_type(project.dataset_id)
+            )
+            can_migrate = (
+                preserve_annotations
+                and dataset_type == DATASET_TYPE_TEXT
+                and self._has_annotation_result(annotation.annotation)
+            )
+
+            if can_migrate:
+                migrated_payload, migrated_count, failed_count = (
+                    await self._migrate_annotations_to_new_version(
+                        project=project,
+                        annotation=annotation,
+                        old_file_id=file_id,
+                        new_file_id=str(latest_file.id),
+                    )
+                )
+
+                has_result = self._has_annotation_result(migrated_payload)
+                final_status = (
+                    ANNOTATION_STATUS_ANNOTATED
+                    if has_result
+                    else ANNOTATION_STATUS_NO_ANNOTATION
+                )
+
+                annotation.file_id = str(latest_file.id)
+                annotation.annotation = migrated_payload
+                annotation.annotation_status = final_status
+                annotation.file_version = latest_file.version
+                annotation.updated_at = now
+
+                await self.db.commit()
+                await self.db.refresh(annotation)
+
+                await self._sync_annotation_to_knowledge(
+                    project,
+                    latest_file,
+                    migrated_payload,
+                    annotation.updated_at or now,
+                )
+
+                return {
+                    "fileId": str(latest_file.id),
+                    "previousFileVersion": previous_file_version,
+                    "currentFileVersion": latest_file.version,
+                    "message": f"已切换到新版本，{migrated_count} 条标注已迁移，{failed_count} 条无法迁移",
+                    "migratedCount": migrated_count,
+                    "failedCount": failed_count,
+                }
+
+            # 不迁移：清空标注
            cleared_payload: Dict[str, Any] = {}
            if isinstance(annotation.annotation, dict) and self._is_segmented_annotation(
                annotation.annotation
@@ -1879,3 +1937,91 @@ class AnnotationEditorService:
                "currentFileVersion": latest_file.version,
                "message": "已切换到新版本",
            }
+
+    async def _migrate_annotations_to_new_version(
+        self,
+        project: LabelingProject,
+        annotation: AnnotationResult,
+        old_file_id: str,
+        new_file_id: str,
+    ) -> tuple:
+        """
+        迁移标注到新版本文件。
+
+        Returns:
+            (migrated_payload, migrated_count, failed_count)
+        """
+        old_text = await self._fetch_text_content_via_download_api(
+            project.dataset_id, old_file_id
+        )
+        new_text = await self._fetch_text_content_via_download_api(
+            project.dataset_id, new_file_id
+        )
+
+        ann_data = annotation.annotation
+        if not isinstance(ann_data, dict):
+            return {}, 0, 0
+
+        total_migrated = 0
+        total_failed = 0
+
+        if self._is_segmented_annotation(ann_data):
+            # 分段标注：逐段迁移
+            segments = self._extract_segment_annotations(ann_data)
+            migrated_segments: Dict[str, Dict[str, Any]] = {}
+
+            for seg_key, seg_data in segments.items():
+                if not isinstance(seg_data, dict):
+                    continue
+                seg_results = seg_data.get(SEGMENT_RESULT_KEY, [])
+                if not isinstance(seg_results, list) or not seg_results:
+                    # 空标注段落，保留结构
+                    migrated_segments[seg_key] = dict(seg_data)
+                    migrated_segments[seg_key][SEGMENT_RESULT_KEY] = []
+                    continue
+
+                migration = AnnotationMigrator.migrate_annotation_results(
+                    old_text, new_text, seg_results
+                )
+                total_migrated += migration.migrated_count
+                total_failed += migration.failed_count
+
+                new_seg = dict(seg_data)
+                new_seg[SEGMENT_RESULT_KEY] = migration.migrated
+                migrated_segments[seg_key] = new_seg
+
+            seg_total = self._resolve_segment_total(ann_data)
+            if seg_total is None:
+                seg_total = len(migrated_segments)
+
+            migrated_payload: Dict[str, Any] = {
+                SEGMENTED_KEY: True,
+                "version": ann_data.get("version", 1),
+                SEGMENTS_KEY: migrated_segments,
+                SEGMENT_TOTAL_KEY: seg_total,
+            }
+        else:
+            # 非分段标注：直接迁移 result
+            results = ann_data.get(SEGMENT_RESULT_KEY, [])
+            if not isinstance(results, list):
+                results = []
+
+            migration = AnnotationMigrator.migrate_annotation_results(
+                old_text, new_text, results
+            )
+            total_migrated = migration.migrated_count
+            total_failed = migration.failed_count
+
+            migrated_payload = dict(ann_data)
+            migrated_payload[SEGMENT_RESULT_KEY] = migration.migrated
+
+        logger.info(
+            "标注迁移完成：project_id=%s old_file=%s new_file=%s migrated=%d failed=%d",
+            project.id,
+            old_file_id,
+            new_file_id,
+            total_migrated,
+            total_failed,
+        )
+
+        return migrated_payload, total_migrated, total_failed