feat(annotation): 文件版本更新时支持保留标注记录(位置偏移+文字匹配迁移)

新增 AnnotationMigrator 迁移算法,在 TEXT 类型数据集的文件版本更新时,
可选通过 difflib 位置偏移映射和文字二次匹配将旧版本标注迁移到新版本上。
前端版本切换对话框增加"保留标注"复选框(仅 TEXT 类型显示),后端 API
增加 preserveAnnotations 参数,完全向后兼容。

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-09 19:42:59 +08:00
parent 7d5a809772
commit 807c2289e2
6 changed files with 499 additions and 82 deletions

View File

@@ -61,6 +61,7 @@ from app.module.annotation.security import (
from app.module.annotation.service.text_fetcher import (
fetch_text_content_via_download_api,
)
from app.module.annotation.service.annotation_migrator import AnnotationMigrator
logger = get_logger(__name__)
@@ -1734,16 +1735,21 @@ class AnnotationEditorService:
"latestFileId": latest_file.id if latest_file else file_id,
}
async def use_new_version(self, project_id: str, file_id: str) -> Dict[str, Any]:
async def use_new_version(
self, project_id: str, file_id: str,
preserve_annotations: bool = False,
) -> Dict[str, Any]:
"""
使用文件新版本并清空标注
使用文件新版本
如果文件有多个版本(通过 logical_path 关联),将标注切换到最新版本。
如果存在标注记录,会清空标注内容。
当 preserve_annotations=True 且数据集类型为 TEXT 时,尝试通过位置偏移 +
文字匹配将旧版本的标注迁移到新版本上;否则清空标注内容。
Args:
project_id: 标注项目ID
file_id: 文件ID(当前关联的文件ID)
preserve_annotations: 是否尝试保留标注
Returns:
操作结果
@@ -1819,9 +1825,61 @@ class AnnotationEditorService:
now = datetime.utcnow()
if annotation:
# 存在标注记录:清空标注并更新文件版本
previous_file_version = annotation.file_version
# 判断是否可以尝试迁移标注
dataset_type = self._normalize_dataset_type(
await self._get_dataset_type(project.dataset_id)
)
can_migrate = (
preserve_annotations
and dataset_type == DATASET_TYPE_TEXT
and self._has_annotation_result(annotation.annotation)
)
if can_migrate:
migrated_payload, migrated_count, failed_count = (
await self._migrate_annotations_to_new_version(
project=project,
annotation=annotation,
old_file_id=file_id,
new_file_id=str(latest_file.id),
)
)
has_result = self._has_annotation_result(migrated_payload)
final_status = (
ANNOTATION_STATUS_ANNOTATED
if has_result
else ANNOTATION_STATUS_NO_ANNOTATION
)
annotation.file_id = str(latest_file.id)
annotation.annotation = migrated_payload
annotation.annotation_status = final_status
annotation.file_version = latest_file.version
annotation.updated_at = now
await self.db.commit()
await self.db.refresh(annotation)
await self._sync_annotation_to_knowledge(
project,
latest_file,
migrated_payload,
annotation.updated_at or now,
)
return {
"fileId": str(latest_file.id),
"previousFileVersion": previous_file_version,
"currentFileVersion": latest_file.version,
"message": f"已切换到新版本,{migrated_count} 条标注已迁移,{failed_count} 条无法迁移",
"migratedCount": migrated_count,
"failedCount": failed_count,
}
# 不迁移:清空标注
cleared_payload: Dict[str, Any] = {}
if isinstance(annotation.annotation, dict) and self._is_segmented_annotation(
annotation.annotation
@@ -1879,3 +1937,91 @@ class AnnotationEditorService:
"currentFileVersion": latest_file.version,
"message": "已切换到新版本",
}
async def _migrate_annotations_to_new_version(
    self,
    project: LabelingProject,
    annotation: AnnotationResult,
    old_file_id: str,
    new_file_id: str,
) -> tuple:
    """
    Migrate an annotation payload from the old file version to the new one.

    The text of both file versions is fetched through the download API and
    every result list found in the payload is handed to
    ``AnnotationMigrator.migrate_annotation_results``. Segmented payloads
    are migrated segment by segment; non-segmented payloads have their
    single result list migrated in place on a shallow copy.

    Args:
        project: Labeling project; ``dataset_id`` is used to fetch the
            file contents and ``id`` is only used for logging.
        annotation: Annotation record whose ``annotation`` payload is
            migrated. The record itself is not modified here.
        old_file_id: ID of the file version the annotation refers to.
        new_file_id: ID of the file version to migrate the annotation to.

    Returns:
        A ``(migrated_payload, migrated_count, failed_count)`` tuple.
        ``({}, 0, 0)`` is returned when the stored payload is not a dict.
    """
    ann_data = annotation.annotation
    # Validate the payload before doing any I/O: a non-dict payload cannot
    # be migrated, so downloading both file versions would be wasted work
    # (and a fetch failure would needlessly abort the version switch).
    if not isinstance(ann_data, dict):
        return {}, 0, 0

    old_text = await self._fetch_text_content_via_download_api(
        project.dataset_id, old_file_id
    )
    new_text = await self._fetch_text_content_via_download_api(
        project.dataset_id, new_file_id
    )

    total_migrated = 0
    total_failed = 0

    if self._is_segmented_annotation(ann_data):
        # Segmented annotation: migrate each segment independently.
        # NOTE(review): every segment is migrated against the whole-file
        # texts. If segment results store offsets relative to the segment
        # rather than to the full file, this needs per-segment text
        # instead -- confirm against the segmentation logic.
        segments = self._extract_segment_annotations(ann_data)
        migrated_segments: Dict[str, Dict[str, Any]] = {}

        for seg_key, seg_data in segments.items():
            # Malformed (non-dict) segment entries are dropped.
            if not isinstance(seg_data, dict):
                continue

            seg_results = seg_data.get(SEGMENT_RESULT_KEY, [])
            if not isinstance(seg_results, list) or not seg_results:
                # Nothing to migrate in this segment: keep its structure
                # but normalise the result list to an empty list.
                migrated_segments[seg_key] = {
                    **seg_data,
                    SEGMENT_RESULT_KEY: [],
                }
                continue

            migration = AnnotationMigrator.migrate_annotation_results(
                old_text, new_text, seg_results
            )
            total_migrated += migration.migrated_count
            total_failed += migration.failed_count
            migrated_segments[seg_key] = {
                **seg_data,
                SEGMENT_RESULT_KEY: migration.migrated,
            }

        # Fall back to the number of migrated segments when the payload
        # does not yield a segment total.
        seg_total = self._resolve_segment_total(ann_data)
        if seg_total is None:
            seg_total = len(migrated_segments)

        migrated_payload: Dict[str, Any] = {
            SEGMENTED_KEY: True,
            "version": ann_data.get("version", 1),
            SEGMENTS_KEY: migrated_segments,
            SEGMENT_TOTAL_KEY: seg_total,
        }
    else:
        # Non-segmented annotation: migrate the single result list.
        results = ann_data.get(SEGMENT_RESULT_KEY, [])
        if not isinstance(results, list):
            results = []

        migration = AnnotationMigrator.migrate_annotation_results(
            old_text, new_text, results
        )
        total_migrated = migration.migrated_count
        total_failed = migration.failed_count

        # Shallow copy so the stored payload is not mutated in place.
        migrated_payload = dict(ann_data)
        migrated_payload[SEGMENT_RESULT_KEY] = migration.migrated

    logger.info(
        "标注迁移完成:project_id=%s old_file=%s new_file=%s migrated=%d failed=%d",
        project.id,
        old_file_id,
        new_file_id,
        total_migrated,
        total_failed,
    )

    return migrated_payload, total_migrated, total_failed