You've already forked DataMate
feat(annotation): 文件版本更新时支持保留标注记录(位置偏移+文字匹配迁移)
新增 AnnotationMigrator 迁移算法,在 TEXT 类型数据集的文件版本更新时,可选通过 difflib 位置偏移映射和文字二次匹配将旧版本标注迁移到新版本上。
前端版本切换对话框增加"保留标注"复选框(仅 TEXT 类型显示),后端 API 增加 preserveAnnotations 参数,完全向后兼容。

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -61,6 +61,7 @@ from app.module.annotation.security import (
|
||||
from app.module.annotation.service.text_fetcher import (
|
||||
fetch_text_content_via_download_api,
|
||||
)
|
||||
from app.module.annotation.service.annotation_migrator import AnnotationMigrator
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@@ -1734,16 +1735,21 @@ class AnnotationEditorService:
|
||||
"latestFileId": latest_file.id if latest_file else file_id,
|
||||
}
|
||||
|
||||
async def use_new_version(self, project_id: str, file_id: str) -> Dict[str, Any]:
|
||||
async def use_new_version(
|
||||
self, project_id: str, file_id: str,
|
||||
preserve_annotations: bool = False,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
使用文件新版本并清空标注
|
||||
使用文件新版本
|
||||
|
||||
如果文件有多个版本(通过 logical_path 关联),将标注切换到最新版本。
|
||||
如果存在标注记录,会清空标注内容。
|
||||
当 preserve_annotations=True 且数据集类型为 TEXT 时,尝试通过位置偏移 +
|
||||
文字匹配将旧版本的标注迁移到新版本上;否则清空标注内容。
|
||||
|
||||
Args:
|
||||
project_id: 标注项目ID
|
||||
file_id: 文件ID(当前关联的文件ID)
|
||||
preserve_annotations: 是否尝试保留标注
|
||||
|
||||
Returns:
|
||||
操作结果
|
||||
@@ -1819,9 +1825,61 @@ class AnnotationEditorService:
|
||||
now = datetime.utcnow()
|
||||
|
||||
if annotation:
|
||||
# 存在标注记录:清空标注并更新文件版本
|
||||
previous_file_version = annotation.file_version
|
||||
|
||||
# 判断是否可以尝试迁移标注
|
||||
dataset_type = self._normalize_dataset_type(
|
||||
await self._get_dataset_type(project.dataset_id)
|
||||
)
|
||||
can_migrate = (
|
||||
preserve_annotations
|
||||
and dataset_type == DATASET_TYPE_TEXT
|
||||
and self._has_annotation_result(annotation.annotation)
|
||||
)
|
||||
|
||||
if can_migrate:
|
||||
migrated_payload, migrated_count, failed_count = (
|
||||
await self._migrate_annotations_to_new_version(
|
||||
project=project,
|
||||
annotation=annotation,
|
||||
old_file_id=file_id,
|
||||
new_file_id=str(latest_file.id),
|
||||
)
|
||||
)
|
||||
|
||||
has_result = self._has_annotation_result(migrated_payload)
|
||||
final_status = (
|
||||
ANNOTATION_STATUS_ANNOTATED
|
||||
if has_result
|
||||
else ANNOTATION_STATUS_NO_ANNOTATION
|
||||
)
|
||||
|
||||
annotation.file_id = str(latest_file.id)
|
||||
annotation.annotation = migrated_payload
|
||||
annotation.annotation_status = final_status
|
||||
annotation.file_version = latest_file.version
|
||||
annotation.updated_at = now
|
||||
|
||||
await self.db.commit()
|
||||
await self.db.refresh(annotation)
|
||||
|
||||
await self._sync_annotation_to_knowledge(
|
||||
project,
|
||||
latest_file,
|
||||
migrated_payload,
|
||||
annotation.updated_at or now,
|
||||
)
|
||||
|
||||
return {
|
||||
"fileId": str(latest_file.id),
|
||||
"previousFileVersion": previous_file_version,
|
||||
"currentFileVersion": latest_file.version,
|
||||
"message": f"已切换到新版本,{migrated_count} 条标注已迁移,{failed_count} 条无法迁移",
|
||||
"migratedCount": migrated_count,
|
||||
"failedCount": failed_count,
|
||||
}
|
||||
|
||||
# 不迁移:清空标注
|
||||
cleared_payload: Dict[str, Any] = {}
|
||||
if isinstance(annotation.annotation, dict) and self._is_segmented_annotation(
|
||||
annotation.annotation
|
||||
@@ -1879,3 +1937,91 @@ class AnnotationEditorService:
|
||||
"currentFileVersion": latest_file.version,
|
||||
"message": "已切换到新版本",
|
||||
}
|
||||
|
||||
async def _migrate_annotations_to_new_version(
    self,
    project: LabelingProject,
    annotation: AnnotationResult,
    old_file_id: str,
    new_file_id: str,
) -> tuple:
    """
    Migrate an annotation payload from an old file version to a new one.

    The text of both file versions is downloaded through
    ``self._fetch_text_content_via_download_api`` and every result list is
    passed to ``AnnotationMigrator.migrate_annotation_results`` together
    with the old and new text.

    * Segmented payloads are migrated segment by segment. Segments that are
      not dicts are skipped; segments whose result list is empty (or not a
      list) keep their structure with an empty result list. The payload is
      rebuilt from the segmented flag, ``version`` (default ``1``), the
      migrated segments and the segment total, so other top-level keys of
      the original payload are not carried over.
    * Non-segmented payloads are shallow-copied and only their result list
      is replaced.

    Args:
        project: Labeling project; ``dataset_id`` is used to fetch the text
            and ``id`` is logged.
        annotation: Annotation record whose ``annotation`` payload is
            migrated. The record itself is not modified here.
        old_file_id: ID of the file version the annotations were made on.
        new_file_id: ID of the file version to migrate onto.

    Returns:
        ``(migrated_payload, migrated_count, failed_count)``. When the
        stored payload is not a dict, ``({}, 0, 0)`` is returned.
    """
    ann_data = annotation.annotation
    if not isinstance(ann_data, dict):
        # Nothing to migrate: bail out before paying for two downloads
        # whose results would be discarded anyway.
        return {}, 0, 0

    old_text = await self._fetch_text_content_via_download_api(
        project.dataset_id, old_file_id
    )
    new_text = await self._fetch_text_content_via_download_api(
        project.dataset_id, new_file_id
    )

    total_migrated = 0
    total_failed = 0

    if self._is_segmented_annotation(ann_data):
        # Segmented annotation: migrate each segment's results separately.
        # NOTE(review): every segment is migrated against the full old/new
        # file text. If segment results store offsets relative to the
        # segment's own text rather than the whole file, this needs a
        # per-segment text instead -- TODO confirm.
        segments = self._extract_segment_annotations(ann_data)
        migrated_segments: Dict[str, Dict[str, Any]] = {}

        for seg_key, seg_data in segments.items():
            if not isinstance(seg_data, dict):
                # Malformed segment entries are dropped from the payload.
                continue

            seg_results = seg_data.get(SEGMENT_RESULT_KEY, [])
            if not isinstance(seg_results, list) or not seg_results:
                # Segment without annotations: keep its structure and
                # normalise the result list to an empty list.
                migrated_segments[seg_key] = dict(seg_data)
                migrated_segments[seg_key][SEGMENT_RESULT_KEY] = []
                continue

            migration = AnnotationMigrator.migrate_annotation_results(
                old_text, new_text, seg_results
            )
            total_migrated += migration.migrated_count
            total_failed += migration.failed_count

            new_seg = dict(seg_data)
            new_seg[SEGMENT_RESULT_KEY] = migration.migrated
            migrated_segments[seg_key] = new_seg

        seg_total = self._resolve_segment_total(ann_data)
        if seg_total is None:
            # Fall back to the number of segments that were kept.
            seg_total = len(migrated_segments)

        migrated_payload: Dict[str, Any] = {
            SEGMENTED_KEY: True,
            "version": ann_data.get("version", 1),
            SEGMENTS_KEY: migrated_segments,
            SEGMENT_TOTAL_KEY: seg_total,
        }
    else:
        # Non-segmented annotation: migrate the top-level result list.
        results = ann_data.get(SEGMENT_RESULT_KEY, [])
        if not isinstance(results, list):
            results = []

        migration = AnnotationMigrator.migrate_annotation_results(
            old_text, new_text, results
        )
        total_migrated = migration.migrated_count
        total_failed = migration.failed_count

        # Shallow copy so the stored payload is not mutated in place.
        migrated_payload = dict(ann_data)
        migrated_payload[SEGMENT_RESULT_KEY] = migration.migrated

    logger.info(
        "标注迁移完成:project_id=%s old_file=%s new_file=%s migrated=%d failed=%d",
        project.id,
        old_file_id,
        new_file_id,
        total_migrated,
        total_failed,
    )

    return migrated_payload, total_migrated, total_failed
|
||||
|
||||
Reference in New Issue
Block a user