feat(annotation): implement file version management for annotation feature

Add support for detecting new file versions and switching to them:

Backend Changes:
- Add file_version column to AnnotationResult model
- Create Alembic migration for database schema update
- Implement check_file_version() method to compare annotation and file versions
- Implement use_new_version() method to clear annotations and update version
- Update upsert_annotation() to record file version when saving
- Add new API endpoints: GET /version and POST /use-new-version
- Add FileVersionCheckResponse and UseNewVersionResponse schemas

Frontend Changes:
- Add checkFileVersionUsingGet and useNewVersionUsingPost API calls
- Add version warning banner showing current vs latest file version
- Add 'Use New Version' button with confirmation dialog
- Clear version info state when switching files to avoid stale warnings

Bug Fixes:
- Fix previousFileVersion returning updated value (save before update)
- Handle null file_version for historical data compatibility
- Fix segmented annotation clearing (preserve structure, clear results)
- Fix files without annotations incorrectly showing new version warnings
- Preserve total_segments when clearing segmented annotations

Files Modified:
- frontend/src/pages/DataAnnotation/Annotate/LabelStudioTextEditor.tsx
- frontend/src/pages/DataAnnotation/annotation.api.ts
- runtime/datamate-python/app/db/models/annotation_management.py
- runtime/datamate-python/app/module/annotation/interface/editor.py
- runtime/datamate-python/app/module/annotation/schema/editor.py
- runtime/datamate-python/app/module/annotation/service/editor.py

New Files:
- runtime/datamate-python/alembic.ini
- runtime/datamate-python/alembic/env.py
- runtime/datamate-python/alembic/script.py.mako
- runtime/datamate-python/alembic/versions/20250205_0001_add_file_version.py
2026-02-05 20:12:07 +08:00
parent 4143bc75f9
commit f5cb265667
10 changed files with 915 additions and 171 deletions
--- a/runtime/datamate-python/app/module/annotation/service/editor.py
+++ b/runtime/datamate-python/app/module/annotation/service/editor.py
@@ -23,7 +23,13 @@ from sqlalchemy.ext.asyncio import AsyncSession

 from app.core.config import settings
 from app.core.logging import get_logger
-from app.db.models import AnnotationResult, Dataset, DatasetFiles, LabelingProject, LabelingProjectFile
+from app.db.models import (
+    AnnotationResult,
+    Dataset,
+    DatasetFiles,
+    LabelingProject,
+    LabelingProjectFile,
+)
 from app.db.models.annotation_management import (
    ANNOTATION_STATUS_ANNOTATED,
    ANNOTATION_STATUS_IN_PROGRESS,
@@ -45,8 +51,12 @@ from app.module.annotation.schema.editor import (
 )
 from app.module.annotation.service.template import AnnotationTemplateService
 from app.module.annotation.service.knowledge_sync import KnowledgeSyncService
-from app.module.annotation.service.annotation_text_splitter import AnnotationTextSplitter
-from app.module.annotation.service.text_fetcher import fetch_text_content_via_download_api
+from app.module.annotation.service.annotation_text_splitter import (
+    AnnotationTextSplitter,
+)
+from app.module.annotation.service.text_fetcher import (
+    fetch_text_content_via_download_api,
+)

 logger = get_logger(__name__)

@@ -169,7 +179,9 @@ class AnnotationEditorService:
        template = await self.template_service.get_template(self.db, template_id)
        return getattr(template, "label_config", None) if template else None

-    async def _resolve_project_label_config(self, project: LabelingProject) -> Optional[str]:
+    async def _resolve_project_label_config(
+        self, project: LabelingProject
+    ) -> Optional[str]:
        label_config = None
        if project.configuration and isinstance(project.configuration, dict):
            label_config = project.configuration.get("label_config")
@@ -210,7 +222,9 @@ class AnnotationEditorService:
        if not label_config:
            return [default_key]
        target_categories = categories or set()
-        keys = cls._extract_object_value_keys_by_category(label_config, target_categories)
+        keys = cls._extract_object_value_keys_by_category(
+            label_config, target_categories
+        )
        if not keys:
            return [default_key]
        return keys
@@ -231,7 +245,9 @@ class AnnotationEditorService:
        return parsed if isinstance(parsed, dict) else None

    @classmethod
-    def _parse_jsonl_records(cls, text_content: str) -> List[Tuple[Optional[Dict[str, Any]], str]]:
+    def _parse_jsonl_records(
+        cls, text_content: str
+    ) -> List[Tuple[Optional[Dict[str, Any]], str]]:
        lines = [line for line in text_content.splitlines() if line.strip()]
        records: List[Tuple[Optional[Dict[str, Any]], str]] = []
        for line in lines:
@@ -277,7 +293,9 @@ class AnnotationEditorService:

    @classmethod
    def _extract_textual_value_keys(cls, label_config: str) -> List[str]:
-        return cls._extract_object_value_keys_by_category(label_config, TEXTUAL_OBJECT_CATEGORIES)
+        return cls._extract_object_value_keys_by_category(
+            label_config, TEXTUAL_OBJECT_CATEGORIES
+        )

    @staticmethod
    def _needs_placeholder(value: Any) -> bool:
@@ -287,7 +305,9 @@ class AnnotationEditorService:
            return True
        return False

-    def _apply_text_placeholders(self, data: Dict[str, Any], label_config: Optional[str]) -> None:
+    def _apply_text_placeholders(
+        self, data: Dict[str, Any], label_config: Optional[str]
+    ) -> None:
        if not label_config:
            return
        for key in self._extract_textual_value_keys(label_config):
@@ -346,7 +366,9 @@ class AnnotationEditorService:

                if i > 0:
                    prev = children[i - 1]
-                    if prev.tag == "Header" and self._header_already_present(prev, obj_name):
+                    if prev.tag == "Header" and self._header_already_present(
+                        prev, obj_name
+                    ):
                        i += 1
                        continue

@@ -362,7 +384,9 @@ class AnnotationEditorService:
        return ET.tostring(root, encoding="unicode")

    @staticmethod
-    def _extract_segment_annotations(payload: Optional[Dict[str, Any]]) -> Dict[str, Dict[str, Any]]:
+    def _extract_segment_annotations(
+        payload: Optional[Dict[str, Any]],
+    ) -> Dict[str, Dict[str, Any]]:
        if not payload or not isinstance(payload, dict):
            return {}
        segments = payload.get(SEGMENTS_KEY)
@@ -440,13 +464,17 @@ class AnnotationEditorService:
        file_record: DatasetFiles,
        file_id: str,
    ) -> Optional[int]:
-        dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id))
+        dataset_type = self._normalize_dataset_type(
+            await self._get_dataset_type(project.dataset_id)
+        )
        if dataset_type != DATASET_TYPE_TEXT:
            return None
        if not self._resolve_segmentation_enabled(project):
            return None

-        text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)
+        text_content = await self._fetch_text_content_via_download_api(
+            project.dataset_id, file_id
+        )
        if not isinstance(text_content, str):
            return None

@@ -495,7 +523,9 @@ class AnnotationEditorService:
        file_type_lower = func.lower(DatasetFiles.file_type)
        file_name_lower = func.lower(DatasetFiles.file_name)
        type_condition = file_type_lower.in_(SOURCE_DOCUMENT_TYPES)
-        name_conditions = [file_name_lower.like(f"%{ext}") for ext in SOURCE_DOCUMENT_EXTENSIONS]
+        name_conditions = [
+            file_name_lower.like(f"%{ext}") for ext in SOURCE_DOCUMENT_EXTENSIONS
+        ]
        return or_(type_condition, *name_conditions)

    def _build_task_data(
@@ -545,13 +575,17 @@ class AnnotationEditorService:
        records: List[Tuple[Optional[Dict[str, Any]], str]],
        record_texts: List[str],
        segment_annotation_keys: set[str],
-    ) -> Tuple[List[SegmentInfo], List[Tuple[Optional[Dict[str, Any]], str, str, int, int]]]:
+    ) -> Tuple[
+        List[SegmentInfo], List[Tuple[Optional[Dict[str, Any]], str, str, int, int]]
+    ]:
        splitter = AnnotationTextSplitter(max_chars=self.SEGMENT_THRESHOLD)
        segments: List[SegmentInfo] = []
        segment_contexts: List[Tuple[Optional[Dict[str, Any]], str, str, int, int]] = []
        segment_cursor = 0

-        for record_index, ((payload, raw_text), record_text) in enumerate(zip(records, record_texts)):
+        for record_index, ((payload, raw_text), record_text) in enumerate(
+            zip(records, record_texts)
+        ):
            normalized_text = record_text or ""
            if len(normalized_text) > self.SEGMENT_THRESHOLD:
                raw_segments = splitter.split(normalized_text)
@@ -559,12 +593,15 @@ class AnnotationEditorService:
                    segments.append(
                        SegmentInfo(
                            idx=segment_cursor,
-                            hasAnnotation=str(segment_cursor) in segment_annotation_keys,
+                            hasAnnotation=str(segment_cursor)
+                            in segment_annotation_keys,
                            lineIndex=record_index,
                            chunkIndex=chunk_index,
                        )
                    )
-                    segment_contexts.append((payload, raw_text, seg["text"], record_index, chunk_index))
+                    segment_contexts.append(
+                        (payload, raw_text, seg["text"], record_index, chunk_index)
+                    )
                    segment_cursor += 1
            else:
                segments.append(
@@ -575,11 +612,15 @@ class AnnotationEditorService:
                        chunkIndex=0,
                    )
                )
-                segment_contexts.append((payload, raw_text, normalized_text, record_index, 0))
+                segment_contexts.append(
+                    (payload, raw_text, normalized_text, record_index, 0)
+                )
                segment_cursor += 1

        if not segments:
-            segments = [SegmentInfo(idx=0, hasAnnotation=False, lineIndex=0, chunkIndex=0)]
+            segments = [
+                SegmentInfo(idx=0, hasAnnotation=False, lineIndex=0, chunkIndex=0)
+            ]
            segment_contexts = [(None, "", "", 0, 0)]

        return segments, segment_contexts
@@ -587,7 +628,9 @@ class AnnotationEditorService:
    async def get_project_info(self, project_id: str) -> EditorProjectInfo:
        project = await self._get_project_or_404(project_id)

-        dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id))
+        dataset_type = self._normalize_dataset_type(
+            await self._get_dataset_type(project.dataset_id)
+        )
        supported = dataset_type in SUPPORTED_EDITOR_DATASET_TYPES
        unsupported_reason = None
        if not supported:
@@ -653,7 +696,12 @@ class AnnotationEditorService:
        rows = files_result.all()

        items: List[EditorTaskListItem] = []
-        for file_record, annotation_id, annotation_updated_at, annotation_status in rows:
+        for (
+            file_record,
+            annotation_id,
+            annotation_updated_at,
+            annotation_status,
+        ) in rows:
            fid = str(file_record.id)  # type: ignore[arg-type]
            items.append(
                EditorTaskListItem(
@@ -675,7 +723,9 @@ class AnnotationEditorService:
            size=size,
        )

-    async def _fetch_text_content_via_download_api(self, dataset_id: str, file_id: str) -> str:
+    async def _fetch_text_content_via_download_api(
+        self, dataset_id: str, file_id: str
+    ) -> str:
        return await fetch_text_content_via_download_api(dataset_id, file_id)

    async def get_task(
@@ -686,7 +736,9 @@ class AnnotationEditorService:
    ) -> EditorTaskResponse:
        project = await self._get_project_or_404(project_id)

-        dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id))
+        dataset_type = self._normalize_dataset_type(
+            await self._get_dataset_type(project.dataset_id)
+        )
        if dataset_type not in SUPPORTED_EDITOR_DATASET_TYPES:
            raise HTTPException(
                status_code=400,
@@ -701,7 +753,9 @@ class AnnotationEditorService:
        )
        file_record = file_result.scalar_one_or_none()
        if not file_record:
-            raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}")
+            raise HTTPException(
+                status_code=404, detail=f"文件不存在或不属于该项目: {file_id}"
+            )

        if dataset_type == DATASET_TYPE_IMAGE:
            return await self._build_image_task(project, file_record, file_id)
@@ -722,7 +776,9 @@ class AnnotationEditorService:
    ) -> EditorTaskSegmentResponse:
        project = await self._get_project_or_404(project_id)

-        dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id))
+        dataset_type = self._normalize_dataset_type(
+            await self._get_dataset_type(project.dataset_id)
+        )
        if dataset_type != DATASET_TYPE_TEXT:
            raise HTTPException(
                status_code=400,
@@ -737,7 +793,9 @@ class AnnotationEditorService:
        )
        file_record = file_result.scalar_one_or_none()
        if not file_record:
-            raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}")
+            raise HTTPException(
+                status_code=404, detail=f"文件不存在或不属于该项目: {file_id}"
+            )

        if not self._resolve_segmentation_enabled(project):
            return EditorTaskSegmentResponse(
@@ -747,7 +805,9 @@ class AnnotationEditorService:
                currentSegmentIndex=0,
            )

-        text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)
+        text_content = await self._fetch_text_content_via_download_api(
+            project.dataset_id, file_id
+        )
        assert isinstance(text_content, str)
        label_config = await self._resolve_project_label_config(project)
        primary_text_key = self._resolve_primary_text_key(label_config)
@@ -839,7 +899,9 @@ class AnnotationEditorService:
        file_id: str,
        segment_index: Optional[int],
    ) -> EditorTaskResponse:
-        text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)
+        text_content = await self._fetch_text_content_via_download_api(
+            project.dataset_id, file_id
+        )
        assert isinstance(text_content, str)
        label_config = await self._resolve_project_label_config(project)
        primary_text_key = self._resolve_primary_text_key(label_config)
@@ -885,7 +947,8 @@ class AnnotationEditorService:
        if not segmentation_enabled:
            segment_index = None
        needs_segmentation = segmentation_enabled and (
-            len(records) > 1 or any(len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts)
+            len(records) > 1
+            or any(len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts)
        )
        segments: List[SegmentInfo] = []
        segment_contexts: List[Tuple[Optional[Dict[str, Any]], str, str, int, int]] = []
@@ -903,10 +966,14 @@ class AnnotationEditorService:
                segment_annotation_keys,
            )
            current_segment_index = segment_index if segment_index is not None else 0
-            if current_segment_index < 0 or current_segment_index >= len(segment_contexts):
+            if current_segment_index < 0 or current_segment_index >= len(
+                segment_contexts
+            ):
                current_segment_index = 0

-            selected_payload, _, display_text, _, _ = segment_contexts[current_segment_index]
+            selected_payload, _, display_text, _, _ = segment_contexts[
+                current_segment_index
+            ]

        # 构造 task 对象
        task_data = self._build_task_data(
@@ -936,11 +1003,16 @@ class AnnotationEditorService:
                # 分段模式：获取当前段落的标注
                seg_ann = segment_annotations.get(str(current_segment_index), {})
                stored = {
-                    "id": self._make_ls_annotation_id(project.id, file_id) + current_segment_index,
+                    "id": self._make_ls_annotation_id(project.id, file_id)
+                    + current_segment_index,
                    "task": ls_task_id,
                    "result": seg_ann.get(SEGMENT_RESULT_KEY, []),
-                    "created_at": seg_ann.get(SEGMENT_CREATED_AT_KEY, datetime.utcnow().isoformat() + "Z"),
-                    "updated_at": seg_ann.get(SEGMENT_UPDATED_AT_KEY, datetime.utcnow().isoformat() + "Z"),
+                    "created_at": seg_ann.get(
+                        SEGMENT_CREATED_AT_KEY, datetime.utcnow().isoformat() + "Z"
+                    ),
+                    "updated_at": seg_ann.get(
+                        SEGMENT_UPDATED_AT_KEY, datetime.utcnow().isoformat() + "Z"
+                    ),
                }
                task["annotations"] = [stored]
            elif not needs_segmentation and not has_segmented_annotation:
@@ -952,7 +1024,10 @@ class AnnotationEditorService:
                task["annotations"] = [stored]
            else:
                # 首次从非分段切换到分段：提供空标注
-                empty_ann_id = self._make_ls_annotation_id(project.id, file_id) + current_segment_index
+                empty_ann_id = (
+                    self._make_ls_annotation_id(project.id, file_id)
+                    + current_segment_index
+                )
                task["annotations"] = [
                    {
                        "id": empty_ann_id,
@@ -994,7 +1069,9 @@ class AnnotationEditorService:
        categories: set[str],
    ) -> EditorTaskResponse:
        label_config = await self._resolve_project_label_config(project)
-        media_keys = self._resolve_media_value_keys(label_config, default_key, categories)
+        media_keys = self._resolve_media_value_keys(
+            label_config, default_key, categories
+        )
        preview_url = self._build_file_preview_url(project.dataset_id, file_id)
        file_name = str(getattr(file_record, "file_name", ""))

@@ -1097,7 +1174,9 @@ class AnnotationEditorService:
            categories=MEDIA_OBJECT_CATEGORIES,
        )

-    async def upsert_annotation(self, project_id: str, file_id: str, request: UpsertAnnotationRequest) -> UpsertAnnotationResponse:
+    async def upsert_annotation(
+        self, project_id: str, file_id: str, request: UpsertAnnotationRequest
+    ) -> UpsertAnnotationResponse:
        project = await self._get_project_or_404(project_id)

        # 校验文件归属
@@ -1112,7 +1191,26 @@ class AnnotationEditorService:
        )
        file_record = file_result.scalar_one_or_none()
        if not file_record:
-            raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}")
+            raise HTTPException(
+                status_code=404, detail=f"文件不存在或不属于该项目: {file_id}"
+            )
+
+        # 检查文件版本是否变化
+        current_file_version = file_record.version
+        existing_result = await self.db.execute(
+            select(AnnotationResult).where(
+                AnnotationResult.project_id == project_id,
+                AnnotationResult.file_id == file_id,
+            )
+        )
+        existing_annotation = existing_result.scalar_one_or_none()
+
+        if existing_annotation and existing_annotation.file_version is not None:
+            if existing_annotation.file_version != current_file_version:
+                raise HTTPException(
+                    status_code=409,
+                    detail=f"文件已更新到新版本（当前版本: {current_file_version}, 标注版本: {existing_annotation.file_version}），请使用新版本",
+                )

        annotation_payload = dict(request.annotation or {})
        result = annotation_payload.get("result")
@@ -1127,7 +1225,9 @@ class AnnotationEditorService:
        if request.segment_index is not None:
            segment_total_hint = self._resolve_segment_total(annotation_payload)
            if segment_total_hint is None:
-                segment_total_hint = await self._compute_segment_total(project, file_record, file_id)
+                segment_total_hint = await self._compute_segment_total(
+                    project, file_record, file_id
+                )

        existing_result = await self.db.execute(
            select(AnnotationResult)
@@ -1161,11 +1261,16 @@ class AnnotationEditorService:
            # 非分段模式：直接使用传入的 annotation
            annotation_payload["task"] = ls_task_id
            if not isinstance(annotation_payload.get("id"), int):
-                annotation_payload["id"] = self._make_ls_annotation_id(project_id, file_id)
+                annotation_payload["id"] = self._make_ls_annotation_id(
+                    project_id, file_id
+                )
            final_payload = annotation_payload

        requested_status = request.annotation_status
-        if requested_status is not None and requested_status not in ANNOTATION_STATUS_CLIENT_VALUES:
+        if (
+            requested_status is not None
+            and requested_status not in ANNOTATION_STATUS_CLIENT_VALUES
+        ):
            raise HTTPException(status_code=400, detail="annotationStatus 不合法")

        segment_total = None
@@ -1194,7 +1299,10 @@ class AnnotationEditorService:
                elif requested_status == ANNOTATION_STATUS_NOT_APPLICABLE:
                    final_status = ANNOTATION_STATUS_NOT_APPLICABLE
                else:
-                    raise HTTPException(status_code=400, detail="未发现标注内容，请确认无标注/不适用后再保存")
+                    raise HTTPException(
+                        status_code=400,
+                        detail="未发现标注内容，请确认无标注/不适用后再保存",
+                    )

        if request.segment_index is not None:
            segment_entries = self._extract_segment_annotations(final_payload)
@@ -1210,11 +1318,16 @@ class AnnotationEditorService:

        if existing:
            if request.expected_updated_at and existing.updated_at:
-                if existing.updated_at != request.expected_updated_at.replace(tzinfo=None):
-                    raise HTTPException(status_code=409, detail="标注已被更新，请刷新后重试")
+                if existing.updated_at != request.expected_updated_at.replace(
+                    tzinfo=None
+                ):
+                    raise HTTPException(
+                        status_code=409, detail="标注已被更新，请刷新后重试"
+                    )

            existing.annotation = final_payload  # type: ignore[assignment]
            existing.annotation_status = final_status  # type: ignore[assignment]
+            existing.file_version = current_file_version  # type: ignore[assignment]
            existing.updated_at = now  # type: ignore[assignment]
            await self.db.commit()
            await self.db.refresh(existing)
@@ -1223,7 +1336,9 @@ class AnnotationEditorService:
                annotationId=existing.id,
                updatedAt=existing.updated_at or now,
            )
-            await self._sync_annotation_to_knowledge(project, file_record, final_payload, existing.updated_at)
+            await self._sync_annotation_to_knowledge(
+                project, file_record, final_payload, existing.updated_at
+            )
            return response

        new_id = str(uuid.uuid4())
@@ -1233,6 +1348,7 @@ class AnnotationEditorService:
            file_id=file_id,
            annotation=final_payload,
            annotation_status=final_status,
+            file_version=current_file_version,
            created_at=now,
            updated_at=now,
        )
@@ -1244,7 +1360,9 @@ class AnnotationEditorService:
            annotationId=record.id,
            updatedAt=record.updated_at or now,
        )
-        await self._sync_annotation_to_knowledge(project, file_record, final_payload, record.updated_at)
+        await self._sync_annotation_to_knowledge(
+            project, file_record, final_payload, record.updated_at
+        )
        return response

    def _merge_segment_annotation(
@@ -1292,7 +1410,9 @@ class AnnotationEditorService:
        # 更新指定段落的标注
        segments[str(segment_index)] = {
            SEGMENT_RESULT_KEY: new_annotation.get(SEGMENT_RESULT_KEY, []),
-            SEGMENT_CREATED_AT_KEY: new_annotation.get(SEGMENT_CREATED_AT_KEY, datetime.utcnow().isoformat() + "Z"),
+            SEGMENT_CREATED_AT_KEY: new_annotation.get(
+                SEGMENT_CREATED_AT_KEY, datetime.utcnow().isoformat() + "Z"
+            ),
            SEGMENT_UPDATED_AT_KEY: datetime.utcnow().isoformat() + "Z",
        }

@@ -1317,9 +1437,7 @@ class AnnotationEditorService:
            logger.warning("标注同步知识管理失败：%s", exc)

    async def precompute_segmentation_for_project(
-        self,
-        project_id: str,
-        max_retries: int = 3
+        self, project_id: str, max_retries: int = 3
    ) -> Dict[str, Any]:
        """
        为指定项目的所有文本文件预计算切片结构并持久化到数据库
@@ -1332,7 +1450,9 @@ class AnnotationEditorService:
            统计信息：{total_files, succeeded, failed}
        """
        project = await self._get_project_or_404(project_id)
-        dataset_type = self._normalize_dataset_type(await self._get_dataset_type(project.dataset_id))
+        dataset_type = self._normalize_dataset_type(
+            await self._get_dataset_type(project.dataset_id)
+        )

        # 只处理文本数据集
        if dataset_type != DATASET_TYPE_TEXT:
@@ -1364,9 +1484,8 @@ class AnnotationEditorService:
        for file_record in file_records:
            file_type = str(getattr(file_record, "file_type", "") or "").lower()
            file_name = str(getattr(file_record, "file_name", "")).lower()
-            is_source_document = (
-                file_type in SOURCE_DOCUMENT_TYPES or
-                any(file_name.endswith(ext) for ext in SOURCE_DOCUMENT_EXTENSIONS)
+            is_source_document = file_type in SOURCE_DOCUMENT_TYPES or any(
+                file_name.endswith(ext) for ext in SOURCE_DOCUMENT_EXTENSIONS
            )
            if not is_source_document:
                valid_files.append(file_record)
@@ -1385,7 +1504,9 @@ class AnnotationEditorService:
            for retry in range(max_retries):
                try:
                    # 读取文本内容
-                    text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)
+                    text_content = await self._fetch_text_content_via_download_api(
+                        project.dataset_id, file_id
+                    )
                    if not isinstance(text_content, str):
                        logger.warning(f"文件 {file_id} 内容不是字符串，跳过切片")
                        failed += 1
@@ -1404,7 +1525,9 @@ class AnnotationEditorService:
                        records = [(None, text_content)]

                    record_texts = [
-                        self._resolve_primary_text_value(payload, raw_text, primary_text_key)
+                        self._resolve_primary_text_value(
+                            payload, raw_text, primary_text_key
+                        )
                        for payload, raw_text in records
                    ]
                    if not record_texts:
@@ -1412,7 +1535,8 @@ class AnnotationEditorService:

                    # 判断是否需要分段
                    needs_segmentation = len(records) > 1 or any(
-                        len(text or "") > self.SEGMENT_THRESHOLD for text in record_texts
+                        len(text or "") > self.SEGMENT_THRESHOLD
+                        for text in record_texts
                    )

                    if not needs_segmentation:
@@ -1425,7 +1549,9 @@ class AnnotationEditorService:
                    segment_cursor = 0
                    segments = {}

-                    for record_index, ((payload, raw_text), record_text) in enumerate(zip(records, record_texts)):
+                    for record_index, ((payload, raw_text), record_text) in enumerate(
+                        zip(records, record_texts)
+                    ):
                        normalized_text = record_text or ""

                        if len(normalized_text) > self.SEGMENT_THRESHOLD:
@@ -1433,15 +1559,19 @@ class AnnotationEditorService:
                            for chunk_index, seg in enumerate(raw_segments):
                                segments[str(segment_cursor)] = {
                                    SEGMENT_RESULT_KEY: [],
-                                    SEGMENT_CREATED_AT_KEY: datetime.utcnow().isoformat() + "Z",
-                                    SEGMENT_UPDATED_AT_KEY: datetime.utcnow().isoformat() + "Z",
+                                    SEGMENT_CREATED_AT_KEY: datetime.utcnow().isoformat()
+                                    + "Z",
+                                    SEGMENT_UPDATED_AT_KEY: datetime.utcnow().isoformat()
+                                    + "Z",
                                }
                                segment_cursor += 1
                        else:
                            segments[str(segment_cursor)] = {
                                SEGMENT_RESULT_KEY: [],
-                                SEGMENT_CREATED_AT_KEY: datetime.utcnow().isoformat() + "Z",
-                                SEGMENT_UPDATED_AT_KEY: datetime.utcnow().isoformat() + "Z",
+                                SEGMENT_CREATED_AT_KEY: datetime.utcnow().isoformat()
+                                + "Z",
+                                SEGMENT_UPDATED_AT_KEY: datetime.utcnow().isoformat()
+                                + "Z",
                            }
                            segment_cursor += 1

@@ -1508,3 +1638,145 @@ class AnnotationEditorService:
            "failed": failed,
        }

+    async def check_file_version(self, project_id: str, file_id: str) -> Dict[str, Any]:
+        """
+        Report whether the file has a newer version than the one its annotation
+        was saved against.
+
+        Args:
+            project_id: ID of the labeling project.
+            file_id: ID of the dataset file.
+
+        Returns:
+            A dict with the keys ``fileId``, ``currentFileVersion``,
+            ``annotationFileVersion`` and ``hasNewVersion``.
+
+        Raises:
+            HTTPException: 404 when the file does not exist or is not linked to
+                the project.
+        """
+        project = await self._get_project_or_404(project_id)
+
+        # The file must belong to both the project and the project's dataset.
+        file_query = (
+            select(DatasetFiles)
+            .join(LabelingProjectFile, LabelingProjectFile.file_id == DatasetFiles.id)
+            .where(
+                LabelingProjectFile.project_id == project.id,
+                DatasetFiles.id == file_id,
+                DatasetFiles.dataset_id == project.dataset_id,
+            )
+        )
+        file_record = (await self.db.execute(file_query)).scalar_one_or_none()
+        if not file_record:
+            raise HTTPException(
+                status_code=404, detail=f"文件不存在或不属于该项目: {file_id}"
+            )
+
+        # Look up the annotation stored for this project/file pair, if any.
+        annotation_query = select(AnnotationResult).where(
+            AnnotationResult.project_id == project_id,
+            AnnotationResult.file_id == file_id,
+        )
+        annotation = (await self.db.execute(annotation_query)).scalar_one_or_none()
+
+        latest_version = file_record.version
+        annotated_version = annotation.file_version if annotation else None
+
+        if annotation is None:
+            # Nothing has been annotated yet, so there is nothing to warn about.
+            outdated = False
+        else:
+            # An annotation without a recorded version is always reported as
+            # outdated; otherwise compare against the file's current version.
+            outdated = annotated_version is None or latest_version > annotated_version
+
+        return {
+            "fileId": file_id,
+            "currentFileVersion": latest_version,
+            "annotationFileVersion": annotated_version,
+            "hasNewVersion": outdated,
+        }
+
+    async def use_new_version(self, project_id: str, file_id: str) -> Dict[str, Any]:
+        """
+        Switch the annotation to the file's current version and clear its results.
+
+        For a segmented annotation the segment structure is kept and only each
+        segment's result list is emptied; any other payload is replaced by an
+        empty dict. The status is reset to ``ANNOTATION_STATUS_NO_ANNOTATION``.
+
+        Args:
+            project_id: ID of the labeling project.
+            file_id: ID of the dataset file.
+
+        Returns:
+            A dict with the keys ``fileId``, ``previousFileVersion``,
+            ``currentFileVersion`` and ``message``.
+
+        Raises:
+            HTTPException: 404 when the file is not linked to the project or no
+                annotation exists for it; 400 when the annotation has a recorded
+                version and the file's version is not greater than it.
+        """
+        project = await self._get_project_or_404(project_id)
+
+        # The file must belong to both the project and the project's dataset.
+        file_result = await self.db.execute(
+            select(DatasetFiles)
+            .join(LabelingProjectFile, LabelingProjectFile.file_id == DatasetFiles.id)
+            .where(
+                LabelingProjectFile.project_id == project.id,
+                DatasetFiles.id == file_id,
+                DatasetFiles.dataset_id == project.dataset_id,
+            )
+        )
+        file_record = file_result.scalar_one_or_none()
+        if not file_record:
+            raise HTTPException(
+                status_code=404, detail=f"文件不存在或不属于该项目: {file_id}"
+            )
+
+        # Select the annotation row FOR UPDATE because it is rewritten below.
+        annotation_result = await self.db.execute(
+            select(AnnotationResult)
+            .where(
+                AnnotationResult.project_id == project_id,
+                AnnotationResult.file_id == file_id,
+            )
+            .with_for_update()
+        )
+        annotation = annotation_result.scalar_one_or_none()
+
+        current_file_version = file_record.version
+
+        if not annotation:
+            raise HTTPException(status_code=404, detail=f"标注不存在: {file_id}")
+
+        # Capture the old version before it is overwritten so the response can
+        # report it.
+        previous_file_version = annotation.file_version
+
+        # Annotations with no recorded version (historical rows) may always be
+        # switched; otherwise the file must actually be newer.
+        if annotation.file_version is not None:
+            if current_file_version <= annotation.file_version:
+                raise HTTPException(
+                    status_code=400,
+                    detail=f"文件版本（{current_file_version}）未更新或低于标注版本（{annotation.file_version}）",
+                )
+
+        now = datetime.utcnow()
+        stored_payload = annotation.annotation
+        if isinstance(stored_payload, dict) and stored_payload.get(SEGMENTED_KEY):
+            raw_segments = stored_payload.get(SEGMENTS_KEY, {})
+            if not isinstance(raw_segments, dict):
+                # Malformed segment container: nothing that can be preserved.
+                raw_segments = {}
+            # Build cleared *copies* instead of mutating the loaded JSON value in
+            # place. With a plain JSON column the ORM compares the new value to
+            # the loaded one by equality; in-place edits would make both equal
+            # and the column could be left out of the UPDATE.
+            cleared_segments = {
+                segment_id: (
+                    {**segment_data, SEGMENT_RESULT_KEY: []}
+                    if isinstance(segment_data, dict)
+                    else segment_data
+                )
+                for segment_id, segment_data in raw_segments.items()
+            }
+            annotation.annotation = {
+                SEGMENTED_KEY: True,
+                "version": stored_payload.get("version", 1),
+                SEGMENTS_KEY: cleared_segments,
+                "total_segments": stored_payload.get(
+                    "total_segments", len(cleared_segments)
+                ),
+            }
+        else:
+            annotation.annotation = {}
+        annotation.annotation_status = ANNOTATION_STATUS_NO_ANNOTATION
+        annotation.file_version = current_file_version
+        annotation.updated_at = now
+
+        await self.db.commit()
+        await self.db.refresh(annotation)
+
+        return {
+            "fileId": file_id,
+            "previousFileVersion": previous_file_version,
+            "currentFileVersion": current_file_version,
+            "message": "已使用新版本并清空标注",
+        }