feat(annotation): 添加标注项目文件快照功能

- 新增 LabelingProjectFile 模型用于存储标注项目的文件快照
- 在创建标注项目时记录关联的文件快照数据
- 更新查询逻辑以基于项目快照过滤文件列表
- 优化导出统计功能使用快照数据进行计算
- 添加数据库表结构支持项目文件快照关系
2026-01-30 18:10:13 +08:00
parent 3c3ca130b3
commit 8b2a19f09a
7 changed files with 145 additions and 33 deletions
--- a/runtime/datamate-python/app/module/annotation/service/mapping.py
+++ b/runtime/datamate-python/app/module/annotation/service/mapping.py
@@ -1,13 +1,13 @@
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
-from sqlalchemy import update, func
+from sqlalchemy import update, func, insert
 from sqlalchemy.orm import aliased
 from typing import Optional, List, Tuple
 from datetime import datetime
 import uuid

 from app.core.logging import get_logger
-from app.db.models import LabelingProject, AnnotationTemplate, AnnotationResult
+from app.db.models import LabelingProject, AnnotationTemplate, AnnotationResult, LabelingProjectFile
 from app.db.models.dataset_management import Dataset, DatasetFiles
 from app.module.annotation.schema import (
    DatasetMappingCreateRequest,
@@ -20,9 +20,11 @@ logger = get_logger(__name__)

 class DatasetMappingService:
    """数据集映射服务"""
-    
+
     def __init__(self, db: AsyncSession):
+        # Async SQLAlchemy session used for this service's database calls.
         self.db = db
+
+    # Maximum number of snapshot rows sent in a single INSERT statement.
+    SNAPSHOT_INSERT_BATCH_SIZE = 500
    
    def _build_query_with_dataset_name(self):
        """Build base query with dataset name joined"""
@@ -49,11 +51,14 @@ class DatasetMappingService:
        Returns:
            (total_count, annotated_count) 元组
        """
-        # 获取数据集总数据量（统计 ACTIVE 和 COMPLETED 状态的文件）
+        # 获取标注项目快照数据量（只统计快照内的文件）
        total_result = await self.db.execute(
-            select(func.count()).select_from(DatasetFiles).where(
+            select(func.count())
+            .select_from(LabelingProjectFile)
+            .join(DatasetFiles, LabelingProjectFile.file_id == DatasetFiles.id)
+            .where(
+                LabelingProjectFile.project_id == project_id,
                DatasetFiles.dataset_id == dataset_id,
-                DatasetFiles.status.in_(["ACTIVE", "COMPLETED"]),
            )
        )
        total_count = int(total_result.scalar() or 0)
@@ -213,6 +218,48 @@ class DatasetMappingService:
        
        logger.debug(f"Mapping created: {labeling_project.id}")
        return await self._to_response(labeling_project)
+
+    async def create_mapping_with_snapshot(
+        self,
+        labeling_project: LabelingProject,
+        file_ids: List[str],
+    ) -> DatasetMappingResponse:
+        """Persist a labeling project together with its file snapshot.
+
+        The project is added and flushed first so that its ``id`` is
+        available, then one snapshot row per entry in ``file_ids`` is written
+        before the single commit at the end.
+
+        Args:
+            labeling_project: Project entity to add to the session.
+            file_ids: IDs of the files captured in the snapshot; when empty,
+                no snapshot rows are written.
+
+        Returns:
+            The response built by ``_to_response`` from the refreshed project.
+
+        Raises:
+            RuntimeError: If the project still has no ``id`` after the flush.
+        """
+        logger.debug(
+            "Create dataset mapping with snapshot: %s -> %s, files=%d",
+            labeling_project.dataset_id,
+            labeling_project.labeling_project_id,
+            len(file_ids),
+        )
+
+        self.db.add(labeling_project)
+        # Flush (without committing) so the project id exists for the snapshot rows.
+        await self.db.flush()
+        # Explicit check instead of `assert`: assertions are stripped under -O,
+        # which would let a missing id slip through to the snapshot insert.
+        if not labeling_project.id:
+            raise RuntimeError(
+                "labeling_project.id must be set before snapshot insert"
+            )
+
+        if file_ids:
+            await self._insert_snapshot_records(labeling_project.id, file_ids)
+
+        await self.db.commit()
+        await self.db.refresh(labeling_project)
+
+        logger.debug("Mapping created with snapshot: %s", labeling_project.id)
+        return await self._to_response(labeling_project)
+
+    async def _insert_snapshot_records(self, project_id: str, file_ids: List[str]) -> None:
+        """Insert one LabelingProjectFile row per file id, in fixed-size batches.
+
+        Each row gets a freshly generated UUID string as its ``id``. Rows are
+        sent in groups of at most ``SNAPSHOT_INSERT_BATCH_SIZE`` per INSERT
+        statement. Nothing is executed when ``file_ids`` is empty, and this
+        method does not commit.
+
+        Args:
+            project_id: Value stored in every row's ``project_id`` column.
+            file_ids: File ids to record, one row each.
+        """
+        rows = [
+            {
+                "id": str(uuid.uuid4()),
+                "project_id": project_id,
+                "file_id": file_id,
+            }
+            for file_id in file_ids
+        ]
+        step = self.SNAPSHOT_INSERT_BATCH_SIZE
+        for start in range(0, len(rows), step):
+            chunk = rows[start:start + step]
+            await self.db.execute(insert(LabelingProjectFile).values(chunk))
    
    async def get_mapping_by_source_uuid(
        self,