feat(annotation): 添加标注项目文件快照功能

- 新增 LabelingProjectFile 模型用于存储标注项目的文件快照
- 在创建标注项目时记录关联的文件快照数据
- 更新查询逻辑以基于项目快照过滤文件列表
- 优化导出统计功能使用快照数据进行计算
- 添加数据库表结构支持项目文件快照关系
This commit is contained in:
2026-01-30 18:10:13 +08:00
parent 3c3ca130b3
commit 8b2a19f09a
7 changed files with 145 additions and 33 deletions

View File

@@ -1,13 +1,13 @@
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy import update, func
from sqlalchemy import update, func, insert
from sqlalchemy.orm import aliased
from typing import Optional, List, Tuple
from datetime import datetime
import uuid
from app.core.logging import get_logger
from app.db.models import LabelingProject, AnnotationTemplate, AnnotationResult
from app.db.models import LabelingProject, AnnotationTemplate, AnnotationResult, LabelingProjectFile
from app.db.models.dataset_management import Dataset, DatasetFiles
from app.module.annotation.schema import (
DatasetMappingCreateRequest,
@@ -20,9 +20,11 @@ logger = get_logger(__name__)
class DatasetMappingService:
"""数据集映射服务"""
def __init__(self, db: AsyncSession):
"""Store the async database session used by every query in this service."""
self.db = db
SNAPSHOT_INSERT_BATCH_SIZE = 500
def _build_query_with_dataset_name(self):
"""Build base query with dataset name joined"""
@@ -49,11 +51,14 @@ class DatasetMappingService:
Returns:
(total_count, annotated_count) 元组
"""
# 获取数据集总数据量(统计 ACTIVE 和 COMPLETED 状态的文件)
# 获取标注项目快照数据量(统计快照内的文件)
total_result = await self.db.execute(
select(func.count()).select_from(DatasetFiles).where(
select(func.count())
.select_from(LabelingProjectFile)
.join(DatasetFiles, LabelingProjectFile.file_id == DatasetFiles.id)
.where(
LabelingProjectFile.project_id == project_id,
DatasetFiles.dataset_id == dataset_id,
DatasetFiles.status.in_(["ACTIVE", "COMPLETED"]),
)
)
total_count = int(total_result.scalar() or 0)
@@ -213,6 +218,48 @@ class DatasetMappingService:
logger.debug(f"Mapping created: {labeling_project.id}")
return await self._to_response(labeling_project)
async def create_mapping_with_snapshot(
    self,
    labeling_project: LabelingProject,
    file_ids: List[str],
) -> DatasetMappingResponse:
    """Create a dataset mapping and write its file snapshot.

    The project is added to the session and flushed first, then one
    snapshot row per entry of ``file_ids`` is written through
    ``_insert_snapshot_records``. The commit happens only after the
    snapshot insert, followed by a refresh of the project.

    Args:
        labeling_project: The mapping/project entity to persist.
        file_ids: IDs of the files to record in the snapshot. An empty
            list skips the snapshot insert entirely.

    Returns:
        The response built by ``_to_response`` from the refreshed project.

    Raises:
        RuntimeError: If ``labeling_project.id`` is still unset after the
            flush.
    """
    logger.debug(
        "Create dataset mapping with snapshot: %s -> %s, files=%d",
        labeling_project.dataset_id,
        labeling_project.labeling_project_id,
        len(file_ids),
    )

    self.db.add(labeling_project)
    await self.db.flush()

    # Explicit check rather than `assert`: assertions are stripped when
    # Python runs with -O, which would let snapshot rows be written with
    # an empty project_id.
    if not labeling_project.id:
        raise RuntimeError("labeling_project.id must be set before snapshot insert")

    if file_ids:
        await self._insert_snapshot_records(labeling_project.id, file_ids)

    await self.db.commit()
    await self.db.refresh(labeling_project)

    logger.debug("Mapping created with snapshot: %s", labeling_project.id)
    return await self._to_response(labeling_project)
async def _insert_snapshot_records(self, project_id: str, file_ids: List[str]) -> None:
    """Insert snapshot rows linking ``project_id`` to each file ID.

    Rows are written in multi-row INSERT statements of at most
    ``SNAPSHOT_INSERT_BATCH_SIZE`` entries each. Every row gets a fresh
    UUID4 string as its ``id``. Nothing is executed when ``file_ids`` is
    empty. This method does not commit; the caller owns the transaction.

    Args:
        project_id: ID of the labeling project the snapshot belongs to.
        file_ids: File IDs to record. Repeated IDs are collapsed to their
            first occurrence.
    """
    # De-duplicate while keeping first-seen order, so a file listed twice
    # yields a single snapshot row instead of a duplicate.
    # NOTE(review): presumably (project_id, file_id) is meant to be unique
    # in the snapshot table - confirm against the table definition.
    unique_file_ids = list(dict.fromkeys(file_ids))
    batch_size = self.SNAPSHOT_INSERT_BATCH_SIZE

    for start in range(0, len(unique_file_ids), batch_size):
        # A new list per statement; no buffer is shared or cleared
        # between executions.
        rows = [
            {
                "id": str(uuid.uuid4()),
                "project_id": project_id,
                "file_id": file_id,
            }
            for file_id in unique_file_ids[start:start + batch_size]
        ]
        await self.db.execute(insert(LabelingProjectFile).values(rows))
async def get_mapping_by_source_uuid(
self,