LSF

2026-01-07 00:00:16 +08:00
parent 7d4dcb756b
commit d5b75fee0d
14 changed files with 1267 additions and 124 deletions
@@ -2,9 +2,10 @@ from fastapi import APIRouter

 from .config import router as about_router
 from .project import router as project_router
-from .task import router as task_router
-from .template import router as template_router
-from .auto import router as auto_router
+from .task import router as task_router
+from .template import router as template_router
+from .auto import router as auto_router
+from .editor import router as editor_router

 router = APIRouter(
    prefix="/annotation",
@@ -13,6 +14,7 @@ router = APIRouter(

 router.include_router(about_router)
 router.include_router(project_router)
-router.include_router(task_router)
-router.include_router(template_router)
-router.include_router(auto_router)
+router.include_router(task_router)
+router.include_router(template_router)
+router.include_router(auto_router)
+router.include_router(editor_router)
@@ -0,0 +1,90 @@
+"""
+Label Studio Editor（前端嵌入式）接口
+
+说明：
+- 不依赖 Label Studio Server；仅复用其“编辑器”前端库
+- DataMate 负责提供 tasks/annotations 数据与保存能力
+- 当前为 TEXT POC：只支持 dataset_type=TEXT 的项目
+"""
+
+from __future__ import annotations
+
+from fastapi import APIRouter, Depends, Query, Path
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.core.logging import get_logger
+from app.db.session import get_db
+from app.module.annotation.schema.editor import (
+    EditorProjectInfo,
+    EditorTaskListResponse,
+    EditorTaskResponse,
+    UpsertAnnotationRequest,
+    UpsertAnnotationResponse,
+)
+from app.module.annotation.service.editor import AnnotationEditorService
+from app.module.shared.schema import StandardResponse
+
+logger = get_logger(__name__)
+
+router = APIRouter(
+    prefix="/editor",
+    tags=["annotation/editor"],
+)
+
+
+@router.get(
+    "/projects/{project_id}",
+    response_model=StandardResponse[EditorProjectInfo],
+)
+async def get_editor_project_info(
+    project_id: str = Path(..., description="标注项目ID（t_dm_labeling_projects.id）"),
+    db: AsyncSession = Depends(get_db),
+):
+    service = AnnotationEditorService(db)
+    info = await service.get_project_info(project_id)
+    return StandardResponse(code=200, message="success", data=info)
+
+
+@router.get(
+    "/projects/{project_id}/tasks",
+    response_model=StandardResponse[EditorTaskListResponse],
+)
+async def list_editor_tasks(
+    project_id: str = Path(..., description="标注项目ID（t_dm_labeling_projects.id）"),
+    page: int = Query(0, ge=0, description="页码（从0开始）"),
+    size: int = Query(50, ge=1, le=200, description="每页大小"),
+    db: AsyncSession = Depends(get_db),
+):
+    service = AnnotationEditorService(db)
+    result = await service.list_tasks(project_id, page=page, size=size)
+    return StandardResponse(code=200, message="success", data=result)
+
+
+@router.get(
+    "/projects/{project_id}/tasks/{file_id}",
+    response_model=StandardResponse[EditorTaskResponse],
+)
+async def get_editor_task(
+    project_id: str = Path(..., description="标注项目ID（t_dm_labeling_projects.id）"),
+    file_id: str = Path(..., description="文件ID（t_dm_dataset_files.id）"),
+    db: AsyncSession = Depends(get_db),
+):
+    service = AnnotationEditorService(db)
+    task = await service.get_task(project_id, file_id)
+    return StandardResponse(code=200, message="success", data=task)
+
+
+@router.put(
+    "/projects/{project_id}/tasks/{file_id}/annotation",
+    response_model=StandardResponse[UpsertAnnotationResponse],
+)
+async def upsert_editor_annotation(
+    request: UpsertAnnotationRequest,
+    project_id: str = Path(..., description="标注项目ID（t_dm_labeling_projects.id）"),
+    file_id: str = Path(..., description="文件ID（t_dm_dataset_files.id）"),
+    db: AsyncSession = Depends(get_db),
+):
+    service = AnnotationEditorService(db)
+    result = await service.upsert_annotation(project_id, file_id, request)
+    return StandardResponse(code=200, message="success", data=result)
+
@@ -30,6 +30,15 @@ from .mapping import (
    DeleteDatasetResponse,
 )

+from .editor import (
+    EditorProjectInfo,
+    EditorTaskListItem,
+    EditorTaskListResponse,
+    EditorTaskResponse,
+    UpsertAnnotationRequest,
+    UpsertAnnotationResponse,
+)
+
 # Rebuild model to resolve forward references
 DatasetMappingResponse.model_rebuild()

@@ -51,4 +60,10 @@ __all__ = [
    "UpdateAnnotationTemplateRequest",
    "AnnotationTemplateResponse",
    "AnnotationTemplateListResponse",
-]
+    "EditorProjectInfo",
+    "EditorTaskListItem",
+    "EditorTaskListResponse",
+    "EditorTaskResponse",
+    "UpsertAnnotationRequest",
+    "UpsertAnnotationResponse",
+]
@@ -0,0 +1,83 @@
+"""
+标注编辑器（Label Studio Editor）接口模型
+
+设计目标：
+- 单人单份最终标签：每个 project_id + file_id 只维护 1 条最终标注结果
+- 完全兼容 Label Studio：标注结果以 annotation 原始 JSON 形式存储与返回
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, Field, ConfigDict
+
+
+class EditorProjectInfo(BaseModel):
+    """编辑器项目元信息"""
+
+    project_id: str = Field(..., alias="projectId", description="DataMate 标注项目ID（t_dm_labeling_projects.id）")
+    dataset_id: str = Field(..., alias="datasetId", description="数据集ID（t_dm_datasets.id）")
+    template_id: Optional[str] = Field(None, alias="templateId", description="模板ID（t_dm_annotation_templates.id）")
+    label_config: Optional[str] = Field(None, alias="labelConfig", description="Label Studio XML 配置")
+    supported: bool = Field(..., description="当前数据类型是否支持内嵌编辑器")
+    unsupported_reason: Optional[str] = Field(None, alias="unsupportedReason", description="不支持原因（当 supported=false）")
+
+    model_config = ConfigDict(populate_by_name=True)
+
+
+class EditorTaskListItem(BaseModel):
+    """编辑器任务列表条目（对应一个数据集文件）"""
+
+    file_id: str = Field(..., alias="fileId", description="文件ID")
+    file_name: str = Field(..., alias="fileName", description="文件名")
+    file_type: Optional[str] = Field(None, alias="fileType", description="文件类型")
+    has_annotation: bool = Field(..., alias="hasAnnotation", description="是否已有最终标注")
+    annotation_updated_at: Optional[datetime] = Field(None, alias="annotationUpdatedAt", description="标注更新时间")
+
+    model_config = ConfigDict(populate_by_name=True)
+
+
+class EditorTaskListResponse(BaseModel):
+    """编辑器任务列表响应"""
+
+    content: List[EditorTaskListItem] = Field(..., description="任务列表")
+    total_elements: int = Field(..., alias="totalElements", description="总条数")
+    total_pages: int = Field(..., alias="totalPages", description="总页数")
+    page: int = Field(..., description="页码（从0开始）")
+    size: int = Field(..., description="每页大小")
+
+    model_config = ConfigDict(populate_by_name=True)
+
+
+class EditorTaskResponse(BaseModel):
+    """编辑器任务详情（可直接喂给 Label Studio Editor 的 task 对象）"""
+
+    task: Dict[str, Any] = Field(..., description="Label Studio task 对象")
+    annotation_updated_at: Optional[datetime] = Field(None, alias="annotationUpdatedAt", description="标注更新时间")
+
+    model_config = ConfigDict(populate_by_name=True)
+
+
+class UpsertAnnotationRequest(BaseModel):
+    """保存/覆盖最终标注（Label Studio annotation 原始对象）"""
+
+    annotation: Dict[str, Any] = Field(..., description="Label Studio annotation 对象（包含 result 等）")
+    expected_updated_at: Optional[datetime] = Field(
+        None,
+        alias="expectedUpdatedAt",
+        description="乐观锁：若提供则要求与当前记录 updated_at 一致，否则返回 409",
+    )
+
+    model_config = ConfigDict(populate_by_name=True)
+
+
+class UpsertAnnotationResponse(BaseModel):
+    """保存/覆盖最终标注响应"""
+
+    annotation_id: str = Field(..., alias="annotationId", description="标注结果ID（t_dm_annotation_results.id）")
+    updated_at: datetime = Field(..., alias="updatedAt", description="标注更新时间")
+
+    model_config = ConfigDict(populate_by_name=True)
+
@@ -0,0 +1,295 @@
+"""
+标注编辑器（Label Studio Editor）服务
+
+职责：
+- 解析 DataMate 标注项目（t_dm_labeling_projects）
+- 以“文件下载/预览接口”读取文本内容，构造 Label Studio task
+- 以原始 annotation JSON 形式 upsert 最终标注结果（单人单份）
+"""
+
+from __future__ import annotations
+
+import uuid
+from datetime import datetime
+from typing import Any, Dict, List, Optional, Tuple
+
+import httpx
+from fastapi import HTTPException
+from sqlalchemy import func, select
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.core.config import settings
+from app.core.logging import get_logger
+from app.db.models import AnnotationResult, Dataset, DatasetFiles, LabelingProject
+from app.module.annotation.schema.editor import (
+    EditorProjectInfo,
+    EditorTaskListItem,
+    EditorTaskListResponse,
+    EditorTaskResponse,
+    UpsertAnnotationRequest,
+    UpsertAnnotationResponse,
+)
+from app.module.annotation.service.template import AnnotationTemplateService
+
+logger = get_logger(__name__)
+
+
+class AnnotationEditorService:
+    """Label Studio Editor 集成服务（TEXT POC 版）"""
+
+    def __init__(self, db: AsyncSession):
+        self.db = db
+        self.template_service = AnnotationTemplateService()
+
+    async def _get_project_or_404(self, project_id: str) -> LabelingProject:
+        result = await self.db.execute(
+            select(LabelingProject).where(
+                LabelingProject.id == project_id,
+                LabelingProject.deleted_at.is_(None),
+            )
+        )
+        project = result.scalar_one_or_none()
+        if not project:
+            raise HTTPException(status_code=404, detail=f"标注项目不存在: {project_id}")
+        return project
+
+    async def _get_dataset_type(self, dataset_id: str) -> Optional[str]:
+        result = await self.db.execute(
+            select(Dataset.dataset_type).where(Dataset.id == dataset_id)
+        )
+        return result.scalar_one_or_none()
+
+    async def _get_label_config(self, template_id: Optional[str]) -> Optional[str]:
+        if not template_id:
+            return None
+        template = await self.template_service.get_template(self.db, template_id)
+        return getattr(template, "label_config", None) if template else None
+
+    async def get_project_info(self, project_id: str) -> EditorProjectInfo:
+        project = await self._get_project_or_404(project_id)
+
+        dataset_type = await self._get_dataset_type(project.dataset_id)
+        supported = (dataset_type or "").upper() == "TEXT"
+        unsupported_reason = None
+        if not supported:
+            unsupported_reason = f"当前仅支持 TEXT，项目数据类型为: {dataset_type or 'UNKNOWN'}"
+
+        label_config = await self._get_label_config(project.template_id)
+
+        return EditorProjectInfo(
+            projectId=project.id,
+            datasetId=project.dataset_id,
+            templateId=project.template_id,
+            labelConfig=label_config,
+            supported=supported,
+            unsupportedReason=unsupported_reason,
+        )
+
+    async def list_tasks(self, project_id: str, page: int = 0, size: int = 50) -> EditorTaskListResponse:
+        project = await self._get_project_or_404(project_id)
+
+        count_result = await self.db.execute(
+            select(func.count()).select_from(DatasetFiles).where(
+                DatasetFiles.dataset_id == project.dataset_id
+            )
+        )
+        total = int(count_result.scalar() or 0)
+
+        files_result = await self.db.execute(
+            select(DatasetFiles)
+            .where(DatasetFiles.dataset_id == project.dataset_id)
+            .order_by(DatasetFiles.created_at.desc())
+            .offset(page * size)
+            .limit(size)
+        )
+        files = files_result.scalars().all()
+
+        file_ids = [str(f.id) for f in files]  # type: ignore[arg-type]
+        updated_map: Dict[str, datetime] = {}
+        if file_ids:
+            ann_result = await self.db.execute(
+                select(AnnotationResult.file_id, AnnotationResult.updated_at).where(
+                    AnnotationResult.project_id == project_id,
+                    AnnotationResult.file_id.in_(file_ids),
+                )
+            )
+            for file_id, updated_at in ann_result.all():
+                if file_id and updated_at:
+                    updated_map[str(file_id)] = updated_at
+
+        items: List[EditorTaskListItem] = []
+        for f in files:
+            fid = str(f.id)  # type: ignore[arg-type]
+            items.append(
+                EditorTaskListItem(
+                    fileId=fid,
+                    fileName=str(getattr(f, "file_name", "")),
+                    fileType=getattr(f, "file_type", None),
+                    hasAnnotation=fid in updated_map,
+                    annotationUpdatedAt=updated_map.get(fid),
+                )
+            )
+
+        total_pages = (total + size - 1) // size if size > 0 else 0
+        return EditorTaskListResponse(
+            content=items,
+            totalElements=total,
+            totalPages=total_pages,
+            page=page,
+            size=size,
+        )
+
+    async def _fetch_text_content_via_download_api(self, dataset_id: str, file_id: str) -> str:
+        base = settings.datamate_backend_base_url.rstrip("/")
+        url = f"{base}/data-management/datasets/{dataset_id}/files/{file_id}/download"
+
+        try:
+            async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
+                resp = await client.get(url)
+                resp.raise_for_status()
+
+                content_length = resp.headers.get("content-length")
+                if content_length:
+                    try:
+                        if int(content_length) > settings.editor_max_text_bytes:
+                            raise HTTPException(
+                                status_code=413,
+                                detail=f"文本文件过大，限制 {settings.editor_max_text_bytes} 字节",
+                            )
+                    except ValueError:
+                        # content-length 非法则忽略，走实际长度判断
+                        pass
+
+                data = resp.content
+                if len(data) > settings.editor_max_text_bytes:
+                    raise HTTPException(
+                        status_code=413,
+                        detail=f"文本文件过大，限制 {settings.editor_max_text_bytes} 字节",
+                    )
+
+                # TEXT POC：默认按 UTF-8 解码，不可解码字符用替换符处理
+                return data.decode("utf-8", errors="replace")
+
+        except HTTPException:
+            raise
+        except httpx.HTTPStatusError as e:
+            logger.error(f"读取文本失败: dataset={dataset_id}, file={file_id}, http={e.response.status_code}")
+            raise HTTPException(status_code=502, detail="读取文本失败（下载接口返回错误）")
+        except Exception as e:
+            logger.error(f"读取文本失败: dataset={dataset_id}, file={file_id}, err={e}")
+            raise HTTPException(status_code=502, detail="读取文本失败（下载接口调用异常）")
+
+    async def get_task(self, project_id: str, file_id: str) -> EditorTaskResponse:
+        project = await self._get_project_or_404(project_id)
+
+        # TEXT 支持校验
+        dataset_type = await self._get_dataset_type(project.dataset_id)
+        if (dataset_type or "").upper() != "TEXT":
+            raise HTTPException(status_code=400, detail="当前仅支持 TEXT 项目的内嵌编辑器")
+
+        file_result = await self.db.execute(
+            select(DatasetFiles).where(
+                DatasetFiles.id == file_id,
+                DatasetFiles.dataset_id == project.dataset_id,
+            )
+        )
+        file_record = file_result.scalar_one_or_none()
+        if not file_record:
+            raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}")
+
+        text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)
+
+        ann_result = await self.db.execute(
+            select(AnnotationResult).where(
+                AnnotationResult.project_id == project_id,
+                AnnotationResult.file_id == file_id,
+            )
+        )
+        ann = ann_result.scalar_one_or_none()
+
+        task: Dict[str, Any] = {
+            "id": file_id,
+            "data": {
+                "text": text_content,
+                "file_id": file_id,
+                "dataset_id": project.dataset_id,
+                "file_name": getattr(file_record, "file_name", ""),
+            },
+            "annotations": [],
+        }
+
+        annotation_updated_at = None
+        if ann:
+            annotation_updated_at = ann.updated_at
+            # 直接返回存储的 annotation 原始对象（Label Studio 兼容）
+            task["annotations"] = [ann.annotation]
+
+        return EditorTaskResponse(
+            task=task,
+            annotationUpdatedAt=annotation_updated_at,
+        )
+
+    async def upsert_annotation(self, project_id: str, file_id: str, request: UpsertAnnotationRequest) -> UpsertAnnotationResponse:
+        project = await self._get_project_or_404(project_id)
+
+        # 校验文件归属
+        file_check = await self.db.execute(
+            select(DatasetFiles.id).where(
+                DatasetFiles.id == file_id,
+                DatasetFiles.dataset_id == project.dataset_id,
+            )
+        )
+        if not file_check.scalar_one_or_none():
+            raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}")
+
+        annotation_payload = dict(request.annotation or {})
+        result = annotation_payload.get("result")
+        if not isinstance(result, list):
+            raise HTTPException(status_code=400, detail="annotation.result 必须为数组")
+
+        existing_result = await self.db.execute(
+            select(AnnotationResult).where(
+                AnnotationResult.project_id == project_id,
+                AnnotationResult.file_id == file_id,
+            )
+        )
+        existing = existing_result.scalar_one_or_none()
+
+        now = datetime.utcnow()
+
+        if existing:
+            if request.expected_updated_at and existing.updated_at:
+                if existing.updated_at != request.expected_updated_at.replace(tzinfo=None):
+                    raise HTTPException(status_code=409, detail="标注已被更新，请刷新后重试")
+
+            # 固定 annotation.id 为记录ID，保持稳定
+            annotation_payload["id"] = existing.id
+            existing.annotation = annotation_payload  # type: ignore[assignment]
+            existing.updated_at = now  # type: ignore[assignment]
+            await self.db.commit()
+            await self.db.refresh(existing)
+
+            return UpsertAnnotationResponse(
+                annotationId=existing.id,
+                updatedAt=existing.updated_at or now,
+            )
+
+        new_id = str(uuid.uuid4())
+        annotation_payload["id"] = new_id
+        record = AnnotationResult(
+            id=new_id,
+            project_id=project_id,
+            file_id=file_id,
+            annotation=annotation_payload,
+            created_at=now,
+            updated_at=now,
+        )
+        self.db.add(record)
+        await self.db.commit()
+        await self.db.refresh(record)
+
+        return UpsertAnnotationResponse(
+            annotationId=record.id,
+            updatedAt=record.updated_at or now,
+        )
+