You've already forked DataMate
LSF
This commit is contained in:
@@ -2,9 +2,10 @@ from fastapi import APIRouter
|
||||
|
||||
from .config import router as about_router
|
||||
from .project import router as project_router
|
||||
from .task import router as task_router
|
||||
from .template import router as template_router
|
||||
from .auto import router as auto_router
|
||||
from .task import router as task_router
|
||||
from .template import router as template_router
|
||||
from .auto import router as auto_router
|
||||
from .editor import router as editor_router
|
||||
|
||||
router = APIRouter(
|
||||
prefix="/annotation",
|
||||
@@ -13,6 +14,7 @@ router = APIRouter(
|
||||
|
||||
router.include_router(about_router)
|
||||
router.include_router(project_router)
|
||||
router.include_router(task_router)
|
||||
router.include_router(template_router)
|
||||
router.include_router(auto_router)
|
||||
router.include_router(task_router)
|
||||
router.include_router(template_router)
|
||||
router.include_router(auto_router)
|
||||
router.include_router(editor_router)
|
||||
|
||||
@@ -0,0 +1,90 @@
|
||||
"""
|
||||
Label Studio Editor(前端嵌入式)接口
|
||||
|
||||
说明:
|
||||
- 不依赖 Label Studio Server;仅复用其“编辑器”前端库
|
||||
- DataMate 负责提供 tasks/annotations 数据与保存能力
|
||||
- 当前为 TEXT POC:只支持 dataset_type=TEXT 的项目
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, Depends, Query, Path
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.core.logging import get_logger
|
||||
from app.db.session import get_db
|
||||
from app.module.annotation.schema.editor import (
|
||||
EditorProjectInfo,
|
||||
EditorTaskListResponse,
|
||||
EditorTaskResponse,
|
||||
UpsertAnnotationRequest,
|
||||
UpsertAnnotationResponse,
|
||||
)
|
||||
from app.module.annotation.service.editor import AnnotationEditorService
|
||||
from app.module.shared.schema import StandardResponse
|
||||
|
||||
# Module-level logger named after this module.
logger = get_logger(__name__)

# Sub-router holding every embedded-editor endpoint; all routes below are
# registered under the "/editor" prefix and tagged "annotation/editor".
router = APIRouter(prefix="/editor", tags=["annotation/editor"])
|
||||
|
||||
|
||||
# GET /projects/{project_id}: return the editor metadata of one labeling project,
# wrapped in the standard response envelope.
# NOTE: documented with comments rather than a docstring so that the OpenAPI
# description FastAPI derives from handler docstrings stays unchanged.
@router.get("/projects/{project_id}", response_model=StandardResponse[EditorProjectInfo])
async def get_editor_project_info(
    project_id: str = Path(..., description="标注项目ID(t_dm_labeling_projects.id)"),
    db: AsyncSession = Depends(get_db),
):
    # The service is bound to the request-scoped DB session.
    project_info = await AnnotationEditorService(db).get_project_info(project_id)
    return StandardResponse(code=200, message="success", data=project_info)
|
||||
|
||||
|
||||
# GET /projects/{project_id}/tasks: return one page of editor tasks for a project.
# `page` is zero-based (>= 0); `size` is constrained to 1..200 and defaults to 50.
# NOTE: documented with comments rather than a docstring so that the OpenAPI
# description FastAPI derives from handler docstrings stays unchanged.
@router.get("/projects/{project_id}/tasks", response_model=StandardResponse[EditorTaskListResponse])
async def list_editor_tasks(
    project_id: str = Path(..., description="标注项目ID(t_dm_labeling_projects.id)"),
    page: int = Query(0, ge=0, description="页码(从0开始)"),
    size: int = Query(50, ge=1, le=200, description="每页大小"),
    db: AsyncSession = Depends(get_db),
):
    editor_service = AnnotationEditorService(db)
    task_page = await editor_service.list_tasks(project_id, page=page, size=size)
    return StandardResponse(code=200, message="success", data=task_page)
|
||||
|
||||
|
||||
# GET /projects/{project_id}/tasks/{file_id}: return the editor task built for
# one dataset file of the project, wrapped in the standard response envelope.
# NOTE: documented with comments rather than a docstring so that the OpenAPI
# description FastAPI derives from handler docstrings stays unchanged.
@router.get("/projects/{project_id}/tasks/{file_id}", response_model=StandardResponse[EditorTaskResponse])
async def get_editor_task(
    project_id: str = Path(..., description="标注项目ID(t_dm_labeling_projects.id)"),
    file_id: str = Path(..., description="文件ID(t_dm_dataset_files.id)"),
    db: AsyncSession = Depends(get_db),
):
    editor_service = AnnotationEditorService(db)
    editor_task = await editor_service.get_task(project_id, file_id)
    return StandardResponse(code=200, message="success", data=editor_task)
|
||||
|
||||
|
||||
# PUT /projects/{project_id}/tasks/{file_id}/annotation: create or overwrite the
# annotation stored for one file of the project. The request body is validated
# as UpsertAnnotationRequest and handed to the service unchanged.
# NOTE: documented with comments rather than a docstring so that the OpenAPI
# description FastAPI derives from handler docstrings stays unchanged.
@router.put(
    "/projects/{project_id}/tasks/{file_id}/annotation",
    response_model=StandardResponse[UpsertAnnotationResponse],
)
async def upsert_editor_annotation(
    request: UpsertAnnotationRequest,
    project_id: str = Path(..., description="标注项目ID(t_dm_labeling_projects.id)"),
    file_id: str = Path(..., description="文件ID(t_dm_dataset_files.id)"),
    db: AsyncSession = Depends(get_db),
):
    editor_service = AnnotationEditorService(db)
    saved = await editor_service.upsert_annotation(project_id, file_id, request)
    return StandardResponse(code=200, message="success", data=saved)
|
||||
|
||||
@@ -30,6 +30,15 @@ from .mapping import (
|
||||
DeleteDatasetResponse,
|
||||
)
|
||||
|
||||
from .editor import (
|
||||
EditorProjectInfo,
|
||||
EditorTaskListItem,
|
||||
EditorTaskListResponse,
|
||||
EditorTaskResponse,
|
||||
UpsertAnnotationRequest,
|
||||
UpsertAnnotationResponse,
|
||||
)
|
||||
|
||||
# Rebuild model to resolve forward references
|
||||
DatasetMappingResponse.model_rebuild()
|
||||
|
||||
@@ -51,4 +60,10 @@ __all__ = [
|
||||
"UpdateAnnotationTemplateRequest",
|
||||
"AnnotationTemplateResponse",
|
||||
"AnnotationTemplateListResponse",
|
||||
]
|
||||
"EditorProjectInfo",
|
||||
"EditorTaskListItem",
|
||||
"EditorTaskListResponse",
|
||||
"EditorTaskResponse",
|
||||
"UpsertAnnotationRequest",
|
||||
"UpsertAnnotationResponse",
|
||||
]
|
||||
|
||||
@@ -0,0 +1,83 @@
|
||||
"""
|
||||
标注编辑器(Label Studio Editor)接口模型
|
||||
|
||||
设计目标:
|
||||
- 单人单份最终标签:每个 project_id + file_id 只维护 1 条最终标注结果
|
||||
- 完全兼容 Label Studio:标注结果以 annotation 原始 JSON 形式存储与返回
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from pydantic import BaseModel, Field, ConfigDict
|
||||
|
||||
|
||||
class EditorProjectInfo(BaseModel):
    """编辑器项目元信息"""

    # Editor project metadata. The docstring and every `description` string are
    # kept verbatim because pydantic emits them in the generated JSON schema.

    # Allow population by field name in addition to the camelCase aliases.
    model_config = ConfigDict(populate_by_name=True)

    # DataMate labeling project id.
    project_id: str = Field(
        ...,
        alias="projectId",
        description="DataMate 标注项目ID(t_dm_labeling_projects.id)",
    )
    # Id of the dataset the project annotates.
    dataset_id: str = Field(
        ...,
        alias="datasetId",
        description="数据集ID(t_dm_datasets.id)",
    )
    # Annotation template id; optional.
    template_id: Optional[str] = Field(
        None,
        alias="templateId",
        description="模板ID(t_dm_annotation_templates.id)",
    )
    # Label Studio XML configuration; optional.
    label_config: Optional[str] = Field(
        None,
        alias="labelConfig",
        description="Label Studio XML 配置",
    )
    # Whether the embedded editor supports the project's data type.
    supported: bool = Field(
        ...,
        description="当前数据类型是否支持内嵌编辑器",
    )
    # Reason the project is unsupported (meaningful when supported is false).
    unsupported_reason: Optional[str] = Field(
        None,
        alias="unsupportedReason",
        description="不支持原因(当 supported=false)",
    )
|
||||
|
||||
|
||||
class EditorTaskListItem(BaseModel):
    """编辑器任务列表条目(对应一个数据集文件)"""

    # One entry of the editor task list; each entry corresponds to one dataset
    # file. The docstring and every `description` string are kept verbatim
    # because pydantic emits them in the generated JSON schema.

    # Allow population by field name in addition to the camelCase aliases.
    model_config = ConfigDict(populate_by_name=True)

    # File id.
    file_id: str = Field(
        ...,
        alias="fileId",
        description="文件ID",
    )
    # File name.
    file_name: str = Field(
        ...,
        alias="fileName",
        description="文件名",
    )
    # File type; optional.
    file_type: Optional[str] = Field(
        None,
        alias="fileType",
        description="文件类型",
    )
    # Whether a final annotation already exists for the file.
    has_annotation: bool = Field(
        ...,
        alias="hasAnnotation",
        description="是否已有最终标注",
    )
    # Time the annotation was last updated; optional.
    annotation_updated_at: Optional[datetime] = Field(
        None,
        alias="annotationUpdatedAt",
        description="标注更新时间",
    )
|
||||
|
||||
|
||||
class EditorTaskListResponse(BaseModel):
    """编辑器任务列表响应"""

    # Paged editor task list response. The docstring and every `description`
    # string are kept verbatim because pydantic emits them in the generated
    # JSON schema.

    # Allow population by field name in addition to the camelCase aliases.
    model_config = ConfigDict(populate_by_name=True)

    # Tasks on the current page.
    content: List[EditorTaskListItem] = Field(
        ...,
        description="任务列表",
    )
    # Total number of items.
    total_elements: int = Field(
        ...,
        alias="totalElements",
        description="总条数",
    )
    # Total number of pages.
    total_pages: int = Field(
        ...,
        alias="totalPages",
        description="总页数",
    )
    # Zero-based page number.
    page: int = Field(
        ...,
        description="页码(从0开始)",
    )
    # Page size.
    size: int = Field(
        ...,
        description="每页大小",
    )
|
||||
|
||||
|
||||
class EditorTaskResponse(BaseModel):
    """编辑器任务详情(可直接喂给 Label Studio Editor 的 task 对象)"""

    # Editor task detail: a task object intended to be passed straight to the
    # Label Studio Editor. The docstring and every `description` string are kept
    # verbatim because pydantic emits them in the generated JSON schema.

    # Allow population by field name in addition to the camelCase aliases.
    model_config = ConfigDict(populate_by_name=True)

    # Label Studio task object (free-form JSON).
    task: Dict[str, Any] = Field(
        ...,
        description="Label Studio task 对象",
    )
    # Time the annotation was last updated; optional.
    annotation_updated_at: Optional[datetime] = Field(
        None,
        alias="annotationUpdatedAt",
        description="标注更新时间",
    )
|
||||
|
||||
|
||||
class UpsertAnnotationRequest(BaseModel):
    """保存/覆盖最终标注(Label Studio annotation 原始对象)"""

    # Request body for saving or overwriting the final annotation (the raw Label
    # Studio annotation object). The docstring and every `description` string
    # are kept verbatim because pydantic emits them in the generated JSON schema.

    # Allow population by field name in addition to the camelCase aliases.
    model_config = ConfigDict(populate_by_name=True)

    # Label Studio annotation object (contains `result` and other keys).
    annotation: Dict[str, Any] = Field(
        ...,
        description="Label Studio annotation 对象(包含 result 等)",
    )
    # Optimistic-lock token: when provided it must equal the stored record's
    # updated_at, otherwise the save is rejected with 409 (per the description).
    expected_updated_at: Optional[datetime] = Field(
        None,
        alias="expectedUpdatedAt",
        description="乐观锁:若提供则要求与当前记录 updated_at 一致,否则返回 409",
    )
|
||||
|
||||
|
||||
class UpsertAnnotationResponse(BaseModel):
    """保存/覆盖最终标注响应"""

    # Response returned after saving or overwriting the final annotation. The
    # docstring and every `description` string are kept verbatim because
    # pydantic emits them in the generated JSON schema.

    # Allow population by field name in addition to the camelCase aliases.
    model_config = ConfigDict(populate_by_name=True)

    # Id of the stored annotation result record.
    annotation_id: str = Field(
        ...,
        alias="annotationId",
        description="标注结果ID(t_dm_annotation_results.id)",
    )
    # Time the annotation was last updated.
    updated_at: datetime = Field(
        ...,
        alias="updatedAt",
        description="标注更新时间",
    )
|
||||
|
||||
295
runtime/datamate-python/app/module/annotation/service/editor.py
Normal file
295
runtime/datamate-python/app/module/annotation/service/editor.py
Normal file
@@ -0,0 +1,295 @@
|
||||
"""
|
||||
标注编辑器(Label Studio Editor)服务
|
||||
|
||||
职责:
|
||||
- 解析 DataMate 标注项目(t_dm_labeling_projects)
|
||||
- 以“文件下载/预览接口”读取文本内容,构造 Label Studio task
|
||||
- 以原始 annotation JSON 形式 upsert 最终标注结果(单人单份)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
import httpx
|
||||
from fastapi import HTTPException
|
||||
from sqlalchemy import func, select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.core.config import settings
|
||||
from app.core.logging import get_logger
|
||||
from app.db.models import AnnotationResult, Dataset, DatasetFiles, LabelingProject
|
||||
from app.module.annotation.schema.editor import (
|
||||
EditorProjectInfo,
|
||||
EditorTaskListItem,
|
||||
EditorTaskListResponse,
|
||||
EditorTaskResponse,
|
||||
UpsertAnnotationRequest,
|
||||
UpsertAnnotationResponse,
|
||||
)
|
||||
from app.module.annotation.service.template import AnnotationTemplateService
|
||||
|
||||
# Module-level logger named after this module.
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class AnnotationEditorService:
    """Label Studio Editor integration service (TEXT proof-of-concept version).

    Resolves DataMate labeling projects, builds Label Studio task objects from
    dataset files whose text is read through the backend download endpoint, and
    upserts the single annotation record kept per (project_id, file_id).
    """

    def __init__(self, db: AsyncSession):
        """Bind the service to a database session.

        Args:
            db: Async SQLAlchemy session used for every query and commit.
        """
        self.db = db
        self.template_service = AnnotationTemplateService()

    @staticmethod
    def _to_naive_utc(value: datetime) -> datetime:
        """Return ``value`` as a naive datetime expressed in UTC.

        Aware datetimes are converted to UTC before the tzinfo is dropped, so a
        timestamp such as ``08:00+08:00`` compares equal to a stored ``00:00``.
        Naive datetimes are returned unchanged.
        """
        from datetime import timezone

        if value.utcoffset() is None:
            return value
        return value.astimezone(timezone.utc).replace(tzinfo=None)

    @staticmethod
    def _utcnow_naive() -> datetime:
        """Return the current UTC time as a naive datetime.

        Equivalent to the deprecated ``datetime.utcnow()``.
        """
        from datetime import timezone

        return datetime.now(timezone.utc).replace(tzinfo=None)

    @staticmethod
    def _text_too_large_error() -> HTTPException:
        """Build the 413 error raised when a text file exceeds the size limit."""
        return HTTPException(
            status_code=413,
            detail=f"文本文件过大,限制 {settings.editor_max_text_bytes} 字节",
        )

    async def _get_project_or_404(self, project_id: str) -> LabelingProject:
        """Load a non-deleted labeling project.

        Raises:
            HTTPException: 404 when no project with this id exists or it has
                a ``deleted_at`` value.
        """
        result = await self.db.execute(
            select(LabelingProject).where(
                LabelingProject.id == project_id,
                LabelingProject.deleted_at.is_(None),
            )
        )
        project = result.scalar_one_or_none()
        if not project:
            raise HTTPException(status_code=404, detail=f"标注项目不存在: {project_id}")
        return project

    async def _get_dataset_type(self, dataset_id: str) -> Optional[str]:
        """Return the ``dataset_type`` of a dataset, or None when it is not found."""
        result = await self.db.execute(
            select(Dataset.dataset_type).where(Dataset.id == dataset_id)
        )
        return result.scalar_one_or_none()

    async def _get_label_config(self, template_id: Optional[str]) -> Optional[str]:
        """Return the ``label_config`` of a template.

        Returns None when no template id is given, when the template service
        returns nothing, or when the template has no ``label_config`` attribute.
        """
        if not template_id:
            return None
        template = await self.template_service.get_template(self.db, template_id)
        return getattr(template, "label_config", None) if template else None

    async def get_project_info(self, project_id: str) -> EditorProjectInfo:
        """Return editor metadata for a labeling project.

        A project is reported as supported only when its dataset type equals
        "TEXT" (case-insensitive); otherwise ``unsupported_reason`` is filled.

        Raises:
            HTTPException: 404 when the project does not exist.
        """
        project = await self._get_project_or_404(project_id)

        dataset_type = await self._get_dataset_type(project.dataset_id)
        supported = (dataset_type or "").upper() == "TEXT"
        unsupported_reason = None
        if not supported:
            unsupported_reason = f"当前仅支持 TEXT,项目数据类型为: {dataset_type or 'UNKNOWN'}"

        label_config = await self._get_label_config(project.template_id)

        return EditorProjectInfo(
            projectId=project.id,
            datasetId=project.dataset_id,
            templateId=project.template_id,
            labelConfig=label_config,
            supported=supported,
            unsupportedReason=unsupported_reason,
        )

    async def list_tasks(self, project_id: str, page: int = 0, size: int = 50) -> EditorTaskListResponse:
        """Return one page of the project's dataset files as editor tasks.

        Args:
            project_id: Labeling project id.
            page: Zero-based page index.
            size: Page size.

        Raises:
            HTTPException: 404 when the project does not exist.
        """
        project = await self._get_project_or_404(project_id)

        count_result = await self.db.execute(
            select(func.count()).select_from(DatasetFiles).where(
                DatasetFiles.dataset_id == project.dataset_id
            )
        )
        total = int(count_result.scalar() or 0)

        files_result = await self.db.execute(
            select(DatasetFiles)
            .where(DatasetFiles.dataset_id == project.dataset_id)
            # created_at is not unique; the id tiebreaker keeps OFFSET paging
            # from repeating or skipping rows that share a timestamp.
            .order_by(DatasetFiles.created_at.desc(), DatasetFiles.id)
            .offset(page * size)
            .limit(size)
        )
        files = files_result.scalars().all()

        file_ids = [str(f.id) for f in files]  # type: ignore[arg-type]
        # Existence and timestamp are tracked separately so that an annotation
        # row without an updated_at value is still reported as annotated.
        annotated_ids = set()
        updated_map: Dict[str, datetime] = {}
        if file_ids:
            ann_result = await self.db.execute(
                select(AnnotationResult.file_id, AnnotationResult.updated_at).where(
                    AnnotationResult.project_id == project_id,
                    AnnotationResult.file_id.in_(file_ids),
                )
            )
            for ann_file_id, ann_updated_at in ann_result.all():
                if not ann_file_id:
                    continue
                key = str(ann_file_id)
                annotated_ids.add(key)
                if ann_updated_at:
                    updated_map[key] = ann_updated_at

        items: List[EditorTaskListItem] = []
        for f in files:
            fid = str(f.id)  # type: ignore[arg-type]
            items.append(
                EditorTaskListItem(
                    fileId=fid,
                    fileName=str(getattr(f, "file_name", "")),
                    fileType=getattr(f, "file_type", None),
                    hasAnnotation=fid in annotated_ids,
                    annotationUpdatedAt=updated_map.get(fid),
                )
            )

        # Ceiling division; guarded so a non-positive size cannot divide by zero.
        total_pages = (total + size - 1) // size if size > 0 else 0
        return EditorTaskListResponse(
            content=items,
            totalElements=total,
            totalPages=total_pages,
            page=page,
            size=size,
        )

    async def _fetch_text_content_via_download_api(self, dataset_id: str, file_id: str) -> str:
        """Download a dataset file through the backend API and decode it as text.

        The response is streamed so that the ``editor_max_text_bytes`` limit is
        enforced while reading, instead of after the whole body is in memory.

        Returns:
            The file content decoded as UTF-8; undecodable bytes are replaced.

        Raises:
            HTTPException: 413 when the content exceeds the size limit; 502
                when the download endpoint returns an error status or the call
                fails for any other reason.
        """
        base = settings.datamate_backend_base_url.rstrip("/")
        url = f"{base}/data-management/datasets/{dataset_id}/files/{file_id}/download"
        max_bytes = settings.editor_max_text_bytes

        try:
            async with httpx.AsyncClient(timeout=30.0, follow_redirects=True) as client:
                async with client.stream("GET", url) as resp:
                    resp.raise_for_status()

                    # Reject early when the server declares an oversized body.
                    content_length = resp.headers.get("content-length")
                    if content_length:
                        try:
                            declared_size = int(content_length)
                        except ValueError:
                            # Invalid content-length: rely on the actual length below.
                            declared_size = None
                        if declared_size is not None and declared_size > max_bytes:
                            raise self._text_too_large_error()

                    # Stop reading as soon as the limit is exceeded.
                    buffer = bytearray()
                    async for chunk in resp.aiter_bytes():
                        buffer.extend(chunk)
                        if len(buffer) > max_bytes:
                            raise self._text_too_large_error()

            # TEXT POC: decode as UTF-8 and substitute undecodable bytes.
            return buffer.decode("utf-8", errors="replace")

        except HTTPException:
            # Our own 413 must not be converted into a 502 below.
            raise
        except httpx.HTTPStatusError as e:
            logger.error(f"读取文本失败: dataset={dataset_id}, file={file_id}, http={e.response.status_code}")
            raise HTTPException(status_code=502, detail="读取文本失败(下载接口返回错误)") from e
        except Exception as e:
            logger.error(f"读取文本失败: dataset={dataset_id}, file={file_id}, err={e}")
            raise HTTPException(status_code=502, detail="读取文本失败(下载接口调用异常)") from e

    async def get_task(self, project_id: str, file_id: str) -> EditorTaskResponse:
        """Build the Label Studio task for one file of a project.

        The task carries the file text and, when one is stored, the saved
        annotation object as the only element of ``annotations``.

        Raises:
            HTTPException: 404 when the project does not exist or the file is
                not part of the project's dataset; 400 when the dataset type is
                not TEXT; 413/502 from the text download.
        """
        project = await self._get_project_or_404(project_id)

        # Only TEXT datasets are supported by the embedded editor.
        dataset_type = await self._get_dataset_type(project.dataset_id)
        if (dataset_type or "").upper() != "TEXT":
            raise HTTPException(status_code=400, detail="当前仅支持 TEXT 项目的内嵌编辑器")

        file_result = await self.db.execute(
            select(DatasetFiles).where(
                DatasetFiles.id == file_id,
                DatasetFiles.dataset_id == project.dataset_id,
            )
        )
        file_record = file_result.scalar_one_or_none()
        if not file_record:
            raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}")

        text_content = await self._fetch_text_content_via_download_api(project.dataset_id, file_id)

        ann_result = await self.db.execute(
            select(AnnotationResult).where(
                AnnotationResult.project_id == project_id,
                AnnotationResult.file_id == file_id,
            )
        )
        ann = ann_result.scalar_one_or_none()

        task: Dict[str, Any] = {
            "id": file_id,
            "data": {
                "text": text_content,
                "file_id": file_id,
                "dataset_id": project.dataset_id,
                "file_name": getattr(file_record, "file_name", ""),
            },
            "annotations": [],
        }

        annotation_updated_at = None
        if ann:
            annotation_updated_at = ann.updated_at
            # Return the stored annotation object as-is (Label Studio compatible).
            task["annotations"] = [ann.annotation]

        return EditorTaskResponse(
            task=task,
            annotationUpdatedAt=annotation_updated_at,
        )

    async def upsert_annotation(self, project_id: str, file_id: str, request: UpsertAnnotationRequest) -> UpsertAnnotationResponse:
        """Create or overwrite the annotation stored for (project_id, file_id).

        When a record exists and both ``request.expected_updated_at`` and the
        stored ``updated_at`` are set, they must denote the same instant;
        otherwise the update is rejected. Both sides are normalised to naive
        UTC before comparison, so a timezone-aware client value is honoured.

        Raises:
            HTTPException: 404 when the project does not exist or the file is
                not part of the project's dataset; 400 when
                ``annotation.result`` is not a list; 409 when the optimistic
                lock check fails.
        """
        project = await self._get_project_or_404(project_id)

        # Verify the file belongs to the project's dataset.
        file_check = await self.db.execute(
            select(DatasetFiles.id).where(
                DatasetFiles.id == file_id,
                DatasetFiles.dataset_id == project.dataset_id,
            )
        )
        if not file_check.scalar_one_or_none():
            raise HTTPException(status_code=404, detail=f"文件不存在或不属于该项目: {file_id}")

        # Shallow copy so the request object is not mutated when the id is set.
        annotation_payload = dict(request.annotation or {})
        result = annotation_payload.get("result")
        if not isinstance(result, list):
            raise HTTPException(status_code=400, detail="annotation.result 必须为数组")

        existing_result = await self.db.execute(
            select(AnnotationResult).where(
                AnnotationResult.project_id == project_id,
                AnnotationResult.file_id == file_id,
            )
        )
        existing = existing_result.scalar_one_or_none()

        now = self._utcnow_naive()

        if existing:
            if request.expected_updated_at and existing.updated_at:
                expected = self._to_naive_utc(request.expected_updated_at)
                current = self._to_naive_utc(existing.updated_at)
                if current != expected:
                    raise HTTPException(status_code=409, detail="标注已被更新,请刷新后重试")

            # Pin annotation.id to the record id so it stays stable across saves.
            annotation_payload["id"] = existing.id
            existing.annotation = annotation_payload  # type: ignore[assignment]
            existing.updated_at = now  # type: ignore[assignment]
            await self.db.commit()
            await self.db.refresh(existing)

            return UpsertAnnotationResponse(
                annotationId=existing.id,
                updatedAt=existing.updated_at or now,
            )

        new_id = str(uuid.uuid4())
        annotation_payload["id"] = new_id
        record = AnnotationResult(
            id=new_id,
            project_id=project_id,
            file_id=file_id,
            annotation=annotation_payload,
            created_at=now,
            updated_at=now,
        )
        self.db.add(record)
        await self.db.commit()
        await self.db.refresh(record)

        return UpsertAnnotationResponse(
            annotationId=record.id,
            updatedAt=record.updated_at or now,
        )
|
||||
|
||||
Reference in New Issue
Block a user