feat: add labeling template. refactor: switch to Poetry, build and deploy of backend Python (#79)

* feat: Enhance annotation module with template management and validation

- Added DatasetMappingCreateRequest and DatasetMappingUpdateRequest schemas to handle dataset mapping requests with camelCase and snake_case support.
- Introduced Annotation Template schemas including CreateAnnotationTemplateRequest, UpdateAnnotationTemplateRequest, and AnnotationTemplateResponse for managing annotation templates.
- Implemented AnnotationTemplateService for creating, updating, retrieving, and deleting annotation templates, including validation of configurations and XML generation.
- Added utility class LabelStudioConfigValidator for validating Label Studio configurations and XML formats.
- Updated database schema for annotation templates and labeling projects to include new fields and constraints.
- Seeded initial annotation templates for various use cases including image classification, object detection, and text classification.

* feat: Enhance TemplateForm with improved validation and dynamic field rendering; update LabelStudio config validation for camelCase support

* feat: Update docker-compose.yml to mark datamate dataset volume and network as external

* feat: Add tag configuration management and related components

- Introduced new components for tag selection and browsing in the frontend.
- Added API endpoint to fetch tag configuration from the backend.
- Implemented tag configuration management in the backend, including loading from YAML.
- Enhanced template service to support dynamic tag rendering based on configuration.
- Updated validation utilities to incorporate tag configuration checks.
- Refactored existing code to utilize the new tag configuration structure.

* feat: Refactor LabelStudioTagConfig for improved configuration loading and validation

* feat: Update Makefile to include backend-python-docker-build in the build process

* feat: Migrate to poetry for better deps management

* Add pyyaml dependency and update Dockerfile to use Poetry for dependency management

- Added pyyaml (>=6.0.3,<7.0.0) to pyproject.toml dependencies.
- Updated Dockerfile to install Poetry and manage dependencies using it.
- Improved layer caching by copying only dependency files before the application code.
- Removed unnecessary installation of build dependencies to keep the final image size small.

* feat: Remove duplicated backend-python-docker-build target from Makefile

* fix: Airflow is not ready to be added yet

* feat: update Python version to 3.12 and remove project installation step in Dockerfile
This commit is contained in:
Jason Wang
2025-11-13 15:32:30 +08:00
committed by GitHub
parent 2660845b74
commit 45743f39f5
40 changed files with 3223 additions and 262 deletions

View File

@@ -1,6 +1,6 @@
from fastapi import APIRouter
from .about import router as about_router
from .config import router as about_router
from .project import router as project_router
from .task import router as task_router
from .template import router as template_router

View File

@@ -1,25 +0,0 @@
from fastapi import APIRouter
from app.module.shared.schema import StandardResponse
from app.core.logging import get_logger
from app.core.config import settings
from ..schema import ConfigResponse
router = APIRouter(
prefix="/about",
tags=["annotation/about"]
)
logger = get_logger(__name__)
@router.get("", response_model=StandardResponse[ConfigResponse])
async def get_config():
    """Return the annotation module's configuration info.

    The payload carries the Label Studio base URL read from the application
    settings, wrapped in the standard response envelope.
    """
    payload = ConfigResponse(label_studio_url=settings.label_studio_base_url)
    return StandardResponse(code=200, message="success", data=payload)

View File

@@ -0,0 +1,47 @@
from fastapi import APIRouter
from app.module.shared.schema import StandardResponse
from app.core.logging import get_logger
from app.core.config import settings
from ..schema import (
ConfigResponse,
TagConfigResponse
)
from ..config.tag_config import LabelStudioTagConfig
router = APIRouter(
prefix="/tags",
tags=["annotation/config"]
)
logger = get_logger(__name__)
@router.get("", response_model=StandardResponse[ConfigResponse])
async def get_config():
    """Return configuration info (deprecated; use /api/annotation/about instead).

    The payload carries the Label Studio base URL read from the application
    settings, wrapped in the standard response envelope.
    """
    payload = ConfigResponse(label_studio_url=settings.label_studio_base_url)
    return StandardResponse(code=200, message="success", data=payload)
@router.get("/config", response_model=StandardResponse[TagConfigResponse], summary="获取标签配置")
async def get_tag_config():
    """Return the configuration of all Label Studio tag types.

    The payload covers both object tags and control tags and is intended for
    dynamic rendering on the frontend.

    Returns:
        A ``StandardResponse`` with code 200 and the loaded configuration, or
        one with code 500 and empty ``objects`` / ``controls`` maps when no
        configuration is available.
    """
    # Instantiating the class is what ensures the configuration is loaded
    # (per the original implementation note). The instance itself is never
    # used, so it is deliberately not bound to a local name.
    LabelStudioTagConfig()
    config = LabelStudioTagConfig._config

    if not config:
        # The failure is reported through the envelope's `code` field; the
        # response is returned normally rather than raised as an exception.
        logger.error("Failed to load tag configuration")
        return StandardResponse(
            code=500,
            message="Failed to load tag configuration",
            data={"objects": {}, "controls": {}}
        )

    return StandardResponse(code=200, message="success", data=config)

View File

@@ -2,7 +2,7 @@ from typing import Optional
import math
import uuid
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi import APIRouter, Depends, HTTPException, Query, Path
from sqlalchemy.ext.asyncio import AsyncSession
from app.db.session import get_db
@@ -149,7 +149,7 @@ async def create_mapping(
@router.get("", response_model=StandardResponse[PaginatedData[DatasetMappingResponse]])
async def list_mappings(
page: int = Query(1, ge=1, description="页码(从1开始)"),
page_size: int = Query(20, ge=1, le=100, description="每页记录数"),
page_size: int = Query(20, ge=1, le=100, description="每页记录数", alias="pageSize"),
db: AsyncSession = Depends(get_db)
):
"""
@@ -163,8 +163,6 @@ async def list_mappings(
# 计算 skip
skip = (page - 1) * page_size
logger.info(f"Listing mappings, page={page}, page_size={page_size}")
# 获取数据和总数
mappings, total = await service.get_all_mappings_with_count(
skip=skip,
@@ -183,7 +181,7 @@ async def list_mappings(
content=mappings
)
logger.info(f"Found {len(mappings)} mappings on page {page}, total: {total}")
logger.info(f"List mappings: page={page}, returned {len(mappings)}/{total}")
return StandardResponse(
code=200,
@@ -234,7 +232,7 @@ async def get_mapping(
async def get_mappings_by_source(
dataset_id: str,
page: int = Query(1, ge=1, description="页码(从1开始)"),
page_size: int = Query(20, ge=1, le=100, description="每页记录数"),
page_size: int = Query(20, ge=1, le=100, description="每页记录数", alias="pageSize"),
db: AsyncSession = Depends(get_db)
):
"""
@@ -283,49 +281,30 @@ async def get_mappings_by_source(
logger.error(f"Error getting mappings: {e}")
raise HTTPException(status_code=500, detail="Internal server error")
@router.delete("", response_model=StandardResponse[DeleteDatasetResponse])
@router.delete("/{project_id}", response_model=StandardResponse[DeleteDatasetResponse])
async def delete_mapping(
m: Optional[str] = Query(None, description="映射UUID"),
proj: Optional[str] = Query(None, description="Label Studio项目ID"),
project_id: str = Path(..., description="映射UUID(path param)"),
db: AsyncSession = Depends(get_db)
):
"""
删除映射关系和对应的 Label Studio 项目
可以通过以下任一方式指定要删除的映射:
- m: 映射UUID
- proj: Label Studio项目ID
- 两者都提供(优先使用 m)
通过 path 参数 `project_id` 指定要删除的映射(映射的 UUID)。
此操作会:
1. 删除 Label Studio 中的项目
2. 软删除数据库中的映射记录
"""
try:
# Log incoming request parameters for debugging
logger.debug(f"Delete mapping request received: m={m!r}, proj={proj!r}")
# 至少需要提供一个参数
if not m and not proj:
logger.debug("Missing both 'm' and 'proj' in delete request")
raise HTTPException(
status_code=400,
detail="Either 'm' (mapping UUID) or 'proj' (project ID) must be provided"
)
logger.debug(f"Delete mapping request received: project_id={project_id!r}")
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
token=settings.label_studio_user_token)
service = DatasetMappingService(db)
# 优先使用 mapping_id 查询
if m:
logger.debug(f"Deleting by mapping UUID: {m}")
mapping = await service.get_mapping_by_uuid(m)
# 如果没有提供 m,使用 proj 查询
elif proj:
logger.debug(f"Deleting by project ID: {proj}")
mapping = await service.get_mapping_by_labeling_project_id(proj)
else:
mapping = None
# 使用 mapping UUID 查询映射记录
logger.debug(f"Deleting by mapping UUID: {project_id}")
mapping = await service.get_mapping_by_uuid(project_id)
logger.debug(f"Mapping lookup result: {mapping}")

View File

@@ -1,6 +1,8 @@
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi import APIRouter, Depends, HTTPException, Query, Path
from sqlalchemy.ext.asyncio import AsyncSession
from typing import List, Optional
from typing import List, Optional, Dict, Any
from datetime import datetime
from pydantic import BaseModel, Field, ConfigDict
from app.db.session import get_db
from app.module.shared.schema import StandardResponse
@@ -17,6 +19,10 @@ from ..schema import (
SyncDatasetResponse,
SyncAnnotationsRequest,
SyncAnnotationsResponse,
UpdateFileTagsRequest,
UpdateFileTagsResponse,
UpdateFileTagsRequest,
UpdateFileTagsResponse
)
@@ -32,24 +38,10 @@ async def sync_dataset_content(
db: AsyncSession = Depends(get_db)
):
"""
同步数据集内容(包括文件和标注)
Sync Dataset Content (Files and Annotations)
根据指定的mapping ID,同步DM程序数据集中的内容到Label Studio数据集中。
默认同时同步文件和标注数据。
Args:
request: 同步请求,包含:
- id: 映射ID(mapping UUID)
- batchSize: 批处理大小
- filePriority: 文件同步优先级
- labelPriority: 标签同步优先级
- syncAnnotations: 是否同步标注(默认True)
- annotationDirection: 标注同步方向(默认bidirectional)
- overwrite: 是否允许覆盖DataMate中的标注(默认True)
- overwriteLabelingProject: 是否允许覆盖Label Studio中的标注(默认True)
Returns:
同步结果
"""
try:
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
@@ -123,28 +115,10 @@ async def sync_annotations(
db: AsyncSession = Depends(get_db)
):
"""
仅同步标注结果(支持双向同步)
根据指定mapping ID和同步方向,在DM数据集和Label Studio之间同步标注结果
标注结果存储在数据集文件表的tags字段中,使用简化格式
同步策略:
- 默认为双向同步,基于时间戳自动解决冲突
- overwrite: 控制是否允许用Label Studio的标注覆盖DataMate(基于时间戳比较)
- overwriteLabelingProject: 控制是否允许用DataMate的标注覆盖Label Studio(基于时间戳比较)
- 如果Label Studio标注的updated_at更新,且overwrite=True,则覆盖DataMate
- 如果DataMate标注的updated_at更新,且overwriteLabelingProject=True,则覆盖Label Studio
Args:
request: 同步请求,包含:
- id: 映射ID(mapping UUID)
- batchSize: 批处理大小
- direction: 同步方向 (ls_to_dm/dm_to_ls/bidirectional)
- overwrite: 是否允许覆盖DataMate中的标注(默认True)
- overwriteLabelingProject: 是否允许覆盖Label Studio中的标注(默认True)
Returns:
同步结果,包含同步统计信息和冲突解决情况
Sync Annotations Only (Bidirectional Support)
同步指定 mapping 下的标注数据,支持单向或双向同步,基于时间戳自动解决冲突
请求与响应由 Pydantic 模型 `SyncAnnotationsRequest` / `SyncAnnotationsResponse` 定义
"""
try:
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
@@ -207,9 +181,9 @@ async def sync_annotations(
@router.get("/check-ls-connection")
async def check_label_studio_connection():
"""
检查Label Studio连接状态
用于诊断Label Studio连接问题,返回连接状态和配置信息
Check Label Studio Connection Status
诊断 Label Studio 连接并返回简要连接信息(状态、base URL、token 摘要、项目统计)。
"""
try:
ls_client = LabelStudioClient(
@@ -258,4 +232,55 @@ async def check_label_studio_connection():
)
except Exception as e:
logger.error(f"Error checking Label Studio connection: {e}")
raise HTTPException(status_code=500, detail=str(e))
raise HTTPException(status_code=500, detail=str(e))
@router.put(
    "/{file_id}",
    response_model=StandardResponse[UpdateFileTagsResponse],
)
async def update_file_tags(
    request: UpdateFileTagsRequest,
    file_id: str = Path(..., description="文件ID"),
    db: AsyncSession = Depends(get_db)
):
    """
    Update File Tags (Partial Update)

    Hands the submitted tags to `update_file_tags_partial` for the given file,
    then re-reads the file record and returns its full tag list. Per the
    original endpoint description, only the submitted tags are modified, the
    rest are left unchanged, and `tags_updated_at` is refreshed.

    Request and response bodies are defined by the Pydantic models
    `UpdateFileTagsRequest` / `UpdateFileTagsResponse`.

    Raises:
        HTTPException: 404 when the service error message contains
            "not found" or the file record cannot be re-read; 500 for any
            other service failure.
    """
    dm_service = DatasetManagementService(db)
    ok, failure_reason, tags_updated_at = await dm_service.update_file_tags_partial(
        file_id=file_id,
        new_tags=request.tags
    )

    if not ok:
        # The service reports failures as text, so "not found" is detected by
        # a case-insensitive substring match on the message.
        reason_text = (failure_reason or "").lower()
        if "not found" in reason_text:
            raise HTTPException(status_code=404, detail=failure_reason)
        raise HTTPException(status_code=500, detail=failure_reason or "更新标签失败")

    # Fetch the complete tag list as stored after the update.
    from sqlalchemy.future import select
    from app.db.models import DatasetFiles

    lookup = select(DatasetFiles).where(DatasetFiles.id == file_id)
    refreshed_file = (await db.execute(lookup)).scalar_one_or_none()
    if not refreshed_file:
        raise HTTPException(status_code=404, detail=f"File not found: {file_id}")

    return StandardResponse(
        code=200,
        message="标签更新成功",
        data=UpdateFileTagsResponse(
            fileId=file_id,
            tags=refreshed_file.tags or [],  # type: ignore
            # Fall back to the current time when the service returned no timestamp.
            tagsUpdatedAt=tags_updated_at or datetime.now()
        )
    )

View File

@@ -7,7 +7,7 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.db.session import get_db
from app.module.shared.schema import StandardResponse
from app.module.annotation.schema.template import (
from app.module.annotation.schema import (
CreateAnnotationTemplateRequest,
UpdateAnnotationTemplateRequest,
AnnotationTemplateResponse,
@@ -15,7 +15,7 @@ from app.module.annotation.schema.template import (
)
from app.module.annotation.service.template import AnnotationTemplateService
router = APIRouter(prefix="/templates", tags=["Annotation Template"])
router = APIRouter(prefix="/template", tags=["annotation/template"])
template_service = AnnotationTemplateService()
@@ -23,7 +23,6 @@ template_service = AnnotationTemplateService()
@router.post(
"",
response_model=StandardResponse[AnnotationTemplateResponse],
summary="创建标注模板"
)
async def create_template(
request: CreateAnnotationTemplateRequest,
@@ -47,7 +46,6 @@ async def create_template(
@router.get(
"/{template_id}",
response_model=StandardResponse[AnnotationTemplateResponse],
summary="获取模板详情"
)
async def get_template(
template_id: str,
@@ -65,9 +63,8 @@ async def get_template(
@router.get(
"",
response_model=StandardResponse[AnnotationTemplateListResponse],
summary="获取模板列表"
)
async def list_templates(
async def list_template(
page: int = Query(1, ge=1, description="页码"),
size: int = Query(10, ge=1, le=100, description="每页大小"),
category: Optional[str] = Query(None, description="分类筛选"),
@@ -101,7 +98,6 @@ async def list_templates(
@router.put(
"/{template_id}",
response_model=StandardResponse[AnnotationTemplateResponse],
summary="更新模板"
)
async def update_template(
template_id: str,
@@ -122,7 +118,6 @@ async def update_template(
@router.delete(
"/{template_id}",
response_model=StandardResponse[bool],
summary="删除模板"
)
async def delete_template(
template_id: str,