feat: add labeling template. refactor: switch to Poetry; build and deploy the Python backend (#79)

* feat: Enhance annotation module with template management and validation

- Added DatasetMappingCreateRequest and DatasetMappingUpdateRequest schemas to handle dataset mapping requests with camelCase and snake_case support.
- Introduced Annotation Template schemas including CreateAnnotationTemplateRequest, UpdateAnnotationTemplateRequest, and AnnotationTemplateResponse for managing annotation templates.
- Implemented AnnotationTemplateService for creating, updating, retrieving, and deleting annotation templates, including validation of configurations and XML generation.
- Added utility class LabelStudioConfigValidator for validating Label Studio configurations and XML formats.
- Updated database schema for annotation templates and labeling projects to include new fields and constraints.
- Seeded initial annotation templates for various use cases including image classification, object detection, and text classification.

* feat: Enhance TemplateForm with improved validation and dynamic field rendering; update LabelStudio config validation for camelCase support

* feat: Update docker-compose.yml to mark datamate dataset volume and network as external

* feat: Add tag configuration management and related components

- Introduced new components for tag selection and browsing in the frontend.
- Added API endpoint to fetch tag configuration from the backend.
- Implemented tag configuration management in the backend, including loading from YAML.
- Enhanced template service to support dynamic tag rendering based on configuration.
- Updated validation utilities to incorporate tag configuration checks.
- Refactored existing code to utilize the new tag configuration structure.

* feat: Refactor LabelStudioTagConfig for improved configuration loading and validation

* feat: Update Makefile to include backend-python-docker-build in the build process

* feat: Migrate to Poetry for better dependency management

* Add pyyaml dependency and update Dockerfile to use Poetry for dependency management

- Added pyyaml (>=6.0.3,<7.0.0) to pyproject.toml dependencies.
- Updated Dockerfile to install Poetry and manage dependencies using it.
- Improved layer caching by copying only dependency files before the application code.
- Removed unnecessary installation of build dependencies to keep the final image size small.

* feat: Remove duplicated backend-python-docker-build target from Makefile

* fix: Airflow is not ready to be added yet

* feat: update Python version to 3.12 and remove project installation step in Dockerfile
This commit is contained in:
Jason Wang
2025-11-13 15:32:30 +08:00
committed by GitHub
parent 2660845b74
commit 45743f39f5
40 changed files with 3223 additions and 262 deletions

View File

@@ -83,7 +83,7 @@ class DatasetMappingService:
labeling_project: LabelingProject
) -> DatasetMappingResponse:
"""创建数据集映射"""
logger.info(f"Create dataset mapping: {labeling_project.dataset_id} -> {labeling_project.labeling_project_id}")
logger.debug(f"Create dataset mapping: {labeling_project.dataset_id} -> {labeling_project.labeling_project_id}")
# Use the passed object directly
self.db.add(labeling_project)
@@ -201,7 +201,7 @@ class DatasetMappingService:
)
await self.db.commit()
if result.rowcount > 0:
if result.rowcount and result.rowcount > 0: # type: ignore
return await self.get_mapping_by_uuid(mapping_id)
return None
@@ -219,7 +219,7 @@ class DatasetMappingService:
)
await self.db.commit()
success = result.rowcount > 0
success = result.rowcount and result.rowcount > 0 # type: ignore
if success:
logger.info(f"Mapping soft-deleted: {mapping_id}")
else:

View File

@@ -122,7 +122,7 @@ class SyncService:
return {}
all_tasks = result.get("tasks", [])
logger.info(f"Successfully fetched {len(all_tasks)} tasks")
logger.debug(f"Successfully fetched {len(all_tasks)} tasks")
# 使用字典推导式构建映射
dm_file_to_task_mapping = {
@@ -131,7 +131,7 @@ class SyncService:
if task.get('data', {}).get('file_id') is not None
}
logger.info(f"Found {len(dm_file_to_task_mapping)} existing task mappings")
logger.debug(f"Found {len(dm_file_to_task_mapping)} existing task mappings")
return dm_file_to_task_mapping
except Exception as e:
@@ -163,10 +163,10 @@ class SyncService:
)
if not files_response or not files_response.content:
logger.info(f"No more files on page {page + 1}")
logger.debug(f"No more files on page {page + 1}")
break
logger.info(f"Processing page {page + 1}, {len(files_response.content)} files")
logger.debug(f"Processing page {page + 1}, {len(files_response.content)} files")
# 筛选新文件并构建任务数据
new_tasks = []
@@ -178,7 +178,7 @@ class SyncService:
task_data = self._build_task_data(file_info, dataset_id)
new_tasks.append(task_data)
logger.info(f"Page {page + 1}: {len(new_tasks)} new files, {len(files_response.content) - len(new_tasks)} existing")
logger.debug(f"Page {page + 1}: {len(new_tasks)} new files, {len(files_response.content) - len(new_tasks)} existing")
# 批量创建任务
if new_tasks:
@@ -202,16 +202,16 @@ class SyncService:
deleted_file_ids = set(existing_dm_file_mapping.keys()) - current_file_ids
if not deleted_file_ids:
logger.info("No tasks to delete")
logger.debug("No tasks to delete")
return 0
tasks_to_delete = [existing_dm_file_mapping[fid] for fid in deleted_file_ids]
logger.info(f"Deleting {len(tasks_to_delete)} orphaned tasks")
logger.debug(f"Deleting {len(tasks_to_delete)} orphaned tasks")
delete_result = await self.ls_client.delete_tasks_batch(tasks_to_delete)
deleted_count = delete_result.get("successful", 0)
logger.info(f"Successfully deleted {deleted_count} tasks")
logger.debug(f"Successfully deleted {deleted_count} tasks")
return deleted_count
async def sync_dataset_files(
@@ -229,7 +229,7 @@ class SyncService:
Returns:
同步结果响应
"""
logger.info(f"Start syncing dataset files by mapping: {mapping_id}")
logger.debug(f"Start syncing dataset files by mapping: {mapping_id}")
# 获取映射关系
mapping = await self.mapping_service.get_mapping_by_uuid(mapping_id)
@@ -247,7 +247,7 @@ class SyncService:
# 委托给sync_files执行实际同步
result = await self.sync_files(mapping, batch_size)
logger.info(f"Sync completed: created={result['created']}, deleted={result['deleted']}, total={result['total']}")
logger.info(f"Sync files completed: created={result['created']}, deleted={result['deleted']}, total={result['total']}")
return SyncDatasetResponse(
id=mapping.id,
@@ -342,7 +342,7 @@ class SyncService:
Returns:
同步统计信息: {"created": int, "deleted": int, "total": int}
"""
logger.info(f"Syncing files for dataset {mapping.dataset_id} to project {mapping.labeling_project_id}")
logger.debug(f"Syncing files for dataset {mapping.dataset_id} to project {mapping.labeling_project_id}")
# 获取DM数据集信息
dataset_info = await self.dm_client.get_dataset(mapping.dataset_id)
@@ -350,12 +350,12 @@ class SyncService:
raise NoDatasetInfoFoundError(mapping.dataset_id)
total_files = dataset_info.fileCount
logger.info(f"Total files in DM dataset: {total_files}")
logger.debug(f"Total files in DM dataset: {total_files}")
# 获取Label Studio中已存在的文件映射
existing_dm_file_mapping = await self.get_existing_dm_file_mapping(mapping.labeling_project_id)
existing_file_ids = set(existing_dm_file_mapping.keys())
logger.info(f"{len(existing_file_ids)} tasks already exist in Label Studio")
logger.debug(f"{len(existing_file_ids)} tasks already exist in Label Studio")
# 分页获取DM文件并创建新任务
current_file_ids, created_count = await self._fetch_dm_files_paginated(
@@ -371,7 +371,7 @@ class SyncService:
current_file_ids
)
logger.info(f"File sync completed: total={total_files}, created={created_count}, deleted={deleted_count}")
logger.debug(f"File sync completed: total={total_files}, created={created_count}, deleted={deleted_count}")
return {
"created": created_count,

View File

@@ -17,6 +17,7 @@ from app.module.annotation.schema.template import (
TemplateConfiguration
)
from app.module.annotation.utils.config_validator import LabelStudioConfigValidator
from app.module.annotation.config import LabelStudioTagConfig
class AnnotationTemplateService:
@@ -33,6 +34,7 @@ class AnnotationTemplateService:
Returns:
Label Studio XML字符串
"""
tag_config = LabelStudioTagConfig()
xml_parts = ['<View>']
# 生成对象定义
@@ -56,15 +58,22 @@ class AnnotationTemplateService:
tag_type = label.type.capitalize() if label.type else "Choices"
# 处理带选项的标签类型
# 检查是否需要子元素
if label.options or label.labels:
choices = label.options or label.labels or []
xml_parts.append(f' <{tag_type} {" ".join(label_attrs)}>')
# 从配置获取子元素标签名
child_tag = tag_config.get_child_tag(tag_type)
if not child_tag:
# 默认使用 Label
child_tag = "Label"
for choice in choices:
xml_parts.append(f' <Label value="{choice}"/>')
xml_parts.append(f' <{child_tag} value="{choice}"/>')
xml_parts.append(f' </{tag_type}>')
else:
# 处理简单标签类型
# 处理简单标签类型(不需要子元素)
xml_parts.append(f' <{tag_type} {" ".join(label_attrs)}/>')
xml_parts.append('</View>')