feat: add labeling templates; refactor: switch to Poetry for backend Python build and deploy (#79)

* feat: Enhance annotation module with template management and validation

- Added DatasetMappingCreateRequest and DatasetMappingUpdateRequest schemas to handle dataset mapping requests with both camelCase and snake_case payloads (see the schema sketch after this list).
- Introduced Annotation Template schemas including CreateAnnotationTemplateRequest, UpdateAnnotationTemplateRequest, and AnnotationTemplateResponse for managing annotation templates.
- Implemented AnnotationTemplateService for creating, updating, retrieving, and deleting annotation templates, including validation of configurations and XML generation.
- Added utility class LabelStudioConfigValidator for validating Label Studio configurations and XML formats (see the validator sketch after this list).
- Updated database schema for annotation templates and labeling projects to include new fields and constraints.
- Seeded initial annotation templates for various use cases including image classification, object detection, and text classification.
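As a rough illustration of the dual-casing support, a minimal schema sketch assuming Pydantic v2; the field set is illustrative, not the exact model from this PR:

```python
from pydantic import BaseModel, ConfigDict
from pydantic.alias_generators import to_camel


class DatasetMappingCreateRequest(BaseModel):
    # populate_by_name lets clients send either dataset_id or datasetId.
    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

    dataset_id: str
    labeling_project_id: str | None = None
```

With this config, `DatasetMappingCreateRequest.model_validate({"datasetId": "ds-1"})` and `model_validate({"dataset_id": "ds-1"})` produce the same model.

And a hedged sketch of the well-formedness check, using only the standard library; the real LabelStudioConfigValidator may enforce more (allowed tags, required attributes):

```python
import xml.etree.ElementTree as ET


def is_well_formed_label_config(xml_text: str) -> bool:
    """Return True if the config parses as XML with a <View> root element."""
    try:
        root = ET.fromstring(xml_text)
    except ET.ParseError:
        return False
    return root.tag == "View"
```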

* feat: Enhance TemplateForm with improved validation and dynamic field rendering; update LabelStudio config validation for camelCase support

* feat: Update docker-compose.yml to mark datamate dataset volume and network as external

* feat: Add tag configuration management and related components

- Introduced new components for tag selection and browsing in the frontend.
- Added API endpoint to fetch tag configuration from the backend.
- Implemented tag configuration management in the backend, including loading from YAML (see the loader sketch after this list).
- Enhanced template service to support dynamic tag rendering based on configuration.
- Updated validation utilities to incorporate tag configuration checks.
- Refactored existing code to utilize the new tag configuration structure.
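To make the YAML-backed flow concrete, a minimal sketch of loading the tag configuration and rendering one control tag from it; the file name, keys, and helpers are hypothetical, not the exact code from this PR:

```python
import yaml


def load_tag_config(path: str = "label_studio_tags.yaml") -> dict:
    """Load the Label Studio tag configuration from a YAML file."""
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f) or {}


def render_choices(labels: list[str], to_name: str = "image") -> str:
    """Render a <Choices> control tag for a classification template."""
    choices = "\n".join(f'  <Choice value="{label}"/>' for label in labels)
    return f'<Choices name="choice" toName="{to_name}">\n{choices}\n</Choices>'
```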

* feat: Refactor LabelStudioTagConfig for improved configuration loading and validation

* feat: Update Makefile to include backend-python-docker-build in the build process

* feat: Migrate to Poetry for better dependency management

* Add pyyaml dependency and update Dockerfile to use Poetry for dependency management

- Added pyyaml (>=6.0.3,<7.0.0) to pyproject.toml dependencies.
- Updated Dockerfile to install Poetry and manage dependencies using it.
- Improved layer caching by copying only the dependency files before the application code (see the Dockerfile sketch after this list).
- Removed unnecessary installation of build dependencies to keep the final image size small.
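For illustration, a hedged Dockerfile sketch of the layer-caching pattern these bullets describe; the base image, paths, and flags are assumptions rather than the exact file from this PR (the Python 3.12 base and skipped project install match the later commits below):

```dockerfile
FROM python:3.12-slim

# Install Poetry itself and disable in-container virtualenvs.
RUN pip install --no-cache-dir poetry
ENV POETRY_VIRTUALENVS_CREATE=false

WORKDIR /app

# Copy only the dependency manifests first, so this expensive layer is
# reused until pyproject.toml or poetry.lock actually changes.
COPY pyproject.toml poetry.lock ./
RUN poetry install --no-root --only main

# Copy the application code last; code edits no longer invalidate the
# dependency layer above.
COPY . .
```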

* feat: Remove duplicated backend-python-docker-build target from Makefile

* fix: Airflow is not ready to be added yet

* feat: update Python version to 3.12 and remove the project installation step in the Dockerfile
Author: Jason Wang
Date: 2025-11-13 15:32:30 +08:00
Committed by: GitHub
Parent: 2660845b74
Commit: 45743f39f5
40 changed files with 3223 additions and 262 deletions


@@ -122,7 +122,7 @@ class SyncService:
                 return {}
             all_tasks = result.get("tasks", [])
-            logger.info(f"Successfully fetched {len(all_tasks)} tasks")
+            logger.debug(f"Successfully fetched {len(all_tasks)} tasks")
             # Build the mapping using a dict comprehension
             dm_file_to_task_mapping = {
@@ -131,7 +131,7 @@ class SyncService:
                 if task.get('data', {}).get('file_id') is not None
             }
-            logger.info(f"Found {len(dm_file_to_task_mapping)} existing task mappings")
+            logger.debug(f"Found {len(dm_file_to_task_mapping)} existing task mappings")
             return dm_file_to_task_mapping
         except Exception as e:
@@ -163,10 +163,10 @@ class SyncService:
             )
             if not files_response or not files_response.content:
-                logger.info(f"No more files on page {page + 1}")
+                logger.debug(f"No more files on page {page + 1}")
                 break
-            logger.info(f"Processing page {page + 1}, {len(files_response.content)} files")
+            logger.debug(f"Processing page {page + 1}, {len(files_response.content)} files")
             # Filter new files and build the task payloads
             new_tasks = []
@@ -178,7 +178,7 @@ class SyncService:
                     task_data = self._build_task_data(file_info, dataset_id)
                     new_tasks.append(task_data)
-            logger.info(f"Page {page + 1}: {len(new_tasks)} new files, {len(files_response.content) - len(new_tasks)} existing")
+            logger.debug(f"Page {page + 1}: {len(new_tasks)} new files, {len(files_response.content) - len(new_tasks)} existing")
             # Create tasks in batch
             if new_tasks:
@@ -202,16 +202,16 @@ class SyncService:
         deleted_file_ids = set(existing_dm_file_mapping.keys()) - current_file_ids
         if not deleted_file_ids:
-            logger.info("No tasks to delete")
+            logger.debug("No tasks to delete")
             return 0
         tasks_to_delete = [existing_dm_file_mapping[fid] for fid in deleted_file_ids]
-        logger.info(f"Deleting {len(tasks_to_delete)} orphaned tasks")
+        logger.debug(f"Deleting {len(tasks_to_delete)} orphaned tasks")
         delete_result = await self.ls_client.delete_tasks_batch(tasks_to_delete)
         deleted_count = delete_result.get("successful", 0)
-        logger.info(f"Successfully deleted {deleted_count} tasks")
+        logger.debug(f"Successfully deleted {deleted_count} tasks")
         return deleted_count
     async def sync_dataset_files(
@@ -229,7 +229,7 @@ class SyncService:
         Returns:
             Sync result response
         """
-        logger.info(f"Start syncing dataset files by mapping: {mapping_id}")
+        logger.debug(f"Start syncing dataset files by mapping: {mapping_id}")
         # Fetch the mapping relationship
         mapping = await self.mapping_service.get_mapping_by_uuid(mapping_id)
@@ -247,7 +247,7 @@ class SyncService:
         # Delegate the actual sync to sync_files
         result = await self.sync_files(mapping, batch_size)
-        logger.info(f"Sync completed: created={result['created']}, deleted={result['deleted']}, total={result['total']}")
+        logger.info(f"Sync files completed: created={result['created']}, deleted={result['deleted']}, total={result['total']}")
         return SyncDatasetResponse(
             id=mapping.id,
@@ -342,7 +342,7 @@ class SyncService:
         Returns:
             Sync statistics: {"created": int, "deleted": int, "total": int}
         """
-        logger.info(f"Syncing files for dataset {mapping.dataset_id} to project {mapping.labeling_project_id}")
+        logger.debug(f"Syncing files for dataset {mapping.dataset_id} to project {mapping.labeling_project_id}")
         # Fetch the DM dataset info
         dataset_info = await self.dm_client.get_dataset(mapping.dataset_id)
@@ -350,12 +350,12 @@ class SyncService:
             raise NoDatasetInfoFoundError(mapping.dataset_id)
         total_files = dataset_info.fileCount
-        logger.info(f"Total files in DM dataset: {total_files}")
+        logger.debug(f"Total files in DM dataset: {total_files}")
         # Fetch the file mappings that already exist in Label Studio
         existing_dm_file_mapping = await self.get_existing_dm_file_mapping(mapping.labeling_project_id)
         existing_file_ids = set(existing_dm_file_mapping.keys())
-        logger.info(f"{len(existing_file_ids)} tasks already exist in Label Studio")
+        logger.debug(f"{len(existing_file_ids)} tasks already exist in Label Studio")
         # Fetch DM files page by page and create new tasks
         current_file_ids, created_count = await self._fetch_dm_files_paginated(
@@ -371,7 +371,7 @@ class SyncService:
             current_file_ids
         )
-        logger.info(f"File sync completed: total={total_files}, created={created_count}, deleted={deleted_count}")
+        logger.debug(f"File sync completed: total={total_files}, created={created_count}, deleted={deleted_count}")
         return {
             "created": created_count,