feat: add labeling templates; refactor: switch to Poetry for backend Python build and deploy (#79)

* feat: Enhance annotation module with template management and validation

- Added DatasetMappingCreateRequest and DatasetMappingUpdateRequest schemas to handle dataset mapping requests with both camelCase and snake_case payloads (see the schema sketch after this list).
- Introduced Annotation Template schemas including CreateAnnotationTemplateRequest, UpdateAnnotationTemplateRequest, and AnnotationTemplateResponse for managing annotation templates.
- Implemented AnnotationTemplateService for creating, updating, retrieving, and deleting annotation templates, including validation of configurations and XML generation.
- Added utility class LabelStudioConfigValidator for validating Label Studio configurations and XML formats (see the validator sketch after this list).
- Updated database schema for annotation templates and labeling projects to include new fields and constraints.
- Seeded initial annotation templates for various use cases including image classification, object detection, and text classification.
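As a rough illustration of the dual-casing support, a minimal schema sketch assuming Pydantic v2; the field set is illustrative, not the exact model from this PR:

```python
from pydantic import BaseModel, ConfigDict
from pydantic.alias_generators import to_camel


class DatasetMappingCreateRequest(BaseModel):
    # populate_by_name lets clients send either dataset_id or datasetId.
    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)

    dataset_id: str
    labeling_project_id: str | None = None
```

With this config, `DatasetMappingCreateRequest.model_validate({"datasetId": "ds-1"})` and `model_validate({"dataset_id": "ds-1"})` produce the same model.

And a hedged sketch of the well-formedness check, using only the standard library; the real LabelStudioConfigValidator may enforce more (allowed tags, required attributes):

```python
import xml.etree.ElementTree as ET


def is_well_formed_label_config(xml_text: str) -> bool:
    """Return True if the config parses as XML with a <View> root element."""
    try:
        root = ET.fromstring(xml_text)
    except ET.ParseError:
        return False
    return root.tag == "View"
```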

* feat: Enhance TemplateForm with improved validation and dynamic field rendering; update LabelStudio config validation for camelCase support

* feat: Update docker-compose.yml to mark datamate dataset volume and network as external

* feat: Add tag configuration management and related components

- Introduced new components for tag selection and browsing in the frontend.
- Added API endpoint to fetch tag configuration from the backend.
- Implemented tag configuration management in the backend, including loading from YAML (see the loader sketch after this list).
- Enhanced template service to support dynamic tag rendering based on configuration.
- Updated validation utilities to incorporate tag configuration checks.
- Refactored existing code to utilize the new tag configuration structure.
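To make the YAML-backed flow concrete, a minimal sketch of loading the tag configuration and rendering one control tag from it; the file name, keys, and helpers are hypothetical, not the exact code from this PR:

```python
import yaml


def load_tag_config(path: str = "label_studio_tags.yaml") -> dict:
    """Load the Label Studio tag configuration from a YAML file."""
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f) or {}


def render_choices(labels: list[str], to_name: str = "image") -> str:
    """Render a <Choices> control tag for a classification template."""
    choices = "\n".join(f'  <Choice value="{label}"/>' for label in labels)
    return f'<Choices name="choice" toName="{to_name}">\n{choices}\n</Choices>'
```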

* feat: Refactor LabelStudioTagConfig for improved configuration loading and validation

* feat: Update Makefile to include backend-python-docker-build in the build process

* feat: Migrate to Poetry for better dependency management

* Add pyyaml dependency and update Dockerfile to use Poetry for dependency management

- Added pyyaml (>=6.0.3,<7.0.0) to pyproject.toml dependencies.
- Updated Dockerfile to install Poetry and manage dependencies using it.
- Improved layer caching by copying only the dependency files before the application code (see the Dockerfile sketch after this list).
- Removed unnecessary installation of build dependencies to keep the final image size small.
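For illustration, a hedged Dockerfile sketch of the layer-caching pattern these bullets describe; the base image, paths, and flags are assumptions rather than the exact file from this PR (the Python 3.12 base and skipped project install match the later commits below):

```dockerfile
FROM python:3.12-slim

# Install Poetry itself and disable in-container virtualenvs.
RUN pip install --no-cache-dir poetry
ENV POETRY_VIRTUALENVS_CREATE=false

WORKDIR /app

# Copy only the dependency manifests first, so this expensive layer is
# reused until pyproject.toml or poetry.lock actually changes.
COPY pyproject.toml poetry.lock ./
RUN poetry install --no-root --only main

# Copy the application code last; code edits no longer invalidate the
# dependency layer above.
COPY . .
```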

* feat: Remove duplicated backend-python-docker-build target from Makefile

* fix: Airflow is not ready to be added yet

* feat: update Python version to 3.12 and remove the project installation step in the Dockerfile
Author: Jason Wang
Date: 2025-11-13 15:32:30 +08:00
Committed by: GitHub
Parent: 2660845b74
Commit: 45743f39f5
40 changed files with 3223 additions and 262 deletions


@@ -122,7 +122,7 @@ class SyncService:
                 return {}
             all_tasks = result.get("tasks", [])
-            logger.info(f"Successfully fetched {len(all_tasks)} tasks")
+            logger.debug(f"Successfully fetched {len(all_tasks)} tasks")
             # Build the mapping using a dict comprehension
             dm_file_to_task_mapping = {
@@ -131,7 +131,7 @@ class SyncService:
                 if task.get('data', {}).get('file_id') is not None
             }
-            logger.info(f"Found {len(dm_file_to_task_mapping)} existing task mappings")
+            logger.debug(f"Found {len(dm_file_to_task_mapping)} existing task mappings")
             return dm_file_to_task_mapping
         except Exception as e:
@@ -163,10 +163,10 @@ class SyncService:
             )
             if not files_response or not files_response.content:
-                logger.info(f"No more files on page {page + 1}")
+                logger.debug(f"No more files on page {page + 1}")
                 break
-            logger.info(f"Processing page {page + 1}, {len(files_response.content)} files")
+            logger.debug(f"Processing page {page + 1}, {len(files_response.content)} files")
             # Filter new files and build the task payloads
             new_tasks = []
@@ -178,7 +178,7 @@ class SyncService:
                     task_data = self._build_task_data(file_info, dataset_id)
                     new_tasks.append(task_data)
-            logger.info(f"Page {page + 1}: {len(new_tasks)} new files, {len(files_response.content) - len(new_tasks)} existing")
+            logger.debug(f"Page {page + 1}: {len(new_tasks)} new files, {len(files_response.content) - len(new_tasks)} existing")
             # Create tasks in batch
             if new_tasks:
@@ -202,16 +202,16 @@ class SyncService:
         deleted_file_ids = set(existing_dm_file_mapping.keys()) - current_file_ids
         if not deleted_file_ids:
-            logger.info("No tasks to delete")
+            logger.debug("No tasks to delete")
             return 0
         tasks_to_delete = [existing_dm_file_mapping[fid] for fid in deleted_file_ids]
-        logger.info(f"Deleting {len(tasks_to_delete)} orphaned tasks")
+        logger.debug(f"Deleting {len(tasks_to_delete)} orphaned tasks")
         delete_result = await self.ls_client.delete_tasks_batch(tasks_to_delete)
         deleted_count = delete_result.get("successful", 0)
-        logger.info(f"Successfully deleted {deleted_count} tasks")
+        logger.debug(f"Successfully deleted {deleted_count} tasks")
         return deleted_count
     async def sync_dataset_files(
@@ -229,7 +229,7 @@ class SyncService:
         Returns:
             Sync result response
         """
-        logger.info(f"Start syncing dataset files by mapping: {mapping_id}")
+        logger.debug(f"Start syncing dataset files by mapping: {mapping_id}")
         # Fetch the mapping relationship
         mapping = await self.mapping_service.get_mapping_by_uuid(mapping_id)
@@ -247,7 +247,7 @@ class SyncService:
         # Delegate the actual sync to sync_files
         result = await self.sync_files(mapping, batch_size)
-        logger.info(f"Sync completed: created={result['created']}, deleted={result['deleted']}, total={result['total']}")
+        logger.info(f"Sync files completed: created={result['created']}, deleted={result['deleted']}, total={result['total']}")
         return SyncDatasetResponse(
             id=mapping.id,
@@ -342,7 +342,7 @@ class SyncService:
         Returns:
             Sync statistics: {"created": int, "deleted": int, "total": int}
         """
-        logger.info(f"Syncing files for dataset {mapping.dataset_id} to project {mapping.labeling_project_id}")
+        logger.debug(f"Syncing files for dataset {mapping.dataset_id} to project {mapping.labeling_project_id}")
         # Fetch the DM dataset info
         dataset_info = await self.dm_client.get_dataset(mapping.dataset_id)
@@ -350,12 +350,12 @@ class SyncService:
             raise NoDatasetInfoFoundError(mapping.dataset_id)
         total_files = dataset_info.fileCount
-        logger.info(f"Total files in DM dataset: {total_files}")
+        logger.debug(f"Total files in DM dataset: {total_files}")
         # Fetch the file mappings that already exist in Label Studio
         existing_dm_file_mapping = await self.get_existing_dm_file_mapping(mapping.labeling_project_id)
         existing_file_ids = set(existing_dm_file_mapping.keys())
-        logger.info(f"{len(existing_file_ids)} tasks already exist in Label Studio")
+        logger.debug(f"{len(existing_file_ids)} tasks already exist in Label Studio")
         # Fetch DM files page by page and create new tasks
         current_file_ids, created_count = await self._fetch_dm_files_paginated(
@@ -371,7 +371,7 @@ class SyncService:
             current_file_ids
         )
-        logger.info(f"File sync completed: total={total_files}, created={created_count}, deleted={deleted_count}")
+        logger.debug(f"File sync completed: total={total_files}, created={created_count}, deleted={deleted_count}")
         return {
             "created": created_count,