refactor: Reorganize datamate-python (#34)

refactor: Reorganize datamate-python (previously label-studio-adapter) into a DDD-style structure.
Jason Wang
2025-10-30 01:32:59 +08:00
committed by GitHub
parent 0614157c0b
commit 2f7341dc1f
79 changed files with 1077 additions and 1577 deletions

View File

@@ -0,0 +1,11 @@
from fastapi import APIRouter
from .annotation.interface import router as annotation_router
router = APIRouter(
prefix="/api"
)
router.include_router(annotation_router)
__all__ = ["router"]

View File

@@ -0,0 +1,3 @@
from .labelstudio import LabelStudioClient
__all__ = ["LabelStudioClient"]

View File

@@ -0,0 +1,3 @@
from .client import Client as LabelStudioClient
__all__ = ["LabelStudioClient"]

View File

@@ -0,0 +1,449 @@
import httpx
from typing import Optional, Dict, Any, List
from app.core.config import settings
from app.core.logging import get_logger
from .schema import (
LabelStudioProject,
LabelStudioCreateProjectRequest,
LabelStudioCreateTaskRequest
)
logger = get_logger(__name__)
class Client:
"""Label Studio服务客户端
使用 HTTP REST API 直接与 Label Studio 交互
认证方式:使用 Authorization: Token {token} 头部进行认证
"""
# 默认标注配置模板
DEFAULT_LABEL_CONFIGS = {
"image": """
<View>
<Image name="image" value="$image"/>
<RectangleLabels name="label" toName="image">
<Label value="Object" background="red"/>
</RectangleLabels>
</View>
""",
"text": """
<View>
<Text name="text" value="$text"/>
<Choices name="sentiment" toName="text">
<Choice value="positive"/>
<Choice value="negative"/>
<Choice value="neutral"/>
</Choices>
</View>
""",
"audio": """
<View>
<Audio name="audio" value="$audio"/>
<AudioRegionLabels name="label" toName="audio">
<Label value="Speech" background="red"/>
<Label value="Noise" background="blue"/>
</AudioRegionLabels>
</View>
""",
"video": """
<View>
<Video name="video" value="$video"/>
<VideoRegionLabels name="label" toName="video">
<Label value="Action" background="red"/>
</VideoRegionLabels>
</View>
"""
}
def __init__(
self,
base_url: Optional[str] = None,
token: Optional[str] = None,
timeout: float = 30.0
):
"""初始化 Label Studio 客户端
Args:
base_url: Label Studio 服务地址
token: API Token(使用 Authorization: Token {token} 头部)
timeout: 请求超时时间(秒)
"""
self.base_url = (base_url or settings.label_studio_base_url).rstrip("/")
self.token = token or settings.label_studio_user_token
self.timeout = timeout
if not self.token:
raise ValueError("Label Studio API token is required")
# Initialize the shared HTTP client
self.client = httpx.AsyncClient(
base_url=self.base_url,
timeout=self.timeout,
headers={
"Authorization": f"Token {self.token}",
"Content-Type": "application/json"
}
)
logger.debug(f"Label Studio client initialized: {self.base_url}")
def get_label_config_by_type(self, data_type: str) -> str:
"""根据数据类型获取标注配置"""
return self.DEFAULT_LABEL_CONFIGS.get(data_type.lower(), self.DEFAULT_LABEL_CONFIGS["image"])
async def create_project(
self,
title: str,
description: str = "",
label_config: Optional[str] = None,
data_type: str = "image"
) -> Optional[Dict[str, Any]]:
"""创建Label Studio项目"""
try:
logger.debug(f"Creating Label Studio project: {title}")
if not label_config:
label_config = self.get_label_config_by_type(data_type)
project_data = {
"title": title,
"description": description,
"label_config": label_config.strip()
}
response = await self.client.post("/api/projects", json=project_data)
response.raise_for_status()
project = response.json()
project_id = project.get("id")
if not project_id:
raise Exception("Label Studio response does not contain project ID")
logger.debug(f"Project created successfully, ID: {project_id}")
return project
except httpx.HTTPStatusError as e:
logger.error(f"Create project failed HTTP {e.response.status_code}: {e.response.text}")
return None
except Exception as e:
logger.error(f"Error while creating Label Studio project: {e}")
return None
async def import_tasks(
self,
project_id: int,
tasks: List[Dict[str, Any]],
commit_to_project: bool = True,
return_task_ids: bool = True
) -> Optional[Dict[str, Any]]:
"""批量导入任务到Label Studio项目"""
try:
logger.debug(f"Importing {len(tasks)} tasks into project {project_id}")
response = await self.client.post(
f"/api/projects/{project_id}/import",
json=tasks,
params={
"commit_to_project": str(commit_to_project).lower(),
"return_task_ids": str(return_task_ids).lower()
}
)
response.raise_for_status()
result = response.json()
task_count = result.get("task_count", len(tasks))
logger.debug(f"Tasks imported successfully: {task_count}")
return result
except httpx.HTTPStatusError as e:
logger.error(f"Import tasks failed HTTP {e.response.status_code}: {e.response.text}")
return None
except Exception as e:
logger.error(f"Error while importing tasks: {e}")
return None
async def create_tasks_batch(
self,
project_id: str,
tasks: List[Dict[str, Any]]
) -> Optional[Dict[str, Any]]:
"""批量创建任务的便利方法"""
try:
pid = int(project_id)
return await self.import_tasks(pid, tasks)
except ValueError as e:
logger.error(f"Invalid project ID format: {project_id}, error: {e}")
return None
except Exception as e:
logger.error(f"Error while creating tasks in batch: {e}")
return None
async def create_task(
self,
project_id: str,
data: Dict[str, Any],
meta: Optional[Dict[str, Any]] = None
) -> Optional[Dict[str, Any]]:
"""创建单个任务"""
try:
task = {"data": data}
if meta:
task["meta"] = meta
return await self.create_tasks_batch(project_id, [task])
except Exception as e:
logger.error(f"Error while creating single task: {e}")
return None
async def get_project_tasks(
self,
project_id: str,
page: Optional[int] = None,
page_size: int = 1000
) -> Optional[Dict[str, Any]]:
"""获取项目任务信息
Args:
project_id: 项目ID
page: 页码(从1开始)。如果为None,则获取所有任务
page_size: 每页大小
Returns:
如果指定了page参数,返回包含分页信息的字典:
{
"count": 总任务数,
"page": 当前页码,
"page_size": 每页大小,
"project_id": 项目ID,
"tasks": 当前页的任务列表
}
如果page为None,返回包含所有任务的字典:
"count": 总任务数,
"project_id": 项目ID,
"tasks": 所有任务列表
}
"""
try:
pid = int(project_id)
# If a page was requested, fetch just that page
if page is not None:
logger.debug(f"Fetching tasks for project {pid}, page {page} (page_size={page_size})")
response = await self.client.get(
f"/api/tasks",
params={
"project": pid,
"page": page,
"page_size": page_size
}
)
response.raise_for_status()
result = response.json()
# Return the single page together with pagination info
return {
"count": result.get("total", len(result.get("tasks", []))),
"page": page,
"page_size": page_size,
"project_id": pid,
"tasks": result.get("tasks", [])
}
# No page specified: fetch all tasks in a single request
logger.debug("No page specified, fetching all tasks.")
all_tasks = []
response = await self.client.get(
f"/api/tasks",
params={
"project": pid
}
)
response.raise_for_status()
result = response.json()
tasks = result.get("tasks", [])
if not tasks:
logger.debug(f"No tasks found for this project.")
all_tasks.extend(tasks)
logger.debug(f"Fetched {len(tasks)} tasks.")
# Return all tasks, without pagination info
return {
"count": len(all_tasks),
"project_id": pid,
"tasks": all_tasks
}
except httpx.HTTPStatusError as e:
logger.error(f"获取项目任务失败 HTTP {e.response.status_code}: {e.response.text}")
return None
except Exception as e:
logger.error(f"获取项目任务时发生错误: {e}")
return None
async def delete_task(
self,
task_id: int
) -> bool:
"""删除单个任务"""
try:
logger.debug(f"Deleting task: {task_id}")
response = await self.client.delete(f"/api/tasks/{task_id}")
response.raise_for_status()
logger.debug(f"Task deleted: {task_id}")
return True
except httpx.HTTPStatusError as e:
logger.error(f"Delete task {task_id} failed HTTP {e.response.status_code}: {e.response.text}")
return False
except Exception as e:
logger.error(f"Error while deleting task {task_id}: {e}")
return False
async def delete_tasks_batch(
self,
task_ids: List[int]
) -> Dict[str, int]:
"""批量删除任务"""
try:
logger.debug(f"Deleting {len(task_ids)} tasks in batch")
successful_deletions = 0
failed_deletions = 0
for task_id in task_ids:
if await self.delete_task(task_id):
successful_deletions += 1
else:
failed_deletions += 1
logger.debug(f"Batch deletion finished: success {successful_deletions}, failed {failed_deletions}")
return {
"successful": successful_deletions,
"failed": failed_deletions,
"total": len(task_ids)
}
except Exception as e:
logger.error(f"Error while deleting tasks in batch: {e}")
return {
"successful": 0,
"failed": len(task_ids),
"total": len(task_ids)
}
async def get_project(self, project_id: int) -> Optional[Dict[str, Any]]:
"""获取项目信息"""
try:
logger.debug(f"Fetching project info: {project_id}")
response = await self.client.get(f"/api/projects/{project_id}")
response.raise_for_status()
return response.json()
except httpx.HTTPStatusError as e:
logger.error(f"Get project info failed HTTP {e.response.status_code}: {e.response.text}")
return None
except Exception as e:
logger.error(f"Error while getting project info: {e}")
return None
async def delete_project(self, project_id: int) -> bool:
"""删除项目"""
try:
logger.debug(f"Deleting project: {project_id}")
response = await self.client.delete(f"/api/projects/{project_id}")
response.raise_for_status()
logger.debug(f"Project deleted: {project_id}")
return True
except httpx.HTTPStatusError as e:
logger.error(f"Delete project {project_id} failed HTTP {e.response.status_code}: {e.response.text}")
return False
except Exception as e:
logger.error(f"Error while deleting project {project_id}: {e}")
return False
async def create_local_storage(
self,
project_id: int,
path: str,
title: str,
use_blob_urls: bool = True,
regex_filter: Optional[str] = None,
description: Optional[str] = None
) -> Optional[Dict[str, Any]]:
"""创建本地文件存储配置
Args:
project_id: Label Studio 项目 ID
path: 本地文件路径(在 Label Studio 容器中的路径)
title: 存储配置标题
use_blob_urls: 是否使用 blob URLs(建议 True)
regex_filter: 文件过滤正则表达式(可选)
description: 存储描述(可选)
Returns:
创建的存储配置信息,失败返回 None
"""
try:
logger.debug(f"Creating local storage for project {project_id}: {path}")
storage_data = {
"project": project_id,
"path": path,
"title": title,
"use_blob_urls": use_blob_urls
}
if regex_filter:
storage_data["regex_filter"] = regex_filter
if description:
storage_data["description"] = description
response = await self.client.post(
"/api/storages/localfiles/",
json=storage_data
)
response.raise_for_status()
storage = response.json()
storage_id = storage.get("id")
logger.debug(f"Local storage created successfully, ID: {storage_id}")
return storage
except httpx.HTTPStatusError as e:
logger.error(f"Create local storage failed HTTP {e.response.status_code}: {e.response.text}")
return None
except Exception as e:
logger.error(f"Error while creating local storage: {e}")
return None
async def close(self):
"""关闭客户端连接"""
try:
await self.client.aclose()
logger.debug("Label Studio client closed")
except Exception as e:
logger.error(f"Error while closing Label Studio client: {e}")

View File

@@ -0,0 +1,40 @@
from pydantic import Field
from typing import Dict, Any, Optional
from datetime import datetime
from app.module.shared.schema import BaseResponseModel
class LabelStudioProject(BaseResponseModel):
"""Label Studio project model"""
id: int = Field(..., description="Project ID")
title: str = Field(..., description="Project title")
description: Optional[str] = Field(None, description="Project description")
label_config: str = Field(..., description="Labeling configuration")
created_at: Optional[datetime] = Field(None, description="Creation time")
updated_at: Optional[datetime] = Field(None, description="Last update time")
class LabelStudioTaskData(BaseResponseModel):
"""Label Studio task data model"""
image: Optional[str] = Field(None, description="Image URL")
text: Optional[str] = Field(None, description="Text content")
audio: Optional[str] = Field(None, description="Audio URL")
video: Optional[str] = Field(None, description="Video URL")
filename: Optional[str] = Field(None, description="File name")
class LabelStudioTask(BaseResponseModel):
"""Label Studio task model"""
data: LabelStudioTaskData = Field(..., description="Task data")
project: Optional[int] = Field(None, description="Project ID")
meta: Optional[Dict[str, Any]] = Field(None, description="Metadata")
class LabelStudioCreateProjectRequest(BaseResponseModel):
"""Request model for creating a Label Studio project"""
title: str = Field(..., description="Project title")
description: str = Field("", description="Project description")
label_config: str = Field(..., description="Labeling configuration")
class LabelStudioCreateTaskRequest(BaseResponseModel):
"""Request model for creating a Label Studio task"""
data: Dict[str, Any] = Field(..., description="Task data")
project: Optional[int] = Field(None, description="Project ID")
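
For illustration, constructing a task with these models (all values are made up):

task = LabelStudioTask(
    data=LabelStudioTaskData(image="/data/img001.png", filename="img001.png"),
    project=42,
)
print(task.model_dump(exclude_none=True))
# {'data': {'image': '/data/img001.png', 'filename': 'img001.png'}, 'project': 42}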

View File

@@ -0,0 +1,12 @@
from fastapi import APIRouter
from .project import router as project_router
from .task import router as task_router
router = APIRouter(
prefix="/annotation",
tags = ["annotation"]
)
router.include_router(project_router)
router.include_router(task_router)

View File

@@ -0,0 +1,353 @@
from typing import Optional
import math
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.ext.asyncio import AsyncSession
from app.db.session import get_db
from app.module.shared.schema import StandardResponse, PaginatedData
from app.module.dataset import DatasetManagementService
from app.core.logging import get_logger
from app.core.config import settings
from ..client import LabelStudioClient
from ..service.mapping import DatasetMappingService
from ..schema import (
DatasetMappingCreateRequest,
DatasetMappingCreateResponse,
DeleteDatasetResponse,
DatasetMappingResponse,
)
router = APIRouter(
prefix="/project",
tags=["annotation/project"]
)
logger = get_logger(__name__)
@router.post("/", response_model=StandardResponse[DatasetMappingCreateResponse], status_code=201)
async def create_mapping(
request: DatasetMappingCreateRequest,
db: AsyncSession = Depends(get_db)
):
"""
创建数据集映射
根据指定的DM程序中的数据集,创建Label Studio中的数据集,
在数据库中记录这一关联关系,返回Label Studio数据集的ID
注意:一个数据集可以创建多个标注项目
"""
try:
dm_client = DatasetManagementService(db)
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
token=settings.label_studio_user_token)
service = DatasetMappingService(db)
logger.info(f"Create dataset mapping request: {request.dataset_id}")
# Fetch dataset info from the DM service
dataset_info = await dm_client.get_dataset(request.dataset_id)
if not dataset_info:
raise HTTPException(
status_code=404,
detail=f"Dataset not found in DM service: {request.dataset_id}"
)
# Determine the data type from the dataset type
data_type = "image"  # default
if dataset_info.type and dataset_info.type.code:
type_code = dataset_info.type.code.lower()
if "audio" in type_code:
data_type = "audio"
elif "video" in type_code:
data_type = "video"
elif "text" in type_code:
data_type = "text"
project_name = f"{dataset_info.name}"
# 在Label Studio中创建项目
project_data = await ls_client.create_project(
title=project_name,
description=dataset_info.description or f"Imported from DM dataset {dataset_info.id}",
data_type=data_type
)
if not project_data:
raise HTTPException(
status_code=500,
detail="Fail to create Label Studio project."
)
project_id = project_data["id"]
# Configure local storage: dataset/<id>
local_storage_path = f"{settings.label_studio_local_storage_dataset_base_path}/{request.dataset_id}"
storage_result = await ls_client.create_local_storage(
project_id=project_id,
path=local_storage_path,
title="Dataset_BLOB",
use_blob_urls=True,
description=f"Local storage for dataset {dataset_info.name}"
)
if not storage_result:
# Local storage setup failed; log a warning but keep going
logger.warning(f"Failed to configure local storage for project {project_id}")
else:
logger.info(f"Local storage configured for project {project_id}: {local_storage_path}")
# Create the mapping record, including the project name
mapping = await service.create_mapping(
request,
str(project_id),
project_name
)
response_data = DatasetMappingCreateResponse(
id=mapping.id,
labeling_project_id=str(mapping.labeling_project_id),
labeling_project_name=mapping.name or project_name,
message="Dataset mapping created successfully"
)
return StandardResponse(
code=201,
message="success",
data=response_data
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error while creating dataset mapping: {e}")
raise HTTPException(status_code=500, detail="Internal server error")
@router.get("/", response_model=StandardResponse[PaginatedData[DatasetMappingResponse]])
async def list_mappings(
page: int = Query(1, ge=1, description="页码(从1开始)"),
page_size: int = Query(20, ge=1, le=100, description="每页记录数"),
db: AsyncSession = Depends(get_db)
):
"""
查询所有映射关系(分页)
返回所有有效的数据集映射关系(未被软删除的),支持分页查询
"""
try:
service = DatasetMappingService(db)
# Compute the offset
skip = (page - 1) * page_size
logger.info(f"Listing mappings, page={page}, page_size={page_size}")
# Fetch the page of data plus the total count
mappings, total = await service.get_all_mappings_with_count(
skip=skip,
limit=page_size
)
# Compute the total number of pages
total_pages = math.ceil(total / page_size) if total > 0 else 0
# Build the paginated response
paginated_data = PaginatedData(
page=page,
size=page_size,
total_elements=total,
total_pages=total_pages,
content=mappings
)
logger.info(f"Found {len(mappings)} mappings on page {page}, total: {total}")
return StandardResponse(
code=200,
message="success",
data=paginated_data
)
except Exception as e:
logger.error(f"Error listing mappings: {e}")
raise HTTPException(status_code=500, detail="Internal server error")
@router.get("/{mapping_id}", response_model=StandardResponse[DatasetMappingResponse])
async def get_mapping(
mapping_id: str,
db: AsyncSession = Depends(get_db)
):
"""
根据 UUID 查询单个映射关系
"""
try:
service = DatasetMappingService(db)
logger.info(f"Get mapping: {mapping_id}")
mapping = await service.get_mapping_by_uuid(mapping_id)
if not mapping:
raise HTTPException(
status_code=404,
detail=f"Mapping not found: {mapping_id}"
)
logger.info(f"Found mapping: {mapping.id}")
return StandardResponse(
code=200,
message="success",
data=mapping
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error getting mapping: {e}")
raise HTTPException(status_code=500, detail="Internal server error")
@router.get("/by-source/{dataset_id}", response_model=StandardResponse[PaginatedData[DatasetMappingResponse]])
async def get_mappings_by_source(
dataset_id: str,
page: int = Query(1, ge=1, description="页码(从1开始)"),
page_size: int = Query(20, ge=1, le=100, description="每页记录数"),
db: AsyncSession = Depends(get_db)
):
"""
根据源数据集 ID 查询所有映射关系(分页)
返回该数据集创建的所有标注项目(不包括已删除的),支持分页查询
"""
try:
service = DatasetMappingService(db)
# Compute the offset
skip = (page - 1) * page_size
logger.info(f"Get mappings by source dataset id: {dataset_id}, page={page}, page_size={page_size}")
# Fetch the page of data plus the total count
mappings, total = await service.get_mappings_by_source_with_count(
dataset_id=dataset_id,
skip=skip,
limit=page_size
)
# Compute the total number of pages
total_pages = math.ceil(total / page_size) if total > 0 else 0
# Build the paginated response
paginated_data = PaginatedData(
page=page,
size=page_size,
total_elements=total,
total_pages=total_pages,
content=mappings
)
logger.info(f"Found {len(mappings)} mappings on page {page}, total: {total}")
return StandardResponse(
code=200,
message="success",
data=paginated_data
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error getting mappings: {e}")
raise HTTPException(status_code=500, detail="Internal server error")
@router.delete("/", response_model=StandardResponse[DeleteDatasetResponse])
async def delete_mapping(
m: Optional[str] = Query(None, description="映射UUID"),
proj: Optional[str] = Query(None, description="Label Studio项目ID"),
db: AsyncSession = Depends(get_db)
):
"""
删除映射关系和对应的 Label Studio 项目
可以通过以下任一方式指定要删除的映射:
- m: 映射UUID
- proj: Label Studio项目ID
- 两者都提供(优先使用 m)
此操作会:
1. 删除 Label Studio 中的项目
2. 软删除数据库中的映射记录
"""
try:
# At least one parameter must be provided
if not m and not proj:
raise HTTPException(
status_code=400,
detail="Either 'm' (mapping UUID) or 'proj' (project ID) must be provided"
)
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
token=settings.label_studio_user_token)
service = DatasetMappingService(db)
# Prefer lookup by mapping UUID
if m:
logger.debug(f"Deleting by mapping UUID: {m}")
mapping = await service.get_mapping_by_uuid(m)
# Otherwise fall back to lookup by project ID
elif proj:
logger.debug(f"Deleting by project ID: {proj}")
mapping = await service.get_mapping_by_labeling_project_id(proj)
else:
mapping = None
if not mapping:
raise HTTPException(
status_code=404,
detail=f"Mapping either not found or not specified."
)
mapping_id = mapping.id
labeling_project_id = mapping.labeling_project_id
labeling_project_name = mapping.name
logger.debug(f"Found mapping: {mapping_id}, Label Studio project ID: {labeling_project_id}")
# 1. Delete the Label Studio project
try:
delete_success = await ls_client.delete_project(int(labeling_project_id))
if delete_success:
logger.debug(f"Successfully deleted Label Studio project: {labeling_project_id}")
else:
logger.warning(f"Failed to delete Label Studio project or project not found: {labeling_project_id}")
except Exception as e:
logger.error(f"Error deleting Label Studio project: {e}")
# Keep going: the mapping record should still be removed even if the Label Studio deletion failed
# 2. Soft-delete the mapping record
soft_delete_success = await service.soft_delete_mapping(mapping_id)
if not soft_delete_success:
raise HTTPException(
status_code=500,
detail="Failed to delete mapping record"
)
logger.info(f"Successfully deleted mapping: {id}, Label Studio project: {labeling_project_id}")
return StandardResponse(
code=200,
message="success",
data=DeleteDatasetResponse(
id=mapping_id,
status="success",
message=f"Successfully deleted mapping and Label Studio project '{labeling_project_name}'"
)
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error deleting mapping: {e}")
raise HTTPException(status_code=500, detail="Internal server error")
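
A sketch of exercising the create endpoint over HTTP (host, port, and dataset UUID are placeholders; datasetId is the camelCase alias that BaseResponseModel generates for dataset_id):

import asyncio
import httpx

async def demo():
    async with httpx.AsyncClient(base_url="http://localhost:8000") as http:
        resp = await http.post(
            "/api/annotation/project/",
            json={"datasetId": "<dataset-uuid>"},  # dataset_id also works (populate_by_name)
        )
        print(resp.status_code, resp.json())

asyncio.run(demo())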

View File

@@ -0,0 +1,76 @@
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.ext.asyncio import AsyncSession
from typing import List, Optional
from app.db.session import get_db
from app.module.shared.schema import StandardResponse
from app.module.dataset import DatasetManagementService
from app.core.logging import get_logger
from app.core.config import settings
from app.exception import NoDatasetInfoFoundError, DatasetMappingNotFoundError
from ..client import LabelStudioClient
from ..service.sync import SyncService
from ..service.mapping import DatasetMappingService
from ..schema import (
SyncDatasetRequest,
SyncDatasetResponse,
)
router = APIRouter(
prefix="/task",
tags=["annotation/task"]
)
logger = get_logger(__name__)
@router.post("/sync", response_model=StandardResponse[SyncDatasetResponse])
async def sync_dataset_content(
request: SyncDatasetRequest,
db: AsyncSession = Depends(get_db)
):
"""
同步数据集内容
根据指定的mapping ID,同步DM程序数据集中的内容到Label Studio数据集中,
在数据库中记录更新时间,返回更新状态
"""
try:
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
token=settings.label_studio_user_token)
dm_client = DatasetManagementService(db)
mapping_service = DatasetMappingService(db)
sync_service = SyncService(dm_client, ls_client, mapping_service)
logger.info(f"Sync dataset content request: mapping_id={request.id}")
# Validate request.id
mapping = await mapping_service.get_mapping_by_uuid(request.id)
if not mapping:
raise HTTPException(
status_code=404,
detail=f"Mapping not found: {request.id}"
)
# Run the sync (using the source dataset UUID stored in the mapping)
result = await sync_service.sync_dataset_files(request.id, request.batch_size)
logger.info(f"Sync completed: {result.synced_files}/{result.total_files} files")
return StandardResponse(
code=200,
message="success",
data=result
)
except HTTPException:
raise
except NoDatasetInfoFoundError as e:
logger.error(f"Failed to get dataset info: {e}")
raise HTTPException(status_code=404, detail=str(e))
except DatasetMappingNotFoundError as e:
logger.error(f"Mapping not found: {e}")
raise HTTPException(status_code=404, detail=str(e))
except Exception as e:
logger.error(f"Error syncing dataset content: {e}")
raise HTTPException(status_code=500, detail="Internal server error")
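
The sync endpoint can be exercised the same way (mapping UUID is a placeholder; batchSize is the camelCase alias of batch_size):

import asyncio
import httpx

async def demo_sync():
    async with httpx.AsyncClient(base_url="http://localhost:8000") as http:
        resp = await http.post(
            "/api/annotation/task/sync",
            json={"id": "<mapping-uuid>", "batchSize": 50},
        )
        print(resp.status_code, resp.json())

asyncio.run(demo_sync())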

View File

@@ -0,0 +1,24 @@
from .mapping import (
DatasetMappingBase,
DatasetMappingCreateRequest,
DatasetMappingCreateResponse,
DatasetMappingUpdateRequest,
DatasetMappingResponse,
DeleteDatasetResponse
)
from .sync import (
SyncDatasetRequest,
SyncDatasetResponse
)
__all__ = [
"DatasetMappingBase",
"DatasetMappingCreateRequest",
"DatasetMappingCreateResponse",
"DatasetMappingUpdateRequest",
"DatasetMappingResponse",
"SyncDatasetRequest",
"SyncDatasetResponse",
"DeleteDatasetResponse"
]

View File

@@ -0,0 +1,42 @@
from pydantic import Field
from typing import Optional
from datetime import datetime
from app.module.shared.schema import BaseResponseModel
class DatasetMappingBase(BaseResponseModel):
"""Dataset mapping: base model"""
dataset_id: str = Field(..., description="Source dataset ID")
class DatasetMappingCreateRequest(DatasetMappingBase):
"""Dataset mapping: create request model"""
pass
class DatasetMappingCreateResponse(BaseResponseModel):
"""Dataset mapping: create response model"""
id: str = Field(..., description="Mapping UUID")
labeling_project_id: str = Field(..., description="Label Studio project ID")
labeling_project_name: str = Field(..., description="Label Studio project name")
message: str = Field(..., description="Response message")
class DatasetMappingUpdateRequest(BaseResponseModel):
"""Dataset mapping: update request model"""
dataset_id: Optional[str] = Field(None, description="Source dataset ID")
class DatasetMappingResponse(DatasetMappingBase):
"""Dataset mapping: query response model"""
id: str = Field(..., description="Mapping UUID")
labeling_project_id: str = Field(..., description="Labeling project ID")
name: Optional[str] = Field(None, description="Labeling project name")
created_at: datetime = Field(..., description="Creation time")
deleted_at: Optional[datetime] = Field(None, description="Deletion time")
class Config:
from_attributes = True
populate_by_name = True
class DeleteDatasetResponse(BaseResponseModel):
"""Delete dataset response model"""
id: str = Field(..., description="Mapping UUID")
status: str = Field(..., description="Deletion status")
message: str = Field(..., description="Response message")

View File

@@ -0,0 +1,19 @@
from pydantic import Field
from typing import Optional
from datetime import datetime
from app.module.shared.schema import BaseResponseModel
class SyncDatasetRequest(BaseResponseModel):
"""Sync dataset request model"""
id: str = Field(..., description="Mapping ID (mapping UUID)")
batch_size: int = Field(50, ge=1, le=100, description="Batch size")
class SyncDatasetResponse(BaseResponseModel):
"""Sync dataset response model"""
id: str = Field(..., description="Mapping UUID")
status: str = Field(..., description="Sync status")
synced_files: int = Field(..., description="Number of files synced")
total_files: int = Field(0, description="Total number of files")
message: str = Field(..., description="Response message")

View File

@@ -0,0 +1,283 @@
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy import update, func
from typing import Optional, List, Tuple
from datetime import datetime
import uuid
from app.core.logging import get_logger
from app.db.models import LabelingProject
from app.module.annotation.schema import (
DatasetMappingCreateRequest,
DatasetMappingUpdateRequest,
DatasetMappingResponse
)
logger = get_logger(__name__)
class DatasetMappingService:
"""数据集映射服务"""
def __init__(self, db: AsyncSession):
self.db = db
async def create_mapping(
self,
mapping_data: DatasetMappingCreateRequest,
labeling_project_id: str,
labeling_project_name: str
) -> DatasetMappingResponse:
"""创建数据集映射"""
logger.info(f"Create dataset mapping: {mapping_data.dataset_id} -> {labeling_project_id}")
db_mapping = LabelingProject(
id=str(uuid.uuid4()),
dataset_id=mapping_data.dataset_id,
labeling_project_id=labeling_project_id,
name=labeling_project_name
)
self.db.add(db_mapping)
await self.db.commit()
await self.db.refresh(db_mapping)
logger.debug(f"Mapping created: {db_mapping.id}")
return DatasetMappingResponse.model_validate(db_mapping)
async def get_mapping_by_source_uuid(
self,
dataset_id: str
) -> Optional[DatasetMappingResponse]:
"""根据源数据集ID获取映射(返回第一个未删除的)"""
logger.debug(f"Get mapping by source dataset id: {dataset_id}")
result = await self.db.execute(
select(LabelingProject).where(
LabelingProject.dataset_id == dataset_id,
LabelingProject.deleted_at.is_(None)
)
)
# A dataset may map to several projects; take the first non-deleted row
mapping = result.scalars().first()
if mapping:
logger.debug(f"Found mapping: {mapping.id}")
return DatasetMappingResponse.model_validate(mapping)
logger.debug(f"No mapping found for source dataset id: {dataset_id}")
return None
async def get_mappings_by_dataset_id(
self,
dataset_id: str,
include_deleted: bool = False
) -> List[DatasetMappingResponse]:
"""根据源数据集ID获取所有映射关系"""
logger.debug(f"Get all mappings by source dataset id: {dataset_id}")
query = select(LabelingProject).where(
LabelingProject.dataset_id == dataset_id
)
if not include_deleted:
query = query.where(LabelingProject.deleted_at.is_(None))
result = await self.db.execute(
query.order_by(LabelingProject.created_at.desc())
)
mappings = result.scalars().all()
logger.debug(f"Found {len(mappings)} mappings")
return [DatasetMappingResponse.model_validate(mapping) for mapping in mappings]
async def get_mapping_by_labeling_project_id(
self,
labeling_project_id: str
) -> Optional[DatasetMappingResponse]:
"""根据Label Studio项目ID获取映射"""
logger.debug(f"Get mapping by Label Studio project id: {labeling_project_id}")
result = await self.db.execute(
select(LabelingProject).where(
LabelingProject.labeling_project_id == labeling_project_id,
LabelingProject.deleted_at.is_(None)
)
)
mapping = result.scalar_one_or_none()
if mapping:
logger.debug(f"Found mapping: {mapping.mapping_id}")
return DatasetMappingResponse.model_validate(mapping)
logger.debug(f"No mapping found for Label Studio project id: {labeling_project_id}")
return None
async def get_mapping_by_uuid(self, mapping_id: str) -> Optional[DatasetMappingResponse]:
"""根据映射UUID获取映射"""
logger.debug(f"Get mapping: {mapping_id}")
result = await self.db.execute(
select(LabelingProject).where(
LabelingProject.id == mapping_id,
LabelingProject.deleted_at.is_(None)
)
)
mapping = result.scalar_one_or_none()
if mapping:
logger.debug(f"Found mapping: {mapping.id}")
return DatasetMappingResponse.model_validate(mapping)
logger.debug(f"Mapping not found: {mapping_id}")
return None
async def update_mapping(
self,
mapping_id: str,
update_data: DatasetMappingUpdateRequest
) -> Optional[DatasetMappingResponse]:
"""更新映射信息"""
logger.info(f"Update mapping: {mapping_id}")
mapping = await self.get_mapping_by_uuid(mapping_id)
if not mapping:
return None
update_values = update_data.model_dump(exclude_unset=True)
update_values["last_updated_at"] = datetime.now()
result = await self.db.execute(
update(LabelingProject)
.where(LabelingProject.id == mapping_id)
.values(**update_values)
)
await self.db.commit()
if result.rowcount > 0:
return await self.get_mapping_by_uuid(mapping_id)
return None
async def soft_delete_mapping(self, mapping_id: str) -> bool:
"""软删除映射"""
logger.info(f"Soft delete mapping: {mapping_id}")
result = await self.db.execute(
update(LabelingProject)
.where(
LabelingProject.id == mapping_id,
LabelingProject.deleted_at.is_(None)
)
.values(deleted_at=datetime.now())
)
await self.db.commit()
success = result.rowcount > 0
if success:
logger.info(f"Mapping soft-deleted: {mapping_id}")
else:
logger.warning(f"Mapping not exists or already deleted: {mapping_id}")
return success
async def get_all_mappings(
self,
skip: int = 0,
limit: int = 100
) -> List[DatasetMappingResponse]:
"""获取所有有效映射"""
logger.debug(f"List all mappings, skip: {skip}, limit: {limit}")
result = await self.db.execute(
select(LabelingProject)
.where(LabelingProject.deleted_at.is_(None))
.offset(skip)
.limit(limit)
.order_by(LabelingProject.created_at.desc())
)
mappings = result.scalars().all()
logger.debug(f"Found {len(mappings)} mappings")
return [DatasetMappingResponse.model_validate(mapping) for mapping in mappings]
async def count_mappings(self, include_deleted: bool = False) -> int:
"""统计映射总数"""
query = select(func.count()).select_from(LabelingProject)
if not include_deleted:
query = query.where(LabelingProject.deleted_at.is_(None))
result = await self.db.execute(query)
return result.scalar_one()
async def get_all_mappings_with_count(
self,
skip: int = 0,
limit: int = 100,
include_deleted: bool = False
) -> Tuple[List[DatasetMappingResponse], int]:
"""获取所有映射及总数(用于分页)"""
logger.debug(f"List all mappings with count, skip: {skip}, limit: {limit}")
# 构建查询
query = select(LabelingProject)
if not include_deleted:
query = query.where(LabelingProject.deleted_at.is_(None))
# Fetch the total count
count_query = select(func.count()).select_from(LabelingProject)
if not include_deleted:
count_query = count_query.where(LabelingProject.deleted_at.is_(None))
count_result = await self.db.execute(count_query)
total = count_result.scalar_one()
# Fetch the data
result = await self.db.execute(
query
.offset(skip)
.limit(limit)
.order_by(LabelingProject.created_at.desc())
)
mappings = result.scalars().all()
logger.debug(f"Found {len(mappings)} mappings, total: {total}")
return [DatasetMappingResponse.model_validate(mapping) for mapping in mappings], total
async def get_mappings_by_source_with_count(
self,
dataset_id: str,
skip: int = 0,
limit: int = 100,
include_deleted: bool = False
) -> Tuple[List[DatasetMappingResponse], int]:
"""根据源数据集ID获取映射关系及总数(用于分页)"""
logger.debug(f"Get mappings by source dataset id with count: {dataset_id}")
# 构建查询
query = select(LabelingProject).where(
LabelingProject.dataset_id == dataset_id
)
if not include_deleted:
query = query.where(LabelingProject.deleted_at.is_(None))
# Fetch the total count
count_query = select(func.count()).select_from(LabelingProject).where(
LabelingProject.dataset_id == dataset_id
)
if not include_deleted:
count_query = count_query.where(LabelingProject.deleted_at.is_(None))
count_result = await self.db.execute(count_query)
total = count_result.scalar_one()
# Fetch the data
result = await self.db.execute(
query
.offset(skip)
.limit(limit)
.order_by(LabelingProject.created_at.desc())
)
mappings = result.scalars().all()
logger.debug(f"Found {len(mappings)} mappings, total: {total}")
return [DatasetMappingResponse.model_validate(mapping) for mapping in mappings], total
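
A sketch of driving the service directly (the async_session_maker factory name is an assumption; substitute whatever app.db.session actually exports):

from app.db.session import async_session_maker  # name assumed

async def demo():
    async with async_session_maker() as db:
        service = DatasetMappingService(db)
        mappings, total = await service.get_all_mappings_with_count(skip=0, limit=20)
        print(total, [m.labeling_project_id for m in mappings])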

View File

@@ -0,0 +1,272 @@
from typing import Optional, List, Dict, Any, Tuple
from app.module.dataset import DatasetManagementService
from app.core.logging import get_logger
from app.core.config import settings
from app.exception import NoDatasetInfoFoundError
from ..client import LabelStudioClient
from ..schema import SyncDatasetResponse
from ..service.mapping import DatasetMappingService
logger = get_logger(__name__)
class SyncService:
"""数据同步服务"""
def __init__(
self,
dm_client: DatasetManagementService,
ls_client: LabelStudioClient,
mapping_service: DatasetMappingService
):
self.dm_client = dm_client
self.ls_client = ls_client
self.mapping_service = mapping_service
def determine_data_type(self, file_type: str) -> str:
"""根据文件类型确定数据类型"""
file_type_lower = file_type.lower()
if any(ext in file_type_lower for ext in ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'svg', 'webp']):
return 'image'
elif any(ext in file_type_lower for ext in ['mp3', 'wav', 'flac', 'aac', 'ogg']):
return 'audio'
elif any(ext in file_type_lower for ext in ['mp4', 'avi', 'mov', 'wmv', 'flv', 'webm']):
return 'video'
elif any(ext in file_type_lower for ext in ['txt', 'doc', 'docx', 'pdf']):
return 'text'
else:
return 'image'  # default to image
async def get_existing_dm_file_mapping(self, project_id: str) -> Dict[str, int]:
"""
获取Label Studio项目中已存在的DM文件ID到任务ID的映射
Args:
project_id: Label Studio项目ID
Returns:
file_id到task_id的映射字典
"""
try:
# Read the page size defensively so a missing setting cannot break the log line below
page_size = getattr(settings, 'ls_task_page_size', 1000)
logger.info(f"Fetching existing task mappings for project {project_id} (page_size={page_size})")
dm_file_to_task_mapping = {}
# Fetch all tasks via the Label Studio client wrapper (page=None means fetch everything)
result = await self.ls_client.get_project_tasks(
project_id=project_id,
page=None,  # no page given: fetch all tasks
page_size=page_size
)
logger.info(f"Fetched tasks result: {result}")
if not result:
logger.warning(f"Failed to fetch tasks for project {project_id}")
return {}
logger.info(f"Successfully fetched tasks for project {project_id}")
all_tasks = result.get("tasks", [])
# Walk all tasks and build the mapping
for task in all_tasks:
try:
file_id = task.get('data', {}).get('file_id')
task_id = task.get('id')
if file_id is None:
continue  # skip tasks that were not created from a DM file
dm_file_to_task_mapping[str(file_id)] = task_id
except Exception as e:
logger.error(f"Error processing task {task.get('id')}: {e}")
continue
logger.debug(f"Existing file-to-task mapping: {dm_file_to_task_mapping}")
logger.info(f"Found {len(dm_file_to_task_mapping)} existing task mappings")
return dm_file_to_task_mapping
except Exception as e:
logger.error(f"Error while fetching existing tasks: {e}")
return {}  # on error, return an empty dict, which makes the sync treat every file as new
async def sync_dataset_files(
self,
id: str,
batch_size: int = 50
) -> SyncDatasetResponse:
"""同步数据集文件到Label Studio"""
logger.info(f"Start syncing dataset by mapping: {id}")
# Look up the mapping
mapping = await self.mapping_service.get_mapping_by_uuid(id)
if not mapping:
logger.error(f"Dataset mapping not found: {id}")
return SyncDatasetResponse(
id="",
status="error",
synced_files=0,
total_files=0,
message=f"Dataset mapping not found: {id}"
)
try:
# Fetch dataset info
dataset_info = await self.dm_client.get_dataset(mapping.dataset_id)
if not dataset_info:
raise NoDatasetInfoFoundError(mapping.dataset_id)
synced_files = 0
deleted_tasks = 0
total_files = dataset_info.fileCount
page = 0
logger.info(f"Total files in dataset: {total_files}")
# Map DM file IDs to the tasks that already exist in Label Studio
existing_dm_file_mapping = await self.get_existing_dm_file_mapping(mapping.labeling_project_id)
existing_file_ids = set(existing_dm_file_mapping.keys())
logger.info(f"{len(existing_file_ids)} tasks already exist in Label Studio")
# Collect the IDs of all files currently present in DM
current_file_ids = set()
while True:
files_response = await self.dm_client.get_dataset_files(
mapping.dataset_id,
page=page,
size=batch_size,
)
if not files_response or not files_response.content:
logger.info(f"No more files on page {page + 1}")
break
logger.info(f"Processing page {page + 1}, total {len(files_response.content)} files")
# Pick out new files and create their tasks in batch
tasks = []
new_files_count = 0
existing_files_count = 0
for file_info in files_response.content:
# Record this file as currently present in DM
current_file_ids.add(str(file_info.id))
# Skip files that already have a task
if str(file_info.id) in existing_file_ids:
existing_files_count += 1
logger.debug(f"Skip existing file: {file_info.originalName} (ID: {file_info.id})")
continue
new_files_count += 1
data_type = self.determine_data_type(file_info.fileType)
# Swap the file path prefix: only the leading prefix is replaced, so the same string in the middle of a path is untouched
file_path = file_info.filePath.removeprefix(settings.dm_file_path_prefix)
file_path = settings.label_studio_file_path_prefix + file_path
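# e.g. with dm_file_path_prefix="/dm-files" and label_studio_file_path_prefix="/data/local-files/?d=",
# "/dm-files/dataset/42/dog.png" becomes "/data/local-files/?d=/dataset/42/dog.png"
# (the prefix values here are hypothetical; the real ones come from settings)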
# Build the task payload
task_data = {
"data": {
f"{data_type}": file_path,
"file_path": file_info.filePath,
"file_id": file_info.id,
"original_name": file_info.originalName,
"dataset_id": mapping.dataset_id,
}
}
tasks.append(task_data)
logger.info(f"Page {page + 1}: new files {new_files_count}, existing files {existing_files_count}")
# Create the Label Studio tasks in batch
if tasks:
batch_result = await self.ls_client.create_tasks_batch(
mapping.labeling_project_id,
tasks
)
if batch_result:
synced_files += len(tasks)
logger.info(f"Successfully synced {len(tasks)} files")
else:
logger.warning(f"Batch task creation failed, fallback to single creation")
# 如果批量创建失败,尝试单个创建
for task_data in tasks:
task_result = await self.ls_client.create_task(
mapping.labeling_project_id,
task_data["data"],
task_data.get("meta")
)
if task_result:
synced_files += 1
# Stop once the last page has been processed
if page >= files_response.totalPages - 1:
break
page += 1
# Clean up tasks that exist in Label Studio but no longer exist in DM
tasks_to_delete = []
for file_id, task_id in existing_dm_file_mapping.items():
if file_id not in current_file_ids:
tasks_to_delete.append(task_id)
logger.debug(f"Mark task for deletion: {task_id} (DM file ID: {file_id})")
if tasks_to_delete:
logger.info(f"Deleting {len(tasks_to_delete)} tasks not present in DM")
delete_result = await self.ls_client.delete_tasks_batch(tasks_to_delete)
deleted_tasks = delete_result.get("successful", 0)
logger.info(f"Successfully deleted {deleted_tasks} tasks")
else:
logger.info("No tasks to delete")
logger.info(f"Sync completed: total_files={total_files}, created={synced_files}, deleted={deleted_tasks}")
return SyncDatasetResponse(
id=mapping.id,
status="success",
synced_files=synced_files,
total_files=total_files,
message=f"Sync completed: created {synced_files} files, deleted {deleted_tasks} tasks"
)
except Exception as e:
logger.error(f"Error while syncing dataset: {e}")
return SyncDatasetResponse(
id=mapping.id,
status="error",
synced_files=0,
total_files=0,
message=f"Sync failed: {str(e)}"
)
async def get_sync_status(
self,
dataset_id: str
) -> Optional[Dict[str, Any]]:
"""获取同步状态"""
mapping = await self.mapping_service.get_mapping_by_source_uuid(dataset_id)
if not mapping:
return None
# Fetch DM dataset info
dataset_info = await self.dm_client.get_dataset(dataset_id)
# Fetch the Label Studio project task count
tasks_info = await self.ls_client.get_project_tasks(mapping.labeling_project_id)
return {
"id": mapping.id,
"dataset_id": dataset_id,
"labeling_project_id": mapping.labeling_project_id,
"dm_total_files": dataset_info.fileCount if dataset_info else 0,
"ls_total_tasks": tasks_info.get("count", 0) if tasks_info else 0,
"sync_ratio": (
tasks_info.get("count", 0) / dataset_info.fileCount
if dataset_info and dataset_info.fileCount > 0 and tasks_info else 0
)
}
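
determine_data_type never touches instance state, so its fallback behaviour can be checked with stubbed dependencies (a sketch only; passing None works solely because the method ignores them):

svc = SyncService(dm_client=None, ls_client=None, mapping_service=None)  # type: ignore[arg-type]
assert svc.determine_data_type("JPEG") == "image"
assert svc.determine_data_type("flac") == "audio"
assert svc.determine_data_type("md") == "image"  # unknown types fall back to "image"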

View File

@@ -0,0 +1,3 @@
from .service import DatasetManagementService
__all__ = ["DatasetManagementService"]

View File

@@ -0,0 +1,16 @@
from .dataset_file import (
DatasetFileResponse,
PagedDatasetFileResponse,
)
from .dataset import (
DatasetResponse,
DatasetTypeResponse,
)
__all__ = [
"DatasetResponse",
"DatasetFileResponse",
"PagedDatasetFileResponse",
"DatasetTypeResponse",
]

View File

@@ -0,0 +1,36 @@
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any
from datetime import datetime
class DatasetTypeResponse(BaseModel):
"""Dataset type response model"""
code: str = Field(..., description="Type code")
name: str = Field(..., description="Type name")
description: Optional[str] = Field(None, description="Type description")
supportedFormats: List[str] = Field(default_factory=list, description="Supported file formats")
icon: Optional[str] = Field(None, description="Icon")
class DatasetResponse(BaseModel):
"""DM service dataset response model"""
id: str = Field(..., description="Dataset ID")
name: str = Field(..., description="Dataset name")
description: Optional[str] = Field(None, description="Dataset description")
datasetType: str = Field(..., description="Dataset type", alias="datasetType")
status: str = Field(..., description="Dataset status")
fileCount: int = Field(..., description="File count")
totalSize: int = Field(..., description="Total size in bytes")
createdAt: Optional[datetime] = Field(None, description="Creation time")
updatedAt: Optional[datetime] = Field(None, description="Last update time")
createdBy: Optional[str] = Field(None, description="Creator")
# Backwards-compatibility property that returns the type as an object
@property
def type(self) -> DatasetTypeResponse:
"""Compatibility property: return the type as an object"""
return DatasetTypeResponse(
code=self.datasetType,
name=self.datasetType,
description=None,
supportedFormats=[],
icon=None
)
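
The compatibility property keeps older call sites such as dataset_info.type.code working (see the project router above); a quick illustration with made-up values:

ds = DatasetResponse(id="1", name="demo", datasetType="IMAGE", status="READY", fileCount=0, totalSize=0)
print(ds.type.code)  # "IMAGE"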

View File

@@ -0,0 +1,26 @@
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any
from datetime import datetime
class DatasetFileResponse(BaseModel):
"""DM service dataset file response model"""
id: str = Field(..., description="File ID")
fileName: str = Field(..., description="File name")
fileType: str = Field(..., description="File type")
filePath: str = Field(..., description="File path")
originalName: Optional[str] = Field(None, description="Original file name")
size: Optional[int] = Field(None, description="File size in bytes")
status: Optional[str] = Field(None, description="File status")
uploadedAt: Optional[datetime] = Field(None, description="Upload time")
description: Optional[str] = Field(None, description="File description")
uploadedBy: Optional[str] = Field(None, description="Uploader")
lastAccessTime: Optional[datetime] = Field(None, description="Last access time")
class PagedDatasetFileResponse(BaseModel):
"""DM service paged file response model"""
content: List[DatasetFileResponse] = Field(..., description="File list")
totalElements: int = Field(..., description="Total number of elements")
totalPages: int = Field(..., description="Total number of pages")
page: int = Field(..., description="Current page number")
size: int = Field(..., description="Page size")

View File

@@ -0,0 +1,3 @@
from .service import Service as DatasetManagementService
__all__ = ["DatasetManagementService"]

View File

@@ -0,0 +1,160 @@
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy import func
from typing import Optional
from app.core.config import settings
from app.core.logging import get_logger
from app.db.models import Dataset, DatasetFiles
from ..schema import DatasetResponse, PagedDatasetFileResponse, DatasetFileResponse
logger = get_logger(__name__)
class Service:
"""数据管理服务客户端 - 直接访问数据库"""
def __init__(self, db: AsyncSession):
"""
初始化 DM 客户端
Args:
db: 数据库会话
"""
self.db = db
logger.info("Initialize DM service client (Database mode)")
async def get_dataset(self, dataset_id: str) -> Optional[DatasetResponse]:
"""获取数据集详情"""
try:
logger.info(f"Getting dataset detail: {dataset_id} ...")
result = await self.db.execute(
select(Dataset).where(Dataset.id == dataset_id)
)
dataset = result.scalar_one_or_none()
if not dataset:
logger.error(f"Dataset not found: {dataset_id}")
return None
# Convert the DB model into the response model
# type: ignore suppresses SQLAlchemy-related type-checker complaints
return DatasetResponse(
id=dataset.id, # type: ignore
name=dataset.name, # type: ignore
description=dataset.description or "", # type: ignore
datasetType=dataset.dataset_type, # type: ignore
status=dataset.status, # type: ignore
fileCount=dataset.file_count or 0, # type: ignore
totalSize=dataset.size_bytes or 0, # type: ignore
createdAt=dataset.created_at, # type: ignore
updatedAt=dataset.updated_at, # type: ignore
createdBy=dataset.created_by # type: ignore
)
except Exception as e:
logger.error(f"Failed to get dataset {dataset_id}: {e}")
return None
async def get_dataset_files(
self,
dataset_id: str,
page: int = 0,
size: int = 100,
file_type: Optional[str] = None,
status: Optional[str] = None
) -> Optional[PagedDatasetFileResponse]:
"""获取数据集文件列表"""
try:
logger.info(f"Get dataset files: dataset={dataset_id}, page={page}, size={size}")
# Build the query
query = select(DatasetFiles).where(DatasetFiles.dataset_id == dataset_id)
# Apply optional filters
if file_type:
query = query.where(DatasetFiles.file_type == file_type)
if status:
query = query.where(DatasetFiles.status == status)
# Fetch the total count
count_query = select(func.count()).select_from(DatasetFiles).where(
DatasetFiles.dataset_id == dataset_id
)
if file_type:
count_query = count_query.where(DatasetFiles.file_type == file_type)
if status:
count_query = count_query.where(DatasetFiles.status == status)
count_result = await self.db.execute(count_query)
total = count_result.scalar_one()
# Paginated query
query = query.offset(page * size).limit(size).order_by(DatasetFiles.created_at.desc())
result = await self.db.execute(query)
files = result.scalars().all()
# Convert to response models
# type: ignore suppresses SQLAlchemy-related type-checker complaints
content = [
DatasetFileResponse(
id=f.id, # type: ignore
fileName=f.file_name, # type: ignore
fileType=f.file_type or "", # type: ignore
filePath=f.file_path, # type: ignore
originalName=f.file_name, # type: ignore
size=f.file_size, # type: ignore
status=f.status, # type: ignore
uploadedAt=f.upload_time, # type: ignore
description=None,
uploadedBy=None,
lastAccessTime=f.last_access_time # type: ignore
)
for f in files
]
total_pages = (total + size - 1) // size if size > 0 else 0
return PagedDatasetFileResponse(
content=content,
totalElements=total,
totalPages=total_pages,
page=page,
size=size
)
except Exception as e:
logger.error(f"Failed to get dataset files for {dataset_id}: {e}")
return None
async def download_file(self, dataset_id: str, file_id: str) -> Optional[bytes]:
"""
下载文件内容
注意:此方法保留接口兼容性,但实际文件下载可能需要通过文件系统或对象存储
"""
logger.warning(f"download_file is deprecated when using database mode. Use get_file_download_url instead.")
return None
async def get_file_download_url(self, dataset_id: str, file_id: str) -> Optional[str]:
"""获取文件下载URL(或文件路径)"""
try:
result = await self.db.execute(
select(DatasetFiles).where(
DatasetFiles.id == file_id,
DatasetFiles.dataset_id == dataset_id
)
)
file = result.scalar_one_or_none()
if not file:
logger.error(f"File not found: {file_id} in dataset {dataset_id}")
return None
# Return the file path (a local path or an object-storage URL)
return file.file_path # type: ignore
except Exception as e:
logger.error(f"Failed to get file path for {file_id}: {e}")
return None
async def close(self):
"""关闭客户端连接(数据库模式下无需操作)"""
logger.info("DM service client closed (Database mode)")

View File

@@ -0,0 +1,33 @@
from fastapi import APIRouter
from typing import Dict, Any
from app.core.config import settings
from app.schemas import StandardResponse
router = APIRouter()
@router.get("/health", response_model=StandardResponse[Dict[str, Any]])
async def health_check():
"""健康检查端点"""
return StandardResponse(
code=200,
message="success",
data={
"status": "healthy",
"service": "Label Studio Adapter",
"version": settings.app_version
}
)
@router.get("/config", response_model=StandardResponse[Dict[str, Any]])
async def get_config():
"""获取配置信息"""
return StandardResponse(
code=200,
message="success",
data={
"app_name": settings.app_name,
"version": settings.app_version,
"label_studio_url": settings.label_studio_base_url,
"debug": settings.debug
}
)

View File

@@ -0,0 +1,11 @@
from .common import (
BaseResponseModel,
StandardResponse,
PaginatedData
)
__all__ = [
"BaseResponseModel",
"StandardResponse",
"PaginatedData"
]

View File

@@ -0,0 +1,62 @@
"""
通用响应模型
"""
from typing import Generic, TypeVar, Optional, List
from pydantic import BaseModel, Field
# Generic type variable
T = TypeVar('T')
# Helper to convert snake_case to camelCase
def to_camel(string: str) -> str:
"""将 snake_case 字符串转换为 camelCase"""
components = string.split('_')
# Keep the first word lowercase, capitalize each following word
return components[0] + ''.join(x.title() for x in components[1:])
class BaseResponseModel(BaseModel):
"""基础响应模型,启用别名生成器"""
class Config:
populate_by_name = True
alias_generator = to_camel
class StandardResponse(BaseResponseModel, Generic[T]):
"""
标准API响应格式
所有API端点应返回此格式,确保响应的一致性
"""
code: int = Field(..., description="HTTP状态码")
message: str = Field(..., description="响应消息")
data: Optional[T] = Field(None, description="响应数据")
class Config:
populate_by_name = True
alias_generator = to_camel
json_schema_extra = {
"example": {
"code": 200,
"message": "success",
"data": {}
}
}
class PaginatedData(BaseResponseModel, Generic[T]):
"""分页数据容器"""
page: int = Field(..., description="当前页码(从1开始)")
size: int = Field(..., description="页大小")
total_elements: int = Field(..., description="总条数")
total_pages: int = Field(..., description="总页数")
content: List[T] = Field(..., description="当前页数据")
class Config:
json_schema_extra = {
"example": {
"page": 1,
"size": 20,
"totalElements": 100,
"totalPages": 5,
"content": []
}
}
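
A quick illustration of the camelCase aliasing these models provide (values are made up):

data = PaginatedData[int](page=1, size=2, total_elements=10, total_pages=5, content=[1, 2])
print(data.model_dump(by_alias=True))
# {'page': 1, 'size': 2, 'totalElements': 10, 'totalPages': 5, 'content': [1, 2]}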