refactor: Reorganize datamate-python (#34)

refactor: Reorganize datamate-python (previously label-studio-adapter) into a DDD-style structure.
Jason Wang
2025-10-30 01:32:59 +08:00
committed by GitHub
parent 0614157c0b
commit 2f7341dc1f
79 changed files with 1077 additions and 1577 deletions

View File

@@ -0,0 +1,11 @@
from fastapi import APIRouter
from .annotation.interface import router as annotation_router
router = APIRouter(
prefix="/api"
)
router.include_router(annotation_router)
__all__ = ["router"]

View File

@@ -0,0 +1,3 @@
from .labelstudio import LabelStudioClient
__all__ = ["LabelStudioClient"]

View File

@@ -0,0 +1,3 @@
from .client import Client as LabelStudioClient
__all__ = ["LabelStudioClient"]

View File

@@ -0,0 +1,449 @@
import httpx
from typing import Optional, Dict, Any, List
from app.core.config import settings
from app.core.logging import get_logger
from .schema import (
LabelStudioProject,
LabelStudioCreateProjectRequest,
LabelStudioCreateTaskRequest
)
logger = get_logger(__name__)
class Client:
"""Label Studio服务客户端
使用 HTTP REST API 直接与 Label Studio 交互
认证方式:使用 Authorization: Token {token} 头部进行认证
"""
# 默认标注配置模板
DEFAULT_LABEL_CONFIGS = {
"image": """
<View>
<Image name="image" value="$image"/>
<RectangleLabels name="label" toName="image">
<Label value="Object" background="red"/>
</RectangleLabels>
</View>
""",
"text": """
<View>
<Text name="text" value="$text"/>
<Choices name="sentiment" toName="text">
<Choice value="positive"/>
<Choice value="negative"/>
<Choice value="neutral"/>
</Choices>
</View>
""",
"audio": """
<View>
<Audio name="audio" value="$audio"/>
<AudioRegionLabels name="label" toName="audio">
<Label value="Speech" background="red"/>
<Label value="Noise" background="blue"/>
</AudioRegionLabels>
</View>
""",
"video": """
<View>
<Video name="video" value="$video"/>
<VideoRegionLabels name="label" toName="video">
<Label value="Action" background="red"/>
</VideoRegionLabels>
</View>
"""
}
def __init__(
self,
base_url: Optional[str] = None,
token: Optional[str] = None,
timeout: float = 30.0
):
"""初始化 Label Studio 客户端
Args:
base_url: Label Studio 服务地址
token: API Token(使用 Authorization: Token {token} 头部)
timeout: 请求超时时间(秒)
"""
self.base_url = (base_url or settings.label_studio_base_url).rstrip("/")
self.token = token or settings.label_studio_user_token
self.timeout = timeout
if not self.token:
raise ValueError("Label Studio API token is required")
# Initialize the shared HTTP client
self.client = httpx.AsyncClient(
base_url=self.base_url,
timeout=self.timeout,
headers={
"Authorization": f"Token {self.token}",
"Content-Type": "application/json"
}
)
logger.debug(f"Label Studio client initialized: {self.base_url}")
def get_label_config_by_type(self, data_type: str) -> str:
"""根据数据类型获取标注配置"""
return self.DEFAULT_LABEL_CONFIGS.get(data_type.lower(), self.DEFAULT_LABEL_CONFIGS["image"])
async def create_project(
self,
title: str,
description: str = "",
label_config: Optional[str] = None,
data_type: str = "image"
) -> Optional[Dict[str, Any]]:
"""创建Label Studio项目"""
try:
logger.debug(f"Creating Label Studio project: {title}")
if not label_config:
label_config = self.get_label_config_by_type(data_type)
project_data = {
"title": title,
"description": description,
"label_config": label_config.strip()
}
response = await self.client.post("/api/projects", json=project_data)
response.raise_for_status()
project = response.json()
project_id = project.get("id")
if not project_id:
raise Exception("Label Studio response does not contain project ID")
logger.debug(f"Project created successfully, ID: {project_id}")
return project
except httpx.HTTPStatusError as e:
logger.error(f"Create project failed HTTP {e.response.status_code}: {e.response.text}")
return None
except Exception as e:
logger.error(f"Error while creating Label Studio project: {e}")
return None
async def import_tasks(
self,
project_id: int,
tasks: List[Dict[str, Any]],
commit_to_project: bool = True,
return_task_ids: bool = True
) -> Optional[Dict[str, Any]]:
"""批量导入任务到Label Studio项目"""
try:
logger.debug(f"Importing {len(tasks)} tasks into project {project_id}")
response = await self.client.post(
f"/api/projects/{project_id}/import",
json=tasks,
params={
"commit_to_project": str(commit_to_project).lower(),
"return_task_ids": str(return_task_ids).lower()
}
)
response.raise_for_status()
result = response.json()
task_count = result.get("task_count", len(tasks))
logger.debug(f"Tasks imported successfully: {task_count}")
return result
except httpx.HTTPStatusError as e:
logger.error(f"Import tasks failed HTTP {e.response.status_code}: {e.response.text}")
return None
except Exception as e:
logger.error(f"Error while importing tasks: {e}")
return None
async def create_tasks_batch(
self,
project_id: str,
tasks: List[Dict[str, Any]]
) -> Optional[Dict[str, Any]]:
"""批量创建任务的便利方法"""
try:
pid = int(project_id)
return await self.import_tasks(pid, tasks)
except ValueError as e:
logger.error(f"Invalid project ID format: {project_id}, error: {e}")
return None
except Exception as e:
logger.error(f"Error while creating tasks in batch: {e}")
return None
async def create_task(
self,
project_id: str,
data: Dict[str, Any],
meta: Optional[Dict[str, Any]] = None
) -> Optional[Dict[str, Any]]:
"""创建单个任务"""
try:
task = {"data": data}
if meta:
task["meta"] = meta
return await self.create_tasks_batch(project_id, [task])
except Exception as e:
logger.error(f"Error while creating single task: {e}")
return None
async def get_project_tasks(
self,
project_id: str,
page: Optional[int] = None,
page_size: int = 1000
) -> Optional[Dict[str, Any]]:
"""获取项目任务信息
Args:
project_id: 项目ID
page: 页码(从1开始)。如果为None,则获取所有任务
page_size: 每页大小
Returns:
如果指定了page参数,返回包含分页信息的字典:
{
"count": 总任务数,
"page": 当前页码,
"page_size": 每页大小,
"project_id": 项目ID,
"tasks": 当前页的任务列表
}
如果page为None,返回包含所有任务的字典:
"count": 总任务数,
"project_id": 项目ID,
"tasks": 所有任务列表
}
"""
try:
pid = int(project_id)
# If a page was requested, fetch just that page
if page is not None:
logger.debug(f"Fetching tasks for project {pid}, page {page} (page_size={page_size})")
response = await self.client.get(
f"/api/tasks",
params={
"project": pid,
"page": page,
"page_size": page_size
}
)
response.raise_for_status()
result = response.json()
# Return the single page together with pagination info
return {
"count": result.get("total", len(result.get("tasks", []))),
"page": page,
"page_size": page_size,
"project_id": pid,
"tasks": result.get("tasks", [])
}
# No page specified: fetch all tasks in a single request
logger.debug("No page specified, fetching all tasks.")
all_tasks = []
response = await self.client.get(
f"/api/tasks",
params={
"project": pid
}
)
response.raise_for_status()
result = response.json()
tasks = result.get("tasks", [])
if not tasks:
logger.debug(f"No tasks found for this project.")
all_tasks.extend(tasks)
logger.debug(f"Fetched {len(tasks)} tasks.")
# Return all tasks, without pagination info
return {
"count": len(all_tasks),
"project_id": pid,
"tasks": all_tasks
}
except httpx.HTTPStatusError as e:
logger.error(f"获取项目任务失败 HTTP {e.response.status_code}: {e.response.text}")
return None
except Exception as e:
logger.error(f"获取项目任务时发生错误: {e}")
return None
async def delete_task(
self,
task_id: int
) -> bool:
"""删除单个任务"""
try:
logger.debug(f"Deleting task: {task_id}")
response = await self.client.delete(f"/api/tasks/{task_id}")
response.raise_for_status()
logger.debug(f"Task deleted: {task_id}")
return True
except httpx.HTTPStatusError as e:
logger.error(f"Delete task {task_id} failed HTTP {e.response.status_code}: {e.response.text}")
return False
except Exception as e:
logger.error(f"Error while deleting task {task_id}: {e}")
return False
async def delete_tasks_batch(
self,
task_ids: List[int]
) -> Dict[str, int]:
"""批量删除任务"""
try:
logger.debug(f"Deleting {len(task_ids)} tasks in batch")
successful_deletions = 0
failed_deletions = 0
for task_id in task_ids:
if await self.delete_task(task_id):
successful_deletions += 1
else:
failed_deletions += 1
logger.debug(f"Batch deletion finished: success {successful_deletions}, failed {failed_deletions}")
return {
"successful": successful_deletions,
"failed": failed_deletions,
"total": len(task_ids)
}
except Exception as e:
logger.error(f"Error while deleting tasks in batch: {e}")
return {
"successful": 0,
"failed": len(task_ids),
"total": len(task_ids)
}
async def get_project(self, project_id: int) -> Optional[Dict[str, Any]]:
"""获取项目信息"""
try:
logger.debug(f"Fetching project info: {project_id}")
response = await self.client.get(f"/api/projects/{project_id}")
response.raise_for_status()
return response.json()
except httpx.HTTPStatusError as e:
logger.error(f"Get project info failed HTTP {e.response.status_code}: {e.response.text}")
return None
except Exception as e:
logger.error(f"Error while getting project info: {e}")
return None
async def delete_project(self, project_id: int) -> bool:
"""删除项目"""
try:
logger.debug(f"Deleting project: {project_id}")
response = await self.client.delete(f"/api/projects/{project_id}")
response.raise_for_status()
logger.debug(f"Project deleted: {project_id}")
return True
except httpx.HTTPStatusError as e:
logger.error(f"Delete project {project_id} failed HTTP {e.response.status_code}: {e.response.text}")
return False
except Exception as e:
logger.error(f"Error while deleting project {project_id}: {e}")
return False
async def create_local_storage(
self,
project_id: int,
path: str,
title: str,
use_blob_urls: bool = True,
regex_filter: Optional[str] = None,
description: Optional[str] = None
) -> Optional[Dict[str, Any]]:
"""创建本地文件存储配置
Args:
project_id: Label Studio 项目 ID
path: 本地文件路径(在 Label Studio 容器中的路径)
title: 存储配置标题
use_blob_urls: 是否使用 blob URLs(建议 True)
regex_filter: 文件过滤正则表达式(可选)
description: 存储描述(可选)
Returns:
创建的存储配置信息,失败返回 None
"""
try:
logger.debug(f"Creating local storage for project {project_id}: {path}")
storage_data = {
"project": project_id,
"path": path,
"title": title,
"use_blob_urls": use_blob_urls
}
if regex_filter:
storage_data["regex_filter"] = regex_filter
if description:
storage_data["description"] = description
response = await self.client.post(
"/api/storages/localfiles/",
json=storage_data
)
response.raise_for_status()
storage = response.json()
storage_id = storage.get("id")
logger.debug(f"Local storage created successfully, ID: {storage_id}")
return storage
except httpx.HTTPStatusError as e:
logger.error(f"Create local storage failed HTTP {e.response.status_code}: {e.response.text}")
return None
except Exception as e:
logger.error(f"Error while creating local storage: {e}")
return None
async def close(self):
"""关闭客户端连接"""
try:
await self.client.aclose()
logger.debug("Label Studio client closed")
except Exception as e:
logger.error(f"Error while closing Label Studio client: {e}")

View File

@@ -0,0 +1,40 @@
from pydantic import Field
from typing import Dict, Any, Optional
from datetime import datetime
from app.module.shared.schema import BaseResponseModel
class LabelStudioProject(BaseResponseModel):
"""Label Studio project model"""
id: int = Field(..., description="Project ID")
title: str = Field(..., description="Project title")
description: Optional[str] = Field(None, description="Project description")
label_config: str = Field(..., description="Labeling configuration")
created_at: Optional[datetime] = Field(None, description="Creation time")
updated_at: Optional[datetime] = Field(None, description="Last update time")
class LabelStudioTaskData(BaseResponseModel):
"""Label Studio task data model"""
image: Optional[str] = Field(None, description="Image URL")
text: Optional[str] = Field(None, description="Text content")
audio: Optional[str] = Field(None, description="Audio URL")
video: Optional[str] = Field(None, description="Video URL")
filename: Optional[str] = Field(None, description="File name")
class LabelStudioTask(BaseResponseModel):
"""Label Studio task model"""
data: LabelStudioTaskData = Field(..., description="Task data")
project: Optional[int] = Field(None, description="Project ID")
meta: Optional[Dict[str, Any]] = Field(None, description="Metadata")
class LabelStudioCreateProjectRequest(BaseResponseModel):
"""Request model for creating a Label Studio project"""
title: str = Field(..., description="Project title")
description: str = Field("", description="Project description")
label_config: str = Field(..., description="Labeling configuration")
class LabelStudioCreateTaskRequest(BaseResponseModel):
"""Request model for creating a Label Studio task"""
data: Dict[str, Any] = Field(..., description="Task data")
project: Optional[int] = Field(None, description="Project ID")
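
For illustration, constructing a task with these models (all values are made up):

task = LabelStudioTask(
    data=LabelStudioTaskData(image="/data/img001.png", filename="img001.png"),
    project=42,
)
print(task.model_dump(exclude_none=True))
# {'data': {'image': '/data/img001.png', 'filename': 'img001.png'}, 'project': 42}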

View File

@@ -0,0 +1,12 @@
from fastapi import APIRouter
from .project import router as project_router
from .task import router as task_router
router = APIRouter(
prefix="/annotation",
tags = ["annotation"]
)
router.include_router(project_router)
router.include_router(task_router)

View File

@@ -0,0 +1,353 @@
from typing import Optional
import math
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.ext.asyncio import AsyncSession
from app.db.session import get_db
from app.module.shared.schema import StandardResponse, PaginatedData
from app.module.dataset import DatasetManagementService
from app.core.logging import get_logger
from app.core.config import settings
from ..client import LabelStudioClient
from ..service.mapping import DatasetMappingService
from ..schema import (
DatasetMappingCreateRequest,
DatasetMappingCreateResponse,
DeleteDatasetResponse,
DatasetMappingResponse,
)
router = APIRouter(
prefix="/project",
tags=["annotation/project"]
)
logger = get_logger(__name__)
@router.post("/", response_model=StandardResponse[DatasetMappingCreateResponse], status_code=201)
async def create_mapping(
request: DatasetMappingCreateRequest,
db: AsyncSession = Depends(get_db)
):
"""
创建数据集映射
根据指定的DM程序中的数据集,创建Label Studio中的数据集,
在数据库中记录这一关联关系,返回Label Studio数据集的ID
注意:一个数据集可以创建多个标注项目
"""
try:
dm_client = DatasetManagementService(db)
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
token=settings.label_studio_user_token)
service = DatasetMappingService(db)
logger.info(f"Create dataset mapping request: {request.dataset_id}")
# Fetch dataset info from the DM service
dataset_info = await dm_client.get_dataset(request.dataset_id)
if not dataset_info:
raise HTTPException(
status_code=404,
detail=f"Dataset not found in DM service: {request.dataset_id}"
)
# Determine the data type from the dataset type
data_type = "image"  # default
if dataset_info.type and dataset_info.type.code:
type_code = dataset_info.type.code.lower()
if "audio" in type_code:
data_type = "audio"
elif "video" in type_code:
data_type = "video"
elif "text" in type_code:
data_type = "text"
project_name = f"{dataset_info.name}"
# 在Label Studio中创建项目
project_data = await ls_client.create_project(
title=project_name,
description=dataset_info.description or f"Imported from DM dataset {dataset_info.id}",
data_type=data_type
)
if not project_data:
raise HTTPException(
status_code=500,
detail="Fail to create Label Studio project."
)
project_id = project_data["id"]
# Configure local storage: dataset/<id>
local_storage_path = f"{settings.label_studio_local_storage_dataset_base_path}/{request.dataset_id}"
storage_result = await ls_client.create_local_storage(
project_id=project_id,
path=local_storage_path,
title="Dataset_BLOB",
use_blob_urls=True,
description=f"Local storage for dataset {dataset_info.name}"
)
if not storage_result:
# Local storage setup failed; log a warning but keep going
logger.warning(f"Failed to configure local storage for project {project_id}")
else:
logger.info(f"Local storage configured for project {project_id}: {local_storage_path}")
# Create the mapping record, including the project name
mapping = await service.create_mapping(
request,
str(project_id),
project_name
)
response_data = DatasetMappingCreateResponse(
id=mapping.id,
labeling_project_id=str(mapping.labeling_project_id),
labeling_project_name=mapping.name or project_name,
message="Dataset mapping created successfully"
)
return StandardResponse(
code=201,
message="success",
data=response_data
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error while creating dataset mapping: {e}")
raise HTTPException(status_code=500, detail="Internal server error")
@router.get("/", response_model=StandardResponse[PaginatedData[DatasetMappingResponse]])
async def list_mappings(
page: int = Query(1, ge=1, description="页码(从1开始)"),
page_size: int = Query(20, ge=1, le=100, description="每页记录数"),
db: AsyncSession = Depends(get_db)
):
"""
查询所有映射关系(分页)
返回所有有效的数据集映射关系(未被软删除的),支持分页查询
"""
try:
service = DatasetMappingService(db)
# Compute the offset
skip = (page - 1) * page_size
logger.info(f"Listing mappings, page={page}, page_size={page_size}")
# Fetch the page of data plus the total count
mappings, total = await service.get_all_mappings_with_count(
skip=skip,
limit=page_size
)
# Compute the total number of pages
total_pages = math.ceil(total / page_size) if total > 0 else 0
# Build the paginated response
paginated_data = PaginatedData(
page=page,
size=page_size,
total_elements=total,
total_pages=total_pages,
content=mappings
)
logger.info(f"Found {len(mappings)} mappings on page {page}, total: {total}")
return StandardResponse(
code=200,
message="success",
data=paginated_data
)
except Exception as e:
logger.error(f"Error listing mappings: {e}")
raise HTTPException(status_code=500, detail="Internal server error")
@router.get("/{mapping_id}", response_model=StandardResponse[DatasetMappingResponse])
async def get_mapping(
mapping_id: str,
db: AsyncSession = Depends(get_db)
):
"""
根据 UUID 查询单个映射关系
"""
try:
service = DatasetMappingService(db)
logger.info(f"Get mapping: {mapping_id}")
mapping = await service.get_mapping_by_uuid(mapping_id)
if not mapping:
raise HTTPException(
status_code=404,
detail=f"Mapping not found: {mapping_id}"
)
logger.info(f"Found mapping: {mapping.id}")
return StandardResponse(
code=200,
message="success",
data=mapping
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error getting mapping: {e}")
raise HTTPException(status_code=500, detail="Internal server error")
@router.get("/by-source/{dataset_id}", response_model=StandardResponse[PaginatedData[DatasetMappingResponse]])
async def get_mappings_by_source(
dataset_id: str,
page: int = Query(1, ge=1, description="页码(从1开始)"),
page_size: int = Query(20, ge=1, le=100, description="每页记录数"),
db: AsyncSession = Depends(get_db)
):
"""
根据源数据集 ID 查询所有映射关系(分页)
返回该数据集创建的所有标注项目(不包括已删除的),支持分页查询
"""
try:
service = DatasetMappingService(db)
# Compute the offset
skip = (page - 1) * page_size
logger.info(f"Get mappings by source dataset id: {dataset_id}, page={page}, page_size={page_size}")
# Fetch the page of data plus the total count
mappings, total = await service.get_mappings_by_source_with_count(
dataset_id=dataset_id,
skip=skip,
limit=page_size
)
# Compute the total number of pages
total_pages = math.ceil(total / page_size) if total > 0 else 0
# Build the paginated response
paginated_data = PaginatedData(
page=page,
size=page_size,
total_elements=total,
total_pages=total_pages,
content=mappings
)
logger.info(f"Found {len(mappings)} mappings on page {page}, total: {total}")
return StandardResponse(
code=200,
message="success",
data=paginated_data
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error getting mappings: {e}")
raise HTTPException(status_code=500, detail="Internal server error")
@router.delete("/", response_model=StandardResponse[DeleteDatasetResponse])
async def delete_mapping(
m: Optional[str] = Query(None, description="映射UUID"),
proj: Optional[str] = Query(None, description="Label Studio项目ID"),
db: AsyncSession = Depends(get_db)
):
"""
删除映射关系和对应的 Label Studio 项目
可以通过以下任一方式指定要删除的映射:
- m: 映射UUID
- proj: Label Studio项目ID
- 两者都提供(优先使用 m)
此操作会:
1. 删除 Label Studio 中的项目
2. 软删除数据库中的映射记录
"""
try:
# At least one parameter must be provided
if not m and not proj:
raise HTTPException(
status_code=400,
detail="Either 'm' (mapping UUID) or 'proj' (project ID) must be provided"
)
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
token=settings.label_studio_user_token)
service = DatasetMappingService(db)
# Prefer lookup by mapping UUID
if m:
logger.debug(f"Deleting by mapping UUID: {m}")
mapping = await service.get_mapping_by_uuid(m)
# Otherwise fall back to lookup by project ID
elif proj:
logger.debug(f"Deleting by project ID: {proj}")
mapping = await service.get_mapping_by_labeling_project_id(proj)
else:
mapping = None
if not mapping:
raise HTTPException(
status_code=404,
detail=f"Mapping either not found or not specified."
)
mapping_id = mapping.id
labeling_project_id = mapping.labeling_project_id
labeling_project_name = mapping.name
logger.debug(f"Found mapping: {mapping_id}, Label Studio project ID: {labeling_project_id}")
# 1. Delete the Label Studio project
try:
delete_success = await ls_client.delete_project(int(labeling_project_id))
if delete_success:
logger.debug(f"Successfully deleted Label Studio project: {labeling_project_id}")
else:
logger.warning(f"Failed to delete Label Studio project or project not found: {labeling_project_id}")
except Exception as e:
logger.error(f"Error deleting Label Studio project: {e}")
# Keep going: the mapping record should still be removed even if the Label Studio deletion failed
# 2. Soft-delete the mapping record
soft_delete_success = await service.soft_delete_mapping(mapping_id)
if not soft_delete_success:
raise HTTPException(
status_code=500,
detail="Failed to delete mapping record"
)
logger.info(f"Successfully deleted mapping: {id}, Label Studio project: {labeling_project_id}")
return StandardResponse(
code=200,
message="success",
data=DeleteDatasetResponse(
id=mapping_id,
status="success",
message=f"Successfully deleted mapping and Label Studio project '{labeling_project_name}'"
)
)
except HTTPException:
raise
except Exception as e:
logger.error(f"Error deleting mapping: {e}")
raise HTTPException(status_code=500, detail="Internal server error")
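
A sketch of exercising the create endpoint over HTTP (host, port, and dataset UUID are placeholders; datasetId is the camelCase alias that BaseResponseModel generates for dataset_id):

import asyncio
import httpx

async def demo():
    async with httpx.AsyncClient(base_url="http://localhost:8000") as http:
        resp = await http.post(
            "/api/annotation/project/",
            json={"datasetId": "<dataset-uuid>"},  # dataset_id also works (populate_by_name)
        )
        print(resp.status_code, resp.json())

asyncio.run(demo())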

View File

@@ -0,0 +1,76 @@
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.ext.asyncio import AsyncSession
from typing import List, Optional
from app.db.session import get_db
from app.module.shared.schema import StandardResponse
from app.module.dataset import DatasetManagementService
from app.core.logging import get_logger
from app.core.config import settings
from app.exception import NoDatasetInfoFoundError, DatasetMappingNotFoundError
from ..client import LabelStudioClient
from ..service.sync import SyncService
from ..service.mapping import DatasetMappingService
from ..schema import (
SyncDatasetRequest,
SyncDatasetResponse,
)
router = APIRouter(
prefix="/task",
tags=["annotation/task"]
)
logger = get_logger(__name__)
@router.post("/sync", response_model=StandardResponse[SyncDatasetResponse])
async def sync_dataset_content(
request: SyncDatasetRequest,
db: AsyncSession = Depends(get_db)
):
"""
同步数据集内容
根据指定的mapping ID,同步DM程序数据集中的内容到Label Studio数据集中,
在数据库中记录更新时间,返回更新状态
"""
try:
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
token=settings.label_studio_user_token)
dm_client = DatasetManagementService(db)
mapping_service = DatasetMappingService(db)
sync_service = SyncService(dm_client, ls_client, mapping_service)
logger.info(f"Sync dataset content request: mapping_id={request.id}")
# Validate request.id
mapping = await mapping_service.get_mapping_by_uuid(request.id)
if not mapping:
raise HTTPException(
status_code=404,
detail=f"Mapping not found: {request.id}"
)
# Run the sync (using the source dataset UUID stored in the mapping)
result = await sync_service.sync_dataset_files(request.id, request.batch_size)
logger.info(f"Sync completed: {result.synced_files}/{result.total_files} files")
return StandardResponse(
code=200,
message="success",
data=result
)
except HTTPException:
raise
except NoDatasetInfoFoundError as e:
logger.error(f"Failed to get dataset info: {e}")
raise HTTPException(status_code=404, detail=str(e))
except DatasetMappingNotFoundError as e:
logger.error(f"Mapping not found: {e}")
raise HTTPException(status_code=404, detail=str(e))
except Exception as e:
logger.error(f"Error syncing dataset content: {e}")
raise HTTPException(status_code=500, detail="Internal server error")
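
The sync endpoint can be exercised the same way (mapping UUID is a placeholder; batchSize is the camelCase alias of batch_size):

import asyncio
import httpx

async def demo_sync():
    async with httpx.AsyncClient(base_url="http://localhost:8000") as http:
        resp = await http.post(
            "/api/annotation/task/sync",
            json={"id": "<mapping-uuid>", "batchSize": 50},
        )
        print(resp.status_code, resp.json())

asyncio.run(demo_sync())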

View File

@@ -0,0 +1,24 @@
from .mapping import (
DatasetMappingBase,
DatasetMappingCreateRequest,
DatasetMappingCreateResponse,
DatasetMappingUpdateRequest,
DatasetMappingResponse,
DeleteDatasetResponse
)
from .sync import (
SyncDatasetRequest,
SyncDatasetResponse
)
__all__ = [
"DatasetMappingBase",
"DatasetMappingCreateRequest",
"DatasetMappingCreateResponse",
"DatasetMappingUpdateRequest",
"DatasetMappingResponse",
"SyncDatasetRequest",
"SyncDatasetResponse",
"DeleteDatasetResponse"
]

View File

@@ -0,0 +1,42 @@
from pydantic import Field
from typing import Optional
from datetime import datetime
from app.module.shared.schema import BaseResponseModel
class DatasetMappingBase(BaseResponseModel):
"""Dataset mapping: base model"""
dataset_id: str = Field(..., description="Source dataset ID")
class DatasetMappingCreateRequest(DatasetMappingBase):
"""Dataset mapping: create request model"""
pass
class DatasetMappingCreateResponse(BaseResponseModel):
"""Dataset mapping: create response model"""
id: str = Field(..., description="Mapping UUID")
labeling_project_id: str = Field(..., description="Label Studio project ID")
labeling_project_name: str = Field(..., description="Label Studio project name")
message: str = Field(..., description="Response message")
class DatasetMappingUpdateRequest(BaseResponseModel):
"""Dataset mapping: update request model"""
dataset_id: Optional[str] = Field(None, description="Source dataset ID")
class DatasetMappingResponse(DatasetMappingBase):
"""Dataset mapping: query response model"""
id: str = Field(..., description="Mapping UUID")
labeling_project_id: str = Field(..., description="Labeling project ID")
name: Optional[str] = Field(None, description="Labeling project name")
created_at: datetime = Field(..., description="Creation time")
deleted_at: Optional[datetime] = Field(None, description="Deletion time")
class Config:
from_attributes = True
populate_by_name = True
class DeleteDatasetResponse(BaseResponseModel):
"""Delete dataset response model"""
id: str = Field(..., description="Mapping UUID")
status: str = Field(..., description="Deletion status")
message: str = Field(..., description="Response message")

View File

@@ -0,0 +1,19 @@
from pydantic import Field
from typing import Optional
from datetime import datetime
from app.module.shared.schema import BaseResponseModel
class SyncDatasetRequest(BaseResponseModel):
"""Sync dataset request model"""
id: str = Field(..., description="Mapping ID (mapping UUID)")
batch_size: int = Field(50, ge=1, le=100, description="Batch size")
class SyncDatasetResponse(BaseResponseModel):
"""Sync dataset response model"""
id: str = Field(..., description="Mapping UUID")
status: str = Field(..., description="Sync status")
synced_files: int = Field(..., description="Number of files synced")
total_files: int = Field(0, description="Total number of files")
message: str = Field(..., description="Response message")

View File

@@ -0,0 +1,283 @@
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy import update, func
from typing import Optional, List, Tuple
from datetime import datetime
import uuid
from app.core.logging import get_logger
from app.db.models import LabelingProject
from app.module.annotation.schema import (
DatasetMappingCreateRequest,
DatasetMappingUpdateRequest,
DatasetMappingResponse
)
logger = get_logger(__name__)
class DatasetMappingService:
"""数据集映射服务"""
def __init__(self, db: AsyncSession):
self.db = db
async def create_mapping(
self,
mapping_data: DatasetMappingCreateRequest,
labeling_project_id: str,
labeling_project_name: str
) -> DatasetMappingResponse:
"""创建数据集映射"""
logger.info(f"Create dataset mapping: {mapping_data.dataset_id} -> {labeling_project_id}")
db_mapping = LabelingProject(
id=str(uuid.uuid4()),
dataset_id=mapping_data.dataset_id,
labeling_project_id=labeling_project_id,
name=labeling_project_name
)
self.db.add(db_mapping)
await self.db.commit()
await self.db.refresh(db_mapping)
logger.debug(f"Mapping created: {db_mapping.id}")
return DatasetMappingResponse.model_validate(db_mapping)
async def get_mapping_by_source_uuid(
self,
dataset_id: str
) -> Optional[DatasetMappingResponse]:
"""根据源数据集ID获取映射(返回第一个未删除的)"""
logger.debug(f"Get mapping by source dataset id: {dataset_id}")
result = await self.db.execute(
select(LabelingProject).where(
LabelingProject.dataset_id == dataset_id,
LabelingProject.deleted_at.is_(None)
)
)
# A dataset may map to several projects; take the first non-deleted row
mapping = result.scalars().first()
if mapping:
logger.debug(f"Found mapping: {mapping.id}")
return DatasetMappingResponse.model_validate(mapping)
logger.debug(f"No mapping found for source dataset id: {dataset_id}")
return None
async def get_mappings_by_dataset_id(
self,
dataset_id: str,
include_deleted: bool = False
) -> List[DatasetMappingResponse]:
"""根据源数据集ID获取所有映射关系"""
logger.debug(f"Get all mappings by source dataset id: {dataset_id}")
query = select(LabelingProject).where(
LabelingProject.dataset_id == dataset_id
)
if not include_deleted:
query = query.where(LabelingProject.deleted_at.is_(None))
result = await self.db.execute(
query.order_by(LabelingProject.created_at.desc())
)
mappings = result.scalars().all()
logger.debug(f"Found {len(mappings)} mappings")
return [DatasetMappingResponse.model_validate(mapping) for mapping in mappings]
async def get_mapping_by_labeling_project_id(
self,
labeling_project_id: str
) -> Optional[DatasetMappingResponse]:
"""根据Label Studio项目ID获取映射"""
logger.debug(f"Get mapping by Label Studio project id: {labeling_project_id}")
result = await self.db.execute(
select(LabelingProject).where(
LabelingProject.labeling_project_id == labeling_project_id,
LabelingProject.deleted_at.is_(None)
)
)
mapping = result.scalar_one_or_none()
if mapping:
logger.debug(f"Found mapping: {mapping.mapping_id}")
return DatasetMappingResponse.model_validate(mapping)
logger.debug(f"No mapping found for Label Studio project id: {labeling_project_id}")
return None
async def get_mapping_by_uuid(self, mapping_id: str) -> Optional[DatasetMappingResponse]:
"""根据映射UUID获取映射"""
logger.debug(f"Get mapping: {mapping_id}")
result = await self.db.execute(
select(LabelingProject).where(
LabelingProject.id == mapping_id,
LabelingProject.deleted_at.is_(None)
)
)
mapping = result.scalar_one_or_none()
if mapping:
logger.debug(f"Found mapping: {mapping.id}")
return DatasetMappingResponse.model_validate(mapping)
logger.debug(f"Mapping not found: {mapping_id}")
return None
async def update_mapping(
self,
mapping_id: str,
update_data: DatasetMappingUpdateRequest
) -> Optional[DatasetMappingResponse]:
"""更新映射信息"""
logger.info(f"Update mapping: {mapping_id}")
mapping = await self.get_mapping_by_uuid(mapping_id)
if not mapping:
return None
update_values = update_data.model_dump(exclude_unset=True)
update_values["last_updated_at"] = datetime.now()
result = await self.db.execute(
update(LabelingProject)
.where(LabelingProject.id == mapping_id)
.values(**update_values)
)
await self.db.commit()
if result.rowcount > 0:
return await self.get_mapping_by_uuid(mapping_id)
return None
async def soft_delete_mapping(self, mapping_id: str) -> bool:
"""软删除映射"""
logger.info(f"Soft delete mapping: {mapping_id}")
result = await self.db.execute(
update(LabelingProject)
.where(
LabelingProject.id == mapping_id,
LabelingProject.deleted_at.is_(None)
)
.values(deleted_at=datetime.now())
)
await self.db.commit()
success = result.rowcount > 0
if success:
logger.info(f"Mapping soft-deleted: {mapping_id}")
else:
logger.warning(f"Mapping not exists or already deleted: {mapping_id}")
return success
async def get_all_mappings(
self,
skip: int = 0,
limit: int = 100
) -> List[DatasetMappingResponse]:
"""获取所有有效映射"""
logger.debug(f"List all mappings, skip: {skip}, limit: {limit}")
result = await self.db.execute(
select(LabelingProject)
.where(LabelingProject.deleted_at.is_(None))
.offset(skip)
.limit(limit)
.order_by(LabelingProject.created_at.desc())
)
mappings = result.scalars().all()
logger.debug(f"Found {len(mappings)} mappings")
return [DatasetMappingResponse.model_validate(mapping) for mapping in mappings]
async def count_mappings(self, include_deleted: bool = False) -> int:
"""统计映射总数"""
query = select(func.count()).select_from(LabelingProject)
if not include_deleted:
query = query.where(LabelingProject.deleted_at.is_(None))
result = await self.db.execute(query)
return result.scalar_one()
async def get_all_mappings_with_count(
self,
skip: int = 0,
limit: int = 100,
include_deleted: bool = False
) -> Tuple[List[DatasetMappingResponse], int]:
"""获取所有映射及总数(用于分页)"""
logger.debug(f"List all mappings with count, skip: {skip}, limit: {limit}")
# 构建查询
query = select(LabelingProject)
if not include_deleted:
query = query.where(LabelingProject.deleted_at.is_(None))
# Fetch the total count
count_query = select(func.count()).select_from(LabelingProject)
if not include_deleted:
count_query = count_query.where(LabelingProject.deleted_at.is_(None))
count_result = await self.db.execute(count_query)
total = count_result.scalar_one()
# Fetch the data
result = await self.db.execute(
query
.offset(skip)
.limit(limit)
.order_by(LabelingProject.created_at.desc())
)
mappings = result.scalars().all()
logger.debug(f"Found {len(mappings)} mappings, total: {total}")
return [DatasetMappingResponse.model_validate(mapping) for mapping in mappings], total
async def get_mappings_by_source_with_count(
self,
dataset_id: str,
skip: int = 0,
limit: int = 100,
include_deleted: bool = False
) -> Tuple[List[DatasetMappingResponse], int]:
"""根据源数据集ID获取映射关系及总数(用于分页)"""
logger.debug(f"Get mappings by source dataset id with count: {dataset_id}")
# 构建查询
query = select(LabelingProject).where(
LabelingProject.dataset_id == dataset_id
)
if not include_deleted:
query = query.where(LabelingProject.deleted_at.is_(None))
# Fetch the total count
count_query = select(func.count()).select_from(LabelingProject).where(
LabelingProject.dataset_id == dataset_id
)
if not include_deleted:
count_query = count_query.where(LabelingProject.deleted_at.is_(None))
count_result = await self.db.execute(count_query)
total = count_result.scalar_one()
# Fetch the data
result = await self.db.execute(
query
.offset(skip)
.limit(limit)
.order_by(LabelingProject.created_at.desc())
)
mappings = result.scalars().all()
logger.debug(f"Found {len(mappings)} mappings, total: {total}")
return [DatasetMappingResponse.model_validate(mapping) for mapping in mappings], total
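
A sketch of driving the service directly (the async_session_maker factory name is an assumption; substitute whatever app.db.session actually exports):

from app.db.session import async_session_maker  # name assumed

async def demo():
    async with async_session_maker() as db:
        service = DatasetMappingService(db)
        mappings, total = await service.get_all_mappings_with_count(skip=0, limit=20)
        print(total, [m.labeling_project_id for m in mappings])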

View File

@@ -0,0 +1,272 @@
from typing import Optional, List, Dict, Any, Tuple
from app.module.dataset import DatasetManagementService
from app.core.logging import get_logger
from app.core.config import settings
from app.exception import NoDatasetInfoFoundError
from ..client import LabelStudioClient
from ..schema import SyncDatasetResponse
from ..service.mapping import DatasetMappingService
logger = get_logger(__name__)
class SyncService:
"""数据同步服务"""
def __init__(
self,
dm_client: DatasetManagementService,
ls_client: LabelStudioClient,
mapping_service: DatasetMappingService
):
self.dm_client = dm_client
self.ls_client = ls_client
self.mapping_service = mapping_service
def determine_data_type(self, file_type: str) -> str:
"""根据文件类型确定数据类型"""
file_type_lower = file_type.lower()
if any(ext in file_type_lower for ext in ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'svg', 'webp']):
return 'image'
elif any(ext in file_type_lower for ext in ['mp3', 'wav', 'flac', 'aac', 'ogg']):
return 'audio'
elif any(ext in file_type_lower for ext in ['mp4', 'avi', 'mov', 'wmv', 'flv', 'webm']):
return 'video'
elif any(ext in file_type_lower for ext in ['txt', 'doc', 'docx', 'pdf']):
return 'text'
else:
return 'image'  # default to image
async def get_existing_dm_file_mapping(self, project_id: str) -> Dict[str, int]:
"""
获取Label Studio项目中已存在的DM文件ID到任务ID的映射
Args:
project_id: Label Studio项目ID
Returns:
file_id到task_id的映射字典
"""
try:
# Read the page size defensively so a missing setting cannot break the log line below
page_size = getattr(settings, 'ls_task_page_size', 1000)
logger.info(f"Fetching existing task mappings for project {project_id} (page_size={page_size})")
dm_file_to_task_mapping = {}
# Fetch all tasks via the Label Studio client wrapper (page=None means fetch everything)
result = await self.ls_client.get_project_tasks(
project_id=project_id,
page=None,  # no page given: fetch all tasks
page_size=page_size
)
logger.info(f"Fetched tasks result: {result}")
if not result:
logger.warning(f"Failed to fetch tasks for project {project_id}")
return {}
logger.info(f"Successfully fetched tasks for project {project_id}")
all_tasks = result.get("tasks", [])
# Walk all tasks and build the mapping
for task in all_tasks:
try:
file_id = task.get('data', {}).get('file_id')
task_id = task.get('id')
if file_id is None:
continue  # skip tasks that were not created from a DM file
dm_file_to_task_mapping[str(file_id)] = task_id
except Exception as e:
logger.error(f"Error processing task {task.get('id')}: {e}")
continue
logger.debug(f"Existing file-to-task mapping: {dm_file_to_task_mapping}")
logger.info(f"Found {len(dm_file_to_task_mapping)} existing task mappings")
return dm_file_to_task_mapping
except Exception as e:
logger.error(f"Error while fetching existing tasks: {e}")
return {}  # on error, return an empty dict, which makes the sync treat every file as new
async def sync_dataset_files(
self,
id: str,
batch_size: int = 50
) -> SyncDatasetResponse:
"""同步数据集文件到Label Studio"""
logger.info(f"Start syncing dataset by mapping: {id}")
# Look up the mapping
mapping = await self.mapping_service.get_mapping_by_uuid(id)
if not mapping:
logger.error(f"Dataset mapping not found: {id}")
return SyncDatasetResponse(
id="",
status="error",
synced_files=0,
total_files=0,
message=f"Dataset mapping not found: {id}"
)
try:
# Fetch dataset info
dataset_info = await self.dm_client.get_dataset(mapping.dataset_id)
if not dataset_info:
raise NoDatasetInfoFoundError(mapping.dataset_id)
synced_files = 0
deleted_tasks = 0
total_files = dataset_info.fileCount
page = 0
logger.info(f"Total files in dataset: {total_files}")
# Map DM file IDs to the tasks that already exist in Label Studio
existing_dm_file_mapping = await self.get_existing_dm_file_mapping(mapping.labeling_project_id)
existing_file_ids = set(existing_dm_file_mapping.keys())
logger.info(f"{len(existing_file_ids)} tasks already exist in Label Studio")
# Collect the IDs of all files currently present in DM
current_file_ids = set()
while True:
files_response = await self.dm_client.get_dataset_files(
mapping.dataset_id,
page=page,
size=batch_size,
)
if not files_response or not files_response.content:
logger.info(f"No more files on page {page + 1}")
break
logger.info(f"Processing page {page + 1}, total {len(files_response.content)} files")
# Pick out new files and create their tasks in batch
tasks = []
new_files_count = 0
existing_files_count = 0
for file_info in files_response.content:
# Record this file as currently present in DM
current_file_ids.add(str(file_info.id))
# Skip files that already have a task
if str(file_info.id) in existing_file_ids:
existing_files_count += 1
logger.debug(f"Skip existing file: {file_info.originalName} (ID: {file_info.id})")
continue
new_files_count += 1
data_type = self.determine_data_type(file_info.fileType)
# Swap the file path prefix: only the leading prefix is replaced, so the same string in the middle of a path is untouched
file_path = file_info.filePath.removeprefix(settings.dm_file_path_prefix)
file_path = settings.label_studio_file_path_prefix + file_path
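# e.g. with dm_file_path_prefix="/dm-files" and label_studio_file_path_prefix="/data/local-files/?d=",
# "/dm-files/dataset/42/dog.png" becomes "/data/local-files/?d=/dataset/42/dog.png"
# (the prefix values here are hypothetical; the real ones come from settings)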
# Build the task payload
task_data = {
"data": {
f"{data_type}": file_path,
"file_path": file_info.filePath,
"file_id": file_info.id,
"original_name": file_info.originalName,
"dataset_id": mapping.dataset_id,
}
}
tasks.append(task_data)
logger.info(f"Page {page + 1}: new files {new_files_count}, existing files {existing_files_count}")
# Create the Label Studio tasks in batch
if tasks:
batch_result = await self.ls_client.create_tasks_batch(
mapping.labeling_project_id,
tasks
)
if batch_result:
synced_files += len(tasks)
logger.info(f"Successfully synced {len(tasks)} files")
else:
logger.warning(f"Batch task creation failed, fallback to single creation")
# 如果批量创建失败,尝试单个创建
for task_data in tasks:
task_result = await self.ls_client.create_task(
mapping.labeling_project_id,
task_data["data"],
task_data.get("meta")
)
if task_result:
synced_files += 1
# Stop once the last page has been processed
if page >= files_response.totalPages - 1:
break
page += 1
# Clean up tasks that exist in Label Studio but no longer exist in DM
tasks_to_delete = []
for file_id, task_id in existing_dm_file_mapping.items():
if file_id not in current_file_ids:
tasks_to_delete.append(task_id)
logger.debug(f"Mark task for deletion: {task_id} (DM file ID: {file_id})")
if tasks_to_delete:
logger.info(f"Deleting {len(tasks_to_delete)} tasks not present in DM")
delete_result = await self.ls_client.delete_tasks_batch(tasks_to_delete)
deleted_tasks = delete_result.get("successful", 0)
logger.info(f"Successfully deleted {deleted_tasks} tasks")
else:
logger.info("No tasks to delete")
logger.info(f"Sync completed: total_files={total_files}, created={synced_files}, deleted={deleted_tasks}")
return SyncDatasetResponse(
id=mapping.id,
status="success",
synced_files=synced_files,
total_files=total_files,
message=f"Sync completed: created {synced_files} files, deleted {deleted_tasks} tasks"
)
except Exception as e:
logger.error(f"Error while syncing dataset: {e}")
return SyncDatasetResponse(
id=mapping.id,
status="error",
synced_files=0,
total_files=0,
message=f"Sync failed: {str(e)}"
)
async def get_sync_status(
self,
dataset_id: str
) -> Optional[Dict[str, Any]]:
"""获取同步状态"""
mapping = await self.mapping_service.get_mapping_by_source_uuid(dataset_id)
if not mapping:
return None
# Fetch DM dataset info
dataset_info = await self.dm_client.get_dataset(dataset_id)
# Fetch the Label Studio project task count
tasks_info = await self.ls_client.get_project_tasks(mapping.labeling_project_id)
return {
"id": mapping.id,
"dataset_id": dataset_id,
"labeling_project_id": mapping.labeling_project_id,
"dm_total_files": dataset_info.fileCount if dataset_info else 0,
"ls_total_tasks": tasks_info.get("count", 0) if tasks_info else 0,
"sync_ratio": (
tasks_info.get("count", 0) / dataset_info.fileCount
if dataset_info and dataset_info.fileCount > 0 and tasks_info else 0
)
}
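
determine_data_type never touches instance state, so its fallback behaviour can be checked with stubbed dependencies (a sketch only; passing None works solely because the method ignores them):

svc = SyncService(dm_client=None, ls_client=None, mapping_service=None)  # type: ignore[arg-type]
assert svc.determine_data_type("JPEG") == "image"
assert svc.determine_data_type("flac") == "audio"
assert svc.determine_data_type("md") == "image"  # unknown types fall back to "image"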

View File

@@ -0,0 +1,3 @@
from .service import DatasetManagementService
__all__ = ["DatasetManagementService"]

View File

@@ -0,0 +1,16 @@
from .dataset_file import (
DatasetFileResponse,
PagedDatasetFileResponse,
)
from .dataset import (
DatasetResponse,
DatasetTypeResponse,
)
__all__ = [
"DatasetResponse",
"DatasetFileResponse",
"PagedDatasetFileResponse",
"DatasetTypeResponse",
]

View File

@@ -0,0 +1,36 @@
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any
from datetime import datetime
class DatasetTypeResponse(BaseModel):
"""Dataset type response model"""
code: str = Field(..., description="Type code")
name: str = Field(..., description="Type name")
description: Optional[str] = Field(None, description="Type description")
supportedFormats: List[str] = Field(default_factory=list, description="Supported file formats")
icon: Optional[str] = Field(None, description="Icon")
class DatasetResponse(BaseModel):
"""DM service dataset response model"""
id: str = Field(..., description="Dataset ID")
name: str = Field(..., description="Dataset name")
description: Optional[str] = Field(None, description="Dataset description")
datasetType: str = Field(..., description="Dataset type", alias="datasetType")
status: str = Field(..., description="Dataset status")
fileCount: int = Field(..., description="File count")
totalSize: int = Field(..., description="Total size in bytes")
createdAt: Optional[datetime] = Field(None, description="Creation time")
updatedAt: Optional[datetime] = Field(None, description="Last update time")
createdBy: Optional[str] = Field(None, description="Creator")
# Backwards-compatibility property that returns the type as an object
@property
def type(self) -> DatasetTypeResponse:
"""Compatibility property: return the type as an object"""
return DatasetTypeResponse(
code=self.datasetType,
name=self.datasetType,
description=None,
supportedFormats=[],
icon=None
)
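
The compatibility property keeps older call sites such as dataset_info.type.code working (see the project router above); a quick illustration with made-up values:

ds = DatasetResponse(id="1", name="demo", datasetType="IMAGE", status="READY", fileCount=0, totalSize=0)
print(ds.type.code)  # "IMAGE"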

View File

@@ -0,0 +1,26 @@
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any
from datetime import datetime
class DatasetFileResponse(BaseModel):
"""DM service dataset file response model"""
id: str = Field(..., description="File ID")
fileName: str = Field(..., description="File name")
fileType: str = Field(..., description="File type")
filePath: str = Field(..., description="File path")
originalName: Optional[str] = Field(None, description="Original file name")
size: Optional[int] = Field(None, description="File size in bytes")
status: Optional[str] = Field(None, description="File status")
uploadedAt: Optional[datetime] = Field(None, description="Upload time")
description: Optional[str] = Field(None, description="File description")
uploadedBy: Optional[str] = Field(None, description="Uploader")
lastAccessTime: Optional[datetime] = Field(None, description="Last access time")
class PagedDatasetFileResponse(BaseModel):
"""DM service paged file response model"""
content: List[DatasetFileResponse] = Field(..., description="File list")
totalElements: int = Field(..., description="Total number of elements")
totalPages: int = Field(..., description="Total number of pages")
page: int = Field(..., description="Current page number")
size: int = Field(..., description="Page size")

View File

@@ -0,0 +1,3 @@
from .service import Service as DatasetManagementService
__all__ = ["DatasetManagementService"]

View File

@@ -0,0 +1,160 @@
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy import func
from typing import Optional
from app.core.config import settings
from app.core.logging import get_logger
from app.db.models import Dataset, DatasetFiles
from ..schema import DatasetResponse, PagedDatasetFileResponse, DatasetFileResponse
logger = get_logger(__name__)
class Service:
"""数据管理服务客户端 - 直接访问数据库"""
def __init__(self, db: AsyncSession):
"""
初始化 DM 客户端
Args:
db: 数据库会话
"""
self.db = db
logger.info("Initialize DM service client (Database mode)")
async def get_dataset(self, dataset_id: str) -> Optional[DatasetResponse]:
"""获取数据集详情"""
try:
logger.info(f"Getting dataset detail: {dataset_id} ...")
result = await self.db.execute(
select(Dataset).where(Dataset.id == dataset_id)
)
dataset = result.scalar_one_or_none()
if not dataset:
logger.error(f"Dataset not found: {dataset_id}")
return None
# Convert the DB model into the response model
# type: ignore suppresses SQLAlchemy-related type-checker complaints
return DatasetResponse(
id=dataset.id, # type: ignore
name=dataset.name, # type: ignore
description=dataset.description or "", # type: ignore
datasetType=dataset.dataset_type, # type: ignore
status=dataset.status, # type: ignore
fileCount=dataset.file_count or 0, # type: ignore
totalSize=dataset.size_bytes or 0, # type: ignore
createdAt=dataset.created_at, # type: ignore
updatedAt=dataset.updated_at, # type: ignore
createdBy=dataset.created_by # type: ignore
)
except Exception as e:
logger.error(f"Failed to get dataset {dataset_id}: {e}")
return None
async def get_dataset_files(
self,
dataset_id: str,
page: int = 0,
size: int = 100,
file_type: Optional[str] = None,
status: Optional[str] = None
) -> Optional[PagedDatasetFileResponse]:
"""获取数据集文件列表"""
try:
logger.info(f"Get dataset files: dataset={dataset_id}, page={page}, size={size}")
# Build the query
query = select(DatasetFiles).where(DatasetFiles.dataset_id == dataset_id)
# Apply optional filters
if file_type:
query = query.where(DatasetFiles.file_type == file_type)
if status:
query = query.where(DatasetFiles.status == status)
# Fetch the total count
count_query = select(func.count()).select_from(DatasetFiles).where(
DatasetFiles.dataset_id == dataset_id
)
if file_type:
count_query = count_query.where(DatasetFiles.file_type == file_type)
if status:
count_query = count_query.where(DatasetFiles.status == status)
count_result = await self.db.execute(count_query)
total = count_result.scalar_one()
# Paginated query
query = query.offset(page * size).limit(size).order_by(DatasetFiles.created_at.desc())
result = await self.db.execute(query)
files = result.scalars().all()
# Convert to response models
# type: ignore suppresses SQLAlchemy-related type-checker complaints
content = [
DatasetFileResponse(
id=f.id, # type: ignore
fileName=f.file_name, # type: ignore
fileType=f.file_type or "", # type: ignore
filePath=f.file_path, # type: ignore
originalName=f.file_name, # type: ignore
size=f.file_size, # type: ignore
status=f.status, # type: ignore
uploadedAt=f.upload_time, # type: ignore
description=None,
uploadedBy=None,
lastAccessTime=f.last_access_time # type: ignore
)
for f in files
]
total_pages = (total + size - 1) // size if size > 0 else 0
return PagedDatasetFileResponse(
content=content,
totalElements=total,
totalPages=total_pages,
page=page,
size=size
)
except Exception as e:
logger.error(f"Failed to get dataset files for {dataset_id}: {e}")
return None
async def download_file(self, dataset_id: str, file_id: str) -> Optional[bytes]:
"""
下载文件内容
注意:此方法保留接口兼容性,但实际文件下载可能需要通过文件系统或对象存储
"""
logger.warning(f"download_file is deprecated when using database mode. Use get_file_download_url instead.")
return None
async def get_file_download_url(self, dataset_id: str, file_id: str) -> Optional[str]:
"""获取文件下载URL(或文件路径)"""
try:
result = await self.db.execute(
select(DatasetFiles).where(
DatasetFiles.id == file_id,
DatasetFiles.dataset_id == dataset_id
)
)
file = result.scalar_one_or_none()
if not file:
logger.error(f"File not found: {file_id} in dataset {dataset_id}")
return None
# Return the file path (a local path or an object-storage URL)
return file.file_path # type: ignore
except Exception as e:
logger.error(f"Failed to get file path for {file_id}: {e}")
return None
async def close(self):
"""关闭客户端连接(数据库模式下无需操作)"""
logger.info("DM service client closed (Database mode)")

View File

@@ -0,0 +1,33 @@
from fastapi import APIRouter
from typing import Dict, Any
from app.core.config import settings
from app.schemas import StandardResponse
router = APIRouter()
@router.get("/health", response_model=StandardResponse[Dict[str, Any]])
async def health_check():
"""健康检查端点"""
return StandardResponse(
code=200,
message="success",
data={
"status": "healthy",
"service": "Label Studio Adapter",
"version": settings.app_version
}
)
@router.get("/config", response_model=StandardResponse[Dict[str, Any]])
async def get_config():
"""获取配置信息"""
return StandardResponse(
code=200,
message="success",
data={
"app_name": settings.app_name,
"version": settings.app_version,
"label_studio_url": settings.label_studio_base_url,
"debug": settings.debug
}
)

View File

@@ -0,0 +1,11 @@
from .common import (
BaseResponseModel,
StandardResponse,
PaginatedData
)
__all__ = [
"BaseResponseModel",
"StandardResponse",
"PaginatedData"
]

View File

@@ -0,0 +1,62 @@
"""
通用响应模型
"""
from typing import Generic, TypeVar, Optional, List
from pydantic import BaseModel, Field
# Generic type variable
T = TypeVar('T')
# Helper to convert snake_case to camelCase
def to_camel(string: str) -> str:
"""将 snake_case 字符串转换为 camelCase"""
components = string.split('_')
# Keep the first word lowercase, capitalize each following word
return components[0] + ''.join(x.title() for x in components[1:])
class BaseResponseModel(BaseModel):
"""基础响应模型,启用别名生成器"""
class Config:
populate_by_name = True
alias_generator = to_camel
class StandardResponse(BaseResponseModel, Generic[T]):
"""
标准API响应格式
所有API端点应返回此格式,确保响应的一致性
"""
code: int = Field(..., description="HTTP状态码")
message: str = Field(..., description="响应消息")
data: Optional[T] = Field(None, description="响应数据")
class Config:
populate_by_name = True
alias_generator = to_camel
json_schema_extra = {
"example": {
"code": 200,
"message": "success",
"data": {}
}
}
class PaginatedData(BaseResponseModel, Generic[T]):
"""分页数据容器"""
page: int = Field(..., description="当前页码(从1开始)")
size: int = Field(..., description="页大小")
total_elements: int = Field(..., description="总条数")
total_pages: int = Field(..., description="总页数")
content: List[T] = Field(..., description="当前页数据")
class Config:
json_schema_extra = {
"example": {
"page": 1,
"size": 20,
"totalElements": 100,
"totalPages": 5,
"content": []
}
}
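
A quick illustration of the camelCase aliasing these models provide (values are made up):

data = PaginatedData[int](page=1, size=2, total_elements=10, total_pages=5, content=[1, 2])
print(data.model_dump(by_alias=True))
# {'page': 1, 'size': 2, 'totalElements': 10, 'totalPages': 5, 'content': [1, 2]}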