You've already forked DataMate
feat: File and Annotation 2-way sync implementation (#63)
* feat: Refactor configuration and sync logic for improved dataset handling and logging
* feat: Enhance annotation synchronization and dataset file management
  - Added a new field `tags_updated_at` to the `DatasetFiles` model for tracking the last update time of tags.
  - Implemented new asynchronous methods in the Label Studio client for fetching, creating, updating, and deleting task annotations.
  - Introduced bidirectional synchronization for annotations between DataMate and Label Studio, allowing for flexible data management.
  - Updated the sync service to handle annotation conflicts based on timestamps, ensuring data integrity during synchronization.
  - Enhanced the dataset file response model to include tags and their update timestamps.
  - Modified the database initialization script to create a new `tags_updated_at` column in the dataset files table.
  - Updated requirements to ensure compatibility with the latest dependencies.
This commit is contained in:
@@ -1,94 +1,19 @@
|
|||||||
# ====================================
|
# Dev settings
|
||||||
# Label Studio Adapter Configuration
|
|
||||||
# ====================================
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# 应用程序配置
|
|
||||||
# =========================
|
|
||||||
APP_NAME="Label Studio Adapter"
|
|
||||||
APP_VERSION="1.0.0"
|
|
||||||
APP_DESCRIPTION="Adapter for integrating Data Management System with Label Studio"
|
|
||||||
DEBUG=true
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# 服务器配置
|
|
||||||
# =========================
|
|
||||||
HOST=0.0.0.0
|
HOST=0.0.0.0
|
||||||
PORT=18000
|
PORT=18000
|
||||||
|
|
||||||
# =========================
|
DEBUG=true
|
||||||
# 日志配置
|
LOG_LEVEL=DEBUG
|
||||||
# =========================
|
LOG_FILE_DIR=./logs
|
||||||
LOG_LEVEL=INFO
|
|
||||||
|
|
||||||
# =========================
|
# DataBase
|
||||||
# Label Studio 服务配置
|
MYSQL_HOST=localhost
|
||||||
# =========================
|
|
||||||
# Label Studio 服务地址(根据部署方式调整)
|
|
||||||
# Docker 环境:http://label-studio:8080
|
|
||||||
# 本地开发:http://127.0.0.1:8000
|
|
||||||
LABEL_STUDIO_BASE_URL=http://label-studio:8080
|
|
||||||
|
|
||||||
# Label Studio 用户名和密码(用于自动创建用户)
|
|
||||||
LABEL_STUDIO_USERNAME=admin@example.com
|
|
||||||
LABEL_STUDIO_PASSWORD=password
|
|
||||||
|
|
||||||
# Label Studio API 认证 Token(Legacy Token,推荐使用)
|
|
||||||
# 从 Label Studio UI 的 Account & Settings > Access Token 获取
|
|
||||||
LABEL_STUDIO_USER_TOKEN=your-label-studio-token-here
|
|
||||||
|
|
||||||
# Label Studio 本地文件存储基础路径(容器内路径,用于 Docker 部署时的权限检查)
|
|
||||||
LABEL_STUDIO_LOCAL_BASE=/label-studio/local_files
|
|
||||||
|
|
||||||
# Label Studio 本地文件服务路径前缀(任务数据中的文件路径前缀)
|
|
||||||
LABEL_STUDIO_FILE_PATH_PREFIX=/data/local-files/?d=
|
|
||||||
|
|
||||||
# Label Studio 容器中的本地存储路径(用于配置 Local Storage)
|
|
||||||
LABEL_STUDIO_LOCAL_STORAGE_DATASET_BASE_PATH=/label-studio/local_files/dataset
|
|
||||||
LABEL_STUDIO_LOCAL_STORAGE_UPLOAD_BASE_PATH=/label-studio/local_files/upload
|
|
||||||
|
|
||||||
# Label Studio 任务列表分页大小
|
|
||||||
LS_TASK_PAGE_SIZE=1000
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# Data Management 服务配置
|
|
||||||
# =========================
|
|
||||||
# DM 存储文件夹前缀(通常与 Label Studio 的 local-files 文件夹映射一致)
|
|
||||||
DM_FILE_PATH_PREFIX=/
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# Adapter 数据库配置 (MySQL)
|
|
||||||
# =========================
|
|
||||||
# 优先级1:如果配置了 MySQL,将优先使用 MySQL 数据库
|
|
||||||
MYSQL_HOST=adapter-db
|
|
||||||
MYSQL_PORT=3306
|
MYSQL_PORT=3306
|
||||||
MYSQL_USER=label_studio_user
|
MYSQL_USER=root
|
||||||
MYSQL_PASSWORD=user_password
|
MYSQL_PASSWORD=password
|
||||||
MYSQL_DATABASE=label_studio_adapter
|
MYSQL_DATABASE=datamate
|
||||||
|
|
||||||
# =========================
|
# Label Studio settings
|
||||||
# CORS 配置
|
LABEL_STUDIO_BASE_URL=http://localhost:8080
|
||||||
# =========================
|
|
||||||
# 允许的来源(生产环境建议配置具体域名)
|
|
||||||
ALLOWED_ORIGINS=["*"]
|
|
||||||
|
|
||||||
# 允许的 HTTP 方法
|
LABEL_STUDIO_USER_TOKEN="demo_dev_token"
|
||||||
ALLOWED_METHODS=["*"]
|
|
||||||
|
|
||||||
# 允许的请求头
|
|
||||||
ALLOWED_HEADERS=["*"]
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# Docker Compose 配置
|
|
||||||
# =========================
|
|
||||||
# Docker Compose 项目名称前缀
|
|
||||||
COMPOSE_PROJECT_NAME=ls-adapter
|
|
||||||
|
|
||||||
# =========================
|
|
||||||
# 同步配置(未来扩展)
|
|
||||||
# =========================
|
|
||||||
# 批量同步任务的批次大小
|
|
||||||
SYNC_BATCH_SIZE=100
|
|
||||||
|
|
||||||
# 同步失败时的最大重试次数
|
|
||||||
MAX_RETRIES=3
|
|
||||||
|
|||||||
4
runtime/datamate-python/.gitignore
vendored
4
runtime/datamate-python/.gitignore
vendored
@@ -3,4 +3,6 @@
|
|||||||
.dev.env
|
.dev.env
|
||||||
|
|
||||||
# logs
|
# logs
|
||||||
logs/
|
logs/
|
||||||
|
|
||||||
|
doc/
|
||||||
@@ -1,8 +1,6 @@
|
|||||||
from pydantic_settings import BaseSettings
|
from pydantic_settings import BaseSettings
|
||||||
from pydantic import model_validator
|
from pydantic import model_validator
|
||||||
from typing import Optional, List
|
from typing import Optional
|
||||||
import os
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
class Settings(BaseSettings):
|
class Settings(BaseSettings):
|
||||||
"""应用程序配置"""
|
"""应用程序配置"""
|
||||||
@@ -10,39 +8,34 @@ class Settings(BaseSettings):
|
|||||||
class Config:
|
class Config:
|
||||||
env_file = ".env"
|
env_file = ".env"
|
||||||
case_sensitive = False
|
case_sensitive = False
|
||||||
extra = 'ignore' # 允许额外字段(如 Shell 脚本专用的环境变量)
|
extra = 'ignore'
|
||||||
|
|
||||||
# =========================
|
# Service
|
||||||
# Adapter 服务配置
|
app_name: str = "DataMate Python Backend"
|
||||||
# =========================
|
|
||||||
app_name: str = "Label Studio Adapter"
|
|
||||||
app_version: str = "1.0.0"
|
app_version: str = "1.0.0"
|
||||||
app_description: str = "Adapter for integrating Data Management System with Label Studio"
|
app_description: str = "Adapter for integrating Data Management System with Label Studio"
|
||||||
|
|
||||||
# 日志配置
|
|
||||||
log_level: str = "INFO"
|
|
||||||
debug: bool = True
|
|
||||||
log_file_dir: str = "/var/log/datamate"
|
|
||||||
|
|
||||||
# 服务器配置
|
|
||||||
host: str = "0.0.0.0"
|
host: str = "0.0.0.0"
|
||||||
port: int = 8000
|
port: int = 18000
|
||||||
|
|
||||||
# CORS配置
|
# CORS
|
||||||
# allowed_origins: List[str] = ["*"]
|
# allowed_origins: List[str] = ["*"]
|
||||||
# allowed_methods: List[str] = ["*"]
|
# allowed_methods: List[str] = ["*"]
|
||||||
# allowed_headers: List[str] = ["*"]
|
# allowed_headers: List[str] = ["*"]
|
||||||
|
|
||||||
# MySQL数据库配置 (优先级1)
|
# Log
|
||||||
|
log_level: str = "INFO"
|
||||||
|
debug: bool = True
|
||||||
|
log_file_dir: str = "/var/log/datamate"
|
||||||
|
|
||||||
|
# Database
|
||||||
mysql_host: str = "datamate-database"
|
mysql_host: str = "datamate-database"
|
||||||
mysql_port: int = 3306
|
mysql_port: int = 3306
|
||||||
mysql_user: str = "root"
|
mysql_user: str = "root"
|
||||||
mysql_password: str = "password"
|
mysql_password: str = "password"
|
||||||
mysql_database: str = "datamate"
|
mysql_database: str = "datamate"
|
||||||
|
|
||||||
# 直接数据库URL配置(如果提供,将覆盖上述配置)
|
database_url: str = "" # Will be overridden by build_database_url() if not provided
|
||||||
# 初始值为空字符串,在 model_validator 中会被设置为完整的 URL
|
|
||||||
database_url: str = ""
|
|
||||||
|
|
||||||
@model_validator(mode='after')
|
@model_validator(mode='after')
|
||||||
def build_database_url(self):
|
def build_database_url(self):
|
||||||
@@ -55,22 +48,18 @@ class Settings(BaseSettings):
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
# =========================
|
# Label Studio
|
||||||
# Label Studio 服务配置
|
|
||||||
# =========================
|
|
||||||
label_studio_base_url: str = "http://label-studio:8000"
|
label_studio_base_url: str = "http://label-studio:8000"
|
||||||
label_studio_username: Optional[str] = "admin@demo.com" # Label Studio 用户名(用于登录)
|
label_studio_username: Optional[str] = "admin@demo.com"
|
||||||
label_studio_password: Optional[str] = "demoadmin" # Label Studio 密码(用于登录)
|
label_studio_password: Optional[str] = "demoadmin"
|
||||||
label_studio_user_token: Optional[str] = "abc123abc123" # Legacy Token
|
label_studio_user_token: Optional[str] = "abc123abc123" # Legacy Token
|
||||||
|
|
||||||
label_studio_local_storage_dataset_base_path: str = "/label-studio/local" # Label Studio容器中的本地存储基础路径
|
label_studio_local_document_root: str = "/label-studio/local" # Label Studio local file storage path
|
||||||
label_studio_file_path_prefix: str = "/data/local-files/?d=" # Label Studio本地文件服务路径前缀
|
label_studio_file_path_prefix: str = "/data/local-files/?d=" # Label Studio local file serving URL prefix
|
||||||
|
|
||||||
ls_task_page_size: int = 1000
|
ls_task_page_size: int = 1000
|
||||||
|
|
||||||
# =========================
|
# DataMate
|
||||||
# Data Management 服务配置
|
|
||||||
# =========================
|
|
||||||
dm_file_path_prefix: str = "/dataset" # DM存储文件夹前缀
|
dm_file_path_prefix: str = "/dataset" # DM存储文件夹前缀
|
||||||
|
|
||||||
# 全局设置实例
|
# 全局设置实例
|
||||||
|
|||||||
@@ -64,6 +64,7 @@ class DatasetFiles(Base):
|
|||||||
file_size = Column(BigInteger, default=0, comment="文件大小(字节)")
|
file_size = Column(BigInteger, default=0, comment="文件大小(字节)")
|
||||||
check_sum = Column(String(64), nullable=True, comment="文件校验和")
|
check_sum = Column(String(64), nullable=True, comment="文件校验和")
|
||||||
tags = Column(JSON, nullable=True, comment="文件标签信息")
|
tags = Column(JSON, nullable=True, comment="文件标签信息")
|
||||||
|
tags_updated_at = Column(TIMESTAMP, nullable=True, comment="标签最后更新时间")
|
||||||
dataset_filemetadata = Column("metadata", JSON, nullable=True, comment="文件元数据")
|
dataset_filemetadata = Column("metadata", JSON, nullable=True, comment="文件元数据")
|
||||||
status = Column(String(50), default='ACTIVE', comment="文件状态:ACTIVE/DELETED/PROCESSING")
|
status = Column(String(50), default='ACTIVE', comment="文件状态:ACTIVE/DELETED/PROCESSING")
|
||||||
upload_time = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="上传时间")
|
upload_time = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="上传时间")
|
||||||
|
|||||||
@@ -45,7 +45,7 @@ async def lifespan(app: FastAPI):
|
|||||||
yield
|
yield
|
||||||
|
|
||||||
# @shutdown
|
# @shutdown
|
||||||
logger.info("DataMate Python Backend shutting down ...")
|
logger.info("DataMate Python Backend shutting down ...\n\n")
|
||||||
|
|
||||||
# 创建FastAPI应用
|
# 创建FastAPI应用
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
@@ -69,11 +69,7 @@ app = FastAPI(
|
|||||||
app.include_router(router)
|
app.include_router(router)
|
||||||
|
|
||||||
# 输出注册的路由(每行一个)
|
# 输出注册的路由(每行一个)
|
||||||
logger.debug("Registered routes:")
|
logger.debug(f"Registered routes refer to http://localhost:{settings.port}/redoc")
|
||||||
for route in app.routes:
|
|
||||||
route_path = getattr(route, "path", None)
|
|
||||||
if route_path:
|
|
||||||
logger.debug(f" {route_path}")
|
|
||||||
|
|
||||||
# 注册全局异常处理器
|
# 注册全局异常处理器
|
||||||
app.add_exception_handler(StarletteHTTPException, starlette_http_exception_handler) # type: ignore
|
app.add_exception_handler(StarletteHTTPException, starlette_http_exception_handler) # type: ignore
|
||||||
@@ -102,7 +98,7 @@ async def root():
|
|||||||
data={
|
data={
|
||||||
"message": f"{settings.app_name} is running",
|
"message": f"{settings.app_name} is running",
|
||||||
"version": settings.app_version,
|
"version": settings.app_version,
|
||||||
"docs_url": "/docs",
|
"docs_url": "/redoc",
|
||||||
"label_studio_url": settings.label_studio_base_url
|
"label_studio_url": settings.label_studio_base_url
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -380,6 +380,148 @@ class Client:
|
|||||||
logger.error(f"Error while deleting project {project_id}: {e}")
|
logger.error(f"Error while deleting project {project_id}: {e}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
async def get_task_annotations(
|
||||||
|
self,
|
||||||
|
task_id: int
|
||||||
|
) -> Optional[List[Dict[str, Any]]]:
|
||||||
|
"""获取任务的标注结果
|
||||||
|
|
||||||
|
Args:
|
||||||
|
task_id: 任务ID
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
标注结果列表,每个标注包含完整的annotation信息
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
logger.debug(f"Fetching annotations for task: {task_id}")
|
||||||
|
|
||||||
|
response = await self.client.get(f"/api/tasks/{task_id}/annotations")
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
annotations = response.json()
|
||||||
|
logger.debug(f"Fetched {len(annotations)} annotations for task {task_id}")
|
||||||
|
|
||||||
|
return annotations
|
||||||
|
|
||||||
|
except httpx.HTTPStatusError as e:
|
||||||
|
logger.error(f"Get task annotations failed HTTP {e.response.status_code}: {e.response.text}")
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error while getting task annotations: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def create_annotation(
|
||||||
|
self,
|
||||||
|
task_id: int,
|
||||||
|
result: List[Dict[str, Any]],
|
||||||
|
completed_by: Optional[int] = None
|
||||||
|
) -> Optional[Dict[str, Any]]:
|
||||||
|
"""为任务创建新的标注
|
||||||
|
|
||||||
|
Args:
|
||||||
|
task_id: 任务ID
|
||||||
|
result: 标注结果列表
|
||||||
|
completed_by: 完成标注的用户ID(可选)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
创建的标注信息,失败返回None
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
logger.debug(f"Creating annotation for task: {task_id}")
|
||||||
|
|
||||||
|
annotation_data = {
|
||||||
|
"result": result,
|
||||||
|
"task": task_id
|
||||||
|
}
|
||||||
|
|
||||||
|
if completed_by:
|
||||||
|
annotation_data["completed_by"] = completed_by
|
||||||
|
|
||||||
|
response = await self.client.post(
|
||||||
|
f"/api/tasks/{task_id}/annotations",
|
||||||
|
json=annotation_data
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
annotation = response.json()
|
||||||
|
logger.debug(f"Created annotation {annotation.get('id')} for task {task_id}")
|
||||||
|
|
||||||
|
return annotation
|
||||||
|
|
||||||
|
except httpx.HTTPStatusError as e:
|
||||||
|
logger.error(f"Create annotation failed HTTP {e.response.status_code}: {e.response.text}")
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error while creating annotation: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def update_annotation(
|
||||||
|
self,
|
||||||
|
annotation_id: int,
|
||||||
|
result: List[Dict[str, Any]]
|
||||||
|
) -> Optional[Dict[str, Any]]:
|
||||||
|
"""更新已存在的标注
|
||||||
|
|
||||||
|
Args:
|
||||||
|
annotation_id: 标注ID
|
||||||
|
result: 新的标注结果列表
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
更新后的标注信息,失败返回None
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
logger.debug(f"Updating annotation: {annotation_id}")
|
||||||
|
|
||||||
|
annotation_data = {
|
||||||
|
"result": result
|
||||||
|
}
|
||||||
|
|
||||||
|
response = await self.client.patch(
|
||||||
|
f"/api/annotations/{annotation_id}",
|
||||||
|
json=annotation_data
|
||||||
|
)
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
annotation = response.json()
|
||||||
|
logger.debug(f"Updated annotation {annotation_id}")
|
||||||
|
|
||||||
|
return annotation
|
||||||
|
|
||||||
|
except httpx.HTTPStatusError as e:
|
||||||
|
logger.error(f"Update annotation failed HTTP {e.response.status_code}: {e.response.text}")
|
||||||
|
return None
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error while updating annotation: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def delete_annotation(
|
||||||
|
self,
|
||||||
|
annotation_id: int
|
||||||
|
) -> bool:
|
||||||
|
"""删除标注
|
||||||
|
|
||||||
|
Args:
|
||||||
|
annotation_id: 标注ID
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
成功返回True,失败返回False
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
logger.debug(f"Deleting annotation: {annotation_id}")
|
||||||
|
|
||||||
|
response = await self.client.delete(f"/api/annotations/{annotation_id}")
|
||||||
|
response.raise_for_status()
|
||||||
|
|
||||||
|
logger.debug(f"Deleted annotation {annotation_id}")
|
||||||
|
return True
|
||||||
|
|
||||||
|
except httpx.HTTPStatusError as e:
|
||||||
|
logger.error(f"Delete annotation failed HTTP {e.response.status_code}: {e.response.text}")
|
||||||
|
return False
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error while deleting annotation: {e}")
|
||||||
|
return False
|
||||||
|
|
||||||
async def create_local_storage(
|
async def create_local_storage(
|
||||||
self,
|
self,
|
||||||
project_id: int,
|
project_id: int,
|
||||||
|
|||||||
@@ -80,7 +80,7 @@ async def create_mapping(
|
|||||||
project_id = project_data["id"]
|
project_id = project_data["id"]
|
||||||
|
|
||||||
# 配置本地存储:dataset/<id>
|
# 配置本地存储:dataset/<id>
|
||||||
local_storage_path = f"{settings.label_studio_local_storage_dataset_base_path}/{request.dataset_id}"
|
local_storage_path = f"{settings.label_studio_local_document_root}/{request.dataset_id}"
|
||||||
storage_result = await ls_client.create_local_storage(
|
storage_result = await ls_client.create_local_storage(
|
||||||
project_id=project_id,
|
project_id=project_id,
|
||||||
path=local_storage_path,
|
path=local_storage_path,
|
||||||
|
|||||||
@@ -15,6 +15,8 @@ from ..service.mapping import DatasetMappingService
|
|||||||
from ..schema import (
|
from ..schema import (
|
||||||
SyncDatasetRequest,
|
SyncDatasetRequest,
|
||||||
SyncDatasetResponse,
|
SyncDatasetResponse,
|
||||||
|
SyncAnnotationsRequest,
|
||||||
|
SyncAnnotationsResponse,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -30,10 +32,24 @@ async def sync_dataset_content(
|
|||||||
db: AsyncSession = Depends(get_db)
|
db: AsyncSession = Depends(get_db)
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
同步数据集内容
|
同步数据集内容(包括文件和标注)
|
||||||
|
|
||||||
根据指定的mapping ID,同步DM程序数据集中的内容到Label Studio数据集中,
|
根据指定的mapping ID,同步DM程序数据集中的内容到Label Studio数据集中。
|
||||||
在数据库中记录更新时间,返回更新状态
|
默认同时同步文件和标注数据。
|
||||||
|
|
||||||
|
Args:
|
||||||
|
request: 同步请求,包含:
|
||||||
|
- id: 映射ID(mapping UUID)
|
||||||
|
- batchSize: 批处理大小
|
||||||
|
- filePriority: 文件同步优先级
|
||||||
|
- labelPriority: 标签同步优先级
|
||||||
|
- syncAnnotations: 是否同步标注(默认True)
|
||||||
|
- annotationDirection: 标注同步方向(默认bidirectional)
|
||||||
|
- overwrite: 是否允许覆盖DataMate中的标注(默认True)
|
||||||
|
- overwriteLabelingProject: 是否允许覆盖Label Studio中的标注(默认True)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
同步结果
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
|
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
|
||||||
@@ -42,9 +58,9 @@ async def sync_dataset_content(
|
|||||||
mapping_service = DatasetMappingService(db)
|
mapping_service = DatasetMappingService(db)
|
||||||
sync_service = SyncService(dm_client, ls_client, mapping_service)
|
sync_service = SyncService(dm_client, ls_client, mapping_service)
|
||||||
|
|
||||||
logger.info(f"Sync dataset content request: mapping_id={request.id}")
|
logger.debug(f"Sync dataset content request: mapping_id={request.id}, sync_annotations={request.sync_annotations}")
|
||||||
|
|
||||||
# request.id 合法性校验
|
# request.id validation
|
||||||
mapping = await mapping_service.get_mapping_by_uuid(request.id)
|
mapping = await mapping_service.get_mapping_by_uuid(request.id)
|
||||||
if not mapping:
|
if not mapping:
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
@@ -52,9 +68,34 @@ async def sync_dataset_content(
|
|||||||
detail=f"Mapping not found: {request.id}"
|
detail=f"Mapping not found: {request.id}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# 执行同步(使用映射中的源数据集UUID)
|
# Sync dataset files
|
||||||
result = await sync_service.sync_dataset_files(request.id, request.batch_size)
|
result = await sync_service.sync_dataset_files(request.id, request.batch_size)
|
||||||
|
|
||||||
|
# Sync annotations if requested
|
||||||
|
if request.sync_annotations:
|
||||||
|
logger.info(f"Syncing annotations: direction={request.annotation_direction}")
|
||||||
|
|
||||||
|
# 根据方向执行标注同步
|
||||||
|
if request.annotation_direction == "ls_to_dm":
|
||||||
|
await sync_service.sync_annotations_from_ls_to_dm(
|
||||||
|
mapping,
|
||||||
|
request.batch_size,
|
||||||
|
request.overwrite
|
||||||
|
)
|
||||||
|
elif request.annotation_direction == "dm_to_ls":
|
||||||
|
await sync_service.sync_annotations_from_dm_to_ls(
|
||||||
|
mapping,
|
||||||
|
request.batch_size,
|
||||||
|
request.overwrite_labeling_project
|
||||||
|
)
|
||||||
|
elif request.annotation_direction == "bidirectional":
|
||||||
|
await sync_service.sync_annotations_bidirectional(
|
||||||
|
mapping,
|
||||||
|
request.batch_size,
|
||||||
|
request.overwrite,
|
||||||
|
request.overwrite_labeling_project
|
||||||
|
)
|
||||||
|
|
||||||
logger.info(f"Sync completed: {result.synced_files}/{result.total_files} files")
|
logger.info(f"Sync completed: {result.synced_files}/{result.total_files} files")
|
||||||
|
|
||||||
return StandardResponse(
|
return StandardResponse(
|
||||||
@@ -73,4 +114,148 @@ async def sync_dataset_content(
|
|||||||
raise HTTPException(status_code=404, detail=str(e))
|
raise HTTPException(status_code=404, detail=str(e))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error syncing dataset content: {e}")
|
logger.error(f"Error syncing dataset content: {e}")
|
||||||
raise HTTPException(status_code=500, detail="Internal server error")
|
raise HTTPException(status_code=500, detail="Internal server error")
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/annotation/sync", response_model=StandardResponse[SyncAnnotationsResponse])
|
||||||
|
async def sync_annotations(
|
||||||
|
request: SyncAnnotationsRequest,
|
||||||
|
db: AsyncSession = Depends(get_db)
|
||||||
|
):
|
||||||
|
"""
|
||||||
|
仅同步标注结果(支持双向同步)
|
||||||
|
|
||||||
|
根据指定的mapping ID和同步方向,在DM数据集和Label Studio之间同步标注结果。
|
||||||
|
标注结果存储在数据集文件表的tags字段中,使用简化格式。
|
||||||
|
|
||||||
|
同步策略:
|
||||||
|
- 默认为双向同步,基于时间戳自动解决冲突
|
||||||
|
- overwrite: 控制是否允许用Label Studio的标注覆盖DataMate(基于时间戳比较)
|
||||||
|
- overwriteLabelingProject: 控制是否允许用DataMate的标注覆盖Label Studio(基于时间戳比较)
|
||||||
|
- 如果Label Studio标注的updated_at更新,且overwrite=True,则覆盖DataMate
|
||||||
|
- 如果DataMate标注的updated_at更新,且overwriteLabelingProject=True,则覆盖Label Studio
|
||||||
|
|
||||||
|
Args:
|
||||||
|
request: 同步请求,包含:
|
||||||
|
- id: 映射ID(mapping UUID)
|
||||||
|
- batchSize: 批处理大小
|
||||||
|
- direction: 同步方向 (ls_to_dm/dm_to_ls/bidirectional)
|
||||||
|
- overwrite: 是否允许覆盖DataMate中的标注(默认True)
|
||||||
|
- overwriteLabelingProject: 是否允许覆盖Label Studio中的标注(默认True)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
同步结果,包含同步统计信息和冲突解决情况
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
|
||||||
|
token=settings.label_studio_user_token)
|
||||||
|
dm_client = DatasetManagementService(db)
|
||||||
|
mapping_service = DatasetMappingService(db)
|
||||||
|
sync_service = SyncService(dm_client, ls_client, mapping_service)
|
||||||
|
|
||||||
|
logger.info(f"Sync annotations request: mapping_id={request.id}, direction={request.direction}, overwrite={request.overwrite}, overwrite_ls={request.overwrite_labeling_project}")
|
||||||
|
|
||||||
|
# 验证映射是否存在
|
||||||
|
mapping = await mapping_service.get_mapping_by_uuid(request.id)
|
||||||
|
if not mapping:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404,
|
||||||
|
detail=f"Mapping not found: {request.id}"
|
||||||
|
)
|
||||||
|
|
||||||
|
# 根据方向执行同步
|
||||||
|
if request.direction == "ls_to_dm":
|
||||||
|
result = await sync_service.sync_annotations_from_ls_to_dm(
|
||||||
|
mapping,
|
||||||
|
request.batch_size,
|
||||||
|
request.overwrite
|
||||||
|
)
|
||||||
|
elif request.direction == "dm_to_ls":
|
||||||
|
result = await sync_service.sync_annotations_from_dm_to_ls(
|
||||||
|
mapping,
|
||||||
|
request.batch_size,
|
||||||
|
request.overwrite_labeling_project
|
||||||
|
)
|
||||||
|
elif request.direction == "bidirectional":
|
||||||
|
result = await sync_service.sync_annotations_bidirectional(
|
||||||
|
mapping,
|
||||||
|
request.batch_size,
|
||||||
|
request.overwrite,
|
||||||
|
request.overwrite_labeling_project
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Invalid direction: {request.direction}"
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.info(f"Annotation sync completed: synced_to_dm={result.synced_to_dm}, synced_to_ls={result.synced_to_ls}, conflicts_resolved={result.conflicts_resolved}")
|
||||||
|
|
||||||
|
return StandardResponse(
|
||||||
|
code=200,
|
||||||
|
message="success",
|
||||||
|
data=result
|
||||||
|
)
|
||||||
|
|
||||||
|
except HTTPException:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error syncing annotations: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/check-ls-connection")
|
||||||
|
async def check_label_studio_connection():
|
||||||
|
"""
|
||||||
|
检查Label Studio连接状态
|
||||||
|
|
||||||
|
用于诊断Label Studio连接问题,返回连接状态和配置信息
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
ls_client = LabelStudioClient(
|
||||||
|
base_url=settings.label_studio_base_url,
|
||||||
|
token=settings.label_studio_user_token
|
||||||
|
)
|
||||||
|
|
||||||
|
# 尝试获取项目列表来测试连接
|
||||||
|
try:
|
||||||
|
response = await ls_client.client.get("/api/projects")
|
||||||
|
response.raise_for_status()
|
||||||
|
projects = response.json()
|
||||||
|
|
||||||
|
token_display = settings.label_studio_user_token[:10] + "..." if settings.label_studio_user_token else "None"
|
||||||
|
|
||||||
|
return StandardResponse(
|
||||||
|
code=200,
|
||||||
|
message="success",
|
||||||
|
data={
|
||||||
|
"status": "connected",
|
||||||
|
"base_url": settings.label_studio_base_url,
|
||||||
|
"token": token_display,
|
||||||
|
"projects_count": len(projects.get("results", [])) if isinstance(projects, dict) else len(projects),
|
||||||
|
"message": "Successfully connected to Label Studio"
|
||||||
|
}
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
token_display = settings.label_studio_user_token[:10] + "..." if settings.label_studio_user_token else "None"
|
||||||
|
|
||||||
|
return StandardResponse(
|
||||||
|
code=500,
|
||||||
|
message="error",
|
||||||
|
data={
|
||||||
|
"status": "disconnected",
|
||||||
|
"base_url": settings.label_studio_base_url,
|
||||||
|
"token": token_display,
|
||||||
|
"error": str(e),
|
||||||
|
"message": f"Failed to connect to Label Studio: {str(e)}",
|
||||||
|
"troubleshooting": [
|
||||||
|
"1. Check if Label Studio is running: docker ps | grep label-studio",
|
||||||
|
"2. Verify LABEL_STUDIO_BASE_URL in .env file",
|
||||||
|
"3. Verify LABEL_STUDIO_USER_TOKEN is valid",
|
||||||
|
"4. Check network connectivity between services"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error checking Label Studio connection: {e}")
|
||||||
|
raise HTTPException(status_code=500, detail=str(e))
|
||||||
@@ -11,6 +11,8 @@ from .mapping import (
|
|||||||
from .sync import (
|
from .sync import (
|
||||||
SyncDatasetRequest,
|
SyncDatasetRequest,
|
||||||
SyncDatasetResponse,
|
SyncDatasetResponse,
|
||||||
|
SyncAnnotationsRequest,
|
||||||
|
SyncAnnotationsResponse,
|
||||||
)
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@@ -21,5 +23,7 @@ __all__ = [
|
|||||||
"DatasetMappingResponse",
|
"DatasetMappingResponse",
|
||||||
"SyncDatasetRequest",
|
"SyncDatasetRequest",
|
||||||
"SyncDatasetResponse",
|
"SyncDatasetResponse",
|
||||||
|
"SyncAnnotationsRequest",
|
||||||
|
"SyncAnnotationsResponse",
|
||||||
"DeleteDatasetResponse",
|
"DeleteDatasetResponse",
|
||||||
]
|
]
|
||||||
@@ -1,4 +1,7 @@
|
|||||||
from pydantic import Field
|
from typing import Literal, List, Dict, Any, Optional
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from pydantic import Field, ConfigDict
|
||||||
|
|
||||||
from app.module.shared.schema import BaseResponseModel
|
from app.module.shared.schema import BaseResponseModel
|
||||||
from app.module.shared.schema import StandardResponse
|
from app.module.shared.schema import StandardResponse
|
||||||
@@ -6,8 +9,27 @@ from app.module.shared.schema import StandardResponse
|
|||||||
|
|
||||||
class SyncDatasetRequest(BaseResponseModel):
|
class SyncDatasetRequest(BaseResponseModel):
|
||||||
"""同步数据集请求模型"""
|
"""同步数据集请求模型"""
|
||||||
|
model_config = ConfigDict(populate_by_name=True)
|
||||||
|
|
||||||
id: str = Field(..., description="映射ID(mapping UUID)")
|
id: str = Field(..., description="映射ID(mapping UUID)")
|
||||||
batch_size: int = Field(50, ge=1, le=100, description="批处理大小")
|
batch_size: int = Field(50, ge=1, le=100, description="批处理大小", alias="batchSize")
|
||||||
|
file_priority: Literal[0, 1] = Field(0, description="0 数据集为主,1 标注平台为主", alias="filePriority")
|
||||||
|
label_priority: Literal[0, 1] = Field(0, description="0 数据集为主,1 标注平台为主", alias="labelPriority")
|
||||||
|
sync_annotations: bool = Field(True, description="是否同步标注数据", alias="syncAnnotations")
|
||||||
|
annotation_direction: Literal["ls_to_dm", "dm_to_ls", "bidirectional"] = Field(
|
||||||
|
"bidirectional",
|
||||||
|
description="标注同步方向: ls_to_dm(Label Studio到数据集), dm_to_ls(数据集到Label Studio), bidirectional(双向)",
|
||||||
|
alias="annotationDirection"
|
||||||
|
)
|
||||||
|
overwrite: bool = Field(
|
||||||
|
True,
|
||||||
|
description="是否覆盖DataMate中的标注(基于时间戳比较)"
|
||||||
|
)
|
||||||
|
overwrite_labeling_project: bool = Field(
|
||||||
|
True,
|
||||||
|
description="是否覆盖Label Studio中的标注(基于时间戳比较)",
|
||||||
|
alias="overwriteLabelingProject"
|
||||||
|
)
|
||||||
|
|
||||||
class SyncDatasetResponse(BaseResponseModel):
|
class SyncDatasetResponse(BaseResponseModel):
|
||||||
"""同步数据集响应模型"""
|
"""同步数据集响应模型"""
|
||||||
@@ -18,4 +40,53 @@ class SyncDatasetResponse(BaseResponseModel):
|
|||||||
message: str = Field(..., description="响应消息")
|
message: str = Field(..., description="响应消息")
|
||||||
|
|
||||||
class SyncDatasetResponseStd(StandardResponse[SyncDatasetResponse]):
|
class SyncDatasetResponseStd(StandardResponse[SyncDatasetResponse]):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class SyncAnnotationsRequest(BaseResponseModel):
|
||||||
|
"""同步标注请求模型
|
||||||
|
|
||||||
|
使用camelCase作为API接口字段名(通过alias),但Python代码内部使用snake_case。
|
||||||
|
Pydantic会自动处理两种格式的转换。
|
||||||
|
"""
|
||||||
|
model_config = ConfigDict(populate_by_name=True)
|
||||||
|
|
||||||
|
id: str = Field(..., description="映射ID(mapping UUID)")
|
||||||
|
batch_size: int = Field(50, ge=1, le=100, description="批处理大小", alias="batchSize")
|
||||||
|
direction: Literal["ls_to_dm", "dm_to_ls", "bidirectional"] = Field(
|
||||||
|
"bidirectional",
|
||||||
|
description="同步方向: ls_to_dm(Label Studio到数据集), dm_to_ls(数据集到Label Studio), bidirectional(双向)"
|
||||||
|
)
|
||||||
|
overwrite: bool = Field(
|
||||||
|
True,
|
||||||
|
description="是否覆盖DataMate中的标注(基于时间戳比较)。True时,如果Label Studio的标注更新时间更新,则覆盖DataMate的标注"
|
||||||
|
)
|
||||||
|
overwrite_labeling_project: bool = Field(
|
||||||
|
True,
|
||||||
|
description="是否覆盖Label Studio中的标注(基于时间戳比较)。True时,如果DataMate的标注更新时间更新,则覆盖Label Studio的标注",
|
||||||
|
alias="overwriteLabelingProject"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TagInfo(BaseResponseModel):
|
||||||
|
"""标注信息结构(不包含时间戳,时间戳存储在文件级别的tags_updated_at字段)"""
|
||||||
|
from_name: str = Field(..., description="标注工具名称")
|
||||||
|
to_name: str = Field(..., description="目标对象名称")
|
||||||
|
type: str = Field(..., description="标注类型")
|
||||||
|
values: Dict[str, Any] = Field(..., description="标注值")
|
||||||
|
|
||||||
|
|
||||||
|
class SyncAnnotationsResponse(BaseResponseModel):
|
||||||
|
"""同步标注响应模型"""
|
||||||
|
id: str = Field(..., description="映射UUID")
|
||||||
|
status: str = Field(..., description="同步状态: success/partial/error")
|
||||||
|
synced_to_dm: int = Field(0, description="同步到数据集的标注数量")
|
||||||
|
synced_to_ls: int = Field(0, description="同步到Label Studio的标注数量")
|
||||||
|
skipped: int = Field(0, description="跳过的标注数量")
|
||||||
|
failed: int = Field(0, description="失败的标注数量")
|
||||||
|
conflicts_resolved: int = Field(0, description="解决的冲突数量")
|
||||||
|
message: str = Field(..., description="响应消息")
|
||||||
|
|
||||||
|
|
||||||
|
class SyncAnnotationsResponseStd(StandardResponse[SyncAnnotationsResponse]):
|
||||||
pass
|
pass
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -15,6 +15,8 @@ class DatasetFileResponse(BaseModel):
|
|||||||
description: Optional[str] = Field(None, description="文件描述")
|
description: Optional[str] = Field(None, description="文件描述")
|
||||||
uploadedBy: Optional[str] = Field(None, description="上传者")
|
uploadedBy: Optional[str] = Field(None, description="上传者")
|
||||||
lastAccessTime: Optional[datetime] = Field(None, description="最后访问时间")
|
lastAccessTime: Optional[datetime] = Field(None, description="最后访问时间")
|
||||||
|
tags: Optional[List[Dict[str, Any]]] = Field(None, description="文件标签/标注信息")
|
||||||
|
tags_updated_at: Optional[datetime] = Field(None, description="标签最后更新时间", alias="tagsUpdatedAt")
|
||||||
|
|
||||||
class PagedDatasetFileResponse(BaseModel):
|
class PagedDatasetFileResponse(BaseModel):
|
||||||
"""DM服务分页文件响应模型"""
|
"""DM服务分页文件响应模型"""
|
||||||
|
|||||||
@@ -108,7 +108,9 @@ class Service:
|
|||||||
uploadedAt=f.upload_time, # type: ignore
|
uploadedAt=f.upload_time, # type: ignore
|
||||||
description=None,
|
description=None,
|
||||||
uploadedBy=None,
|
uploadedBy=None,
|
||||||
lastAccessTime=f.last_access_time # type: ignore
|
lastAccessTime=f.last_access_time, # type: ignore
|
||||||
|
tags=f.tags, # type: ignore
|
||||||
|
tags_updated_at=f.tags_updated_at # type: ignore
|
||||||
)
|
)
|
||||||
for f in files
|
for f in files
|
||||||
]
|
]
|
||||||
|
|||||||
Binary file not shown.
@@ -1,3 +1,6 @@
|
|||||||
|
export LOG_LEVEL=DEBUG
|
||||||
|
export DEBUG=true
|
||||||
|
|
||||||
uvicorn app.main:app \
|
uvicorn app.main:app \
|
||||||
--host 0.0.0.0 \
|
--host 0.0.0.0 \
|
||||||
--port 18000 \
|
--port 18000 \
|
||||||
|
|||||||
@@ -55,6 +55,7 @@ CREATE TABLE IF NOT EXISTS t_dm_dataset_files (
|
|||||||
file_size BIGINT DEFAULT 0 COMMENT '文件大小(字节)',
|
file_size BIGINT DEFAULT 0 COMMENT '文件大小(字节)',
|
||||||
check_sum VARCHAR(64) COMMENT '文件校验和',
|
check_sum VARCHAR(64) COMMENT '文件校验和',
|
||||||
tags JSON COMMENT '文件标签信息',
|
tags JSON COMMENT '文件标签信息',
|
||||||
|
tags_updated_at TIMESTAMP NULL COMMENT '标签最后更新时间',
|
||||||
metadata JSON COMMENT '文件元数据',
|
metadata JSON COMMENT '文件元数据',
|
||||||
status VARCHAR(50) DEFAULT 'ACTIVE' COMMENT '文件状态:ACTIVE/DELETED/PROCESSING',
|
status VARCHAR(50) DEFAULT 'ACTIVE' COMMENT '文件状态:ACTIVE/DELETED/PROCESSING',
|
||||||
upload_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '上传时间',
|
upload_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '上传时间',
|
||||||
|
|||||||
Reference in New Issue
Block a user