feat: File and Annotation 2-way sync implementation (#63)

* feat: Refactor configuration and sync logic for improved dataset handling and logging

* feat: Enhance annotation synchronization and dataset file management

- Added a new field `tags_updated_at` to the `DatasetFiles` model for tracking the last update time of tags.
- Implemented new asynchronous methods in the Label Studio client for fetching, creating, updating, and deleting task annotations.
- Introduced bidirectional synchronization for annotations between DataMate and Label Studio, allowing for flexible data management.
- Updated sync service to handle annotation conflicts based on timestamps, ensuring data integrity during synchronization.
- Enhanced dataset file response model to include tags and their update timestamps.
- Modified database initialization script to create a new column for `tags_updated_at` in the dataset files table.
- Updated requirements to ensure compatibility with the latest dependencies.
This commit is contained in:
Jason Wang
2025-11-07 15:03:07 +08:00
committed by GitHub
parent d136bad38c
commit 78f50ea520
16 changed files with 1336 additions and 290 deletions

View File

@@ -1,94 +1,19 @@
# ==================================== # Dev settings
# Label Studio Adapter Configuration
# ====================================
# =========================
# 应用程序配置
# =========================
APP_NAME="Label Studio Adapter"
APP_VERSION="1.0.0"
APP_DESCRIPTION="Adapter for integrating Data Management System with Label Studio"
DEBUG=true
# =========================
# 服务器配置
# =========================
HOST=0.0.0.0 HOST=0.0.0.0
PORT=18000 PORT=18000
# ========================= DEBUG=true
# 日志配置 LOG_LEVEL=DEBUG
# ========================= LOG_FILE_DIR=./logs
LOG_LEVEL=INFO
# ========================= # DataBase
# Label Studio 服务配置 MYSQL_HOST=localhost
# =========================
# Label Studio 服务地址(根据部署方式调整)
# Docker 环境:http://label-studio:8080
# 本地开发:http://127.0.0.1:8000
LABEL_STUDIO_BASE_URL=http://label-studio:8080
# Label Studio 用户名和密码(用于自动创建用户)
LABEL_STUDIO_USERNAME=admin@example.com
LABEL_STUDIO_PASSWORD=password
# Label Studio API 认证 Token(Legacy Token,推荐使用)
# 从 Label Studio UI 的 Account & Settings > Access Token 获取
LABEL_STUDIO_USER_TOKEN=your-label-studio-token-here
# Label Studio 本地文件存储基础路径(容器内路径,用于 Docker 部署时的权限检查)
LABEL_STUDIO_LOCAL_BASE=/label-studio/local_files
# Label Studio 本地文件服务路径前缀(任务数据中的文件路径前缀)
LABEL_STUDIO_FILE_PATH_PREFIX=/data/local-files/?d=
# Label Studio 容器中的本地存储路径(用于配置 Local Storage)
LABEL_STUDIO_LOCAL_STORAGE_DATASET_BASE_PATH=/label-studio/local_files/dataset
LABEL_STUDIO_LOCAL_STORAGE_UPLOAD_BASE_PATH=/label-studio/local_files/upload
# Label Studio 任务列表分页大小
LS_TASK_PAGE_SIZE=1000
# =========================
# Data Management 服务配置
# =========================
# DM 存储文件夹前缀(通常与 Label Studio 的 local-files 文件夹映射一致)
DM_FILE_PATH_PREFIX=/
# =========================
# Adapter 数据库配置 (MySQL)
# =========================
# 优先级1:如果配置了 MySQL,将优先使用 MySQL 数据库
MYSQL_HOST=adapter-db
MYSQL_PORT=3306 MYSQL_PORT=3306
MYSQL_USER=label_studio_user MYSQL_USER=root
MYSQL_PASSWORD=user_password MYSQL_PASSWORD=password
MYSQL_DATABASE=label_studio_adapter MYSQL_DATABASE=datamate
# ========================= # Label Studio settings
# CORS 配置 LABEL_STUDIO_BASE_URL=http://localhost:8080
# =========================
# 允许的来源(生产环境建议配置具体域名)
ALLOWED_ORIGINS=["*"]
# 允许的 HTTP 方法 LABEL_STUDIO_USER_TOKEN="demo_dev_token"
ALLOWED_METHODS=["*"]
# 允许的请求头
ALLOWED_HEADERS=["*"]
# =========================
# Docker Compose 配置
# =========================
# Docker Compose 项目名称前缀
COMPOSE_PROJECT_NAME=ls-adapter
# =========================
# 同步配置(未来扩展)
# =========================
# 批量同步任务的批次大小
SYNC_BATCH_SIZE=100
# 同步失败时的最大重试次数
MAX_RETRIES=3

View File

@@ -4,3 +4,5 @@
# logs # logs
logs/ logs/
doc/

View File

@@ -1,8 +1,6 @@
from pydantic_settings import BaseSettings from pydantic_settings import BaseSettings
from pydantic import model_validator from pydantic import model_validator
from typing import Optional, List from typing import Optional
import os
from pathlib import Path
class Settings(BaseSettings): class Settings(BaseSettings):
"""应用程序配置""" """应用程序配置"""
@@ -10,39 +8,34 @@ class Settings(BaseSettings):
class Config: class Config:
env_file = ".env" env_file = ".env"
case_sensitive = False case_sensitive = False
extra = 'ignore' # 允许额外字段(如 Shell 脚本专用的环境变量) extra = 'ignore'
# ========================= # Service
# Adapter 服务配置 app_name: str = "DataMate Python Backend"
# =========================
app_name: str = "Label Studio Adapter"
app_version: str = "1.0.0" app_version: str = "1.0.0"
app_description: str = "Adapter for integrating Data Management System with Label Studio" app_description: str = "Adapter for integrating Data Management System with Label Studio"
# 日志配置
log_level: str = "INFO"
debug: bool = True
log_file_dir: str = "/var/log/datamate"
# 服务器配置
host: str = "0.0.0.0" host: str = "0.0.0.0"
port: int = 8000 port: int = 18000
# CORS配置 # CORS
# allowed_origins: List[str] = ["*"] # allowed_origins: List[str] = ["*"]
# allowed_methods: List[str] = ["*"] # allowed_methods: List[str] = ["*"]
# allowed_headers: List[str] = ["*"] # allowed_headers: List[str] = ["*"]
# MySQL数据库配置 (优先级1) # Log
log_level: str = "INFO"
debug: bool = True
log_file_dir: str = "/var/log/datamate"
# Database
mysql_host: str = "datamate-database" mysql_host: str = "datamate-database"
mysql_port: int = 3306 mysql_port: int = 3306
mysql_user: str = "root" mysql_user: str = "root"
mysql_password: str = "password" mysql_password: str = "password"
mysql_database: str = "datamate" mysql_database: str = "datamate"
# 直接数据库URL配置(如果提供,将覆盖上述配置) database_url: str = "" # Will be overridden by build_database_url() if not provided
# 初始值为空字符串,在 model_validator 中会被设置为完整的 URL
database_url: str = ""
@model_validator(mode='after') @model_validator(mode='after')
def build_database_url(self): def build_database_url(self):
@@ -55,22 +48,18 @@ class Settings(BaseSettings):
return self return self
# ========================= # Label Studio
# Label Studio 服务配置
# =========================
label_studio_base_url: str = "http://label-studio:8000" label_studio_base_url: str = "http://label-studio:8000"
label_studio_username: Optional[str] = "admin@demo.com" # Label Studio 用户名(用于登录) label_studio_username: Optional[str] = "admin@demo.com"
label_studio_password: Optional[str] = "demoadmin" # Label Studio 密码(用于登录) label_studio_password: Optional[str] = "demoadmin"
label_studio_user_token: Optional[str] = "abc123abc123" # Legacy Token label_studio_user_token: Optional[str] = "abc123abc123" # Legacy Token
label_studio_local_storage_dataset_base_path: str = "/label-studio/local" # Label Studio容器中的本地存储基础路径 label_studio_local_document_root: str = "/label-studio/local" # Label Studio local file storage path
label_studio_file_path_prefix: str = "/data/local-files/?d=" # Label Studio本地文件服务路径前缀 label_studio_file_path_prefix: str = "/data/local-files/?d=" # Label Studio local file serving URL prefix
ls_task_page_size: int = 1000 ls_task_page_size: int = 1000
# ========================= # DataMate
# Data Management 服务配置
# =========================
dm_file_path_prefix: str = "/dataset" # DM存储文件夹前缀 dm_file_path_prefix: str = "/dataset" # DM存储文件夹前缀
# 全局设置实例 # 全局设置实例

View File

@@ -64,6 +64,7 @@ class DatasetFiles(Base):
file_size = Column(BigInteger, default=0, comment="文件大小(字节)") file_size = Column(BigInteger, default=0, comment="文件大小(字节)")
check_sum = Column(String(64), nullable=True, comment="文件校验和") check_sum = Column(String(64), nullable=True, comment="文件校验和")
tags = Column(JSON, nullable=True, comment="文件标签信息") tags = Column(JSON, nullable=True, comment="文件标签信息")
tags_updated_at = Column(TIMESTAMP, nullable=True, comment="标签最后更新时间")
dataset_filemetadata = Column("metadata", JSON, nullable=True, comment="文件元数据") dataset_filemetadata = Column("metadata", JSON, nullable=True, comment="文件元数据")
status = Column(String(50), default='ACTIVE', comment="文件状态:ACTIVE/DELETED/PROCESSING") status = Column(String(50), default='ACTIVE', comment="文件状态:ACTIVE/DELETED/PROCESSING")
upload_time = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="上传时间") upload_time = Column(TIMESTAMP, server_default=func.current_timestamp(), comment="上传时间")

View File

@@ -45,7 +45,7 @@ async def lifespan(app: FastAPI):
yield yield
# @shutdown # @shutdown
logger.info("DataMate Python Backend shutting down ...") logger.info("DataMate Python Backend shutting down ...\n\n")
# 创建FastAPI应用 # 创建FastAPI应用
app = FastAPI( app = FastAPI(
@@ -69,11 +69,7 @@ app = FastAPI(
app.include_router(router) app.include_router(router)
# 输出注册的路由(每行一个) # 输出注册的路由(每行一个)
logger.debug("Registered routes:") logger.debug(f"Registered routes refer to http://localhost:{settings.port}/redoc")
for route in app.routes:
route_path = getattr(route, "path", None)
if route_path:
logger.debug(f" {route_path}")
# 注册全局异常处理器 # 注册全局异常处理器
app.add_exception_handler(StarletteHTTPException, starlette_http_exception_handler) # type: ignore app.add_exception_handler(StarletteHTTPException, starlette_http_exception_handler) # type: ignore
@@ -102,7 +98,7 @@ async def root():
data={ data={
"message": f"{settings.app_name} is running", "message": f"{settings.app_name} is running",
"version": settings.app_version, "version": settings.app_version,
"docs_url": "/docs", "docs_url": "/redoc",
"label_studio_url": settings.label_studio_base_url "label_studio_url": settings.label_studio_base_url
} }
) )

View File

@@ -380,6 +380,148 @@ class Client:
logger.error(f"Error while deleting project {project_id}: {e}") logger.error(f"Error while deleting project {project_id}: {e}")
return False return False
async def get_task_annotations(
    self,
    task_id: int
) -> Optional[List[Dict[str, Any]]]:
    """Fetch every annotation recorded against a single task.

    Args:
        task_id: Label Studio task ID.

    Returns:
        A list of full annotation dicts on success, otherwise None.
    """
    try:
        logger.debug(f"Fetching annotations for task: {task_id}")
        resp = await self.client.get(f"/api/tasks/{task_id}/annotations")
        resp.raise_for_status()
        items = resp.json()
        logger.debug(f"Fetched {len(items)} annotations for task {task_id}")
        return items
    except httpx.HTTPStatusError as e:
        # Non-2xx reply: log status and body for debugging, signal failure.
        logger.error(f"Get task annotations failed HTTP {e.response.status_code}: {e.response.text}")
        return None
    except Exception as e:
        # Network / decode / any other failure is also reported as None.
        logger.error(f"Error while getting task annotations: {e}")
        return None
async def create_annotation(
    self,
    task_id: int,
    result: List[Dict[str, Any]],
    completed_by: Optional[int] = None
) -> Optional[Dict[str, Any]]:
    """Create a new annotation on a task.

    Args:
        task_id: Label Studio task ID.
        result: Annotation result payload (Label Studio "result" list).
        completed_by: Optional user ID to attribute the annotation to.

    Returns:
        The created annotation dict, or None on failure.
    """
    try:
        logger.debug(f"Creating annotation for task: {task_id}")
        annotation_data: Dict[str, Any] = {
            "result": result,
            "task": task_id
        }
        # BUGFIX: explicit None check — the previous truthiness test
        # (`if completed_by:`) silently dropped a valid user ID of 0.
        if completed_by is not None:
            annotation_data["completed_by"] = completed_by
        response = await self.client.post(
            f"/api/tasks/{task_id}/annotations",
            json=annotation_data
        )
        response.raise_for_status()
        annotation = response.json()
        logger.debug(f"Created annotation {annotation.get('id')} for task {task_id}")
        return annotation
    except httpx.HTTPStatusError as e:
        logger.error(f"Create annotation failed HTTP {e.response.status_code}: {e.response.text}")
        return None
    except Exception as e:
        logger.error(f"Error while creating annotation: {e}")
        return None
async def update_annotation(
    self,
    annotation_id: int,
    result: List[Dict[str, Any]]
) -> Optional[Dict[str, Any]]:
    """Replace the result payload of an existing annotation.

    Args:
        annotation_id: ID of the annotation to modify.
        result: New annotation result list.

    Returns:
        The updated annotation dict, or None on failure.
    """
    try:
        logger.debug(f"Updating annotation: {annotation_id}")
        payload = {"result": result}
        resp = await self.client.patch(
            f"/api/annotations/{annotation_id}",
            json=payload
        )
        resp.raise_for_status()
        updated = resp.json()
        logger.debug(f"Updated annotation {annotation_id}")
        return updated
    except httpx.HTTPStatusError as e:
        # Non-2xx reply: log and signal failure to the caller.
        logger.error(f"Update annotation failed HTTP {e.response.status_code}: {e.response.text}")
        return None
    except Exception as e:
        logger.error(f"Error while updating annotation: {e}")
        return None
async def delete_annotation(
    self,
    annotation_id: int
) -> bool:
    """Delete an annotation by ID.

    Args:
        annotation_id: ID of the annotation to remove.

    Returns:
        True when the deletion succeeded, False otherwise.
    """
    try:
        logger.debug(f"Deleting annotation: {annotation_id}")
        resp = await self.client.delete(f"/api/annotations/{annotation_id}")
        resp.raise_for_status()
        logger.debug(f"Deleted annotation {annotation_id}")
        return True
    except httpx.HTTPStatusError as e:
        # Non-2xx reply from the server.
        logger.error(f"Delete annotation failed HTTP {e.response.status_code}: {e.response.text}")
        return False
    except Exception as e:
        # Transport-level or unexpected failure.
        logger.error(f"Error while deleting annotation: {e}")
        return False
async def create_local_storage( async def create_local_storage(
self, self,
project_id: int, project_id: int,

View File

@@ -80,7 +80,7 @@ async def create_mapping(
project_id = project_data["id"] project_id = project_data["id"]
# 配置本地存储:dataset/<id> # 配置本地存储:dataset/<id>
local_storage_path = f"{settings.label_studio_local_storage_dataset_base_path}/{request.dataset_id}" local_storage_path = f"{settings.label_studio_local_document_root}/{request.dataset_id}"
storage_result = await ls_client.create_local_storage( storage_result = await ls_client.create_local_storage(
project_id=project_id, project_id=project_id,
path=local_storage_path, path=local_storage_path,

View File

@@ -15,6 +15,8 @@ from ..service.mapping import DatasetMappingService
from ..schema import ( from ..schema import (
SyncDatasetRequest, SyncDatasetRequest,
SyncDatasetResponse, SyncDatasetResponse,
SyncAnnotationsRequest,
SyncAnnotationsResponse,
) )
@@ -30,10 +32,24 @@ async def sync_dataset_content(
db: AsyncSession = Depends(get_db) db: AsyncSession = Depends(get_db)
): ):
""" """
同步数据集内容 同步数据集内容(包括文件和标注)
根据指定的mapping ID,同步DM程序数据集中的内容到Label Studio数据集中 根据指定的mapping ID,同步DM程序数据集中的内容到Label Studio数据集中
在数据库中记录更新时间,返回更新状态 默认同时同步文件和标注数据。
Args:
request: 同步请求,包含:
- id: 映射ID(mapping UUID)
- batchSize: 批处理大小
- filePriority: 文件同步优先级
- labelPriority: 标签同步优先级
- syncAnnotations: 是否同步标注(默认True)
- annotationDirection: 标注同步方向(默认bidirectional)
- overwrite: 是否允许覆盖DataMate中的标注(默认True)
- overwriteLabelingProject: 是否允许覆盖Label Studio中的标注(默认True)
Returns:
同步结果
""" """
try: try:
ls_client = LabelStudioClient(base_url=settings.label_studio_base_url, ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
@@ -42,9 +58,9 @@ async def sync_dataset_content(
mapping_service = DatasetMappingService(db) mapping_service = DatasetMappingService(db)
sync_service = SyncService(dm_client, ls_client, mapping_service) sync_service = SyncService(dm_client, ls_client, mapping_service)
logger.info(f"Sync dataset content request: mapping_id={request.id}") logger.debug(f"Sync dataset content request: mapping_id={request.id}, sync_annotations={request.sync_annotations}")
# request.id 合法性校验 # request.id validation
mapping = await mapping_service.get_mapping_by_uuid(request.id) mapping = await mapping_service.get_mapping_by_uuid(request.id)
if not mapping: if not mapping:
raise HTTPException( raise HTTPException(
@@ -52,9 +68,34 @@ async def sync_dataset_content(
detail=f"Mapping not found: {request.id}" detail=f"Mapping not found: {request.id}"
) )
# 执行同步(使用映射中的源数据集UUID) # Sync dataset files
result = await sync_service.sync_dataset_files(request.id, request.batch_size) result = await sync_service.sync_dataset_files(request.id, request.batch_size)
# Sync annotations if requested
if request.sync_annotations:
logger.info(f"Syncing annotations: direction={request.annotation_direction}")
# 根据方向执行标注同步
if request.annotation_direction == "ls_to_dm":
await sync_service.sync_annotations_from_ls_to_dm(
mapping,
request.batch_size,
request.overwrite
)
elif request.annotation_direction == "dm_to_ls":
await sync_service.sync_annotations_from_dm_to_ls(
mapping,
request.batch_size,
request.overwrite_labeling_project
)
elif request.annotation_direction == "bidirectional":
await sync_service.sync_annotations_bidirectional(
mapping,
request.batch_size,
request.overwrite,
request.overwrite_labeling_project
)
logger.info(f"Sync completed: {result.synced_files}/{result.total_files} files") logger.info(f"Sync completed: {result.synced_files}/{result.total_files} files")
return StandardResponse( return StandardResponse(
@@ -74,3 +115,147 @@ async def sync_dataset_content(
except Exception as e: except Exception as e:
logger.error(f"Error syncing dataset content: {e}") logger.error(f"Error syncing dataset content: {e}")
raise HTTPException(status_code=500, detail="Internal server error") raise HTTPException(status_code=500, detail="Internal server error")
@router.post("/annotation/sync", response_model=StandardResponse[SyncAnnotationsResponse])
async def sync_annotations(
    request: SyncAnnotationsRequest,
    db: AsyncSession = Depends(get_db)
):
    """
    Sync annotation results only (supports bidirectional sync).

    Syncs annotations between the DM dataset and Label Studio according to the
    mapping ID and sync direction. Annotations are stored in the dataset file
    table's ``tags`` column in a simplified format.

    Strategy:
    - Defaults to bidirectional sync with timestamp-based conflict resolution.
    - overwrite: allow Label Studio annotations to overwrite DataMate
      (timestamp comparison).
    - overwriteLabelingProject: allow DataMate annotations to overwrite
      Label Studio (timestamp comparison).
    - If the Label Studio annotation's updated_at is newer and overwrite=True,
      DataMate is overwritten.
    - If the DataMate annotation's updated_at is newer and
      overwriteLabelingProject=True, Label Studio is overwritten.

    Args:
        request: Sync request containing:
            - id: mapping UUID
            - batchSize: batch size
            - direction: sync direction (ls_to_dm/dm_to_ls/bidirectional)
            - overwrite: allow overwriting DataMate annotations (default True)
            - overwriteLabelingProject: allow overwriting Label Studio
              annotations (default True)

    Returns:
        Sync result with statistics and conflict-resolution counts.
    """
    try:
        ls_client = LabelStudioClient(base_url=settings.label_studio_base_url,
                                      token=settings.label_studio_user_token)
        dm_client = DatasetManagementService(db)
        mapping_service = DatasetMappingService(db)
        sync_service = SyncService(dm_client, ls_client, mapping_service)

        logger.info(f"Sync annotations request: mapping_id={request.id}, direction={request.direction}, overwrite={request.overwrite}, overwrite_ls={request.overwrite_labeling_project}")

        # Validate the mapping exists before doing any work.
        mapping = await mapping_service.get_mapping_by_uuid(request.id)
        if not mapping:
            raise HTTPException(
                status_code=404,
                detail=f"Mapping not found: {request.id}"
            )

        # Dispatch on the requested sync direction.
        if request.direction == "ls_to_dm":
            result = await sync_service.sync_annotations_from_ls_to_dm(
                mapping,
                request.batch_size,
                request.overwrite
            )
        elif request.direction == "dm_to_ls":
            result = await sync_service.sync_annotations_from_dm_to_ls(
                mapping,
                request.batch_size,
                request.overwrite_labeling_project
            )
        elif request.direction == "bidirectional":
            result = await sync_service.sync_annotations_bidirectional(
                mapping,
                request.batch_size,
                request.overwrite,
                request.overwrite_labeling_project
            )
        else:
            # Unreachable when the request model validates `direction`
            # (Literal), kept as a defensive guard.
            raise HTTPException(
                status_code=400,
                detail=f"Invalid direction: {request.direction}"
            )

        logger.info(f"Annotation sync completed: synced_to_dm={result.synced_to_dm}, synced_to_ls={result.synced_to_ls}, conflicts_resolved={result.conflicts_resolved}")

        return StandardResponse(
            code=200,
            message="success",
            data=result
        )
    except HTTPException:
        raise
    except Exception as e:
        # FIX: log the full traceback server-side but return a generic detail —
        # echoing str(e) to the client leaked internal error text and was
        # inconsistent with the dataset-sync endpoint's error handling.
        logger.exception(f"Error syncing annotations: {e}")
        raise HTTPException(status_code=500, detail="Internal server error")
@router.get("/check-ls-connection")
async def check_label_studio_connection():
    """
    Check the Label Studio connection status.

    Diagnoses Label Studio connectivity problems by attempting to list
    projects, and returns the connection status plus the effective
    configuration (base URL and a truncated token).
    """
    try:
        ls_client = LabelStudioClient(
            base_url=settings.label_studio_base_url,
            token=settings.label_studio_user_token
        )
        # Mask the token for display (first 10 chars). Computed once here;
        # previously this expression was duplicated in both branches below.
        token_display = settings.label_studio_user_token[:10] + "..." if settings.label_studio_user_token else "None"

        # Listing projects is a cheap authenticated call, so it exercises
        # both reachability and token validity in one request.
        try:
            response = await ls_client.client.get("/api/projects")
            response.raise_for_status()
            projects = response.json()

            return StandardResponse(
                code=200,
                message="success",
                data={
                    "status": "connected",
                    "base_url": settings.label_studio_base_url,
                    "token": token_display,
                    "projects_count": len(projects.get("results", [])) if isinstance(projects, dict) else len(projects),
                    "message": "Successfully connected to Label Studio"
                }
            )
        except Exception as e:
            # Best-effort diagnostics: report the failure rather than raising,
            # so the caller gets actionable troubleshooting steps.
            return StandardResponse(
                code=500,
                message="error",
                data={
                    "status": "disconnected",
                    "base_url": settings.label_studio_base_url,
                    "token": token_display,
                    "error": str(e),
                    "message": f"Failed to connect to Label Studio: {str(e)}",
                    "troubleshooting": [
                        "1. Check if Label Studio is running: docker ps | grep label-studio",
                        "2. Verify LABEL_STUDIO_BASE_URL in .env file",
                        "3. Verify LABEL_STUDIO_USER_TOKEN is valid",
                        "4. Check network connectivity between services"
                    ]
                }
            )
    except Exception as e:
        logger.error(f"Error checking Label Studio connection: {e}")
        raise HTTPException(status_code=500, detail=str(e))

View File

@@ -11,6 +11,8 @@ from .mapping import (
from .sync import ( from .sync import (
SyncDatasetRequest, SyncDatasetRequest,
SyncDatasetResponse, SyncDatasetResponse,
SyncAnnotationsRequest,
SyncAnnotationsResponse,
) )
__all__ = [ __all__ = [
@@ -21,5 +23,7 @@ __all__ = [
"DatasetMappingResponse", "DatasetMappingResponse",
"SyncDatasetRequest", "SyncDatasetRequest",
"SyncDatasetResponse", "SyncDatasetResponse",
"SyncAnnotationsRequest",
"SyncAnnotationsResponse",
"DeleteDatasetResponse", "DeleteDatasetResponse",
] ]

View File

@@ -1,4 +1,7 @@
from pydantic import Field from typing import Literal, List, Dict, Any, Optional
from datetime import datetime
from pydantic import Field, ConfigDict
from app.module.shared.schema import BaseResponseModel from app.module.shared.schema import BaseResponseModel
from app.module.shared.schema import StandardResponse from app.module.shared.schema import StandardResponse
@@ -6,8 +9,27 @@ from app.module.shared.schema import StandardResponse
class SyncDatasetRequest(BaseResponseModel): class SyncDatasetRequest(BaseResponseModel):
"""同步数据集请求模型""" """同步数据集请求模型"""
model_config = ConfigDict(populate_by_name=True)
id: str = Field(..., description="映射ID(mapping UUID)") id: str = Field(..., description="映射ID(mapping UUID)")
batch_size: int = Field(50, ge=1, le=100, description="批处理大小") batch_size: int = Field(50, ge=1, le=100, description="批处理大小", alias="batchSize")
file_priority: Literal[0, 1] = Field(0, description="0 数据集为主,1 标注平台为主", alias="filePriority")
label_priority: Literal[0, 1] = Field(0, description="0 数据集为主,1 标注平台为主", alias="labelPriority")
sync_annotations: bool = Field(True, description="是否同步标注数据", alias="syncAnnotations")
annotation_direction: Literal["ls_to_dm", "dm_to_ls", "bidirectional"] = Field(
"bidirectional",
description="标注同步方向: ls_to_dm(Label Studio到数据集), dm_to_ls(数据集到Label Studio), bidirectional(双向)",
alias="annotationDirection"
)
overwrite: bool = Field(
True,
description="是否覆盖DataMate中的标注(基于时间戳比较)"
)
overwrite_labeling_project: bool = Field(
True,
description="是否覆盖Label Studio中的标注(基于时间戳比较)",
alias="overwriteLabelingProject"
)
class SyncDatasetResponse(BaseResponseModel): class SyncDatasetResponse(BaseResponseModel):
"""同步数据集响应模型""" """同步数据集响应模型"""
@@ -19,3 +41,52 @@ class SyncDatasetResponse(BaseResponseModel):
class SyncDatasetResponseStd(StandardResponse[SyncDatasetResponse]): class SyncDatasetResponseStd(StandardResponse[SyncDatasetResponse]):
pass pass
class SyncAnnotationsRequest(BaseResponseModel):
    """Annotation sync request model.

    API field names are camelCase (via aliases) while Python code uses
    snake_case internally; Pydantic converts between the two automatically.
    """
    # Accept either the snake_case field name or its camelCase alias.
    model_config = ConfigDict(populate_by_name=True)

    id: str = Field(..., description="映射ID(mapping UUID)")
    batch_size: int = Field(50, ge=1, le=100, description="批处理大小", alias="batchSize")
    # Sync direction: ls_to_dm (Label Studio -> dataset), dm_to_ls
    # (dataset -> Label Studio), or bidirectional (default).
    direction: Literal["ls_to_dm", "dm_to_ls", "bidirectional"] = Field(
        "bidirectional",
        description="同步方向: ls_to_dm(Label Studio到数据集), dm_to_ls(数据集到Label Studio), bidirectional(双向)"
    )
    # When True, a newer Label Studio annotation overwrites DataMate's copy
    # (timestamp comparison).
    overwrite: bool = Field(
        True,
        description="是否覆盖DataMate中的标注(基于时间戳比较)。True时,如果Label Studio的标注更新时间更新,则覆盖DataMate的标注"
    )
    # When True, a newer DataMate annotation overwrites Label Studio's copy
    # (timestamp comparison).
    overwrite_labeling_project: bool = Field(
        True,
        description="是否覆盖Label Studio中的标注(基于时间戳比较)。True时,如果DataMate的标注更新时间更新,则覆盖Label Studio的标注",
        alias="overwriteLabelingProject"
    )
class TagInfo(BaseResponseModel):
    """Single annotation entry stored in a file's tags data.

    Deliberately carries no timestamp: the last-update time lives at file
    level in the `tags_updated_at` column.
    """
    from_name: str = Field(..., description="标注工具名称")
    to_name: str = Field(..., description="目标对象名称")
    type: str = Field(..., description="标注类型")
    values: Dict[str, Any] = Field(..., description="标注值")
class SyncAnnotationsResponse(BaseResponseModel):
    """Annotation sync response model: per-direction counters plus status."""
    id: str = Field(..., description="映射UUID")
    status: str = Field(..., description="同步状态: success/partial/error")
    synced_to_dm: int = Field(0, description="同步到数据集的标注数量")
    synced_to_ls: int = Field(0, description="同步到Label Studio的标注数量")
    skipped: int = Field(0, description="跳过的标注数量")
    failed: int = Field(0, description="失败的标注数量")
    conflicts_resolved: int = Field(0, description="解决的冲突数量")
    message: str = Field(..., description="响应消息")
class SyncAnnotationsResponseStd(StandardResponse[SyncAnnotationsResponse]):
    # Concrete StandardResponse specialization of SyncAnnotationsResponse,
    # useful as an explicit response_model / OpenAPI schema name.
    pass

File diff suppressed because it is too large Load Diff

View File

@@ -15,6 +15,8 @@ class DatasetFileResponse(BaseModel):
description: Optional[str] = Field(None, description="文件描述") description: Optional[str] = Field(None, description="文件描述")
uploadedBy: Optional[str] = Field(None, description="上传者") uploadedBy: Optional[str] = Field(None, description="上传者")
lastAccessTime: Optional[datetime] = Field(None, description="最后访问时间") lastAccessTime: Optional[datetime] = Field(None, description="最后访问时间")
tags: Optional[List[Dict[str, Any]]] = Field(None, description="文件标签/标注信息")
tags_updated_at: Optional[datetime] = Field(None, description="标签最后更新时间", alias="tagsUpdatedAt")
class PagedDatasetFileResponse(BaseModel): class PagedDatasetFileResponse(BaseModel):
"""DM服务分页文件响应模型""" """DM服务分页文件响应模型"""

View File

@@ -108,7 +108,9 @@ class Service:
uploadedAt=f.upload_time, # type: ignore uploadedAt=f.upload_time, # type: ignore
description=None, description=None,
uploadedBy=None, uploadedBy=None,
lastAccessTime=f.last_access_time # type: ignore lastAccessTime=f.last_access_time, # type: ignore
tags=f.tags, # type: ignore
tags_updated_at=f.tags_updated_at # type: ignore
) )
for f in files for f in files
] ]

View File

@@ -1,3 +1,6 @@
export LOG_LEVEL=DEBUG
export DEBUG=true
uvicorn app.main:app \ uvicorn app.main:app \
--host 0.0.0.0 \ --host 0.0.0.0 \
--port 18000 \ --port 18000 \

View File

@@ -55,6 +55,7 @@ CREATE TABLE IF NOT EXISTS t_dm_dataset_files (
file_size BIGINT DEFAULT 0 COMMENT '文件大小(字节)', file_size BIGINT DEFAULT 0 COMMENT '文件大小(字节)',
check_sum VARCHAR(64) COMMENT '文件校验和', check_sum VARCHAR(64) COMMENT '文件校验和',
tags JSON COMMENT '文件标签信息', tags JSON COMMENT '文件标签信息',
tags_updated_at TIMESTAMP NULL COMMENT '标签最后更新时间',
metadata JSON COMMENT '文件元数据', metadata JSON COMMENT '文件元数据',
status VARCHAR(50) DEFAULT 'ACTIVE' COMMENT '文件状态:ACTIVE/DELETED/PROCESSING', status VARCHAR(50) DEFAULT 'ACTIVE' COMMENT '文件状态:ACTIVE/DELETED/PROCESSING',
upload_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '上传时间', upload_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '上传时间',