refactor: Reorganize datamate-python (#34)

refactor: Reorganize datamate-python (previously label-studio-adapter) into a DDD-style structure.
This commit is contained in:
Jason Wang
2025-10-30 01:32:59 +08:00
committed by GitHub
parent 0614157c0b
commit 2f7341dc1f
79 changed files with 1077 additions and 1577 deletions

View File

@@ -0,0 +1,3 @@
from .service import DatasetManagementService
__all__ = ["DatasetManagementService"]

View File

@@ -0,0 +1,16 @@
from .dataset_file import (
DatasetFileResponse,
PagedDatasetFileResponse,
)
from .dataset import (
DatasetResponse,
DatasetTypeResponse,
)
__all__ = [
"DatasetResponse",
"DatasetFileResponse",
"PagedDatasetFileResponse",
"DatasetTypeResponse",
]

View File

@@ -0,0 +1,36 @@
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any
from datetime import datetime
class DatasetTypeResponse(BaseModel):
    """Response model describing a dataset type."""

    # Identifying fields: both are required.
    code: str = Field(..., description="类型编码")
    name: str = Field(..., description="类型名称")

    # Optional presentation metadata; omitted values fall back to
    # None (or a fresh empty list for the supported formats).
    description: Optional[str] = Field(default=None, description="类型描述")
    supportedFormats: List[str] = Field(
        default_factory=list,
        description="支持的文件格式",
    )
    icon: Optional[str] = Field(default=None, description="图标")
class DatasetResponse(BaseModel):
    """Response model for a dataset returned by the DM service."""

    # Required core attributes.
    id: str = Field(..., description="数据集ID")
    name: str = Field(..., description="数据集名称")
    description: Optional[str] = Field(default=None, description="数据集描述")
    datasetType: str = Field(..., alias="datasetType", description="数据集类型")
    status: str = Field(..., description="数据集状态")

    # Size statistics (total size is expressed in bytes).
    fileCount: int = Field(..., description="文件数量")
    totalSize: int = Field(..., description="总大小(字节)")

    # Optional audit information.
    createdAt: Optional[datetime] = Field(default=None, description="创建时间")
    updatedAt: Optional[datetime] = Field(default=None, description="更新时间")
    createdBy: Optional[str] = Field(default=None, description="创建者")

    @property
    def type(self) -> DatasetTypeResponse:
        """Backward-compatible accessor that exposes the type as an object.

        Only the type code is stored on this model, so it is used for both
        ``code`` and ``name``; the remaining fields are left empty.
        """
        type_code = self.datasetType
        return DatasetTypeResponse(
            code=type_code,
            name=type_code,
            description=None,
            supportedFormats=[],
            icon=None,
        )

View File

@@ -0,0 +1,26 @@
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any
from datetime import datetime
class DatasetFileResponse(BaseModel):
    """Response model for a single dataset file returned by the DM service."""

    # Required attributes identifying and locating the file.
    id: str = Field(..., description="文件ID")
    fileName: str = Field(..., description="文件名")
    fileType: str = Field(..., description="文件类型")
    filePath: str = Field(..., description="文件路径")

    # Optional descriptive attributes (size is expressed in bytes).
    originalName: Optional[str] = Field(default=None, description="原始文件名")
    size: Optional[int] = Field(default=None, description="文件大小(字节)")
    status: Optional[str] = Field(default=None, description="文件状态")
    description: Optional[str] = Field(default=None, description="文件描述")

    # Optional upload / access audit attributes.
    uploadedAt: Optional[datetime] = Field(default=None, description="上传时间")
    uploadedBy: Optional[str] = Field(default=None, description="上传者")
    lastAccessTime: Optional[datetime] = Field(
        default=None,
        description="最后访问时间",
    )
class PagedDatasetFileResponse(BaseModel):
    """Response model for one page of dataset files from the DM service."""

    # Files contained in the current page.
    content: List[DatasetFileResponse] = Field(..., description="文件列表")

    # Totals across all pages.
    totalElements: int = Field(..., description="总元素数")
    totalPages: int = Field(..., description="总页数")

    # Position and size of the current page.
    page: int = Field(..., description="当前页码")
    size: int = Field(..., description="每页大小")

View File

@@ -0,0 +1,3 @@
from .service import Service as DatasetManagementService
__all__ = ["DatasetManagementService"]

View File

@@ -0,0 +1,160 @@
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy import func
from typing import Optional
from app.core.config import settings
from app.core.logging import get_logger
from app.db.models import Dataset, DatasetFiles
from ..schema import DatasetResponse, PagedDatasetFileResponse, DatasetFileResponse
logger = get_logger(__name__)
class Service:
    """Dataset management (DM) service that reads directly from the database.

    Failure contract shared by the query methods: exceptions are logged
    (with traceback) and ``None`` is returned instead of propagating, so
    callers must treat ``None`` as "not found or failed".
    """

    def __init__(self, db: AsyncSession):
        """Bind the service to a database session.

        Args:
            db: Async database session used for every query issued by this
                service. This class never closes the session itself.
        """
        self.db = db
        logger.info("Initialize DM service client (Database mode)")

    @staticmethod
    def _to_dataset_response(dataset) -> DatasetResponse:
        """Convert a ``Dataset`` row into its response model."""
        # "type: ignore" silences SQLAlchemy column-vs-value typing noise.
        return DatasetResponse(
            id=dataset.id,  # type: ignore
            name=dataset.name,  # type: ignore
            description=dataset.description or "",  # type: ignore
            datasetType=dataset.dataset_type,  # type: ignore
            status=dataset.status,  # type: ignore
            fileCount=dataset.file_count or 0,  # type: ignore
            totalSize=dataset.size_bytes or 0,  # type: ignore
            createdAt=dataset.created_at,  # type: ignore
            updatedAt=dataset.updated_at,  # type: ignore
            createdBy=dataset.created_by,  # type: ignore
        )

    @staticmethod
    def _to_file_response(record) -> DatasetFileResponse:
        """Convert a ``DatasetFiles`` row into its response model."""
        # "type: ignore" silences SQLAlchemy column-vs-value typing noise.
        return DatasetFileResponse(
            id=record.id,  # type: ignore
            fileName=record.file_name,  # type: ignore
            fileType=record.file_type or "",  # type: ignore
            filePath=record.file_path,  # type: ignore
            # The row's file name is reused as the original name.
            originalName=record.file_name,  # type: ignore
            size=record.file_size,  # type: ignore
            status=record.status,  # type: ignore
            uploadedAt=record.upload_time,  # type: ignore
            description=None,
            uploadedBy=None,
            lastAccessTime=record.last_access_time,  # type: ignore
        )

    async def get_dataset(self, dataset_id: str) -> Optional[DatasetResponse]:
        """Fetch the details of a single dataset.

        Args:
            dataset_id: Identifier of the dataset to load.

        Returns:
            The dataset as a ``DatasetResponse``, or ``None`` when no such
            dataset exists or the lookup failed.
        """
        try:
            logger.info(f"Getting dataset detail: {dataset_id} ...")
            result = await self.db.execute(
                select(Dataset).where(Dataset.id == dataset_id)
            )
            dataset = result.scalar_one_or_none()
            # Explicit None check: do not rely on ORM object truthiness.
            if dataset is None:
                logger.error(f"Dataset not found: {dataset_id}")
                return None
            return self._to_dataset_response(dataset)
        except Exception as e:
            # logger.exception keeps the traceback that logger.error dropped.
            logger.exception(f"Failed to get dataset {dataset_id}: {e}")
            return None

    async def get_dataset_files(
        self,
        dataset_id: str,
        page: int = 0,
        size: int = 100,
        file_type: Optional[str] = None,
        status: Optional[str] = None
    ) -> Optional[PagedDatasetFileResponse]:
        """List the files of a dataset, one page at a time.

        Args:
            dataset_id: Identifier of the dataset whose files are listed.
            page: Zero-based page index.
            size: Maximum number of files per page.
            file_type: When given (and non-empty), only files of this type.
            status: When given (and non-empty), only files in this status.

        Returns:
            The requested page, or ``None`` when the pagination arguments
            are negative or the query failed.
        """
        try:
            logger.info(f"Get dataset files: dataset={dataset_id}, page={page}, size={size}")

            # Reject negative pagination up front instead of sending an
            # invalid OFFSET/LIMIT to the database.
            if page < 0 or size < 0:
                logger.error(
                    f"Invalid pagination for dataset {dataset_id}: "
                    f"page={page}, size={size}"
                )
                return None

            # Build the filter list once so the count query and the page
            # query can never drift apart.
            conditions = [DatasetFiles.dataset_id == dataset_id]
            if file_type:
                conditions.append(DatasetFiles.file_type == file_type)
            if status:
                conditions.append(DatasetFiles.status == status)

            # Total number of matching rows.
            count_query = (
                select(func.count())
                .select_from(DatasetFiles)
                .where(*conditions)
            )
            total = (await self.db.execute(count_query)).scalar_one()

            # Newest first; the id tiebreaker keeps rows with identical
            # timestamps in a stable order so pages do not overlap.
            page_query = (
                select(DatasetFiles)
                .where(*conditions)
                .order_by(DatasetFiles.created_at.desc(), DatasetFiles.id)
                .offset(page * size)
                .limit(size)
            )
            records = (await self.db.execute(page_query)).scalars().all()

            content = [self._to_file_response(record) for record in records]
            # Ceiling division; a page size of 0 yields 0 pages.
            total_pages = (total + size - 1) // size if size > 0 else 0
            return PagedDatasetFileResponse(
                content=content,
                totalElements=total,
                totalPages=total_pages,
                page=page,
                size=size,
            )
        except Exception as e:
            logger.exception(f"Failed to get dataset files for {dataset_id}: {e}")
            return None

    async def download_file(self, dataset_id: str, file_id: str) -> Optional[bytes]:
        """Deprecated placeholder kept for interface compatibility.

        In database mode no file content is fetched here; the method only
        logs a warning and always returns ``None``. Use
        ``get_file_download_url`` instead.
        """
        logger.warning("download_file is deprecated when using database mode. Use get_file_download_url instead.")
        return None

    async def get_file_download_url(self, dataset_id: str, file_id: str) -> Optional[str]:
        """Look up the stored path of a file within a dataset.

        Args:
            dataset_id: Identifier of the dataset that must own the file.
            file_id: Identifier of the file.

        Returns:
            The file's stored ``file_path`` value, or ``None`` when the file
            does not exist in that dataset or the lookup failed.
        """
        try:
            result = await self.db.execute(
                select(DatasetFiles).where(
                    DatasetFiles.id == file_id,
                    DatasetFiles.dataset_id == dataset_id,
                )
            )
            file_record = result.scalar_one_or_none()
            if file_record is None:
                logger.error(f"File not found: {file_id} in dataset {dataset_id}")
                return None
            # The stored path is returned as-is.
            return file_record.file_path  # type: ignore
        except Exception as e:
            logger.exception(f"Failed to get file path for {file_id}: {e}")
            return None

    async def close(self):
        """Log that the client was closed; nothing else is done in database mode."""
        logger.info("DM service client closed (Database mode)")