Files
DataMate/runtime/datamate-python/app/db/models/dataset_management.py
Jerry Yan d0972cbc9d feat(data-management): 实现数据集文件版本管理和内部路径保护
- 将数据集文件查询方法替换为只查询可见文件的版本
- 引入文件状态管理(ACTIVE/ARCHIVED)和内部目录结构
- 实现文件重复处理策略,支持版本控制模式而非覆盖
- 添加内部数据目录保护,防止访问.datamate等系统目录
- 重构文件上传流程,引入暂存目录和事务后清理机制
- 实现文件版本归档功能,保留历史版本到专用存储位置
- 优化文件路径规范化和安全验证逻辑
- 修复文件删除逻辑,确保归档文件不会被错误移除
- 更新数据集压缩下载功能以排除内部系统文件
2026-02-04 23:53:35 +08:00

118 lines
6.8 KiB
Python

"""
SQLAlchemy table models for the Dataset Management module.
"""
import uuid
from sqlalchemy import Column, String, BigInteger, Boolean, TIMESTAMP, Text, Integer, JSON, Date
from sqlalchemy.sql import func
from app.db.session import Base
class Dataset(Base):
    """Dataset model (supports medical imaging, text, Q&A and other dataset types)."""

    __tablename__ = "t_dm_datasets"

    # --- Identity and hierarchy ---
    id = Column(
        String(36),
        primary_key=True,
        default=lambda: str(uuid.uuid4()),  # fresh UUID string per new row
        comment="UUID",
    )
    parent_dataset_id = Column(
        String(36),
        nullable=True,
        comment="父数据集ID(UUID)",
    )

    # --- Descriptive fields ---
    name = Column(
        String(255),
        nullable=False,
        comment="数据集名称",
    )
    description = Column(
        Text,
        nullable=True,
        comment="数据集描述",
    )
    dataset_type = Column(
        String(50),
        nullable=False,
        comment="数据集类型:IMAGE/TEXT/QA/MULTIMODAL/OTHER",
    )
    category = Column(
        String(100),
        nullable=True,
        comment="数据集分类:医学影像/问答/文献等",
    )

    # --- Storage layout ---
    path = Column(
        String(500),
        nullable=True,
        comment="数据存储路径",
    )
    format = Column(
        String(50),
        nullable=True,
        comment="数据格式:DCM/JPG/JSON/CSV等",
    )
    schema_info = Column(
        JSON,
        nullable=True,
        comment="数据结构信息",
    )

    # --- Size counters (all default to 0) ---
    size_bytes = Column(
        BigInteger,
        default=0,
        comment="数据大小(字节)",
    )
    file_count = Column(
        BigInteger,
        default=0,
        comment="文件数量",
    )
    record_count = Column(
        BigInteger,
        default=0,
        comment="记录数量",
    )
    retention_days = Column(
        Integer,
        default=0,
        comment="数据保留天数(0表示长期保留)",
    )

    # --- Tags and free-form metadata ---
    tags = Column(
        JSON,
        nullable=True,
        comment="标签列表",
    )
    # The database column is named "metadata"; the Python attribute uses a
    # different name, presumably because `metadata` is reserved on
    # SQLAlchemy declarative classes.
    dataset_metadata = Column(
        "metadata",
        JSON,
        nullable=True,
        comment="元数据信息",
    )

    # --- Lifecycle flags ---
    status = Column(
        String(50),
        default='DRAFT',
        comment="状态:DRAFT/ACTIVE/ARCHIVED",
    )
    is_public = Column(
        Boolean,
        default=False,
        comment="是否公开",
    )
    is_featured = Column(
        Boolean,
        default=False,
        comment="是否推荐",
    )
    version = Column(
        BigInteger,
        nullable=False,
        default=0,
        comment="版本号",
    )

    # --- Audit trail ---
    created_at = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        comment="创建时间",
    )
    updated_at = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),  # refreshed on every UPDATE issued via SQLAlchemy
        comment="更新时间",
    )
    created_by = Column(
        String(255),
        nullable=True,
        comment="创建者",
    )
    updated_by = Column(
        String(255),
        nullable=True,
        comment="更新者",
    )

    def __repr__(self):
        """Return a short debug string showing the id, name and dataset type."""
        return f"<Dataset(id={self.id}, name={self.name}, type={self.dataset_type})>"
class DatasetTag(Base):
    """Association model linking a dataset to a tag."""

    __tablename__ = "t_dm_dataset_tags"

    # Both columns are flagged primary_key, so the pair forms a composite key.
    dataset_id = Column(
        String(36),
        primary_key=True,
        comment="数据集ID(UUID)",
    )
    tag_id = Column(
        String(36),
        primary_key=True,
        comment="标签ID(UUID)",
    )

    created_at = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        comment="创建时间",
    )

    def __repr__(self):
        """Return a short debug string showing the dataset id and tag id."""
        return f"<DatasetTag(dataset_id={self.dataset_id}, tag_id={self.tag_id})>"
class DatasetFiles(Base):
    """DM dataset file model."""

    __tablename__ = "t_dm_dataset_files"

    # --- Identity ---
    id = Column(
        String(36),
        primary_key=True,
        default=lambda: str(uuid.uuid4()),  # fresh UUID string per new row
        comment="UUID",
    )
    dataset_id = Column(
        String(36),
        nullable=False,
        comment="所属数据集ID(UUID)",
    )

    # --- Naming, location and versioning ---
    file_name = Column(
        String(255),
        nullable=False,
        comment="文件名",
    )
    file_path = Column(
        String(1000),
        nullable=False,
        comment="文件路径",
    )
    # Per the column comment: logical path relative to the dataset root.
    logical_path = Column(
        String(1000),
        nullable=False,
        comment="文件逻辑路径(相对数据集根目录)",
    )
    # Per the column comment: version number, incremented per logical_path.
    version = Column(
        BigInteger,
        nullable=False,
        default=1,
        comment="文件版本号(同 logical_path 递增)",
    )

    # --- Content description ---
    file_type = Column(
        String(50),
        nullable=True,
        comment="文件格式:JPG/PNG/DCM/TXT等",
    )
    file_size = Column(
        BigInteger,
        default=0,
        comment="文件大小(字节)",
    )
    check_sum = Column(
        String(64),
        nullable=True,
        comment="文件校验和",
    )

    # --- Tags and free-form metadata ---
    tags = Column(
        JSON,
        nullable=True,
        comment="文件标签信息",
    )
    tags_updated_at = Column(
        TIMESTAMP,
        nullable=True,
        comment="标签最后更新时间",
    )
    # The database column is named "metadata"; the Python attribute uses a
    # different name, presumably because `metadata` is reserved on
    # SQLAlchemy declarative classes.
    dataset_filemetadata = Column(
        "metadata",
        JSON,
        nullable=True,
        comment="文件元数据",
    )

    # --- Lifecycle ---
    status = Column(
        String(50),
        default='ACTIVE',
        comment="文件状态:ACTIVE/ARCHIVED/DELETED/PROCESSING",
    )

    # --- Timestamps ---
    upload_time = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        comment="上传时间",
    )
    last_access_time = Column(
        TIMESTAMP,
        nullable=True,
        comment="最后访问时间",
    )
    created_at = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        comment="创建时间",
    )
    updated_at = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),  # refreshed on every UPDATE issued via SQLAlchemy
        comment="更新时间",
    )

    def __repr__(self):
        """Return a short debug string showing the id, dataset id and file name."""
        return f"<DatasetFiles(id={self.id}, dataset_id={self.dataset_id}, file_name={self.file_name})>"
class DatasetStatistics(Base):
    """Dataset statistics model."""

    __tablename__ = "t_dm_dataset_statistics"

    # --- Identity ---
    id = Column(
        String(36),
        primary_key=True,
        default=lambda: str(uuid.uuid4()),  # fresh UUID string per new row
        comment="UUID",
    )
    dataset_id = Column(
        String(36),
        nullable=False,
        comment="数据集ID(UUID)",
    )
    stat_date = Column(
        Date,
        nullable=False,
        comment="统计日期",
    )

    # --- Counters (all default to 0) ---
    total_files = Column(
        BigInteger,
        default=0,
        comment="总文件数",
    )
    total_size = Column(
        BigInteger,
        default=0,
        comment="总大小(字节)",
    )
    processed_files = Column(
        BigInteger,
        default=0,
        comment="已处理文件数",
    )
    error_files = Column(
        BigInteger,
        default=0,
        comment="错误文件数",
    )
    download_count = Column(
        BigInteger,
        default=0,
        comment="下载次数",
    )
    view_count = Column(
        BigInteger,
        default=0,
        comment="查看次数",
    )

    # --- Free-form quality data ---
    quality_metrics = Column(
        JSON,
        nullable=True,
        comment="质量指标",
    )

    # --- Timestamps ---
    created_at = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        comment="创建时间",
    )
    updated_at = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),  # refreshed on every UPDATE issued via SQLAlchemy
        comment="更新时间",
    )

    def __repr__(self):
        """Return a short debug string showing the id, dataset id and statistics date."""
        return f"<DatasetStatistics(id={self.id}, dataset_id={self.dataset_id}, date={self.stat_date})>"
class Tag(Base):
    """Tag collection model."""

    __tablename__ = "t_dm_tags"

    id = Column(
        String(36),
        primary_key=True,
        default=lambda: str(uuid.uuid4()),  # fresh UUID string per new row
        comment="UUID",
    )
    # Tag names carry a unique constraint.
    name = Column(
        String(100),
        nullable=False,
        unique=True,
        comment="标签名称",
    )
    description = Column(
        Text,
        nullable=True,
        comment="标签描述",
    )
    category = Column(
        String(50),
        nullable=True,
        comment="标签分类",
    )
    # Per the column comment: hexadecimal colour; the column holds at most 7 characters.
    color = Column(
        String(7),
        nullable=True,
        comment="标签颜色(十六进制)",
    )
    usage_count = Column(
        BigInteger,
        default=0,
        comment="使用次数",
    )

    # --- Timestamps ---
    created_at = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        comment="创建时间",
    )
    updated_at = Column(
        TIMESTAMP,
        server_default=func.current_timestamp(),
        onupdate=func.current_timestamp(),  # refreshed on every UPDATE issued via SQLAlchemy
        comment="更新时间",
    )

    def __repr__(self):
        """Return a short debug string showing the id, name and category."""
        return f"<Tag(id={self.id}, name={self.name}, category={self.category})>"