Files
FrameTour-RenderWorker/services/cache.py
Jerry Yan 6126856361 feat(cache): 实现上传文件缓存功能
- 在文件上传成功后将文件加入缓存系统
- 添加 add_to_cache 方法支持本地文件缓存
- 实现原子操作确保缓存写入安全
- 集成锁机制防止并发冲突
- 自动触发缓存清理策略
- 记录详细的缓存操作日志
2026-01-26 10:41:26 +08:00

417 lines
13 KiB
Python

# -*- coding: utf-8 -*-
"""
素材缓存服务
提供素材下载缓存功能,避免相同素材重复下载。
"""
import os
import hashlib
import logging
import shutil
import time
import uuid
from typing import Optional, Tuple
from urllib.parse import urlparse, unquote
from services import storage
logger = logging.getLogger(__name__)
def _extract_cache_key(url: str) -> str:
"""
从 URL 提取缓存键
去除签名等查询参数,保留路径作为唯一标识。
Args:
url: 完整的素材 URL
Returns:
缓存键(URL 路径的 MD5 哈希)
"""
parsed = urlparse(url)
# 使用 scheme + host + path 作为唯一标识(忽略签名等查询参数)
cache_key_source = f"{parsed.scheme}://{parsed.netloc}{unquote(parsed.path)}"
return hashlib.md5(cache_key_source.encode('utf-8')).hexdigest()
def _get_file_extension(url: str) -> str:
"""
从 URL 提取文件扩展名
Args:
url: 素材 URL
Returns:
文件扩展名(如 .mp4, .png),无法识别时返回空字符串
"""
parsed = urlparse(url)
path = unquote(parsed.path)
_, ext = os.path.splitext(path)
return ext.lower() if ext else ''
class MaterialCache:
    """
    Material cache manager.

    Stores material files keyed by a hash of their URL so the same material
    is never downloaded twice. Writers publish files via a uniquely named
    temp file followed by an atomic ``os.replace``; cross-process
    coordination uses exclusive lock files created with O_CREAT | O_EXCL.
    """

    # Max time to wait for another process's lock before giving up.
    LOCK_TIMEOUT_SEC = 30.0
    # Poll interval while waiting for a contended lock.
    LOCK_POLL_INTERVAL_SEC = 0.1
    # Suffixes of transient files (in-flight download/add temp files and
    # lock files) that must never be treated as cache entries.
    TRANSIENT_SUFFIXES = ('.downloading', '.adding', '.lock')

    def __init__(self, cache_dir: str, enabled: bool = True, max_size_gb: float = 0):
        """
        Initialize the cache manager.

        Args:
            cache_dir: Directory where cached files live.
            enabled: Whether caching is active; when False, operations
                are no-ops or fall through to direct downloads.
            max_size_gb: Maximum cache size in GB; 0 means unlimited.
        """
        self.cache_dir = cache_dir
        self.enabled = enabled
        self.max_size_bytes = int(max_size_gb * 1024 * 1024 * 1024) if max_size_gb > 0 else 0
        if self.enabled:
            os.makedirs(self.cache_dir, exist_ok=True)
            logger.info(f"Material cache initialized: {cache_dir}")

    def get_cache_path(self, url: str) -> str:
        """
        Compute the cache file path for a material URL.

        Args:
            url: Material URL.

        Returns:
            Full path of the cache file (``<md5-of-url><ext>``).
        """
        cache_key = _extract_cache_key(url)
        ext = _get_file_extension(url)
        filename = f"{cache_key}{ext}"
        return os.path.join(self.cache_dir, filename)

    def _get_lock_path(self, cache_key: str) -> str:
        """Return the lock-file path for a cache key."""
        return os.path.join(self.cache_dir, f"{cache_key}.lock")

    def _acquire_lock(self, cache_key: str) -> Optional[str]:
        """
        Acquire a cross-process lock for a cache key.

        Uses O_CREAT | O_EXCL so exactly one process can create the lock
        file; others poll until LOCK_TIMEOUT_SEC elapses.

        Returns:
            The lock file path on success, or None on timeout/error
            (callers then proceed without the cache).
        """
        if not self.enabled:
            return None
        lock_path = self._get_lock_path(cache_key)
        deadline = time.monotonic() + self.LOCK_TIMEOUT_SEC
        while True:
            try:
                fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
                os.close(fd)
                return lock_path
            except FileExistsError:
                # Someone else holds the lock; poll until the deadline.
                if time.monotonic() >= deadline:
                    logger.warning(f"Cache lock timeout: {lock_path}")
                    return None
                time.sleep(self.LOCK_POLL_INTERVAL_SEC)
            except Exception as e:
                logger.warning(f"Cache lock error: {e}")
                return None

    def _release_lock(self, lock_path: Optional[str]) -> None:
        """Release a lock acquired by ``_acquire_lock`` (None is a no-op)."""
        if not lock_path:
            return
        try:
            os.remove(lock_path)
        except FileNotFoundError:
            # Already gone (e.g. manual cleanup) — nothing to do.
            return
        except Exception as e:
            logger.warning(f"Cache lock release error: {e}")

    def is_cached(self, url: str) -> Tuple[bool, str]:
        """
        Check whether a material is present in the cache.

        A zero-byte file is treated as not cached.

        Args:
            url: Material URL.

        Returns:
            Tuple of (is cached, cache file path). The path is '' when
            the cache is disabled.
        """
        if not self.enabled:
            return False, ''
        cache_path = self.get_cache_path(url)
        exists = os.path.exists(cache_path) and os.path.getsize(cache_path) > 0
        return exists, cache_path

    def get_or_download(
        self,
        url: str,
        dest: str,
        timeout: int = 300,
        max_retries: int = 5
    ) -> bool:
        """
        Fetch a material from the cache, downloading and caching on a miss.

        Args:
            url: Material URL.
            dest: Destination path (inside the task's working directory).
            timeout: Download timeout in seconds.
            max_retries: Maximum number of download retries.

        Returns:
            True on success, False otherwise.
        """
        # Ensure the destination directory exists.
        dest_dir = os.path.dirname(dest)
        if dest_dir:
            os.makedirs(dest_dir, exist_ok=True)
        # With caching disabled, download straight to the destination.
        if not self.enabled:
            return storage.download_file(url, dest, max_retries=max_retries, timeout=timeout)
        cache_key = _extract_cache_key(url)
        lock_path = self._acquire_lock(cache_key)
        if not lock_path:
            # Can't coordinate with other processes — degrade to a plain
            # uncached download rather than failing the task.
            logger.warning(f"Cache lock unavailable, downloading without cache: {url[:80]}...")
            return storage.download_file(url, dest, max_retries=max_retries, timeout=timeout)
        try:
            cache_path = self.get_cache_path(url)
            cached = os.path.exists(cache_path) and os.path.getsize(cache_path) > 0
            if cached:
                # Cache hit: copy the cached file to the destination.
                try:
                    shutil.copy2(cache_path, dest)
                    # Touch the entry so LRU cleanup sees it as recently used.
                    os.utime(cache_path, None)
                    file_size = os.path.getsize(dest)
                    logger.info(f"Cache hit: {url[:80]}... -> {dest} ({file_size} bytes)")
                    return True
                except Exception as e:
                    logger.warning(f"Failed to copy from cache: {e}, will re-download")
                    # The entry may be corrupt; drop it and fall through
                    # to a fresh download.
                    try:
                        os.remove(cache_path)
                    except Exception:
                        pass
            # Cache miss: download into the cache directory first.
            logger.debug(f"Cache miss: {url[:80]}...")
            # Unique temp name so concurrent writers can never clobber
            # each other's partial downloads.
            temp_cache_path = os.path.join(
                self.cache_dir,
                f"{cache_key}.{uuid.uuid4().hex}.downloading"
            )
            try:
                if not storage.download_file(url, temp_cache_path, max_retries=max_retries, timeout=timeout):
                    # Download failed — remove the partial temp file.
                    if os.path.exists(temp_cache_path):
                        os.remove(temp_cache_path)
                    return False
                if not os.path.exists(temp_cache_path) or os.path.getsize(temp_cache_path) <= 0:
                    # Missing or empty result counts as a failure.
                    if os.path.exists(temp_cache_path):
                        os.remove(temp_cache_path)
                    return False
                # Publish the download atomically.
                os.replace(temp_cache_path, cache_path)
                # Hand a copy to the caller.
                shutil.copy2(cache_path, dest)
                file_size = os.path.getsize(dest)
                logger.info(f"Downloaded and cached: {url[:80]}... ({file_size} bytes)")
                # Enforce the size limit if one is configured.
                if self.max_size_bytes > 0:
                    self._cleanup_if_needed()
                return True
            except Exception as e:
                logger.error(f"Cache download error: {e}")
                # Best-effort removal of the temp file.
                if os.path.exists(temp_cache_path):
                    try:
                        os.remove(temp_cache_path)
                    except Exception:
                        pass
                return False
        finally:
            self._release_lock(lock_path)

    def add_to_cache(self, url: str, source_path: str) -> bool:
        """
        Add an existing local file to the cache.

        Args:
            url: URL associated with the file (used to derive the cache key).
            source_path: Path of the local file to cache.

        Returns:
            True on success, False otherwise.
        """
        if not self.enabled:
            return False
        if not os.path.exists(source_path):
            logger.warning(f"Source file not found for cache: {source_path}")
            return False
        cache_key = _extract_cache_key(url)
        lock_path = self._acquire_lock(cache_key)
        if not lock_path:
            logger.warning(f"Cache lock unavailable for adding: {url[:80]}...")
            return False
        try:
            cache_path = self.get_cache_path(url)
            # Copy to a uniquely named temp file first, then publish
            # atomically, mirroring get_or_download().
            temp_cache_path = os.path.join(
                self.cache_dir,
                f"{cache_key}.{uuid.uuid4().hex}.adding"
            )
            shutil.copy2(source_path, temp_cache_path)
            os.replace(temp_cache_path, cache_path)
            # Touch the entry so LRU cleanup sees it as recently used.
            os.utime(cache_path, None)
            logger.info(f"Added to cache: {url[:80]}... <- {source_path}")
            # Enforce the size limit if one is configured.
            if self.max_size_bytes > 0:
                self._cleanup_if_needed()
            return True
        except Exception as e:
            logger.error(f"Failed to add to cache: {e}")
            # temp_cache_path may not exist if copy2 itself failed early.
            if 'temp_cache_path' in locals() and os.path.exists(temp_cache_path):
                try:
                    os.remove(temp_cache_path)
                except Exception:
                    pass
            return False
        finally:
            self._release_lock(lock_path)

    def _cleanup_if_needed(self) -> None:
        """
        Enforce the cache size limit with an LRU policy.

        When the total size exceeds the limit, least-recently-accessed
        files are deleted until usage drops to 80% of the limit.
        Transient files are never counted or deleted, and entries whose
        lock file exists are skipped (another task is using them).
        """
        if self.max_size_bytes <= 0:
            return
        try:
            # Collect every real cache entry with its size and atime.
            cache_files = []
            total_size = 0
            for filename in os.listdir(self.cache_dir):
                # BUGFIX: skip all transient suffixes. Previously only
                # '.downloading' and '.lock' were excluded, so an
                # in-flight '.adding' temp file from add_to_cache()
                # could be deleted mid-copy by cleanup.
                if filename.endswith(self.TRANSIENT_SUFFIXES):
                    continue
                file_path = os.path.join(self.cache_dir, filename)
                if os.path.isfile(file_path):
                    stat = os.stat(file_path)
                    cache_files.append({
                        'path': file_path,
                        'size': stat.st_size,
                        'atime': stat.st_atime
                    })
                    total_size += stat.st_size
            # Under the limit: nothing to do.
            if total_size <= self.max_size_bytes:
                return
            # Oldest access time first.
            cache_files.sort(key=lambda x: x['atime'])
            # Delete until usage is at most 80% of the limit.
            target_size = int(self.max_size_bytes * 0.8)
            deleted_count = 0
            for file_info in cache_files:
                if total_size <= target_size:
                    break
                # Recover the cache key from the file name; if its lock
                # file exists the entry is in use — skip it.
                filename = os.path.basename(file_info['path'])
                cache_key = os.path.splitext(filename)[0]
                lock_path = self._get_lock_path(cache_key)
                if os.path.exists(lock_path):
                    logger.debug(f"Cache cleanup: skipping locked file {filename}")
                    continue
                try:
                    os.remove(file_info['path'])
                    total_size -= file_info['size']
                    deleted_count += 1
                except Exception as e:
                    logger.warning(f"Failed to delete cache file: {e}")
            if deleted_count > 0:
                logger.info(f"Cache cleanup: deleted {deleted_count} files, current size: {total_size / (1024*1024*1024):.2f} GB")
        except Exception as e:
            logger.warning(f"Cache cleanup error: {e}")

    def clear(self) -> None:
        """Remove the entire cache directory and recreate it empty."""
        if not self.enabled:
            return
        try:
            if os.path.exists(self.cache_dir):
                shutil.rmtree(self.cache_dir)
            os.makedirs(self.cache_dir, exist_ok=True)
            logger.info("Cache cleared")
        except Exception as e:
            logger.error(f"Failed to clear cache: {e}")

    def get_stats(self) -> dict:
        """
        Collect cache usage statistics.

        Returns:
            Dict with 'enabled', 'cache_dir', 'file_count',
            'total_size_mb' and 'max_size_gb' keys (a reduced dict when
            the cache is disabled or the directory is missing).
        """
        if not self.enabled or not os.path.exists(self.cache_dir):
            return {'enabled': False, 'file_count': 0, 'total_size_mb': 0}
        file_count = 0
        total_size = 0
        for filename in os.listdir(self.cache_dir):
            # BUGFIX: exclude all transient files ('.adding' was
            # previously counted as a cache entry).
            if filename.endswith(self.TRANSIENT_SUFFIXES):
                continue
            file_path = os.path.join(self.cache_dir, filename)
            if os.path.isfile(file_path):
                file_count += 1
                total_size += os.path.getsize(file_path)
        return {
            'enabled': True,
            'cache_dir': self.cache_dir,
            'file_count': file_count,
            'total_size_mb': round(total_size / (1024 * 1024), 2),
            'max_size_gb': self.max_size_bytes / (1024 * 1024 * 1024) if self.max_size_bytes > 0 else 0
        }