You've already forked FrameTour-RenderWorker
feat(cache): 添加素材缓存功能以避免重复下载
- 新增素材缓存配置选项包括启用状态、缓存目录和最大缓存大小 - 实现 MaterialCache 类提供缓存存储和检索功能 - 修改 download_file 方法支持缓存下载模式 - 添加缓存清理机制使用 LRU 策略管理磁盘空间 - 配置默认值优化本地开发体验 - 实现缓存统计和监控功能
This commit is contained in:
291
services/cache.py
Normal file
291
services/cache.py
Normal file
@@ -0,0 +1,291 @@
|
||||
# -*- coding: utf-8 -*-
"""
Material cache service.

Caches downloaded material files so that identical materials are not
downloaded repeatedly.
"""
|
||||
|
||||
import os
|
||||
import hashlib
|
||||
import logging
|
||||
import shutil
|
||||
import time
|
||||
from typing import Optional, Tuple
|
||||
from urllib.parse import urlparse, unquote
|
||||
|
||||
from services import storage
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _extract_cache_key(url: str) -> str:
|
||||
"""
|
||||
从 URL 提取缓存键
|
||||
|
||||
去除签名等查询参数,保留路径作为唯一标识。
|
||||
|
||||
Args:
|
||||
url: 完整的素材 URL
|
||||
|
||||
Returns:
|
||||
缓存键(URL 路径的 MD5 哈希)
|
||||
"""
|
||||
parsed = urlparse(url)
|
||||
# 使用 scheme + host + path 作为唯一标识(忽略签名等查询参数)
|
||||
cache_key_source = f"{parsed.scheme}://{parsed.netloc}{unquote(parsed.path)}"
|
||||
return hashlib.md5(cache_key_source.encode('utf-8')).hexdigest()
|
||||
|
||||
|
||||
def _get_file_extension(url: str) -> str:
|
||||
"""
|
||||
从 URL 提取文件扩展名
|
||||
|
||||
Args:
|
||||
url: 素材 URL
|
||||
|
||||
Returns:
|
||||
文件扩展名(如 .mp4, .png),无法识别时返回空字符串
|
||||
"""
|
||||
parsed = urlparse(url)
|
||||
path = unquote(parsed.path)
|
||||
_, ext = os.path.splitext(path)
|
||||
return ext.lower() if ext else ''
|
||||
|
||||
|
||||
class MaterialCache:
    """
    Material cache manager.

    Stores downloaded material files on disk, keyed by their URL (minus
    query parameters), and serves later requests for the same material
    from the cache. Optionally enforces an LRU-based size limit.
    """

    def __init__(self, cache_dir: str, enabled: bool = True, max_size_gb: float = 0):
        """
        Initialize the cache manager.

        Args:
            cache_dir: Cache directory path.
            enabled: Whether caching is enabled.
            max_size_gb: Maximum cache size in GB; 0 means unlimited.
        """
        self.cache_dir = cache_dir
        self.enabled = enabled
        # 0 means "no limit"; otherwise convert GB to bytes once up front.
        self.max_size_bytes = int(max_size_gb * 1024 * 1024 * 1024) if max_size_gb > 0 else 0

        if self.enabled:
            os.makedirs(self.cache_dir, exist_ok=True)
            logger.info("Material cache initialized: %s", cache_dir)

    def get_cache_path(self, url: str) -> str:
        """
        Return the cache file path for a material URL.

        Args:
            url: Material URL.

        Returns:
            Full path of the cache file (key hash + original extension).
        """
        cache_key = _extract_cache_key(url)
        ext = _get_file_extension(url)
        return os.path.join(self.cache_dir, f"{cache_key}{ext}")

    def is_cached(self, url: str) -> Tuple[bool, str]:
        """
        Check whether a material is already cached.

        Args:
            url: Material URL.

        Returns:
            (is_cached, cache_file_path); an empty/zero-byte file does
            not count as cached.
        """
        if not self.enabled:
            return False, ''

        cache_path = self.get_cache_path(url)
        exists = os.path.exists(cache_path) and os.path.getsize(cache_path) > 0
        return exists, cache_path

    def get_or_download(
        self,
        url: str,
        dest: str,
        timeout: int = 300,
        max_retries: int = 5
    ) -> bool:
        """
        Fetch a material from the cache, downloading and caching it on a miss.

        Args:
            url: Material URL.
            dest: Destination file path (inside the task's working directory).
            timeout: Download timeout in seconds.
            max_retries: Maximum number of download retries.

        Returns:
            True on success, False otherwise.
        """
        # Make sure the destination directory exists.
        dest_dir = os.path.dirname(dest)
        if dest_dir:
            os.makedirs(dest_dir, exist_ok=True)

        # With caching disabled, download straight to the destination.
        if not self.enabled:
            return storage.download_file(url, dest, max_retries=max_retries, timeout=timeout)

        cached, cache_path = self.is_cached(url)

        if cached:
            # Cache hit: copy the cached file to the destination.
            try:
                shutil.copy2(cache_path, dest)
                # Refresh the access time so LRU cleanup sees recent use.
                os.utime(cache_path, None)
                file_size = os.path.getsize(dest)
                logger.info("Cache hit: %s... -> %s (%s bytes)", url[:80], dest, file_size)
                return True
            except Exception as e:
                logger.warning("Failed to copy from cache: %s, will re-download", e)
                # The cached file may be corrupt; drop it and fall through
                # to a fresh download.
                try:
                    os.remove(cache_path)
                except Exception:
                    pass

        # Cache miss: download into the cache directory first.
        logger.debug("Cache miss: %s...", url[:80])

        # Download to a temporary file so partially written data never
        # appears under the final cache name.
        temp_cache_path = cache_path + '.downloading'
        try:
            if not storage.download_file(url, temp_cache_path, max_retries=max_retries, timeout=timeout):
                # Download failed; clean up the temporary file.
                if os.path.exists(temp_cache_path):
                    os.remove(temp_cache_path)
                return False

            # Atomically promote the finished download to its final cache
            # path. os.replace overwrites any stale entry in one step,
            # avoiding the window in which a remove-then-rename sequence
            # would leave no cache file at all (and the rename failure on
            # platforms where rename does not overwrite).
            os.replace(temp_cache_path, cache_path)

            # Copy the cached file to the destination.
            shutil.copy2(cache_path, dest)
            file_size = os.path.getsize(dest)
            logger.info("Downloaded and cached: %s... (%s bytes)", url[:80], file_size)

            # Enforce the size limit if one is configured.
            if self.max_size_bytes > 0:
                self._cleanup_if_needed()

            return True

        except Exception as e:
            logger.error("Cache download error: %s", e)
            # Clean up the temporary file.
            if os.path.exists(temp_cache_path):
                try:
                    os.remove(temp_cache_path)
                except Exception:
                    pass
            return False

    def _scan_cache_files(self) -> list:
        """
        Collect info for every finished cache file.

        Returns:
            List of dicts with 'path', 'size' and 'atime' keys; files
            still being downloaded (.downloading suffix) are skipped.
        """
        entries = []
        for filename in os.listdir(self.cache_dir):
            # Skip in-progress downloads.
            if filename.endswith('.downloading'):
                continue
            file_path = os.path.join(self.cache_dir, filename)
            if os.path.isfile(file_path):
                stat = os.stat(file_path)
                entries.append({
                    'path': file_path,
                    'size': stat.st_size,
                    'atime': stat.st_atime
                })
        return entries

    def _cleanup_if_needed(self) -> None:
        """
        Trim the cache using an LRU policy.

        When the total size exceeds the configured limit, the least
        recently accessed files are deleted until usage drops to 80% of
        the limit.
        """
        if self.max_size_bytes <= 0:
            return

        try:
            cache_files = self._scan_cache_files()
            total_size = sum(f['size'] for f in cache_files)

            # Below the limit: nothing to do.
            if total_size <= self.max_size_bytes:
                return

            # Oldest access time first, so LRU entries are removed first.
            cache_files.sort(key=lambda f: f['atime'])

            # Delete down to 80% of the limit, leaving headroom before the
            # next cleanup triggers.
            target_size = int(self.max_size_bytes * 0.8)
            deleted_count = 0

            for file_info in cache_files:
                if total_size <= target_size:
                    break
                try:
                    os.remove(file_info['path'])
                    total_size -= file_info['size']
                    deleted_count += 1
                except Exception as e:
                    logger.warning("Failed to delete cache file: %s", e)

            if deleted_count > 0:
                logger.info(
                    "Cache cleanup: deleted %d files, current size: %.2f GB",
                    deleted_count, total_size / (1024 * 1024 * 1024))

        except Exception as e:
            logger.warning("Cache cleanup error: %s", e)

    def clear(self) -> None:
        """Delete every cached file and recreate an empty cache directory."""
        if not self.enabled:
            return

        try:
            if os.path.exists(self.cache_dir):
                shutil.rmtree(self.cache_dir)
            os.makedirs(self.cache_dir, exist_ok=True)
            logger.info("Cache cleared")
        except Exception as e:
            logger.error("Failed to clear cache: %s", e)

    def get_stats(self) -> dict:
        """
        Return cache statistics.

        Returns:
            Dict with the enabled flag, cache directory, file count,
            total size in MB, and the configured limit in GB (0 means
            unlimited).
        """
        if not self.enabled or not os.path.exists(self.cache_dir):
            return {'enabled': False, 'file_count': 0, 'total_size_mb': 0}

        entries = self._scan_cache_files()
        total_size = sum(e['size'] for e in entries)

        return {
            'enabled': True,
            'cache_dir': self.cache_dir,
            'file_count': len(entries),
            'total_size_mb': round(total_size / (1024 * 1024), 2),
            'max_size_gb': self.max_size_bytes / (1024 * 1024 * 1024) if self.max_size_bytes > 0 else 0
        }
|
||||
Reference in New Issue
Block a user