feat(tracing): 集成 OpenTelemetry 链路追踪功能

- 在 base.py 中添加文件下载、上传和 FFmpeg 执行的链路追踪
- 在 api_client.py 中实现 API 请求的链路追踪和错误标记
- 在 lease_service.py 中添加租约续期的链路追踪支持
- 在 task_executor.py 中集成任务执行的完整链路追踪
- 新增 util/tracing.py 工具模块提供统一的追踪上下文管理
- 在 .env.example 中添加 OTEL 配置选项
- 在 index.py 中初始化和关闭链路追踪功能
This commit is contained in:
2026-02-07 00:11:01 +08:00
parent c9a6133be9
commit 9b373dea34
8 changed files with 549 additions and 149 deletions

View File

@@ -15,12 +15,15 @@ import threading
from abc import ABC
from typing import Optional, List, Dict, Any, Tuple, TYPE_CHECKING
from opentelemetry.trace import SpanKind
from core.handler import TaskHandler
from domain.task import Task
from domain.result import TaskResult, ErrorCode
from domain.config import WorkerConfig
from services import storage
from services.cache import MaterialCache
from util.tracing import mark_span_error, start_span
from constant import (
HW_ACCEL_NONE, HW_ACCEL_QSV, HW_ACCEL_CUDA,
VIDEO_ENCODE_PARAMS, VIDEO_ENCODE_PARAMS_QSV, VIDEO_ENCODE_PARAMS_CUDA
@@ -410,21 +413,30 @@ class BaseHandler(TaskHandler, ABC):
if timeout is None:
timeout = self.config.download_timeout
try:
if use_cache:
# 使用缓存下载
result = self.material_cache.get_or_download(url, dest, timeout=timeout)
else:
# 直接下载(不走缓存)
result = storage.download_file(url, dest, timeout=timeout)
with start_span(
"render.task.file.download",
kind=SpanKind.CLIENT,
attributes={
"render.file.destination": dest,
"render.file.use_cache": use_cache,
},
) as span:
try:
if use_cache:
result = self.material_cache.get_or_download(url, dest, timeout=timeout)
else:
result = storage.download_file(url, dest, timeout=timeout)
if result:
file_size = os.path.getsize(dest) if os.path.exists(dest) else 0
logger.debug(f"Downloaded: {url} -> {dest} ({file_size} bytes)")
return result
except Exception as e:
logger.error(f"Download failed: {url} -> {e}")
return False
if result:
file_size = os.path.getsize(dest) if os.path.exists(dest) else 0
logger.debug(f"Downloaded: {url} -> {dest} ({file_size} bytes)")
if span is not None:
span.set_attribute("render.file.size_bytes", file_size)
return result
except Exception as e:
mark_span_error(span, str(e), ErrorCode.E_INPUT_UNAVAILABLE.value)
logger.error(f"Download failed: {url} -> {e}")
return False
def upload_file(
self,
@@ -445,37 +457,45 @@ class BaseHandler(TaskHandler, ABC):
Returns:
访问 URL,失败返回 None
"""
# 获取上传 URL
upload_info = self.api_client.get_upload_url(task_id, file_type, file_name)
if not upload_info:
logger.error(f"[task:{task_id}] Failed to get upload URL")
return None
with start_span(
"render.task.file.upload",
kind=SpanKind.CLIENT,
attributes={
"render.file.type": file_type,
"render.file.path": file_path,
},
) as span:
upload_info = self.api_client.get_upload_url(task_id, file_type, file_name)
if not upload_info:
logger.error(f"[task:{task_id}] Failed to get upload URL")
return None
upload_url = upload_info.get('uploadUrl')
access_url = upload_info.get('accessUrl')
upload_url = upload_info.get('uploadUrl')
access_url = upload_info.get('accessUrl')
if not upload_url:
logger.error(f"[task:{task_id}] Invalid upload URL response")
return None
if not upload_url:
logger.error(f"[task:{task_id}] Invalid upload URL response")
return None
# 上传文件
try:
result = storage.upload_file(upload_url, file_path, timeout=self.config.upload_timeout)
if result:
file_size = os.path.getsize(file_path)
logger.info(f"[task:{task_id}] Uploaded: {file_path} ({file_size} bytes)")
try:
result = storage.upload_file(upload_url, file_path, timeout=self.config.upload_timeout)
if result:
file_size = os.path.getsize(file_path)
logger.info(f"[task:{task_id}] Uploaded: {file_path} ({file_size} bytes)")
if span is not None:
span.set_attribute("render.file.size_bytes", file_size)
# 将上传成功的文件加入缓存
if access_url:
self.material_cache.add_to_cache(access_url, file_path)
if access_url:
self.material_cache.add_to_cache(access_url, file_path)
return access_url
return access_url
else:
logger.error(f"[task:{task_id}] Upload failed: {file_path}")
return None
except Exception as e:
logger.error(f"[task:{task_id}] Upload error: {e}")
return None
except Exception as e:
mark_span_error(span, str(e), ErrorCode.E_UPLOAD_FAILED.value)
logger.error(f"[task:{task_id}] Upload error: {e}")
return None
def run_ffmpeg(
self,
@@ -507,29 +527,42 @@ class BaseHandler(TaskHandler, ABC):
cmd_str = cmd_str[:500] + '...'
logger.info(f"[task:{task_id}] FFmpeg: {cmd_str}")
try:
run_args = subprocess_args(False)
run_args['stdout'] = subprocess.DEVNULL
run_args['stderr'] = subprocess.PIPE
result = subprocess.run(
cmd_to_run,
timeout=timeout,
**run_args
)
with start_span(
"render.task.ffmpeg.run",
attributes={
"render.ffmpeg.timeout_seconds": timeout,
"render.ffmpeg.command": cmd_str,
},
) as span:
try:
run_args = subprocess_args(False)
run_args['stdout'] = subprocess.DEVNULL
run_args['stderr'] = subprocess.PIPE
result = subprocess.run(
cmd_to_run,
timeout=timeout,
**run_args
)
if result.returncode != 0:
stderr = (result.stderr or b'').decode('utf-8', errors='replace')[:1000]
logger.error(f"[task:{task_id}] FFmpeg failed (code={result.returncode}): {stderr}")
if span is not None:
span.set_attribute("render.ffmpeg.return_code", result.returncode)
if result.returncode != 0:
stderr = (result.stderr or b'').decode('utf-8', errors='replace')[:1000]
logger.error(f"[task:{task_id}] FFmpeg failed (code={result.returncode}): {stderr}")
mark_span_error(span, stderr or "ffmpeg failed", ErrorCode.E_FFMPEG_FAILED.value)
return False
return True
except subprocess.TimeoutExpired:
logger.error(f"[task:{task_id}] FFmpeg timeout after {timeout}s")
mark_span_error(span, f"timeout after {timeout}s", ErrorCode.E_TIMEOUT.value)
return False
except Exception as e:
logger.error(f"[task:{task_id}] FFmpeg error: {e}")
mark_span_error(span, str(e), ErrorCode.E_FFMPEG_FAILED.value)
return False
return True
except subprocess.TimeoutExpired:
logger.error(f"[task:{task_id}] FFmpeg timeout after {timeout}s")
return False
except Exception as e:
logger.error(f"[task:{task_id}] FFmpeg error: {e}")
return False
def probe_duration(self, file_path: str) -> Optional[float]:
"""