feat(tracing): 集成 OpenTelemetry 链路追踪功能

- 在 base.py 中添加文件下载、上传和 FFmpeg 执行的链路追踪
- 在 api_client.py 中实现 API 请求的链路追踪和错误标记
- 在 lease_service.py 中添加租约续期的链路追踪支持
- 在 task_executor.py 中集成任务执行的完整链路追踪
- 新增 util/tracing.py 工具模块提供统一的追踪上下文管理
- 在 .env.example 中添加 OTEL 配置选项
- 在 index.py 中初始化和关闭链路追踪功能
This commit is contained in:
2026-02-07 00:11:01 +08:00
parent c9a6133be9
commit 9b373dea34
8 changed files with 549 additions and 149 deletions

View File

@@ -11,7 +11,6 @@ from concurrent.futures import ThreadPoolExecutor, Future
from typing import Dict, Optional, TYPE_CHECKING
from domain.task import Task, TaskType
from domain.result import TaskResult, ErrorCode
# 需要 GPU 加速的任务类型
GPU_REQUIRED_TASK_TYPES = {
@@ -22,6 +21,13 @@ from domain.config import WorkerConfig
from core.handler import TaskHandler
from services.lease_service import LeaseService
from services.gpu_scheduler import GPUScheduler
from util.tracing import (
capture_otel_context,
get_current_task_context,
mark_span_error,
start_span,
task_trace_scope,
)
if TYPE_CHECKING:
from services.api_client import APIClientV2
@@ -174,77 +180,84 @@ class TaskExecutor:
task: 任务实体
"""
task_id = task.task_id
logger.info(f"[task:{task_id}] Starting {task.task_type.value}")
# 启动租约续期服务
lease_service = LeaseService(
self.api_client,
task_id,
interval=self.config.lease_extension_threshold,
extension=self.config.lease_extension_duration
)
lease_service.start()
# 获取 GPU 设备(仅对需要 GPU 的任务类型)
device_index = None
needs_gpu = task.task_type in GPU_REQUIRED_TASK_TYPES
if needs_gpu and self.gpu_scheduler.enabled:
device_index = self.gpu_scheduler.acquire()
if device_index is not None:
logger.info(f"[task:{task_id}] Assigned to GPU device {device_index}")
# 获取处理器(需要在设置 GPU 设备前获取)
handler = self.handlers.get(task.task_type)
device_index = None
lease_service = None
try:
# 报告任务开始
self.api_client.report_start(task_id)
with task_trace_scope(task, span_name="render.task.execute") as task_span:
logger.info(f"[task:{task_id}] Starting {task.task_type.value}")
if not handler:
raise ValueError(f"No handler for task type: {task.task_type}")
lease_service = LeaseService(
self.api_client,
task_id,
interval=self.config.lease_extension_threshold,
extension=self.config.lease_extension_duration,
parent_otel_context=capture_otel_context(),
task_trace_context=get_current_task_context(),
)
with start_span("render.task.lease.start"):
lease_service.start()
# 设置 GPU 设备(线程本地存储)
if device_index is not None:
handler.set_gpu_device(device_index)
needs_gpu = task.task_type in GPU_REQUIRED_TASK_TYPES
if needs_gpu and self.gpu_scheduler.enabled:
with start_span("render.task.gpu.acquire"):
device_index = self.gpu_scheduler.acquire()
if device_index is not None:
logger.info(f"[task:{task_id}] Assigned to GPU device {device_index}")
# 执行前钩子
handler.before_handle(task)
try:
with start_span("render.task.report.start"):
self.api_client.report_start(task_id)
# 执行任务
result = handler.handle(task)
if not handler:
raise ValueError(f"No handler for task type: {task.task_type}")
# 执行后钩子
handler.after_handle(task, result)
if device_index is not None:
handler.set_gpu_device(device_index)
# 上报结果
if result.success:
self.api_client.report_success(task_id, result.data)
logger.info(f"[task:{task_id}] Completed successfully")
else:
error_code = result.error_code.value if result.error_code else 'E_UNKNOWN'
self.api_client.report_fail(task_id, error_code, result.error_message or '')
logger.error(f"[task:{task_id}] Failed: {result.error_message}")
with start_span("render.task.handler.before"):
handler.before_handle(task)
except Exception as e:
logger.error(f"[task:{task_id}] Exception: {e}", exc_info=True)
self.api_client.report_fail(task_id, 'E_UNKNOWN', str(e))
with start_span("render.task.handler.execute"):
result = handler.handle(task)
finally:
# 清除 GPU 设备设置
if handler:
handler.clear_gpu_device()
with start_span("render.task.handler.after"):
handler.after_handle(task, result)
# 释放 GPU 设备(仅当实际分配了设备时)
if device_index is not None:
self.gpu_scheduler.release(device_index)
if result.success:
with start_span("render.task.report.success"):
self.api_client.report_success(task_id, result.data)
if task_span is not None:
task_span.set_attribute("render.task.result", "success")
logger.info(f"[task:{task_id}] Completed successfully")
else:
error_code = result.error_code.value if result.error_code else 'E_UNKNOWN'
with start_span("render.task.report.fail"):
self.api_client.report_fail(task_id, error_code, result.error_message or '')
mark_span_error(task_span, result.error_message or "task failed", error_code)
logger.error(f"[task:{task_id}] Failed: {result.error_message}")
# 停止租约续期
lease_service.stop()
except Exception as e:
mark_span_error(task_span, str(e), "E_UNKNOWN")
logger.error(f"[task:{task_id}] Exception: {e}", exc_info=True)
with start_span("render.task.report.exception"):
self.api_client.report_fail(task_id, 'E_UNKNOWN', str(e))
# 从当前任务中移除
with self.lock:
self.current_tasks.pop(task_id, None)
self.current_futures.pop(task_id, None)
finally:
if handler:
handler.clear_gpu_device()
if device_index is not None:
with start_span("render.task.gpu.release"):
self.gpu_scheduler.release(device_index)
if lease_service is not None:
with start_span("render.task.lease.stop"):
lease_service.stop()
with self.lock:
self.current_tasks.pop(task_id, None)
self.current_futures.pop(task_id, None)
def shutdown(self, wait: bool = True):
"""