You've already forked FrameTour-RenderWorker
feat(tracing): 集成 OpenTelemetry 链路追踪功能
- 在 base.py 中添加文件下载、上传和 FFmpeg 执行的链路追踪
- 在 api_client.py 中实现 API 请求的链路追踪和错误标记
- 在 lease_service.py 中添加租约续期的链路追踪支持
- 在 task_executor.py 中集成任务执行的完整链路追踪
- 新增 util/tracing.py 工具模块提供统一的追踪上下文管理
- 在 .env.example 中添加 OTEL 配置选项
- 在 index.py 中初始化和关闭链路追踪功能
This commit is contained in:
@@ -11,7 +11,6 @@ from concurrent.futures import ThreadPoolExecutor, Future
|
||||
from typing import Dict, Optional, TYPE_CHECKING
|
||||
|
||||
from domain.task import Task, TaskType
|
||||
from domain.result import TaskResult, ErrorCode
|
||||
|
||||
# 需要 GPU 加速的任务类型
|
||||
GPU_REQUIRED_TASK_TYPES = {
|
||||
@@ -22,6 +21,13 @@ from domain.config import WorkerConfig
|
||||
from core.handler import TaskHandler
|
||||
from services.lease_service import LeaseService
|
||||
from services.gpu_scheduler import GPUScheduler
|
||||
from util.tracing import (
|
||||
capture_otel_context,
|
||||
get_current_task_context,
|
||||
mark_span_error,
|
||||
start_span,
|
||||
task_trace_scope,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from services.api_client import APIClientV2
|
||||
@@ -174,77 +180,84 @@ class TaskExecutor:
|
||||
task: 任务实体
|
||||
"""
|
||||
task_id = task.task_id
|
||||
logger.info(f"[task:{task_id}] Starting {task.task_type.value}")
|
||||
|
||||
# 启动租约续期服务
|
||||
lease_service = LeaseService(
|
||||
self.api_client,
|
||||
task_id,
|
||||
interval=self.config.lease_extension_threshold,
|
||||
extension=self.config.lease_extension_duration
|
||||
)
|
||||
lease_service.start()
|
||||
|
||||
# 获取 GPU 设备(仅对需要 GPU 的任务类型)
|
||||
device_index = None
|
||||
needs_gpu = task.task_type in GPU_REQUIRED_TASK_TYPES
|
||||
if needs_gpu and self.gpu_scheduler.enabled:
|
||||
device_index = self.gpu_scheduler.acquire()
|
||||
if device_index is not None:
|
||||
logger.info(f"[task:{task_id}] Assigned to GPU device {device_index}")
|
||||
|
||||
# 获取处理器(需要在设置 GPU 设备前获取)
|
||||
handler = self.handlers.get(task.task_type)
|
||||
device_index = None
|
||||
lease_service = None
|
||||
|
||||
try:
|
||||
# 报告任务开始
|
||||
self.api_client.report_start(task_id)
|
||||
with task_trace_scope(task, span_name="render.task.execute") as task_span:
|
||||
logger.info(f"[task:{task_id}] Starting {task.task_type.value}")
|
||||
|
||||
if not handler:
|
||||
raise ValueError(f"No handler for task type: {task.task_type}")
|
||||
lease_service = LeaseService(
|
||||
self.api_client,
|
||||
task_id,
|
||||
interval=self.config.lease_extension_threshold,
|
||||
extension=self.config.lease_extension_duration,
|
||||
parent_otel_context=capture_otel_context(),
|
||||
task_trace_context=get_current_task_context(),
|
||||
)
|
||||
with start_span("render.task.lease.start"):
|
||||
lease_service.start()
|
||||
|
||||
# 设置 GPU 设备(线程本地存储)
|
||||
if device_index is not None:
|
||||
handler.set_gpu_device(device_index)
|
||||
needs_gpu = task.task_type in GPU_REQUIRED_TASK_TYPES
|
||||
if needs_gpu and self.gpu_scheduler.enabled:
|
||||
with start_span("render.task.gpu.acquire"):
|
||||
device_index = self.gpu_scheduler.acquire()
|
||||
if device_index is not None:
|
||||
logger.info(f"[task:{task_id}] Assigned to GPU device {device_index}")
|
||||
|
||||
# 执行前钩子
|
||||
handler.before_handle(task)
|
||||
try:
|
||||
with start_span("render.task.report.start"):
|
||||
self.api_client.report_start(task_id)
|
||||
|
||||
# 执行任务
|
||||
result = handler.handle(task)
|
||||
if not handler:
|
||||
raise ValueError(f"No handler for task type: {task.task_type}")
|
||||
|
||||
# 执行后钩子
|
||||
handler.after_handle(task, result)
|
||||
if device_index is not None:
|
||||
handler.set_gpu_device(device_index)
|
||||
|
||||
# 上报结果
|
||||
if result.success:
|
||||
self.api_client.report_success(task_id, result.data)
|
||||
logger.info(f"[task:{task_id}] Completed successfully")
|
||||
else:
|
||||
error_code = result.error_code.value if result.error_code else 'E_UNKNOWN'
|
||||
self.api_client.report_fail(task_id, error_code, result.error_message or '')
|
||||
logger.error(f"[task:{task_id}] Failed: {result.error_message}")
|
||||
with start_span("render.task.handler.before"):
|
||||
handler.before_handle(task)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[task:{task_id}] Exception: {e}", exc_info=True)
|
||||
self.api_client.report_fail(task_id, 'E_UNKNOWN', str(e))
|
||||
with start_span("render.task.handler.execute"):
|
||||
result = handler.handle(task)
|
||||
|
||||
finally:
|
||||
# 清除 GPU 设备设置
|
||||
if handler:
|
||||
handler.clear_gpu_device()
|
||||
with start_span("render.task.handler.after"):
|
||||
handler.after_handle(task, result)
|
||||
|
||||
# 释放 GPU 设备(仅当实际分配了设备时)
|
||||
if device_index is not None:
|
||||
self.gpu_scheduler.release(device_index)
|
||||
if result.success:
|
||||
with start_span("render.task.report.success"):
|
||||
self.api_client.report_success(task_id, result.data)
|
||||
if task_span is not None:
|
||||
task_span.set_attribute("render.task.result", "success")
|
||||
logger.info(f"[task:{task_id}] Completed successfully")
|
||||
else:
|
||||
error_code = result.error_code.value if result.error_code else 'E_UNKNOWN'
|
||||
with start_span("render.task.report.fail"):
|
||||
self.api_client.report_fail(task_id, error_code, result.error_message or '')
|
||||
mark_span_error(task_span, result.error_message or "task failed", error_code)
|
||||
logger.error(f"[task:{task_id}] Failed: {result.error_message}")
|
||||
|
||||
# 停止租约续期
|
||||
lease_service.stop()
|
||||
except Exception as e:
|
||||
mark_span_error(task_span, str(e), "E_UNKNOWN")
|
||||
logger.error(f"[task:{task_id}] Exception: {e}", exc_info=True)
|
||||
with start_span("render.task.report.exception"):
|
||||
self.api_client.report_fail(task_id, 'E_UNKNOWN', str(e))
|
||||
|
||||
# 从当前任务中移除
|
||||
with self.lock:
|
||||
self.current_tasks.pop(task_id, None)
|
||||
self.current_futures.pop(task_id, None)
|
||||
finally:
|
||||
if handler:
|
||||
handler.clear_gpu_device()
|
||||
|
||||
if device_index is not None:
|
||||
with start_span("render.task.gpu.release"):
|
||||
self.gpu_scheduler.release(device_index)
|
||||
|
||||
if lease_service is not None:
|
||||
with start_span("render.task.lease.stop"):
|
||||
lease_service.stop()
|
||||
|
||||
with self.lock:
|
||||
self.current_tasks.pop(task_id, None)
|
||||
self.current_futures.pop(task_id, None)
|
||||
|
||||
def shutdown(self, wait: bool = True):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user