feat(tracing): 集成 OpenTelemetry 链路追踪功能

- 在 base.py 中添加文件下载、上传和 FFmpeg 执行的链路追踪
- 在 api_client.py 中实现 API 请求的链路追踪和错误标记
- 在 lease_service.py 中添加租约续期的链路追踪支持
- 在 task_executor.py 中集成任务执行的完整链路追踪
- 新增 util/tracing.py 工具模块提供统一的追踪上下文管理
- 在 .env.example 中添加 OTEL 配置选项
- 在 index.py 中初始化和关闭链路追踪功能
This commit is contained in:
2026-02-07 00:11:01 +08:00
parent c9a6133be9
commit 9b373dea34
8 changed files with 549 additions and 149 deletions

View File

@@ -10,10 +10,14 @@ import subprocess
import time
import requests
from typing import Dict, List, Optional, Any
from urllib.parse import urlparse
from opentelemetry.trace import SpanKind, Status, StatusCode
from domain.task import Task
from domain.config import WorkerConfig
from util.system import get_hw_accel_info_str
from util.tracing import inject_trace_headers, mark_span_error, start_span
logger = logging.getLogger(__name__)
@@ -55,6 +59,45 @@ class APIClientV2:
'Accept': 'application/json'
})
def _request_with_trace(
    self,
    method: str,
    url: str,
    *,
    task_id: Optional[str] = None,
    span_name: str = "",
    **kwargs: Any,
) -> requests.Response:
    """
    Send an HTTP request through ``self.session`` inside a CLIENT span.

    Args:
        method: HTTP verb; upper-cased for the span attribute and
            lower-cased for the fallback span name.
        url: Absolute request URL; its path, host and port become span
            attributes.
        task_id: Task identifier forwarded to ``start_span``. Trace headers
            are injected into the request only when this is truthy.
        span_name: Explicit span name; when empty the name falls back to
            ``render.api.<method>``.
        **kwargs: Passed through to ``self.session.request`` unchanged,
            except ``headers`` which is handled separately (see below).

    Returns:
        The response returned by ``self.session.request``.

    Raises:
        Exception: Whatever ``self.session.request`` raises is re-raised
            after the span has been marked as failed.
    """
    # Work on a copy so the caller's kwargs are never mutated.
    call_kwargs = {key: value for key, value in kwargs.items() if key != "headers"}
    caller_headers = kwargs.get("headers")
    if task_id:
        # presumably returns the headers enriched with trace propagation
        # fields -- verify against util.tracing.inject_trace_headers
        call_kwargs["headers"] = inject_trace_headers(caller_headers)
    elif caller_headers:
        # No task: forward non-empty headers untouched; empty/None headers
        # are dropped entirely, as no "headers" key is set.
        call_kwargs["headers"] = caller_headers

    target = urlparse(url)
    span_attributes = {
        "http.request.method": method.upper(),
        "url.path": target.path,
        "server.address": target.hostname or "",
    }
    port = target.port
    if port:
        # Only recorded when the URL carries an explicit port.
        span_attributes["server.port"] = port

    resolved_name = span_name if span_name else f"render.api.{method.lower()}"
    with start_span(
        resolved_name,
        task_id=task_id,
        kind=SpanKind.CLIENT,
        attributes=span_attributes,
    ) as span:
        try:
            response = self.session.request(method=method, url=url, **call_kwargs)
        except Exception as exc:
            mark_span_error(span, str(exc), "HTTP_REQUEST_ERROR")
            raise

        # start_span may yield None, in which case there is nothing to
        # annotate and the response is returned as-is.
        if span is None:
            return response

        status = response.status_code
        span.set_attribute("http.response.status_code", status)
        if status >= 400:
            # HTTP-level failures do not raise, so flag them on the span.
            span.set_status(Status(StatusCode.ERROR, f"HTTP {status}"))
        return response
def sync(self, current_task_ids: List[str]) -> List[Task]:
"""
心跳同步并拉取任务
@@ -128,10 +171,13 @@ class APIClientV2:
url = f"{self.base_url}/render/v2/task/{task_id}/start"
try:
resp = self.session.post(
url,
resp = self._request_with_trace(
method="POST",
url=url,
task_id=task_id,
span_name="render.task.api.report_start",
json={'workerId': self.worker_id},
timeout=10
timeout=10,
)
if resp.status_code == 200:
logger.debug(f"[task:{task_id}] Start reported")
@@ -157,13 +203,16 @@ class APIClientV2:
url = f"{self.base_url}/render/v2/task/{task_id}/success"
try:
resp = self.session.post(
url,
resp = self._request_with_trace(
method="POST",
url=url,
task_id=task_id,
span_name="render.task.api.report_success",
json={
'workerId': self.worker_id,
'result': result
},
timeout=10
timeout=10,
)
if resp.status_code == 200:
logger.debug(f"[task:{task_id}] Success reported")
@@ -190,14 +239,17 @@ class APIClientV2:
url = f"{self.base_url}/render/v2/task/{task_id}/fail"
try:
resp = self.session.post(
url,
resp = self._request_with_trace(
method="POST",
url=url,
task_id=task_id,
span_name="render.task.api.report_fail",
json={
'workerId': self.worker_id,
'errorCode': error_code,
'errorMessage': error_message[:1000] # 限制长度
},
timeout=10
timeout=10,
)
if resp.status_code == 200:
logger.debug(f"[task:{task_id}] Failure reported")
@@ -228,7 +280,14 @@ class APIClientV2:
payload['fileName'] = file_name
try:
resp = self.session.post(url, json=payload, timeout=10)
resp = self._request_with_trace(
method="POST",
url=url,
task_id=task_id,
span_name="render.task.api.get_upload_url",
json=payload,
timeout=10,
)
if resp.status_code == 200:
data = resp.json()
if data.get('code') == 200:
@@ -256,13 +315,16 @@ class APIClientV2:
url = f"{self.base_url}/render/v2/task/{task_id}/extend-lease"
try:
resp = self.session.post(
url,
resp = self._request_with_trace(
method="POST",
url=url,
task_id=task_id,
span_name="render.task.api.extend_lease",
params={
'workerId': self.worker_id,
'extension': extension
},
timeout=10
timeout=10,
)
if resp.status_code == 200:
logger.debug(f"[task:{task_id}] Lease extended by {extension}s")
@@ -287,7 +349,13 @@ class APIClientV2:
url = f"{self.base_url}/render/v2/task/{task_id}"
try:
resp = self.session.get(url, timeout=10)
resp = self._request_with_trace(
method="GET",
url=url,
task_id=task_id,
span_name="render.task.api.get_task_info",
timeout=10,
)
if resp.status_code == 200:
data = resp.json()
if data.get('code') == 200:

View File

@@ -8,10 +8,13 @@
import logging
import threading
import time
from typing import TYPE_CHECKING
from typing import TYPE_CHECKING, Any, Optional
if TYPE_CHECKING:
from services.api_client import APIClientV2
from util.tracing import TaskTraceContext
from util.tracing import bind_trace_context, start_span
logger = logging.getLogger(__name__)
@@ -29,7 +32,9 @@ class LeaseService:
api_client: 'APIClientV2',
task_id: str,
interval: int = 60,
extension: int = 300
extension: int = 300,
parent_otel_context: Any = None,
task_trace_context: Optional['TaskTraceContext'] = None,
):
"""
初始化租约服务
@@ -44,6 +49,8 @@ class LeaseService:
self.task_id = task_id
self.interval = interval
self.extension = extension
self.parent_otel_context = parent_otel_context
self.task_trace_context = task_trace_context
self.running = False
self.thread: threading.Thread = None
self._stop_event = threading.Event()
@@ -79,25 +86,29 @@ class LeaseService:
def _run(self):
"""续期线程主循环"""
while self.running:
# 等待指定间隔或收到停止信号
if self._stop_event.wait(timeout=self.interval):
# 收到停止信号
break
with bind_trace_context(self.parent_otel_context, self.task_trace_context):
while self.running:
if self._stop_event.wait(timeout=self.interval):
break
if self.running:
self._extend_lease()
if self.running:
self._extend_lease()
def _extend_lease(self):
"""执行租约续期"""
try:
success = self.api_client.extend_lease(self.task_id, self.extension)
if success:
logger.debug(f"[task:{self.task_id}] Lease extended by {self.extension}s")
else:
logger.warning(f"[task:{self.task_id}] Failed to extend lease")
except Exception as e:
logger.warning(f"[task:{self.task_id}] Lease extension error: {e}")
with start_span(
"render.task.lease.extend",
task_id=self.task_id,
attributes={"render.lease.extension_seconds": self.extension},
):
try:
success = self.api_client.extend_lease(self.task_id, self.extension)
if success:
logger.debug(f"[task:{self.task_id}] Lease extended by {self.extension}s")
else:
logger.warning(f"[task:{self.task_id}] Failed to extend lease")
except Exception as e:
logger.warning(f"[task:{self.task_id}] Lease extension error: {e}")
def __enter__(self):
"""上下文管理器入口"""

View File

@@ -151,6 +151,10 @@ def _upload_with_rclone(url: str, file_path: str) -> bool:
if new_url == url:
return False
if new_url.startswith(("http://", "https://")):
logger.warning(f"rclone upload skipped: URL still starts with http after replace: {new_url}")
return False
cmd = [
"rclone",
"copyto",

View File

@@ -11,7 +11,6 @@ from concurrent.futures import ThreadPoolExecutor, Future
from typing import Dict, Optional, TYPE_CHECKING
from domain.task import Task, TaskType
from domain.result import TaskResult, ErrorCode
# 需要 GPU 加速的任务类型
GPU_REQUIRED_TASK_TYPES = {
@@ -22,6 +21,13 @@ from domain.config import WorkerConfig
from core.handler import TaskHandler
from services.lease_service import LeaseService
from services.gpu_scheduler import GPUScheduler
from util.tracing import (
capture_otel_context,
get_current_task_context,
mark_span_error,
start_span,
task_trace_scope,
)
if TYPE_CHECKING:
from services.api_client import APIClientV2
@@ -174,77 +180,84 @@ class TaskExecutor:
task: 任务实体
"""
task_id = task.task_id
logger.info(f"[task:{task_id}] Starting {task.task_type.value}")
# 启动租约续期服务
lease_service = LeaseService(
self.api_client,
task_id,
interval=self.config.lease_extension_threshold,
extension=self.config.lease_extension_duration
)
lease_service.start()
# 获取 GPU 设备(仅对需要 GPU 的任务类型)
device_index = None
needs_gpu = task.task_type in GPU_REQUIRED_TASK_TYPES
if needs_gpu and self.gpu_scheduler.enabled:
device_index = self.gpu_scheduler.acquire()
if device_index is not None:
logger.info(f"[task:{task_id}] Assigned to GPU device {device_index}")
# 获取处理器(需要在设置 GPU 设备前获取)
handler = self.handlers.get(task.task_type)
device_index = None
lease_service = None
try:
# 报告任务开始
self.api_client.report_start(task_id)
with task_trace_scope(task, span_name="render.task.execute") as task_span:
logger.info(f"[task:{task_id}] Starting {task.task_type.value}")
if not handler:
raise ValueError(f"No handler for task type: {task.task_type}")
lease_service = LeaseService(
self.api_client,
task_id,
interval=self.config.lease_extension_threshold,
extension=self.config.lease_extension_duration,
parent_otel_context=capture_otel_context(),
task_trace_context=get_current_task_context(),
)
with start_span("render.task.lease.start"):
lease_service.start()
# 设置 GPU 设备(线程本地存储)
if device_index is not None:
handler.set_gpu_device(device_index)
needs_gpu = task.task_type in GPU_REQUIRED_TASK_TYPES
if needs_gpu and self.gpu_scheduler.enabled:
with start_span("render.task.gpu.acquire"):
device_index = self.gpu_scheduler.acquire()
if device_index is not None:
logger.info(f"[task:{task_id}] Assigned to GPU device {device_index}")
# 执行前钩子
handler.before_handle(task)
try:
with start_span("render.task.report.start"):
self.api_client.report_start(task_id)
# 执行任务
result = handler.handle(task)
if not handler:
raise ValueError(f"No handler for task type: {task.task_type}")
# 执行后钩子
handler.after_handle(task, result)
if device_index is not None:
handler.set_gpu_device(device_index)
# 上报结果
if result.success:
self.api_client.report_success(task_id, result.data)
logger.info(f"[task:{task_id}] Completed successfully")
else:
error_code = result.error_code.value if result.error_code else 'E_UNKNOWN'
self.api_client.report_fail(task_id, error_code, result.error_message or '')
logger.error(f"[task:{task_id}] Failed: {result.error_message}")
with start_span("render.task.handler.before"):
handler.before_handle(task)
except Exception as e:
logger.error(f"[task:{task_id}] Exception: {e}", exc_info=True)
self.api_client.report_fail(task_id, 'E_UNKNOWN', str(e))
with start_span("render.task.handler.execute"):
result = handler.handle(task)
finally:
# 清除 GPU 设备设置
if handler:
handler.clear_gpu_device()
with start_span("render.task.handler.after"):
handler.after_handle(task, result)
# 释放 GPU 设备(仅当实际分配了设备时)
if device_index is not None:
self.gpu_scheduler.release(device_index)
if result.success:
with start_span("render.task.report.success"):
self.api_client.report_success(task_id, result.data)
if task_span is not None:
task_span.set_attribute("render.task.result", "success")
logger.info(f"[task:{task_id}] Completed successfully")
else:
error_code = result.error_code.value if result.error_code else 'E_UNKNOWN'
with start_span("render.task.report.fail"):
self.api_client.report_fail(task_id, error_code, result.error_message or '')
mark_span_error(task_span, result.error_message or "task failed", error_code)
logger.error(f"[task:{task_id}] Failed: {result.error_message}")
# 停止租约续期
lease_service.stop()
except Exception as e:
mark_span_error(task_span, str(e), "E_UNKNOWN")
logger.error(f"[task:{task_id}] Exception: {e}", exc_info=True)
with start_span("render.task.report.exception"):
self.api_client.report_fail(task_id, 'E_UNKNOWN', str(e))
# 从当前任务中移除
with self.lock:
self.current_tasks.pop(task_id, None)
self.current_futures.pop(task_id, None)
finally:
if handler:
handler.clear_gpu_device()
if device_index is not None:
with start_span("render.task.gpu.release"):
self.gpu_scheduler.release(device_index)
if lease_service is not None:
with start_span("render.task.lease.stop"):
lease_service.stop()
with self.lock:
self.current_tasks.pop(task_id, None)
self.current_futures.pop(task_id, None)
def shutdown(self, wait: bool = True):
"""