You've already forked FrameTour-RenderWorker
feat(tracing): 集成 OpenTelemetry 链路追踪功能
- 在 base.py 中添加文件下载、上传和 FFmpeg 执行的链路追踪
- 在 api_client.py 中实现 API 请求的链路追踪和错误标记
- 在 lease_service.py 中添加租约续期的链路追踪支持
- 在 task_executor.py 中集成任务执行的完整链路追踪
- 新增 util/tracing.py 工具模块提供统一的追踪上下文管理
- 在 .env.example 中添加 OTEL 配置选项
- 在 index.py 中初始化和关闭链路追踪功能
This commit is contained in:
@@ -10,10 +10,14 @@ import subprocess
|
||||
import time
|
||||
import requests
|
||||
from typing import Dict, List, Optional, Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from opentelemetry.trace import SpanKind, Status, StatusCode
|
||||
|
||||
from domain.task import Task
|
||||
from domain.config import WorkerConfig
|
||||
from util.system import get_hw_accel_info_str
|
||||
from util.tracing import inject_trace_headers, mark_span_error, start_span
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -55,6 +59,45 @@ class APIClientV2:
|
||||
'Accept': 'application/json'
|
||||
})
|
||||
|
||||
def _request_with_trace(
    self,
    method: str,
    url: str,
    *,
    task_id: Optional[str] = None,
    span_name: str = "",
    **kwargs: Any,
) -> requests.Response:
    """
    Send an HTTP request through ``self.session`` inside a CLIENT span.

    Args:
        method: HTTP method name. Upper-cased for the
            ``http.request.method`` attribute and lower-cased for the
            default span name.
        url: Request URL. Only its path, hostname and (if present) port
            are recorded as span attributes.
        task_id: Task the request belongs to; forwarded to ``start_span``.
            Trace headers are injected only when this is truthy.
        span_name: Explicit span name. Falls back to
            ``render.api.<method>`` when empty.
        **kwargs: Forwarded to ``self.session.request``. A ``headers``
            entry is extracted and handled separately (see below).

    Returns:
        The response object. HTTP error statuses are not raised here;
        status codes >= 400 are only recorded on the span as an error.

    Raises:
        Exception: Whatever ``self.session.request`` raises; the span is
            marked with ``HTTP_REQUEST_ERROR`` and the exception re-raised.
    """
    # Copy so the caller's kwargs mapping is never mutated.
    request_kwargs = dict(kwargs)
    headers = request_kwargs.pop("headers", None)

    parsed_url = urlparse(url)
    attributes = {
        "http.request.method": method.upper(),
        "url.path": parsed_url.path,
        "server.address": parsed_url.hostname or "",
    }
    if parsed_url.port:
        attributes["server.port"] = parsed_url.port

    name = span_name or f"render.api.{method.lower()}"
    with start_span(name, task_id=task_id, kind=SpanKind.CLIENT, attributes=attributes) as span:
        # Inject propagation headers only after the CLIENT span exists, so
        # the outgoing context can reference this span instead of its parent.
        # NOTE(review): assumes start_span makes the new span current and
        # inject_trace_headers reads the current context - confirm in
        # util/tracing.py.
        if task_id:
            request_kwargs["headers"] = inject_trace_headers(headers)
        elif headers:
            request_kwargs["headers"] = headers

        try:
            response = self.session.request(method=method, url=url, **request_kwargs)
        except Exception as exc:
            mark_span_error(span, str(exc), "HTTP_REQUEST_ERROR")
            raise

        # start_span may yield None (guarded here exactly as in the original).
        if span is not None:
            span.set_attribute("http.response.status_code", response.status_code)
            if response.status_code >= 400:
                span.set_status(Status(StatusCode.ERROR, f"HTTP {response.status_code}"))

    return response
|
||||
|
||||
def sync(self, current_task_ids: List[str]) -> List[Task]:
|
||||
"""
|
||||
心跳同步并拉取任务
|
||||
@@ -128,10 +171,13 @@ class APIClientV2:
|
||||
url = f"{self.base_url}/render/v2/task/{task_id}/start"
|
||||
|
||||
try:
|
||||
resp = self.session.post(
|
||||
url,
|
||||
resp = self._request_with_trace(
|
||||
method="POST",
|
||||
url=url,
|
||||
task_id=task_id,
|
||||
span_name="render.task.api.report_start",
|
||||
json={'workerId': self.worker_id},
|
||||
timeout=10
|
||||
timeout=10,
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
logger.debug(f"[task:{task_id}] Start reported")
|
||||
@@ -157,13 +203,16 @@ class APIClientV2:
|
||||
url = f"{self.base_url}/render/v2/task/{task_id}/success"
|
||||
|
||||
try:
|
||||
resp = self.session.post(
|
||||
url,
|
||||
resp = self._request_with_trace(
|
||||
method="POST",
|
||||
url=url,
|
||||
task_id=task_id,
|
||||
span_name="render.task.api.report_success",
|
||||
json={
|
||||
'workerId': self.worker_id,
|
||||
'result': result
|
||||
},
|
||||
timeout=10
|
||||
timeout=10,
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
logger.debug(f"[task:{task_id}] Success reported")
|
||||
@@ -190,14 +239,17 @@ class APIClientV2:
|
||||
url = f"{self.base_url}/render/v2/task/{task_id}/fail"
|
||||
|
||||
try:
|
||||
resp = self.session.post(
|
||||
url,
|
||||
resp = self._request_with_trace(
|
||||
method="POST",
|
||||
url=url,
|
||||
task_id=task_id,
|
||||
span_name="render.task.api.report_fail",
|
||||
json={
|
||||
'workerId': self.worker_id,
|
||||
'errorCode': error_code,
|
||||
'errorMessage': error_message[:1000] # 限制长度
|
||||
},
|
||||
timeout=10
|
||||
timeout=10,
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
logger.debug(f"[task:{task_id}] Failure reported")
|
||||
@@ -228,7 +280,14 @@ class APIClientV2:
|
||||
payload['fileName'] = file_name
|
||||
|
||||
try:
|
||||
resp = self.session.post(url, json=payload, timeout=10)
|
||||
resp = self._request_with_trace(
|
||||
method="POST",
|
||||
url=url,
|
||||
task_id=task_id,
|
||||
span_name="render.task.api.get_upload_url",
|
||||
json=payload,
|
||||
timeout=10,
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
if data.get('code') == 200:
|
||||
@@ -256,13 +315,16 @@ class APIClientV2:
|
||||
url = f"{self.base_url}/render/v2/task/{task_id}/extend-lease"
|
||||
|
||||
try:
|
||||
resp = self.session.post(
|
||||
url,
|
||||
resp = self._request_with_trace(
|
||||
method="POST",
|
||||
url=url,
|
||||
task_id=task_id,
|
||||
span_name="render.task.api.extend_lease",
|
||||
params={
|
||||
'workerId': self.worker_id,
|
||||
'extension': extension
|
||||
},
|
||||
timeout=10
|
||||
timeout=10,
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
logger.debug(f"[task:{task_id}] Lease extended by {extension}s")
|
||||
@@ -287,7 +349,13 @@ class APIClientV2:
|
||||
url = f"{self.base_url}/render/v2/task/{task_id}"
|
||||
|
||||
try:
|
||||
resp = self.session.get(url, timeout=10)
|
||||
resp = self._request_with_trace(
|
||||
method="GET",
|
||||
url=url,
|
||||
task_id=task_id,
|
||||
span_name="render.task.api.get_task_info",
|
||||
timeout=10,
|
||||
)
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
if data.get('code') == 200:
|
||||
|
||||
@@ -8,10 +8,13 @@
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
from typing import TYPE_CHECKING
|
||||
from typing import TYPE_CHECKING, Any, Optional
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from services.api_client import APIClientV2
|
||||
from util.tracing import TaskTraceContext
|
||||
|
||||
from util.tracing import bind_trace_context, start_span
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -29,7 +32,9 @@ class LeaseService:
|
||||
api_client: 'APIClientV2',
|
||||
task_id: str,
|
||||
interval: int = 60,
|
||||
extension: int = 300
|
||||
extension: int = 300,
|
||||
parent_otel_context: Any = None,
|
||||
task_trace_context: Optional['TaskTraceContext'] = None,
|
||||
):
|
||||
"""
|
||||
初始化租约服务
|
||||
@@ -44,6 +49,8 @@ class LeaseService:
|
||||
self.task_id = task_id
|
||||
self.interval = interval
|
||||
self.extension = extension
|
||||
self.parent_otel_context = parent_otel_context
|
||||
self.task_trace_context = task_trace_context
|
||||
self.running = False
|
||||
self.thread: threading.Thread = None
|
||||
self._stop_event = threading.Event()
|
||||
@@ -79,25 +86,29 @@ class LeaseService:
|
||||
|
||||
def _run(self):
    """Main loop of the lease-renewal thread.

    Repeats until ``self.running`` is false or the stop event is set:
    wait ``self.interval`` seconds, then call ``_extend_lease``.
    """
    # The whole loop runs under the trace context captured by the creator of
    # this service.
    # NOTE(review): presumably this makes spans created in this thread
    # children of the task span - confirm in util/tracing.py.
    with bind_trace_context(self.parent_otel_context, self.task_trace_context):
        while self.running:
            # Event.wait returns True when the stop event was set before the
            # timeout elapsed, i.e. a stop was requested.
            if self._stop_event.wait(timeout=self.interval):
                break

            # Re-check: running may have been cleared while we were waiting.
            if self.running:
                self._extend_lease()
|
||||
|
||||
def _extend_lease(self):
    """Perform one lease extension inside a ``render.task.lease.extend`` span.

    Best-effort: a falsy result from the API client and any exception it
    raises are both logged as warnings and never propagated, so the renewal
    loop keeps running.
    """
    with start_span(
        "render.task.lease.extend",
        task_id=self.task_id,
        attributes={"render.lease.extension_seconds": self.extension},
    ):
        try:
            success = self.api_client.extend_lease(self.task_id, self.extension)
            if success:
                logger.debug(f"[task:{self.task_id}] Lease extended by {self.extension}s")
            else:
                logger.warning(f"[task:{self.task_id}] Failed to extend lease")
        except Exception as e:
            # Deliberately swallowed: one failed renewal must not kill the
            # renewal thread.
            logger.warning(f"[task:{self.task_id}] Lease extension error: {e}")
|
||||
|
||||
def __enter__(self):
|
||||
"""上下文管理器入口"""
|
||||
|
||||
@@ -151,6 +151,10 @@ def _upload_with_rclone(url: str, file_path: str) -> bool:
|
||||
if new_url == url:
|
||||
return False
|
||||
|
||||
if new_url.startswith(("http://", "https://")):
|
||||
logger.warning(f"rclone upload skipped: URL still starts with http after replace: {new_url}")
|
||||
return False
|
||||
|
||||
cmd = [
|
||||
"rclone",
|
||||
"copyto",
|
||||
|
||||
@@ -11,7 +11,6 @@ from concurrent.futures import ThreadPoolExecutor, Future
|
||||
from typing import Dict, Optional, TYPE_CHECKING
|
||||
|
||||
from domain.task import Task, TaskType
|
||||
from domain.result import TaskResult, ErrorCode
|
||||
|
||||
# 需要 GPU 加速的任务类型
|
||||
GPU_REQUIRED_TASK_TYPES = {
|
||||
@@ -22,6 +21,13 @@ from domain.config import WorkerConfig
|
||||
from core.handler import TaskHandler
|
||||
from services.lease_service import LeaseService
|
||||
from services.gpu_scheduler import GPUScheduler
|
||||
from util.tracing import (
|
||||
capture_otel_context,
|
||||
get_current_task_context,
|
||||
mark_span_error,
|
||||
start_span,
|
||||
task_trace_scope,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from services.api_client import APIClientV2
|
||||
@@ -174,77 +180,84 @@ class TaskExecutor:
|
||||
task: 任务实体
|
||||
"""
|
||||
task_id = task.task_id
|
||||
logger.info(f"[task:{task_id}] Starting {task.task_type.value}")
|
||||
|
||||
# 启动租约续期服务
|
||||
lease_service = LeaseService(
|
||||
self.api_client,
|
||||
task_id,
|
||||
interval=self.config.lease_extension_threshold,
|
||||
extension=self.config.lease_extension_duration
|
||||
)
|
||||
lease_service.start()
|
||||
|
||||
# 获取 GPU 设备(仅对需要 GPU 的任务类型)
|
||||
device_index = None
|
||||
needs_gpu = task.task_type in GPU_REQUIRED_TASK_TYPES
|
||||
if needs_gpu and self.gpu_scheduler.enabled:
|
||||
device_index = self.gpu_scheduler.acquire()
|
||||
if device_index is not None:
|
||||
logger.info(f"[task:{task_id}] Assigned to GPU device {device_index}")
|
||||
|
||||
# 获取处理器(需要在设置 GPU 设备前获取)
|
||||
handler = self.handlers.get(task.task_type)
|
||||
device_index = None
|
||||
lease_service = None
|
||||
|
||||
try:
|
||||
# 报告任务开始
|
||||
self.api_client.report_start(task_id)
|
||||
with task_trace_scope(task, span_name="render.task.execute") as task_span:
|
||||
logger.info(f"[task:{task_id}] Starting {task.task_type.value}")
|
||||
|
||||
if not handler:
|
||||
raise ValueError(f"No handler for task type: {task.task_type}")
|
||||
lease_service = LeaseService(
|
||||
self.api_client,
|
||||
task_id,
|
||||
interval=self.config.lease_extension_threshold,
|
||||
extension=self.config.lease_extension_duration,
|
||||
parent_otel_context=capture_otel_context(),
|
||||
task_trace_context=get_current_task_context(),
|
||||
)
|
||||
with start_span("render.task.lease.start"):
|
||||
lease_service.start()
|
||||
|
||||
# 设置 GPU 设备(线程本地存储)
|
||||
if device_index is not None:
|
||||
handler.set_gpu_device(device_index)
|
||||
needs_gpu = task.task_type in GPU_REQUIRED_TASK_TYPES
|
||||
if needs_gpu and self.gpu_scheduler.enabled:
|
||||
with start_span("render.task.gpu.acquire"):
|
||||
device_index = self.gpu_scheduler.acquire()
|
||||
if device_index is not None:
|
||||
logger.info(f"[task:{task_id}] Assigned to GPU device {device_index}")
|
||||
|
||||
# 执行前钩子
|
||||
handler.before_handle(task)
|
||||
try:
|
||||
with start_span("render.task.report.start"):
|
||||
self.api_client.report_start(task_id)
|
||||
|
||||
# 执行任务
|
||||
result = handler.handle(task)
|
||||
if not handler:
|
||||
raise ValueError(f"No handler for task type: {task.task_type}")
|
||||
|
||||
# 执行后钩子
|
||||
handler.after_handle(task, result)
|
||||
if device_index is not None:
|
||||
handler.set_gpu_device(device_index)
|
||||
|
||||
# 上报结果
|
||||
if result.success:
|
||||
self.api_client.report_success(task_id, result.data)
|
||||
logger.info(f"[task:{task_id}] Completed successfully")
|
||||
else:
|
||||
error_code = result.error_code.value if result.error_code else 'E_UNKNOWN'
|
||||
self.api_client.report_fail(task_id, error_code, result.error_message or '')
|
||||
logger.error(f"[task:{task_id}] Failed: {result.error_message}")
|
||||
with start_span("render.task.handler.before"):
|
||||
handler.before_handle(task)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[task:{task_id}] Exception: {e}", exc_info=True)
|
||||
self.api_client.report_fail(task_id, 'E_UNKNOWN', str(e))
|
||||
with start_span("render.task.handler.execute"):
|
||||
result = handler.handle(task)
|
||||
|
||||
finally:
|
||||
# 清除 GPU 设备设置
|
||||
if handler:
|
||||
handler.clear_gpu_device()
|
||||
with start_span("render.task.handler.after"):
|
||||
handler.after_handle(task, result)
|
||||
|
||||
# 释放 GPU 设备(仅当实际分配了设备时)
|
||||
if device_index is not None:
|
||||
self.gpu_scheduler.release(device_index)
|
||||
if result.success:
|
||||
with start_span("render.task.report.success"):
|
||||
self.api_client.report_success(task_id, result.data)
|
||||
if task_span is not None:
|
||||
task_span.set_attribute("render.task.result", "success")
|
||||
logger.info(f"[task:{task_id}] Completed successfully")
|
||||
else:
|
||||
error_code = result.error_code.value if result.error_code else 'E_UNKNOWN'
|
||||
with start_span("render.task.report.fail"):
|
||||
self.api_client.report_fail(task_id, error_code, result.error_message or '')
|
||||
mark_span_error(task_span, result.error_message or "task failed", error_code)
|
||||
logger.error(f"[task:{task_id}] Failed: {result.error_message}")
|
||||
|
||||
# 停止租约续期
|
||||
lease_service.stop()
|
||||
except Exception as e:
|
||||
mark_span_error(task_span, str(e), "E_UNKNOWN")
|
||||
logger.error(f"[task:{task_id}] Exception: {e}", exc_info=True)
|
||||
with start_span("render.task.report.exception"):
|
||||
self.api_client.report_fail(task_id, 'E_UNKNOWN', str(e))
|
||||
|
||||
# 从当前任务中移除
|
||||
with self.lock:
|
||||
self.current_tasks.pop(task_id, None)
|
||||
self.current_futures.pop(task_id, None)
|
||||
finally:
|
||||
if handler:
|
||||
handler.clear_gpu_device()
|
||||
|
||||
if device_index is not None:
|
||||
with start_span("render.task.gpu.release"):
|
||||
self.gpu_scheduler.release(device_index)
|
||||
|
||||
if lease_service is not None:
|
||||
with start_span("render.task.lease.stop"):
|
||||
lease_service.stop()
|
||||
|
||||
with self.lock:
|
||||
self.current_tasks.pop(task_id, None)
|
||||
self.current_futures.pop(task_id, None)
|
||||
|
||||
def shutdown(self, wait: bool = True):
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user