feat(tracing): 集成 OpenTelemetry 链路追踪功能

- 在 base.py 中添加文件下载、上传和 FFmpeg 执行的链路追踪
- 在 api_client.py 中实现 API 请求的链路追踪和错误标记
- 在 lease_service.py 中添加租约续期的链路追踪支持
- 在 task_executor.py 中集成任务执行的完整链路追踪
- 新增 util/tracing.py 工具模块提供统一的追踪上下文管理
- 在 .env.example 中添加 OTEL 配置选项
- 在 index.py 中初始化和关闭链路追踪功能
This commit is contained in:
2026-02-07 00:11:01 +08:00
parent c9a6133be9
commit 9b373dea34
8 changed files with 549 additions and 149 deletions

View File

@@ -10,10 +10,14 @@ import subprocess
import time
import requests
from typing import Dict, List, Optional, Any
from urllib.parse import urlparse
from opentelemetry.trace import SpanKind, Status, StatusCode
from domain.task import Task
from domain.config import WorkerConfig
from util.system import get_hw_accel_info_str
from util.tracing import inject_trace_headers, mark_span_error, start_span
logger = logging.getLogger(__name__)
@@ -55,6 +59,45 @@ class APIClientV2:
'Accept': 'application/json'
})
def _request_with_trace(
    self,
    method: str,
    url: str,
    *,
    task_id: Optional[str] = None,
    span_name: str = "",
    **kwargs: Any,
) -> requests.Response:
    """Issue an HTTP request via the shared session, wrapped in a client span.

    Args:
        method: HTTP verb; upper-cased for the span attribute and lower-cased
            for the fallback span name.
        url: Absolute request URL. Its path, host and (when present) port
            are recorded as span attributes.
        task_id: Optional task identifier forwarded to ``start_span``. When
            truthy, the caller's headers are passed through
            ``inject_trace_headers`` before the request is sent.
        span_name: Explicit span name; when empty the name falls back to
            ``render.api.<method>``.
        **kwargs: Forwarded unchanged to ``self.session.request`` (apart from
            ``headers``, which is handled as described above).

    Returns:
        The response object returned by ``self.session.request``.

    Raises:
        Exception: Whatever the session raises is re-raised after the span
            has been marked with ``HTTP_REQUEST_ERROR``.
    """
    # Work on a copy so the caller's kwargs mapping is never mutated.
    call_kwargs = dict(kwargs)
    caller_headers = call_kwargs.pop("headers", None)
    if task_id:
        # Trace propagation is only performed for task-scoped requests.
        call_kwargs["headers"] = inject_trace_headers(caller_headers)
    elif caller_headers:
        call_kwargs["headers"] = caller_headers

    target = urlparse(url)
    span_attrs = {
        "http.request.method": method.upper(),
        "url.path": target.path,
        "server.address": target.hostname or "",
    }
    if target.port:
        span_attrs["server.port"] = target.port

    with start_span(
        span_name or f"render.api.{method.lower()}",
        task_id=task_id,
        kind=SpanKind.CLIENT,
        attributes=span_attrs,
    ) as span:
        try:
            response = self.session.request(method=method, url=url, **call_kwargs)
        except Exception as exc:
            mark_span_error(span, str(exc), "HTTP_REQUEST_ERROR")
            raise

        # The span may be None; in that case there is nothing to annotate.
        if span is None:
            return response

        span.set_attribute("http.response.status_code", response.status_code)
        if response.status_code >= 400:
            span.set_status(Status(StatusCode.ERROR, f"HTTP {response.status_code}"))
        return response
def sync(self, current_task_ids: List[str]) -> List[Task]:
"""
心跳同步并拉取任务
@@ -128,10 +171,13 @@ class APIClientV2:
url = f"{self.base_url}/render/v2/task/{task_id}/start"
try:
resp = self.session.post(
url,
resp = self._request_with_trace(
method="POST",
url=url,
task_id=task_id,
span_name="render.task.api.report_start",
json={'workerId': self.worker_id},
timeout=10
timeout=10,
)
if resp.status_code == 200:
logger.debug(f"[task:{task_id}] Start reported")
@@ -157,13 +203,16 @@ class APIClientV2:
url = f"{self.base_url}/render/v2/task/{task_id}/success"
try:
resp = self.session.post(
url,
resp = self._request_with_trace(
method="POST",
url=url,
task_id=task_id,
span_name="render.task.api.report_success",
json={
'workerId': self.worker_id,
'result': result
},
timeout=10
timeout=10,
)
if resp.status_code == 200:
logger.debug(f"[task:{task_id}] Success reported")
@@ -190,14 +239,17 @@ class APIClientV2:
url = f"{self.base_url}/render/v2/task/{task_id}/fail"
try:
resp = self.session.post(
url,
resp = self._request_with_trace(
method="POST",
url=url,
task_id=task_id,
span_name="render.task.api.report_fail",
json={
'workerId': self.worker_id,
'errorCode': error_code,
'errorMessage': error_message[:1000] # 限制长度
},
timeout=10
timeout=10,
)
if resp.status_code == 200:
logger.debug(f"[task:{task_id}] Failure reported")
@@ -228,7 +280,14 @@ class APIClientV2:
payload['fileName'] = file_name
try:
resp = self.session.post(url, json=payload, timeout=10)
resp = self._request_with_trace(
method="POST",
url=url,
task_id=task_id,
span_name="render.task.api.get_upload_url",
json=payload,
timeout=10,
)
if resp.status_code == 200:
data = resp.json()
if data.get('code') == 200:
@@ -256,13 +315,16 @@ class APIClientV2:
url = f"{self.base_url}/render/v2/task/{task_id}/extend-lease"
try:
resp = self.session.post(
url,
resp = self._request_with_trace(
method="POST",
url=url,
task_id=task_id,
span_name="render.task.api.extend_lease",
params={
'workerId': self.worker_id,
'extension': extension
},
timeout=10
timeout=10,
)
if resp.status_code == 200:
logger.debug(f"[task:{task_id}] Lease extended by {extension}s")
@@ -287,7 +349,13 @@ class APIClientV2:
url = f"{self.base_url}/render/v2/task/{task_id}"
try:
resp = self.session.get(url, timeout=10)
resp = self._request_with_trace(
method="GET",
url=url,
task_id=task_id,
span_name="render.task.api.get_task_info",
timeout=10,
)
if resp.status_code == 200:
data = resp.json()
if data.get('code') == 200: