"""OTel metrics for LLM calls.

Single source of truth for LLM-call instrumentation across all agentex code
paths — temporal+openai_agents streaming today, sync ACP and the Claude SDK
plugin in future PRs. Centralizing the instrument definitions here means
those follow-ups don't need to redefine the metric names, units, or
description strings; they import ``get_llm_metrics()`` and record values
(see the recording sketch at the end of this docstring).

The meter is no-op when the application hasn't configured a ``MeterProvider``,
so importing this module is safe for runtimes that don't use OTel. Instruments
are created lazily on the first ``get_llm_metrics()`` call, so a
``MeterProvider`` configured *after* this module is imported still binds
correctly.
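For example, an application can wire the SDK provider at startup; a minimal
sketch (the console exporter is just an illustration, any configured
``MeterProvider`` works)::

    from opentelemetry import metrics
    from opentelemetry.sdk.metrics import MeterProvider
    from opentelemetry.sdk.metrics.export import (
        ConsoleMetricExporter,
        PeriodicExportingMetricReader,
    )

    reader = PeriodicExportingMetricReader(ConsoleMetricExporter())
    metrics.set_meter_provider(MeterProvider(metric_readers=[reader]))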
| 13 | +
|
| 14 | +Cardinality is bounded: |
| 15 | +- All metrics carry only ``model`` (the LLM model name). |
| 16 | +- ``requests`` additionally carries ``status``, drawn from a small fixed set |
| 17 | + (see ``classify_status``). |
| 18 | +
|
| 19 | +Resource attributes (``service.name``, ``k8s.*``, etc.) come from the |
| 20 | +application's OTel resource configuration and are added to every series |
| 21 | +automatically. |
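
A minimal recording sketch (``model``, ``exc``, ``first_token_ms``, and
``usage`` are illustrative caller-side names, not part of this module)::

    m = get_llm_metrics()
    m.requests.add(1, {"model": model, "status": classify_status(exc)})
    m.ttft_ms.record(first_token_ms, {"model": model})
    m.input_tokens.add(usage.input_tokens, {"model": model})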
"""

from __future__ import annotations

from typing import Optional

from opentelemetry import metrics


class LLMMetrics:
    """Lazily created OTel instruments for LLM call telemetry."""

    def __init__(self) -> None:
        meter = metrics.get_meter("agentex.llm")
        self.requests = meter.create_counter(
            name="agentex.llm.requests",
            unit="1",
            description=(
                "LLM call count tagged with status (success / rate_limit / "
                "server_error / client_error / timeout / network_error / "
                "other_error). Use to alert on 429s, 5xxs, etc."
            ),
        )
        self.ttft_ms = meter.create_histogram(
            name="agentex.llm.ttft",
            unit="ms",
            description="Time from request submission to first content token (ms)",
        )
        # ttat (time-to-first-answering-token) is distinct from ttft for reasoning
        # models: ttft fires on the first reasoning chunk (which arrives quickly),
        # while ttat fires on the first user-visible answer token (text or tool
        # call). For non-reasoning models the two are equal.
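        # Illustrative numbers: a reasoning model that streams its first
        # reasoning chunk at 300 ms and its first text delta at 2100 ms
        # records ttft = 300 ms and ttat = 2100 ms; a non-reasoning model
        # records the same value for both.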
        self.ttat_ms = meter.create_histogram(
            name="agentex.llm.ttat",
            unit="ms",
            description=(
                "Time from request submission to first answering token "
                "(text or tool-call delta) — excludes reasoning chunks"
            ),
        )
        # Note: the TPS denominator is the model-generation window
        # (last_token_time - first_token_time), not total stream wall time.
        # This isolates raw model throughput from event-loop / tool-call latency.
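        # Illustrative arithmetic: 250 output tokens with first_token_time at
        # 0.4 s and last_token_time at 5.4 s gives tps = 250 / (5.4 - 0.4)
        # = 50 tokens/s.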
        self.tps = meter.create_histogram(
            name="agentex.llm.tps",
            unit="tokens/s",
            description="Output tokens per second over the generation window",
        )
        self.input_tokens = meter.create_counter(
            name="agentex.llm.input_tokens",
            unit="tokens",
            description="Total input tokens sent to the LLM",
        )
        self.output_tokens = meter.create_counter(
            name="agentex.llm.output_tokens",
            unit="tokens",
            description="Total output tokens returned by the LLM",
        )
        self.cached_input_tokens = meter.create_counter(
            name="agentex.llm.cached_input_tokens",
            unit="tokens",
            description="Subset of input tokens served from prompt cache",
        )
        self.reasoning_tokens = meter.create_counter(
            name="agentex.llm.reasoning_tokens",
            unit="tokens",
            description="Output tokens spent on reasoning (subset of output_tokens)",
        )


_llm_metrics: Optional[LLMMetrics] = None


def get_llm_metrics() -> LLMMetrics:
    """Return the LLM metrics singleton, creating it on first use."""
    global _llm_metrics
    if _llm_metrics is None:
        _llm_metrics = LLMMetrics()
    return _llm_metrics


def classify_status(exc: Optional[BaseException]) -> str:
    """Categorize an LLM call's outcome into a small fixed set of status labels.

    A successful call returns ``"success"``. Exceptions are mapped by type name
    so we don't depend on a specific provider SDK's exception class hierarchy:
    OpenAI, Anthropic, and other providers all use names like ``RateLimitError``,
    ``APITimeoutError``, ``InternalServerError``, etc.
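
    Illustrative mappings (the exception classes here are inline stand-ins,
    not any particular SDK's)::

        >>> classify_status(None)
        'success'
        >>> class RateLimitError(Exception): pass
        >>> classify_status(RateLimitError())
        'rate_limit'
        >>> classify_status(TimeoutError("deadline"))
        'timeout'
        >>> classify_status(ConnectionError("reset"))
        'network_error'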
    """
    if exc is None:
        return "success"
    name = type(exc).__name__
    if "RateLimit" in name:
        return "rate_limit"
    if "Timeout" in name:
        return "timeout"
    if any(s in name for s in ("ServerError", "InternalServer", "ServiceUnavailable", "BadGateway")):
        return "server_error"
    if "Connection" in name:
        return "network_error"
    if any(s in name for s in ("BadRequest", "Authentication", "Permission", "NotFound", "Conflict", "UnprocessableEntity")):
        return "client_error"
    return "other_error"