@@ -334,6 +334,13 @@ def __init__(
334334 self ._activity_inflight = 0
335335 self ._heartbeat_interval = float (heartbeat_interval )
336336 self ._process_started_at = time .time ()
337+ # CPU sampling baseline. The heartbeat reports an *instantaneous*
338+ # cpu_percent — CPU time burned in the interval since the previous
339+ # heartbeat, divided by that interval — rather than the lifetime
340+ # average. ``_last_cpu_sample_at`` is None until the first sample
341+ # has been taken; subsequent samples diff against it.
342+ self ._last_cpu_sample_at : float | None = None
343+ self ._last_cpu_total_seconds : float = 0.0
337344 configured_metrics = metrics if metrics is not None else getattr (client , "metrics" , NOOP_METRICS )
338345 self .metrics : MetricsRecorder = configured_metrics or NOOP_METRICS
339346 self .interceptors = tuple (interceptors )
@@ -1152,32 +1159,54 @@ def _current_process_metrics(self) -> dict[str, Any]:
11521159 import os
11531160 import socket
11541161
1162+ now = time .time ()
11551163 metrics : dict [str , Any ] = {
1156- "process_uptime_seconds" : int (time . time () - self ._process_started_at ),
1164+ "process_uptime_seconds" : int (now - self ._process_started_at ),
11571165 "process_id" : os .getpid (),
11581166 }
11591167
1168+ # ``memory_bytes`` is the *current* resident set size, not the
1169+ # lifetime peak. ``resource.getrusage().ru_maxrss`` is the high-
1170+ # water mark since the process started, which masks freed memory
1171+ # and never decreases — wrong shape for a heartbeat metric meant
1172+ # to show what the worker is using right now. On Linux we read
1173+ # the second field of ``/proc/self/statm`` (resident pages) and
1174+ # multiply by the page size. Platforms without ``/proc`` get no
1175+ # ``memory_bytes`` field rather than a misleading lifetime peak.
1176+ if sys .platform .startswith ("linux" ):
1177+ try :
1178+ with open ("/proc/self/statm" ) as statm :
1179+ fields = statm .read ().split ()
1180+ if len (fields ) >= 2 :
1181+ metrics ["memory_bytes" ] = int (fields [1 ]) * os .sysconf ("SC_PAGE_SIZE" )
1182+ except (OSError , ValueError ):
1183+ pass
1184+
1185+ # ``cpu_percent`` is the share of wall time the process spent on
1186+ # CPU during the interval since the previous heartbeat — not the
1187+ # lifetime average, which converges to a fixed value and stops
1188+ # tracking live load. The first sample bootstraps from process
1189+ # start so the very first heartbeat still has a number.
11601190 try :
11611191 import resource
11621192
11631193 usage = resource .getrusage (resource .RUSAGE_SELF )
1164- # ru_maxrss is kilobytes on Linux and bytes on macOS — normalize
1165- # to bytes. The server stores whatever is sent so the units stay
1166- # consistent across SDKs.
1167- if sys .platform == "darwin" :
1168- metrics ["memory_bytes" ] = int (usage .ru_maxrss )
1169- else :
1170- metrics ["memory_bytes" ] = int (usage .ru_maxrss ) * 1024
1171-
11721194 cpu_seconds = float (usage .ru_utime ) + float (usage .ru_stime )
1173- wall_seconds = max (0.001 , time .time () - self ._process_started_at )
1195+ if self ._last_cpu_sample_at is None :
1196+ interval = max (0.001 , now - self ._process_started_at )
1197+ delta_cpu = max (0.0 , cpu_seconds )
1198+ else :
1199+ interval = max (0.001 , now - self ._last_cpu_sample_at )
1200+ delta_cpu = max (0.0 , cpu_seconds - self ._last_cpu_total_seconds )
11741201 metrics ["cpu_percent" ] = max (
1175- 0.0 , min (100.0 , round ((cpu_seconds / wall_seconds ) * 100.0 , 2 ))
1202+ 0.0 , min (100.0 , round ((delta_cpu / interval ) * 100.0 , 2 ))
11761203 )
1204+ self ._last_cpu_sample_at = now
1205+ self ._last_cpu_total_seconds = cpu_seconds
11771206 except (ImportError , OSError ):
1178- # `resource` is POSIX-only — Windows skips getrusage but still
1179- # reports pid + uptime + host so the operator surface remains
1180- # populated.
1207+ # ``resource`` is POSIX-only — Windows skips the CPU sample
1208+ # but still reports pid + uptime + host so the operator
1209+ # surface stays populated.
11811210 pass
11821211
11831212 try :
0 commit comments