From 54669d2ef2b986824575918dae63ad159a838f7b Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 28 May 2026 10:41:40 -0500 Subject: [PATCH 1/5] removed NS branch dependency --- .../plugins/inband/amdsmi/amdsmi_analyzer.py | 61 ++- .../plugins/inband/amdsmi/amdsmi_collector.py | 88 +++- .../plugins/inband/amdsmi/amdsmidata.py | 157 ++++-- .../plugins/inband/amdsmi/analyzer_args.py | 7 + test/unit/plugin/test_amdsmi_analyzer.py | 64 ++- test/unit/plugin/test_amdsmi_data.py | 446 ++++++++++++++++++ 6 files changed, 759 insertions(+), 64 deletions(-) create mode 100644 test/unit/plugin/test_amdsmi_data.py diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py index 9a9cea71..b831d1c0 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -50,6 +50,59 @@ class AmdSmiAnalyzer(CperAnalysisTaskMixin, DataAnalyzer[AmdSmiDataModel, None]) DATA_MODEL = AmdSmiDataModel + def check_expected_power_management( + self, + amdsmi_metric_data: list[AmdSmiMetric], + expected_power_management: str, + ) -> None: + """Check amd-smi metric ``power_management`` matches the expected value per GPU. + + Args: + amdsmi_metric_data: Per-GPU metric data from ``amd-smi metric``. + expected_power_management: Expected value (e.g. ``DISABLED``). + """ + expected = expected_power_management.strip().upper() + mismatches: dict[int, str] = {} + missing: list[int] = [] + + for metric in amdsmi_metric_data: + gpu = metric.gpu + actual_raw = metric.power.power_management + if actual_raw is None: + missing.append(gpu) + continue + actual_str = str(actual_raw).strip() + if actual_str.upper() in {"N/A", "NA", ""}: + missing.append(gpu) + continue + if actual_str.upper() != expected: + mismatches[gpu] = actual_str + + if missing: + self._log_event( + category=EventCategory.PLATFORM, + description="GPU power_management value not available", + priority=EventPriority.WARNING, + data={ + "gpus": missing, + "expected_power_management": expected_power_management, + }, + console_log=True, + ) + + if mismatches: + self._log_event( + category=EventCategory.PLATFORM, + description="GPU power_management mismatch", + priority=EventPriority.ERROR, + data={ + "gpus": list(mismatches.keys()), + "power_management_values": mismatches, + "expected_power_management": expected_power_management, + }, + console_log=True, + ) + def check_expected_max_power( self, amdsmi_static_data: list[AmdSmiStatic], @@ -63,7 +116,9 @@ def check_expected_max_power( """ incorrect_max_power_gpus: dict[int, Union[int, str, float]] = {} for gpu in amdsmi_static_data: - if gpu.limit is None or gpu.limit.max_power is None: + limit = gpu.limit + max_power_vu = limit.resolved_max_power() if limit is not None else None + if max_power_vu is None or max_power_vu.value is None: self._log_event( category=EventCategory.PLATFORM, description=f"GPU: {gpu.gpu} has no max power limit set", @@ -71,7 +126,7 @@ def check_expected_max_power( data={"gpu": gpu.gpu}, ) continue - max_power_value = gpu.limit.max_power.value + max_power_value = max_power_vu.value try: max_power_float = float(max_power_value) except ValueError: @@ -733,6 +788,8 @@ def analyze_data( args = AmdSmiAnalyzerArgs() if data.metric is not None and len(data.metric) > 0: + if args.expected_power_management: + self.check_expected_power_management(data.metric, args.expected_power_management) if args.l0_to_recovery_count_error_threshold is not None: self.check_amdsmi_metric_pcie( data.metric, diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index d4f22c46..b54a1860 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -60,6 +60,7 @@ StaticClockData, StaticDriver, StaticFrequencyLevels, + StaticLimit, StaticNuma, StaticPolicy, StaticRas, @@ -519,11 +520,17 @@ def _get_amdsmi_version(self) -> Optional[AmdSmiVersion]: return None try: + amdgpu_ver = version_data.get("amdgpu_version") or version_data.get("amdgpu") + hsmp_ver = version_data.get("amd_hsmp_driver_version") or version_data.get( + "amd_hsmp_version" + ) return AmdSmiVersion( tool="amdsmi", version=version_data.get("amdsmi_library_version", ""), amdsmi_library_version=version_data.get("amdsmi_library_version", ""), rocm_version=version_data.get("rocm_version", ""), + amdgpu_version=str(amdgpu_ver) if amdgpu_ver not in (None, "") else None, + amd_hsmp_driver_version=str(hsmp_ver) if hsmp_ver not in (None, "") else None, ) except ValidationError as err: self._log_event( @@ -902,6 +909,7 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: ) # Driver + os_kernel = driver.get("os_kernel_version") if driver else None driver_model = StaticDriver( name=self._normalize( driver.get("driver_name") if driver else None, default="unknown" @@ -909,6 +917,9 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: version=self._normalize( driver.get("driver_version") if driver else None, default="unknown" ), + os_kernel_version=( + None if os_kernel in (None, "", "N/A") else str(os_kernel).strip() + ), ) # VBIOS @@ -964,6 +975,8 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: # Cache info cache_info_model = self._parse_cache_info(cache) + limit_model = self._parse_limit(item.get("limit")) + # Clock clock_dict_model = self._parse_clock_dict(clock) @@ -974,7 +987,7 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: asic=asic_model, bus=bus_model, vbios=vbios_model, - limit=None, + limit=limit_model, driver=driver_model, board=board_model, ras=ras_model, @@ -999,6 +1012,35 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: return out + def _parse_limit(self, data: Optional[object]) -> Optional[StaticLimit]: + """Parse static power/limit block (legacy flat or ROCm 7+ ``ppt0``/``ppt1``).""" + if data is None or not isinstance(data, dict) or not data: + return None + try: + return StaticLimit.model_validate(data) + except ValidationError as err: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to build StaticLimit; skipping limit data", + data={"errors": err.errors(include_url=False)}, + priority=EventPriority.WARNING, + ) + return None + + def _parse_current_level(self, data: dict) -> Optional[int]: + """Extract current DPM level index from static clock JSON.""" + cur_raw = data.get("current") + if cur_raw is None: + cur_raw = data.get("current level") + if isinstance(cur_raw, (int, float)): + return int(cur_raw) + if isinstance(cur_raw, str) and cur_raw.strip() and cur_raw.upper() != "N/A": + try: + return int(cur_raw.strip()) + except ValueError: + return None + return None + def _parse_soc_pstate(self, data: dict) -> Optional[StaticSocPstate]: """Parse SOC P-state data @@ -1241,6 +1283,22 @@ def _parse_clock(self, data: dict) -> Optional[StaticClockData]: if not isinstance(data, dict): return None + current = self._parse_current_level(data) + freq_levels_raw = data.get("frequency_levels") + if isinstance(freq_levels_raw, dict) and freq_levels_raw: + try: + levels = StaticFrequencyLevels.model_validate(freq_levels_raw) + return StaticClockData.model_validate( + {"frequency_levels": levels, "current level": current} + ) + except ValidationError as err: + self._log_event( + category=EventCategory.APPLICATION, + description="Failed to parse static clock frequency_levels", + data={"errors": err.errors(include_url=False)}, + priority=EventPriority.WARNING, + ) + freqs_raw = data.get("frequency") if not isinstance(freqs_raw, list) or not freqs_raw: return None @@ -1272,18 +1330,6 @@ def _fmt(n: Optional[int]) -> Optional[str]: level1: Optional[str] = _fmt(freqs_mhz[1]) if len(freqs_mhz) > 1 else None level2: Optional[str] = _fmt(freqs_mhz[2]) if len(freqs_mhz) > 2 else None - cur_raw = data.get("current") - current: Optional[int] - if isinstance(cur_raw, (int, float)): - current = int(cur_raw) - elif isinstance(cur_raw, str) and cur_raw.strip() and cur_raw.upper() != "N/A": - try: - current = int(cur_raw.strip()) - except Exception: - current = None - else: - current = None - try: levels = StaticFrequencyLevels.model_validate( {"Level 0": level0, "Level 1": level1, "Level 2": level2} @@ -1310,9 +1356,19 @@ def _parse_clock_dict(self, data: dict) -> Optional[dict[str, Union[StaticClockD clock_dict: dict[str, Union[StaticClockData, None]] = {} - clock_data = self._parse_clock(data) - if clock_data: - clock_dict["clk"] = clock_data + for key, val in data.items(): + if val is None or (isinstance(val, str) and val.strip().upper() in {"N/A", "NA", ""}): + clock_dict[str(key)] = None + continue + if isinstance(val, dict): + parsed = self._parse_clock(val) + if parsed is not None: + clock_dict[str(key)] = parsed + + if not clock_dict and (data.get("frequency") or data.get("frequency_levels")): + parsed = self._parse_clock(data) + if parsed is not None: + clock_dict["clk"] = parsed return clock_dict if clock_dict else None diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 940047ba..de14afe9 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -43,6 +43,43 @@ _NUM_UNIT_RE = re.compile(r"^\s*([-+]?\d+(?:\.\d+)?)(?:\s*([A-Za-z%/][A-Za-z0-9%/._-]*))?\s*$") +def _value_unit_is_na(x: Any) -> bool: + return x is None or (isinstance(x, str) and x.strip().upper() in {"N/A", "NA", ""}) + + +def _coerce_value_unit_raw(v: Any) -> Any: + """Normalize raw amd-smi input into ``ValueUnit``-compatible data for validation.""" + if _value_unit_is_na(v): + return None + + if isinstance(v, dict): + val = v.get("value") + unit = v.get("unit", "") + if _value_unit_is_na(val): + return None + if isinstance(val, str): + m = _NUM_UNIT_RE.match(val.strip()) + if m and not unit: + num, u = m.groups() + unit = u or unit or "" + val = float(num) if "." in num else int(num) + return {"value": val, "unit": unit} + + if isinstance(v, (int, float)): + return {"value": v, "unit": ""} + + if isinstance(v, str): + s = v.strip() + m = _NUM_UNIT_RE.match(s) + if m: + num, unit = m.groups() + val = float(num) if "." in num else int(num) + return {"value": val, "unit": unit or ""} + return {"value": s, "unit": ""} + + return v + + def na_to_none(values: Union[int, str]): if values == "N/A": return None @@ -124,40 +161,7 @@ class ValueUnit(BaseModel): @model_validator(mode="before") @classmethod def _coerce(cls, v): - # treat N/A as None - def na(x) -> bool: - return x is None or (isinstance(x, str) and x.strip().upper() in {"N/A", "NA", ""}) - - if na(v): - return None - - if isinstance(v, dict): - val = v.get("value") - unit = v.get("unit", "") - if na(val): - return None - if isinstance(val, str): - m = _NUM_UNIT_RE.match(val.strip()) - if m and not unit: - num, u = m.groups() - unit = u or unit or "" - val = float(num) if "." in num else int(num) - return {"value": val, "unit": unit} - - # numbers - if isinstance(v, (int, float)): - return {"value": v, "unit": ""} - - if isinstance(v, str): - s = v.strip() - m = _NUM_UNIT_RE.match(s) - if m: - num, unit = m.groups() - val = float(num) if "." in num else int(num) - return {"value": val, "unit": unit or ""} - return {"value": s, "unit": ""} - - return v + return _coerce_value_unit_raw(v) @field_validator("unit") @classmethod @@ -165,6 +169,20 @@ def _clean_unit(cls, u): return "" if u is None else str(u).strip() +def coerce_value_unit_input(v: Any) -> Any: + """Normalize raw amd-smi values into ``ValueUnit``-compatible input. + + Use as a ``mode='before'`` field validator on ``ValueUnit`` / ``Optional[ValueUnit]`` fields. + Accepts ``ValueUnit``, dict, number, or strings such as ``"138 MHz"`` / ``"N/A"``. + Returns ``None`` when the value is not present or is N/A. + """ + if v is None: + return None + if isinstance(v, ValueUnit): + return v + return _coerce_value_unit_raw(v) + + # Process class ProcessMemoryUsage(BaseModel): gtt_mem: Optional[ValueUnit] @@ -313,6 +331,17 @@ class StaticVbios(BaseModel): version: str +class StaticPowerLimit(AmdSmiBaseModel): + """Per-PPT power limits (ROCm 7+ static --limit JSON).""" + + max_power_limit: Optional[ValueUnit] = None + min_power_limit: Optional[ValueUnit] = None + socket_power_limit: Optional[ValueUnit] = None + na_validator = field_validator( + "max_power_limit", "min_power_limit", "socket_power_limit", mode="before" + )(na_to_none) + + class StaticLimit(AmdSmiBaseModel): max_power: Optional[ValueUnit] = None min_power: Optional[ValueUnit] = None @@ -323,6 +352,10 @@ class StaticLimit(AmdSmiBaseModel): shutdown_edge_temperature: Optional[ValueUnit] = None shutdown_hotspot_temperature: Optional[ValueUnit] = None shutdown_vram_temperature: Optional[ValueUnit] = None + ppt0: Optional[StaticPowerLimit] = None + ppt1: Optional[StaticPowerLimit] = None + ptl_state: Optional[str] = None + ptl_format: Optional[str] = None na_validator = field_validator( "max_power", "min_power", @@ -333,13 +366,26 @@ class StaticLimit(AmdSmiBaseModel): "shutdown_edge_temperature", "shutdown_hotspot_temperature", "shutdown_vram_temperature", + "ppt0", + "ppt1", + "ptl_state", + "ptl_format", mode="before", )(na_to_none) + def resolved_max_power(self) -> Optional[ValueUnit]: + """Return max power cap from legacy flat fields or ``ppt0`` (ROCm 7+).""" + if self.max_power is not None: + return self.max_power + if self.ppt0 is not None and self.ppt0.max_power_limit is not None: + return self.ppt0.max_power_limit + return None + class StaticDriver(BaseModel): name: str version: str + os_kernel_version: Optional[str] = None class StaticBoard(BaseModel): @@ -418,14 +464,21 @@ class StaticCacheInfoItem(AmdSmiBaseModel): na_validator = field_validator("cache_size", mode="before")(na_to_none) -class StaticFrequencyLevels(BaseModel): +class StaticFrequencyLevels(AmdSmiBaseModel): + """Static clock frequency levels; each level is normalized to ``ValueUnit``.""" + model_config = ConfigDict( populate_by_name=True, + extra="forbid", ) - Level_0: str = Field(..., alias="Level 0") - Level_1: Optional[str] = Field(default=None, alias="Level 1") - Level_2: Optional[str] = Field(default=None, alias="Level 2") + Level_0: ValueUnit = Field(..., alias="Level 0") + Level_1: Optional[ValueUnit] = Field(default=None, alias="Level 1") + Level_2: Optional[ValueUnit] = Field(default=None, alias="Level 2") + + _level_value_unit = field_validator("Level_0", "Level_1", "Level_2", mode="before")( + coerce_value_unit_input + ) class StaticClockData(BaseModel): @@ -514,6 +567,7 @@ class MetricPower(BaseModel): gfx_voltage: Optional[ValueUnit] soc_voltage: Optional[ValueUnit] mem_voltage: Optional[ValueUnit] + ubb_power: Optional[ValueUnit] = None throttle_status: Optional[str] power_management: Optional[str] na_validator = field_validator( @@ -521,10 +575,12 @@ class MetricPower(BaseModel): "gfx_voltage", "soc_voltage", "mem_voltage", + "ubb_power", "throttle_status", "power_management", mode="before", )(na_to_none) + _ubb_power_vu = field_validator("ubb_power", mode="before")(coerce_value_unit_input) class MetricClockData(BaseModel): @@ -557,7 +613,12 @@ class MetricPcie(BaseModel): current_bandwidth_sent: Optional[int] current_bandwidth_received: Optional[int] max_packet_size: Optional[int] - lc_perf_other_end_recovery: Optional[int] + lc_perf_other_end_recovery_count: Optional[int] = Field( + default=None, + validation_alias=AliasChoices( + "lc_perf_other_end_recovery_count", "lc_perf_other_end_recovery" + ), + ) na_validator = field_validator( "width", "speed", @@ -570,7 +631,7 @@ class MetricPcie(BaseModel): "current_bandwidth_sent", "current_bandwidth_received", "max_packet_size", - "lc_perf_other_end_recovery", + "lc_perf_other_end_recovery_count", mode="before", )(na_to_none) @@ -774,10 +835,12 @@ class EccData(BaseModel): class AmdSmiMetric(BaseModel): + model_config = ConfigDict(extra="ignore") + gpu: int - usage: MetricUsage + usage: Union[MetricUsage, str] power: MetricPower - clock: dict[str, MetricClockData] + clock: dict[str, Union[MetricClockData, dict]] temperature: MetricTemperature pcie: MetricPcie ecc: MetricEccTotals @@ -789,8 +852,11 @@ class AmdSmiMetric(BaseModel): energy: Optional[MetricEnergy] mem_usage: MetricMemUsage throttle: MetricThrottle + gpuboard: Optional[Union[dict[str, Any], str]] = None + baseboard: Optional[Union[dict[str, Any], str]] = None na_validator = field_validator("xgmi_err", "perf_level", mode="before")(na_to_none) + _board_dict_na = field_validator("gpuboard", "baseboard", mode="before")(na_to_none_dict) @field_validator("ecc_blocks", mode="before") @classmethod @@ -1050,10 +1116,13 @@ def ref_max_power_w(self) -> Optional[int]: """First available max power limit (W) from static data, lowest GPU index first.""" for gpu in self._sorted_static_gpus(): lim = gpu.limit - if lim is None or lim.max_power is None or lim.max_power.value is None: + if lim is None: + continue + max_power = lim.resolved_max_power() + if max_power is None or max_power.value is None: continue try: - return int(float(lim.max_power.value)) + return int(float(max_power.value)) except (TypeError, ValueError): continue return None diff --git a/nodescraper/plugins/inband/amdsmi/analyzer_args.py b/nodescraper/plugins/inband/amdsmi/analyzer_args.py index 3a5d2cfb..cd08bd49 100644 --- a/nodescraper/plugins/inband/amdsmi/analyzer_args.py +++ b/nodescraper/plugins/inband/amdsmi/analyzer_args.py @@ -43,6 +43,13 @@ class AmdSmiAnalyzerArgs(AnalyzerArgs): expected_max_power: Optional[int] = Field( default=None, description="Expected maximum power value (e.g. watts)." ) + expected_power_management: Optional[str] = Field( + default=None, + description=( + "Expected amd-smi metric power_management value per GPU " + "(e.g. DISABLED for active/full power, ENABLED for power-managed idle)." + ), + ) expected_driver_version: Optional[str] = Field( default=None, description="Expected AMD driver version string." ) diff --git a/test/unit/plugin/test_amdsmi_analyzer.py b/test/unit/plugin/test_amdsmi_analyzer.py index f3966c97..a33f5013 100644 --- a/test/unit/plugin/test_amdsmi_analyzer.py +++ b/test/unit/plugin/test_amdsmi_analyzer.py @@ -286,6 +286,30 @@ def test_check_expected_max_power_missing(mock_analyzer): assert "has no max power limit set" in analyzer.result.events[0].description +def test_check_expected_max_power_ppt0(mock_analyzer): + """Test check_expected_max_power uses ppt0.max_power_limit on ROCm 7+ static JSON.""" + from nodescraper.plugins.inband.amdsmi.amdsmidata import StaticPowerLimit + + analyzer = mock_analyzer + gpu = create_static_gpu(0, max_power=550.0) + gpu.limit = StaticLimit( + ppt0=StaticPowerLimit( + max_power_limit=ValueUnit(value=750, unit="W"), + min_power_limit=ValueUnit(value=0, unit="W"), + socket_power_limit=ValueUnit(value=600, unit="W"), + ) + ) + gpu.limit.max_power = None + + analyzer.check_expected_max_power([gpu], 750) + assert len(analyzer.result.events) == 0 + + analyzer.result = analyzer._init_result() + analyzer.check_expected_max_power([gpu], 550) + assert len(analyzer.result.events) == 1 + assert "Max power mismatch" in analyzer.result.events[0].description + + def test_check_expected_driver_version_success(mock_analyzer): """Test check_expected_driver_version passes when all GPUs have correct driver.""" analyzer = mock_analyzer @@ -816,7 +840,7 @@ def test_analyze_data_no_static_data(mock_analyzer): "current_bandwidth_sent", "current_bandwidth_received", "max_packet_size", - "lc_perf_other_end_recovery", + "lc_perf_other_end_recovery_count", ] _ECC_TOTALS_KEYS = [ "total_correctable_count", @@ -850,6 +874,7 @@ def _minimal_amdsmi_metric( pcie: Optional[MetricPcie] = None, ecc: Optional[MetricEccTotals] = None, ecc_blocks: Optional[dict] = None, + power_management: Optional[str] = None, ) -> AmdSmiMetric: """Build minimal AmdSmiMetric for PCIe/ECC tests with all required fields present.""" pcie_dict = pcie.model_dump() if pcie is not None else {k: None for k in _PCIE_KEYS} @@ -873,7 +898,7 @@ def _minimal_amdsmi_metric( "soc_voltage": None, "mem_voltage": None, "throttle_status": None, - "power_management": None, + "power_management": power_management, }, "clock": {}, "temperature": {"edge": None, "hotspot": None, "mem": None}, @@ -901,6 +926,41 @@ def _minimal_amdsmi_metric( ) +def test_check_expected_power_management_success(mock_analyzer): + """All GPUs with DISABLED power_management pass.""" + analyzer = mock_analyzer + metrics = [ + _minimal_amdsmi_metric(0, power_management="DISABLED"), + _minimal_amdsmi_metric(1, power_management="disabled"), + ] + analyzer.check_expected_power_management(metrics, "DISABLED") + assert len(analyzer.result.events) == 0 + + +def test_check_expected_power_management_mismatch(mock_analyzer): + """ENABLED power_management fails when DISABLED is expected.""" + analyzer = mock_analyzer + metrics = [ + _minimal_amdsmi_metric(0, power_management="DISABLED"), + _minimal_amdsmi_metric(1, power_management="ENABLED"), + ] + analyzer.check_expected_power_management(metrics, "DISABLED") + assert len(analyzer.result.events) == 1 + assert analyzer.result.events[0].priority == EventPriority.ERROR + assert "power_management mismatch" in analyzer.result.events[0].description + assert analyzer.result.events[0].data["gpus"] == [1] + + +def test_check_expected_power_management_missing(mock_analyzer): + """Missing power_management logs warning.""" + analyzer = mock_analyzer + metrics = [_minimal_amdsmi_metric(0, power_management=None)] + analyzer.check_expected_power_management(metrics, "DISABLED") + assert len(analyzer.result.events) == 1 + assert analyzer.result.events[0].priority == EventPriority.WARNING + assert "not available" in analyzer.result.events[0].description + + def test_check_amdsmi_metric_pcie_width_fail(mock_analyzer): """PCIe width not x16 generates error.""" analyzer = mock_analyzer diff --git a/test/unit/plugin/test_amdsmi_data.py b/test/unit/plugin/test_amdsmi_data.py new file mode 100644 index 00000000..85323b8b --- /dev/null +++ b/test/unit/plugin/test_amdsmi_data.py @@ -0,0 +1,446 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +"""Unit tests for amd-smi pydantic models (ROCm 7.13 / legacy JSON shapes).""" + +from typing import Any, Optional + +import pytest + +from nodescraper.plugins.inband.amdsmi.amdsmidata import ( + AmdSmiDataModel, + AmdSmiMetric, + MetricPcie, + MetricPower, + StaticClockData, + StaticFrequencyLevels, + StaticLimit, + ValueUnit, +) + +DUMMY_STATIC_ASIC: dict[str, Any] = { + "market_name": "AMD Instinct MI350X", + "vendor_id": "0x1002", + "vendor_name": "Advanced Micro Devices Inc. [AMD/ATI]", + "subvendor_id": "0x1002", + "device_id": "0x75a0", + "subsystem_id": "0x75a0", + "rev_id": "0x00", + "asic_serial": "0xDUMMYASIC00000001", + "oam_id": 6, + "num_compute_units": 256, + "target_graphics_version": "gfx950", +} + +DUMMY_STATIC_BUS: dict[str, Any] = { + "bdf": "0000:05:00.0", + "max_pcie_width": 16, + "max_pcie_speed": {"value": 32, "unit": "GT/s"}, + "pcie_interface_version": "Gen 5", + "slot_type": "OAM", +} + +DUMMY_STATIC_BOARD: dict[str, str] = { + "model_number": "DUMMY-MI350X-OAM", + "product_serial": "DUMMY-SN-000001", + "fru_id": "DUMMY-FRU", + "product_name": "AMD Instinct MI350X OAM", + "manufacturer_name": "AMD", +} + +DUMMY_STATIC_DRIVER: dict[str, Optional[str]] = { + "name": "amdgpu", + "version": "6.12.12", + "os_kernel_version": "6.8.0-dummy", +} + +DUMMY_STATIC_RAS: dict[str, Any] = { + "eeprom_version": "N/A", + "parity_schema": "DISABLED", + "single_bit_schema": "ENABLED", + "double_bit_schema": "ENABLED", + "poison_schema": "ENABLED", + "ecc_block_state": {}, +} + +DUMMY_STATIC_VRAM: dict[str, Any] = { + "type": "HBM3", + "vendor": "Micron", + "size": {"value": 192, "unit": "GB"}, + "bit_width": {"value": 4096, "unit": "bit"}, +} + +DUMMY_LIMIT_LEGACY: dict[str, Any] = { + "max_power": {"value": 550, "unit": "W"}, + "min_power": {"value": 0, "unit": "W"}, + "socket_power": {"value": 0, "unit": "W"}, +} + +DUMMY_LIMIT_PPT0: dict[str, Any] = { + "ppt0": { + "max_power_limit": {"value": 750, "unit": "W"}, + "min_power_limit": {"value": 0, "unit": "W"}, + "socket_power_limit": {"value": 600, "unit": "W"}, + }, + "ppt1": { + "max_power_limit": "N/A", + "min_power_limit": "N/A", + "socket_power_limit": "N/A", + }, +} + +DUMMY_CLOCK_LEGACY_FREQUENCY: dict[str, Any] = { + "frequency": [500_000_000, 900_000_000, 1_300_000_000], + "current": 0, +} + +DUMMY_CLOCK_FREQUENCY_LEVELS: dict[str, Any] = { + "GFX_0": { + "frequency_levels": { + "Level 0": {"value": 500, "unit": "MHz"}, + "Level 1": "900 MHz", + "Level 2": "1300 MHz", + }, + "current level": 0, + } +} + +DUMMY_METRIC_USAGE: dict[str, Any] = { + "gfx_activity": {"value": 0, "unit": "%"}, + "umc_activity": {"value": 0, "unit": "%"}, + "mm_activity": "N/A", + "vcn_activity": ["N/A", "N/A", "N/A", "N/A"], + "jpeg_activity": ["N/A"] * 12, + "gfx_busy_inst": None, + "jpeg_busy": None, + "vcn_busy": None, +} + +DUMMY_METRIC_POWER: dict[str, Any] = { + "socket_power": {"value": 140, "unit": "W"}, + "gfx_voltage": "N/A", + "soc_voltage": "N/A", + "mem_voltage": "N/A", + "ubb_power": {"value": 1200, "unit": "W"}, + "throttle_status": "UNTHROTTLED", + "power_management": "DISABLED", +} + +DUMMY_METRIC_PCIE: dict[str, Any] = { + "width": 16, + "speed": {"value": 32, "unit": "GT/s"}, + "bandwidth": {"value": 512, "unit": "Gb/s"}, + "replay_count": 0, + "l0_to_recovery_count": 0, + "replay_roll_over_count": 0, + "nak_sent_count": 0, + "nak_received_count": 0, + "current_bandwidth_sent": 0, + "current_bandwidth_received": 0, + "max_packet_size": 512, + "lc_perf_other_end_recovery_count": 1, +} + +DUMMY_METRIC_TEMPERATURE: dict[str, Any] = { + "edge": "N/A", + "hotspot": {"value": 37, "unit": "C"}, + "mem": {"value": 42, "unit": "C"}, +} + +DUMMY_METRIC_GPUBOARD: dict[str, Any] = { + "temperature": { + "NODE_RETIMER_X": 43, + "NODE_OAM_X_IBC": 53, + } +} + +DUMMY_METRIC_BASEBOARD: dict[str, Any] = { + "temperature": { + "UBB_FRONT": 55, + "UBB_FPGA": 78, + } +} + +_PCIE_KEYS = list(DUMMY_METRIC_PCIE.keys()) + + +def _deep_update(base: dict[str, Any], overrides: dict[str, Any]) -> dict[str, Any]: + """Shallow merge with one level of nested dict merge.""" + out = base.copy() + for key, val in overrides.items(): + if isinstance(val, dict) and isinstance(out.get(key), dict): + nested = out[key].copy() + nested.update(val) + out[key] = nested + else: + out[key] = val + return out + + +def dummy_static_gpu_dict( + *, + gpu: int = 0, + limit: Optional[dict[str, Any]] = None, + clock: Optional[dict[str, Any]] = None, + **overrides: Any, +) -> dict[str, Any]: + """Build a minimal static GPU dict for ``AmdSmiStatic.model_validate``.""" + base: dict[str, Any] = { + "gpu": gpu, + "asic": dict(DUMMY_STATIC_ASIC), + "bus": dict(DUMMY_STATIC_BUS), + "driver": dict(DUMMY_STATIC_DRIVER), + "board": dict(DUMMY_STATIC_BOARD), + "ras": dict(DUMMY_STATIC_RAS), + "process_isolation": "", + "numa": {"node": 0, "affinity": 0}, + "vram": dict(DUMMY_STATIC_VRAM), + "cache_info": [], + "limit": limit, + "clock": clock, + } + return _deep_update(base, overrides) + + +def dummy_metric_dict( + *, + gpu: int = 0, + usage: Any = None, + power: Optional[dict[str, Any]] = None, + pcie: Optional[dict[str, Any]] = None, + **overrides: Any, +) -> dict[str, Any]: + """Build a minimal metric GPU dict for ``AmdSmiMetric.model_validate``.""" + base: dict[str, Any] = { + "gpu": gpu, + "usage": DUMMY_METRIC_USAGE if usage is None else usage, + "power": dict(DUMMY_METRIC_POWER) if power is None else power, + "clock": { + "GFX_0": { + "clk": {"value": 132, "unit": "MHz"}, + "min_clk": {"value": 500, "unit": "MHz"}, + "max_clk": {"value": 2100, "unit": "MHz"}, + } + }, + "temperature": dict(DUMMY_METRIC_TEMPERATURE), + "pcie": dict(DUMMY_METRIC_PCIE) if pcie is None else pcie, + "ecc": { + "total_correctable_count": 0, + "total_uncorrectable_count": 0, + "total_deferred_count": 0, + "cache_correctable_count": 0, + "cache_uncorrectable_count": 0, + }, + "ecc_blocks": {}, + "fan": {"speed": "N/A", "max": "N/A", "rpm": "N/A", "usage": "N/A"}, + "voltage_curve": None, + "perf_level": "auto", + "xgmi_err": "N/A", + "energy": None, + "mem_usage": { + "total_vram": {"value": 192, "unit": "GB"}, + "used_vram": {"value": 0, "unit": "GB"}, + "free_vram": {"value": 192, "unit": "GB"}, + "total_visible_vram": None, + "used_visible_vram": None, + "free_visible_vram": None, + "total_gtt": None, + "used_gtt": None, + "free_gtt": None, + }, + "throttle": {}, + "gpuboard": dict(DUMMY_METRIC_GPUBOARD), + "baseboard": dict(DUMMY_METRIC_BASEBOARD), + } + return _deep_update(base, overrides) + + +def dummy_amdsmi_data_dict( + *, + static: Optional[list[dict[str, Any]]] = None, + metric: Optional[list[dict[str, Any]]] = None, + **overrides: Any, +) -> dict[str, Any]: + """Build top-level ``AmdSmiDataModel`` input with dummy static + metric lists.""" + base: dict[str, Any] = { + "static": ( + static + if static is not None + else [ + dummy_static_gpu_dict( + limit=dict(DUMMY_LIMIT_PPT0), clock=dict(DUMMY_CLOCK_FREQUENCY_LEVELS) + ), + ] + ), + "metric": metric if metric is not None else [dummy_metric_dict(gpu=0)], + } + return _deep_update(base, overrides) + + +def _pcie_dict(**overrides: Any) -> MetricPcie: + """Full PCIe model from dummy defaults plus overrides.""" + base = dict(DUMMY_METRIC_PCIE) + base.update(overrides) + for key, val in list(base.items()): + if isinstance(val, ValueUnit): + base[key] = {"value": val.value, "unit": val.unit} + return MetricPcie.model_validate(base) + + +@pytest.mark.parametrize( + "level_input,expected_value,expected_unit", + [ + ("138 MHz", 138, "MHz"), + ({"value": 500, "unit": "MHz"}, 500, "MHz"), + (900, 900, ""), + (ValueUnit(value=1300, unit="MHz"), 1300, "MHz"), + ], +) +def test_static_frequency_levels_coerce_to_value_unit(level_input, expected_value, expected_unit): + """Static frequency levels accept strings, dicts, numbers, and ValueUnit.""" + levels = StaticFrequencyLevels.model_validate( + {"Level 0": level_input, "Level 1": None, "Level 2": None} + ) + assert isinstance(levels.Level_0, ValueUnit) + assert levels.Level_0.value == expected_value + assert levels.Level_0.unit == expected_unit + assert levels.Level_1 is None + assert levels.Level_2 is None + + +def test_static_frequency_levels_optional_levels(): + """Level 1/2 coerce from amd-smi string and structured JSON forms.""" + levels = StaticFrequencyLevels.model_validate( + DUMMY_CLOCK_FREQUENCY_LEVELS["GFX_0"]["frequency_levels"] + ) + assert levels.Level_0.value == 500 + assert levels.Level_1 is not None and levels.Level_1.value == 900 + assert levels.Level_2 is not None and levels.Level_2.value == 1300 + + +def test_static_limit_legacy_max_power(): + """Legacy flat max_power field still resolves.""" + limit = StaticLimit.model_validate(DUMMY_LIMIT_LEGACY) + resolved = limit.resolved_max_power() + assert resolved is not None + assert resolved.value == 550 + + +def test_static_limit_ppt0_max_power(): + """ROCm 7+ ppt0.max_power_limit resolves when flat max_power is absent.""" + limit = StaticLimit.model_validate(DUMMY_LIMIT_PPT0) + assert limit.max_power is None + resolved = limit.resolved_max_power() + assert resolved is not None + assert resolved.value == 750 + + +def test_ref_max_power_w_prefers_ppt0_when_legacy_missing(): + """Reference snapshot uses ppt0 on newer static JSON.""" + data = AmdSmiDataModel.model_validate( + dummy_amdsmi_data_dict( + static=[dummy_static_gpu_dict(limit=dict(DUMMY_LIMIT_PPT0))], + ) + ) + assert data.ref_max_power_w == 750 + + +def test_ref_max_power_w_legacy_limit(): + """Reference snapshot uses legacy max_power when present.""" + data = AmdSmiDataModel.model_validate( + dummy_amdsmi_data_dict( + static=[dummy_static_gpu_dict(limit=dict(DUMMY_LIMIT_LEGACY))], + ) + ) + assert data.ref_max_power_w == 550 + + +def test_static_clock_frequency_levels_json(): + """Static clock accepts ROCm 7+ frequency_levels object.""" + clock = StaticClockData.model_validate( + { + "frequency_levels": DUMMY_CLOCK_FREQUENCY_LEVELS["GFX_0"]["frequency_levels"], + "current level": 0, + } + ) + assert clock.frequency_levels.Level_0.value == 500 + assert clock.frequency_levels.Level_1 is not None + assert clock.frequency_levels.Level_1.value == 900 + + +def test_amdsmi_data_model_dummy_metric_round_trip(): + """Full dummy metric payload validates and preserves key ROCm 7.13 fields.""" + metric = AmdSmiMetric.model_validate(dummy_metric_dict()) + assert metric.gpu == 0 + assert metric.power.power_management == "DISABLED" + assert metric.power.ubb_power is not None + assert metric.power.ubb_power.value == 1200 + assert metric.pcie.width == 16 + assert metric.pcie.lc_perf_other_end_recovery_count == 1 + assert metric.gpuboard is not None + assert metric.baseboard is not None + + +def test_metric_pcie_lc_perf_alias(): + """Metric PCIe accepts legacy lc_perf_other_end_recovery JSON key.""" + pcie = _pcie_dict(lc_perf_other_end_recovery=2) + assert pcie.lc_perf_other_end_recovery_count == 2 + + +def test_metric_pcie_lc_perf_count_key(): + """Metric PCIe accepts renamed lc_perf_other_end_recovery_count key.""" + pcie = _pcie_dict(lc_perf_other_end_recovery_count=3) + assert pcie.lc_perf_other_end_recovery_count == 3 + + +def test_metric_power_ubb_coercion(): + """Metric power accepts ubb_power as string or ValueUnit.""" + power = MetricPower.model_validate( + { + **DUMMY_METRIC_POWER, + "ubb_power": "1200 W", + } + ) + assert power.ubb_power is not None + assert power.ubb_power.value == 1200 + assert power.ubb_power.unit == "W" + + +def test_metric_usage_string_na(): + """Whole usage block may be N/A on some platforms.""" + metric = AmdSmiMetric.model_validate(dummy_metric_dict(usage="N/A")) + assert metric.usage == "N/A" + + +def test_metric_gpuboard_baseboard_optional(): + """Store gpuboard/baseboard sections from ROCm 7.1+ metric output.""" + metric = AmdSmiMetric.model_validate( + dummy_metric_dict( + gpuboard=DUMMY_METRIC_GPUBOARD, + baseboard=DUMMY_METRIC_BASEBOARD, + ) + ) + assert metric.gpuboard == DUMMY_METRIC_GPUBOARD + assert metric.baseboard == DUMMY_METRIC_BASEBOARD From 0342522c0b03c96f7b9cfa6d8e3fc78ae01fb457 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 28 May 2026 10:52:58 -0500 Subject: [PATCH 2/5] updates --- .../plugins/inband/amdsmi/amdsmidata.py | 105 +++++++++++++----- .../fixtures/amdsmi_plugin_config.json | 14 +++ test/functional/test_plugin_configs.py | 1 + test/unit/plugin/test_amdsmi_analyzer.py | 24 ++++ test/unit/plugin/test_amdsmi_data.py | 68 ++++++++++++ 5 files changed, 186 insertions(+), 26 deletions(-) create mode 100644 test/functional/fixtures/amdsmi_plugin_config.json diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index de14afe9..24f3b7e1 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -584,16 +584,43 @@ class MetricPower(BaseModel): class MetricClockData(BaseModel): - clk: Optional[ValueUnit] - min_clk: Optional[ValueUnit] - max_clk: Optional[ValueUnit] - clk_locked: Optional[Union[int, str, dict]] - deep_sleep: Optional[Union[int, str, dict]] + model_config = ConfigDict(extra="ignore") + + clk: Optional[ValueUnit] = None + min_clk: Optional[ValueUnit] = None + max_clk: Optional[ValueUnit] = None + clk_locked: Optional[Union[int, str, dict]] = None + deep_sleep: Optional[Union[int, str, dict]] = None na_validator = field_validator( "clk", "min_clk", "max_clk", "clk_locked", "deep_sleep", mode="before" )(na_to_none) +_METRIC_CLOCK_LEAF_KEYS = frozenset({"clk", "min_clk", "max_clk", "clk_locked", "deep_sleep"}) + + +def _looks_like_metric_clock_leaf(value: object) -> bool: + """True when *value* matches legacy amd-smi per-clock metric JSON.""" + return isinstance(value, dict) and bool(_METRIC_CLOCK_LEAF_KEYS & value.keys()) + + +def _normalize_metric_clock_map(clock: Any) -> Any: + """Coerce standard clock leaves to ``MetricClockData``; keep nested maps as dicts.""" + if not isinstance(clock, dict): + return clock + normalized: dict[str, Any] = {} + for key, val in clock.items(): + if val is None or (isinstance(val, str) and _value_unit_is_na(val)): + normalized[key] = None + elif isinstance(val, MetricClockData): + normalized[key] = val + elif _looks_like_metric_clock_leaf(val): + normalized[key] = MetricClockData.model_validate(val) + else: + normalized[key] = val + return normalized + + class MetricTemperature(BaseModel): edge: Optional[ValueUnit] hotspot: Optional[ValueUnit] @@ -602,23 +629,38 @@ class MetricTemperature(BaseModel): class MetricPcie(BaseModel): - width: Optional[int] - speed: Optional[ValueUnit] - bandwidth: Optional[ValueUnit] - replay_count: Optional[int] - l0_to_recovery_count: Optional[int] - replay_roll_over_count: Optional[int] - nak_sent_count: Optional[int] - nak_received_count: Optional[int] - current_bandwidth_sent: Optional[int] - current_bandwidth_received: Optional[int] - max_packet_size: Optional[int] - lc_perf_other_end_recovery_count: Optional[int] = Field( - default=None, - validation_alias=AliasChoices( - "lc_perf_other_end_recovery_count", "lc_perf_other_end_recovery" - ), - ) + width: Optional[int] = None + speed: Optional[ValueUnit] = None + bandwidth: Optional[ValueUnit] = None + replay_count: Optional[int] = None + l0_to_recovery_count: Optional[int] = None + replay_roll_over_count: Optional[int] = None + nak_sent_count: Optional[int] = None + nak_received_count: Optional[int] = None + current_bandwidth_sent: Optional[int] = None + current_bandwidth_received: Optional[int] = None + max_packet_size: Optional[int] = None + lc_perf_other_end_recovery_count: Optional[int] = None + + @model_validator(mode="before") + @classmethod + def _normalize_recovery_count_key(cls, data: Any) -> Any: + if not isinstance(data, dict): + return data + out = dict(data) + count = out.get("lc_perf_other_end_recovery_count") + legacy = out.get("lc_perf_other_end_recovery") + if count is None and legacy is not None: + out["lc_perf_other_end_recovery_count"] = legacy + elif legacy is None and count is not None: + out["lc_perf_other_end_recovery"] = count + return out + + @property + def lc_perf_other_end_recovery(self) -> Optional[int]: + """Legacy amd-smi JSON / attribute name (alias for ``lc_perf_other_end_recovery_count``).""" + return self.lc_perf_other_end_recovery_count + na_validator = field_validator( "width", "speed", @@ -835,12 +877,12 @@ class EccData(BaseModel): class AmdSmiMetric(BaseModel): - model_config = ConfigDict(extra="ignore") + model_config = ConfigDict(extra="ignore", populate_by_name=True) gpu: int usage: Union[MetricUsage, str] power: MetricPower - clock: dict[str, Union[MetricClockData, dict]] + clock: dict[str, Union[MetricClockData, dict[str, Any]]] temperature: MetricTemperature pcie: MetricPcie ecc: MetricEccTotals @@ -852,12 +894,23 @@ class AmdSmiMetric(BaseModel): energy: Optional[MetricEnergy] mem_usage: MetricMemUsage throttle: MetricThrottle - gpuboard: Optional[Union[dict[str, Any], str]] = None - baseboard: Optional[Union[dict[str, Any], str]] = None + gpuboard: Optional[Union[dict[str, Any], str]] = Field( + default=None, + validation_alias=AliasChoices("gpuboard", "gpu_board"), + ) + baseboard: Optional[Union[dict[str, Any], str]] = Field( + default=None, + validation_alias=AliasChoices("baseboard", "base_board"), + ) na_validator = field_validator("xgmi_err", "perf_level", mode="before")(na_to_none) _board_dict_na = field_validator("gpuboard", "baseboard", mode="before")(na_to_none_dict) + @field_validator("clock", mode="before") + @classmethod + def _normalize_clock(cls, clock: Any) -> Any: + return _normalize_metric_clock_map(clock) + @field_validator("ecc_blocks", mode="before") @classmethod def validate_ecc_blocks(cls, value: Union[dict[str, EccData], str]) -> dict[str, EccData]: diff --git a/test/functional/fixtures/amdsmi_plugin_config.json b/test/functional/fixtures/amdsmi_plugin_config.json new file mode 100644 index 00000000..73de0eb3 --- /dev/null +++ b/test/functional/fixtures/amdsmi_plugin_config.json @@ -0,0 +1,14 @@ +{ + "global_args": {}, + "plugins": { + "AmdSmiPlugin": { + "analysis_args": { + "expected_power_management": "DISABLED", + "expected_xgmi_speed": [32.0] + } + } + }, + "result_collators": {}, + "name": "AmdSmiPlugin config", + "desc": "Functional test config for AmdSmiPlugin ROCm 7.13 checks" +} diff --git a/test/functional/test_plugin_configs.py b/test/functional/test_plugin_configs.py index 90c68218..6ee1a004 100644 --- a/test/functional/test_plugin_configs.py +++ b/test/functional/test_plugin_configs.py @@ -42,6 +42,7 @@ def fixtures_dir(): def plugin_config_files(fixtures_dir): """Return dict mapping plugin names to their config file paths.""" return { + "AmdSmiPlugin": fixtures_dir / "amdsmi_plugin_config.json", "BiosPlugin": fixtures_dir / "bios_plugin_config.json", "CmdlinePlugin": fixtures_dir / "cmdline_plugin_config.json", "DimmPlugin": fixtures_dir / "dimm_plugin_config.json", diff --git a/test/unit/plugin/test_amdsmi_analyzer.py b/test/unit/plugin/test_amdsmi_analyzer.py index a33f5013..cc479a90 100644 --- a/test/unit/plugin/test_amdsmi_analyzer.py +++ b/test/unit/plugin/test_amdsmi_analyzer.py @@ -961,6 +961,30 @@ def test_check_expected_power_management_missing(mock_analyzer): assert "not available" in analyzer.result.events[0].description +def test_analyze_data_expected_power_management(mock_analyzer): + """analyze_data accepts expected_power_management from plugin config.""" + analyzer = mock_analyzer + data = AmdSmiDataModel( + metric=[ + _minimal_amdsmi_metric(0, power_management="DISABLED"), + _minimal_amdsmi_metric(1, power_management="DISABLED"), + ] + ) + args = AmdSmiAnalyzerArgs(expected_power_management="DISABLED") + result = analyzer.analyze_data(data, args) + assert not any("power_management mismatch" in e.description for e in result.events) + + +def test_amdsmi_analyzer_args_rejects_unknown_fields(): + """Plugin config must only use declared AmdSmiAnalyzerArgs fields.""" + from pydantic import ValidationError + + with pytest.raises(ValidationError): + AmdSmiAnalyzerArgs.model_validate( + {"expected_power_management": "DISABLED", "not_a_field": 1} + ) + + def test_check_amdsmi_metric_pcie_width_fail(mock_analyzer): """PCIe width not x16 generates error.""" analyzer = mock_analyzer diff --git a/test/unit/plugin/test_amdsmi_data.py b/test/unit/plugin/test_amdsmi_data.py index 85323b8b..2c58fa0e 100644 --- a/test/unit/plugin/test_amdsmi_data.py +++ b/test/unit/plugin/test_amdsmi_data.py @@ -32,6 +32,7 @@ from nodescraper.plugins.inband.amdsmi.amdsmidata import ( AmdSmiDataModel, AmdSmiMetric, + MetricClockData, MetricPcie, MetricPower, StaticClockData, @@ -407,12 +408,35 @@ def test_metric_pcie_lc_perf_alias(): """Metric PCIe accepts legacy lc_perf_other_end_recovery JSON key.""" pcie = _pcie_dict(lc_perf_other_end_recovery=2) assert pcie.lc_perf_other_end_recovery_count == 2 + assert pcie.lc_perf_other_end_recovery == 2 def test_metric_pcie_lc_perf_count_key(): """Metric PCIe accepts renamed lc_perf_other_end_recovery_count key.""" pcie = _pcie_dict(lc_perf_other_end_recovery_count=3) assert pcie.lc_perf_other_end_recovery_count == 3 + assert pcie.lc_perf_other_end_recovery == 3 + + +def test_metric_legacy_mi300_shape(): + """Legacy metric JSON without MI355-only clock/board fields still validates.""" + pcie_legacy = { + k: v for k, v in DUMMY_METRIC_PCIE.items() if k != "lc_perf_other_end_recovery_count" + } + pcie_legacy["lc_perf_other_end_recovery"] = 1 + metric = AmdSmiMetric.model_validate( + dummy_metric_dict( + pcie=pcie_legacy, + gpuboard=None, + baseboard=None, + ) + ) + assert metric.gpuboard is None + assert metric.baseboard is None + assert isinstance(metric.clock["GFX_0"], MetricClockData) + assert metric.clock["GFX_0"].clk is not None + assert metric.pcie.lc_perf_other_end_recovery_count == 1 + assert metric.pcie.lc_perf_other_end_recovery == 1 def test_metric_power_ubb_coercion(): @@ -444,3 +468,47 @@ def test_metric_gpuboard_baseboard_optional(): ) assert metric.gpuboard == DUMMY_METRIC_GPUBOARD assert metric.baseboard == DUMMY_METRIC_BASEBOARD + + +def test_metric_gpuboard_gpu_board_alias(): + """amd-smi may emit gpu_board / base_board instead of gpuboard / baseboard.""" + metric = AmdSmiMetric.model_validate( + dummy_metric_dict( + **{ + "gpu_board": DUMMY_METRIC_GPUBOARD, + "base_board": DUMMY_METRIC_BASEBOARD, + } + ) + ) + assert metric.gpuboard == DUMMY_METRIC_GPUBOARD + assert metric.baseboard == DUMMY_METRIC_BASEBOARD + + +def test_metric_clock_per_aid_na_maps(): + """MI355 metric clock may expose per-AID/MID maps instead of MetricClockData leaves.""" + metric = AmdSmiMetric.model_validate( + dummy_metric_dict( + clock={ + "GFX_0": { + "clk": {"value": 132, "unit": "MHz"}, + "min_clk": {"value": 500, "unit": "MHz"}, + "max_clk": {"value": 2100, "unit": "MHz"}, + "clk_locked": 0, + "deep_sleep": 0, + }, + "uclk_aid": {"AID_0": "N/A", "AID_1": "N/A"}, + "socclks_mid": {"MID_0": "N/A", "MID_1": "N/A"}, + }, + pcie={ + **DUMMY_METRIC_PCIE, + "current_bandwidth_sent": "N/A", + "current_bandwidth_received": "N/A", + "max_packet_size": "N/A", + "lc_perf_other_end_recovery_count": 0, + }, + ) + ) + assert isinstance(metric.clock["uclk_aid"], dict) + assert metric.clock["uclk_aid"]["AID_0"] == "N/A" + assert isinstance(metric.clock["GFX_0"], MetricClockData) + assert metric.pcie.lc_perf_other_end_recovery_count == 0 From 958d4bfbad4aa39578224e3381e22d6865455fd7 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 28 May 2026 10:56:48 -0500 Subject: [PATCH 3/5] updates --- .../plugins/inband/amdsmi/amdsmi_collector.py | 6 ++++- .../plugins/inband/amdsmi/amdsmidata.py | 23 +++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index b54a1860..65f4f779 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -72,6 +72,7 @@ ValueUnit, XgmiLinks, XgmiMetrics, + normalize_amdsmi_metric_dict, ) from nodescraper.plugins.inband.amdsmi.collector_args import AmdSmiCollectorArgs from nodescraper.utils import get_exception_traceback @@ -409,7 +410,10 @@ def get_metric(self) -> Optional[List[AmdSmiMetric]]: if isinstance(ret, dict) and "gpu_data" in ret: ret = ret["gpu_data"] data = ret if isinstance(ret, list) else [ret] - built = self._build_amdsmi_sub_data(AmdSmiMetric, data) + normalized = [ + normalize_amdsmi_metric_dict(item) if isinstance(item, dict) else item for item in data + ] + built = self._build_amdsmi_sub_data(AmdSmiMetric, normalized) return built if isinstance(built, list) else ([built] if built else []) def get_xgmi_data( diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 24f3b7e1..eed0e32b 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -621,6 +621,29 @@ def _normalize_metric_clock_map(clock: Any) -> Any: return normalized +def normalize_amdsmi_metric_dict(item: dict[str, Any]) -> dict[str, Any]: + """Normalize raw ``amd-smi metric`` JSON before ``AmdSmiMetric`` validation (ROCm 7.13 + legacy).""" + out = dict(item) + if "gpu_board" in out and "gpuboard" not in out: + out["gpuboard"] = out.pop("gpu_board") + if "base_board" in out and "baseboard" not in out: + out["baseboard"] = out.pop("base_board") + pcie = out.get("pcie") + if isinstance(pcie, dict): + pcie_out = dict(pcie) + count = pcie_out.get("lc_perf_other_end_recovery_count") + legacy = pcie_out.get("lc_perf_other_end_recovery") + if count is None and legacy is not None: + pcie_out["lc_perf_other_end_recovery_count"] = legacy + elif legacy is None and count is not None: + pcie_out["lc_perf_other_end_recovery"] = count + out["pcie"] = pcie_out + clock = out.get("clock") + if isinstance(clock, dict): + out["clock"] = _normalize_metric_clock_map(clock) + return out + + class MetricTemperature(BaseModel): edge: Optional[ValueUnit] hotspot: Optional[ValueUnit] From 8fccd06bdeb99dcdcbadeb0f56f24c4aea18902d Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 28 May 2026 14:32:43 -0500 Subject: [PATCH 4/5] updates --- .../plugins/inband/amdsmi/amdsmi_collector.py | 9 ++- .../plugins/inband/amdsmi/amdsmidata.py | 81 ++++++++++++++++--- test/unit/plugin/test_amdsmi_data.py | 4 +- 3 files changed, 78 insertions(+), 16 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 65f4f779..ef60995a 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -73,6 +73,7 @@ XgmiLinks, XgmiMetrics, normalize_amdsmi_metric_dict, + normalize_static_limit_dict, ) from nodescraper.plugins.inband.amdsmi.collector_args import AmdSmiCollectorArgs from nodescraper.utils import get_exception_traceback @@ -916,10 +917,12 @@ def get_static(self) -> Optional[list[AmdSmiStatic]]: os_kernel = driver.get("os_kernel_version") if driver else None driver_model = StaticDriver( name=self._normalize( - driver.get("driver_name") if driver else None, default="unknown" + (driver.get("driver_name") or driver.get("name")) if driver else None, + default="unknown", ), version=self._normalize( - driver.get("driver_version") if driver else None, default="unknown" + (driver.get("driver_version") or driver.get("version")) if driver else None, + default="unknown", ), os_kernel_version=( None if os_kernel in (None, "", "N/A") else str(os_kernel).strip() @@ -1021,7 +1024,7 @@ def _parse_limit(self, data: Optional[object]) -> Optional[StaticLimit]: if data is None or not isinstance(data, dict) or not data: return None try: - return StaticLimit.model_validate(data) + return StaticLimit.model_validate(normalize_static_limit_dict(data)) except ValidationError as err: self._log_event( category=EventCategory.APPLICATION, diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index eed0e32b..fd603028 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -334,15 +334,28 @@ class StaticVbios(BaseModel): class StaticPowerLimit(AmdSmiBaseModel): """Per-PPT power limits (ROCm 7+ static --limit JSON).""" + model_config = ConfigDict( + populate_by_name=True, + extra="ignore", + ) + max_power_limit: Optional[ValueUnit] = None min_power_limit: Optional[ValueUnit] = None socket_power_limit: Optional[ValueUnit] = None na_validator = field_validator( "max_power_limit", "min_power_limit", "socket_power_limit", mode="before" )(na_to_none) + _ppt_vu = field_validator( + "max_power_limit", "min_power_limit", "socket_power_limit", mode="before" + )(coerce_value_unit_input) class StaticLimit(AmdSmiBaseModel): + model_config = ConfigDict( + populate_by_name=True, + extra="ignore", + ) + max_power: Optional[ValueUnit] = None min_power: Optional[ValueUnit] = None socket_power: Optional[ValueUnit] = None @@ -372,6 +385,18 @@ class StaticLimit(AmdSmiBaseModel): "ptl_format", mode="before", )(na_to_none) + _limit_value_unit = field_validator( + "max_power", + "min_power", + "socket_power", + "slowdown_edge_temperature", + "slowdown_hotspot_temperature", + "slowdown_vram_temperature", + "shutdown_edge_temperature", + "shutdown_hotspot_temperature", + "shutdown_vram_temperature", + mode="before", + )(coerce_value_unit_input) def resolved_max_power(self) -> Optional[ValueUnit]: """Return max power cap from legacy flat fields or ``ppt0`` (ROCm 7+).""" @@ -381,10 +406,44 @@ def resolved_max_power(self) -> Optional[ValueUnit]: return self.ppt0.max_power_limit return None + @model_validator(mode="before") + @classmethod + def _normalize_limit_input(cls, data: Any) -> Any: + if isinstance(data, dict): + return normalize_static_limit_dict(data) + return data + + +def _normalize_static_ppt_block(val: object) -> object: + """Drop N/A-only ``ppt0``/``ppt1`` blocks; pass through already-parsed models.""" + if val is None: + return None + if isinstance(val, str) and val.strip().upper() in {"N/A", "NA", ""}: + return None + if not isinstance(val, dict): + return val + out: dict[str, Any] = {} + for key, raw in val.items(): + if isinstance(raw, str) and raw.strip().upper() in {"N/A", "NA", ""}: + continue + if raw is not None: + out[key] = raw + return out or None + + +def normalize_static_limit_dict(data: dict[str, Any]) -> dict[str, Any]: + """Normalize static ``limit`` JSON for ``StaticLimit`` (ROCm 7.13 + legacy).""" + out = dict(data) + for ppt_key in ("ppt0", "ppt1"): + out[ppt_key] = _normalize_static_ppt_block(out.get(ppt_key)) + return out + class StaticDriver(BaseModel): - name: str - version: str + model_config = ConfigDict(populate_by_name=True) + + name: str = Field(validation_alias=AliasChoices("name", "driver_name")) + version: str = Field(validation_alias=AliasChoices("version", "driver_version")) os_kernel_version: Optional[str] = None @@ -497,13 +556,13 @@ class AmdSmiStatic(BaseModel): gpu: int asic: StaticAsic bus: StaticBus - vbios: Optional[StaticVbios] - limit: Optional[StaticLimit] + vbios: Optional[StaticVbios] = None + limit: Optional[StaticLimit] = None driver: StaticDriver board: StaticBoard ras: StaticRas - soc_pstate: Optional[StaticSocPstate] - xgmi_plpd: Optional[StaticXgmiPlpd] + soc_pstate: Optional[StaticSocPstate] = None + xgmi_plpd: Optional[StaticXgmiPlpd] = None process_isolation: str numa: StaticNuma vram: StaticVram @@ -633,9 +692,9 @@ def normalize_amdsmi_metric_dict(item: dict[str, Any]) -> dict[str, Any]: pcie_out = dict(pcie) count = pcie_out.get("lc_perf_other_end_recovery_count") legacy = pcie_out.get("lc_perf_other_end_recovery") - if count is None and legacy is not None: + if legacy is not None: pcie_out["lc_perf_other_end_recovery_count"] = legacy - elif legacy is None and count is not None: + elif count is not None: pcie_out["lc_perf_other_end_recovery"] = count out["pcie"] = pcie_out clock = out.get("clock") @@ -673,9 +732,9 @@ def _normalize_recovery_count_key(cls, data: Any) -> Any: out = dict(data) count = out.get("lc_perf_other_end_recovery_count") legacy = out.get("lc_perf_other_end_recovery") - if count is None and legacy is not None: + if legacy is not None: out["lc_perf_other_end_recovery_count"] = legacy - elif legacy is None and count is not None: + elif count is not None: out["lc_perf_other_end_recovery"] = count return out @@ -929,7 +988,7 @@ class AmdSmiMetric(BaseModel): na_validator = field_validator("xgmi_err", "perf_level", mode="before")(na_to_none) _board_dict_na = field_validator("gpuboard", "baseboard", mode="before")(na_to_none_dict) - @field_validator("clock", mode="before") + @field_validator("clock", mode="plain") @classmethod def _normalize_clock(cls, clock: Any) -> Any: return _normalize_metric_clock_map(clock) diff --git a/test/unit/plugin/test_amdsmi_data.py b/test/unit/plugin/test_amdsmi_data.py index 2c58fa0e..9e28fbb9 100644 --- a/test/unit/plugin/test_amdsmi_data.py +++ b/test/unit/plugin/test_amdsmi_data.py @@ -72,8 +72,8 @@ } DUMMY_STATIC_DRIVER: dict[str, Optional[str]] = { - "name": "amdgpu", - "version": "6.12.12", + "driver_name": "amdgpu", + "driver_version": "6.12.12", "os_kernel_version": "6.8.0-dummy", } From 9377b62688c7710b249e19de708c8e3fc9bb9f26 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 28 May 2026 15:28:48 -0500 Subject: [PATCH 5/5] updating with details on which GPU has issues --- .../plugins/inband/amdsmi/amdsmi_analyzer.py | 197 ++++++++++++++++-- test/unit/plugin/test_amdsmi_analyzer.py | 56 ++++- 2 files changed, 225 insertions(+), 28 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py index b831d1c0..da427753 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_analyzer.py @@ -25,7 +25,7 @@ ############################################################################### import io from collections import defaultdict -from typing import Any, Optional, Union +from typing import Any, Mapping, Optional, Union from nodescraper.enums import EventCategory, EventPriority from nodescraper.interfaces import DataAnalyzer @@ -45,6 +45,59 @@ from .cper import CperAnalysisTaskMixin +def _gpu_mismatch_description( + check_name: str, + expected: object, + actual_by_gpu: Mapping[int, object], + *, + unit: str = "", +) -> tuple[str, str]: + """Return (event description, console details) for per-GPU expected vs actual.""" + gpu_ids = sorted(actual_by_gpu.keys()) + exp = f"{expected}{unit}" if unit else str(expected) + per_gpu_short = ", ".join(f"GPU {gpu}={actual_by_gpu[gpu]}{unit}" for gpu in gpu_ids) + description = f"{check_name} mismatch (expected {exp}): {per_gpu_short}" + details = "; ".join( + f"GPU {gpu}: expected {exp}, actual {actual_by_gpu[gpu]}{unit}" for gpu in gpu_ids + ) + return description, details + + +def _gpu_unavailable_description( + check_name: str, + gpu_ids: list[int], + *, + expected: Optional[object] = None, +) -> tuple[str, str]: + """Return (event description, console details) when a metric is missing per GPU.""" + gpus = sorted(gpu_ids) + gpu_text = ", ".join(f"GPU {g}" for g in gpus) + if expected is not None: + description = f"{check_name} not available on {gpu_text} (expected {expected})" + else: + description = f"{check_name} not available on {gpu_text}" + return description, description + + +def _static_mismatch_description(payload: dict[str, Any]) -> tuple[str, str]: + """Build description/details from ``check_static_data`` per-GPU payload.""" + per_gpu = payload.get("per_gpu") or [] + if not per_gpu: + return "amd-smi static data mismatch", "amd-smi static data mismatch" + parts: list[str] = [] + for entry in per_gpu: + gpu = entry["gpu"] + for mm in entry.get("mismatches") or []: + parts.append( + f"GPU {gpu} {mm['field']}: expected {mm['expected']!s}, actual {mm['actual']!s}" + ) + details = "; ".join(parts) + summary = payload.get("summary") or {} + gpus_affected = summary.get("gpus_affected", len(per_gpu)) + description = f"amd-smi static data mismatch on {gpus_affected} GPU(s): {details}" + return description, details + + class AmdSmiAnalyzer(CperAnalysisTaskMixin, DataAnalyzer[AmdSmiDataModel, None]): """Check AMD SMI Application data for PCIe, ECC errors, and CPER data.""" @@ -79,26 +132,38 @@ def check_expected_power_management( mismatches[gpu] = actual_str if missing: + desc, details = _gpu_unavailable_description( + "GPU power_management", + missing, + expected=expected_power_management, + ) self._log_event( category=EventCategory.PLATFORM, - description="GPU power_management value not available", + description=desc, priority=EventPriority.WARNING, data={ "gpus": missing, "expected_power_management": expected_power_management, + "details": details, }, console_log=True, ) if mismatches: + desc, details = _gpu_mismatch_description( + "GPU power_management", + expected_power_management, + mismatches, + ) self._log_event( category=EventCategory.PLATFORM, - description="GPU power_management mismatch", + description=desc, priority=EventPriority.ERROR, data={ "gpus": list(mismatches.keys()), "power_management_values": mismatches, "expected_power_management": expected_power_management, + "details": details, }, console_log=True, ) @@ -121,9 +186,17 @@ def check_expected_max_power( if max_power_vu is None or max_power_vu.value is None: self._log_event( category=EventCategory.PLATFORM, - description=f"GPU: {gpu.gpu} has no max power limit set", + description=f"GPU {gpu.gpu}: max power limit not set (expected {expected_max_power} W)", priority=EventPriority.WARNING, - data={"gpu": gpu.gpu}, + data={ + "gpu": gpu.gpu, + "expected_max_power": expected_max_power, + "details": ( + f"GPU {gpu.gpu}: max power limit not set " + f"(expected {expected_max_power} W)" + ), + }, + console_log=True, ) continue max_power_value = max_power_vu.value @@ -143,15 +216,23 @@ def check_expected_max_power( if max_power_float != expected_max_power: incorrect_max_power_gpus[gpu.gpu] = max_power_float if incorrect_max_power_gpus: + desc, details = _gpu_mismatch_description( + "GPU max power", + expected_max_power, + incorrect_max_power_gpus, + unit=" W", + ) self._log_event( category=EventCategory.PLATFORM, - description="Max power mismatch", + description=desc, priority=EventPriority.ERROR, data={ "gpus": list(incorrect_max_power_gpus.keys()), "max_power_values": incorrect_max_power_gpus, "expected_max_power": expected_max_power, + "details": details, }, + console_log=True, ) def check_expected_driver_version( @@ -177,15 +258,23 @@ def check_expected_driver_version( bad_driver_gpus.append(gpu.gpu) if bad_driver_gpus: + bad_versions = {g: versions_by_gpu[g] for g in bad_driver_gpus} + desc, details = _gpu_mismatch_description( + "GPU driver version", + expected_driver_version, + bad_versions, + ) self._log_event( category=EventCategory.PLATFORM, - description="Driver Version Mismatch", + description=desc, priority=EventPriority.ERROR, data={ "gpus": bad_driver_gpus, - "driver_version": {g: versions_by_gpu[g] for g in bad_driver_gpus}, + "driver_version": bad_versions, "expected_driver_version": expected_driver_version, + "details": details, }, + console_log=True, ) def check_amdsmi_metric_pcie( @@ -338,11 +427,17 @@ def check_amdsmi_metric_ecc_totals(self, amdsmi_metric_data: list[AmdSmiMetric]) for priority, count, desc in ecc_checks: if count is not None and count > 0: + details = f"GPU {gpu}: {desc} count={count}" self._log_event( category=EventCategory.RAS, - description="GPU ECC error count detected", + description=details, priority=priority, - data={"gpu": gpu, "error_count": count, "error_type": desc}, + data={ + "gpu": gpu, + "error_count": count, + "error_type": desc, + "details": details, + }, console_log=True, ) @@ -436,12 +531,19 @@ def expected_gpu_processes( gpu_exceeds_num_processes[process.gpu] = process_count if gpu_exceeds_num_processes: + desc, details = _gpu_mismatch_description( + "GPU process count", + max_num_processes, + gpu_exceeds_num_processes, + ) self._log_event( category=EventCategory.PLATFORM, - description="Number of processes exceeds max processes", + description=desc, priority=EventPriority.ERROR, data={ "gpu_exceeds_num_processes": gpu_exceeds_num_processes, + "expected_max_processes": max_num_processes, + "details": details, }, console_log=True, ) @@ -549,11 +651,14 @@ def check_static_data( if mismatches: payload = self._format_static_mismatch_payload(mismatches) + desc, details = _static_mismatch_description(payload) + payload["details"] = details self._log_event( category=EventCategory.PLATFORM, - description="amd-smi static data mismatch", + description=desc, priority=EventPriority.ERROR, data=payload, + console_log=True, ) def _format_static_mismatch_payload( @@ -628,15 +733,27 @@ def check_firmware_versions( ) if mismatches or missing: + detail_parts: list[str] = [] + for item in mismatches: + detail_parts.append( + f"GPU {item['gpu']} {item['fw_id']}: " + f"expected {item['expected']!s}, actual {item['actual']!s}" + ) + for item in missing: + detail_parts.append(f"GPU {item['gpu']} {item['fw_id']}: missing") + details = "; ".join(detail_parts) + description = f"GPU firmware version mismatch: {details}" self._log_event( category=EventCategory.FW, - description="Firmware version mismatch", + description=description, priority=EventPriority.ERROR, data={ "expected_firmware_versions": expected_firmware_versions, "mismatches": mismatches, "missing": missing, + "details": details, }, + console_log=True, ) def check_expected_memory_partition_mode( @@ -685,15 +802,41 @@ def check_expected_memory_partition_mode( ) if bad_memory_partition_mode_gpus: + mismatch_by_gpu: dict[int, str] = {} + for entry in bad_memory_partition_mode_gpus: + gpu_id = int(entry["gpu_id"]) # type: ignore[arg-type] + if "memory_partition_mode" in entry: + mismatch_by_gpu[gpu_id] = ( + f"memory={entry['memory_partition_mode']!s} " + f"(expected {expected_memory_partition_mode!s})" + ) + if "compute_partition_mode" in entry: + compute_detail = ( + f"compute={entry['compute_partition_mode']!s} " + f"(expected {expected_compute_partition_mode!s})" + ) + mismatch_by_gpu[gpu_id] = ( + f"{mismatch_by_gpu[gpu_id]}; {compute_detail}" + if gpu_id in mismatch_by_gpu + else compute_detail + ) + desc, details = _gpu_mismatch_description( + "GPU partition mode", + f"memory={expected_memory_partition_mode!s}, " + f"compute={expected_compute_partition_mode!s}", + mismatch_by_gpu, + ) self._log_event( category=EventCategory.PLATFORM, - description="Partition Mode Mismatch", + description=desc, priority=EventPriority.ERROR, data={ "actual_partition_data": bad_memory_partition_mode_gpus, "expected_memory_partition_mode": expected_memory_partition_mode, "expected_compute_partition_mode": expected_compute_partition_mode, + "details": details, }, + console_log=True, ) def check_expected_xgmi_link_speed( @@ -729,45 +872,65 @@ def check_expected_xgmi_link_speed( link_metric = xgmi_data.link_metrics try: if link_metric.bit_rate is None or link_metric.bit_rate.value is None: + details = ( + f"GPU {xgmi_data.gpu}: XGMI link speed not available " + f"(expected {expected_xgmi_speed} GT/s)" + ) self._log_event( category=EventCategory.IO, - description="XGMI link speed is not available", + description=details, priority=EventPriority.ERROR, data={ "gpu": xgmi_data.gpu, + "expected_xgmi_speed": expected_xgmi_speed, "xgmi_bit_rate": ( link_metric.bit_rate.unit if link_metric.bit_rate else "N/A" ), + "details": details, }, + console_log=True, ) continue xgmi_float = float(link_metric.bit_rate.value) except (ValueError, TypeError): + details = ( + f"GPU {xgmi_data.gpu}: XGMI link speed is not a valid number " + f"(expected {expected_xgmi_speed} GT/s)" + ) self._log_event( category=EventCategory.IO, - description="XGMI link speed is not a valid number", + description=details, priority=EventPriority.ERROR, data={ "gpu": xgmi_data.gpu, + "expected_xgmi_speed": expected_xgmi_speed, "xgmi_bit_rate": ( link_metric.bit_rate.value if link_metric.bit_rate else "N/A" ), + "details": details, }, + console_log=True, ) continue expected_floats = [float(e) for e in expected_xgmi_speed] if xgmi_float not in expected_floats: + details = ( + f"GPU {xgmi_data.gpu}: XGMI link speed {xgmi_float} GT/s " + f"not in expected {expected_xgmi_speed} GT/s" + ) self._log_event( category=EventCategory.IO, - description="XGMI link speed is not as expected", + description=details, priority=EventPriority.ERROR, data={ "expected_xgmi_speed": expected_xgmi_speed, "xgmi_bit_rate": xgmi_float, "gpu": xgmi_data.gpu, + "details": details, }, + console_log=True, ) def analyze_data( diff --git a/test/unit/plugin/test_amdsmi_analyzer.py b/test/unit/plugin/test_amdsmi_analyzer.py index cc479a90..560a9ba4 100644 --- a/test/unit/plugin/test_amdsmi_analyzer.py +++ b/test/unit/plugin/test_amdsmi_analyzer.py @@ -29,7 +29,11 @@ import pytest from nodescraper.enums import EventPriority -from nodescraper.plugins.inband.amdsmi.amdsmi_analyzer import AmdSmiAnalyzer +from nodescraper.plugins.inband.amdsmi.amdsmi_analyzer import ( + AmdSmiAnalyzer, + _gpu_mismatch_description, + _gpu_unavailable_description, +) from nodescraper.plugins.inband.amdsmi.amdsmidata import ( AmdSmiDataModel, AmdSmiMetric, @@ -267,7 +271,7 @@ def test_check_expected_max_power_mismatch(mock_analyzer): assert len(analyzer.result.events) == 1 assert analyzer.result.events[0].category == "PLATFORM" assert analyzer.result.events[0].priority == EventPriority.ERROR - assert "Max power mismatch" in analyzer.result.events[0].description + assert "GPU max power mismatch" in analyzer.result.events[0].description def test_check_expected_max_power_missing(mock_analyzer): @@ -283,7 +287,7 @@ def test_check_expected_max_power_missing(mock_analyzer): assert len(analyzer.result.events) == 1 assert analyzer.result.events[0].priority == EventPriority.WARNING - assert "has no max power limit set" in analyzer.result.events[0].description + assert "max power limit not set" in analyzer.result.events[0].description def test_check_expected_max_power_ppt0(mock_analyzer): @@ -307,7 +311,7 @@ def test_check_expected_max_power_ppt0(mock_analyzer): analyzer.result = analyzer._init_result() analyzer.check_expected_max_power([gpu], 550) assert len(analyzer.result.events) == 1 - assert "Max power mismatch" in analyzer.result.events[0].description + assert "GPU max power mismatch" in analyzer.result.events[0].description def test_check_expected_driver_version_success(mock_analyzer): @@ -338,7 +342,7 @@ def test_check_expected_driver_version_mismatch(mock_analyzer): assert len(analyzer.result.events) == 1 assert analyzer.result.events[0].category == "PLATFORM" assert analyzer.result.events[0].priority == EventPriority.ERROR - assert "Driver Version Mismatch" in analyzer.result.events[0].description + assert "GPU driver version mismatch" in analyzer.result.events[0].description def test_expected_gpu_processes_success(mock_analyzer): @@ -402,7 +406,10 @@ def test_expected_gpu_processes_exceeds(mock_analyzer): assert len(analyzer.result.events) == 1 assert analyzer.result.events[0].priority == EventPriority.ERROR - assert "Number of processes exceeds max processes" in analyzer.result.events[0].description + assert ( + "GPU process count mismatch (expected 5): GPU 0=10" in analyzer.result.events[0].description + ) + assert analyzer.result.events[0].data["details"] == "GPU 0: expected 5, actual 10" def test_expected_gpu_processes_no_data(mock_analyzer): @@ -674,7 +681,7 @@ def test_check_expected_xgmi_link_speed_mismatch(mock_analyzer): assert len(analyzer.result.events) == 1 assert analyzer.result.events[0].category == "IO" assert analyzer.result.events[0].priority == EventPriority.ERROR - assert "XGMI link speed is not as expected" in analyzer.result.events[0].description + assert "not in expected" in analyzer.result.events[0].description def test_check_expected_xgmi_link_speed_multiple_valid_speeds(mock_analyzer): @@ -741,7 +748,7 @@ def test_check_expected_xgmi_link_speed_missing_bit_rate(mock_analyzer): assert len(analyzer.result.events) == 1 assert analyzer.result.events[0].priority == EventPriority.ERROR - assert "XGMI link speed is not available" in analyzer.result.events[0].description + assert "XGMI link speed not available" in analyzer.result.events[0].description def test_analyze_data_full_workflow(mock_analyzer): @@ -926,6 +933,25 @@ def _minimal_amdsmi_metric( ) +def test_gpu_mismatch_description_helpers(): + """Per-GPU mismatch helpers include GPU id, expected, and actual in output.""" + desc, details = _gpu_mismatch_description( + "GPU power_management", "DISABLED", {0: "ENABLED", 3: "ENABLED"} + ) + assert "GPU 0=ENABLED" in desc + assert "GPU 3=ENABLED" in desc + assert "expected DISABLED" in desc + assert "GPU 0: expected DISABLED, actual ENABLED" in details + assert "GPU 3: expected DISABLED, actual ENABLED" in details + + desc_missing, _ = _gpu_unavailable_description( + "GPU power_management", [1, 2], expected="DISABLED" + ) + assert "GPU 1" in desc_missing + assert "GPU 2" in desc_missing + assert "expected DISABLED" in desc_missing + + def test_check_expected_power_management_success(mock_analyzer): """All GPUs with DISABLED power_management pass.""" analyzer = mock_analyzer @@ -948,7 +974,8 @@ def test_check_expected_power_management_mismatch(mock_analyzer): assert len(analyzer.result.events) == 1 assert analyzer.result.events[0].priority == EventPriority.ERROR assert "power_management mismatch" in analyzer.result.events[0].description - assert analyzer.result.events[0].data["gpus"] == [1] + assert "GPU 1=ENABLED" in analyzer.result.events[0].description + assert analyzer.result.events[0].data["details"] == ("GPU 1: expected DISABLED, actual ENABLED") def test_check_expected_power_management_missing(mock_analyzer): @@ -1098,9 +1125,16 @@ def test_check_amdsmi_metric_ecc_totals(mock_analyzer): ] analyzer.check_amdsmi_metric_ecc_totals(metrics) assert len(analyzer.result.events) == 5 - # Analyzer uses generic description "GPU ECC error count detected"; type is in data["error_type"] + expected_descriptions = { + "GPU 0: Total correctable ECC errors count=1", + "GPU 1: Total uncorrectable ECC errors count=1", + "GPU 2: Total deferred ECC errors count=1", + "GPU 3: Cache correctable ECC errors count=1", + "GPU 4: Cache uncorrectable ECC errors count=1", + } + assert {e.description for e in analyzer.result.events} == expected_descriptions for e in analyzer.result.events: - assert e.description == "GPU ECC error count detected" + assert e.description == e.data["details"] assert "error_type" in e.data and "error_count" in e.data error_types = [e.data["error_type"] for e in analyzer.result.events] assert "Total correctable ECC errors" in error_types