From 92c09c37fbdc28ab9e833c1299464fb93078f165 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 4 May 2026 10:59:16 -0500 Subject: [PATCH 01/25] fix --- nodescraper/models/datamodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nodescraper/models/datamodel.py b/nodescraper/models/datamodel.py index 78a5df06..c310c810 100644 --- a/nodescraper/models/datamodel.py +++ b/nodescraper/models/datamodel.py @@ -29,7 +29,7 @@ import tarfile from typing import TypeVar, Union -from pydantic import BaseModel, Field, field_validator +from pydantic import BaseModel, field_validator from nodescraper.utils import get_unique_filename @@ -37,7 +37,7 @@ class FileModel(BaseModel): - file_contents: bytes = Field(exclude=True) + file_contents: bytes file_name: str @field_validator("file_contents", mode="before") From c45ddc1ec25c8f98e835fd59aba4da440f6445e2 Mon Sep 17 00:00:00 2001 From: "Harding, Andrew" Date: Tue, 5 May 2026 15:58:02 -0500 Subject: [PATCH 02/25] -Added support for paginated Redfish GET requests -Added constants file to start maintaining strings which are part of DMTF's Redfish standard --- nodescraper/base/redfishcollectortask.py | 22 +++++++++ nodescraper/connection/redfish/__init__.py | 10 ++++ .../connection/redfish/redfish_connection.py | 48 +++++++++++++++++++ .../connection/redfish/redfish_constants.py | 38 +++++++++++++++ .../connection/redfish/redfish_oem_diag.py | 4 +- .../ooband/redfish_endpoint/collector_args.py | 10 ++++ .../redfish_endpoint/endpoint_collector.py | 41 +++++++++++----- 7 files changed, 158 insertions(+), 15 deletions(-) create mode 100644 nodescraper/connection/redfish/redfish_constants.py diff --git a/nodescraper/base/redfishcollectortask.py b/nodescraper/base/redfishcollectortask.py index b8401213..293b2c7a 100644 --- a/nodescraper/base/redfishcollectortask.py +++ b/nodescraper/base/redfishcollectortask.py @@ -77,3 +77,25 @@ def _run_redfish_get( if log_artifact: self.result.artifacts.append(res) return 
res + + def _run_redfish_get_paged( + self, + path: str, + max_pages: int = 200, + log_artifact: bool = True, + ) -> RedfishGetResult: + """ + Run a Redfish GET and follow Members@odata.nextLink pagination, merging all pages into a single response. + + Args: + path (str): Redfish URI path. + max_pages (int, optional): safety cap on the number of pages to follow. Defaults to 200. + log_artifact (bool, optional): whether we should log the merged result. Defaults to True. + + Returns: + RedfishGetResult: path, success, merged data (or error), status_code. + """ + res = self.connection.run_get_paged(path, max_pages=max_pages) + if log_artifact: + self.result.artifacts.append(res) + return res diff --git a/nodescraper/connection/redfish/__init__.py b/nodescraper/connection/redfish/__init__.py index ee812113..f98faaac 100644 --- a/nodescraper/connection/redfish/__init__.py +++ b/nodescraper/connection/redfish/__init__.py @@ -28,6 +28,12 @@ RedfishConnectionError, RedfishGetResult, ) +from .redfish_constants import ( + RF_MEMBERS, + RF_MEMBERS_COUNT, + RF_MEMBERS_NEXT_LINK, + RF_ODATA_ID, +) from .redfish_manager import RedfishConnectionManager from .redfish_oem_diag import ( collect_oem_diagnostic_data, @@ -45,4 +51,8 @@ "RedfishPath", "collect_oem_diagnostic_data", "get_oem_diagnostic_allowable_values", + "RF_MEMBERS", + "RF_MEMBERS_COUNT", + "RF_MEMBERS_NEXT_LINK", + "RF_ODATA_ID", ] diff --git a/nodescraper/connection/redfish/redfish_connection.py b/nodescraper/connection/redfish/redfish_connection.py index 449b4edb..4398b06f 100644 --- a/nodescraper/connection/redfish/redfish_connection.py +++ b/nodescraper/connection/redfish/redfish_connection.py @@ -34,6 +34,7 @@ from requests import Response from requests.auth import HTTPBasicAuth +from .redfish_constants import RF_MEMBERS, RF_MEMBERS_COUNT, RF_MEMBERS_NEXT_LINK from .redfish_path import RedfishPath DEFAULT_REDFISH_API_ROOT = "redfish/v1" @@ -183,6 +184,53 @@ def run_get(self, path: Union[str, RedfishPath]) -> 
RedfishGetResult: status_code=None, ) + def run_get_paged( + self, + path: Union[str, RedfishPath], + max_pages: int = 200, + ) -> RedfishGetResult: + """Run a Redfish GET and transparently follow Members@odata.nextLink pagination. + + Each subsequent page's Members list is appended to the first page's Members list + so the caller receives a single merged response body. The Members@odata.nextLink key + and Members@odata.count are updated to reflect the merged result. If there is no + Members@odata.nextLink in the first response this behaves identically to run_get. + max_pages is a safety cap on the number of pages to fetch (default 200). + """ + first = self.run_get(path) + if not first.success or first.data is None: + return first + + # Short-circuit when there is nothing to page through. + if RF_MEMBERS_NEXT_LINK not in first.data: + return first + + merged_members: list = list(first.data.get(RF_MEMBERS) or []) + merged_data: dict = dict(first.data) + pages_fetched = 1 + next_link: Optional[str] = first.data.get(RF_MEMBERS_NEXT_LINK) + last_status_code = first.status_code + + while next_link and pages_fetched < max_pages: + page_result = self.run_get(next_link) + last_status_code = page_result.status_code + if not page_result.success or page_result.data is None: + break + merged_members.extend(page_result.data.get(RF_MEMBERS) or []) + next_link = page_result.data.get(RF_MEMBERS_NEXT_LINK) + pages_fetched += 1 + + merged_data[RF_MEMBERS] = merged_members + merged_data[RF_MEMBERS_COUNT] = len(merged_members) + merged_data.pop(RF_MEMBERS_NEXT_LINK, None) + + return RedfishGetResult( + path=first.path, + success=True, + data=merged_data, + status_code=last_status_code, + ) + def copy(self) -> "RedfishConnection": """Return a new connection with the same config and its own session (for concurrent use).""" return RedfishConnection( diff --git a/nodescraper/connection/redfish/redfish_constants.py b/nodescraper/connection/redfish/redfish_constants.py new file mode 100644 
index 00000000..786109cb --- /dev/null +++ b/nodescraper/connection/redfish/redfish_constants.py @@ -0,0 +1,38 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +"""Redfish field name constants shared across the Redfish package(s).""" + +# Key holding the list of member resources in a Redfish collection. +RF_MEMBERS = "Members" + +# Key for the total member count of a Redfish collection. +RF_MEMBERS_COUNT = "Members@odata.count" + +# Key indicating the next page of a paginated Redfish collection. +RF_MEMBERS_NEXT_LINK = "Members@odata.nextLink" + +# Key holding the resource-link in every Redfish resource. 
+RF_ODATA_ID = "@odata.id" diff --git a/nodescraper/connection/redfish/redfish_oem_diag.py b/nodescraper/connection/redfish/redfish_oem_diag.py index 94133337..affabf6e 100644 --- a/nodescraper/connection/redfish/redfish_oem_diag.py +++ b/nodescraper/connection/redfish/redfish_oem_diag.py @@ -38,6 +38,7 @@ from nodescraper.enums import TaskState from .redfish_connection import RedfishConnection, RedfishConnectionError +from .redfish_constants import RF_ODATA_ID from .redfish_path import RedfishPath _module_logger = logging.getLogger(__name__) @@ -65,9 +66,6 @@ def _log_collect_diag_response( ) -# Redfish JSON key for resource link -RF_ODATA_ID = "@odata.id" - # @Redfish.AllowableValues: Redfish annotation for the list of allowable values for a string REDFISH_ANNOTATION_ALLOWABLE_VALUES = "Redfish.AllowableValues" diff --git a/nodescraper/plugins/ooband/redfish_endpoint/collector_args.py b/nodescraper/plugins/ooband/redfish_endpoint/collector_args.py index 662caffa..55bb4269 100644 --- a/nodescraper/plugins/ooband/redfish_endpoint/collector_args.py +++ b/nodescraper/plugins/ooband/redfish_endpoint/collector_args.py @@ -55,6 +55,16 @@ class RedfishEndpointCollectorArgs(BaseModel): le=32, description="Max concurrent GETs (1=sequential). 
Use >1 for async endpoint fetches.", ) + follow_next_link: bool = Field( + default=False, + description="If True, follow Members@odata.nextLink pagination for each URI and merge all pages into a single response.", + ) + max_pages: int = Field( + default=200, + ge=1, + le=10_000, # Some arbitrary value - may need to be revisited + description="When follow_next_link is True: safety cap on the number of pages to follow per URI (default 200).", + ) @field_validator("uris", mode="before") @classmethod diff --git a/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py b/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py index 0a1305a2..e0878c1a 100644 --- a/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py +++ b/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py @@ -29,16 +29,18 @@ from urllib.parse import urlparse from nodescraper.base import RedfishDataCollector -from nodescraper.connection.redfish import RedfishConnection, RedfishGetResult +from nodescraper.connection.redfish import ( + RF_MEMBERS, + RF_ODATA_ID, + RedfishConnection, + RedfishGetResult, +) from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus from nodescraper.models import TaskResult from .collector_args import RedfishEndpointCollectorArgs from .endpoint_data import RedfishEndpointDataModel -ODATA_ID = "@odata.id" -MEMBERS = "Members" - def _normalize_path(odata_id: str, api_root: str) -> str: """Convert @odata.id value (URL or path) to a normalized path under api_root.""" @@ -61,17 +63,17 @@ def _extract_odata_ids(obj: Any) -> list[str]: """Recursively extract all @odata.id values from a Redfish JSON body.""" out: list[str] = [] if isinstance(obj, dict): - if ODATA_ID in obj and isinstance(obj[ODATA_ID], str): - out.append(obj[ODATA_ID]) + if RF_ODATA_ID in obj and isinstance(obj[RF_ODATA_ID], str): + out.append(obj[RF_ODATA_ID]) for k, v in obj.items(): - if k == MEMBERS and isinstance(v, list): + if k == RF_MEMBERS and 
isinstance(v, list): for item in v: if ( isinstance(item, dict) - and ODATA_ID in item - and isinstance(item[ODATA_ID], str) + and RF_ODATA_ID in item + and isinstance(item[RF_ODATA_ID], str) ): - out.append(item[ODATA_ID]) + out.append(item[RF_ODATA_ID]) elif isinstance(v, dict): out.extend(_extract_odata_ids(v)) elif isinstance(v, list): @@ -136,6 +138,13 @@ def _fetch_one(connection_copy: RedfishConnection, path: str) -> RedfishGetResul return connection_copy.run_get(path) +def _fetch_one_paged( + connection_copy: RedfishConnection, path: str, max_pages: int +) -> RedfishGetResult: + """Run a paged GET on a connection copy, following Members@odata.nextLink (used from worker threads).""" + return connection_copy.run_get_paged(path, max_pages=max_pages) + + class RedfishEndpointCollector( RedfishDataCollector[RedfishEndpointDataModel, RedfishEndpointCollectorArgs] ): @@ -199,12 +208,17 @@ def collect_data( max_workers = getattr(args, "max_workers", 1) if args else 1 max_workers = min(max_workers, len(paths)) + follow_next_link = getattr(args, "follow_next_link", False) is True + max_pages = getattr(args, "max_pages", 200) if args else 200 if max_workers <= 1: # Sequential responses = {} for path in paths: - res = self._run_redfish_get(path, log_artifact=True) + if follow_next_link: + res = self._run_redfish_get_paged(path, max_pages=max_pages, log_artifact=True) + else: + res = self._run_redfish_get(path, log_artifact=True) if res.success and res.data is not None: responses[res.path] = res.data else: @@ -222,7 +236,10 @@ def collect_data( futures = {} for path in paths: conn = self.connection.copy() - futures[executor.submit(_fetch_one, conn, path)] = path + if follow_next_link: + futures[executor.submit(_fetch_one_paged, conn, path, max_pages)] = path + else: + futures[executor.submit(_fetch_one, conn, path)] = path for future in as_completed(futures): path = futures[future] try: From 27bd68f9205eb05fae2d26d12315dfda15d7129a Mon Sep 17 00:00:00 2001 From: 
"Harding, Andrew" Date: Tue, 5 May 2026 17:33:20 -0500 Subject: [PATCH 03/25] -Converted Redfish constants to unix format --- .../connection/redfish/redfish_constants.py | 76 +++++++++---------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/nodescraper/connection/redfish/redfish_constants.py b/nodescraper/connection/redfish/redfish_constants.py index 786109cb..5fcc5316 100644 --- a/nodescraper/connection/redfish/redfish_constants.py +++ b/nodescraper/connection/redfish/redfish_constants.py @@ -1,38 +1,38 @@ -############################################################################### -# -# MIT License -# -# Copyright (c) 2026 Advanced Micro Devices, Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -############################################################################### -"""Redfish field name constants shared across the Redfish package(s).""" - -# Key holding the list of member resources in a Redfish collection. 
-RF_MEMBERS = "Members" - -# Key for the total member count of a Redfish collection. -RF_MEMBERS_COUNT = "Members@odata.count" - -# Key indicating the next page of a paginated Redfish collection. -RF_MEMBERS_NEXT_LINK = "Members@odata.nextLink" - -# Key holding the resource-link in every Redfish resource. -RF_ODATA_ID = "@odata.id" +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +"""Redfish field name constants shared across the Redfish package(s).""" + +# Key holding the list of member resources in a Redfish collection. +RF_MEMBERS = "Members" + +# Key for the total member count of a Redfish collection. +RF_MEMBERS_COUNT = "Members@odata.count" + +# Key indicating the next page of a paginated Redfish collection. 
+RF_MEMBERS_NEXT_LINK = "Members@odata.nextLink" + +# Key holding the resource-link in every Redfish resource. +RF_ODATA_ID = "@odata.id" From 60c5059e4ff3272ebc2180c2b9a8ea6579de0660 Mon Sep 17 00:00:00 2001 From: "Harding, Andrew" Date: Wed, 6 May 2026 09:54:57 -0500 Subject: [PATCH 04/25] -Added follow_next_link and max_pages to README --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 27c14577..2fda59b4 100644 --- a/README.md +++ b/README.md @@ -491,7 +491,12 @@ The RedfishEndpointPlugin collects Redfish URIs (GET responses) and optionally r } ``` +**`collection_args`** - **`uris`**: List of Redfish paths (e.g. `/redfish/v1/`, `/redfish/v1/Systems/1`) to GET and store. +- **`follow_next_link`**: Optional (default `false`). When `true`, the collector follows `Members@odata.nextLink` pagination for each URI and merges all pages into a single response. +- **`max_pages`**: Optional (default `200`). Safety cap on the number of pages to follow per URI when `follow_next_link` is enabled. + +**`analysis_args`** - **`checks`**: Optional. Map of URI to expected values or constraints for analysis. Supports exact match (e.g. `"PowerState": "On"`), `anyOf`, `min`/`max`, etc. 
#### **'summary' sub command** From ade28002e3adc094a218cdb9a69332ff74db1758 Mon Sep 17 00:00:00 2001 From: "Harding, Andrew" Date: Wed, 6 May 2026 15:49:20 -0500 Subject: [PATCH 05/25] -Updated const comments --- nodescraper/connection/redfish/redfish_constants.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/nodescraper/connection/redfish/redfish_constants.py b/nodescraper/connection/redfish/redfish_constants.py index 5fcc5316..d27f0223 100644 --- a/nodescraper/connection/redfish/redfish_constants.py +++ b/nodescraper/connection/redfish/redfish_constants.py @@ -25,14 +25,14 @@ ############################################################################### """Redfish field name constants shared across the Redfish package(s).""" -# Key holding the list of member resources in a Redfish collection. +# Resource collection property which identifies members of the collection. RF_MEMBERS = "Members" -# Key for the total member count of a Redfish collection. +# Resource collection property which defines the total number of resources/members. RF_MEMBERS_COUNT = "Members@odata.count" -# Key indicating the next page of a paginated Redfish collection. +# Resource collection property which points to the next set of partial members from the originating operation. RF_MEMBERS_NEXT_LINK = "Members@odata.nextLink" -# Key holding the resource-link in every Redfish resource. +# Resource identifier property (optional for registry resources, required for all other resources and resource collections). 
RF_ODATA_ID = "@odata.id" From ff7dbc11575ecc5ac87c9d2b66486a0b32cba190 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 7 May 2026 10:48:59 -0500 Subject: [PATCH 06/25] updates --- nodescraper/base/inbandcollectortask.py | 3 +++ nodescraper/base/redfishcollectortask.py | 3 +++ nodescraper/cli/cli.py | 2 +- nodescraper/cli/host_cli_embed.py | 7 +------ nodescraper/constants.py | 2 ++ nodescraper/interfaces/connectionmanager.py | 3 +++ nodescraper/interfaces/datacollectortask.py | 5 ++++- nodescraper/interfaces/dataplugin.py | 6 ++++++ nodescraper/interfaces/plugin.py | 6 +++++- nodescraper/interfaces/task.py | 5 ++++- nodescraper/models/event.py | 3 ++- 11 files changed, 34 insertions(+), 11 deletions(-) diff --git a/nodescraper/base/inbandcollectortask.py b/nodescraper/base/inbandcollectortask.py index 16039bda..3b2d072f 100644 --- a/nodescraper/base/inbandcollectortask.py +++ b/nodescraper/base/inbandcollectortask.py @@ -28,6 +28,7 @@ from nodescraper.connection.inband import InBandConnection from nodescraper.connection.inband.inband import BaseFileArtifact, CommandArtifact +from nodescraper.constants import DEFAULT_EVENT_REPORTER from nodescraper.enums import EventPriority, OSFamily, SystemInteractionLevel from nodescraper.generictypes import TCollectArg, TDataModel from nodescraper.interfaces import DataCollector, TaskResultHook @@ -52,6 +53,7 @@ def __init__( max_event_priority_level: Union[EventPriority, str] = EventPriority.CRITICAL, parent: Optional[str] = None, task_result_hooks: Optional[list[TaskResultHook]] = None, + event_reporter: str = DEFAULT_EVENT_REPORTER, **kwargs, ): super().__init__( @@ -62,6 +64,7 @@ def __init__( connection=connection, parent=parent, task_result_hooks=task_result_hooks, + event_reporter=event_reporter, ) if self.system_info.os_family not in self.SUPPORTED_OS_FAMILY: raise SystemCompatibilityError( diff --git a/nodescraper/base/redfishcollectortask.py b/nodescraper/base/redfishcollectortask.py index b8401213..73b4228f 
100644 --- a/nodescraper/base/redfishcollectortask.py +++ b/nodescraper/base/redfishcollectortask.py @@ -27,6 +27,7 @@ from typing import Generic, Optional, Union from nodescraper.connection.redfish import RedfishConnection, RedfishGetResult +from nodescraper.constants import DEFAULT_EVENT_REPORTER from nodescraper.enums import EventPriority from nodescraper.generictypes import TCollectArg, TDataModel from nodescraper.interfaces import DataCollector, TaskResultHook @@ -47,6 +48,7 @@ def __init__( max_event_priority_level: Union[EventPriority, str] = EventPriority.CRITICAL, parent: Optional[str] = None, task_result_hooks: Optional[list[TaskResultHook]] = None, + event_reporter: str = DEFAULT_EVENT_REPORTER, **kwargs, ): super().__init__( @@ -56,6 +58,7 @@ def __init__( max_event_priority_level=max_event_priority_level, parent=parent, task_result_hooks=task_result_hooks, + event_reporter=event_reporter, **kwargs, ) diff --git a/nodescraper/cli/cli.py b/nodescraper/cli/cli.py index 054c2c5b..16c639bb 100644 --- a/nodescraper/cli/cli.py +++ b/nodescraper/cli/cli.py @@ -200,7 +200,7 @@ def _add_cli_root_globals( def build_global_argument_parser(*, add_help: bool = True) -> argparse.ArgumentParser: - """Globals only (no subcommands), for host CLIs such as amd-error-scraper ``error-scraper``.""" + """Globals only (no subcommands), for host CLIs.""" plugin_reg = PluginRegistry() config_reg = _config_registry_with_all_plugins(plugin_reg) parser = argparse.ArgumentParser( diff --git a/nodescraper/cli/host_cli_embed.py b/nodescraper/cli/host_cli_embed.py index bffeb378..864639b6 100644 --- a/nodescraper/cli/host_cli_embed.py +++ b/nodescraper/cli/host_cli_embed.py @@ -39,12 +39,7 @@ def apply_host_cli_args_to_parsed_args( parsed_args: argparse.Namespace, host_ns: Optional[argparse.Namespace], ) -> None: - """Copy host profile fields from an embedding host onto parsed top-level args. - - Used when ``main(..., host_cli_args=...)`` is invoked (e.g. 
from the - error-scraper wrapper) so ``--connection-config`` profile data loaded by the - host is visible to :func:`get_system_info` and the rest of the CLI. - """ + """Copy host profile fields from an embedding host onto parsed top-level args.""" if host_ns is None: return for attr in ( diff --git a/nodescraper/constants.py b/nodescraper/constants.py index 8769d5a2..981f0827 100644 --- a/nodescraper/constants.py +++ b/nodescraper/constants.py @@ -24,3 +24,5 @@ # ############################################################################### DEFAULT_LOGGER = "nodescraper" + +DEFAULT_EVENT_REPORTER = "NODE_SCRAPER" diff --git a/nodescraper/interfaces/connectionmanager.py b/nodescraper/interfaces/connectionmanager.py index ccb5e793..f735bd6f 100644 --- a/nodescraper/interfaces/connectionmanager.py +++ b/nodescraper/interfaces/connectionmanager.py @@ -33,6 +33,7 @@ from pydantic import BaseModel +from nodescraper.constants import DEFAULT_EVENT_REPORTER from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus from nodescraper.models import SystemInfo, TaskResult from nodescraper.typeutils import TypeUtils @@ -93,6 +94,7 @@ def __init__( parent: Optional[str] = None, task_result_hooks: Optional[list[TaskResultHook], None] = None, connection_args: Optional[Union[TConnectArg, dict]] = None, + event_reporter: str = DEFAULT_EVENT_REPORTER, **kwargs, ): super().__init__( @@ -101,6 +103,7 @@ def __init__( max_event_priority_level=max_event_priority_level, parent="connection" if not parent else parent, task_result_hooks=task_result_hooks, + event_reporter=event_reporter, **kwargs, ) diff --git a/nodescraper/interfaces/datacollectortask.py b/nodescraper/interfaces/datacollectortask.py index 020bf053..9c7cea09 100644 --- a/nodescraper/interfaces/datacollectortask.py +++ b/nodescraper/interfaces/datacollectortask.py @@ -32,6 +32,7 @@ from pydantic import BaseModel, ValidationError +from nodescraper.constants import DEFAULT_EVENT_REPORTER from nodescraper.enums 
import ( EventCategory, EventPriority, @@ -144,6 +145,7 @@ def __init__( max_event_priority_level: Union[EventPriority, str] = EventPriority.CRITICAL, parent: Optional[str] = None, task_result_hooks: Optional[list[TaskResultHook]] = None, + event_reporter: str = DEFAULT_EVENT_REPORTER, **kwargs, ): """data collector init function @@ -151,7 +153,7 @@ def __init__( Args: system_info (SystemInfo): system info object for target system for data collection system_interaction (SystemInteraction): enum to indicate the type of actions that can be performed when interacting with the system - event_reporter (str, optional): Described the reporter of the event. Defaults to DEFAULT_EVENT_REPORTER. + event_reporter (str, optional): Reporter string stored on emitted events. Defaults to DEFAULT_EVENT_REPORTER. logger (Optional[logging.Logger], optional): python logger object. Defaults to None. log_path (Optional[str], optional): file system log path. Defaults to None. """ @@ -161,6 +163,7 @@ def __init__( max_event_priority_level=max_event_priority_level, parent=parent, task_result_hooks=task_result_hooks, + event_reporter=event_reporter, ) if isinstance(system_interaction_level, str): diff --git a/nodescraper/interfaces/dataplugin.py b/nodescraper/interfaces/dataplugin.py index ed632fb4..e86cb578 100644 --- a/nodescraper/interfaces/dataplugin.py +++ b/nodescraper/interfaces/dataplugin.py @@ -31,6 +31,7 @@ from pydantic import Field +from nodescraper.constants import DEFAULT_EVENT_REPORTER from nodescraper.enums import EventPriority, ExecutionStatus, SystemInteractionLevel from nodescraper.generictypes import TAnalyzeArg, TCollectArg, TDataModel from nodescraper.interfaces.dataanalyzertask import DataAnalyzer @@ -74,6 +75,7 @@ def __init__( connection_args: Optional[Union[TConnectArg, dict]] = None, task_result_hooks: Optional[list[TaskResultHook]] = None, log_path: Optional[str] = None, + event_reporter: str = DEFAULT_EVENT_REPORTER, **kwargs, ): super().__init__( @@ -83,6 +85,7 
@@ def __init__( connection_args, task_result_hooks, log_path, + event_reporter=event_reporter, **kwargs, ) self._validate_class_var() @@ -186,6 +189,7 @@ def collect( logger=self.logger, parent=self.__class__.__name__, task_result_hooks=self.task_result_hooks, + event_reporter=self.event_reporter, ) if ( @@ -219,6 +223,7 @@ def collect( parent=self.__class__.__name__, task_result_hooks=self.task_result_hooks, log_path=self.log_path, + event_reporter=self.event_reporter, ) self.collection_result, self._data = collection_task.collect_data(collection_args) @@ -293,6 +298,7 @@ def analyze( max_event_priority_level=max_event_priority_level, parent=self.__class__.__name__, task_result_hooks=self.task_result_hooks, + event_reporter=self.event_reporter, ) self.analysis_result = analyzer_task.analyze_data(self.data, analysis_args) return self.analysis_result diff --git a/nodescraper/interfaces/plugin.py b/nodescraper/interfaces/plugin.py index 0194ef2d..d7ef4d55 100644 --- a/nodescraper/interfaces/plugin.py +++ b/nodescraper/interfaces/plugin.py @@ -28,7 +28,7 @@ import logging from typing import Callable, Generic, Optional, Type, Union -from nodescraper.constants import DEFAULT_LOGGER +from nodescraper.constants import DEFAULT_EVENT_REPORTER, DEFAULT_LOGGER from nodescraper.models import PluginResult, SystemInfo from nodescraper.taskresulthooks.filesystemloghook import FileSystemLogHook @@ -50,6 +50,7 @@ def __init__( task_result_hooks: Optional[list[TaskResultHook]] = None, log_path: Optional[str] = None, queue_callback: Optional[Callable] = None, + event_reporter: str = DEFAULT_EVENT_REPORTER, **kwargs, ): """Initialize plugin @@ -86,6 +87,8 @@ def __init__( self.queue_callback = queue_callback + self.event_reporter = event_reporter + self.connection_manager = connection_manager if connection_args and self.CONNECTION_TYPE and not self.connection_manager: @@ -95,6 +98,7 @@ def __init__( connection_args=connection_args, parent=self.__class__.__name__, 
task_result_hooks=self.task_result_hooks, + event_reporter=event_reporter, ) @classmethod diff --git a/nodescraper/interfaces/task.py b/nodescraper/interfaces/task.py index 16d1a70b..3704e437 100644 --- a/nodescraper/interfaces/task.py +++ b/nodescraper/interfaces/task.py @@ -29,7 +29,7 @@ import logging from typing import Any, Optional, Union -from nodescraper.constants import DEFAULT_LOGGER +from nodescraper.constants import DEFAULT_EVENT_REPORTER, DEFAULT_LOGGER from nodescraper.enums import EventCategory, EventPriority from nodescraper.models import Event, SystemInfo, TaskResult @@ -54,12 +54,14 @@ def __init__( max_event_priority_level: Union[EventPriority, str] = EventPriority.CRITICAL, parent: Optional[str] = None, task_result_hooks: Optional[list[TaskResultHook]] = None, + event_reporter: str = DEFAULT_EVENT_REPORTER, **kwargs: dict[str, Any], ): if logger is None: logger = logging.getLogger(DEFAULT_LOGGER) self.system_info = system_info self.logger = logger + self.event_reporter = event_reporter self.max_event_priority_level = max_event_priority_level self.parent = parent if not task_result_hooks: @@ -122,6 +124,7 @@ def _build_event( priority = self.max_event_priority_level event = Event( + reporter=self.event_reporter, category=category, description=description, priority=priority, diff --git a/nodescraper/models/event.py b/nodescraper/models/event.py index 33cf2801..ebf92125 100644 --- a/nodescraper/models/event.py +++ b/nodescraper/models/event.py @@ -32,6 +32,7 @@ from pydantic import BaseModel, Field, field_serializer, field_validator +from nodescraper.constants import DEFAULT_EVENT_REPORTER from nodescraper.enums import EventPriority @@ -66,7 +67,7 @@ class Event(BaseModel): timestamp: datetime.datetime = Field( default_factory=lambda: datetime.datetime.now(datetime.timezone.utc) ) - reporter: str = "NODE_SCRAPER" + reporter: str = DEFAULT_EVENT_REPORTER category: str description: str data: dict = Field(default_factory=dict) From 
89d3ffcdb12669f074480ddd73986908a05f6995 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 7 May 2026 14:01:51 -0500 Subject: [PATCH 07/25] updates --- nodescraper/models/__init__.py | 12 ++- nodescraper/models/event.py | 30 ++++-- nodescraper/models/priority_override.py | 126 ++++++++++++++++++++++++ test/unit/framework/test_dataplugin.py | 1 + test/unit/test_serialization.py | 75 ++++++++++++++ 5 files changed, 237 insertions(+), 7 deletions(-) create mode 100644 nodescraper/models/priority_override.py create mode 100644 test/unit/test_serialization.py diff --git a/nodescraper/models/__init__.py b/nodescraper/models/__init__.py index af9673c1..6b7ebb00 100644 --- a/nodescraper/models/__init__.py +++ b/nodescraper/models/__init__.py @@ -25,11 +25,16 @@ ############################################################################### from .analyzerargs import AnalyzerArgs from .collectorargs import CollectorArgs -from .datamodel import DataModel +from .datamodel import DataModel, FileModel, TDataModel from .datapluginresult import DataPluginResult from .event import Event from .pluginconfig import PluginConfig from .pluginresult import PluginResult +from .priority_override import ( + NO_CHANGE, + PriorityOverrideRule, + apply_priority_override_rules, +) from .systeminfo import SystemInfo from .taskresult import TaskResult from .timerangeargs import TimeRangeAnalysisArgs @@ -38,11 +43,16 @@ "AnalyzerArgs", "CollectorArgs", "DataModel", + "FileModel", + "TDataModel", "TaskResult", "Event", "SystemInfo", "PluginResult", "DataPluginResult", "PluginConfig", + "NO_CHANGE", + "PriorityOverrideRule", + "apply_priority_override_rules", "TimeRangeAnalysisArgs", ] diff --git a/nodescraper/models/event.py b/nodescraper/models/event.py index ebf92125..cc084d7c 100644 --- a/nodescraper/models/event.py +++ b/nodescraper/models/event.py @@ -114,15 +114,30 @@ def validate_category(cls, category: Optional[Union[str, Enum]]) -> str: @field_validator("priority", mode="before") 
@classmethod - def validate_priority(cls, priority: Union[str, EventPriority]) -> EventPriority: - """Allow priority to be set via string priority name + def validate_priority(cls, priority: Union[str, int, EventPriority]) -> EventPriority: + """Allow priority via :class:`EventPriority`, name string, or integer value. + + Integer values use :class:`~enum.IntEnum` construction (same numeric scale as + ``EventPriority``). Values outside the enum (e.g. foreign severity codes) map + to :attr:`EventPriority.ERROR`. Booleans are rejected (``bool`` is a subclass + of ``int`` in Python). + Args: - priority (Union[str, EventPriority]): event priority string or enum + priority: Enum, member name, or integer severity. + Raises: - ValueError: if priority string is an invalid value + ValueError: if *priority* is a boolean or an invalid string name. + Returns: - EventPriority: priority enum + Resolved :class:`EventPriority`. """ + if isinstance(priority, bool): + raise ValueError("priority must not be a boolean") + if isinstance(priority, int): + try: + return EventPriority(priority) + except ValueError: + return EventPriority.ERROR if isinstance(priority, str): try: return getattr(EventPriority, priority.upper()) @@ -132,7 +147,10 @@ def validate_priority(cls, priority: Union[str, EventPriority]) -> EventPriority ) from e if isinstance(priority, EventPriority): return priority - raise ValueError("priority must be an EventPriority or its name as a string") + raise ValueError( + "priority must be an EventPriority, its name as a string, or an int " + "(unknown ints map to ERROR)" + ) @field_serializer("priority") def serialize_priority(self, priority: EventPriority, _info) -> str: diff --git a/nodescraper/models/priority_override.py b/nodescraper/models/priority_override.py new file mode 100644 index 00000000..7f40aac7 --- /dev/null +++ b/nodescraper/models/priority_override.py @@ -0,0 +1,126 @@ +############################################################################### +# +# 
MIT License +# +# Copyright (C) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### + +"""First-match-wins priority override rules for :class:`~nodescraper.models.event.Event`.""" + +from __future__ import annotations + +import re +from typing import Optional + +from pydantic import BaseModel, ConfigDict, field_validator, model_validator + +from nodescraper.enums import EventPriority +from nodescraper.models.event import Event + +__all__ = [ + "NO_CHANGE", + "PriorityOverrideRule", + "apply_priority_override_rules", +] + +NO_CHANGE = "NO_CHANGE" + + +def _normalize_category(category: str) -> str: + category = str(category).strip().upper() + return re.sub(r"[\s-]", "_", category) + + +class PriorityOverrideRule(BaseModel): + """One override rule; first matching rule wins when applied to an event list.""" + + model_config = ConfigDict(extra="forbid", str_strip_whitespace=True) + + match_all: bool = False + new_priority: str + description: Optional[str] = None + message: Optional[str] = None + event_category: Optional[str] = None + + @field_validator("new_priority", mode="before") + @classmethod + def _validate_new_priority_token(cls, value: object) -> str: + if value is None: + raise ValueError("new_priority is required") + if value == NO_CHANGE: + return NO_CHANGE + if isinstance(value, EventPriority): + return value.name + if not isinstance(value, str): + raise ValueError("new_priority must be a string or EventPriority") + upper = value.upper() + if upper == NO_CHANGE: + return NO_CHANGE + if upper not in {p.name for p in EventPriority}: + raise ValueError( + f"new_priority must be {NO_CHANGE} or one of " f"{[p.name for p in EventPriority]}" + ) + return upper + + @model_validator(mode="after") + def _require_match_all_or_selectors(self) -> PriorityOverrideRule: + if self.match_all: + return self + if self.description is None and self.message is None and self.event_category is None: + raise ValueError( + "set match_all=True or provide at least one selector among " 
+ "description, message, and event_category" + ) + return self + + def matches_event(self, event: Event) -> bool: + """Return True if this rule applies to *event*.""" + if self.match_all: + return True + if self.description is not None and event.description != self.description: + return False + if self.message is not None: + match_content = event.data.get("match_content", "") + if not isinstance(match_content, str): + match_content = str(match_content) + if self.message not in match_content and self.message not in event.description: + return False + if self.event_category is not None: + if _normalize_category(self.event_category) != _normalize_category(event.category): + return False + return True + + +def apply_priority_override_rules(events: list[Event], rules: list[dict]) -> None: + """Apply *rules* in order to each event in *events* (in place); first match wins. + + ``new_priority`` may be :data:`NO_CHANGE` to keep the current priority while still + stopping further rules for that event. 
+ """ + parsed = [PriorityOverrideRule.model_validate(r) for r in rules] + for event in events: + for rule in parsed: + if not rule.matches_event(event): + continue + if rule.new_priority != NO_CHANGE: + event.priority = EventPriority[rule.new_priority] + break diff --git a/test/unit/framework/test_dataplugin.py b/test/unit/framework/test_dataplugin.py index c6e5cb48..a220cfe6 100644 --- a/test/unit/framework/test_dataplugin.py +++ b/test/unit/framework/test_dataplugin.py @@ -151,6 +151,7 @@ def test_collect_creates_connection_manager(self, plugin, conn_mock, system_info logger=plugin.logger, parent=plugin.__class__.__name__, task_result_hooks=plugin.task_result_hooks, + event_reporter=plugin.event_reporter, ) mock_collect.assert_called_once() assert result.status == ExecutionStatus.OK diff --git a/test/unit/test_serialization.py b/test/unit/test_serialization.py new file mode 100644 index 00000000..a4e009b3 --- /dev/null +++ b/test/unit/test_serialization.py @@ -0,0 +1,75 @@ +############################################################################### +# +# MIT License +# +# Copyright (C) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### + +import pytest +from pydantic import BaseModel + +from nodescraper.serialization import safe_dump_to_json_dict + + +class _Sample(BaseModel): + a: int + b: str = "x" + + +def test_safe_dump_to_json_dict_round_trip() -> None: + m = _Sample(a=7) + d = safe_dump_to_json_dict(m) + assert d == {"a": 7, "b": "x"} + + +def test_safe_dump_to_json_dict_exclude() -> None: + m = _Sample(a=7) + d = safe_dump_to_json_dict(m, exclude={"b"}) + assert d == {"a": 7} + + +def test_safe_dump_falls_back_when_model_dump_json_fails(monkeypatch: pytest.MonkeyPatch) -> None: + m = _Sample(a=3) + + def _boom(self, **kwargs): + raise RuntimeError("json path failed") + + monkeypatch.setattr(_Sample, "model_dump_json", _boom) + d = safe_dump_to_json_dict(m) + assert d == {"a": 3, "b": "x"} + + +def test_safe_dump_chains_when_both_fail(monkeypatch: pytest.MonkeyPatch) -> None: + m = _Sample(a=1) + + def _boom_json(self, **kwargs): + raise RuntimeError("first") + + def _boom_dump(self, **kwargs): + raise RuntimeError("second") + + monkeypatch.setattr(_Sample, "model_dump_json", _boom_json) + monkeypatch.setattr(_Sample, "model_dump", _boom_dump) + with pytest.raises(RuntimeError, match="second") as exc_info: + safe_dump_to_json_dict(m) + assert exc_info.value.__cause__ is not None + assert "first" in str(exc_info.value.__cause__) From eae6ddaac75e9bd7bbb778d8b967684837b7ccd8 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 7 May 2026 15:06:38 -0500 Subject: [PATCH 08/25] added serialization --- nodescraper/serialization.py | 72 ++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 
nodescraper/serialization.py diff --git a/nodescraper/serialization.py b/nodescraper/serialization.py new file mode 100644 index 00000000..64d4f19e --- /dev/null +++ b/nodescraper/serialization.py @@ -0,0 +1,72 @@ +############################################################################### +# +# MIT License +# +# Copyright (C) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### + +from __future__ import annotations + +import json +from typing import Any, cast + +from pydantic import BaseModel + +__all__ = ["safe_dump_to_json_dict"] + + +def safe_dump_to_json_dict( + model: BaseModel, + *, + exclude: set[str] | frozenset[str] | None = None, + by_alias: bool = True, +) -> dict[str, Any]: + """Best-effort JSON-like ``dict`` from a Pydantic model. + + Args: + model: Model instance to export. 
+ exclude: Field names to omit (same shape as Pydantic ``exclude`` for sets). + by_alias: When ``True``, use field aliases in the output. + + Returns: + A plain ``dict`` suitable for JSON tools and schema validators. + """ + ex: set[str] | frozenset[str] | None = exclude + ex_inc = cast(Any, ex) + try: + raw = model.model_dump_json( + by_alias=by_alias, + exclude=ex_inc, + serialize_as_any=True, + ) + return json.loads(raw) + except Exception as first_exc: + try: + dumped = model.model_dump( + mode="python", + by_alias=by_alias, + exclude=ex_inc, + serialize_as_any=True, + ) + except Exception as second_exc: + raise second_exc from first_exc + return dumped From e6f5a823a684340c48bb675fbb3c8d9dc1fecb44 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 8 May 2026 14:02:19 -0500 Subject: [PATCH 09/25] appending exception traceback --- nodescraper/interfaces/connectionmanager.py | 3 +++ nodescraper/interfaces/datacollectortask.py | 8 ++++++++ nodescraper/interfaces/dataplugin.py | 5 +++++ nodescraper/interfaces/task.py | 17 ++++++++++++++++- nodescraper/utils.py | 10 ++++++---- 5 files changed, 38 insertions(+), 5 deletions(-) diff --git a/nodescraper/interfaces/connectionmanager.py b/nodescraper/interfaces/connectionmanager.py index f735bd6f..bcd04c80 100644 --- a/nodescraper/interfaces/connectionmanager.py +++ b/nodescraper/interfaces/connectionmanager.py @@ -64,6 +64,9 @@ def wrapper( priority=EventPriority.CRITICAL, console_log=True, ) + connection_manager.logger.exception( + "Exception connecting with %s", connection_manager.__class__.__name__ + ) connection_manager.result.status = ExecutionStatus.EXECUTION_FAILURE result = connection_manager.result diff --git a/nodescraper/interfaces/datacollectortask.py b/nodescraper/interfaces/datacollectortask.py index 9c7cea09..6cc21181 100644 --- a/nodescraper/interfaces/datacollectortask.py +++ b/nodescraper/interfaces/datacollectortask.py @@ -97,6 +97,11 @@ def wrapper( priority=EventPriority.CRITICAL, 
console_log=True, ) + collector.logger.error( + "Pydantic validation error in data collector %s: %s", + collector.__class__.__name__, + exception.errors(include_url=False), + ) else: collector._log_event( category=EventCategory.RUNTIME, @@ -105,6 +110,9 @@ def wrapper( priority=EventPriority.CRITICAL, console_log=True, ) + collector.logger.exception( + "Exception in data collector %s", collector.__class__.__name__ + ) collector.result.status = ExecutionStatus.EXECUTION_FAILURE result = collector.result data = None diff --git a/nodescraper/interfaces/dataplugin.py b/nodescraper/interfaces/dataplugin.py index e86cb578..9ff31d70 100644 --- a/nodescraper/interfaces/dataplugin.py +++ b/nodescraper/interfaces/dataplugin.py @@ -235,6 +235,11 @@ def collect( message=str(e), ) except Exception as e: + self.logger.exception( + "Unhandled exception running collector %s for plugin %s", + self.COLLECTOR.__name__, + self.__class__.__name__, + ) self.collection_result = TaskResult( task=self.COLLECTOR.__name__, parent=self.__class__.__name__, diff --git a/nodescraper/interfaces/task.py b/nodescraper/interfaces/task.py index 3704e437..79b4a389 100644 --- a/nodescraper/interfaces/task.py +++ b/nodescraper/interfaces/task.py @@ -154,7 +154,22 @@ def _log_event( ) if console_log: - self.logger.log(getattr(logging, priority.name, logging.INFO), description) + level = getattr(logging, priority.name, logging.INFO) + prefix = "" + if data: + et = data.get("exception_type") + if et: + prefix = f"[{et}] " + self.logger.log(level, "%s%s", prefix, description) + if data: + tb = data.get("traceback") + if tb: + tb_text = "".join(tb) if isinstance(tb, list) else str(tb) + if tb_text.strip(): + self.logger.log(level, "Traceback:\n%s", tb_text.rstrip()) + det = data.get("details") + if det and not tb: + self.logger.log(level, "Details: %s", det) self.result.events.append(event) diff --git a/nodescraper/utils.py b/nodescraper/utils.py index 3b9edf34..e7a201b8 100644 --- a/nodescraper/utils.py +++ 
b/nodescraper/utils.py @@ -53,18 +53,20 @@ def _generate_next_value_(name, start, count, last_values): return name -def get_exception_traceback(exception: Exception) -> dict: +def get_exception_traceback(exception: BaseException) -> dict: """get traceback and exception type from an exception Args: - exception (Exception): exception + exception (BaseException): exception Returns: - dict: exception details dict + dict: exception details dict (traceback is full format_exception lines, not frames only) """ return { "exception_type": type(exception).__name__, - "traceback": traceback.format_tb(exception.__traceback__), + "traceback": traceback.format_exception( + type(exception), exception, exception.__traceback__ + ), } From 1e7ef9c20409c1c123b0336f9069b3e27e7925e0 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 8 May 2026 15:57:52 -0500 Subject: [PATCH 10/25] utest --- .../redfish/test_redfish_connection_paging.py | 160 ++++++++++++++++++ .../plugin/test_redfish_endpoint_collector.py | 110 ++++++++++++ 2 files changed, 270 insertions(+) create mode 100644 test/unit/connection/redfish/test_redfish_connection_paging.py diff --git a/test/unit/connection/redfish/test_redfish_connection_paging.py b/test/unit/connection/redfish/test_redfish_connection_paging.py new file mode 100644 index 00000000..f922709c --- /dev/null +++ b/test/unit/connection/redfish/test_redfish_connection_paging.py @@ -0,0 +1,160 @@ +############################################################################### +# +# MIT License +# +# Copyright (C) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from unittest.mock import patch + +import pytest + +from nodescraper.connection.redfish import ( + RF_MEMBERS, + RF_MEMBERS_COUNT, + RF_MEMBERS_NEXT_LINK, + RedfishConnection, + RedfishGetResult, +) + + +@pytest.fixture +def rf_conn() -> RedfishConnection: + return RedfishConnection( + base_url="https://bmc.example", + username="u", + password="p", + verify_ssl=False, + ) + + +def test_run_get_paged_no_next_link_returns_first_unchanged(rf_conn: RedfishConnection) -> None: + first_body = {RF_MEMBERS: [{"x": 1}], "Name": "Col"} + first = RedfishGetResult( + path="/redfish/v1/Systems", + success=True, + data=first_body, + status_code=200, + ) + with patch.object(rf_conn, "run_get", return_value=first) as mock_get: + out = rf_conn.run_get_paged("/redfish/v1/Systems") + + mock_get.assert_called_once() + assert out.success is True + assert out.data == first_body + assert RF_MEMBERS_NEXT_LINK not in out.data + + +def test_run_get_paged_merges_members_and_strips_next_link(rf_conn: RedfishConnection) -> None: + page1 = { + RF_MEMBERS: [{"@odata.id": "/1"}], + RF_MEMBERS_NEXT_LINK: "/redfish/v1/Systems?$skip=1", + f"{RF_MEMBERS}@odata.count": 99, + } + page2 = { + RF_MEMBERS: [{"@odata.id": "/2"}], + } + + def fake_get(path: str) -> RedfishGetResult: + p = str(path).strip() + if not p.startswith("/"): + p = "/" + p + if "skip" not in p: + return RedfishGetResult( + path="/redfish/v1/Systems", success=True, data=page1, status_code=200 + ) + return RedfishGetResult(path=p, success=True, data=page2, status_code=200) + + with patch.object(rf_conn, "run_get", side_effect=fake_get): + out = rf_conn.run_get_paged("/redfish/v1/Systems", max_pages=10) + + assert out.success is True + assert out.path == "/redfish/v1/Systems" + assert out.data is not None + assert out.data[RF_MEMBERS] == [{"@odata.id": "/1"}, {"@odata.id": "/2"}] + assert out.data[RF_MEMBERS_COUNT] == 2 + assert RF_MEMBERS_NEXT_LINK 
not in out.data + + +def test_run_get_paged_stops_on_followup_failure_keeps_partial_merge( + rf_conn: RedfishConnection, +) -> None: + page1 = { + RF_MEMBERS: [{"@odata.id": "/1"}], + RF_MEMBERS_NEXT_LINK: "/next", + } + + def fake_get(path: str) -> RedfishGetResult: + ps = str(path) + if "next" not in ps: + return RedfishGetResult(path="/col", success=True, data=page1, status_code=200) + return RedfishGetResult(path="/next", success=False, error="timeout", status_code=None) + + with patch.object(rf_conn, "run_get", side_effect=fake_get): + out = rf_conn.run_get_paged("/col") + + assert out.success is True + assert out.data is not None + assert out.data[RF_MEMBERS] == [{"@odata.id": "/1"}] + assert RF_MEMBERS_NEXT_LINK not in out.data + + +def test_run_get_paged_respects_max_pages(rf_conn: RedfishConnection) -> None: + """max_pages=2 allows initial GET plus one nextLink follow only.""" + + def body_with_next(mid: str) -> dict: + return { + RF_MEMBERS: [{"id": mid}], + RF_MEMBERS_NEXT_LINK: "/page2", + } + + calls: list[str] = [] + + def fake_get(path: str) -> RedfishGetResult: + calls.append(str(path)) + ps = str(path) + if len(calls) == 1: + return RedfishGetResult( + path="/start", success=True, data=body_with_next("a"), status_code=200 + ) + return RedfishGetResult(path=ps, success=True, data=body_with_next("b"), status_code=200) + + with patch.object(rf_conn, "run_get", side_effect=fake_get): + out = rf_conn.run_get_paged("/start", max_pages=2) + + assert len(calls) == 2 + assert out.data is not None + assert len(out.data[RF_MEMBERS]) == 2 + assert RF_MEMBERS_NEXT_LINK not in out.data + + +def test_run_get_paged_first_request_failure_passthrough(rf_conn: RedfishConnection) -> None: + err = RedfishGetResult( + path="/redfish/v1/Bad", + success=False, + error="nope", + status_code=404, + ) + with patch.object(rf_conn, "run_get", return_value=err): + out = rf_conn.run_get_paged("/redfish/v1/Bad") + + assert out is err + assert out.success is False diff --git 
a/test/unit/plugin/test_redfish_endpoint_collector.py b/test/unit/plugin/test_redfish_endpoint_collector.py index 7a786409..501a0d8b 100644 --- a/test/unit/plugin/test_redfish_endpoint_collector.py +++ b/test/unit/plugin/test_redfish_endpoint_collector.py @@ -237,6 +237,116 @@ def test_fetch_one_calls_run_get(): assert out.path == "/redfish/v1" +def test_fetch_one_paged_calls_run_get_paged(): + conn = MagicMock() + conn.run_get_paged.return_value = RedfishGetResult( + path="/redfish/v1/Systems", + success=True, + data={"Members": []}, + status_code=200, + ) + out = ec._fetch_one_paged(conn, "/redfish/v1/Systems", max_pages=42) + conn.run_get_paged.assert_called_once_with("/redfish/v1/Systems", max_pages=42) + assert out.success is True + + +def test_run_redfish_get_paged_appends_artifact_when_enabled( + redfish_endpoint_collector, redfish_conn_mock +): + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path="/redfish/v1/Systems", + success=True, + data={"Members": []}, + status_code=200, + ) + redfish_endpoint_collector.result.artifacts.clear() + res = redfish_endpoint_collector._run_redfish_get_paged( + "/redfish/v1/Systems", max_pages=5, log_artifact=True + ) + redfish_conn_mock.run_get_paged.assert_called_once_with("/redfish/v1/Systems", max_pages=5) + assert res.success is True + assert len(redfish_endpoint_collector.result.artifacts) == 1 + + +def test_run_redfish_get_paged_skips_artifact_when_disabled( + redfish_endpoint_collector, redfish_conn_mock +): + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path="/x", + success=True, + data={"Members": []}, + status_code=200, + ) + redfish_endpoint_collector.result.artifacts.clear() + redfish_endpoint_collector._run_redfish_get_paged("/x", log_artifact=False) + assert len(redfish_endpoint_collector.result.artifacts) == 0 + + +def test_collect_follow_next_link_sequential_uses_run_get_paged( + redfish_endpoint_collector, redfish_conn_mock +): + 
redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path="/redfish/v1/Systems", + success=True, + data={"Members": [{"@odata.id": "/redfish/v1/Systems/1"}], "Members@odata.count": 1}, + status_code=200, + ) + result, data = redfish_endpoint_collector.collect_data( + args=RedfishEndpointCollectorArgs( + uris=["/redfish/v1/Systems"], + follow_next_link=True, + max_pages=50, + ) + ) + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.responses["/redfish/v1/Systems"]["Members@odata.count"] == 1 + redfish_conn_mock.run_get_paged.assert_called_once_with("/redfish/v1/Systems", max_pages=50) + redfish_conn_mock.run_get.assert_not_called() + + +def test_collect_follow_next_link_concurrent_uses_connection_copy( + redfish_endpoint_collector, redfish_conn_mock +): + copy_a = MagicMock() + copy_b = MagicMock() + copies = [copy_a, copy_b] + + def next_copy(): + return copies.pop(0) if copies else MagicMock() + + redfish_conn_mock.copy.side_effect = next_copy + + copy_a.run_get_paged.return_value = RedfishGetResult( + path="/a", + success=True, + data={"Members": []}, + status_code=200, + ) + copy_b.run_get_paged.return_value = RedfishGetResult( + path="/b", + success=True, + data={"Members": []}, + status_code=200, + ) + + result, data = redfish_endpoint_collector.collect_data( + args=RedfishEndpointCollectorArgs( + uris=["/a", "/b"], + max_workers=2, + follow_next_link=True, + max_pages=10, + ) + ) + assert result.status == ExecutionStatus.OK + assert data is not None + assert len(data.responses) == 2 + copy_a.run_get_paged.assert_called_once_with("/a", max_pages=10) + copy_b.run_get_paged.assert_called_once_with("/b", max_pages=10) + assert redfish_conn_mock.copy.call_count == 2 + assert len(result.artifacts) == 2 + + def test_discover_tree_single_root(): conn = MagicMock() conn.run_get.return_value = RedfishGetResult( From e8b29dae0ff19abd8e06ea6784eecb257e619e23 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 8 May 
2026 16:11:23 -0500 Subject: [PATCH 11/25] fix for workflow --- .github/workflows/code_quality_checks.yml | 11 +++++++++-- .github/workflows/functional-test.yml | 2 +- .github/workflows/release-trusted-publisher.yml | 2 +- .github/workflows/unit-test.yml | 2 +- nodescraper/models/datamodel.py | 4 ++-- 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/.github/workflows/code_quality_checks.yml b/.github/workflows/code_quality_checks.yml index d3706b1c..2408a8ce 100644 --- a/.github/workflows/code_quality_checks.yml +++ b/.github/workflows/code_quality_checks.yml @@ -14,12 +14,19 @@ jobs: container: python:3.9 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 # Fetch all history for pre-commit to work + + # python:3.9 image has no git; pre-commit requires it. + - name: Install git + run: | + apt-get update + apt-get install -y --no-install-recommends git + - name: Configure git for container run: | - git config --global --add safe.directory /__w/node-scraper/node-scraper + git config --global --add safe.directory "$GITHUB_WORKSPACE" git config --global user.email "ci@github.com" git config --global user.name "CI Bot" - name: setup environment and run pre-commit hooks diff --git a/.github/workflows/functional-test.yml b/.github/workflows/functional-test.yml index 8fd1fcf4..2c6618b5 100644 --- a/.github/workflows/functional-test.yml +++ b/.github/workflows/functional-test.yml @@ -15,7 +15,7 @@ jobs: container: python:3.9 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install xmllint run: | diff --git a/.github/workflows/release-trusted-publisher.yml b/.github/workflows/release-trusted-publisher.yml index 78e37b0a..55267d48 100644 --- a/.github/workflows/release-trusted-publisher.yml +++ b/.github/workflows/release-trusted-publisher.yml @@ -24,7 +24,7 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 # Fetch all history and tags 
token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml index 7a4b17c7..fe6dc9c2 100644 --- a/.github/workflows/unit-test.yml +++ b/.github/workflows/unit-test.yml @@ -15,7 +15,7 @@ jobs: container: python:3.9 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install xmllint run: | diff --git a/nodescraper/models/datamodel.py b/nodescraper/models/datamodel.py index 78a5df06..c310c810 100644 --- a/nodescraper/models/datamodel.py +++ b/nodescraper/models/datamodel.py @@ -29,7 +29,7 @@ import tarfile from typing import TypeVar, Union -from pydantic import BaseModel, Field, field_validator +from pydantic import BaseModel, field_validator from nodescraper.utils import get_unique_filename @@ -37,7 +37,7 @@ class FileModel(BaseModel): - file_contents: bytes = Field(exclude=True) + file_contents: bytes file_name: str @field_validator("file_contents", mode="before") From bc500d4e9820d6bd85a52659f59ce4322a5a263a Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 11 May 2026 15:05:52 -0500 Subject: [PATCH 12/25] added session_id --- nodescraper/base/inbandcollectortask.py | 2 + nodescraper/base/redfishcollectortask.py | 2 + nodescraper/cli/cli.py | 2 + nodescraper/cli/invocation.py | 4 + nodescraper/interfaces/connectionmanager.py | 2 + nodescraper/interfaces/datacollectortask.py | 2 + nodescraper/interfaces/dataplugin.py | 5 + nodescraper/interfaces/plugin.py | 3 + nodescraper/interfaces/task.py | 17 +++ nodescraper/pluginexecutor.py | 16 +++ test/unit/framework/test_datacollector.py | 11 ++ test/unit/framework/test_dataplugin.py | 138 ++++++++++++++++++++ test/unit/framework/test_plugin_executor.py | 5 + 13 files changed, 209 insertions(+) diff --git a/nodescraper/base/inbandcollectortask.py b/nodescraper/base/inbandcollectortask.py index 3b2d072f..d12b58dc 100644 --- a/nodescraper/base/inbandcollectortask.py +++ b/nodescraper/base/inbandcollectortask.py @@ -54,6 +54,7 @@ def __init__( 
parent: Optional[str] = None, task_result_hooks: Optional[list[TaskResultHook]] = None, event_reporter: str = DEFAULT_EVENT_REPORTER, + session_id: Optional[str] = None, **kwargs, ): super().__init__( @@ -65,6 +66,7 @@ def __init__( parent=parent, task_result_hooks=task_result_hooks, event_reporter=event_reporter, + session_id=session_id, ) if self.system_info.os_family not in self.SUPPORTED_OS_FAMILY: raise SystemCompatibilityError( diff --git a/nodescraper/base/redfishcollectortask.py b/nodescraper/base/redfishcollectortask.py index 73b4228f..73e59a7f 100644 --- a/nodescraper/base/redfishcollectortask.py +++ b/nodescraper/base/redfishcollectortask.py @@ -49,6 +49,7 @@ def __init__( parent: Optional[str] = None, task_result_hooks: Optional[list[TaskResultHook]] = None, event_reporter: str = DEFAULT_EVENT_REPORTER, + session_id: Optional[str] = None, **kwargs, ): super().__init__( @@ -59,6 +60,7 @@ def __init__( parent=parent, task_result_hooks=task_result_hooks, event_reporter=event_reporter, + session_id=session_id, **kwargs, ) diff --git a/nodescraper/cli/cli.py b/nodescraper/cli/cli.py index 16c639bb..9890ef39 100644 --- a/nodescraper/cli/cli.py +++ b/nodescraper/cli/cli.py @@ -31,6 +31,7 @@ import os import platform import sys +import uuid from typing import Optional import nodescraper @@ -642,6 +643,7 @@ def main( timestamp=timestamp, sname=sname, host_cli_args=host_cli_args, + session_id=str(uuid.uuid4()), ) log_system_info(log_path, system_info, logger) diff --git a/nodescraper/cli/invocation.py b/nodescraper/cli/invocation.py index 12cd3a94..ee59e4a6 100644 --- a/nodescraper/cli/invocation.py +++ b/nodescraper/cli/invocation.py @@ -71,6 +71,7 @@ class PluginRunInvocation: timestamp: str sname: str host_cli_args: Optional[argparse.Namespace] = None + session_id: Optional[str] = None def run_plugin_queue_with_invocation( @@ -84,6 +85,7 @@ def run_plugin_queue_with_invocation( timestamp: str, sname: str, host_cli_args: Optional[argparse.Namespace] = None, + 
session_id: Optional[str] = None, ) -> list[PluginResult]: """Constructs the plugin executor, binds invocation context, and runs the plugin queue.""" inv = PluginRunInvocation( @@ -96,6 +98,7 @@ def run_plugin_queue_with_invocation( timestamp=timestamp, sname=sname, host_cli_args=host_cli_args, + session_id=session_id, ) plugin_executor = PluginExecutor( logger=logger, @@ -104,6 +107,7 @@ def run_plugin_queue_with_invocation( system_info=system_info, log_path=log_path, plugin_registry=plugin_reg, + session_id=session_id, ) with plugin_run_invocation_scope(inv): return plugin_executor.run_queue() diff --git a/nodescraper/interfaces/connectionmanager.py b/nodescraper/interfaces/connectionmanager.py index bcd04c80..6b468f06 100644 --- a/nodescraper/interfaces/connectionmanager.py +++ b/nodescraper/interfaces/connectionmanager.py @@ -98,6 +98,7 @@ def __init__( task_result_hooks: Optional[list[TaskResultHook], None] = None, connection_args: Optional[Union[TConnectArg, dict]] = None, event_reporter: str = DEFAULT_EVENT_REPORTER, + session_id: Optional[str] = None, **kwargs, ): super().__init__( @@ -107,6 +108,7 @@ def __init__( parent="connection" if not parent else parent, task_result_hooks=task_result_hooks, event_reporter=event_reporter, + session_id=session_id, **kwargs, ) diff --git a/nodescraper/interfaces/datacollectortask.py b/nodescraper/interfaces/datacollectortask.py index 6cc21181..3c30a6ea 100644 --- a/nodescraper/interfaces/datacollectortask.py +++ b/nodescraper/interfaces/datacollectortask.py @@ -154,6 +154,7 @@ def __init__( parent: Optional[str] = None, task_result_hooks: Optional[list[TaskResultHook]] = None, event_reporter: str = DEFAULT_EVENT_REPORTER, + session_id: Optional[str] = None, **kwargs, ): """data collector init function @@ -172,6 +173,7 @@ def __init__( parent=parent, task_result_hooks=task_result_hooks, event_reporter=event_reporter, + session_id=session_id, ) if isinstance(system_interaction_level, str): diff --git 
a/nodescraper/interfaces/dataplugin.py b/nodescraper/interfaces/dataplugin.py index 9ff31d70..8448dff3 100644 --- a/nodescraper/interfaces/dataplugin.py +++ b/nodescraper/interfaces/dataplugin.py @@ -76,6 +76,7 @@ def __init__( task_result_hooks: Optional[list[TaskResultHook]] = None, log_path: Optional[str] = None, event_reporter: str = DEFAULT_EVENT_REPORTER, + session_id: Optional[str] = None, **kwargs, ): super().__init__( @@ -86,6 +87,7 @@ def __init__( task_result_hooks, log_path, event_reporter=event_reporter, + session_id=session_id, **kwargs, ) self._validate_class_var() @@ -190,6 +192,7 @@ def collect( parent=self.__class__.__name__, task_result_hooks=self.task_result_hooks, event_reporter=self.event_reporter, + session_id=self.session_id, ) if ( @@ -224,6 +227,7 @@ def collect( task_result_hooks=self.task_result_hooks, log_path=self.log_path, event_reporter=self.event_reporter, + session_id=self.session_id, ) self.collection_result, self._data = collection_task.collect_data(collection_args) @@ -304,6 +308,7 @@ def analyze( parent=self.__class__.__name__, task_result_hooks=self.task_result_hooks, event_reporter=self.event_reporter, + session_id=self.session_id, ) self.analysis_result = analyzer_task.analyze_data(self.data, analysis_args) return self.analysis_result diff --git a/nodescraper/interfaces/plugin.py b/nodescraper/interfaces/plugin.py index d7ef4d55..06959b54 100644 --- a/nodescraper/interfaces/plugin.py +++ b/nodescraper/interfaces/plugin.py @@ -51,6 +51,7 @@ def __init__( log_path: Optional[str] = None, queue_callback: Optional[Callable] = None, event_reporter: str = DEFAULT_EVENT_REPORTER, + session_id: Optional[str] = None, **kwargs, ): """Initialize plugin @@ -88,6 +89,7 @@ def __init__( self.queue_callback = queue_callback self.event_reporter = event_reporter + self.session_id = session_id self.connection_manager = connection_manager @@ -99,6 +101,7 @@ def __init__( parent=self.__class__.__name__, task_result_hooks=self.task_result_hooks, 
event_reporter=event_reporter, + session_id=self.session_id, ) @classmethod diff --git a/nodescraper/interfaces/task.py b/nodescraper/interfaces/task.py index 79b4a389..f915e8d1 100644 --- a/nodescraper/interfaces/task.py +++ b/nodescraper/interfaces/task.py @@ -27,6 +27,7 @@ import copy import datetime import logging +import uuid from typing import Any, Optional, Union from nodescraper.constants import DEFAULT_EVENT_REPORTER, DEFAULT_LOGGER @@ -55,6 +56,7 @@ def __init__( parent: Optional[str] = None, task_result_hooks: Optional[list[TaskResultHook]] = None, event_reporter: str = DEFAULT_EVENT_REPORTER, + session_id: Optional[str] = None, **kwargs: dict[str, Any], ): if logger is None: @@ -67,6 +69,18 @@ def __init__( if not task_result_hooks: task_result_hooks = [] self.task_result_hooks = task_result_hooks + + if session_id is not None: + try: + uuid.UUID(session_id) + self.session_id = session_id + except (ValueError, AttributeError, TypeError) as e: + raise ValueError( + f"session_id must be a valid UUID string, got: {session_id}" + ) from e + else: + self.session_id = None + self.result: TaskResult = self._init_result() @property @@ -117,6 +131,9 @@ def _build_event( if self.parent: data["parent"] = self.parent + if self.session_id: + data["session_id"] = self.session_id + if self.system_info.metadata: data["system_metadata"] = copy.copy(self.system_info.metadata) diff --git a/nodescraper/pluginexecutor.py b/nodescraper/pluginexecutor.py index 8a6998f2..0821ff20 100644 --- a/nodescraper/pluginexecutor.py +++ b/nodescraper/pluginexecutor.py @@ -28,6 +28,7 @@ import copy import inspect import logging +import uuid from collections import deque from typing import Optional, Type, Union @@ -53,6 +54,7 @@ def __init__( logger: Optional[logging.Logger] = None, plugin_registry: Optional[PluginRegistry] = None, log_path: Optional[str] = None, + session_id: Optional[str] = None, ): if logger is None: @@ -65,6 +67,17 @@ def __init__( system_info = SystemInfo() 
self.system_info = system_info + if session_id is not None: + try: + uuid.UUID(session_id) + self.session_id = session_id + except (ValueError, AttributeError, TypeError) as e: + raise ValueError( + f"session_id must be a valid UUID string, got: {session_id}" + ) from e + else: + self.session_id = None + self.plugin_config = self.merge_configs(plugin_configs) self.connection_library: dict[type[ConnectionManager], ConnectionManager] = {} @@ -90,6 +103,7 @@ def __init__( logger=self.logger, connection_args=connection_args, task_result_hooks=self.connection_result_hooks, + session_id=self.session_id, ) self.logger.info("System Name: %s", self.system_info.name) @@ -157,6 +171,7 @@ def run_queue(self) -> list[PluginResult]: "logger": self.logger, "queue_callback": plugin_queue.append, "log_path": self.log_path, + "session_id": self.session_id, } if plugin_class.CONNECTION_TYPE: @@ -192,6 +207,7 @@ def run_queue(self) -> list[PluginResult]: system_info=self.system_info, logger=self.logger, task_result_hooks=self.connection_result_hooks, + session_id=self.session_id, ) init_payload["connection_manager"] = self.connection_library[mgr_impl] diff --git a/test/unit/framework/test_datacollector.py b/test/unit/framework/test_datacollector.py index 30fde48f..410b4d85 100644 --- a/test/unit/framework/test_datacollector.py +++ b/test/unit/framework/test_datacollector.py @@ -154,6 +154,17 @@ class RestrictedSkuCollector(DummyCollector): assert res.status == ExecutionStatus.OK +def test_supported_skus_coerce_non_string_items(conn_mock): + class IntSkuCollector(DummyCollector): + SUPPORTED_SKUS = {42} + + args = {"name": "h", "sku": "42", "platform": "X", "os_family": 1} + info = SystemInfo(**args) + col = IntSkuCollector(info, conn_mock) + res, data = col.collect_data() + assert res.status == ExecutionStatus.OK + + def test_missing_data_model(): with pytest.raises(TypeError, match="No data model set for DummyCollector1"): diff --git a/test/unit/framework/test_dataplugin.py 
b/test/unit/framework/test_dataplugin.py index a220cfe6..29816efa 100644 --- a/test/unit/framework/test_dataplugin.py +++ b/test/unit/framework/test_dataplugin.py @@ -23,6 +23,8 @@ # SOFTWARE. # ############################################################################### +import json +from pathlib import Path from unittest.mock import MagicMock, patch import pytest @@ -404,3 +406,139 @@ def test_analyze_no_data_available(self, plugin_with_conn): assert result.status == ExecutionStatus.NOT_RAN assert "No data available" in result.message + + +class ContentModel(StandardDataModel): + def get_compare_content(self) -> str: + return self.value + + +class ErrMatchAnalyzer(DataAnalyzer): + DATA_MODEL = ContentModel + + def analyze_data(self, data, args=None): + return TaskResult(status=ExecutionStatus.OK) + + @staticmethod + def get_error_matches(content: str) -> list[str]: + return ["z", "a"] if content else [] + + +class ContentCollector(DataCollector): + DATA_MODEL = ContentModel + + def collect_data(self, args=None): + return TaskResult(status=ExecutionStatus.OK), ContentModel(value="x") + + +class ExtractPlugin(DataPlugin): + DATA_MODEL = ContentModel + CONNECTION_TYPE = MockConnectionManager + COLLECTOR = ContentCollector + ANALYZER = ErrMatchAnalyzer + + +class LogImportModel(ContentModel): + @classmethod + def import_model(cls, model_input): + if isinstance(model_input, str) and model_input.endswith(".log"): + return cls(value=Path(model_input).read_text(encoding="utf-8")) + return super().import_model(model_input) + + +class LogImportPlugin(DataPlugin): + DATA_MODEL = LogImportModel + CONNECTION_TYPE = MockConnectionManager + COLLECTOR = ContentCollector + ANALYZER = ErrMatchAnalyzer + + +class TestDataPluginRunPaths: + def test_find_datamodel_path_requires_directory(self) -> None: + assert CoreDataPlugin.find_datamodel_path_in_run("/no/such/run") is None + + def test_find_datamodel_path_success(self, tmp_path: Path) -> None: + collector_dir = tmp_path / 
"extract_plugin" / "content_collector" + collector_dir.mkdir(parents=True) + (collector_dir / "result.json").write_text( + json.dumps({"parent": "ExtractPlugin"}), encoding="utf-8" + ) + (collector_dir / "contentmodel.json").write_text( + json.dumps({"value": "from_run"}), encoding="utf-8" + ) + + found = ExtractPlugin.find_datamodel_path_in_run(str(tmp_path)) + assert found is not None + assert found.endswith("contentmodel.json") + + def test_find_datamodel_path_wrong_parent(self, tmp_path: Path) -> None: + collector_dir = tmp_path / "extract_plugin" / "content_collector" + collector_dir.mkdir(parents=True) + (collector_dir / "result.json").write_text( + json.dumps({"parent": "OtherPlugin"}), encoding="utf-8" + ) + (collector_dir / "contentmodel.json").write_text("{}", encoding="utf-8") + + assert ExtractPlugin.find_datamodel_path_in_run(str(tmp_path)) is None + + def test_find_datamodel_path_invalid_result_json(self, tmp_path: Path) -> None: + collector_dir = tmp_path / "extract_plugin" / "content_collector" + collector_dir.mkdir(parents=True) + (collector_dir / "result.json").write_text("{not json", encoding="utf-8") + (collector_dir / "contentmodel.json").write_text("{}", encoding="utf-8") + + assert ExtractPlugin.find_datamodel_path_in_run(str(tmp_path)) is None + + def test_load_datamodel_from_path_json(self, tmp_path: Path) -> None: + p = tmp_path / "dm.json" + p.write_text(json.dumps({"value": "file"}), encoding="utf-8") + m = ExtractPlugin.load_datamodel_from_path(str(p)) + assert isinstance(m, ContentModel) + assert m.value == "file" + + def test_load_datamodel_from_path_missing(self) -> None: + assert ExtractPlugin.load_datamodel_from_path("/nonexistent/x.json") is None + + def test_load_datamodel_from_path_log_with_custom_import(self, tmp_path: Path) -> None: + log = tmp_path / "capture.log" + log.write_text("from_log", encoding="utf-8") + m = LogImportPlugin.load_datamodel_from_path(str(log)) + assert isinstance(m, LogImportModel) + assert m.value == 
"from_log" + + def test_load_datamodel_from_path_log_without_override_returns_none( + self, tmp_path: Path + ) -> None: + log = tmp_path / "plain.log" + log.write_text("{}", encoding="utf-8") + assert ExtractPlugin.load_datamodel_from_path(str(log)) is None + + def test_get_extracted_errors_sorted(self) -> None: + dm = ContentModel(value="has text") + out = ExtractPlugin.get_extracted_errors(dm) + assert out == ["a", "z"] + + def test_get_extracted_errors_without_hooks(self) -> None: + assert CoreDataPlugin.get_extracted_errors(StandardDataModel()) is None + + def test_load_run_data_from_run_dir(self, tmp_path: Path) -> None: + collector_dir = tmp_path / "extract_plugin" / "content_collector" + collector_dir.mkdir(parents=True) + (collector_dir / "result.json").write_text( + json.dumps({"parent": "ExtractPlugin"}), encoding="utf-8" + ) + (collector_dir / "contentmodel.json").write_text( + json.dumps({"value": "run"}), encoding="utf-8" + ) + + loaded = ExtractPlugin.load_run_data(str(tmp_path)) + assert loaded is not None + assert loaded["value"] == "run" + assert loaded["extracted_errors"] == ["a", "z"] + + def test_load_run_data_direct_file(self, tmp_path: Path) -> None: + p = tmp_path / "direct.json" + p.write_text(json.dumps({"value": "direct"}), encoding="utf-8") + loaded = ExtractPlugin.load_run_data(str(p)) + assert loaded is not None + assert loaded["value"] == "direct" diff --git a/test/unit/framework/test_plugin_executor.py b/test/unit/framework/test_plugin_executor.py index 7ed75b93..0f3568f5 100644 --- a/test/unit/framework/test_plugin_executor.py +++ b/test/unit/framework/test_plugin_executor.py @@ -103,6 +103,11 @@ def test_config_merge(input_configs: list[PluginConfig], output_config: PluginCo assert PluginExecutor.merge_configs(input_configs) == output_config +def test_plugin_executor_rejects_invalid_session_id(): + with pytest.raises(ValueError, match="session_id must be a valid UUID"): + PluginExecutor(plugin_configs=[], session_id="not-a-uuid") + 
+ def test_plugin_queue(plugin_registry): executor = PluginExecutor( plugin_configs=[PluginConfig(global_args={"test_arg": "abc"}, plugins={"TestPluginB": {}})], From 6489f1ca11a6750a4147d96197ff3b9e02909c83 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 13 May 2026 13:04:23 -0500 Subject: [PATCH 13/25] network enhancements --- nodescraper/interfaces/task.py | 15 + nodescraper/models/event.py | 21 +- .../plugins/inband/network/ethtool_vendor.py | 660 ++++++++++++++++++ .../inband/network/network_analyzer.py | 64 +- .../inband/network/network_collector.py | 162 +++++ .../plugins/inband/network/networkdata.py | 5 + test/unit/plugin/test_network_analyzer.py | 48 +- test/unit/plugin/test_network_collector.py | 41 ++ 8 files changed, 994 insertions(+), 22 deletions(-) create mode 100644 nodescraper/plugins/inband/network/ethtool_vendor.py diff --git a/nodescraper/interfaces/task.py b/nodescraper/interfaces/task.py index 16d1a70b..8855a48a 100644 --- a/nodescraper/interfaces/task.py +++ b/nodescraper/interfaces/task.py @@ -27,6 +27,7 @@ import copy import datetime import logging +import uuid from typing import Any, Optional, Union from nodescraper.constants import DEFAULT_LOGGER @@ -54,6 +55,7 @@ def __init__( max_event_priority_level: Union[EventPriority, str] = EventPriority.CRITICAL, parent: Optional[str] = None, task_result_hooks: Optional[list[TaskResultHook]] = None, + session_id: Optional[str] = None, **kwargs: dict[str, Any], ): if logger is None: @@ -65,6 +67,16 @@ def __init__( if not task_result_hooks: task_result_hooks = [] self.task_result_hooks = task_result_hooks + + if session_id is None and "session_id" in kwargs: + session_id = kwargs.pop("session_id") # type: ignore[assignment] + if session_id is not None: + try: + uuid.UUID(str(session_id)) + except (ValueError, TypeError, AttributeError): + raise ValueError("session_id must be a valid UUID") from None + self.session_id: Optional[str] = str(session_id) if session_id is not None else None + 
self.result: TaskResult = self._init_result() @property @@ -115,6 +127,9 @@ def _build_event( if self.parent: data["parent"] = self.parent + if self.session_id is not None: + data["session_id"] = self.session_id + if self.system_info.metadata: data["system_metadata"] = copy.copy(self.system_info.metadata) diff --git a/nodescraper/models/event.py b/nodescraper/models/event.py index 33cf2801..25315ef2 100644 --- a/nodescraper/models/event.py +++ b/nodescraper/models/event.py @@ -28,7 +28,7 @@ import re import uuid from enum import Enum -from typing import Any, Optional, Union +from typing import Any, Optional, Union, cast from pydantic import BaseModel, Field, field_serializer, field_validator @@ -113,15 +113,22 @@ def validate_category(cls, category: Optional[Union[str, Enum]]) -> str: @field_validator("priority", mode="before") @classmethod - def validate_priority(cls, priority: Union[str, EventPriority]) -> EventPriority: - """Allow priority to be set via string priority name + def validate_priority(cls, priority: Union[str, int, EventPriority]) -> EventPriority: + """Allow priority as EventPriority, enum name string, or IntEnum value (unknown int -> ERROR). + Args: - priority (Union[str, EventPriority]): event priority string or enum + priority: EventPriority, name string, integer matching a level, or unknown int (maps to ERROR). + Raises: - ValueError: if priority string is an invalid value - Returns: - EventPriority: priority enum + ValueError: if priority string is invalid, or if a boolean is passed. 
""" + if type(priority) is bool: + raise ValueError("priority must not be a boolean") + if isinstance(priority, int): + try: + return cast(EventPriority, EventPriority(priority)) + except ValueError: + return EventPriority.ERROR if isinstance(priority, str): try: return getattr(EventPriority, priority.upper()) diff --git a/nodescraper/plugins/inband/network/ethtool_vendor.py b/nodescraper/plugins/inband/network/ethtool_vendor.py new file mode 100644 index 00000000..04dbf4a4 --- /dev/null +++ b/nodescraper/plugins/inband/network/ethtool_vendor.py @@ -0,0 +1,660 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +"""Vendor-specific ethtool -S statistics models (Pollara / Thor2 / ConnectX-7).""" + +from typing import ClassVar, Optional, Union + +from pydantic import BaseModel, Field, model_validator +from typing_extensions import Self + + +class PollaraEthtoolStatistics(BaseModel): + """ifname ionic. Keeping only fields of interest. Skip queue-specific stats for now""" + + rx_csum_error: Optional[int] = None + hw_tx_dropped: Optional[int] = None + hw_rx_dropped: Optional[int] = None + hw_rx_over_errors: Optional[int] = None + hw_rx_missed_errors: Optional[int] = None + hw_tx_aborted_errors: Optional[int] = None + frames_rx_bad_fcs: Optional[int] = None + frames_rx_bad_all: Optional[int] = None + frames_rx_pause: Optional[int] = None + frames_rx_bad_length: Optional[int] = None + frames_rx_undersized: Optional[int] = None + frames_rx_oversized: Optional[int] = None + frames_rx_fragments: Optional[int] = None + frames_rx_jabber: Optional[int] = None + frames_rx_pripause: Optional[int] = None + frames_rx_stomped_crc: Optional[int] = None + frames_rx_too_long: Optional[int] = None + frames_rx_dropped: Optional[int] = None + frames_rx_less_than_64b: Optional[int] = None + frames_tx_bad: Optional[int] = None + frames_tx_pause: Optional[int] = None + frames_tx_pripause: Optional[int] = None + frames_tx_less_than_64b: Optional[int] = None + frames_tx_pri_0: Optional[int] = None + frames_tx_pri_1: Optional[int] = None + frames_tx_pri_2: Optional[int] = None + frames_tx_pri_3: Optional[int] = None + frames_tx_pri_4: Optional[int] = None + frames_tx_pri_5: Optional[int] = None + frames_tx_pri_6: Optional[int] = None + frames_tx_pri_7: Optional[int] = None + frames_rx_pri_0: Optional[int] = None + frames_rx_pri_1: Optional[int] = None + frames_rx_pri_2: Optional[int] = None + frames_rx_pri_3: Optional[int] = None + frames_rx_pri_4: Optional[int] = None + frames_rx_pri_5: Optional[int] = None + 
frames_rx_pri_6: Optional[int] = None + frames_rx_pri_7: Optional[int] = None + tx_pripause_0_1us_count: Optional[int] = None + tx_pripause_1_1us_count: Optional[int] = None + tx_pripause_2_1us_count: Optional[int] = None + tx_pripause_3_1us_count: Optional[int] = None + tx_pripause_4_1us_count: Optional[int] = None + tx_pripause_5_1us_count: Optional[int] = None + tx_pripause_6_1us_count: Optional[int] = None + tx_pripause_7_1us_count: Optional[int] = None + rx_pripause_0_1us_count: Optional[int] = None + rx_pripause_1_1us_count: Optional[int] = None + rx_pripause_2_1us_count: Optional[int] = None + rx_pripause_3_1us_count: Optional[int] = None + rx_pripause_4_1us_count: Optional[int] = None + rx_pripause_5_1us_count: Optional[int] = None + rx_pripause_6_1us_count: Optional[int] = None + rx_pripause_7_1us_count: Optional[int] = None + rx_pause_1us_count: Optional[int] = None + frames_tx_truncated: Optional[int] = None + + error_fields: ClassVar[list[str]] = [ + "rx_csum_error", + "hw_tx_dropped", + "hw_rx_dropped", + "hw_rx_over_errors", + "hw_rx_missed_errors", + "hw_tx_aborted_errors", + "frames_rx_bad_fcs", + "frames_rx_bad_all", + "frames_rx_bad_length", + "frames_rx_undersized", + "frames_rx_oversized", + "frames_rx_fragments", + "frames_rx_jabber", + "frames_rx_stomped_crc", + "frames_rx_too_long", + "frames_rx_dropped", + "frames_rx_less_than_64b", + "frames_tx_bad", + "frames_tx_pause", + "frames_tx_pripause", + "frames_tx_less_than_64b", + "frames_tx_pri_0", + "frames_tx_pri_1", + "frames_tx_pri_2", + "frames_tx_pri_3", + "frames_tx_pri_4", + "frames_tx_pri_5", + "frames_tx_pri_6", + "frames_tx_pri_7", + "tx_pripause_0_1us_count", + "tx_pripause_1_1us_count", + "tx_pripause_2_1us_count", + "tx_pripause_3_1us_count", + "tx_pripause_4_1us_count", + "tx_pripause_5_1us_count", + "tx_pripause_6_1us_count", + "tx_pripause_7_1us_count", + "frames_tx_truncated", + ] + + warning_fields: ClassVar[list[str]] = [ + "frames_rx_pause", + "frames_rx_pripause", + 
"frames_rx_pri_0", + "frames_rx_pri_1", + "frames_rx_pri_2", + "frames_rx_pri_3", + "frames_rx_pri_4", + "frames_rx_pri_5", + "frames_rx_pri_6", + "frames_rx_pri_7", + "rx_pripause_0_1us_count", + "rx_pripause_1_1us_count", + "rx_pripause_2_1us_count", + "rx_pripause_3_1us_count", + "rx_pripause_4_1us_count", + "rx_pripause_5_1us_count", + "rx_pripause_6_1us_count", + "rx_pripause_7_1us_count", + "rx_pause_1us_count", + ] + + +class Thor2EthtoolStatistics(BaseModel): + """ifname bnxt. Keeping only fields of interest. Skip queue-specific stats for now""" + + rx_total_l4_csum_errors: Optional[int] = None + rx_total_resets: Optional[int] = None + rx_total_buf_errors: Optional[int] = None + rx_total_oom_discards: Optional[int] = None + rx_total_netpoll_discards: Optional[int] = None + rx_total_ring_discards: Optional[int] = None + tx_total_resets: Optional[int] = None + tx_total_ring_discards: Optional[int] = None + total_missed_irqs: Optional[int] = None + ktls_tx_rec_err: Optional[int] = None + ktls_rx_resync_discard: Optional[int] = None + rx_fcs_err_frames: Optional[int] = None + rx_pause_frames: Optional[int] = None + rx_pfc_frames: Optional[int] = None + rx_align_err_frames: Optional[int] = None + rx_ovrsz_frames: Optional[int] = None + rx_jbr_frames: Optional[int] = None + rx_mtu_err_frames: Optional[int] = None + rx_pfc_ena_frames_pri0: Optional[int] = None + rx_pfc_ena_frames_pri1: Optional[int] = None + rx_pfc_ena_frames_pri2: Optional[int] = None + rx_pfc_ena_frames_pri3: Optional[int] = None + rx_pfc_ena_frames_pri4: Optional[int] = None + rx_pfc_ena_frames_pri5: Optional[int] = None + rx_pfc_ena_frames_pri6: Optional[int] = None + rx_pfc_ena_frames_pri7: Optional[int] = None + rx_undrsz_frames: Optional[int] = None + rx_runt_bytes: Optional[int] = None + rx_runt_frames: Optional[int] = None + rx_stat_discard: Optional[int] = None + rx_stat_err: Optional[int] = None + tx_pause_frames: Optional[int] = None + tx_pfc_frames: Optional[int] = None + 
tx_jabber_frames: Optional[int] = None + tx_fcs_err_frames: Optional[int] = None + tx_err: Optional[int] = None + tx_fifo_underruns: Optional[int] = None + tx_pfc_ena_frames_pri0: Optional[int] = None + tx_pfc_ena_frames_pri1: Optional[int] = None + tx_pfc_ena_frames_pri2: Optional[int] = None + tx_pfc_ena_frames_pri3: Optional[int] = None + tx_pfc_ena_frames_pri4: Optional[int] = None + tx_pfc_ena_frames_pri5: Optional[int] = None + tx_pfc_ena_frames_pri6: Optional[int] = None + tx_pfc_ena_frames_pri7: Optional[int] = None + tx_total_collisions: Optional[int] = None + tx_stat_discard: Optional[int] = None + tx_stat_error: Optional[int] = None + link_down_events: Optional[int] = None + continuous_pause_events: Optional[int] = None + resume_pause_events: Optional[int] = None + continuous_roce_pause_events: Optional[int] = None + resume_roce_pause_events: Optional[int] = None + pfc_pri0_rx_transitions: Optional[int] = None + pfc_pri1_rx_transitions: Optional[int] = None + pfc_pri2_rx_transitions: Optional[int] = None + pfc_pri3_rx_transitions: Optional[int] = None + pfc_pri4_rx_transitions: Optional[int] = None + pfc_pri5_rx_transitions: Optional[int] = None + pfc_pri6_rx_transitions: Optional[int] = None + pfc_pri7_rx_transitions: Optional[int] = None + rx_pcs_symbol_err: Optional[int] = None + rx_discard_bytes_cos0: Optional[int] = None + rx_discard_packets_cos0: Optional[int] = None + rx_discard_bytes_cos1: Optional[int] = None + rx_discard_packets_cos1: Optional[int] = None + rx_discard_bytes_cos2: Optional[int] = None + rx_discard_packets_cos2: Optional[int] = None + rx_discard_bytes_cos3: Optional[int] = None + rx_discard_packets_cos3: Optional[int] = None + rx_discard_bytes_cos4: Optional[int] = None + rx_discard_packets_cos4: Optional[int] = None + rx_discard_bytes_cos5: Optional[int] = None + rx_discard_packets_cos5: Optional[int] = None + rx_discard_bytes_cos6: Optional[int] = None + rx_discard_packets_cos6: Optional[int] = None + rx_discard_bytes_cos7: 
Optional[int] = None + rx_discard_packets_cos7: Optional[int] = None + rx_fec_uncorrectable_blocks: Optional[int] = None + rx_filter_miss: Optional[int] = None + pfc_pri0_tx_transitions: Optional[int] = None + pfc_pri1_tx_transitions: Optional[int] = None + pfc_pri2_tx_transitions: Optional[int] = None + pfc_pri3_tx_transitions: Optional[int] = None + pfc_pri4_tx_transitions: Optional[int] = None + pfc_pri5_tx_transitions: Optional[int] = None + pfc_pri6_tx_transitions: Optional[int] = None + pfc_pri7_tx_transitions: Optional[int] = None + hw_db_recov_dbs_dropped: Optional[int] = None + hw_db_recov_oo_drop_count: Optional[int] = None + lpbk_tx_discards: Optional[int] = None + lpbk_tx_errors: Optional[int] = None + lpbk_rx_discards: Optional[int] = None + lpbk_rx_errors: Optional[int] = None + + error_fields: ClassVar[list[str]] = [ + "rx_total_l4_csum_errors", + "rx_total_buf_errors", + "rx_total_oom_discards", + "rx_total_netpoll_discards", + "rx_total_ring_discards", + "tx_total_ring_discards", + "total_missed_irqs", + "ktls_tx_rec_err", + "ktls_rx_resync_discard", + "rx_fcs_err_frames", + "rx_align_err_frames", + "rx_ovrsz_frames", + "rx_jbr_frames", + "rx_mtu_err_frames", + "rx_undrsz_frames", + "rx_runt_bytes", + "rx_runt_frames", + "rx_stat_discard", + "rx_stat_err", + "tx_pause_frames", + "tx_pfc_frames", + "tx_jabber_frames", + "tx_fcs_err_frames", + "tx_err", + "tx_fifo_underruns", + "tx_pfc_ena_frames_pri0", + "tx_pfc_ena_frames_pri1", + "tx_pfc_ena_frames_pri2", + "tx_pfc_ena_frames_pri3", + "tx_pfc_ena_frames_pri4", + "tx_pfc_ena_frames_pri5", + "tx_pfc_ena_frames_pri6", + "tx_pfc_ena_frames_pri7", + "tx_total_collisions", + "tx_stat_discard", + "tx_stat_error", + "link_down_events", + "continuous_pause_events", + "resume_pause_events", + "continuous_roce_pause_events", + "resume_roce_pause_events", + "rx_pcs_symbol_err", + "rx_discard_bytes_cos0", + "rx_discard_packets_cos0", + "rx_discard_bytes_cos1", + "rx_discard_packets_cos1", + 
"rx_discard_bytes_cos2", + "rx_discard_packets_cos2", + "rx_discard_bytes_cos3", + "rx_discard_packets_cos3", + "rx_discard_bytes_cos4", + "rx_discard_packets_cos4", + "rx_discard_bytes_cos5", + "rx_discard_packets_cos5", + "rx_discard_bytes_cos6", + "rx_discard_packets_cos6", + "rx_discard_bytes_cos7", + "rx_discard_packets_cos7", + "rx_fec_uncorrectable_blocks", + "rx_filter_miss", + "pfc_pri0_tx_transitions", + "pfc_pri1_tx_transitions", + "pfc_pri2_tx_transitions", + "pfc_pri3_tx_transitions", + "pfc_pri4_tx_transitions", + "pfc_pri5_tx_transitions", + "pfc_pri6_tx_transitions", + "pfc_pri7_tx_transitions", + "hw_db_recov_dbs_dropped", + "hw_db_recov_oo_drop_count", + "lpbk_tx_discards", + "lpbk_tx_errors", + "lpbk_rx_discards", + "lpbk_rx_errors", + ] + + warning_fields: ClassVar[list[str]] = [ + "rx_total_resets", + "tx_total_resets", + "rx_pause_frames", + "rx_pfc_frames", + "rx_pfc_ena_frames_pri0", + "rx_pfc_ena_frames_pri1", + "rx_pfc_ena_frames_pri2", + "rx_pfc_ena_frames_pri3", + "rx_pfc_ena_frames_pri4", + "rx_pfc_ena_frames_pri5", + "rx_pfc_ena_frames_pri6", + "rx_pfc_ena_frames_pri7", + "pfc_pri0_rx_transitions", + "pfc_pri1_rx_transitions", + "pfc_pri2_rx_transitions", + "pfc_pri3_rx_transitions", + "pfc_pri4_rx_transitions", + "pfc_pri5_rx_transitions", + "pfc_pri6_rx_transitions", + "pfc_pri7_rx_transitions", + ] + + +class Cx7EthtoolStatistics(BaseModel): + """ifname mlx. Keeping only fields of interest. 
Skip queue-specific stats for now""" + + rx_xdp_drop: Optional[int] = None + rx_xdp_tx_err: Optional[int] = None + tx_queue_dropped: Optional[int] = None + tx_cqe_err: Optional[int] = None + tx_xdp_err: Optional[int] = None + rx_wqe_err: Optional[int] = None + rx_oversize_pkts_sw_drop: Optional[int] = None + rx_buff_alloc_err: Optional[int] = None + rx_arfs_err: Optional[int] = None + rx_tls_err: Optional[int] = None + rx_xsk_xdp_drop: Optional[int] = None + rx_xsk_wqe_err: Optional[int] = None + rx_xsk_oversize_pkts_sw_drop: Optional[int] = None + rx_xsk_buff_alloc_err: Optional[int] = None + tx_xsk_err: Optional[int] = None + rx_out_of_buffer: Optional[int] = None + rx_if_down_packets: Optional[int] = None + rx_steer_missed_packets: Optional[int] = None + rx_oversize_pkts_buffer: Optional[int] = None + rx_crc_errors_phy: Optional[int] = None + rx_in_range_len_errors_phy: Optional[int] = None + rx_out_of_range_len_phy: Optional[int] = None + rx_oversize_pkts_phy: Optional[int] = None + rx_symbol_err_phy: Optional[int] = None + rx_unsupported_op_phy: Optional[int] = None + rx_pause_ctrl_phy: Optional[int] = None + tx_pause_ctrl_phy: Optional[int] = None + rx_discards_phy: Optional[int] = None + tx_discards_phy: Optional[int] = None + tx_errors_phy: Optional[int] = None + rx_undersize_pkts_phy: Optional[int] = None + rx_fragments_phy: Optional[int] = None + rx_jabbers_phy: Optional[int] = None + link_down_events_phy: Optional[int] = None + rx_pcs_symbol_err_phy: Optional[int] = None + rx_pci_signal_integrity: Optional[int] = None + tx_pci_signal_integrity: Optional[int] = None + outbound_pci_stalled_rd: Optional[int] = None + outbound_pci_stalled_wr: Optional[int] = None + outbound_pci_stalled_rd_events: Optional[int] = None + outbound_pci_stalled_wr_events: Optional[int] = None + rx_prio0_discards: Optional[int] = None + rx_prio1_discards: Optional[int] = None + rx_prio2_discards: Optional[int] = None + rx_prio3_discards: Optional[int] = None + rx_prio4_discards: 
Optional[int] = None + rx_prio5_discards: Optional[int] = None + rx_prio6_discards: Optional[int] = None + rx_prio7_discards: Optional[int] = None + rx_global_pause: Optional[int] = None + rx_prio0_pause: Optional[int] = None + rx_prio1_pause: Optional[int] = None + rx_prio2_pause: Optional[int] = None + rx_prio3_pause: Optional[int] = None + rx_prio4_pause: Optional[int] = None + rx_prio5_pause: Optional[int] = None + rx_prio6_pause: Optional[int] = None + rx_prio7_pause: Optional[int] = None + rx_global_pause_duration: Optional[int] = None + rx_prio0_pause_duration: Optional[int] = None + rx_prio1_pause_duration: Optional[int] = None + rx_prio2_pause_duration: Optional[int] = None + rx_prio3_pause_duration: Optional[int] = None + rx_prio4_pause_duration: Optional[int] = None + rx_prio5_pause_duration: Optional[int] = None + rx_prio6_pause_duration: Optional[int] = None + rx_prio7_pause_duration: Optional[int] = None + tx_global_pause: Optional[int] = None + tx_prio0_pause: Optional[int] = None + tx_prio1_pause: Optional[int] = None + tx_prio2_pause: Optional[int] = None + tx_prio3_pause: Optional[int] = None + tx_prio4_pause: Optional[int] = None + tx_prio5_pause: Optional[int] = None + tx_prio6_pause: Optional[int] = None + tx_prio7_pause: Optional[int] = None + tx_global_pause_duration: Optional[int] = None + tx_prio0_pause_duration: Optional[int] = None + tx_prio1_pause_duration: Optional[int] = None + tx_prio2_pause_duration: Optional[int] = None + tx_prio3_pause_duration: Optional[int] = None + tx_prio4_pause_duration: Optional[int] = None + tx_prio5_pause_duration: Optional[int] = None + tx_prio6_pause_duration: Optional[int] = None + tx_prio7_pause_duration: Optional[int] = None + rx_global_pause_transition: Optional[int] = None + rx_prio0_pause_transition: Optional[int] = None + rx_prio1_pause_transition: Optional[int] = None + rx_prio2_pause_transition: Optional[int] = None + rx_prio3_pause_transition: Optional[int] = None + rx_prio4_pause_transition: 
Optional[int] = None + rx_prio5_pause_transition: Optional[int] = None + rx_prio6_pause_transition: Optional[int] = None + rx_prio7_pause_transition: Optional[int] = None + tx_pause_storm_warning_events: Optional[int] = None + tx_pause_storm_error_events: Optional[int] = None + module_unplug: Optional[int] = None + module_bus_stuck: Optional[int] = None + module_high_temp: Optional[int] = None + module_bad_shorted: Optional[int] = None + ipsec_rx_drop_pkts: Optional[int] = None + ipsec_rx_drop_bytes: Optional[int] = None + ipsec_rx_drop_mismatch_sa_sel: Optional[int] = None + ipsec_tx_drop_pkts: Optional[int] = None + ipsec_tx_drop_bytes: Optional[int] = None + ipsec_rx_drop_sp_alloc: Optional[int] = None + ipsec_rx_drop_sadb_miss: Optional[int] = None + ipsec_rx_drop_syndrome: Optional[int] = None + ipsec_tx_drop_bundle: Optional[int] = None + ipsec_tx_drop_no_state: Optional[int] = None + ipsec_tx_drop_not_ip: Optional[int] = None + ipsec_tx_drop_trailer: Optional[int] = None + rx_prio0_buf_discard: Optional[int] = None + rx_prio0_cong_discard: Optional[int] = None + rx_prio1_buf_discard: Optional[int] = None + rx_prio1_cong_discard: Optional[int] = None + rx_prio2_buf_discard: Optional[int] = None + rx_prio2_cong_discard: Optional[int] = None + rx_prio3_buf_discard: Optional[int] = None + rx_prio3_cong_discard: Optional[int] = None + rx_prio4_buf_discard: Optional[int] = None + rx_prio4_cong_discard: Optional[int] = None + rx_prio5_buf_discard: Optional[int] = None + rx_prio5_cong_discard: Optional[int] = None + rx_prio6_buf_discard: Optional[int] = None + rx_prio6_cong_discard: Optional[int] = None + rx_prio7_buf_discard: Optional[int] = None + rx_prio7_cong_discard: Optional[int] = None + + error_fields: ClassVar[list[str]] = [ + "rx_xdp_drop", + "rx_xdp_tx_err", + "tx_queue_dropped", + "tx_cqe_err", + "tx_xdp_err", + "rx_wqe_err", + "rx_oversize_pkts_sw_drop", + "rx_buff_alloc_err", + "rx_arfs_err", + "rx_tls_err", + "rx_xsk_xdp_drop", + "rx_xsk_wqe_err", + 
"rx_xsk_oversize_pkts_sw_drop", + "rx_xsk_buff_alloc_err", + "tx_xsk_err", + "rx_out_of_buffer", + "rx_if_down_packets", + "rx_steer_missed_packets", + "rx_oversize_pkts_buffer", + "rx_crc_errors_phy", + "rx_in_range_len_errors_phy", + "rx_out_of_range_len_phy", + "rx_oversize_pkts_phy", + "rx_symbol_err_phy", + "rx_unsupported_op_phy", + "tx_pause_ctrl_phy", + "rx_discards_phy", + "tx_discards_phy", + "tx_errors_phy", + "rx_undersize_pkts_phy", + "rx_fragments_phy", + "rx_jabbers_phy", + "link_down_events_phy", + "rx_pcs_symbol_err_phy", + "rx_pci_signal_integrity", + "tx_pci_signal_integrity", + "outbound_pci_stalled_rd", + "outbound_pci_stalled_wr", + "outbound_pci_stalled_rd_events", + "outbound_pci_stalled_wr_events", + "rx_prio0_discards", + "rx_prio1_discards", + "rx_prio2_discards", + "rx_prio3_discards", + "rx_prio4_discards", + "rx_prio5_discards", + "rx_prio6_discards", + "rx_prio7_discards", + "tx_global_pause", + "tx_prio0_pause", + "tx_prio1_pause", + "tx_prio2_pause", + "tx_prio3_pause", + "tx_prio4_pause", + "tx_prio5_pause", + "tx_prio6_pause", + "tx_prio7_pause", + "tx_global_pause_duration", + "tx_prio0_pause_duration", + "tx_prio1_pause_duration", + "tx_prio2_pause_duration", + "tx_prio3_pause_duration", + "tx_prio4_pause_duration", + "tx_prio5_pause_duration", + "tx_prio6_pause_duration", + "tx_prio7_pause_duration", + "tx_pause_storm_warning_events", + "tx_pause_storm_error_events", + "module_unplug", + "module_bus_stuck", + "module_high_temp", + "module_bad_shorted", + "ipsec_rx_drop_pkts", + "ipsec_rx_drop_bytes", + "ipsec_rx_drop_mismatch_sa_sel", + "ipsec_tx_drop_pkts", + "ipsec_tx_drop_bytes", + "ipsec_rx_drop_sp_alloc", + "ipsec_rx_drop_sadb_miss", + "ipsec_rx_drop_syndrome", + "ipsec_tx_drop_bundle", + "ipsec_tx_drop_no_state", + "ipsec_tx_drop_not_ip", + "ipsec_tx_drop_trailer", + "rx_prio0_buf_discard", + "rx_prio0_cong_discard", + "rx_prio1_buf_discard", + "rx_prio1_cong_discard", + "rx_prio2_buf_discard", + "rx_prio2_cong_discard", 
+ "rx_prio3_buf_discard", + "rx_prio3_cong_discard", + "rx_prio4_buf_discard", + "rx_prio4_cong_discard", + "rx_prio5_buf_discard", + "rx_prio5_cong_discard", + "rx_prio6_buf_discard", + "rx_prio6_cong_discard", + "rx_prio7_buf_discard", + "rx_prio7_cong_discard", + ] + + warning_fields: ClassVar[list[str]] = [ + "rx_pause_ctrl_phy", + "rx_global_pause", + "rx_prio0_pause", + "rx_prio1_pause", + "rx_prio2_pause", + "rx_prio3_pause", + "rx_prio4_pause", + "rx_prio5_pause", + "rx_prio6_pause", + "rx_prio7_pause", + "rx_global_pause_transition", + "rx_prio0_pause_transition", + "rx_prio1_pause_transition", + "rx_prio2_pause_transition", + "rx_prio3_pause_transition", + "rx_prio4_pause_transition", + "rx_prio5_pause_transition", + "rx_prio6_pause_transition", + "rx_prio7_pause_transition", + "rx_global_pause_duration", + "rx_prio0_pause_duration", + "rx_prio1_pause_duration", + "rx_prio2_pause_duration", + "rx_prio3_pause_duration", + "rx_prio4_pause_duration", + "rx_prio5_pause_duration", + "rx_prio6_pause_duration", + "rx_prio7_pause_duration", + ] + + +VendorEthtoolStatisticsModel = ( + PollaraEthtoolStatistics | Thor2EthtoolStatistics | Cx7EthtoolStatistics +) + +VendorEthtoolStatisticsCls = Union[ + type[PollaraEthtoolStatistics], + type[Thor2EthtoolStatistics], + type[Cx7EthtoolStatistics], +] + + +# Map ifname prefixes to vendor-specific statistic models +# If netdev is ens, use Cx7 +# If netdev is benic, check if it starts with ionic or bnxt to determine if it's Pollara or Thor2 +VENDOR_PREFIX_MAP: dict[str, VendorEthtoolStatisticsCls] = { + "ionic": PollaraEthtoolStatistics, + "bnxt": Thor2EthtoolStatistics, + "mlx": Cx7EthtoolStatistics, +} + + +class EthtoolStatistics(BaseModel): + """Per-netdev ethtool -S row with optional vendor-parsed counters.""" + + netdev: Optional[str] = None + rdma_ifname: Optional[str] = Field( + default=None, + description="RDMA interface name from 'rdma link -j' used for vendor prefix selection", + ) + vendor_statistics: 
Optional[VendorEthtoolStatisticsModel] = None + + @model_validator(mode="after") + def validate_atleast_one_field(self) -> Self: + if not self.model_fields_set: + raise ValueError("At least one field must be set in EthtoolStatistics") + return self diff --git a/nodescraper/plugins/inband/network/network_analyzer.py b/nodescraper/plugins/inband/network/network_analyzer.py index dbd39fc8..27280c37 100644 --- a/nodescraper/plugins/inband/network/network_analyzer.py +++ b/nodescraper/plugins/inband/network/network_analyzer.py @@ -61,17 +61,16 @@ class NetworkAnalyzer(RegexAnalyzer[NetworkDataModel, NetworkAnalyzerArgs]): def analyze_data( self, data: NetworkDataModel, args: Optional[NetworkAnalyzerArgs] = None ) -> TaskResult: - """Analyze network statistics for non-zero error counters. - Currently only checks ethtool -S statistics. + """Analyze ethtool -S statistics: regex-based (per interface) and vendor-based (RDMA-scoped). Args: - data: Network data model with ethtool_info containing interface statistics. + data: Network data model with ethtool_info and/or rdma_ethtool_statistics. args: Optional analyzer arguments with custom error regex support. Returns: - TaskResult with status OK if no errors, ERROR if any error counter > 0. + TaskResult with OK, WARNING (no data or vendor warning counters only), or ERROR. 
""" - if not data.ethtool_info: + if not data.ethtool_info and not data.rdma_ethtool_statistics: self.result.message = "No network devices found" self.result.status = ExecutionStatus.WARNING return self.result @@ -81,26 +80,23 @@ def analyze_data( final_error_regex = self._convert_and_extend_error_regex(args.error_regex, self.ERROR_REGEX) - error_state = False + regex_error = False for interface_name, ethtool_info in data.ethtool_info.items(): - errors_on_interface = [] # (error_field, value) - # Loop through all statistics in the ethtool statistics dict + errors_on_interface: list[tuple[str, int]] = [] for stat_name, stat_value in ethtool_info.statistics.items(): - # Check if this statistic matches any error field pattern for error_regex_obj in final_error_regex: if error_regex_obj.regex.match(stat_name): - # Try to convert string value to int try: value = int(stat_value) except (ValueError, TypeError): - break # Skip non-numeric values + break if value > 0: errors_on_interface.append((stat_name, value)) - break # Stop checking patterns once we find a match + break if errors_on_interface: - error_state = True + regex_error = True error_names = [e[0] for e in errors_on_interface] errors_data = {field: value for field, value in errors_on_interface} self._log_event( @@ -114,9 +110,49 @@ def analyze_data( console_log=True, ) - if error_state: + vendor_error = False + vendor_warning = False + for stat in data.rdma_ethtool_statistics: + if stat.vendor_statistics is None: + continue + + vs = stat.vendor_statistics + error_fields = vs.error_fields + warning_fields = vs.warning_fields + + for field_name in error_fields + warning_fields: + error_value = getattr(vs, field_name, None) + if error_value is not None and error_value > 0: + is_warning_tier = field_name in warning_fields + priority = EventPriority.WARNING if is_warning_tier else EventPriority.ERROR + if is_warning_tier: + vendor_warning = True + else: + vendor_error = True + desc = ( + f"Ethtool warning detected: 
{field_name}" + if is_warning_tier + else f"Ethtool error detected: {field_name}" + ) + self._log_event( + category=EventCategory.NETWORK, + description=desc, + data={ + "netdev": stat.netdev, + "rdma_ifname": stat.rdma_ifname, + "error_field": field_name, + "error_count": error_value, + }, + priority=priority, + console_log=True, + ) + + if regex_error or vendor_error: self.result.message = "Network errors detected in statistics" self.result.status = ExecutionStatus.ERROR + elif vendor_warning: + self.result.message = "Network vendor ethtool warning counters non-zero" + self.result.status = ExecutionStatus.WARNING else: self.result.message = "No network errors detected in statistics" self.result.status = ExecutionStatus.OK diff --git a/nodescraper/plugins/inband/network/network_collector.py b/nodescraper/plugins/inband/network/network_collector.py index d530bd98..a962ddf7 100644 --- a/nodescraper/plugins/inband/network/network_collector.py +++ b/nodescraper/plugins/inband/network/network_collector.py @@ -23,15 +23,24 @@ # SOFTWARE. 
# ############################################################################### +import json import re from typing import Dict, List, Optional, Tuple +from pydantic import ValidationError + from nodescraper.base import InBandDataCollector from nodescraper.connection.inband import TextFileArtifact from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily from nodescraper.models import TaskResult +from nodescraper.utils import get_exception_traceback from .collector_args import NetworkCollectorArgs +from .ethtool_vendor import ( + VENDOR_PREFIX_MAP, + EthtoolStatistics, + VendorEthtoolStatisticsModel, +) from .networkdata import ( EthtoolInfo, IpAddress, @@ -53,6 +62,7 @@ class NetworkCollector(InBandDataCollector[NetworkDataModel, NetworkCollectorArg CMD_NEIGHBOR = "ip neighbor show" CMD_ETHTOOL_TEMPLATE = "ethtool {interface}" CMD_ETHTOOL_S_TEMPLATE = "ethtool -S {interface}" + CMD_RDMA_LINK_JSON = "rdma link -j" CMD_PING = "ping" CMD_WGET = "wget" CMD_CURL = "curl" @@ -519,6 +529,151 @@ def _collect_ethtool_info(self, interfaces: List[NetworkInterface]) -> Dict[str, return ethtool_data + def _collect_rdma_link_json(self) -> Optional[list[dict]]: + """Parse JSON from `rdma link -j`. 
Returns None on failure, [] when no links.""" + res = self._run_sut_cmd(self.CMD_RDMA_LINK_JSON) + if res.exit_code != 0: + self._log_event( + category=EventCategory.NETWORK, + description="rdma link -j failed (RDMA-scoped ethtool collection skipped)", + data={ + "command": self.CMD_RDMA_LINK_JSON, + "exit_code": res.exit_code, + "stderr": res.stderr, + }, + priority=EventPriority.WARNING, + ) + return None + if not res.stdout.strip(): + return [] + try: + parsed = json.loads(res.stdout) + except json.JSONDecodeError as e: + self._log_event( + category=EventCategory.NETWORK, + description="Failed to parse rdma link -j JSON", + data={"exception": get_exception_traceback(e)}, + priority=EventPriority.WARNING, + ) + return None + if not isinstance(parsed, list): + self._log_event( + category=EventCategory.NETWORK, + description="Unexpected rdma link -j JSON type", + data={"data_type": type(parsed).__name__}, + priority=EventPriority.WARNING, + ) + return None + return parsed + + def _collect_rdma_scoped_ethtool_statistic( + self, netdev: str, ifname: str + ) -> Optional[EthtoolStatistics]: + """Run `ethtool -S` for netdev and attach vendor-parsed stats (prefix from RDMA ifname).""" + cmd_s = f"ethtool -S {netdev}" + res = self._run_sut_cmd(cmd_s, sudo=True) + if res.exit_code != 0: + self._log_event( + category=EventCategory.NETWORK, + description=f"Error executing ethtool -S for device {netdev}", + data={ + "command": cmd_s, + "exit_code": res.exit_code, + "stderr": res.stderr, + }, + priority=EventPriority.ERROR, + console_log=True, + ) + return None + + if res.stdout: + self.result.artifacts.append( + TextFileArtifact( + filename=f"rdma-ethtool-{netdev}.log", + contents=res.stdout, + ) + ) + stats_dict = self._parse_ethtool_statistics(res.stdout, netdev) + + vendor_stats: VendorEthtoolStatisticsModel | None = None + for prefix, vendor_cls in VENDOR_PREFIX_MAP.items(): + if ifname.startswith(prefix): + vendor_fields = set(vendor_cls.model_fields.keys()) + 
stat_fields = set(stats_dict.keys()) - {"netdev"} + + missing_fields = vendor_fields - stat_fields + if missing_fields: + sorted_missing = sorted(missing_fields) + self._log_event( + category=EventCategory.NETWORK, + description=f"Missing fields in ethtool statistic for {netdev}", + data={ + "netdev": netdev, + "ifname": ifname, + "missing_fields_count": len(sorted_missing), + "missing_fields": sorted_missing[:50], + }, + priority=EventPriority.WARNING, + ) + + filtered_stats = {k: v for k, v in stats_dict.items() if k in vendor_fields} + try: + vendor_stats = vendor_cls.model_validate(filtered_stats) + except ValidationError as ve: + self._log_event( + category=EventCategory.NETWORK, + description=f"Failed to build vendor ethtool model for {netdev}", + data={"exception": get_exception_traceback(ve)}, + priority=EventPriority.WARNING, + ) + break + + return EthtoolStatistics( + netdev=netdev, + rdma_ifname=ifname or None, + vendor_statistics=vendor_stats, + ) + + def _collect_rdma_scoped_ethtool(self) -> tuple[List[str], List[EthtoolStatistics]]: + """Collect ethtool -S for netdevs listed on RDMA links (error-scraper EthtoolCollector parity).""" + netdev_list: List[str] = [] + statistics_list: List[EthtoolStatistics] = [] + + link_data = self._collect_rdma_link_json() + if link_data is None: + return netdev_list, statistics_list + + for link in link_data: + if not isinstance(link, dict): + self._log_event( + category=EventCategory.NETWORK, + description="Invalid data type for RDMA link entry", + data={"data_type": type(link).__name__}, + priority=EventPriority.WARNING, + ) + continue + + netdev = link.get("netdev") or "" + ifname = link.get("ifname") or "" + + if netdev: + netdev_list.append(netdev) + stat = self._collect_rdma_scoped_ethtool_statistic(netdev, ifname) + if stat is not None: + statistics_list.append(stat) + + if netdev_list: + self._log_event( + category=EventCategory.NETWORK, + description=( + f"Collected RDMA-scoped ethtool -S for 
{len(statistics_list)}/" + f"{len(netdev_list)} netdev(s) from rdma link" + ), + priority=EventPriority.INFO, + ) + + return netdev_list, statistics_list + def _collect_lldp_info(self) -> None: """Collect LLDP information using lldpcli and lldpctl commands.""" # Run lldpcli show neighbor @@ -618,6 +773,8 @@ def collect_data( neighbors = [] ethtool_data = {} network_accessible: Optional[bool] = None + rdma_ethtool_netdevs: List[str] = [] + rdma_ethtool_statistics: List[EthtoolStatistics] = [] # Check network connectivity if URL is provided if args and args.url: @@ -662,6 +819,9 @@ def collect_data( priority=EventPriority.INFO, ) + if self.system_info.os_family == OSFamily.LINUX: + rdma_ethtool_netdevs, rdma_ethtool_statistics = self._collect_rdma_scoped_ethtool() + # Collect routing table res_route = self._run_sut_cmd(self.CMD_ROUTE) if res_route.exit_code == 0: @@ -724,6 +884,8 @@ def collect_data( rules=rules, neighbors=neighbors, ethtool_info=ethtool_data, + rdma_ethtool_netdevs=rdma_ethtool_netdevs, + rdma_ethtool_statistics=rdma_ethtool_statistics, accessible=network_accessible, ) self.result.status = ExecutionStatus.OK diff --git a/nodescraper/plugins/inband/network/networkdata.py b/nodescraper/plugins/inband/network/networkdata.py index 20caaeca..c90a1fc1 100644 --- a/nodescraper/plugins/inband/network/networkdata.py +++ b/nodescraper/plugins/inband/network/networkdata.py @@ -29,6 +29,8 @@ from nodescraper.models import DataModel +from .ethtool_vendor import EthtoolStatistics + class IpAddress(BaseModel): """Individual IP address on an interface""" @@ -117,4 +119,7 @@ class NetworkDataModel(DataModel): ethtool_info: Dict[str, EthtoolInfo] = Field( default_factory=dict ) # Interface name -> EthtoolInfo mapping + # RDMA-scoped ethtool -S: netdevs from `rdma link -j` with vendor-parsed counters + rdma_ethtool_netdevs: List[str] = Field(default_factory=list) + rdma_ethtool_statistics: List[EthtoolStatistics] = Field(default_factory=list) accessible: 
Optional[bool] = None # Network accessibility check via ping diff --git a/test/unit/plugin/test_network_analyzer.py b/test/unit/plugin/test_network_analyzer.py index e886b765..6b7aeff3 100644 --- a/test/unit/plugin/test_network_analyzer.py +++ b/test/unit/plugin/test_network_analyzer.py @@ -27,6 +27,10 @@ from nodescraper.enums import EventPriority, ExecutionStatus from nodescraper.plugins.inband.network.analyzer_args import NetworkAnalyzerArgs +from nodescraper.plugins.inband.network.ethtool_vendor import ( + EthtoolStatistics, + Thor2EthtoolStatistics, +) from nodescraper.plugins.inband.network.network_analyzer import NetworkAnalyzer from nodescraper.plugins.inband.network.networkdata import ( EthtoolInfo, @@ -158,13 +162,55 @@ def test_multiple_interfaces_with_errors(network_analyzer): def test_empty_ethtool_info(network_analyzer): - """Test with empty ethtool_info: WARNING and message logged.""" + """Test with empty ethtool_info and no RDMA ethtool: WARNING and message logged.""" model = NetworkDataModel(ethtool_info={}) result = network_analyzer.analyze_data(model) assert result.status == ExecutionStatus.WARNING assert result.message == "No network devices found" +def test_rdma_ethtool_vendor_error_only(network_analyzer): + """RDMA-scoped vendor ethtool: error-tier counter raises ERROR.""" + stat = EthtoolStatistics( + netdev="eth0", + rdma_ifname="bnxt0", + vendor_statistics=Thor2EthtoolStatistics(tx_pfc_frames=4), + ) + model = NetworkDataModel(ethtool_info={}, rdma_ethtool_statistics=[stat]) + result = network_analyzer.analyze_data(model) + assert result.status == ExecutionStatus.ERROR + assert "Network errors detected" in result.message + assert len(result.events) == 1 + assert result.events[0].data["error_field"] == "tx_pfc_frames" + assert result.events[0].data["error_count"] == 4 + assert result.events[0].priority == EventPriority.ERROR + + +def test_rdma_ethtool_vendor_warning_only(network_analyzer): + """RDMA-scoped vendor ethtool: only warning-tier 
counters -> WARNING status.""" + stat = EthtoolStatistics( + netdev="eth0", + rdma_ifname="bnxt0", + vendor_statistics=Thor2EthtoolStatistics(rx_pause_frames=2), + ) + model = NetworkDataModel(ethtool_info={}, rdma_ethtool_statistics=[stat]) + result = network_analyzer.analyze_data(model) + assert result.status == ExecutionStatus.WARNING + assert "warning counters" in result.message + assert len(result.events) == 1 + assert result.events[0].data["error_field"] == "rx_pause_frames" + assert result.events[0].priority == EventPriority.WARNING + + +def test_rdma_ethtool_no_vendor_model_ok(network_analyzer): + """RDMA ethtool row without parsed vendor statistics is ignored by vendor path.""" + stat = EthtoolStatistics(netdev="eth0", rdma_ifname="unknown0", vendor_statistics=None) + model = NetworkDataModel(ethtool_info={}, rdma_ethtool_statistics=[stat]) + result = network_analyzer.analyze_data(model) + assert result.status == ExecutionStatus.OK + assert len(result.events) == 0 + + def test_regex_patterns_priority_numbers(network_analyzer): """Test that regex patterns match various priority numbers (0-7 and beyond).""" ethtool = EthtoolInfo( diff --git a/test/unit/plugin/test_network_collector.py b/test/unit/plugin/test_network_collector.py index 6382adeb..a7b1faae 100644 --- a/test/unit/plugin/test_network_collector.py +++ b/test/unit/plugin/test_network_collector.py @@ -648,3 +648,44 @@ def run_sut_cmd_side_effect(cmd, **kwargs): result, accessible = collector.check_network_accessibility() assert result.status == ExecutionStatus.ERRORS_DETECTED assert accessible is False + + +def test_collect_data_includes_rdma_ethtool(collector, conn_mock): + """RDMA-scoped ethtool -S is stored on NetworkDataModel when rdma link succeeds.""" + import json + + collector.system_info.os_family = OSFamily.LINUX + + rdma_link = [{"netdev": "eth0", "ifname": "bnxt0"}] + ethtool_s_bnxt = "NIC statistics:\n tx_pfc_frames: 0\n rx_pause_frames: 0\n" + + def run_sut_cmd_side_effect(cmd, 
**kwargs): + if "addr show" in cmd: + return MagicMock(exit_code=0, stdout=IP_ADDR_OUTPUT, command=cmd) + elif "route show" in cmd: + return MagicMock(exit_code=0, stdout=IP_ROUTE_OUTPUT, command=cmd) + elif "rule show" in cmd: + return MagicMock(exit_code=0, stdout=IP_RULE_OUTPUT, command=cmd) + elif "neighbor show" in cmd: + return MagicMock(exit_code=0, stdout=IP_NEIGHBOR_OUTPUT, command=cmd) + elif "rdma link -j" in cmd: + return MagicMock(exit_code=0, stdout=json.dumps(rdma_link), command=cmd) + elif "ethtool -S" in cmd and "eth0" in cmd: + return MagicMock(exit_code=0, stdout=ethtool_s_bnxt, command=cmd) + elif "ethtool" in cmd: + return MagicMock(exit_code=1, stdout="", command=cmd) + elif "lldpcli" in cmd or "lldpctl" in cmd: + return MagicMock(exit_code=1, stdout="", command=cmd) + return MagicMock(exit_code=1, stdout="", command=cmd) + + collector._run_sut_cmd = MagicMock(side_effect=run_sut_cmd_side_effect) + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.OK + assert data is not None + assert "eth0" in data.rdma_ethtool_netdevs + assert len(data.rdma_ethtool_statistics) == 1 + assert data.rdma_ethtool_statistics[0].netdev == "eth0" + assert data.rdma_ethtool_statistics[0].rdma_ifname == "bnxt0" + assert data.rdma_ethtool_statistics[0].vendor_statistics is not None From b7c291368034282fd3f2de4232d320df678da3a7 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 15 May 2026 11:08:55 -0500 Subject: [PATCH 14/25] utest fix --- nodescraper/plugins/inband/network/ethtool_vendor.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/nodescraper/plugins/inband/network/ethtool_vendor.py b/nodescraper/plugins/inband/network/ethtool_vendor.py index 04dbf4a4..47e91be9 100644 --- a/nodescraper/plugins/inband/network/ethtool_vendor.py +++ b/nodescraper/plugins/inband/network/ethtool_vendor.py @@ -622,9 +622,11 @@ class Cx7EthtoolStatistics(BaseModel): ] -VendorEthtoolStatisticsModel = ( - 
PollaraEthtoolStatistics | Thor2EthtoolStatistics | Cx7EthtoolStatistics -) +VendorEthtoolStatisticsModel = Union[ + PollaraEthtoolStatistics, + Thor2EthtoolStatistics, + Cx7EthtoolStatistics, +] VendorEthtoolStatisticsCls = Union[ type[PollaraEthtoolStatistics], From 825b2e0fe26b08736fb571ebdc9a9cc3d53362e9 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 15 May 2026 12:19:03 -0500 Subject: [PATCH 15/25] fix --- nodescraper/plugins/inband/network/network_collector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nodescraper/plugins/inband/network/network_collector.py b/nodescraper/plugins/inband/network/network_collector.py index a962ddf7..7e5e4a39 100644 --- a/nodescraper/plugins/inband/network/network_collector.py +++ b/nodescraper/plugins/inband/network/network_collector.py @@ -595,7 +595,7 @@ def _collect_rdma_scoped_ethtool_statistic( ) stats_dict = self._parse_ethtool_statistics(res.stdout, netdev) - vendor_stats: VendorEthtoolStatisticsModel | None = None + vendor_stats: Optional[VendorEthtoolStatisticsModel] = None for prefix, vendor_cls in VENDOR_PREFIX_MAP.items(): if ifname.startswith(prefix): vendor_fields = set(vendor_cls.model_fields.keys()) From 4151ccd89da30001c2ceaa111ab7800117edb601 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 19 May 2026 00:27:01 +0000 Subject: [PATCH 16/25] docs: Update plugin documentation [automated] --- docs/PLUGIN_DOC.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index 5e84641c..6ae77cc4 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -16,7 +16,7 @@ | KernelPlugin | sh -c 'uname -a'
sh -c 'cat /proc/sys/kernel/numa_balancing'
wmic os get Version /Value | **Analyzer Args:**
- `exp_kernel`: Union[str, list] — Expected kernel version string(s) to match (e.g. from uname -a).
- `exp_numa`: Optional[int] — Expected value for kernel.numa_balancing (e.g. 0 or 1).
- `regex_match`: bool — If True, match exp_kernel as regex; otherwise exact match. | - | [KernelDataModel](#KernelDataModel-Model) | [KernelCollector](#Collector-Class-KernelCollector) | [KernelAnalyzer](#Data-Analyzer-Class-KernelAnalyzer) | | KernelModulePlugin | cat /proc/modules
modinfo amdgpu
wmic os get Version /Value | **Analyzer Args:**
- `kernel_modules`: dict[str, dict] — Expected kernel module name -> {version, etc.}. Analyzer checks collected modules match.
- `regex_filter`: list[str] — List of regex patterns to filter which collected modules are checked (default: amd). | - | [KernelModuleDataModel](#KernelModuleDataModel-Model) | [KernelModuleCollector](#Collector-Class-KernelModuleCollector) | [KernelModuleAnalyzer](#Data-Analyzer-Class-KernelModuleAnalyzer) | | MemoryPlugin | free -b
lsmem
numactl -H
wmic OS get FreePhysicalMemory /Value; wmic ComputerSystem get TotalPhysicalMemory /Value | **Analyzer Args:**
- `ratio`: float — Required free-memory ratio (0-1). Analysis fails if free/total < ratio.
- `memory_threshold`: str — Minimum free memory required (e.g. '30Gi', '1T'). Used when ratio is not sufficient. | - | [MemoryDataModel](#MemoryDataModel-Model) | [MemoryCollector](#Collector-Class-MemoryCollector) | [MemoryAnalyzer](#Data-Analyzer-Class-MemoryAnalyzer) | -| NetworkPlugin | ip addr show
curl
ethtool -S {interface}
ethtool {interface}
lldpcli show neighbor
lldpctl
ip neighbor show
ping
ip route show
ip rule show
wget | **Built-in Regexes:**
- tx_pfc_frames is non-zero: `^tx_pfc_frames$`
- tx_pfc_ena_frames_pri* is non-zero: `^tx_pfc_ena_frames_pri\d+$`
- pfc_pri*_tx_transitions is non-zero: `^pfc_pri\d+_tx_transitions$`
**Analyzer Args:**
- `error_regex`: Union[list[nodescraper.base.regexanalyzer.ErrorRegex], list[dict], NoneType] — Custom error regex patterns; each item can be ErrorRegex or dict with category/pattern. | **Collection Args:**
- `url`: Optional[str] — Optional URL to probe for network connectivity (used with netprobe).
- `netprobe`: Optional[Literal['ping', 'wget', 'curl']] — Tool to use for network connectivity probe: ping, wget, or curl. | [NetworkDataModel](#NetworkDataModel-Model) | [NetworkCollector](#Collector-Class-NetworkCollector) | [NetworkAnalyzer](#Data-Analyzer-Class-NetworkAnalyzer) | +| NetworkPlugin | ip addr show
curl
ethtool -S {interface}
ethtool {interface}
lldpcli show neighbor
lldpctl
ip neighbor show
ping
rdma link -j
ip route show
ip rule show
wget | **Built-in Regexes:**
- tx_pfc_frames is non-zero: `^tx_pfc_frames$`
- tx_pfc_ena_frames_pri* is non-zero: `^tx_pfc_ena_frames_pri\d+$`
- pfc_pri*_tx_transitions is non-zero: `^pfc_pri\d+_tx_transitions$`
**Analyzer Args:**
- `error_regex`: Union[list[nodescraper.base.regexanalyzer.ErrorRegex], list[dict], NoneType] — Custom error regex patterns; each item can be ErrorRegex or dict with category/pattern. | **Collection Args:**
- `url`: Optional[str] — Optional URL to probe for network connectivity (used with netprobe).
- `netprobe`: Optional[Literal['ping', 'wget', 'curl']] — Tool to use for network connectivity probe: ping, wget, or curl. | [NetworkDataModel](#NetworkDataModel-Model) | [NetworkCollector](#Collector-Class-NetworkCollector) | [NetworkAnalyzer](#Data-Analyzer-Class-NetworkAnalyzer) | | NicPlugin | niccli --listdev
niccli --list
niccli --list_devices
niccli -dev {device_num} nvm -getoption pcie_relaxed_ordering
niccli --dev {device_num} nvm --getoption pcie_relaxed_ordering
niccli -dev {device_num} nvm -getoption performance_profile
niccli --dev {device_num} nvm --getoption performance_profile
niccli -dev {device_num} nvm -getoption support_rdma -scope 0
niccli -dev {device_num} getqos
niccli --dev {device_num} nvm --getoption support_rdma
niccli --dev {device_num} qos --ets --show
niccli --version
nicctl show card
nicctl --version
nicctl show card flash partition --json
nicctl show card interrupts --json
nicctl show card logs --non-persistent
nicctl show card logs --boot-fault
nicctl show card logs --persistent
nicctl show card profile --json
nicctl show card time --json
nicctl show card statistics packet-buffer summary --json
nicctl show lif statistics --json
nicctl show lif internal queue-to-ud-pinning
nicctl show pipeline internal anomalies
nicctl show pipeline internal rsq-ring
nicctl show pipeline internal statistics memory
nicctl show port fsm
nicctl show port transceiver --json
nicctl show port statistics --json
nicctl show port internal mac
nicctl show qos headroom --json
nicctl show rdma queue --json
nicctl show rdma queue-pair --detail --json
nicctl show version firmware
nicctl show dcqcn
nicctl show environment
nicctl show lif
nicctl show pcie ats
nicctl show port
nicctl show qos
nicctl show rdma statistics
nicctl show version host-software
nicctl show dcqcn --card {card_id} --json
nicctl show card hardware-config --card {card_id} | **Analyzer Args:**
- `expected_values`: Optional[Dict[str, Dict[str, Any]]] — Per-command expected checks keyed by canonical key (see command_to_canonical_key).
- `performance_profile_expected`: str — Expected Broadcom performance_profile value (case-insensitive). Default RoCE.
- `support_rdma_disabled_values`: List[str] — Values that indicate RDMA is not supported (case-insensitive).
- `pcie_relaxed_ordering_expected`: str — Expected Broadcom pcie_relaxed_ordering value (e.g. 'Relaxed ordering = enabled'); checked case-insensitively. Defaul...
- `expected_qos_prio_map`: Optional[Dict[Any, Any]] — Expected priority-to-TC map (e.g. {0: 0, 1: 1}; keys may be int or str in config). Checked per device when set.
- `expected_qos_pfc_enabled`: Optional[int] — Expected PFC enabled value (0/1 or bitmask). Checked per device when set.
- `expected_qos_tsa_map`: Optional[Dict[Any, Any]] — Expected TSA map for ETS (e.g. {0: 'ets', 1: 'strict'}; keys may be int or str in config). Checked per device when set.
- `expected_qos_tc_bandwidth`: Optional[List[int]] — Expected TC bandwidth percentages. Checked per device when set.
- `require_qos_consistent_across_adapters`: bool — When True and no expected_qos_* are set, require all adapters to have the same prio_map, pfc_enabled, and tsa_map.
- `nicctl_log_error_regex`: Optional[List[Dict[str, Any]]] — Optional list of error patterns for nicctl show card logs. | **Collection Args:**
- `commands`: Optional[List[str]] — Optional list of niccli/nicctl commands to run. When None, default command set is used.
- `use_sudo_niccli`: bool — If True, run niccli commands with sudo when required.
- `use_sudo_nicctl`: bool — If True, run nicctl commands with sudo when required. | [NicDataModel](#NicDataModel-Model) | [NicCollector](#Collector-Class-NicCollector) | [NicAnalyzer](#Data-Analyzer-Class-NicAnalyzer) | | NvmePlugin | nvme smart-log {dev}
nvme error-log {dev} --log-entries=256
nvme id-ctrl {dev}
nvme id-ns {dev}{ns}
nvme fw-log {dev}
nvme self-test-log {dev}
nvme get-log {dev} --log-id=6 --log-len=512
nvme telemetry-log {dev} --output-file={dev}_{f_name}
nvme list -o json | - | - | [NvmeDataModel](#NvmeDataModel-Model) | [NvmeCollector](#Collector-Class-NvmeCollector) | - | | OsPlugin | sh -c '( lsb_release -ds || (cat /etc/*release | grep PRETTY_NAME) || uname -om ) 2>/dev/null | head -n1'
cat /etc/*release | grep VERSION_ID
wmic os get Version /value
wmic os get Caption /Value | **Analyzer Args:**
- `exp_os`: Union[str, list] — Expected OS name/version string(s) to match (e.g. from lsb_release or /etc/os-release).
- `exact_match`: bool — If True, require exact match for exp_os; otherwise substring match. | - | [OsDataModel](#OsDataModel-Model) | [OsCollector](#Collector-Class-OsCollector) | [OsAnalyzer](#Data-Analyzer-Class-OsAnalyzer) | @@ -405,6 +405,7 @@ Collect network configuration details using ip command - **CMD_NEIGHBOR**: `ip neighbor show` - **CMD_ETHTOOL_TEMPLATE**: `ethtool {interface}` - **CMD_ETHTOOL_S_TEMPLATE**: `ethtool -S {interface}` +- **CMD_RDMA_LINK_JSON**: `rdma link -j` - **CMD_PING**: `ping` - **CMD_WGET**: `wget` - **CMD_CURL**: `curl` @@ -425,6 +426,7 @@ NetworkDataModel - lldpctl - ip neighbor show - ping +- rdma link -j - ip route show - ip rule show - wget @@ -1134,6 +1136,8 @@ Complete network configuration data - **rules**: `List[nodescraper.plugins.inband.network.networkdata.RoutingRule]` - **neighbors**: `List[nodescraper.plugins.inband.network.networkdata.Neighbor]` - **ethtool_info**: `Dict[str, nodescraper.plugins.inband.network.networkdata.EthtoolInfo]` +- **rdma_ethtool_netdevs**: `List[str]` +- **rdma_ethtool_statistics**: `List[nodescraper.plugins.inband.network.ethtool_vendor.EthtoolStatistics]` - **accessible**: `Optional[bool]` ## NicDataModel Model From 309e219863da2ede2cf70dc08f564043adeb5848 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 19 May 2026 09:13:01 -0500 Subject: [PATCH 17/25] codeowners update --- .github/CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index fe9fc1c5..b77d429c 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1 +1 @@ -* @landrews-amd @alexandraBara @jaspals3123 +* @alexandraBara From 4f2f277bb4c5a358ed33b93a8df1c87f8a514786 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 19 May 2026 11:13:06 -0500 Subject: [PATCH 18/25] exiting with NOT_RAN when cards not present --- .../plugins/inband/nic/nic_collector.py | 42 +++++++++++++------ test/unit/plugin/test_niccli_collector.py | 38 +++++++++++++++++ 2 files
changed, 67 insertions(+), 13 deletions(-) diff --git a/nodescraper/plugins/inband/nic/nic_collector.py b/nodescraper/plugins/inband/nic/nic_collector.py index 021fa3e3..c9c0b606 100644 --- a/nodescraper/plugins/inband/nic/nic_collector.py +++ b/nodescraper/plugins/inband/nic/nic_collector.py @@ -354,7 +354,9 @@ class NicCollector(InBandDataCollector[NicDataModel, NicCollectorArgs]): CMD_NICCLI_QOS_TEMPLATE_LEGACY, ] # New (> v233): double-dash options and qos --ets --show - CMD_NICCLI_SUPPORT_RDMA_TEMPLATE_NEW = "niccli --dev {device_num} nvm --getoption support_rdma" + CMD_NICCLI_SUPPORT_RDMA_TEMPLATE_NEW = ( + "niccli --dev {device_num} nvm --getoption support_rdma --scope 0" + ) CMD_NICCLI_PERFORMANCE_PROFILE_TEMPLATE_NEW = ( "niccli --dev {device_num} nvm --getoption performance_profile" ) @@ -471,6 +473,19 @@ def collect_data( card_ids = [c.id for c in legacy_cards] card_list_from_text = [c.model_dump() for c in legacy_cards] + if custom_commands is None and not device_nums and not card_ids: + self._log_event( + category=EventCategory.NETWORK, + description="No Broadcom (niccli) or Pensando (nicctl) NIC hardware detected", + priority=EventPriority.INFO, + ) + self.result.status = ExecutionStatus.NOT_RAN + self.result.message = ( + "No Broadcom (niccli) or Pensando (nicctl) NIC hardware detected; " + "NIC collection skipped" + ) + return self.result, None + # Build full command list (expand placeholders) if custom_commands is not None: commands_to_run: List[str] = [] @@ -486,18 +501,19 @@ def collect_data( else: commands_to_run = [] # niccli list already stored - per_device_templates = _get_niccli_per_device_templates(niccli_version) - for tpl in per_device_templates: - for d in device_nums: - commands_to_run.append(tpl.format(device_num=d)) - # nicctl global (card discovery already done via CMD_NICCTL_CARD_TEXT) - for c in NicCollector.CMD_NICCTL_GLOBAL: - commands_to_run.append(c) - for tpl in NicCollector.CMD_NICCTL_PER_CARD: - for cid in card_ids: - 
commands_to_run.append(tpl.format(card_id=cid)) - for cmd in NicCollector.CMD_NICCTL_LEGACY_TEXT: - commands_to_run.append(cmd) + if device_nums: + per_device_templates = _get_niccli_per_device_templates(niccli_version) + for tpl in per_device_templates: + for d in device_nums: + commands_to_run.append(tpl.format(device_num=d)) + if card_ids: + for c in NicCollector.CMD_NICCTL_GLOBAL: + commands_to_run.append(c) + for tpl in NicCollector.CMD_NICCTL_PER_CARD: + for cid in card_ids: + commands_to_run.append(tpl.format(card_id=cid)) + for cmd in NicCollector.CMD_NICCTL_LEGACY_TEXT: + commands_to_run.append(cmd) # Run each command and store (artifact-only commands are not added to results / data model). for cmd in commands_to_run: diff --git a/test/unit/plugin/test_niccli_collector.py b/test/unit/plugin/test_niccli_collector.py index c4e5adef..709ed8a3 100644 --- a/test/unit/plugin/test_niccli_collector.py +++ b/test/unit/plugin/test_niccli_collector.py @@ -242,6 +242,44 @@ def test_nic_data_model_with_pensando_nic(collector): assert data.pensando_nic_cards[1].serial_number == "FPL253710E5" +def test_collect_data_not_ran_when_no_nic_hardware(collector, conn_mock): + """Skip collection when discovery finds no Broadcom or Pensando NICs.""" + collector.system_info.os_family = OSFamily.LINUX + collector._run_sut_cmd = MagicMock( + return_value=MagicMock(exit_code=1, stdout="", stderr="not found", command="") + ) + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.NOT_RAN + assert data is None + assert "skipped" in result.message.lower() + assert collector._run_sut_cmd.call_count <= 4 + + +def test_collect_data_skips_nicctl_commands_when_no_pensando_cards(collector, conn_mock): + """Do not run nicctl global/legacy commands when nicctl show card finds no cards.""" + collector.system_info.os_family = OSFamily.LINUX + commands_run: list[str] = [] + + def run_sut_cmd_side_effect(cmd, **kwargs): + commands_run.append(cmd) + if "niccli" in cmd 
and ("--list" in cmd or "--list_devices" in cmd or "--listdev" in cmd): + return MagicMock(exit_code=0, stdout=NICCLI_LISTDEV_OUTPUT, stderr="", command=cmd) + if cmd.strip() == "nicctl show card": + return MagicMock(exit_code=1, stdout="", stderr="no card", command=cmd) + return MagicMock(exit_code=0, stdout="", stderr="", command=cmd) + + collector._run_sut_cmd = MagicMock(side_effect=run_sut_cmd_side_effect) + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.OK + assert data is not None + assert not any(c.startswith("nicctl show card flash") for c in commands_run) + assert not any(c == "nicctl --version" for c in commands_run) + + def test_collect_data_success(collector, conn_mock): """Test successful collection of niccli/nicctl data.""" collector.system_info.os_family = OSFamily.LINUX From 73cb9df41da8ef48ffe47c7b9a6e043ef1c044fb Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 19 May 2026 13:25:32 -0500 Subject: [PATCH 19/25] rocm versioning update --- .../plugins/inband/rocm/rocm_collector.py | 6 +- nodescraper/plugins/inband/rocm/rocmdata.py | 41 ++-- test/unit/plugin/test_rocm_collector.py | 190 +++++++++++++----- 3 files changed, 166 insertions(+), 71 deletions(-) diff --git a/nodescraper/plugins/inband/rocm/rocm_collector.py b/nodescraper/plugins/inband/rocm/rocm_collector.py index 7b910a69..27a6c4f5 100644 --- a/nodescraper/plugins/inband/rocm/rocm_collector.py +++ b/nodescraper/plugins/inband/rocm/rocm_collector.py @@ -25,6 +25,8 @@ ############################################################################### from typing import Optional +from pydantic import ValidationError + from nodescraper.base import InBandDataCollector from nodescraper.connection.inband import TextFileArtifact from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily @@ -41,7 +43,7 @@ class RocmCollector(InBandDataCollector[RocmDataModel, RocmCollectorArgs]): SUPPORTED_OS_FAMILY: set[OSFamily] = 
{OSFamily.LINUX} DATA_MODEL = RocmDataModel - CMD_ROCM_SUB_VERSIONS_TMPL = "grep . -r {rocm_path}/.info/*" + CMD_ROCM_SUB_VERSIONS_TMPL = "grep . -H -r -i {rocm_path}/.info/*" CMD_ROCMINFO_TMPL = "{rocm_path}/bin/rocminfo" CMD_ROCM_LATEST_TMPL = "ls -v -d {rocm_path}-[3-7]* | tail -1" CMD_ROCM_DIRS_TMPL = "ls -v -d {rocm_path}*" @@ -99,7 +101,7 @@ def collect_data( self.result.message = f"ROCm version: {rocm_data.rocm_version}" self.result.status = ExecutionStatus.OK break - except ValueError as e: + except (ValueError, ValidationError) as e: self._log_event( category=EventCategory.OS, description=f"Invalid ROCm version format: {res.stdout}", diff --git a/nodescraper/plugins/inband/rocm/rocmdata.py b/nodescraper/plugins/inband/rocm/rocmdata.py index c7e75608..cd1b0537 100644 --- a/nodescraper/plugins/inband/rocm/rocmdata.py +++ b/nodescraper/plugins/inband/rocm/rocmdata.py @@ -24,14 +24,18 @@ # ############################################################################### import re -from typing import List +from typing import ClassVar, List, Optional -from pydantic import field_validator +from pydantic import computed_field, field_validator from nodescraper.models import DataModel +_ROCM_VERSION_RE = re.compile(r"^(\d+(?:\.\d+){0,3})(?:-(\d+)(?:-gfx\w+(?:;gfx\w+)*)?)?$") + class RocmDataModel(DataModel): + ROCM_VERSION_FILENAME: ClassVar[str] = "version-rocm" + rocm_version: str rocm_sub_versions: dict[str, str] = {} rocminfo: List[str] = [] @@ -43,21 +47,28 @@ class RocmDataModel(DataModel): clinfo: List[str] = [] kfd_proc: List[str] = [] + @staticmethod + def _validate_version_string(version: str) -> str: + if not _ROCM_VERSION_RE.match(version): + raise ValueError(f"ROCm version has invalid format: {version}") + return version + @field_validator("rocm_version") @classmethod def validate_rocm_version(cls, rocm_version: str) -> str: - """ - Validate the ROCm version format. 
+ return cls._validate_version_string(rocm_version) - Args: - rocm_version (str): The ROCm version string to validate. - - Raises: - ValueError: If the ROCm version does not match the expected format. + @field_validator("rocm_sub_versions") + @classmethod + def validate_rocm_sub_versions(cls, sub_versions: dict[str, str]) -> dict[str, str]: + for version in sub_versions.values(): + cls._validate_version_string(version) + return sub_versions - Returns: - str: The validated ROCm version string. - """ - if not re.match(r"^\d+(?:\.\d+){0,3}(-\d+)?$", rocm_version): - raise ValueError(f"ROCm version has invalid format: {rocm_version}") - return rocm_version + @computed_field + def build_number(self) -> Optional[str]: + """Build tag from version-rocm sub-version, or rocm_version when absent.""" + rocm_version = self.rocm_sub_versions.get(self.ROCM_VERSION_FILENAME, self.rocm_version) + if "-" in rocm_version: + return rocm_version.split("-")[1] + return None diff --git a/test/unit/plugin/test_rocm_collector.py b/test/unit/plugin/test_rocm_collector.py index d1d1c09b..0cb8523c 100644 --- a/test/unit/plugin/test_rocm_collector.py +++ b/test/unit/plugin/test_rocm_collector.py @@ -32,6 +32,48 @@ from nodescraper.enums.systeminteraction import SystemInteractionLevel from nodescraper.plugins.inband.rocm.rocm_collector import RocmCollector +ROCM_SUB_VERSIONS_GREP_CMD = "grep . 
-H -r -i /opt/rocm/.info/*" +# gfx942 (CDNA3 / MI300) and gfx950 (CDNA4 / MI350) — released ROCm 7.13 LLVM targets +ROCM_7_13_GFX_VERSION = "7.13.0-123-gfx942;gfx950" + +ROCM_6_4_SUB_VERSIONS_STDOUT = ( + "/opt/rocm/.info/version:6.4.0-47\n" + "/opt/rocm/.info/version-hip-libraries:6.4.0-47\n" + "/opt/rocm/.info/version-hiprt:6.4.0-47\n" + "/opt/rocm/.info/version-hiprt-devel:6.4.0-47\n" + "/opt/rocm/.info/version-hip-sdk:6.4.0-47\n" + "/opt/rocm/.info/version-lrt:6.4.0-47\n" + "/opt/rocm/.info/version-ml-libraries:6.4.0-47\n" + "/opt/rocm/.info/version-ml-sdk:6.4.0-47\n" + "/opt/rocm/.info/version-oclrt:6.4.0-47\n" + "/opt/rocm/.info/version-ocl-sdk:6.4.0-47\n" + "/opt/rocm/.info/version-openmp-sdk:6.4.0-47\n" + "/opt/rocm/.info/version-rocm:6.4.0-47\n" + "/opt/rocm/.info/version-rocm-developer-tools:6.4.0-47\n" + "/opt/rocm/.info/version-utils:6.4.0-47\n" +) + +ROCM_6_4_EXPECTED_SUB_VERSIONS = { + "version": "6.4.0-47", + "version-hip-libraries": "6.4.0-47", + "version-hiprt": "6.4.0-47", + "version-hiprt-devel": "6.4.0-47", + "version-hip-sdk": "6.4.0-47", + "version-lrt": "6.4.0-47", + "version-ml-libraries": "6.4.0-47", + "version-ml-sdk": "6.4.0-47", + "version-oclrt": "6.4.0-47", + "version-ocl-sdk": "6.4.0-47", + "version-openmp-sdk": "6.4.0-47", + "version-rocm": "6.4.0-47", + "version-rocm-developer-tools": "6.4.0-47", + "version-utils": "6.4.0-47", +} + + +def _optional_collection_failures(count: int = 8): + return [MagicMock(exit_code=1, stdout="") for _ in range(count)] + @pytest.fixture def collector(system_info, conn_mock): @@ -64,7 +106,7 @@ def test_collect_rocm_version_fallback(collector): """Test fallback to version file when version-rocm fails""" collector._run_sut_cmd = MagicMock( side_effect=[ - # Sub-versions (grep . -r /opt/rocm/.info/*) + # Sub-versions (grep . -H -r -i /opt/rocm/.info/*) MagicMock(exit_code=0, stdout=""), # First path: version-rocm (fails) MagicMock(exit_code=1, stdout="", command="grep . 
/opt/rocm/.info/version-rocm"), @@ -113,7 +155,7 @@ def test_collect_all_rocm_data(collector): # Mock all command outputs in sequence (order must match collector's call order) collector._run_sut_cmd = MagicMock( side_effect=[ - # Sub-versions (grep . -r /opt/rocm/.info/*) + # Sub-versions (grep . -H -r -i /opt/rocm/.info/*) MagicMock(exit_code=0, stdout="/opt/rocm/.info/version-rocm:6.2.0-66"), # ROCm version (grep . /opt/rocm/.info/version-rocm) MagicMock(exit_code=0, stdout="6.2.0-66"), @@ -159,6 +201,7 @@ def test_collect_all_rocm_data(collector): # Verify ROCm version assert data.rocm_version == "6.2.0-66" + assert data.build_number == "66" # Verify ROCm latest path assert data.rocm_latest_versioned_path == "/opt/rocm-1.1.0" @@ -206,7 +249,7 @@ def test_collect_with_clinfo_failure(collector): """Test that clinfo failure is handled gracefully and captured in artifact""" collector._run_sut_cmd = MagicMock( side_effect=[ - # Sub-versions (grep . -r /opt/rocm/.info/*) + # Sub-versions (grep . -H -r -i /opt/rocm/.info/*) MagicMock(exit_code=0, stdout="/opt/rocm/.info/version-rocm:6.2.0-66"), # ROCm version (grep . /opt/rocm/.info/version-rocm) MagicMock(exit_code=0, stdout="6.2.0-66"), @@ -249,7 +292,7 @@ def test_collect_minimal_data(collector): """Test collection when only version is available""" collector._run_sut_cmd = MagicMock( side_effect=[ - # Sub-versions (grep . -r /opt/rocm/.info/*) + # Sub-versions (grep . -H -r -i /opt/rocm/.info/*) MagicMock(exit_code=0, stdout=""), # ROCm version (grep . 
/opt/rocm/.info/version-rocm) MagicMock(exit_code=0, stdout="6.2.0-66"), @@ -282,13 +325,28 @@ def test_collect_minimal_data(collector): assert data.kfd_proc == [] +def test_sub_versions_grep_uses_h_flag(collector): + """Sub-version discovery must use grep -H so single-match output includes the filename.""" + collector._run_sut_cmd = MagicMock( + side_effect=[ + MagicMock(exit_code=1, stdout=""), + MagicMock(exit_code=1, stdout=""), + MagicMock(exit_code=1, stdout=""), + ] + ) + + collector.collect_data() + + assert collector._run_sut_cmd.call_args_list[0].args[0] == ROCM_SUB_VERSIONS_GREP_CMD + + def test_invalid_rocm_version_format(collector): """Test that invalid ROCm version format is handled gracefully""" collector._run_sut_cmd = MagicMock( - return_value=MagicMock( - exit_code=0, - stdout="invalid_version_format", - ) + side_effect=[ + MagicMock(exit_code=0, stdout=""), + MagicMock(exit_code=0, stdout="invalid_version_format"), + ] ) result, data = collector.collect_data() @@ -296,57 +354,36 @@ def test_invalid_rocm_version_format(collector): assert result.status == ExecutionStatus.ERROR assert data is None assert len(result.events) >= 1 + assert any( + event.category == EventCategory.OS.value + and "Invalid ROCm version format" in event.description + for event in result.events + ) -def test_collect_rocm_sub_versions(collector): - """Test collection of ROCm version and multiple sub-versions.""" - sub_versions_stdout = ( - "/opt/rocm/.info/version:6.4.0-47\n" - "/opt/rocm/.info/version-hip-libraries:6.4.0-47\n" - "/opt/rocm/.info/version-hiprt:6.4.0-47\n" - "/opt/rocm/.info/version-hiprt-devel:6.4.0-47\n" - "/opt/rocm/.info/version-hip-sdk:6.4.0-47\n" - "/opt/rocm/.info/version-lrt:6.4.0-47\n" - "/opt/rocm/.info/version-ml-libraries:6.4.0-47\n" - "/opt/rocm/.info/version-ml-sdk:6.4.0-47\n" - "/opt/rocm/.info/version-oclrt:6.4.0-47\n" - "/opt/rocm/.info/version-ocl-sdk:6.4.0-47\n" - "/opt/rocm/.info/version-openmp-sdk:6.4.0-47\n" - 
"/opt/rocm/.info/version-rocm:6.4.0-47\n" - "/opt/rocm/.info/version-rocm-developer-tools:6.4.0-47\n" - "/opt/rocm/.info/version-utils:6.4.0-47\n" +def test_collect_invalid_sub_version_format(collector): + """Invalid sub-version values fail model validation during collection.""" + collector._run_sut_cmd = MagicMock( + side_effect=[ + MagicMock(exit_code=0, stdout="/opt/rocm/.info/version-rocm:not-a-version\n"), + MagicMock(exit_code=0, stdout="6.2.0-66"), + ] ) - expected_sub_versions = { - "version": "6.4.0-47", - "version-hip-libraries": "6.4.0-47", - "version-hiprt": "6.4.0-47", - "version-hiprt-devel": "6.4.0-47", - "version-hip-sdk": "6.4.0-47", - "version-lrt": "6.4.0-47", - "version-ml-libraries": "6.4.0-47", - "version-ml-sdk": "6.4.0-47", - "version-oclrt": "6.4.0-47", - "version-ocl-sdk": "6.4.0-47", - "version-openmp-sdk": "6.4.0-47", - "version-rocm": "6.4.0-47", - "version-rocm-developer-tools": "6.4.0-47", - "version-utils": "6.4.0-47", - } + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.ERROR + assert data is None + assert any("Invalid ROCm version format" in event.description for event in result.events) + + +def test_collect_rocm_sub_versions(collector): + """Test collection of ROCm version and multiple sub-versions (error-scraper test_run_new_version).""" collector._run_sut_cmd = MagicMock( side_effect=[ - # First: grep . -r /opt/rocm/.info/* (sub-versions) - MagicMock(exit_code=0, stdout=sub_versions_stdout), - # Second: grep . 
/opt/rocm/.info/version-rocm (main version) + MagicMock(exit_code=0, stdout=ROCM_6_4_SUB_VERSIONS_STDOUT), MagicMock(exit_code=0, stdout="6.4.0-47"), - # Optional data (all fail for minimal test) - MagicMock(exit_code=1, stdout=""), # latest path - MagicMock(exit_code=1, stdout=""), # all paths - MagicMock(exit_code=1, stdout=""), # rocminfo - MagicMock(exit_code=1, stdout=""), # ld.so.conf - MagicMock(exit_code=1, stdout=""), # rocm_libs - MagicMock(exit_code=1, stdout=""), # env_vars - MagicMock(exit_code=1, stdout=""), # clinfo - MagicMock(exit_code=1, stdout=""), # kfd_proc + *_optional_collection_failures(), ] ) @@ -355,6 +392,51 @@ def test_collect_rocm_sub_versions(collector): assert result.status == ExecutionStatus.OK assert data is not None assert data.rocm_version == "6.4.0-47" - assert data.rocm_sub_versions == expected_sub_versions + assert data.rocm_sub_versions == ROCM_6_4_EXPECTED_SUB_VERSIONS + assert data.build_number == "47" assert any(event.category == "ROCM_VERSION_READ" for event in result.events) assert "ROCm version: 6.4.0-47" in result.message + + +def test_collect_rocm_version_with_gfx_suffix(collector): + """ROCm 7.13+ version strings may include build and gfx target suffixes.""" + gfx_version = ROCM_7_13_GFX_VERSION + collector._run_sut_cmd = MagicMock( + side_effect=[ + MagicMock( + exit_code=0, + stdout=f"/opt/rocm/.info/version-rocm:{gfx_version}\n", + ), + MagicMock(exit_code=0, stdout=gfx_version), + *_optional_collection_failures(), + ] + ) + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.rocm_version == gfx_version + assert data.rocm_sub_versions["version-rocm"] == gfx_version + assert data.build_number == "123" + + +def test_collect_sub_versions_skips_lines_without_filename(collector): + """Lines without a filename prefix are ignored (grep without -H can produce these).""" + collector._run_sut_cmd = MagicMock( + side_effect=[ + MagicMock( + 
exit_code=0, + stdout="6.4.0-47\n/opt/rocm/.info/version-rocm:6.4.0-47\n", + ), + MagicMock(exit_code=0, stdout="6.4.0-47"), + *_optional_collection_failures(), + ] + ) + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.rocm_sub_versions == {"version-rocm": "6.4.0-47"} + assert data.build_number == "47" From e3ceca6a5f02b2c2cc9218a31c876db574a92a38 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 21 May 2026 00:28:03 +0000 Subject: [PATCH 20/25] docs: Update plugin documentation [automated] --- docs/PLUGIN_DOC.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index 6ae77cc4..509e8c84 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -17,7 +17,7 @@ | KernelModulePlugin | cat /proc/modules
modinfo amdgpu
wmic os get Version /Value | **Analyzer Args:**
- `kernel_modules`: dict[str, dict] — Expected kernel module name -> {version, etc.}. Analyzer checks collected modules match.
- `regex_filter`: list[str] — List of regex patterns to filter which collected modules are checked (default: amd). | - | [KernelModuleDataModel](#KernelModuleDataModel-Model) | [KernelModuleCollector](#Collector-Class-KernelModuleCollector) | [KernelModuleAnalyzer](#Data-Analyzer-Class-KernelModuleAnalyzer) | | MemoryPlugin | free -b
lsmem
numactl -H
wmic OS get FreePhysicalMemory /Value; wmic ComputerSystem get TotalPhysicalMemory /Value | **Analyzer Args:**
- `ratio`: float — Required free-memory ratio (0-1). Analysis fails if free/total < ratio.
- `memory_threshold`: str — Minimum free memory required (e.g. '30Gi', '1T'). Used when ratio is not sufficient. | - | [MemoryDataModel](#MemoryDataModel-Model) | [MemoryCollector](#Collector-Class-MemoryCollector) | [MemoryAnalyzer](#Data-Analyzer-Class-MemoryAnalyzer) | | NetworkPlugin | ip addr show
curl
ethtool -S {interface}
ethtool {interface}
lldpcli show neighbor
lldpctl
ip neighbor show
ping
rdma link -j
ip route show
ip rule show
wget | **Built-in Regexes:**
- tx_pfc_frames is non-zero: `^tx_pfc_frames$`
- tx_pfc_ena_frames_pri* is non-zero: `^tx_pfc_ena_frames_pri\d+$`
- pfc_pri*_tx_transitions is non-zero: `^pfc_pri\d+_tx_transitions$`
**Analyzer Args:**
- `error_regex`: Union[list[nodescraper.base.regexanalyzer.ErrorRegex], list[dict], NoneType] — Custom error regex patterns; each item can be ErrorRegex or dict with category/pattern. | **Collection Args:**
- `url`: Optional[str] — Optional URL to probe for network connectivity (used with netprobe).
- `netprobe`: Optional[Literal['ping', 'wget', 'curl']] — Tool to use for network connectivity probe: ping, wget, or curl. | [NetworkDataModel](#NetworkDataModel-Model) | [NetworkCollector](#Collector-Class-NetworkCollector) | [NetworkAnalyzer](#Data-Analyzer-Class-NetworkAnalyzer) | -| NicPlugin | niccli --listdev
niccli --list
niccli --list_devices
niccli -dev {device_num} nvm -getoption pcie_relaxed_ordering
niccli --dev {device_num} nvm --getoption pcie_relaxed_ordering
niccli -dev {device_num} nvm -getoption performance_profile
niccli --dev {device_num} nvm --getoption performance_profile
niccli -dev {device_num} nvm -getoption support_rdma -scope 0
niccli -dev {device_num} getqos
niccli --dev {device_num} nvm --getoption support_rdma
niccli --dev {device_num} qos --ets --show
niccli --version
nicctl show card
nicctl --version
nicctl show card flash partition --json
nicctl show card interrupts --json
nicctl show card logs --non-persistent
nicctl show card logs --boot-fault
nicctl show card logs --persistent
nicctl show card profile --json
nicctl show card time --json
nicctl show card statistics packet-buffer summary --json
nicctl show lif statistics --json
nicctl show lif internal queue-to-ud-pinning
nicctl show pipeline internal anomalies
nicctl show pipeline internal rsq-ring
nicctl show pipeline internal statistics memory
nicctl show port fsm
nicctl show port transceiver --json
nicctl show port statistics --json
nicctl show port internal mac
nicctl show qos headroom --json
nicctl show rdma queue --json
nicctl show rdma queue-pair --detail --json
nicctl show version firmware
nicctl show dcqcn
nicctl show environment
nicctl show lif
nicctl show pcie ats
nicctl show port
nicctl show qos
nicctl show rdma statistics
nicctl show version host-software
nicctl show dcqcn --card {card_id} --json
nicctl show card hardware-config --card {card_id} | **Analyzer Args:**
- `expected_values`: Optional[Dict[str, Dict[str, Any]]] — Per-command expected checks keyed by canonical key (see command_to_canonical_key).
- `performance_profile_expected`: str — Expected Broadcom performance_profile value (case-insensitive). Default RoCE.
- `support_rdma_disabled_values`: List[str] — Values that indicate RDMA is not supported (case-insensitive).
- `pcie_relaxed_ordering_expected`: str — Expected Broadcom pcie_relaxed_ordering value (e.g. 'Relaxed ordering = enabled'); checked case-insensitively. Defaul...
- `expected_qos_prio_map`: Optional[Dict[Any, Any]] — Expected priority-to-TC map (e.g. {0: 0, 1: 1}; keys may be int or str in config). Checked per device when set.
- `expected_qos_pfc_enabled`: Optional[int] — Expected PFC enabled value (0/1 or bitmask). Checked per device when set.
- `expected_qos_tsa_map`: Optional[Dict[Any, Any]] — Expected TSA map for ETS (e.g. {0: 'ets', 1: 'strict'}; keys may be int or str in config). Checked per device when set.
- `expected_qos_tc_bandwidth`: Optional[List[int]] — Expected TC bandwidth percentages. Checked per device when set.
- `require_qos_consistent_across_adapters`: bool — When True and no expected_qos_* are set, require all adapters to have the same prio_map, pfc_enabled, and tsa_map.
- `nicctl_log_error_regex`: Optional[List[Dict[str, Any]]] — Optional list of error patterns for nicctl show card logs. | **Collection Args:**
- `commands`: Optional[List[str]] — Optional list of niccli/nicctl commands to run. When None, default command set is used.
- `use_sudo_niccli`: bool — If True, run niccli commands with sudo when required.
- `use_sudo_nicctl`: bool — If True, run nicctl commands with sudo when required. | [NicDataModel](#NicDataModel-Model) | [NicCollector](#Collector-Class-NicCollector) | [NicAnalyzer](#Data-Analyzer-Class-NicAnalyzer) | +| NicPlugin | niccli --listdev
niccli --list
niccli --list_devices
niccli -dev {device_num} nvm -getoption pcie_relaxed_ordering
niccli --dev {device_num} nvm --getoption pcie_relaxed_ordering
niccli -dev {device_num} nvm -getoption performance_profile
niccli --dev {device_num} nvm --getoption performance_profile
niccli -dev {device_num} nvm -getoption support_rdma -scope 0
niccli -dev {device_num} getqos
niccli --dev {device_num} nvm --getoption support_rdma --scope 0
niccli --dev {device_num} qos --ets --show
niccli --version
nicctl show card
nicctl --version
nicctl show card flash partition --json
nicctl show card interrupts --json
nicctl show card logs --non-persistent
nicctl show card logs --boot-fault
nicctl show card logs --persistent
nicctl show card profile --json
nicctl show card time --json
nicctl show card statistics packet-buffer summary --json
nicctl show lif statistics --json
nicctl show lif internal queue-to-ud-pinning
nicctl show pipeline internal anomalies
nicctl show pipeline internal rsq-ring
nicctl show pipeline internal statistics memory
nicctl show port fsm
nicctl show port transceiver --json
nicctl show port statistics --json
nicctl show port internal mac
nicctl show qos headroom --json
nicctl show rdma queue --json
nicctl show rdma queue-pair --detail --json
nicctl show version firmware
nicctl show dcqcn
nicctl show environment
nicctl show lif
nicctl show pcie ats
nicctl show port
nicctl show qos
nicctl show rdma statistics
nicctl show version host-software
nicctl show dcqcn --card {card_id} --json
nicctl show card hardware-config --card {card_id} | **Analyzer Args:**
- `expected_values`: Optional[Dict[str, Dict[str, Any]]] — Per-command expected checks keyed by canonical key (see command_to_canonical_key).
- `performance_profile_expected`: str — Expected Broadcom performance_profile value (case-insensitive). Default RoCE.
- `support_rdma_disabled_values`: List[str] — Values that indicate RDMA is not supported (case-insensitive).
- `pcie_relaxed_ordering_expected`: str — Expected Broadcom pcie_relaxed_ordering value (e.g. 'Relaxed ordering = enabled'); checked case-insensitively. Defaul...
- `expected_qos_prio_map`: Optional[Dict[Any, Any]] — Expected priority-to-TC map (e.g. {0: 0, 1: 1}; keys may be int or str in config). Checked per device when set.
- `expected_qos_pfc_enabled`: Optional[int] — Expected PFC enabled value (0/1 or bitmask). Checked per device when set.
- `expected_qos_tsa_map`: Optional[Dict[Any, Any]] — Expected TSA map for ETS (e.g. {0: 'ets', 1: 'strict'}; keys may be int or str in config). Checked per device when set.
- `expected_qos_tc_bandwidth`: Optional[List[int]] — Expected TC bandwidth percentages. Checked per device when set.
- `require_qos_consistent_across_adapters`: bool — When True and no expected_qos_* are set, require all adapters to have the same prio_map, pfc_enabled, and tsa_map.
- `nicctl_log_error_regex`: Optional[List[Dict[str, Any]]] — Optional list of error patterns for nicctl show card logs. | **Collection Args:**
- `commands`: Optional[List[str]] — Optional list of niccli/nicctl commands to run. When None, default command set is used.
- `use_sudo_niccli`: bool — If True, run niccli commands with sudo when required.
- `use_sudo_nicctl`: bool — If True, run nicctl commands with sudo when required. | [NicDataModel](#NicDataModel-Model) | [NicCollector](#Collector-Class-NicCollector) | [NicAnalyzer](#Data-Analyzer-Class-NicAnalyzer) | | NvmePlugin | nvme smart-log {dev}
nvme error-log {dev} --log-entries=256
nvme id-ctrl {dev}
nvme id-ns {dev}{ns}
nvme fw-log {dev}
nvme self-test-log {dev}
nvme get-log {dev} --log-id=6 --log-len=512
nvme telemetry-log {dev} --output-file={dev}_{f_name}
nvme list -o json | - | - | [NvmeDataModel](#NvmeDataModel-Model) | [NvmeCollector](#Collector-Class-NvmeCollector) | - | | OsPlugin | sh -c '( lsb_release -ds || (cat /etc/*release | grep PRETTY_NAME) || uname -om ) 2>/dev/null | head -n1'
cat /etc/*release | grep VERSION_ID
wmic os get Version /value
wmic os get Caption /Value | **Analyzer Args:**
- `exp_os`: Union[str, list] — Expected OS name/version string(s) to match (e.g. from lsb_release or /etc/os-release).
- `exact_match`: bool — If True, require exact match for exp_os; otherwise substring match. | - | [OsDataModel](#OsDataModel-Model) | [OsCollector](#Collector-Class-OsCollector) | [OsAnalyzer](#Data-Analyzer-Class-OsAnalyzer) | | PackagePlugin | dnf list --installed
dpkg-query -W
pacman -Q
cat /etc/*release
wmic product get name,version | **Analyzer Args:**
- `exp_package_ver`: Dict[str, Optional[str]] — Map package name -> expected version (None = any version). Checked against installed packages.
- `regex_match`: bool — If True, match package versions with regex; otherwise exact or prefix match.
- `rocm_regex`: Optional[str] — Optional regex to identify ROCm package version (used when enable_rocm_regex is True).
- `enable_rocm_regex`: bool — If True, use rocm_regex (or default pattern) to extract ROCm version for checks. | - | [PackageDataModel](#PackageDataModel-Model) | [PackageCollector](#Collector-Class-PackageCollector) | [PackageAnalyzer](#Data-Analyzer-Class-PackageAnalyzer) | @@ -461,12 +461,12 @@ Collect raw output from niccli (Broadcom) and nicctl (Pensando) commands. niccli -dev {device_num} nvm -getoption pcie_relaxed_ordering, niccli -dev {device_num} getqos ]` -- **CMD_NICCLI_SUPPORT_RDMA_TEMPLATE_NEW**: `niccli --dev {device_num} nvm --getoption support_rdma` +- **CMD_NICCLI_SUPPORT_RDMA_TEMPLATE_NEW**: `niccli --dev {device_num} nvm --getoption support_rdma --scope 0` - **CMD_NICCLI_PERFORMANCE_PROFILE_TEMPLATE_NEW**: `niccli --dev {device_num} nvm --getoption performance_profile` - **CMD_NICCLI_PCIE_RELAXED_ORDERING_TEMPLATE_NEW**: `niccli --dev {device_num} nvm --getoption pcie_relaxed_ordering` - **CMD_NICCLI_QOS_TEMPLATE_NEW**: `niccli --dev {device_num} qos --ets --show` - **CMD_NICCLI_PER_DEVICE_NEW**: `[ - niccli --dev {device_num} nvm --getoption support_rdma, + niccli --dev {device_num} nvm --getoption support_rdma --scope 0, niccli --dev {device_num} nvm --getoption performance_profile, niccli --dev {device_num} nvm --getoption pcie_relaxed_ordering, niccli --dev {device_num} qos --ets --show @@ -533,7 +533,7 @@ NicDataModel - niccli --dev {device_num} nvm --getoption performance_profile - niccli -dev {device_num} nvm -getoption support_rdma -scope 0 - niccli -dev {device_num} getqos -- niccli --dev {device_num} nvm --getoption support_rdma +- niccli --dev {device_num} nvm --getoption support_rdma --scope 0 - niccli --dev {device_num} qos --ets --show - niccli --version - nicctl show card From f2bf3db1c5cb359233268a58e5048d4cb1577a3c Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 27 May 2026 08:58:22 -0500 Subject: [PATCH 21/25] readme fix --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md 
b/README.md index 2fda59b4..370005ee 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ system debug. ## Table of Contents - [Installation](#installation) - [Install from PyPI](#install-from-pypi) - - [Install From Source](#install-from-source) + - [Install from Source](#install-from-source) - [CLI Usage](#cli-usage) - [Execution Methods](#execution-methods) - [Example: Remote Execution](#example-remote-execution) @@ -38,7 +38,7 @@ Use a virtual environment if you prefer. After installation, confirm the CLI is node-scraper --help ``` -### Install From Source +### Install from Source Node Scraper requires Python 3.9+ for installation. After cloning this repository, call dev-setup.sh script with 'source'. This script creates an editable install of Node Scraper in a python virtual environment and also configures the pre-commit hooks for the project. From 508ef2aea22c0aa619acffa440fdc857eab4a369 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 27 May 2026 16:27:22 -0500 Subject: [PATCH 22/25] utest fix --- test/unit/framework/common/shared_utils.py | 7 +++++++ test/unit/framework/test_dataplugin.py | 3 ++- test/unit/framework/test_plugin_executor.py | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/test/unit/framework/common/shared_utils.py b/test/unit/framework/common/shared_utils.py index 1ffda40d..11e6f541 100644 --- a/test/unit/framework/common/shared_utils.py +++ b/test/unit/framework/common/shared_utils.py @@ -26,6 +26,7 @@ from typing import Optional from unittest.mock import MagicMock +from nodescraper.constants import DEFAULT_EVENT_REPORTER from nodescraper.enums import ExecutionStatus from nodescraper.interfaces import ConnectionManager, PluginInterface from nodescraper.models import AnalyzerArgs, PluginResult, TaskResult @@ -43,6 +44,9 @@ def __init__( parent=None, task_result_hooks=None, connection_args=None, + event_reporter: str = DEFAULT_EVENT_REPORTER, + session_id: Optional[str] = None, + **kwargs, ): super().__init__( 
system_info=system_info, @@ -50,6 +54,9 @@ def __init__( parent=parent, task_result_hooks=task_result_hooks, connection_args=connection_args, + event_reporter=event_reporter, + session_id=session_id, + **kwargs, ) # Use the class variable if available, otherwise create a new MagicMock self.connection = ( diff --git a/test/unit/framework/test_dataplugin.py b/test/unit/framework/test_dataplugin.py index 29816efa..e88f8cc5 100644 --- a/test/unit/framework/test_dataplugin.py +++ b/test/unit/framework/test_dataplugin.py @@ -28,7 +28,7 @@ from unittest.mock import MagicMock, patch import pytest -from common.shared_utils import MockConnectionManager +from framework.common.shared_utils import MockConnectionManager from nodescraper.enums import EventPriority, ExecutionStatus, SystemInteractionLevel from nodescraper.interfaces.dataanalyzertask import DataAnalyzer @@ -154,6 +154,7 @@ def test_collect_creates_connection_manager(self, plugin, conn_mock, system_info parent=plugin.__class__.__name__, task_result_hooks=plugin.task_result_hooks, event_reporter=plugin.event_reporter, + session_id=plugin.session_id, ) mock_collect.assert_called_once() assert result.status == ExecutionStatus.OK diff --git a/test/unit/framework/test_plugin_executor.py b/test/unit/framework/test_plugin_executor.py index 0f3568f5..fe9a8954 100644 --- a/test/unit/framework/test_plugin_executor.py +++ b/test/unit/framework/test_plugin_executor.py @@ -24,7 +24,7 @@ # ############################################################################### import pytest -from common.shared_utils import DummyDataModel, MockConnectionManager +from framework.common.shared_utils import DummyDataModel, MockConnectionManager from pydantic import BaseModel from nodescraper.enums import ExecutionStatus From bc0160ea6c60d56416622b247a4577194a324035 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 27 May 2026 16:30:14 -0500 Subject: [PATCH 23/25] enhancements --- .../device_enumeration_collector.py | 6 +++- 
.../plugins/inband/pcie/pcie_analyzer.py | 28 +++++++++++++++++-- .../test_device_enumeration_collector.py | 12 ++++++-- 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py b/nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py index 82a82f91..9b0dc295 100644 --- a/nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py +++ b/nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py @@ -38,7 +38,11 @@ class DeviceEnumerationCollector(InBandDataCollector[DeviceEnumerationDataModel, DATA_MODEL = DeviceEnumerationDataModel - CMD_GPU_COUNT_LINUX = "lspci -d {vendorid_ep}: | grep -i 'VGA\\|Display\\|3D' | wc -l" + CMD_GPU_COUNT_LINUX = ( + "lspci -d {vendorid_ep}: | grep -iE " + "'VGA|Display|3D|Processing accelerators|Co-processor|Accelerator' | " + "grep -vi 'Virtual Function' | wc -l" + ) CMD_VF_COUNT_LINUX = "lspci -d {vendorid_ep}: | grep -i 'Virtual Function' | wc -l" CMD_LSCPU_LINUX = "lscpu" CMD_LSHW_LINUX = "lshw" diff --git a/nodescraper/plugins/inband/pcie/pcie_analyzer.py b/nodescraper/plugins/inband/pcie/pcie_analyzer.py index 7d9a7e58..43bf0213 100755 --- a/nodescraper/plugins/inband/pcie/pcie_analyzer.py +++ b/nodescraper/plugins/inband/pcie/pcie_analyzer.py @@ -53,6 +53,9 @@ T_CAP = TypeVar("T_CAP", bound=PcieCapStructure) +_AMD_PCIE_BRIDGE_DEVICE_IDS = frozenset({0x1500, 0x1501}) +_PCI_BASE_CLASS_BRIDGE = 0x06 + class PcieAnalyzerInputModel(BaseModel): """ @@ -870,6 +873,20 @@ def filter_pcie_data_by_device_id( new_cfg_space_dict[bdf] = pcie_data return new_cfg_space_dict + @staticmethod + def _is_amd_gpu_pcie_endpoint(cfg_space: PcieCfgSpace, vendorid_ep: int) -> bool: + """True if this config space is an AMD GPU/accelerator endpoint, not a bridge.""" + t0 = cfg_space.type_0_configuration + if t0.vendor_id.val != vendorid_ep: + return False + device_id = t0.device_id.val + if device_id in 
_AMD_PCIE_BRIDGE_DEVICE_IDS: + return False + base_class = t0.class_code.val + if base_class == _PCI_BASE_CLASS_BRIDGE: + return False + return True + def check_gpu_count( self, pcie_data: PcieDataModel, @@ -888,10 +905,15 @@ def check_gpu_count( return gpu_count_from_pcie = 0 + bridge_count = 0 for cfg_space in pcie_data.pcie_cfg_space.values(): - vendor_id = cfg_space.type_0_configuration.vendor_id.val - if vendor_id == self.system_info.vendorid_ep: + t0 = cfg_space.type_0_configuration + if t0.vendor_id.val != self.system_info.vendorid_ep: + continue + if self._is_amd_gpu_pcie_endpoint(cfg_space, self.system_info.vendorid_ep): gpu_count_from_pcie += 1 + else: + bridge_count += 1 if gpu_count_from_pcie != expected_gpu_count: self._log_event( @@ -900,6 +922,7 @@ def check_gpu_count( priority=EventPriority.ERROR, data={ "gpu_count_from_pcie": gpu_count_from_pcie, + "amd_pcie_bridge_count_excluded": bridge_count, "expected_gpu_count": expected_gpu_count, }, ) @@ -910,6 +933,7 @@ def check_gpu_count( priority=EventPriority.INFO, data={ "gpu_count": gpu_count_from_pcie, + "amd_pcie_bridge_count_excluded": bridge_count, }, ) diff --git a/test/unit/plugin/test_device_enumeration_collector.py b/test/unit/plugin/test_device_enumeration_collector.py index 795611a6..50335f1f 100644 --- a/test/unit/plugin/test_device_enumeration_collector.py +++ b/test/unit/plugin/test_device_enumeration_collector.py @@ -66,7 +66,11 @@ def test_collect_linux(system_info, device_enumeration_collector): exit_code=0, stdout="8", stderr="", - command="lspci -d 1002: | grep -i 'VGA\\|Display\\|3D' | wc -l", + command=( + "lspci -d 1002: | grep -iE " + "'VGA|Display|3D|Processing accelerators|Co-processor|Accelerator' | " + "grep -vi 'Virtual Function' | wc -l" + ), ), MagicMock( exit_code=0, @@ -142,7 +146,11 @@ def test_collect_error(system_info, device_enumeration_collector): exit_code=1, stdout="some output", stderr="command failed", - command="lspci -d 1002: | grep -i 'VGA\\|Display\\|3D' | 
wc -l", + command=( + "lspci -d 1002: | grep -iE " + "'VGA|Display|3D|Processing accelerators|Co-processor|Accelerator' | " + "grep -vi 'Virtual Function' | wc -l" + ), ), MagicMock( exit_code=1, From 47c1131a306a2eb0aa147a56022700d3718e35fb Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 28 May 2026 00:28:58 +0000 Subject: [PATCH 24/25] docs: Update plugin documentation [automated] --- docs/PLUGIN_DOC.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index 509e8c84..23000097 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -24,7 +24,7 @@ | PciePlugin | lspci -d {vendor_id}: -nn
lspci -x
lspci -xxxx
lspci -PP
lspci -PP -d {vendor_id}:{dev_id}
lspci -vvv
lspci -vvvt | **Analyzer Args:**
- `exp_speed`: int — Expected PCIe link speed (generation 1–5).
- `exp_width`: int — Expected PCIe link width in lanes (1–16).
- `exp_sriov_count`: int — Expected SR-IOV virtual function count.
- `exp_gpu_count_override`: Optional[int] — Override expected GPU count for validation.
- `exp_max_payload_size`: Union[Dict[int, int], int, NoneType] — Expected max payload size: int for all devices, or dict keyed by device ID.
- `exp_max_rd_req_size`: Union[Dict[int, int], int, NoneType] — Expected max read request size: int for all devices, or dict keyed by device ID.
- `exp_ten_bit_tag_req_en`: Union[Dict[int, int], int, NoneType] — Expected 10-bit tag request enable: int for all devices, or dict keyed by device ID. | - | [PcieDataModel](#PcieDataModel-Model) | [PcieCollector](#Collector-Class-PcieCollector) | [PcieAnalyzer](#Data-Analyzer-Class-PcieAnalyzer) | | ProcessPlugin | top -b -n 1
rocm-smi --showpids
top -b -n 1 -o %CPU | **Analyzer Args:**
- `max_kfd_processes`: int — Maximum allowed number of KFD (Kernel Fusion Driver) processes; 0 disables the check.
- `max_cpu_usage`: float — Maximum allowed CPU usage (percent) for process checks. | **Collection Args:**
- `top_n_process`: int — Number of top processes by CPU usage to collect (e.g. for top -b -n 1 -o %CPU). | [ProcessDataModel](#ProcessDataModel-Model) | [ProcessCollector](#Collector-Class-ProcessCollector) | [ProcessAnalyzer](#Data-Analyzer-Class-ProcessAnalyzer) | | RdmaPlugin | rdma link -j
rdma dev
rdma link
rdma statistic -j | - | - | [RdmaDataModel](#RdmaDataModel-Model) | [RdmaCollector](#Collector-Class-RdmaCollector) | [RdmaAnalyzer](#Data-Analyzer-Class-RdmaAnalyzer) | -| RocmPlugin | {rocm_path}/opencl/bin/*/clinfo
env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'
ls /sys/class/kfd/kfd/proc/
grep -i -E 'rocm' /etc/ld.so.conf.d/*
{rocm_path}/bin/rocminfo
ls -v -d {rocm_path}*
ls -v -d {rocm_path}-[3-7]* | tail -1
ldconfig -p | grep -i -E 'rocm'
grep . -r {rocm_path}/.info/* | **Analyzer Args:**
- `exp_rocm`: Union[str, list] — Expected ROCm version string(s) to match (e.g. from rocminfo).
- `exp_rocm_latest`: str — Expected 'latest' ROCm path or version string for versioned installs.
- `exp_rocm_sub_versions`: dict[str, Union[str, list]] — Map sub-version name (e.g. version_rocm) to expected string or list of allowed strings. | **Collection Args:**
- `rocm_path`: str — Base path to ROCm installation (e.g. /opt/rocm). Used for rocminfo, clinfo, and version discovery. | [RocmDataModel](#RocmDataModel-Model) | [RocmCollector](#Collector-Class-RocmCollector) | [RocmAnalyzer](#Data-Analyzer-Class-RocmAnalyzer) | +| RocmPlugin | {rocm_path}/opencl/bin/*/clinfo
env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'
ls /sys/class/kfd/kfd/proc/
grep -i -E 'rocm' /etc/ld.so.conf.d/*
{rocm_path}/bin/rocminfo
ls -v -d {rocm_path}*
ls -v -d {rocm_path}-[3-7]* | tail -1
ldconfig -p | grep -i -E 'rocm'
grep . -H -r -i {rocm_path}/.info/* | **Analyzer Args:**
- `exp_rocm`: Union[str, list] — Expected ROCm version string(s) to match (e.g. from rocminfo).
- `exp_rocm_latest`: str — Expected 'latest' ROCm path or version string for versioned installs.
- `exp_rocm_sub_versions`: dict[str, Union[str, list]] — Map sub-version name (e.g. version_rocm) to expected string or list of allowed strings. | **Collection Args:**
- `rocm_path`: str — Base path to ROCm installation (e.g. /opt/rocm). Used for rocminfo, clinfo, and version discovery. | [RocmDataModel](#RocmDataModel-Model) | [RocmCollector](#Collector-Class-RocmCollector) | [RocmAnalyzer](#Data-Analyzer-Class-RocmAnalyzer) | | StoragePlugin | sh -c 'df -lH -B1 | grep -v 'boot''
wmic LogicalDisk Where DriveType="3" Get DeviceId,Size,FreeSpace | - | **Collection Args:**
- `skip_sudo`: bool — If True, do not use sudo when running df and related storage commands. | [StorageDataModel](#StorageDataModel-Model) | [StorageCollector](#Collector-Class-StorageCollector) | [StorageAnalyzer](#Data-Analyzer-Class-StorageAnalyzer) | | SysSettingsPlugin | cat /sys/{}
ls -1 /sys/{}
ls -l /sys/{} | **Analyzer Args:**
- `checks`: Optional[list[nodescraper.plugins.inband.sys_settings.analyzer_args.SysfsCheck]] — List of sysfs checks (path, expected values or pattern, display name). | **Collection Args:**
- `paths`: list[str] — Sysfs paths to read (cat). Paths with '*' are collected with ls -l (e.g. class/net/*/device).
- `directory_paths`: list[str] — Sysfs paths to list (ls -1); used for checks that match entry names by regex. | [SysSettingsDataModel](#SysSettingsDataModel-Model) | [SysSettingsCollector](#Collector-Class-SysSettingsCollector) | [SysSettingsAnalyzer](#Data-Analyzer-Class-SysSettingsAnalyzer) | | SysctlPlugin | sysctl -n | **Analyzer Args:**
- `exp_vm_swappiness`: Optional[int] — Expected vm.swappiness value.
- `exp_vm_numa_balancing`: Optional[int] — Expected vm.numa_balancing value.
- `exp_vm_oom_kill_allocating_task`: Optional[int] — Expected vm.oom_kill_allocating_task value.
- `exp_vm_compaction_proactiveness`: Optional[int] — Expected vm.compaction_proactiveness value.
- `exp_vm_compact_unevictable_allowed`: Optional[int] — Expected vm.compact_unevictable_allowed value.
- `exp_vm_extfrag_threshold`: Optional[int] — Expected vm.extfrag_threshold value.
- `exp_vm_zone_reclaim_mode`: Optional[int] — Expected vm.zone_reclaim_mode value.
- `exp_vm_dirty_background_ratio`: Optional[int] — Expected vm.dirty_background_ratio value.
- `exp_vm_dirty_ratio`: Optional[int] — Expected vm.dirty_ratio value.
- `exp_vm_dirty_writeback_centisecs`: Optional[int] — Expected vm.dirty_writeback_centisecs value.
- `exp_kernel_numa_balancing`: Optional[int] — Expected kernel.numa_balancing value. | - | [SysctlDataModel](#SysctlDataModel-Model) | [SysctlCollector](#Collector-Class-SysctlCollector) | [SysctlAnalyzer](#Data-Analyzer-Class-SysctlAnalyzer) | @@ -794,7 +794,7 @@ Collect ROCm version data ### Class Variables - **SUPPORTED_OS_FAMILY**: `{}` -- **CMD_ROCM_SUB_VERSIONS_TMPL**: `grep . -r {rocm_path}/.info/*` +- **CMD_ROCM_SUB_VERSIONS_TMPL**: `grep . -H -r -i {rocm_path}/.info/*` - **CMD_ROCMINFO_TMPL**: `{rocm_path}/bin/rocminfo` - **CMD_ROCM_LATEST_TMPL**: `ls -v -d {rocm_path}-[3-7]* | tail -1` - **CMD_ROCM_DIRS_TMPL**: `ls -v -d {rocm_path}*` @@ -818,7 +818,7 @@ RocmDataModel - ls -v -d {rocm_path}* - ls -v -d {rocm_path}-[3-7]* | tail -1 - ldconfig -p | grep -i -E 'rocm' -- grep . -r {rocm_path}/.info/* +- grep . -H -r -i {rocm_path}/.info/* ## Collector Class StorageCollector @@ -1285,6 +1285,7 @@ Data model for RDMA (Remote Direct Memory Access) statistics and link informatio ### Model annotations and fields +- **ROCM_VERSION_FILENAME**: `ClassVar[str]` - **rocm_version**: `str` - **rocm_sub_versions**: `dict[str, str]` - **rocminfo**: `List[str]` From 641082e74ad2bae908d040f2a266ef08f60e14ad Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 28 May 2026 13:57:00 +0000 Subject: [PATCH 25/25] docs: Update plugin documentation [automated] --- docs/PLUGIN_DOC.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index 23000097..3b6c3a3f 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -7,7 +7,7 @@ | AmdSmiPlugin | bad-pages
firmware --json
list --json
metric -g all
partition --json
process --json
ras --cper --folder={folder}
ras --afid --cper-file {cper_file}
static -g all --json
static -g {gpu_id} --json
topology
version --json
xgmi -l
xgmi -m | **Analyzer Args:**
- `check_static_data`: bool — If True, run static data checks (e.g. driver version, partition mode).
- `expected_gpu_processes`: Optional[int] — Expected number of GPU processes.
- `expected_max_power`: Optional[int] — Expected maximum power value (e.g. watts).
- `expected_driver_version`: Optional[str] — Expected AMD driver version string.
- `expected_memory_partition_mode`: Optional[str] — Expected memory partition mode (e.g. sp3, dp).
- `expected_compute_partition_mode`: Optional[str] — Expected compute partition mode.
- `expected_firmware_versions`: Optional[dict[str, str]] — Expected firmware versions keyed by amd-smi fw_id (e.g. PLDM_BUNDLE).
- `l0_to_recovery_count_error_threshold`: Optional[int] — L0-to-recovery count above which an error is raised.
- `l0_to_recovery_count_warning_threshold`: Optional[int] — L0-to-recovery count above which a warning is raised.
- `vendorid_ep`: Optional[str] — Expected endpoint vendor ID (e.g. for PCIe).
- `vendorid_ep_vf`: Optional[str] — Expected endpoint VF vendor ID.
- `devid_ep`: Optional[str] — Expected endpoint device ID.
- `devid_ep_vf`: Optional[str] — Expected endpoint VF device ID.
- `sku_name`: Optional[str] — Expected SKU name string for GPU.
- `expected_xgmi_speed`: Optional[list[float]] — Expected xGMI speed value(s) (e.g. link rate).
- `analysis_range_start`: Optional[datetime.datetime] — Start of time range for time-windowed analysis.
- `analysis_range_end`: Optional[datetime.datetime] — End of time range for time-windowed analysis. | **Collection Args:**
- `analysis_firmware_ids`: Optional[list[str]] — amd-smi fw_id values to record in analysis_ref.firmware_versions.
- `cper_file_path`: Optional[str] — Path to CPER folder or file for RAS AFID collection (ras --afid --cper-file). | [AmdSmiDataModel](#AmdSmiDataModel-Model) | [AmdSmiCollector](#Collector-Class-AmdSmiCollector) | [AmdSmiAnalyzer](#Data-Analyzer-Class-AmdSmiAnalyzer) | | BiosPlugin | sh -c 'cat /sys/devices/virtual/dmi/id/bios_version'
wmic bios get SMBIOSBIOSVersion /Value | **Analyzer Args:**
- `exp_bios_version`: list[str] — Expected BIOS version(s) to match against collected value (str or list).
- `regex_match`: bool — If True, match exp_bios_version as regex; otherwise exact match. | - | [BiosDataModel](#BiosDataModel-Model) | [BiosCollector](#Collector-Class-BiosCollector) | [BiosAnalyzer](#Data-Analyzer-Class-BiosAnalyzer) | | CmdlinePlugin | cat /proc/cmdline | **Analyzer Args:**
- `required_cmdline`: Union[str, List] — Command-line parameters that must be present (e.g. 'pci=bfsort').
- `banned_cmdline`: Union[str, List] — Command-line parameters that must not be present.
- `os_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig] — Per-OS overrides for required_cmdline and banned_cmdline (keyed by OS identifier).
- `platform_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig] — Per-platform overrides for required_cmdline and banned_cmdline (keyed by platform). | - | [CmdlineDataModel](#CmdlineDataModel-Model) | [CmdlineCollector](#Collector-Class-CmdlineCollector) | [CmdlineAnalyzer](#Data-Analyzer-Class-CmdlineAnalyzer) | -| DeviceEnumerationPlugin | powershell -Command "(Get-WmiObject -Class Win32_Processor | Measure-Object).Count"
lspci -d {vendorid_ep}: | grep -i 'VGA\|Display\|3D' | wc -l
powershell -Command "(wmic path win32_VideoController get name | findstr AMD | Measure-Object).Count"
lscpu
lshw
lspci -d {vendorid_ep}: | grep -i 'Virtual Function' | wc -l
powershell -Command "(Get-VMHostPartitionableGpu | Measure-Object).Count" | **Analyzer Args:**
- `cpu_count`: Optional[list[int]] — Expected CPU count(s); pass as int or list of ints. Analysis passes if actual is in list.
- `gpu_count`: Optional[list[int]] — Expected GPU count(s); pass as int or list of ints. Analysis passes if actual is in list.
- `vf_count`: Optional[list[int]] — Expected virtual function count(s); pass as int or list of ints. Analysis passes if actual is in list. | - | [DeviceEnumerationDataModel](#DeviceEnumerationDataModel-Model) | [DeviceEnumerationCollector](#Collector-Class-DeviceEnumerationCollector) | [DeviceEnumerationAnalyzer](#Data-Analyzer-Class-DeviceEnumerationAnalyzer) | +| DeviceEnumerationPlugin | powershell -Command "(Get-WmiObject -Class Win32_Processor | Measure-Object).Count"
lspci -d {vendorid_ep}: | grep -iE 'VGA|Display|3D|Processing accelerators|Co-processor|Accelerator' | grep -vi 'Virtual Function' | wc -l
powershell -Command "(wmic path win32_VideoController get name | findstr AMD | Measure-Object).Count"
lscpu
lshw
lspci -d {vendorid_ep}: | grep -i 'Virtual Function' | wc -l
powershell -Command "(Get-VMHostPartitionableGpu | Measure-Object).Count" | **Analyzer Args:**
- `cpu_count`: Optional[list[int]] — Expected CPU count(s); pass as int or list of ints. Analysis passes if actual is in list.
- `gpu_count`: Optional[list[int]] — Expected GPU count(s); pass as int or list of ints. Analysis passes if actual is in list.
- `vf_count`: Optional[list[int]] — Expected virtual function count(s); pass as int or list of ints. Analysis passes if actual is in list. | - | [DeviceEnumerationDataModel](#DeviceEnumerationDataModel-Model) | [DeviceEnumerationCollector](#Collector-Class-DeviceEnumerationCollector) | [DeviceEnumerationAnalyzer](#Data-Analyzer-Class-DeviceEnumerationAnalyzer) | | DimmPlugin | sh -c 'dmidecode -t 17 | tr -s " " | grep -v "Volatile\|None\|Module" | grep Size' 2>/dev/null
dmidecode
wmic memorychip get Capacity | - | **Collection Args:**
- `skip_sudo`: bool — If True, do not use sudo when running dmidecode or wmic for memory info. | [DimmDataModel](#DimmDataModel-Model) | [DimmCollector](#Collector-Class-DimmCollector) | - | | DkmsPlugin | dkms status
dkms --version | **Analyzer Args:**
- `dkms_status`: Union[str, list] — Expected dkms status string(s) to match (e.g. 'amd/1.0.0'). At least one of dkms_status or dkms_version required.
- `dkms_version`: Union[str, list] — Expected dkms version string(s) to match. At least one of dkms_status or dkms_version required.
- `regex_match`: bool — If True, match dkms_status and dkms_version as regex; otherwise exact match. | - | [DkmsDataModel](#DkmsDataModel-Model) | [DkmsCollector](#Collector-Class-DkmsCollector) | [DkmsAnalyzer](#Data-Analyzer-Class-DkmsAnalyzer) | | DmesgPlugin | dmesg --time-format iso -x
ls -1 /var/log/dmesg* 2>/dev/null | grep -E '^/var/log/dmesg(\.[0-9]+(\.gz)?)?$' || true | **Built-in Regexes:**
- Out of memory error: `(?:oom_kill_process.*)|(?:Out of memory.*)`
- I/O Page Fault: `IO_PAGE_FAULT`
- Kernel Panic: `\bkernel panic\b.*`
- SQ Interrupt: `sq_intr`
- SRAM ECC: `sram_ecc.*`
- Failed to load driver. IP hardware init error.: `\[amdgpu\]\] \*ERROR\* hw_init of IP block.*`
- Failed to load driver. IP software init error.: `\[amdgpu\]\] \*ERROR\* sw_init of IP block.*`
- Real Time throttling activated: `sched: RT throttling activated.*`
- RCU preempt detected stalls: `rcu_preempt detected stalls.*`
- RCU preempt self-detected stall: `rcu_preempt self-detected stall.*`
- QCM fence timeout: `qcm fence wait loop timeout.*`
- General protection fault: `(?:[\w-]+(?:\[[0-9.]+\])?\s+)?general protectio...`
- Segmentation fault: `(?:segfault.*in .*\[)|(?:[Ss]egmentation [Ff]au...`
- Failed to disallow cf state: `amdgpu: Failed to disallow cf state.*`
- Failed to terminate tmr: `\*ERROR\* Failed to terminate tmr.*`
- Suspend of IP block failed: `\*ERROR\* suspend of IP block <\w+> failed.*`
- amdgpu Page Fault: `(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:\s+\[\S...`
- Page Fault: `page fault for address.*`
- Fatal error during GPU init: `(?:amdgpu)(.*Fatal error during GPU init)|(Fata...`
- PCIe AER Error Status: `(pcieport [\w:.]+: AER: aer_status:[^\n]*(?:\n[...`
- PCIe AER Correctable Error Status: `(.*aer_cor_status: 0x[0-9a-fA-F]+, aer_cor_mask...`
- PCIe AER Uncorrectable Error Status: `(.*aer_uncor_status: 0x[0-9a-fA-F]+, aer_uncor_...`
- PCIe AER Uncorrectable Error Severity with TLP Header: `(.*aer_uncor_severity: 0x[0-9a-fA-F]+.*)(\n.*TL...`
- Failed to read journal file: `Failed to read journal file.*`
- Journal file corrupted or uncleanly shut down: `journal corrupted or uncleanly shut down.*`
- ACPI BIOS Error: `ACPI BIOS Error`
- ACPI Error: `ACPI Error`
- Filesystem corrupted!: `EXT4-fs error \(device .*\):`
- Error in buffered IO, check filesystem integrity: `(Buffer I\/O error on dev)(?:ice)? (\w+)`
- PCIe card no longer present: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(Slot\(...`
- PCIe Link Down: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(Slot\(...`
- Mismatched clock configuration between PCIe device and host: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(curren...`
- RAS Correctable Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- RAS Uncorrectable Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- RAS Deferred Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- RAS Corrected PCIe Error: `((?:\[Hardware Error\]:\s+)?event severity: cor...`
- GPU Reset: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- GPU reset failed: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- ACA Error: `(Accelerator Check Architecture[^\n]*)(?:\n[^\n...`
- ACA Error: `(Accelerator Check Architecture[^\n]*)(?:\n[^\n...`
- MCE Error: `\[Hardware Error\]:.+MC\d+_STATUS.*(?:\n.*){0,5}`
- Mode 2 Reset Failed: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)? (...`
- RAS Corrected Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- SGX Error: `x86/cpu: SGX disabled by BIOS`
- MMP Error: `Failed to load MMP firmware qat_4xxx_mmp.bin`
- GPU Throttled: `amdgpu \w{4}:\w{2}:\w{2}.\w: amdgpu: WARN: GPU ...`
- RAS Poison Consumed: `amdgpu[ 0-9a-fA-F:.]+:(?:\s*amdgpu:)?\s+(?:{\d+...`
- RAS Poison created: `amdgpu[ 0-9a-fA-F:.]+:(?:\s*amdgpu:)?\s+(?:{\d+...`
- Bad page threshold exceeded: `(amdgpu: Saved bad pages (\d+) reaches threshol...`
- RAS Hardware Error: `Hardware error from APEI Generic Hardware Error...`
- Error Address: `Error Address.*(?:\s.*)`
- RAS EDR Event: `EDR: EDR event received`
- DPC Event: `DPC: .*`
- LNet: ko2iblnd has no matching interfaces: `(?:\[[^\]]+\]\s*)?LNetError:.*ko2iblnd:\s*No ma...`
- LNet: Error starting up LNI: `(?:\[[^\]]+\]\s*)?LNetError:\s*.*Error\s*-?\d+\...`
- Lustre: network initialisation failed: `LustreError:.*ptlrpc_init_portals\(\).*network ...` | **Collection Args:**
- `collect_rotated_logs`: bool — If True, also collect rotated dmesg log files from /var/log/dmesg*.
- `skip_sudo`: bool — If True, do not use sudo when running dmesg or listing log files.
- `log_dmesg_data`: bool — If True, log the collected dmesg output in artifacts. | [DmesgData](#DmesgData-Model) | [DmesgCollector](#Collector-Class-DmesgCollector) | [DmesgAnalyzer](#Data-Analyzer-Class-DmesgAnalyzer) | @@ -142,7 +142,7 @@ Collect CPU and GPU count ### Class Variables -- **CMD_GPU_COUNT_LINUX**: `lspci -d {vendorid_ep}: | grep -i 'VGA\|Display\|3D' | wc -l` +- **CMD_GPU_COUNT_LINUX**: `lspci -d {vendorid_ep}: | grep -iE 'VGA|Display|3D|Processing accelerators|Co-processor|Accelerator' | grep -vi 'Virtual Function' | wc -l` - **CMD_VF_COUNT_LINUX**: `lspci -d {vendorid_ep}: | grep -i 'Virtual Function' | wc -l` - **CMD_LSCPU_LINUX**: `lscpu` - **CMD_LSHW_LINUX**: `lshw` @@ -157,7 +157,7 @@ DeviceEnumerationDataModel ### Commands - powershell -Command "(Get-WmiObject -Class Win32_Processor | Measure-Object).Count" -- lspci -d {vendorid_ep}: | grep -i 'VGA\|Display\|3D' | wc -l +- lspci -d {vendorid_ep}: | grep -iE 'VGA|Display|3D|Processing accelerators|Co-processor|Accelerator' | grep -vi 'Virtual Function' | wc -l - powershell -Command "(wmic path win32_VideoController get name | findstr AMD | Measure-Object).Count" - lscpu - lshw