diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index fe9fc1c5..b77d429c 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1 +1 @@
-* @landrews-amd @alexandraBara @jaspals3123
+* @alexandraBara
diff --git a/.github/workflows/code_quality_checks.yml b/.github/workflows/code_quality_checks.yml
index d3706b1c..2408a8ce 100644
--- a/.github/workflows/code_quality_checks.yml
+++ b/.github/workflows/code_quality_checks.yml
@@ -14,12 +14,19 @@ jobs:
container: python:3.9
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
with:
fetch-depth: 0 # Fetch all history for pre-commit to work
+
+ # python:3.9 image has no git; pre-commit requires it.
+ - name: Install git
+ run: |
+ apt-get update
+ apt-get install -y --no-install-recommends git
+
- name: Configure git for container
run: |
- git config --global --add safe.directory /__w/node-scraper/node-scraper
+ git config --global --add safe.directory "$GITHUB_WORKSPACE"
git config --global user.email "ci@github.com"
git config --global user.name "CI Bot"
- name: setup environment and run pre-commit hooks
diff --git a/.github/workflows/functional-test.yml b/.github/workflows/functional-test.yml
index 8fd1fcf4..2c6618b5 100644
--- a/.github/workflows/functional-test.yml
+++ b/.github/workflows/functional-test.yml
@@ -15,7 +15,7 @@ jobs:
container: python:3.9
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Install xmllint
run: |
diff --git a/.github/workflows/release-trusted-publisher.yml b/.github/workflows/release-trusted-publisher.yml
index 78e37b0a..55267d48 100644
--- a/.github/workflows/release-trusted-publisher.yml
+++ b/.github/workflows/release-trusted-publisher.yml
@@ -24,7 +24,7 @@ jobs:
steps:
- name: Checkout code
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
with:
fetch-depth: 0 # Fetch all history and tags
token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/unit-test.yml b/.github/workflows/unit-test.yml
index 7a4b17c7..fe6dc9c2 100644
--- a/.github/workflows/unit-test.yml
+++ b/.github/workflows/unit-test.yml
@@ -15,7 +15,7 @@ jobs:
container: python:3.9
steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
- name: Install xmllint
run: |
diff --git a/README.md b/README.md
index 27c14577..370005ee 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ system debug.
## Table of Contents
- [Installation](#installation)
- [Install from PyPI](#install-from-pypi)
- - [Install From Source](#install-from-source)
+ - [Install from Source](#install-from-source)
- [CLI Usage](#cli-usage)
- [Execution Methods](#execution-methods)
- [Example: Remote Execution](#example-remote-execution)
@@ -38,7 +38,7 @@ Use a virtual environment if you prefer. After installation, confirm the CLI is
node-scraper --help
```
-### Install From Source
+### Install from Source
Node Scraper requires Python 3.9+ for installation. After cloning this repository,
call dev-setup.sh script with 'source'. This script creates an editable install of Node Scraper in
a python virtual environment and also configures the pre-commit hooks for the project.
@@ -491,7 +491,12 @@ The RedfishEndpointPlugin collects Redfish URIs (GET responses) and optionally r
}
```
+**`collection_args`**
- **`uris`**: List of Redfish paths (e.g. `/redfish/v1/`, `/redfish/v1/Systems/1`) to GET and store.
+- **`follow_next_link`**: Optional (default `false`). When `true`, the collector follows `Members@odata.nextLink` pagination for each URI and merges all pages into a single response.
+- **`max_pages`**: Optional (default `200`). Safety cap on the number of pages to follow per URI when `follow_next_link` is enabled.
+
+**`analysis_args`**
- **`checks`**: Optional. Map of URI to expected values or constraints for analysis. Supports exact match (e.g. `"PowerState": "On"`), `anyOf`, `min`/`max`, etc.
#### **'summary' sub command**
diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md
index 5e84641c..3b6c3a3f 100644
--- a/docs/PLUGIN_DOC.md
+++ b/docs/PLUGIN_DOC.md
@@ -7,7 +7,7 @@
| AmdSmiPlugin | bad-pages
firmware --json
list --json
metric -g all
partition --json
process --json
ras --cper --folder={folder}
ras --afid --cper-file {cper_file}
static -g all --json
static -g {gpu_id} --json
topology
version --json
xgmi -l
xgmi -m | **Analyzer Args:**
- `check_static_data`: bool — If True, run static data checks (e.g. driver version, partition mode).
- `expected_gpu_processes`: Optional[int] — Expected number of GPU processes.
- `expected_max_power`: Optional[int] — Expected maximum power value (e.g. watts).
- `expected_driver_version`: Optional[str] — Expected AMD driver version string.
- `expected_memory_partition_mode`: Optional[str] — Expected memory partition mode (e.g. sp3, dp).
- `expected_compute_partition_mode`: Optional[str] — Expected compute partition mode.
- `expected_firmware_versions`: Optional[dict[str, str]] — Expected firmware versions keyed by amd-smi fw_id (e.g. PLDM_BUNDLE).
- `l0_to_recovery_count_error_threshold`: Optional[int] — L0-to-recovery count above which an error is raised.
- `l0_to_recovery_count_warning_threshold`: Optional[int] — L0-to-recovery count above which a warning is raised.
- `vendorid_ep`: Optional[str] — Expected endpoint vendor ID (e.g. for PCIe).
- `vendorid_ep_vf`: Optional[str] — Expected endpoint VF vendor ID.
- `devid_ep`: Optional[str] — Expected endpoint device ID.
- `devid_ep_vf`: Optional[str] — Expected endpoint VF device ID.
- `sku_name`: Optional[str] — Expected SKU name string for GPU.
- `expected_xgmi_speed`: Optional[list[float]] — Expected xGMI speed value(s) (e.g. link rate).
- `analysis_range_start`: Optional[datetime.datetime] — Start of time range for time-windowed analysis.
- `analysis_range_end`: Optional[datetime.datetime] — End of time range for time-windowed analysis. | **Collection Args:**
- `analysis_firmware_ids`: Optional[list[str]] — amd-smi fw_id values to record in analysis_ref.firmware_versions
- `cper_file_path`: Optional[str] — Path to CPER folder or file for RAS AFID collection (ras --afid --cper-file). | [AmdSmiDataModel](#AmdSmiDataModel-Model) | [AmdSmiCollector](#Collector-Class-AmdSmiCollector) | [AmdSmiAnalyzer](#Data-Analyzer-Class-AmdSmiAnalyzer) |
| BiosPlugin | sh -c 'cat /sys/devices/virtual/dmi/id/bios_version'
wmic bios get SMBIOSBIOSVersion /Value | **Analyzer Args:**
- `exp_bios_version`: list[str] — Expected BIOS version(s) to match against collected value (str or list).
- `regex_match`: bool — If True, match exp_bios_version as regex; otherwise exact match. | - | [BiosDataModel](#BiosDataModel-Model) | [BiosCollector](#Collector-Class-BiosCollector) | [BiosAnalyzer](#Data-Analyzer-Class-BiosAnalyzer) |
| CmdlinePlugin | cat /proc/cmdline | **Analyzer Args:**
- `required_cmdline`: Union[str, List] — Command-line parameters that must be present (e.g. 'pci=bfsort').
- `banned_cmdline`: Union[str, List] — Command-line parameters that must not be present.
- `os_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig] — Per-OS overrides for required_cmdline and banned_cmdline (keyed by OS identifier).
- `platform_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig] — Per-platform overrides for required_cmdline and banned_cmdline (keyed by platform). | - | [CmdlineDataModel](#CmdlineDataModel-Model) | [CmdlineCollector](#Collector-Class-CmdlineCollector) | [CmdlineAnalyzer](#Data-Analyzer-Class-CmdlineAnalyzer) |
-| DeviceEnumerationPlugin | powershell -Command "(Get-WmiObject -Class Win32_Processor | Measure-Object).Count"
lspci -d {vendorid_ep}: | grep -i 'VGA\|Display\|3D' | wc -l
powershell -Command "(wmic path win32_VideoController get name | findstr AMD | Measure-Object).Count"
lscpu
lshw
lspci -d {vendorid_ep}: | grep -i 'Virtual Function' | wc -l
powershell -Command "(Get-VMHostPartitionableGpu | Measure-Object).Count" | **Analyzer Args:**
- `cpu_count`: Optional[list[int]] — Expected CPU count(s); pass as int or list of ints. Analysis passes if actual is in list.
- `gpu_count`: Optional[list[int]] — Expected GPU count(s); pass as int or list of ints. Analysis passes if actual is in list.
- `vf_count`: Optional[list[int]] — Expected virtual function count(s); pass as int or list of ints. Analysis passes if actual is in list. | - | [DeviceEnumerationDataModel](#DeviceEnumerationDataModel-Model) | [DeviceEnumerationCollector](#Collector-Class-DeviceEnumerationCollector) | [DeviceEnumerationAnalyzer](#Data-Analyzer-Class-DeviceEnumerationAnalyzer) |
+| DeviceEnumerationPlugin | powershell -Command "(Get-WmiObject -Class Win32_Processor | Measure-Object).Count"
lspci -d {vendorid_ep}: | grep -iE 'VGA|Display|3D|Processing accelerators|Co-processor|Accelerator' | grep -vi 'Virtual Function' | wc -l
powershell -Command "(wmic path win32_VideoController get name | findstr AMD | Measure-Object).Count"
lscpu
lshw
lspci -d {vendorid_ep}: | grep -i 'Virtual Function' | wc -l
powershell -Command "(Get-VMHostPartitionableGpu | Measure-Object).Count" | **Analyzer Args:**
- `cpu_count`: Optional[list[int]] — Expected CPU count(s); pass as int or list of ints. Analysis passes if actual is in list.
- `gpu_count`: Optional[list[int]] — Expected GPU count(s); pass as int or list of ints. Analysis passes if actual is in list.
- `vf_count`: Optional[list[int]] — Expected virtual function count(s); pass as int or list of ints. Analysis passes if actual is in list. | - | [DeviceEnumerationDataModel](#DeviceEnumerationDataModel-Model) | [DeviceEnumerationCollector](#Collector-Class-DeviceEnumerationCollector) | [DeviceEnumerationAnalyzer](#Data-Analyzer-Class-DeviceEnumerationAnalyzer) |
| DimmPlugin | sh -c 'dmidecode -t 17 | tr -s " " | grep -v "Volatile\|None\|Module" | grep Size' 2>/dev/null
dmidecode
wmic memorychip get Capacity | - | **Collection Args:**
- `skip_sudo`: bool — If True, do not use sudo when running dmidecode or wmic for memory info. | [DimmDataModel](#DimmDataModel-Model) | [DimmCollector](#Collector-Class-DimmCollector) | - |
| DkmsPlugin | dkms status
dkms --version | **Analyzer Args:**
- `dkms_status`: Union[str, list] — Expected dkms status string(s) to match (e.g. 'amd/1.0.0'). At least one of dkms_status or dkms_version required.
- `dkms_version`: Union[str, list] — Expected dkms version string(s) to match. At least one of dkms_status or dkms_version required.
- `regex_match`: bool — If True, match dkms_status and dkms_version as regex; otherwise exact match. | - | [DkmsDataModel](#DkmsDataModel-Model) | [DkmsCollector](#Collector-Class-DkmsCollector) | [DkmsAnalyzer](#Data-Analyzer-Class-DkmsAnalyzer) |
| DmesgPlugin | dmesg --time-format iso -x
ls -1 /var/log/dmesg* 2>/dev/null | grep -E '^/var/log/dmesg(\.[0-9]+(\.gz)?)?$' || true | **Built-in Regexes:**
- Out of memory error: `(?:oom_kill_process.*)|(?:Out of memory.*)`
- I/O Page Fault: `IO_PAGE_FAULT`
- Kernel Panic: `\bkernel panic\b.*`
- SQ Interrupt: `sq_intr`
- SRAM ECC: `sram_ecc.*`
- Failed to load driver. IP hardware init error.: `\[amdgpu\]\] \*ERROR\* hw_init of IP block.*`
- Failed to load driver. IP software init error.: `\[amdgpu\]\] \*ERROR\* sw_init of IP block.*`
- Real Time throttling activated: `sched: RT throttling activated.*`
- RCU preempt detected stalls: `rcu_preempt detected stalls.*`
- RCU preempt self-detected stall: `rcu_preempt self-detected stall.*`
- QCM fence timeout: `qcm fence wait loop timeout.*`
- General protection fault: `(?:[\w-]+(?:\[[0-9.]+\])?\s+)?general protectio...`
- Segmentation fault: `(?:segfault.*in .*\[)|(?:[Ss]egmentation [Ff]au...`
- Failed to disallow cf state: `amdgpu: Failed to disallow cf state.*`
- Failed to terminate tmr: `\*ERROR\* Failed to terminate tmr.*`
- Suspend of IP block failed: `\*ERROR\* suspend of IP block <\w+> failed.*`
- amdgpu Page Fault: `(amdgpu \w{4}:\w{2}:\w{2}\.\w:\s+amdgpu:\s+\[\S...`
- Page Fault: `page fault for address.*`
- Fatal error during GPU init: `(?:amdgpu)(.*Fatal error during GPU init)|(Fata...`
- PCIe AER Error Status: `(pcieport [\w:.]+: AER: aer_status:[^\n]*(?:\n[...`
- PCIe AER Correctable Error Status: `(.*aer_cor_status: 0x[0-9a-fA-F]+, aer_cor_mask...`
- PCIe AER Uncorrectable Error Status: `(.*aer_uncor_status: 0x[0-9a-fA-F]+, aer_uncor_...`
- PCIe AER Uncorrectable Error Severity with TLP Header: `(.*aer_uncor_severity: 0x[0-9a-fA-F]+.*)(\n.*TL...`
- Failed to read journal file: `Failed to read journal file.*`
- Journal file corrupted or uncleanly shut down: `journal corrupted or uncleanly shut down.*`
- ACPI BIOS Error: `ACPI BIOS Error`
- ACPI Error: `ACPI Error`
- Filesystem corrupted!: `EXT4-fs error \(device .*\):`
- Error in buffered IO, check filesystem integrity: `(Buffer I\/O error on dev)(?:ice)? (\w+)`
- PCIe card no longer present: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(Slot\(...`
- PCIe Link Down: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(Slot\(...`
- Mismatched clock configuration between PCIe device and host: `pcieport (\w+:\w+:\w+\.\w+):\s+(\w+):\s+(curren...`
- RAS Correctable Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- RAS Uncorrectable Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- RAS Deferred Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- RAS Corrected PCIe Error: `((?:\[Hardware Error\]:\s+)?event severity: cor...`
- GPU Reset: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- GPU reset failed: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- ACA Error: `(Accelerator Check Architecture[^\n]*)(?:\n[^\n...`
- ACA Error: `(Accelerator Check Architecture[^\n]*)(?:\n[^\n...`
- MCE Error: `\[Hardware Error\]:.+MC\d+_STATUS.*(?:\n.*){0,5}`
- Mode 2 Reset Failed: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)? (...`
- RAS Corrected Error: `(?:\d{4}-\d+-\d+T\d+:\d+:\d+,\d+[+-]\d+:\d+)?(....`
- SGX Error: `x86/cpu: SGX disabled by BIOS`
- MMP Error: `Failed to load MMP firmware qat_4xxx_mmp.bin`
- GPU Throttled: `amdgpu \w{4}:\w{2}:\w{2}.\w: amdgpu: WARN: GPU ...`
- RAS Poison Consumed: `amdgpu[ 0-9a-fA-F:.]+:(?:\s*amdgpu:)?\s+(?:{\d+...`
- RAS Poison created: `amdgpu[ 0-9a-fA-F:.]+:(?:\s*amdgpu:)?\s+(?:{\d+...`
- Bad page threshold exceeded: `(amdgpu: Saved bad pages (\d+) reaches threshol...`
- RAS Hardware Error: `Hardware error from APEI Generic Hardware Error...`
- Error Address: `Error Address.*(?:\s.*)`
- RAS EDR Event: `EDR: EDR event received`
- DPC Event: `DPC: .*`
- LNet: ko2iblnd has no matching interfaces: `(?:\[[^\]]+\]\s*)?LNetError:.*ko2iblnd:\s*No ma...`
- LNet: Error starting up LNI: `(?:\[[^\]]+\]\s*)?LNetError:\s*.*Error\s*-?\d+\...`
- Lustre: network initialisation failed: `LustreError:.*ptlrpc_init_portals\(\).*network ...` | **Collection Args:**
- `collect_rotated_logs`: bool — If True, also collect rotated dmesg log files from /var/log/dmesg*.
- `skip_sudo`: bool — If True, do not use sudo when running dmesg or listing log files.
- `log_dmesg_data`: bool — If True, log the collected dmesg output in artifacts. | [DmesgData](#DmesgData-Model) | [DmesgCollector](#Collector-Class-DmesgCollector) | [DmesgAnalyzer](#Data-Analyzer-Class-DmesgAnalyzer) |
@@ -16,15 +16,15 @@
| KernelPlugin | sh -c 'uname -a'
sh -c 'cat /proc/sys/kernel/numa_balancing'
wmic os get Version /Value | **Analyzer Args:**
- `exp_kernel`: Union[str, list] — Expected kernel version string(s) to match (e.g. from uname -a).
- `exp_numa`: Optional[int] — Expected value for kernel.numa_balancing (e.g. 0 or 1).
- `regex_match`: bool — If True, match exp_kernel as regex; otherwise exact match. | - | [KernelDataModel](#KernelDataModel-Model) | [KernelCollector](#Collector-Class-KernelCollector) | [KernelAnalyzer](#Data-Analyzer-Class-KernelAnalyzer) |
| KernelModulePlugin | cat /proc/modules
modinfo amdgpu
wmic os get Version /Value | **Analyzer Args:**
- `kernel_modules`: dict[str, dict] — Expected kernel module name -> {version, etc.}. Analyzer checks collected modules match.
- `regex_filter`: list[str] — List of regex patterns to filter which collected modules are checked (default: amd). | - | [KernelModuleDataModel](#KernelModuleDataModel-Model) | [KernelModuleCollector](#Collector-Class-KernelModuleCollector) | [KernelModuleAnalyzer](#Data-Analyzer-Class-KernelModuleAnalyzer) |
| MemoryPlugin | free -b
lsmem
numactl -H
wmic OS get FreePhysicalMemory /Value; wmic ComputerSystem get TotalPhysicalMemory /Value | **Analyzer Args:**
- `ratio`: float — Required free-memory ratio (0-1). Analysis fails if free/total < ratio.
- `memory_threshold`: str — Minimum free memory required (e.g. '30Gi', '1T'). Used when ratio is not sufficient. | - | [MemoryDataModel](#MemoryDataModel-Model) | [MemoryCollector](#Collector-Class-MemoryCollector) | [MemoryAnalyzer](#Data-Analyzer-Class-MemoryAnalyzer) |
-| NetworkPlugin | ip addr show
curl
ethtool -S {interface}
ethtool {interface}
lldpcli show neighbor
lldpctl
ip neighbor show
ping
ip route show
ip rule show
wget | **Built-in Regexes:**
- tx_pfc_frames is non-zero: `^tx_pfc_frames$`
- tx_pfc_ena_frames_pri* is non-zero: `^tx_pfc_ena_frames_pri\d+$`
- pfc_pri*_tx_transitions is non-zero: `^pfc_pri\d+_tx_transitions$`
**Analyzer Args:**
- `error_regex`: Union[list[nodescraper.base.regexanalyzer.ErrorRegex], list[dict], NoneType] — Custom error regex patterns; each item can be ErrorRegex or dict with category/pattern. | **Collection Args:**
- `url`: Optional[str] — Optional URL to probe for network connectivity (used with netprobe).
- `netprobe`: Optional[Literal['ping', 'wget', 'curl']] — Tool to use for network connectivity probe: ping, wget, or curl. | [NetworkDataModel](#NetworkDataModel-Model) | [NetworkCollector](#Collector-Class-NetworkCollector) | [NetworkAnalyzer](#Data-Analyzer-Class-NetworkAnalyzer) |
-| NicPlugin | niccli --listdev
niccli --list
niccli --list_devices
niccli -dev {device_num} nvm -getoption pcie_relaxed_ordering
niccli --dev {device_num} nvm --getoption pcie_relaxed_ordering
niccli -dev {device_num} nvm -getoption performance_profile
niccli --dev {device_num} nvm --getoption performance_profile
niccli -dev {device_num} nvm -getoption support_rdma -scope 0
niccli -dev {device_num} getqos
niccli --dev {device_num} nvm --getoption support_rdma
niccli --dev {device_num} qos --ets --show
niccli --version
nicctl show card
nicctl --version
nicctl show card flash partition --json
nicctl show card interrupts --json
nicctl show card logs --non-persistent
nicctl show card logs --boot-fault
nicctl show card logs --persistent
nicctl show card profile --json
nicctl show card time --json
nicctl show card statistics packet-buffer summary --json
nicctl show lif statistics --json
nicctl show lif internal queue-to-ud-pinning
nicctl show pipeline internal anomalies
nicctl show pipeline internal rsq-ring
nicctl show pipeline internal statistics memory
nicctl show port fsm
nicctl show port transceiver --json
nicctl show port statistics --json
nicctl show port internal mac
nicctl show qos headroom --json
nicctl show rdma queue --json
nicctl show rdma queue-pair --detail --json
nicctl show version firmware
nicctl show dcqcn
nicctl show environment
nicctl show lif
nicctl show pcie ats
nicctl show port
nicctl show qos
nicctl show rdma statistics
nicctl show version host-software
nicctl show dcqcn --card {card_id} --json
nicctl show card hardware-config --card {card_id} | **Analyzer Args:**
- `expected_values`: Optional[Dict[str, Dict[str, Any]]] — Per-command expected checks keyed by canonical key (see command_to_canonical_key).
- `performance_profile_expected`: str — Expected Broadcom performance_profile value (case-insensitive). Default RoCE.
- `support_rdma_disabled_values`: List[str] — Values that indicate RDMA is not supported (case-insensitive).
- `pcie_relaxed_ordering_expected`: str — Expected Broadcom pcie_relaxed_ordering value (e.g. 'Relaxed ordering = enabled'); checked case-insensitively. Defaul...
- `expected_qos_prio_map`: Optional[Dict[Any, Any]] — Expected priority-to-TC map (e.g. {0: 0, 1: 1}; keys may be int or str in config). Checked per device when set.
- `expected_qos_pfc_enabled`: Optional[int] — Expected PFC enabled value (0/1 or bitmask). Checked per device when set.
- `expected_qos_tsa_map`: Optional[Dict[Any, Any]] — Expected TSA map for ETS (e.g. {0: 'ets', 1: 'strict'}; keys may be int or str in config). Checked per device when set.
- `expected_qos_tc_bandwidth`: Optional[List[int]] — Expected TC bandwidth percentages. Checked per device when set.
- `require_qos_consistent_across_adapters`: bool — When True and no expected_qos_* are set, require all adapters to have the same prio_map, pfc_enabled, and tsa_map.
- `nicctl_log_error_regex`: Optional[List[Dict[str, Any]]] — Optional list of error patterns for nicctl show card logs. | **Collection Args:**
- `commands`: Optional[List[str]] — Optional list of niccli/nicctl commands to run. When None, default command set is used.
- `use_sudo_niccli`: bool — If True, run niccli commands with sudo when required.
- `use_sudo_nicctl`: bool — If True, run nicctl commands with sudo when required. | [NicDataModel](#NicDataModel-Model) | [NicCollector](#Collector-Class-NicCollector) | [NicAnalyzer](#Data-Analyzer-Class-NicAnalyzer) |
+| NetworkPlugin | ip addr show
curl
ethtool -S {interface}
ethtool {interface}
lldpcli show neighbor
lldpctl
ip neighbor show
ping
rdma link -j
ip route show
ip rule show
wget | **Built-in Regexes:**
- tx_pfc_frames is non-zero: `^tx_pfc_frames$`
- tx_pfc_ena_frames_pri* is non-zero: `^tx_pfc_ena_frames_pri\d+$`
- pfc_pri*_tx_transitions is non-zero: `^pfc_pri\d+_tx_transitions$`
**Analyzer Args:**
- `error_regex`: Union[list[nodescraper.base.regexanalyzer.ErrorRegex], list[dict], NoneType] — Custom error regex patterns; each item can be ErrorRegex or dict with category/pattern. | **Collection Args:**
- `url`: Optional[str] — Optional URL to probe for network connectivity (used with netprobe).
- `netprobe`: Optional[Literal['ping', 'wget', 'curl']] — Tool to use for network connectivity probe: ping, wget, or curl. | [NetworkDataModel](#NetworkDataModel-Model) | [NetworkCollector](#Collector-Class-NetworkCollector) | [NetworkAnalyzer](#Data-Analyzer-Class-NetworkAnalyzer) |
+| NicPlugin | niccli --listdev
niccli --list
niccli --list_devices
niccli -dev {device_num} nvm -getoption pcie_relaxed_ordering
niccli --dev {device_num} nvm --getoption pcie_relaxed_ordering
niccli -dev {device_num} nvm -getoption performance_profile
niccli --dev {device_num} nvm --getoption performance_profile
niccli -dev {device_num} nvm -getoption support_rdma -scope 0
niccli -dev {device_num} getqos
niccli --dev {device_num} nvm --getoption support_rdma --scope 0
niccli --dev {device_num} qos --ets --show
niccli --version
nicctl show card
nicctl --version
nicctl show card flash partition --json
nicctl show card interrupts --json
nicctl show card logs --non-persistent
nicctl show card logs --boot-fault
nicctl show card logs --persistent
nicctl show card profile --json
nicctl show card time --json
nicctl show card statistics packet-buffer summary --json
nicctl show lif statistics --json
nicctl show lif internal queue-to-ud-pinning
nicctl show pipeline internal anomalies
nicctl show pipeline internal rsq-ring
nicctl show pipeline internal statistics memory
nicctl show port fsm
nicctl show port transceiver --json
nicctl show port statistics --json
nicctl show port internal mac
nicctl show qos headroom --json
nicctl show rdma queue --json
nicctl show rdma queue-pair --detail --json
nicctl show version firmware
nicctl show dcqcn
nicctl show environment
nicctl show lif
nicctl show pcie ats
nicctl show port
nicctl show qos
nicctl show rdma statistics
nicctl show version host-software
nicctl show dcqcn --card {card_id} --json
nicctl show card hardware-config --card {card_id} | **Analyzer Args:**
- `expected_values`: Optional[Dict[str, Dict[str, Any]]] — Per-command expected checks keyed by canonical key (see command_to_canonical_key).
- `performance_profile_expected`: str — Expected Broadcom performance_profile value (case-insensitive). Default RoCE.
- `support_rdma_disabled_values`: List[str] — Values that indicate RDMA is not supported (case-insensitive).
- `pcie_relaxed_ordering_expected`: str — Expected Broadcom pcie_relaxed_ordering value (e.g. 'Relaxed ordering = enabled'); checked case-insensitively. Defaul...
- `expected_qos_prio_map`: Optional[Dict[Any, Any]] — Expected priority-to-TC map (e.g. {0: 0, 1: 1}; keys may be int or str in config). Checked per device when set.
- `expected_qos_pfc_enabled`: Optional[int] — Expected PFC enabled value (0/1 or bitmask). Checked per device when set.
- `expected_qos_tsa_map`: Optional[Dict[Any, Any]] — Expected TSA map for ETS (e.g. {0: 'ets', 1: 'strict'}; keys may be int or str in config). Checked per device when set.
- `expected_qos_tc_bandwidth`: Optional[List[int]] — Expected TC bandwidth percentages. Checked per device when set.
- `require_qos_consistent_across_adapters`: bool — When True and no expected_qos_* are set, require all adapters to have the same prio_map, pfc_enabled, and tsa_map.
- `nicctl_log_error_regex`: Optional[List[Dict[str, Any]]] — Optional list of error patterns for nicctl show card logs. | **Collection Args:**
- `commands`: Optional[List[str]] — Optional list of niccli/nicctl commands to run. When None, default command set is used.
- `use_sudo_niccli`: bool — If True, run niccli commands with sudo when required.
- `use_sudo_nicctl`: bool — If True, run nicctl commands with sudo when required. | [NicDataModel](#NicDataModel-Model) | [NicCollector](#Collector-Class-NicCollector) | [NicAnalyzer](#Data-Analyzer-Class-NicAnalyzer) |
| NvmePlugin | nvme smart-log {dev}
nvme error-log {dev} --log-entries=256
nvme id-ctrl {dev}
nvme id-ns {dev}{ns}
nvme fw-log {dev}
nvme self-test-log {dev}
nvme get-log {dev} --log-id=6 --log-len=512
nvme telemetry-log {dev} --output-file={dev}_{f_name}
nvme list -o json | - | - | [NvmeDataModel](#NvmeDataModel-Model) | [NvmeCollector](#Collector-Class-NvmeCollector) | - |
| OsPlugin | sh -c '( lsb_release -ds || (cat /etc/*release | grep PRETTY_NAME) || uname -om ) 2>/dev/null | head -n1'
cat /etc/*release | grep VERSION_ID
wmic os get Version /value
wmic os get Caption /Value | **Analyzer Args:**
- `exp_os`: Union[str, list] — Expected OS name/version string(s) to match (e.g. from lsb_release or /etc/os-release).
- `exact_match`: bool — If True, require exact match for exp_os; otherwise substring match. | - | [OsDataModel](#OsDataModel-Model) | [OsCollector](#Collector-Class-OsCollector) | [OsAnalyzer](#Data-Analyzer-Class-OsAnalyzer) |
| PackagePlugin | dnf list --installed
dpkg-query -W
pacman -Q
cat /etc/*release
wmic product get name,version | **Analyzer Args:**
- `exp_package_ver`: Dict[str, Optional[str]] — Map package name -> expected version (None = any version). Checked against installed packages.
- `regex_match`: bool — If True, match package versions with regex; otherwise exact or prefix match.
- `rocm_regex`: Optional[str] — Optional regex to identify ROCm package version (used when enable_rocm_regex is True).
- `enable_rocm_regex`: bool — If True, use rocm_regex (or default pattern) to extract ROCm version for checks. | - | [PackageDataModel](#PackageDataModel-Model) | [PackageCollector](#Collector-Class-PackageCollector) | [PackageAnalyzer](#Data-Analyzer-Class-PackageAnalyzer) |
| PciePlugin | lspci -d {vendor_id}: -nn
lspci -x
lspci -xxxx
lspci -PP
lspci -PP -d {vendor_id}:{dev_id}
lspci -vvv
lspci -vvvt | **Analyzer Args:**
- `exp_speed`: int — Expected PCIe link speed (generation 1–5).
- `exp_width`: int — Expected PCIe link width in lanes (1–16).
- `exp_sriov_count`: int — Expected SR-IOV virtual function count.
- `exp_gpu_count_override`: Optional[int] — Override expected GPU count for validation.
- `exp_max_payload_size`: Union[Dict[int, int], int, NoneType] — Expected max payload size: int for all devices, or dict keyed by device ID.
- `exp_max_rd_req_size`: Union[Dict[int, int], int, NoneType] — Expected max read request size: int for all devices, or dict keyed by device ID.
- `exp_ten_bit_tag_req_en`: Union[Dict[int, int], int, NoneType] — Expected 10-bit tag request enable: int for all devices, or dict keyed by device ID. | - | [PcieDataModel](#PcieDataModel-Model) | [PcieCollector](#Collector-Class-PcieCollector) | [PcieAnalyzer](#Data-Analyzer-Class-PcieAnalyzer) |
| ProcessPlugin | top -b -n 1
rocm-smi --showpids
top -b -n 1 -o %CPU | **Analyzer Args:**
- `max_kfd_processes`: int — Maximum allowed number of KFD (Kernel Fusion Driver) processes; 0 disables the check.
- `max_cpu_usage`: float — Maximum allowed CPU usage (percent) for process checks. | **Collection Args:**
- `top_n_process`: int — Number of top processes by CPU usage to collect (e.g. for top -b -n 1 -o %%CPU). | [ProcessDataModel](#ProcessDataModel-Model) | [ProcessCollector](#Collector-Class-ProcessCollector) | [ProcessAnalyzer](#Data-Analyzer-Class-ProcessAnalyzer) |
| RdmaPlugin | rdma link -j
rdma dev
rdma link
rdma statistic -j | - | - | [RdmaDataModel](#RdmaDataModel-Model) | [RdmaCollector](#Collector-Class-RdmaCollector) | [RdmaAnalyzer](#Data-Analyzer-Class-RdmaAnalyzer) |
-| RocmPlugin | {rocm_path}/opencl/bin/*/clinfo
env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'
ls /sys/class/kfd/kfd/proc/
grep -i -E 'rocm' /etc/ld.so.conf.d/*
{rocm_path}/bin/rocminfo
ls -v -d {rocm_path}*
ls -v -d {rocm_path}-[3-7]* | tail -1
ldconfig -p | grep -i -E 'rocm'
grep . -r {rocm_path}/.info/* | **Analyzer Args:**
- `exp_rocm`: Union[str, list] — Expected ROCm version string(s) to match (e.g. from rocminfo).
- `exp_rocm_latest`: str — Expected 'latest' ROCm path or version string for versioned installs.
- `exp_rocm_sub_versions`: dict[str, Union[str, list]] — Map sub-version name (e.g. version_rocm) to expected string or list of allowed strings. | **Collection Args:**
- `rocm_path`: str — Base path to ROCm installation (e.g. /opt/rocm). Used for rocminfo, clinfo, and version discovery. | [RocmDataModel](#RocmDataModel-Model) | [RocmCollector](#Collector-Class-RocmCollector) | [RocmAnalyzer](#Data-Analyzer-Class-RocmAnalyzer) |
+| RocmPlugin | {rocm_path}/opencl/bin/*/clinfo
env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'
ls /sys/class/kfd/kfd/proc/
grep -i -E 'rocm' /etc/ld.so.conf.d/*
{rocm_path}/bin/rocminfo
ls -v -d {rocm_path}*
ls -v -d {rocm_path}-[3-7]* | tail -1
ldconfig -p | grep -i -E 'rocm'
grep . -H -r -i {rocm_path}/.info/* | **Analyzer Args:**
- `exp_rocm`: Union[str, list] — Expected ROCm version string(s) to match (e.g. from rocminfo).
- `exp_rocm_latest`: str — Expected 'latest' ROCm path or version string for versioned installs.
- `exp_rocm_sub_versions`: dict[str, Union[str, list]] — Map sub-version name (e.g. version_rocm) to expected string or list of allowed strings. | **Collection Args:**
- `rocm_path`: str — Base path to ROCm installation (e.g. /opt/rocm). Used for rocminfo, clinfo, and version discovery. | [RocmDataModel](#RocmDataModel-Model) | [RocmCollector](#Collector-Class-RocmCollector) | [RocmAnalyzer](#Data-Analyzer-Class-RocmAnalyzer) |
| StoragePlugin | sh -c 'df -lH -B1 | grep -v 'boot''
wmic LogicalDisk Where DriveType="3" Get DeviceId,Size,FreeSpace | - | **Collection Args:**
- `skip_sudo`: bool — If True, do not use sudo when running df and related storage commands. | [StorageDataModel](#StorageDataModel-Model) | [StorageCollector](#Collector-Class-StorageCollector) | [StorageAnalyzer](#Data-Analyzer-Class-StorageAnalyzer) |
| SysSettingsPlugin | cat /sys/{}
ls -1 /sys/{}
ls -l /sys/{} | **Analyzer Args:**
- `checks`: Optional[list[nodescraper.plugins.inband.sys_settings.analyzer_args.SysfsCheck]] — List of sysfs checks (path, expected values or pattern, display name). | **Collection Args:**
- `paths`: list[str] — Sysfs paths to read (cat). Paths with '*' are collected with ls -l (e.g. class/net/*/device).
- `directory_paths`: list[str] — Sysfs paths to list (ls -1); used for checks that match entry names by regex. | [SysSettingsDataModel](#SysSettingsDataModel-Model) | [SysSettingsCollector](#Collector-Class-SysSettingsCollector) | [SysSettingsAnalyzer](#Data-Analyzer-Class-SysSettingsAnalyzer) |
| SysctlPlugin | sysctl -n | **Analyzer Args:**
- `exp_vm_swappiness`: Optional[int] — Expected vm.swappiness value.
- `exp_vm_numa_balancing`: Optional[int] — Expected vm.numa_balancing value.
- `exp_vm_oom_kill_allocating_task`: Optional[int] — Expected vm.oom_kill_allocating_task value.
- `exp_vm_compaction_proactiveness`: Optional[int] — Expected vm.compaction_proactiveness value.
- `exp_vm_compact_unevictable_allowed`: Optional[int] — Expected vm.compact_unevictable_allowed value.
- `exp_vm_extfrag_threshold`: Optional[int] — Expected vm.extfrag_threshold value.
- `exp_vm_zone_reclaim_mode`: Optional[int] — Expected vm.zone_reclaim_mode value.
- `exp_vm_dirty_background_ratio`: Optional[int] — Expected vm.dirty_background_ratio value.
- `exp_vm_dirty_ratio`: Optional[int] — Expected vm.dirty_ratio value.
- `exp_vm_dirty_writeback_centisecs`: Optional[int] — Expected vm.dirty_writeback_centisecs value.
- `exp_kernel_numa_balancing`: Optional[int] — Expected kernel.numa_balancing value. | - | [SysctlDataModel](#SysctlDataModel-Model) | [SysctlCollector](#Collector-Class-SysctlCollector) | [SysctlAnalyzer](#Data-Analyzer-Class-SysctlAnalyzer) |
@@ -142,7 +142,7 @@ Collect CPU and GPU count
### Class Variables
-- **CMD_GPU_COUNT_LINUX**: `lspci -d {vendorid_ep}: | grep -i 'VGA\|Display\|3D' | wc -l`
+- **CMD_GPU_COUNT_LINUX**: `lspci -d {vendorid_ep}: | grep -iE 'VGA|Display|3D|Processing accelerators|Co-processor|Accelerator' | grep -vi 'Virtual Function' | wc -l`
- **CMD_VF_COUNT_LINUX**: `lspci -d {vendorid_ep}: | grep -i 'Virtual Function' | wc -l`
- **CMD_LSCPU_LINUX**: `lscpu`
- **CMD_LSHW_LINUX**: `lshw`
@@ -157,7 +157,7 @@ DeviceEnumerationDataModel
### Commands
- powershell -Command "(Get-WmiObject -Class Win32_Processor | Measure-Object).Count"
-- lspci -d {vendorid_ep}: | grep -i 'VGA\|Display\|3D' | wc -l
+- lspci -d {vendorid_ep}: | grep -iE 'VGA|Display|3D|Processing accelerators|Co-processor|Accelerator' | grep -vi 'Virtual Function' | wc -l
- powershell -Command "(wmic path win32_VideoController get name | findstr AMD | Measure-Object).Count"
- lscpu
- lshw
@@ -405,6 +405,7 @@ Collect network configuration details using ip command
- **CMD_NEIGHBOR**: `ip neighbor show`
- **CMD_ETHTOOL_TEMPLATE**: `ethtool {interface}`
- **CMD_ETHTOOL_S_TEMPLATE**: `ethtool -S {interface}`
+- **CMD_RDMA_LINK_JSON**: `rdma link -j`
- **CMD_PING**: `ping`
- **CMD_WGET**: `wget`
- **CMD_CURL**: `curl`
@@ -425,6 +426,7 @@ NetworkDataModel
- lldpctl
- ip neighbor show
- ping
+- rdma link -j
- ip route show
- ip rule show
- wget
@@ -459,12 +461,12 @@ Collect raw output from niccli (Broadcom) and nicctl (Pensando) commands.
niccli -dev {device_num} nvm -getoption pcie_relaxed_ordering,
niccli -dev {device_num} getqos
]`
-- **CMD_NICCLI_SUPPORT_RDMA_TEMPLATE_NEW**: `niccli --dev {device_num} nvm --getoption support_rdma`
+- **CMD_NICCLI_SUPPORT_RDMA_TEMPLATE_NEW**: `niccli --dev {device_num} nvm --getoption support_rdma --scope 0`
- **CMD_NICCLI_PERFORMANCE_PROFILE_TEMPLATE_NEW**: `niccli --dev {device_num} nvm --getoption performance_profile`
- **CMD_NICCLI_PCIE_RELAXED_ORDERING_TEMPLATE_NEW**: `niccli --dev {device_num} nvm --getoption pcie_relaxed_ordering`
- **CMD_NICCLI_QOS_TEMPLATE_NEW**: `niccli --dev {device_num} qos --ets --show`
- **CMD_NICCLI_PER_DEVICE_NEW**: `[
- niccli --dev {device_num} nvm --getoption support_rdma,
+ niccli --dev {device_num} nvm --getoption support_rdma --scope 0,
niccli --dev {device_num} nvm --getoption performance_profile,
niccli --dev {device_num} nvm --getoption pcie_relaxed_ordering,
niccli --dev {device_num} qos --ets --show
@@ -531,7 +533,7 @@ NicDataModel
- niccli --dev {device_num} nvm --getoption performance_profile
- niccli -dev {device_num} nvm -getoption support_rdma -scope 0
- niccli -dev {device_num} getqos
-- niccli --dev {device_num} nvm --getoption support_rdma
+- niccli --dev {device_num} nvm --getoption support_rdma --scope 0
- niccli --dev {device_num} qos --ets --show
- niccli --version
- nicctl show card
@@ -792,7 +794,7 @@ Collect ROCm version data
### Class Variables
- **SUPPORTED_OS_FAMILY**: `{}`
-- **CMD_ROCM_SUB_VERSIONS_TMPL**: `grep . -r {rocm_path}/.info/*`
+- **CMD_ROCM_SUB_VERSIONS_TMPL**: `grep . -H -r -i {rocm_path}/.info/*`
- **CMD_ROCMINFO_TMPL**: `{rocm_path}/bin/rocminfo`
- **CMD_ROCM_LATEST_TMPL**: `ls -v -d {rocm_path}-[3-7]* | tail -1`
- **CMD_ROCM_DIRS_TMPL**: `ls -v -d {rocm_path}*`
@@ -816,7 +818,7 @@ RocmDataModel
- ls -v -d {rocm_path}*
- ls -v -d {rocm_path}-[3-7]* | tail -1
- ldconfig -p | grep -i -E 'rocm'
-- grep . -r {rocm_path}/.info/*
+- grep . -H -r -i {rocm_path}/.info/*
## Collector Class StorageCollector
@@ -1134,6 +1136,8 @@ Complete network configuration data
- **rules**: `List[nodescraper.plugins.inband.network.networkdata.RoutingRule]`
- **neighbors**: `List[nodescraper.plugins.inband.network.networkdata.Neighbor]`
- **ethtool_info**: `Dict[str, nodescraper.plugins.inband.network.networkdata.EthtoolInfo]`
+- **rdma_ethtool_netdevs**: `List[str]`
+- **rdma_ethtool_statistics**: `List[nodescraper.plugins.inband.network.ethtool_vendor.EthtoolStatistics]`
- **accessible**: `Optional[bool]`
## NicDataModel Model
@@ -1281,6 +1285,7 @@ Data model for RDMA (Remote Direct Memory Access) statistics and link informatio
### Model annotations and fields
+- **ROCM_VERSION_FILENAME**: `ClassVar[str]`
- **rocm_version**: `str`
- **rocm_sub_versions**: `dict[str, str]`
- **rocminfo**: `List[str]`
diff --git a/nodescraper/base/inbandcollectortask.py b/nodescraper/base/inbandcollectortask.py
index 16039bda..d12b58dc 100644
--- a/nodescraper/base/inbandcollectortask.py
+++ b/nodescraper/base/inbandcollectortask.py
@@ -28,6 +28,7 @@
from nodescraper.connection.inband import InBandConnection
from nodescraper.connection.inband.inband import BaseFileArtifact, CommandArtifact
+from nodescraper.constants import DEFAULT_EVENT_REPORTER
from nodescraper.enums import EventPriority, OSFamily, SystemInteractionLevel
from nodescraper.generictypes import TCollectArg, TDataModel
from nodescraper.interfaces import DataCollector, TaskResultHook
@@ -52,6 +53,8 @@ def __init__(
max_event_priority_level: Union[EventPriority, str] = EventPriority.CRITICAL,
parent: Optional[str] = None,
task_result_hooks: Optional[list[TaskResultHook]] = None,
+ event_reporter: str = DEFAULT_EVENT_REPORTER,
+ session_id: Optional[str] = None,
**kwargs,
):
super().__init__(
@@ -62,6 +65,8 @@ def __init__(
connection=connection,
parent=parent,
task_result_hooks=task_result_hooks,
+ event_reporter=event_reporter,
+ session_id=session_id,
)
if self.system_info.os_family not in self.SUPPORTED_OS_FAMILY:
raise SystemCompatibilityError(
diff --git a/nodescraper/base/redfishcollectortask.py b/nodescraper/base/redfishcollectortask.py
index b8401213..ed67c660 100644
--- a/nodescraper/base/redfishcollectortask.py
+++ b/nodescraper/base/redfishcollectortask.py
@@ -27,6 +27,7 @@
from typing import Generic, Optional, Union
from nodescraper.connection.redfish import RedfishConnection, RedfishGetResult
+from nodescraper.constants import DEFAULT_EVENT_REPORTER
from nodescraper.enums import EventPriority
from nodescraper.generictypes import TCollectArg, TDataModel
from nodescraper.interfaces import DataCollector, TaskResultHook
@@ -47,6 +48,8 @@ def __init__(
max_event_priority_level: Union[EventPriority, str] = EventPriority.CRITICAL,
parent: Optional[str] = None,
task_result_hooks: Optional[list[TaskResultHook]] = None,
+ event_reporter: str = DEFAULT_EVENT_REPORTER,
+ session_id: Optional[str] = None,
**kwargs,
):
super().__init__(
@@ -56,6 +59,8 @@ def __init__(
max_event_priority_level=max_event_priority_level,
parent=parent,
task_result_hooks=task_result_hooks,
+ event_reporter=event_reporter,
+ session_id=session_id,
**kwargs,
)
@@ -77,3 +82,25 @@ def _run_redfish_get(
if log_artifact:
self.result.artifacts.append(res)
return res
+
+    def _run_redfish_get_paged(
+        self,
+        path: str,
+        max_pages: int = 200,
+        log_artifact: bool = True,
+    ) -> RedfishGetResult:
+        """
+        Issue a paged Redfish GET through the connection and return its single merged result.
+
+        Args:
+            path (str): Redfish URI path to request.
+            max_pages (int, optional): page limit forwarded to the connection's run_get_paged. Defaults to 200.
+            log_artifact (bool, optional): append the result to this task's result artifacts. Defaults to True.
+
+        Returns:
+            RedfishGetResult: the object returned by the connection's run_get_paged call.
+        """
+        merged = self.connection.run_get_paged(path, max_pages=max_pages)
+        if log_artifact:
+            self.result.artifacts.append(merged)
+        return merged
diff --git a/nodescraper/cli/cli.py b/nodescraper/cli/cli.py
index 054c2c5b..9890ef39 100644
--- a/nodescraper/cli/cli.py
+++ b/nodescraper/cli/cli.py
@@ -31,6 +31,7 @@
import os
import platform
import sys
+import uuid
from typing import Optional
import nodescraper
@@ -200,7 +201,7 @@ def _add_cli_root_globals(
def build_global_argument_parser(*, add_help: bool = True) -> argparse.ArgumentParser:
- """Globals only (no subcommands), for host CLIs such as amd-error-scraper ``error-scraper``."""
+ """Globals only (no subcommands), for host CLIs."""
plugin_reg = PluginRegistry()
config_reg = _config_registry_with_all_plugins(plugin_reg)
parser = argparse.ArgumentParser(
@@ -642,6 +643,7 @@ def main(
timestamp=timestamp,
sname=sname,
host_cli_args=host_cli_args,
+ session_id=str(uuid.uuid4()),
)
log_system_info(log_path, system_info, logger)
diff --git a/nodescraper/cli/host_cli_embed.py b/nodescraper/cli/host_cli_embed.py
index bffeb378..864639b6 100644
--- a/nodescraper/cli/host_cli_embed.py
+++ b/nodescraper/cli/host_cli_embed.py
@@ -39,12 +39,7 @@ def apply_host_cli_args_to_parsed_args(
parsed_args: argparse.Namespace,
host_ns: Optional[argparse.Namespace],
) -> None:
- """Copy host profile fields from an embedding host onto parsed top-level args.
-
- Used when ``main(..., host_cli_args=...)`` is invoked (e.g. from the
- error-scraper wrapper) so ``--connection-config`` profile data loaded by the
- host is visible to :func:`get_system_info` and the rest of the CLI.
- """
+ """Copy host profile fields from an embedding host onto parsed top-level args."""
if host_ns is None:
return
for attr in (
diff --git a/nodescraper/cli/invocation.py b/nodescraper/cli/invocation.py
index 12cd3a94..ee59e4a6 100644
--- a/nodescraper/cli/invocation.py
+++ b/nodescraper/cli/invocation.py
@@ -71,6 +71,7 @@ class PluginRunInvocation:
timestamp: str
sname: str
host_cli_args: Optional[argparse.Namespace] = None
+ session_id: Optional[str] = None
def run_plugin_queue_with_invocation(
@@ -84,6 +85,7 @@ def run_plugin_queue_with_invocation(
timestamp: str,
sname: str,
host_cli_args: Optional[argparse.Namespace] = None,
+ session_id: Optional[str] = None,
) -> list[PluginResult]:
"""Constructs the plugin executor, binds invocation context, and runs the plugin queue."""
inv = PluginRunInvocation(
@@ -96,6 +98,7 @@ def run_plugin_queue_with_invocation(
timestamp=timestamp,
sname=sname,
host_cli_args=host_cli_args,
+ session_id=session_id,
)
plugin_executor = PluginExecutor(
logger=logger,
@@ -104,6 +107,7 @@ def run_plugin_queue_with_invocation(
system_info=system_info,
log_path=log_path,
plugin_registry=plugin_reg,
+ session_id=session_id,
)
with plugin_run_invocation_scope(inv):
return plugin_executor.run_queue()
diff --git a/nodescraper/connection/redfish/__init__.py b/nodescraper/connection/redfish/__init__.py
index ee812113..f98faaac 100644
--- a/nodescraper/connection/redfish/__init__.py
+++ b/nodescraper/connection/redfish/__init__.py
@@ -28,6 +28,12 @@
RedfishConnectionError,
RedfishGetResult,
)
+from .redfish_constants import (
+ RF_MEMBERS,
+ RF_MEMBERS_COUNT,
+ RF_MEMBERS_NEXT_LINK,
+ RF_ODATA_ID,
+)
from .redfish_manager import RedfishConnectionManager
from .redfish_oem_diag import (
collect_oem_diagnostic_data,
@@ -45,4 +51,8 @@
"RedfishPath",
"collect_oem_diagnostic_data",
"get_oem_diagnostic_allowable_values",
+ "RF_MEMBERS",
+ "RF_MEMBERS_COUNT",
+ "RF_MEMBERS_NEXT_LINK",
+ "RF_ODATA_ID",
]
diff --git a/nodescraper/connection/redfish/redfish_connection.py b/nodescraper/connection/redfish/redfish_connection.py
index 449b4edb..4398b06f 100644
--- a/nodescraper/connection/redfish/redfish_connection.py
+++ b/nodescraper/connection/redfish/redfish_connection.py
@@ -34,6 +34,7 @@
from requests import Response
from requests.auth import HTTPBasicAuth
+from .redfish_constants import RF_MEMBERS, RF_MEMBERS_COUNT, RF_MEMBERS_NEXT_LINK
from .redfish_path import RedfishPath
DEFAULT_REDFISH_API_ROOT = "redfish/v1"
@@ -183,6 +184,53 @@ def run_get(self, path: Union[str, RedfishPath]) -> RedfishGetResult:
status_code=None,
)
+    def run_get_paged(
+        self,
+        path: Union[str, RedfishPath],
+        max_pages: int = 200,
+    ) -> RedfishGetResult:
+        """Run a Redfish GET and transparently follow Members@odata.nextLink pagination.
+
+        Members of each following page are appended to the first page's Members; the
+        nextLink key is removed and Members@odata.count is set to the merged length.
+        Without a nextLink in the first response this behaves identically to run_get.
+        If a later page fails, or max_pages (safety cap, default 200) is reached, the
+        members merged so far are returned with success=True (possibly partial).
+        """
+        first = self.run_get(path)
+        if not first.success or first.data is None:
+            return first
+
+        # Short-circuit when there is nothing to page through.
+        if RF_MEMBERS_NEXT_LINK not in first.data:
+            return first
+
+        merged_members: list = list(first.data.get(RF_MEMBERS) or [])
+        merged_data: dict = dict(first.data)
+        pages_fetched = 1
+        next_link: Optional[str] = first.data.get(RF_MEMBERS_NEXT_LINK)
+        last_status_code = first.status_code
+
+        while next_link and pages_fetched < max_pages:
+            page_result = self.run_get(next_link)
+            if not page_result.success or page_result.data is None:
+                break  # partial result: keep the status code of the last merged page
+            last_status_code = page_result.status_code
+            merged_members.extend(page_result.data.get(RF_MEMBERS) or [])
+            next_link = page_result.data.get(RF_MEMBERS_NEXT_LINK)
+            pages_fetched += 1
+
+        merged_data[RF_MEMBERS] = merged_members
+        merged_data[RF_MEMBERS_COUNT] = len(merged_members)
+        merged_data.pop(RF_MEMBERS_NEXT_LINK, None)
+
+        return RedfishGetResult(
+            path=first.path,
+            success=True,
+            data=merged_data,
+            status_code=last_status_code,
+        )
+
def copy(self) -> "RedfishConnection":
"""Return a new connection with the same config and its own session (for concurrent use)."""
return RedfishConnection(
diff --git a/nodescraper/connection/redfish/redfish_constants.py b/nodescraper/connection/redfish/redfish_constants.py
new file mode 100644
index 00000000..d27f0223
--- /dev/null
+++ b/nodescraper/connection/redfish/redfish_constants.py
@@ -0,0 +1,38 @@
+###############################################################################
+#
+# MIT License
+#
+# Copyright (c) 2026 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+###############################################################################
+"""Redfish field name constants shared across the Redfish package(s)."""
+
+# Resource collection property which identifies members of the collection.
+RF_MEMBERS = "Members"
+
+# Resource collection property which defines the total number of resources/members.
+RF_MEMBERS_COUNT = "Members@odata.count"
+
+# Resource collection property which points to the next set of partial members from the originating operation.
+RF_MEMBERS_NEXT_LINK = "Members@odata.nextLink"
+
+# Resource identifier property (optional for registry resources, required for all other resources and resource collections).
+RF_ODATA_ID = "@odata.id"
diff --git a/nodescraper/connection/redfish/redfish_oem_diag.py b/nodescraper/connection/redfish/redfish_oem_diag.py
index 94133337..affabf6e 100644
--- a/nodescraper/connection/redfish/redfish_oem_diag.py
+++ b/nodescraper/connection/redfish/redfish_oem_diag.py
@@ -38,6 +38,7 @@
from nodescraper.enums import TaskState
from .redfish_connection import RedfishConnection, RedfishConnectionError
+from .redfish_constants import RF_ODATA_ID
from .redfish_path import RedfishPath
_module_logger = logging.getLogger(__name__)
@@ -65,9 +66,6 @@ def _log_collect_diag_response(
)
-# Redfish JSON key for resource link
-RF_ODATA_ID = "@odata.id"
-
# @Redfish.AllowableValues: Redfish annotation for the list of allowable values for a string
REDFISH_ANNOTATION_ALLOWABLE_VALUES = "Redfish.AllowableValues"
diff --git a/nodescraper/constants.py b/nodescraper/constants.py
index 8769d5a2..981f0827 100644
--- a/nodescraper/constants.py
+++ b/nodescraper/constants.py
@@ -24,3 +24,5 @@
#
###############################################################################
DEFAULT_LOGGER = "nodescraper"
+
+DEFAULT_EVENT_REPORTER = "NODE_SCRAPER"
diff --git a/nodescraper/interfaces/connectionmanager.py b/nodescraper/interfaces/connectionmanager.py
index ccb5e793..6b468f06 100644
--- a/nodescraper/interfaces/connectionmanager.py
+++ b/nodescraper/interfaces/connectionmanager.py
@@ -33,6 +33,7 @@
from pydantic import BaseModel
+from nodescraper.constants import DEFAULT_EVENT_REPORTER
from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus
from nodescraper.models import SystemInfo, TaskResult
from nodescraper.typeutils import TypeUtils
@@ -63,6 +64,9 @@ def wrapper(
priority=EventPriority.CRITICAL,
console_log=True,
)
+ connection_manager.logger.exception(
+ "Exception connecting with %s", connection_manager.__class__.__name__
+ )
connection_manager.result.status = ExecutionStatus.EXECUTION_FAILURE
result = connection_manager.result
@@ -93,6 +97,8 @@ def __init__(
parent: Optional[str] = None,
task_result_hooks: Optional[list[TaskResultHook], None] = None,
connection_args: Optional[Union[TConnectArg, dict]] = None,
+ event_reporter: str = DEFAULT_EVENT_REPORTER,
+ session_id: Optional[str] = None,
**kwargs,
):
super().__init__(
@@ -101,6 +107,8 @@ def __init__(
max_event_priority_level=max_event_priority_level,
parent="connection" if not parent else parent,
task_result_hooks=task_result_hooks,
+ event_reporter=event_reporter,
+ session_id=session_id,
**kwargs,
)
diff --git a/nodescraper/interfaces/datacollectortask.py b/nodescraper/interfaces/datacollectortask.py
index 020bf053..3c30a6ea 100644
--- a/nodescraper/interfaces/datacollectortask.py
+++ b/nodescraper/interfaces/datacollectortask.py
@@ -32,6 +32,7 @@
from pydantic import BaseModel, ValidationError
+from nodescraper.constants import DEFAULT_EVENT_REPORTER
from nodescraper.enums import (
EventCategory,
EventPriority,
@@ -96,6 +97,11 @@ def wrapper(
priority=EventPriority.CRITICAL,
console_log=True,
)
+ collector.logger.error(
+ "Pydantic validation error in data collector %s: %s",
+ collector.__class__.__name__,
+ exception.errors(include_url=False),
+ )
else:
collector._log_event(
category=EventCategory.RUNTIME,
@@ -104,6 +110,9 @@ def wrapper(
priority=EventPriority.CRITICAL,
console_log=True,
)
+ collector.logger.exception(
+ "Exception in data collector %s", collector.__class__.__name__
+ )
collector.result.status = ExecutionStatus.EXECUTION_FAILURE
result = collector.result
data = None
@@ -144,6 +153,8 @@ def __init__(
max_event_priority_level: Union[EventPriority, str] = EventPriority.CRITICAL,
parent: Optional[str] = None,
task_result_hooks: Optional[list[TaskResultHook]] = None,
+ event_reporter: str = DEFAULT_EVENT_REPORTER,
+ session_id: Optional[str] = None,
**kwargs,
):
"""data collector init function
@@ -151,7 +162,7 @@ def __init__(
Args:
system_info (SystemInfo): system info object for target system for data collection
system_interaction (SystemInteraction): enum to indicate the type of actions that can be performed when interacting with the system
- event_reporter (str, optional): Described the reporter of the event. Defaults to DEFAULT_EVENT_REPORTER.
+ event_reporter (str, optional): Reporter string stored on emitted events. Defaults to DEFAULT_EVENT_REPORTER.
logger (Optional[logging.Logger], optional): python logger object. Defaults to None.
log_path (Optional[str], optional): file system log path. Defaults to None.
"""
@@ -161,6 +172,8 @@ def __init__(
max_event_priority_level=max_event_priority_level,
parent=parent,
task_result_hooks=task_result_hooks,
+ event_reporter=event_reporter,
+ session_id=session_id,
)
if isinstance(system_interaction_level, str):
diff --git a/nodescraper/interfaces/dataplugin.py b/nodescraper/interfaces/dataplugin.py
index ed632fb4..8448dff3 100644
--- a/nodescraper/interfaces/dataplugin.py
+++ b/nodescraper/interfaces/dataplugin.py
@@ -31,6 +31,7 @@
from pydantic import Field
+from nodescraper.constants import DEFAULT_EVENT_REPORTER
from nodescraper.enums import EventPriority, ExecutionStatus, SystemInteractionLevel
from nodescraper.generictypes import TAnalyzeArg, TCollectArg, TDataModel
from nodescraper.interfaces.dataanalyzertask import DataAnalyzer
@@ -74,6 +75,8 @@ def __init__(
connection_args: Optional[Union[TConnectArg, dict]] = None,
task_result_hooks: Optional[list[TaskResultHook]] = None,
log_path: Optional[str] = None,
+ event_reporter: str = DEFAULT_EVENT_REPORTER,
+ session_id: Optional[str] = None,
**kwargs,
):
super().__init__(
@@ -83,6 +86,8 @@ def __init__(
connection_args,
task_result_hooks,
log_path,
+ event_reporter=event_reporter,
+ session_id=session_id,
**kwargs,
)
self._validate_class_var()
@@ -186,6 +191,8 @@ def collect(
logger=self.logger,
parent=self.__class__.__name__,
task_result_hooks=self.task_result_hooks,
+ event_reporter=self.event_reporter,
+ session_id=self.session_id,
)
if (
@@ -219,6 +226,8 @@ def collect(
parent=self.__class__.__name__,
task_result_hooks=self.task_result_hooks,
log_path=self.log_path,
+ event_reporter=self.event_reporter,
+ session_id=self.session_id,
)
self.collection_result, self._data = collection_task.collect_data(collection_args)
@@ -230,6 +239,11 @@ def collect(
message=str(e),
)
except Exception as e:
+ self.logger.exception(
+ "Unhandled exception running collector %s for plugin %s",
+ self.COLLECTOR.__name__,
+ self.__class__.__name__,
+ )
self.collection_result = TaskResult(
task=self.COLLECTOR.__name__,
parent=self.__class__.__name__,
@@ -293,6 +307,8 @@ def analyze(
max_event_priority_level=max_event_priority_level,
parent=self.__class__.__name__,
task_result_hooks=self.task_result_hooks,
+ event_reporter=self.event_reporter,
+ session_id=self.session_id,
)
self.analysis_result = analyzer_task.analyze_data(self.data, analysis_args)
return self.analysis_result
diff --git a/nodescraper/interfaces/plugin.py b/nodescraper/interfaces/plugin.py
index 0194ef2d..06959b54 100644
--- a/nodescraper/interfaces/plugin.py
+++ b/nodescraper/interfaces/plugin.py
@@ -28,7 +28,7 @@
import logging
from typing import Callable, Generic, Optional, Type, Union
-from nodescraper.constants import DEFAULT_LOGGER
+from nodescraper.constants import DEFAULT_EVENT_REPORTER, DEFAULT_LOGGER
from nodescraper.models import PluginResult, SystemInfo
from nodescraper.taskresulthooks.filesystemloghook import FileSystemLogHook
@@ -50,6 +50,8 @@ def __init__(
task_result_hooks: Optional[list[TaskResultHook]] = None,
log_path: Optional[str] = None,
queue_callback: Optional[Callable] = None,
+ event_reporter: str = DEFAULT_EVENT_REPORTER,
+ session_id: Optional[str] = None,
**kwargs,
):
"""Initialize plugin
@@ -86,6 +88,9 @@ def __init__(
self.queue_callback = queue_callback
+ self.event_reporter = event_reporter
+ self.session_id = session_id
+
self.connection_manager = connection_manager
if connection_args and self.CONNECTION_TYPE and not self.connection_manager:
@@ -95,6 +100,8 @@ def __init__(
connection_args=connection_args,
parent=self.__class__.__name__,
task_result_hooks=self.task_result_hooks,
+ event_reporter=event_reporter,
+ session_id=self.session_id,
)
@classmethod
diff --git a/nodescraper/interfaces/task.py b/nodescraper/interfaces/task.py
index 16d1a70b..aa19d56f 100644
--- a/nodescraper/interfaces/task.py
+++ b/nodescraper/interfaces/task.py
@@ -27,9 +27,10 @@
import copy
import datetime
import logging
+import uuid
from typing import Any, Optional, Union
-from nodescraper.constants import DEFAULT_LOGGER
+from nodescraper.constants import DEFAULT_EVENT_REPORTER, DEFAULT_LOGGER
from nodescraper.enums import EventCategory, EventPriority
from nodescraper.models import Event, SystemInfo, TaskResult
@@ -54,17 +55,32 @@ def __init__(
max_event_priority_level: Union[EventPriority, str] = EventPriority.CRITICAL,
parent: Optional[str] = None,
task_result_hooks: Optional[list[TaskResultHook]] = None,
+ event_reporter: str = DEFAULT_EVENT_REPORTER,
+ session_id: Optional[str] = None,
**kwargs: dict[str, Any],
):
if logger is None:
logger = logging.getLogger(DEFAULT_LOGGER)
self.system_info = system_info
self.logger = logger
+ self.event_reporter = event_reporter
self.max_event_priority_level = max_event_priority_level
self.parent = parent
if not task_result_hooks:
task_result_hooks = []
self.task_result_hooks = task_result_hooks
+
+ if session_id is None and "session_id" in kwargs:
+ session_id = kwargs.pop("session_id") # type: ignore[assignment]
+ if session_id is not None:
+ try:
+ uuid.UUID(str(session_id))
+ except (ValueError, TypeError, AttributeError) as e:
+ raise ValueError(
+ f"session_id must be a valid UUID string, got: {session_id}"
+ ) from e
+ self.session_id: Optional[str] = str(session_id) if session_id is not None else None
+
self.result: TaskResult = self._init_result()
@property
@@ -115,6 +131,9 @@ def _build_event(
if self.parent:
data["parent"] = self.parent
+ if self.session_id is not None:
+ data["session_id"] = self.session_id
+
if self.system_info.metadata:
data["system_metadata"] = copy.copy(self.system_info.metadata)
@@ -122,6 +141,7 @@ def _build_event(
priority = self.max_event_priority_level
event = Event(
+ reporter=self.event_reporter,
category=category,
description=description,
priority=priority,
@@ -151,7 +171,22 @@ def _log_event(
)
if console_log:
- self.logger.log(getattr(logging, priority.name, logging.INFO), description)
+ level = getattr(logging, priority.name, logging.INFO)
+ prefix = ""
+ if data:
+ et = data.get("exception_type")
+ if et:
+ prefix = f"[{et}] "
+ self.logger.log(level, "%s%s", prefix, description)
+ if data:
+ tb = data.get("traceback")
+ if tb:
+ tb_text = "".join(tb) if isinstance(tb, list) else str(tb)
+ if tb_text.strip():
+ self.logger.log(level, "Traceback:\n%s", tb_text.rstrip())
+ det = data.get("details")
+ if det and not tb:
+ self.logger.log(level, "Details: %s", det)
self.result.events.append(event)
diff --git a/nodescraper/models/__init__.py b/nodescraper/models/__init__.py
index af9673c1..6b7ebb00 100644
--- a/nodescraper/models/__init__.py
+++ b/nodescraper/models/__init__.py
@@ -25,11 +25,16 @@
###############################################################################
from .analyzerargs import AnalyzerArgs
from .collectorargs import CollectorArgs
-from .datamodel import DataModel
+from .datamodel import DataModel, FileModel, TDataModel
from .datapluginresult import DataPluginResult
from .event import Event
from .pluginconfig import PluginConfig
from .pluginresult import PluginResult
+from .priority_override import (
+ NO_CHANGE,
+ PriorityOverrideRule,
+ apply_priority_override_rules,
+)
from .systeminfo import SystemInfo
from .taskresult import TaskResult
from .timerangeargs import TimeRangeAnalysisArgs
@@ -38,11 +43,16 @@
"AnalyzerArgs",
"CollectorArgs",
"DataModel",
+ "FileModel",
+ "TDataModel",
"TaskResult",
"Event",
"SystemInfo",
"PluginResult",
"DataPluginResult",
"PluginConfig",
+ "NO_CHANGE",
+ "PriorityOverrideRule",
+ "apply_priority_override_rules",
"TimeRangeAnalysisArgs",
]
diff --git a/nodescraper/models/datamodel.py b/nodescraper/models/datamodel.py
index 78a5df06..c310c810 100644
--- a/nodescraper/models/datamodel.py
+++ b/nodescraper/models/datamodel.py
@@ -29,7 +29,7 @@
import tarfile
from typing import TypeVar, Union
-from pydantic import BaseModel, Field, field_validator
+from pydantic import BaseModel, field_validator
from nodescraper.utils import get_unique_filename
@@ -37,7 +37,7 @@
class FileModel(BaseModel):
- file_contents: bytes = Field(exclude=True)
+ file_contents: bytes
file_name: str
@field_validator("file_contents", mode="before")
diff --git a/nodescraper/models/event.py b/nodescraper/models/event.py
index 33cf2801..de2ecc64 100644
--- a/nodescraper/models/event.py
+++ b/nodescraper/models/event.py
@@ -28,10 +28,11 @@
import re
import uuid
from enum import Enum
-from typing import Any, Optional, Union
+from typing import Any, Optional, Union, cast
from pydantic import BaseModel, Field, field_serializer, field_validator
+from nodescraper.constants import DEFAULT_EVENT_REPORTER
from nodescraper.enums import EventPriority
@@ -66,7 +67,7 @@ class Event(BaseModel):
timestamp: datetime.datetime = Field(
default_factory=lambda: datetime.datetime.now(datetime.timezone.utc)
)
- reporter: str = "NODE_SCRAPER"
+ reporter: str = DEFAULT_EVENT_REPORTER
category: str
description: str
data: dict = Field(default_factory=dict)
@@ -113,15 +114,30 @@ def validate_category(cls, category: Optional[Union[str, Enum]]) -> str:
@field_validator("priority", mode="before")
@classmethod
- def validate_priority(cls, priority: Union[str, EventPriority]) -> EventPriority:
- """Allow priority to be set via string priority name
+ def validate_priority(cls, priority: Union[str, int, EventPriority]) -> EventPriority:
+ """Allow priority via :class:`EventPriority`, name string, or integer value.
+
+ Integer values use :class:`~enum.IntEnum` construction (same numeric scale as
+ ``EventPriority``). Values outside the enum (e.g. foreign severity codes) map
+ to :attr:`EventPriority.ERROR`. Booleans are rejected (``bool`` is a subclass
+ of ``int`` in Python).
+
Args:
- priority (Union[str, EventPriority]): event priority string or enum
+ priority: Enum, member name, or integer severity.
+
Raises:
- ValueError: if priority string is an invalid value
+ ValueError: if *priority* is a boolean or an invalid string name.
+
Returns:
- EventPriority: priority enum
+ Resolved :class:`EventPriority`.
"""
+ if type(priority) is bool:
+ raise ValueError("priority must not be a boolean")
+ if isinstance(priority, int):
+ try:
+ return cast(EventPriority, EventPriority(priority))
+ except ValueError:
+ return EventPriority.ERROR
if isinstance(priority, str):
try:
return getattr(EventPriority, priority.upper())
@@ -131,7 +147,10 @@ def validate_priority(cls, priority: Union[str, EventPriority]) -> EventPriority
) from e
if isinstance(priority, EventPriority):
return priority
- raise ValueError("priority must be an EventPriority or its name as a string")
+ raise ValueError(
+ "priority must be an EventPriority, its name as a string, or an int "
+ "(unknown ints map to ERROR)"
+ )
@field_serializer("priority")
def serialize_priority(self, priority: EventPriority, _info) -> str:
diff --git a/nodescraper/models/priority_override.py b/nodescraper/models/priority_override.py
new file mode 100644
index 00000000..7f40aac7
--- /dev/null
+++ b/nodescraper/models/priority_override.py
@@ -0,0 +1,126 @@
+###############################################################################
+#
+# MIT License
+#
+# Copyright (C) 2026 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+###############################################################################
+
+"""First-match-wins priority override rules for :class:`~nodescraper.models.event.Event`."""
+
+from __future__ import annotations
+
+import re
+from typing import Optional
+
+from pydantic import BaseModel, ConfigDict, field_validator, model_validator
+
+from nodescraper.enums import EventPriority
+from nodescraper.models.event import Event
+
+__all__ = [
+ "NO_CHANGE",
+ "PriorityOverrideRule",
+ "apply_priority_override_rules",
+]
+
+NO_CHANGE = "NO_CHANGE"
+
+
+def _normalize_category(category: str) -> str:
+    category = str(category).strip().upper()  # canonical form: trimmed, upper-case
+    return re.sub(r"[\s-]", "_", category)  # each whitespace char or hyphen becomes "_"
+
+
+class PriorityOverrideRule(BaseModel):
+    """One override rule; first matching rule wins when applied to an event list."""
+
+    model_config = ConfigDict(extra="forbid", str_strip_whitespace=True)
+
+    match_all: bool = False  # when True the rule matches every event
+    new_priority: str  # EventPriority member name or NO_CHANGE (normalized by the validator)
+    description: Optional[str] = None  # exact match against event.description
+    message: Optional[str] = None  # substring of data["match_content"] or of event.description
+    event_category: Optional[str] = None  # compared after _normalize_category on both sides
+
+    @field_validator("new_priority", mode="before")
+    @classmethod
+    def _validate_new_priority_token(cls, value: object) -> str:
+        if value is None:
+            raise ValueError("new_priority is required")
+        if value == NO_CHANGE:
+            return NO_CHANGE
+        if isinstance(value, EventPriority):
+            return value.name
+        if not isinstance(value, str):
+            raise ValueError("new_priority must be a string or EventPriority")
+        upper = value.strip().upper()  # "before" mode sees raw input, not yet whitespace-stripped
+        if upper == NO_CHANGE:
+            return NO_CHANGE
+        if upper not in {p.name for p in EventPriority}:
+            raise ValueError(
+                f"new_priority must be {NO_CHANGE} or one of " f"{[p.name for p in EventPriority]}"
+            )
+        return upper
+
+    @model_validator(mode="after")
+    def _require_match_all_or_selectors(self) -> PriorityOverrideRule:
+        if self.match_all:
+            return self
+        if self.description is None and self.message is None and self.event_category is None:
+            raise ValueError(
+                "set match_all=True or provide at least one selector among "
+                "description, message, and event_category"
+            )
+        return self
+
+    def matches_event(self, event: Event) -> bool:
+        """Return True if *event* satisfies every selector set on this rule (or match_all)."""
+        if self.match_all:
+            return True
+        if self.description is not None and event.description != self.description:
+            return False
+        if self.message is not None:
+            match_content = event.data.get("match_content", "")
+            if not isinstance(match_content, str):
+                match_content = str(match_content)
+            if self.message not in match_content and self.message not in event.description:
+                return False
+        if self.event_category is not None:
+            if _normalize_category(self.event_category) != _normalize_category(event.category):
+                return False
+        return True
+
+
+def apply_priority_override_rules(events: list[Event], rules: list[dict]) -> None:
+    """Apply *rules* in order to each event in *events* (in place); first match wins.
+
+    ``new_priority`` may be :data:`NO_CHANGE` to keep the current priority while still
+    stopping further rules for that event.
+    """
+    parsed = [PriorityOverrideRule.model_validate(r) for r in rules]  # validate all rules first
+    for event in events:
+        for rule in parsed:
+            if not rule.matches_event(event):
+                continue
+            if rule.new_priority != NO_CHANGE:
+                event.priority = EventPriority[rule.new_priority]  # enum lookup by member name
+            break  # first match wins, even when the rule is NO_CHANGE
diff --git a/nodescraper/pluginexecutor.py b/nodescraper/pluginexecutor.py
index 8a6998f2..0821ff20 100644
--- a/nodescraper/pluginexecutor.py
+++ b/nodescraper/pluginexecutor.py
@@ -28,6 +28,7 @@
import copy
import inspect
import logging
+import uuid
from collections import deque
from typing import Optional, Type, Union
@@ -53,6 +54,7 @@ def __init__(
logger: Optional[logging.Logger] = None,
plugin_registry: Optional[PluginRegistry] = None,
log_path: Optional[str] = None,
+ session_id: Optional[str] = None,
):
if logger is None:
@@ -65,6 +67,17 @@ def __init__(
system_info = SystemInfo()
self.system_info = system_info
+ if session_id is not None:
+ try:
+ uuid.UUID(session_id)
+ self.session_id = session_id
+ except (ValueError, AttributeError, TypeError) as e:
+ raise ValueError(
+ f"session_id must be a valid UUID string, got: {session_id}"
+ ) from e
+ else:
+ self.session_id = None
+
self.plugin_config = self.merge_configs(plugin_configs)
self.connection_library: dict[type[ConnectionManager], ConnectionManager] = {}
@@ -90,6 +103,7 @@ def __init__(
logger=self.logger,
connection_args=connection_args,
task_result_hooks=self.connection_result_hooks,
+ session_id=self.session_id,
)
self.logger.info("System Name: %s", self.system_info.name)
@@ -157,6 +171,7 @@ def run_queue(self) -> list[PluginResult]:
"logger": self.logger,
"queue_callback": plugin_queue.append,
"log_path": self.log_path,
+ "session_id": self.session_id,
}
if plugin_class.CONNECTION_TYPE:
@@ -192,6 +207,7 @@ def run_queue(self) -> list[PluginResult]:
system_info=self.system_info,
logger=self.logger,
task_result_hooks=self.connection_result_hooks,
+ session_id=self.session_id,
)
init_payload["connection_manager"] = self.connection_library[mgr_impl]
diff --git a/nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py b/nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py
index 82a82f91..9b0dc295 100644
--- a/nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py
+++ b/nodescraper/plugins/inband/device_enumeration/device_enumeration_collector.py
@@ -38,7 +38,11 @@ class DeviceEnumerationCollector(InBandDataCollector[DeviceEnumerationDataModel,
DATA_MODEL = DeviceEnumerationDataModel
- CMD_GPU_COUNT_LINUX = "lspci -d {vendorid_ep}: | grep -i 'VGA\\|Display\\|3D' | wc -l"
+ CMD_GPU_COUNT_LINUX = (
+ "lspci -d {vendorid_ep}: | grep -iE "
+ "'VGA|Display|3D|Processing accelerators|Co-processor|Accelerator' | "
+ "grep -vi 'Virtual Function' | wc -l"
+ )
CMD_VF_COUNT_LINUX = "lspci -d {vendorid_ep}: | grep -i 'Virtual Function' | wc -l"
CMD_LSCPU_LINUX = "lscpu"
CMD_LSHW_LINUX = "lshw"
diff --git a/nodescraper/plugins/inband/network/ethtool_vendor.py b/nodescraper/plugins/inband/network/ethtool_vendor.py
new file mode 100644
index 00000000..47e91be9
--- /dev/null
+++ b/nodescraper/plugins/inband/network/ethtool_vendor.py
@@ -0,0 +1,662 @@
+###############################################################################
+#
+# MIT License
+#
+# Copyright (c) 2026 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+###############################################################################
+"""Vendor-specific ethtool -S statistics models (Pollara / Thor2 / ConnectX-7)."""
+
+from typing import ClassVar, Optional, Union
+
+from pydantic import BaseModel, Field, model_validator
+from typing_extensions import Self
+
+
+class PollaraEthtoolStatistics(BaseModel):
+ """ifname ionic. Keeping only fields of interest. Skip queue-specific stats for now"""
+
+ rx_csum_error: Optional[int] = None
+ hw_tx_dropped: Optional[int] = None
+ hw_rx_dropped: Optional[int] = None
+ hw_rx_over_errors: Optional[int] = None
+ hw_rx_missed_errors: Optional[int] = None
+ hw_tx_aborted_errors: Optional[int] = None
+ frames_rx_bad_fcs: Optional[int] = None
+ frames_rx_bad_all: Optional[int] = None
+ frames_rx_pause: Optional[int] = None
+ frames_rx_bad_length: Optional[int] = None
+ frames_rx_undersized: Optional[int] = None
+ frames_rx_oversized: Optional[int] = None
+ frames_rx_fragments: Optional[int] = None
+ frames_rx_jabber: Optional[int] = None
+ frames_rx_pripause: Optional[int] = None
+ frames_rx_stomped_crc: Optional[int] = None
+ frames_rx_too_long: Optional[int] = None
+ frames_rx_dropped: Optional[int] = None
+ frames_rx_less_than_64b: Optional[int] = None
+ frames_tx_bad: Optional[int] = None
+ frames_tx_pause: Optional[int] = None
+ frames_tx_pripause: Optional[int] = None
+ frames_tx_less_than_64b: Optional[int] = None
+ frames_tx_pri_0: Optional[int] = None
+ frames_tx_pri_1: Optional[int] = None
+ frames_tx_pri_2: Optional[int] = None
+ frames_tx_pri_3: Optional[int] = None
+ frames_tx_pri_4: Optional[int] = None
+ frames_tx_pri_5: Optional[int] = None
+ frames_tx_pri_6: Optional[int] = None
+ frames_tx_pri_7: Optional[int] = None
+ frames_rx_pri_0: Optional[int] = None
+ frames_rx_pri_1: Optional[int] = None
+ frames_rx_pri_2: Optional[int] = None
+ frames_rx_pri_3: Optional[int] = None
+ frames_rx_pri_4: Optional[int] = None
+ frames_rx_pri_5: Optional[int] = None
+ frames_rx_pri_6: Optional[int] = None
+ frames_rx_pri_7: Optional[int] = None
+ tx_pripause_0_1us_count: Optional[int] = None
+ tx_pripause_1_1us_count: Optional[int] = None
+ tx_pripause_2_1us_count: Optional[int] = None
+ tx_pripause_3_1us_count: Optional[int] = None
+ tx_pripause_4_1us_count: Optional[int] = None
+ tx_pripause_5_1us_count: Optional[int] = None
+ tx_pripause_6_1us_count: Optional[int] = None
+ tx_pripause_7_1us_count: Optional[int] = None
+ rx_pripause_0_1us_count: Optional[int] = None
+ rx_pripause_1_1us_count: Optional[int] = None
+ rx_pripause_2_1us_count: Optional[int] = None
+ rx_pripause_3_1us_count: Optional[int] = None
+ rx_pripause_4_1us_count: Optional[int] = None
+ rx_pripause_5_1us_count: Optional[int] = None
+ rx_pripause_6_1us_count: Optional[int] = None
+ rx_pripause_7_1us_count: Optional[int] = None
+ rx_pause_1us_count: Optional[int] = None
+ frames_tx_truncated: Optional[int] = None
+
+ error_fields: ClassVar[list[str]] = [
+ "rx_csum_error",
+ "hw_tx_dropped",
+ "hw_rx_dropped",
+ "hw_rx_over_errors",
+ "hw_rx_missed_errors",
+ "hw_tx_aborted_errors",
+ "frames_rx_bad_fcs",
+ "frames_rx_bad_all",
+ "frames_rx_bad_length",
+ "frames_rx_undersized",
+ "frames_rx_oversized",
+ "frames_rx_fragments",
+ "frames_rx_jabber",
+ "frames_rx_stomped_crc",
+ "frames_rx_too_long",
+ "frames_rx_dropped",
+ "frames_rx_less_than_64b",
+ "frames_tx_bad",
+ "frames_tx_pause",
+ "frames_tx_pripause",
+ "frames_tx_less_than_64b",
+ "frames_tx_pri_0",
+ "frames_tx_pri_1",
+ "frames_tx_pri_2",
+ "frames_tx_pri_3",
+ "frames_tx_pri_4",
+ "frames_tx_pri_5",
+ "frames_tx_pri_6",
+ "frames_tx_pri_7",
+ "tx_pripause_0_1us_count",
+ "tx_pripause_1_1us_count",
+ "tx_pripause_2_1us_count",
+ "tx_pripause_3_1us_count",
+ "tx_pripause_4_1us_count",
+ "tx_pripause_5_1us_count",
+ "tx_pripause_6_1us_count",
+ "tx_pripause_7_1us_count",
+ "frames_tx_truncated",
+ ]
+
+ warning_fields: ClassVar[list[str]] = [
+ "frames_rx_pause",
+ "frames_rx_pripause",
+ "frames_rx_pri_0",
+ "frames_rx_pri_1",
+ "frames_rx_pri_2",
+ "frames_rx_pri_3",
+ "frames_rx_pri_4",
+ "frames_rx_pri_5",
+ "frames_rx_pri_6",
+ "frames_rx_pri_7",
+ "rx_pripause_0_1us_count",
+ "rx_pripause_1_1us_count",
+ "rx_pripause_2_1us_count",
+ "rx_pripause_3_1us_count",
+ "rx_pripause_4_1us_count",
+ "rx_pripause_5_1us_count",
+ "rx_pripause_6_1us_count",
+ "rx_pripause_7_1us_count",
+ "rx_pause_1us_count",
+ ]
+
+
+class Thor2EthtoolStatistics(BaseModel):
+ """ifname bnxt. Keeping only fields of interest. Skip queue-specific stats for now"""
+
+ rx_total_l4_csum_errors: Optional[int] = None
+ rx_total_resets: Optional[int] = None
+ rx_total_buf_errors: Optional[int] = None
+ rx_total_oom_discards: Optional[int] = None
+ rx_total_netpoll_discards: Optional[int] = None
+ rx_total_ring_discards: Optional[int] = None
+ tx_total_resets: Optional[int] = None
+ tx_total_ring_discards: Optional[int] = None
+ total_missed_irqs: Optional[int] = None
+ ktls_tx_rec_err: Optional[int] = None
+ ktls_rx_resync_discard: Optional[int] = None
+ rx_fcs_err_frames: Optional[int] = None
+ rx_pause_frames: Optional[int] = None
+ rx_pfc_frames: Optional[int] = None
+ rx_align_err_frames: Optional[int] = None
+ rx_ovrsz_frames: Optional[int] = None
+ rx_jbr_frames: Optional[int] = None
+ rx_mtu_err_frames: Optional[int] = None
+ rx_pfc_ena_frames_pri0: Optional[int] = None
+ rx_pfc_ena_frames_pri1: Optional[int] = None
+ rx_pfc_ena_frames_pri2: Optional[int] = None
+ rx_pfc_ena_frames_pri3: Optional[int] = None
+ rx_pfc_ena_frames_pri4: Optional[int] = None
+ rx_pfc_ena_frames_pri5: Optional[int] = None
+ rx_pfc_ena_frames_pri6: Optional[int] = None
+ rx_pfc_ena_frames_pri7: Optional[int] = None
+ rx_undrsz_frames: Optional[int] = None
+ rx_runt_bytes: Optional[int] = None
+ rx_runt_frames: Optional[int] = None
+ rx_stat_discard: Optional[int] = None
+ rx_stat_err: Optional[int] = None
+ tx_pause_frames: Optional[int] = None
+ tx_pfc_frames: Optional[int] = None
+ tx_jabber_frames: Optional[int] = None
+ tx_fcs_err_frames: Optional[int] = None
+ tx_err: Optional[int] = None
+ tx_fifo_underruns: Optional[int] = None
+ tx_pfc_ena_frames_pri0: Optional[int] = None
+ tx_pfc_ena_frames_pri1: Optional[int] = None
+ tx_pfc_ena_frames_pri2: Optional[int] = None
+ tx_pfc_ena_frames_pri3: Optional[int] = None
+ tx_pfc_ena_frames_pri4: Optional[int] = None
+ tx_pfc_ena_frames_pri5: Optional[int] = None
+ tx_pfc_ena_frames_pri6: Optional[int] = None
+ tx_pfc_ena_frames_pri7: Optional[int] = None
+ tx_total_collisions: Optional[int] = None
+ tx_stat_discard: Optional[int] = None
+ tx_stat_error: Optional[int] = None
+ link_down_events: Optional[int] = None
+ continuous_pause_events: Optional[int] = None
+ resume_pause_events: Optional[int] = None
+ continuous_roce_pause_events: Optional[int] = None
+ resume_roce_pause_events: Optional[int] = None
+ pfc_pri0_rx_transitions: Optional[int] = None
+ pfc_pri1_rx_transitions: Optional[int] = None
+ pfc_pri2_rx_transitions: Optional[int] = None
+ pfc_pri3_rx_transitions: Optional[int] = None
+ pfc_pri4_rx_transitions: Optional[int] = None
+ pfc_pri5_rx_transitions: Optional[int] = None
+ pfc_pri6_rx_transitions: Optional[int] = None
+ pfc_pri7_rx_transitions: Optional[int] = None
+ rx_pcs_symbol_err: Optional[int] = None
+ rx_discard_bytes_cos0: Optional[int] = None
+ rx_discard_packets_cos0: Optional[int] = None
+ rx_discard_bytes_cos1: Optional[int] = None
+ rx_discard_packets_cos1: Optional[int] = None
+ rx_discard_bytes_cos2: Optional[int] = None
+ rx_discard_packets_cos2: Optional[int] = None
+ rx_discard_bytes_cos3: Optional[int] = None
+ rx_discard_packets_cos3: Optional[int] = None
+ rx_discard_bytes_cos4: Optional[int] = None
+ rx_discard_packets_cos4: Optional[int] = None
+ rx_discard_bytes_cos5: Optional[int] = None
+ rx_discard_packets_cos5: Optional[int] = None
+ rx_discard_bytes_cos6: Optional[int] = None
+ rx_discard_packets_cos6: Optional[int] = None
+ rx_discard_bytes_cos7: Optional[int] = None
+ rx_discard_packets_cos7: Optional[int] = None
+ rx_fec_uncorrectable_blocks: Optional[int] = None
+ rx_filter_miss: Optional[int] = None
+ pfc_pri0_tx_transitions: Optional[int] = None
+ pfc_pri1_tx_transitions: Optional[int] = None
+ pfc_pri2_tx_transitions: Optional[int] = None
+ pfc_pri3_tx_transitions: Optional[int] = None
+ pfc_pri4_tx_transitions: Optional[int] = None
+ pfc_pri5_tx_transitions: Optional[int] = None
+ pfc_pri6_tx_transitions: Optional[int] = None
+ pfc_pri7_tx_transitions: Optional[int] = None
+ hw_db_recov_dbs_dropped: Optional[int] = None
+ hw_db_recov_oo_drop_count: Optional[int] = None
+ lpbk_tx_discards: Optional[int] = None
+ lpbk_tx_errors: Optional[int] = None
+ lpbk_rx_discards: Optional[int] = None
+ lpbk_rx_errors: Optional[int] = None
+
+ error_fields: ClassVar[list[str]] = [
+ "rx_total_l4_csum_errors",
+ "rx_total_buf_errors",
+ "rx_total_oom_discards",
+ "rx_total_netpoll_discards",
+ "rx_total_ring_discards",
+ "tx_total_ring_discards",
+ "total_missed_irqs",
+ "ktls_tx_rec_err",
+ "ktls_rx_resync_discard",
+ "rx_fcs_err_frames",
+ "rx_align_err_frames",
+ "rx_ovrsz_frames",
+ "rx_jbr_frames",
+ "rx_mtu_err_frames",
+ "rx_undrsz_frames",
+ "rx_runt_bytes",
+ "rx_runt_frames",
+ "rx_stat_discard",
+ "rx_stat_err",
+ "tx_pause_frames",
+ "tx_pfc_frames",
+ "tx_jabber_frames",
+ "tx_fcs_err_frames",
+ "tx_err",
+ "tx_fifo_underruns",
+ "tx_pfc_ena_frames_pri0",
+ "tx_pfc_ena_frames_pri1",
+ "tx_pfc_ena_frames_pri2",
+ "tx_pfc_ena_frames_pri3",
+ "tx_pfc_ena_frames_pri4",
+ "tx_pfc_ena_frames_pri5",
+ "tx_pfc_ena_frames_pri6",
+ "tx_pfc_ena_frames_pri7",
+ "tx_total_collisions",
+ "tx_stat_discard",
+ "tx_stat_error",
+ "link_down_events",
+ "continuous_pause_events",
+ "resume_pause_events",
+ "continuous_roce_pause_events",
+ "resume_roce_pause_events",
+ "rx_pcs_symbol_err",
+ "rx_discard_bytes_cos0",
+ "rx_discard_packets_cos0",
+ "rx_discard_bytes_cos1",
+ "rx_discard_packets_cos1",
+ "rx_discard_bytes_cos2",
+ "rx_discard_packets_cos2",
+ "rx_discard_bytes_cos3",
+ "rx_discard_packets_cos3",
+ "rx_discard_bytes_cos4",
+ "rx_discard_packets_cos4",
+ "rx_discard_bytes_cos5",
+ "rx_discard_packets_cos5",
+ "rx_discard_bytes_cos6",
+ "rx_discard_packets_cos6",
+ "rx_discard_bytes_cos7",
+ "rx_discard_packets_cos7",
+ "rx_fec_uncorrectable_blocks",
+ "rx_filter_miss",
+ "pfc_pri0_tx_transitions",
+ "pfc_pri1_tx_transitions",
+ "pfc_pri2_tx_transitions",
+ "pfc_pri3_tx_transitions",
+ "pfc_pri4_tx_transitions",
+ "pfc_pri5_tx_transitions",
+ "pfc_pri6_tx_transitions",
+ "pfc_pri7_tx_transitions",
+ "hw_db_recov_dbs_dropped",
+ "hw_db_recov_oo_drop_count",
+ "lpbk_tx_discards",
+ "lpbk_tx_errors",
+ "lpbk_rx_discards",
+ "lpbk_rx_errors",
+ ]
+
+ warning_fields: ClassVar[list[str]] = [
+ "rx_total_resets",
+ "tx_total_resets",
+ "rx_pause_frames",
+ "rx_pfc_frames",
+ "rx_pfc_ena_frames_pri0",
+ "rx_pfc_ena_frames_pri1",
+ "rx_pfc_ena_frames_pri2",
+ "rx_pfc_ena_frames_pri3",
+ "rx_pfc_ena_frames_pri4",
+ "rx_pfc_ena_frames_pri5",
+ "rx_pfc_ena_frames_pri6",
+ "rx_pfc_ena_frames_pri7",
+ "pfc_pri0_rx_transitions",
+ "pfc_pri1_rx_transitions",
+ "pfc_pri2_rx_transitions",
+ "pfc_pri3_rx_transitions",
+ "pfc_pri4_rx_transitions",
+ "pfc_pri5_rx_transitions",
+ "pfc_pri6_rx_transitions",
+ "pfc_pri7_rx_transitions",
+ ]
+
+
+class Cx7EthtoolStatistics(BaseModel):
+ """ifname mlx. Keeping only fields of interest. Skip queue-specific stats for now"""
+
+ rx_xdp_drop: Optional[int] = None
+ rx_xdp_tx_err: Optional[int] = None
+ tx_queue_dropped: Optional[int] = None
+ tx_cqe_err: Optional[int] = None
+ tx_xdp_err: Optional[int] = None
+ rx_wqe_err: Optional[int] = None
+ rx_oversize_pkts_sw_drop: Optional[int] = None
+ rx_buff_alloc_err: Optional[int] = None
+ rx_arfs_err: Optional[int] = None
+ rx_tls_err: Optional[int] = None
+ rx_xsk_xdp_drop: Optional[int] = None
+ rx_xsk_wqe_err: Optional[int] = None
+ rx_xsk_oversize_pkts_sw_drop: Optional[int] = None
+ rx_xsk_buff_alloc_err: Optional[int] = None
+ tx_xsk_err: Optional[int] = None
+ rx_out_of_buffer: Optional[int] = None
+ rx_if_down_packets: Optional[int] = None
+ rx_steer_missed_packets: Optional[int] = None
+ rx_oversize_pkts_buffer: Optional[int] = None
+ rx_crc_errors_phy: Optional[int] = None
+ rx_in_range_len_errors_phy: Optional[int] = None
+ rx_out_of_range_len_phy: Optional[int] = None
+ rx_oversize_pkts_phy: Optional[int] = None
+ rx_symbol_err_phy: Optional[int] = None
+ rx_unsupported_op_phy: Optional[int] = None
+ rx_pause_ctrl_phy: Optional[int] = None
+ tx_pause_ctrl_phy: Optional[int] = None
+ rx_discards_phy: Optional[int] = None
+ tx_discards_phy: Optional[int] = None
+ tx_errors_phy: Optional[int] = None
+ rx_undersize_pkts_phy: Optional[int] = None
+ rx_fragments_phy: Optional[int] = None
+ rx_jabbers_phy: Optional[int] = None
+ link_down_events_phy: Optional[int] = None
+ rx_pcs_symbol_err_phy: Optional[int] = None
+ rx_pci_signal_integrity: Optional[int] = None
+ tx_pci_signal_integrity: Optional[int] = None
+ outbound_pci_stalled_rd: Optional[int] = None
+ outbound_pci_stalled_wr: Optional[int] = None
+ outbound_pci_stalled_rd_events: Optional[int] = None
+ outbound_pci_stalled_wr_events: Optional[int] = None
+ rx_prio0_discards: Optional[int] = None
+ rx_prio1_discards: Optional[int] = None
+ rx_prio2_discards: Optional[int] = None
+ rx_prio3_discards: Optional[int] = None
+ rx_prio4_discards: Optional[int] = None
+ rx_prio5_discards: Optional[int] = None
+ rx_prio6_discards: Optional[int] = None
+ rx_prio7_discards: Optional[int] = None
+ rx_global_pause: Optional[int] = None
+ rx_prio0_pause: Optional[int] = None
+ rx_prio1_pause: Optional[int] = None
+ rx_prio2_pause: Optional[int] = None
+ rx_prio3_pause: Optional[int] = None
+ rx_prio4_pause: Optional[int] = None
+ rx_prio5_pause: Optional[int] = None
+ rx_prio6_pause: Optional[int] = None
+ rx_prio7_pause: Optional[int] = None
+ rx_global_pause_duration: Optional[int] = None
+ rx_prio0_pause_duration: Optional[int] = None
+ rx_prio1_pause_duration: Optional[int] = None
+ rx_prio2_pause_duration: Optional[int] = None
+ rx_prio3_pause_duration: Optional[int] = None
+ rx_prio4_pause_duration: Optional[int] = None
+ rx_prio5_pause_duration: Optional[int] = None
+ rx_prio6_pause_duration: Optional[int] = None
+ rx_prio7_pause_duration: Optional[int] = None
+ tx_global_pause: Optional[int] = None
+ tx_prio0_pause: Optional[int] = None
+ tx_prio1_pause: Optional[int] = None
+ tx_prio2_pause: Optional[int] = None
+ tx_prio3_pause: Optional[int] = None
+ tx_prio4_pause: Optional[int] = None
+ tx_prio5_pause: Optional[int] = None
+ tx_prio6_pause: Optional[int] = None
+ tx_prio7_pause: Optional[int] = None
+ tx_global_pause_duration: Optional[int] = None
+ tx_prio0_pause_duration: Optional[int] = None
+ tx_prio1_pause_duration: Optional[int] = None
+ tx_prio2_pause_duration: Optional[int] = None
+ tx_prio3_pause_duration: Optional[int] = None
+ tx_prio4_pause_duration: Optional[int] = None
+ tx_prio5_pause_duration: Optional[int] = None
+ tx_prio6_pause_duration: Optional[int] = None
+ tx_prio7_pause_duration: Optional[int] = None
+ rx_global_pause_transition: Optional[int] = None
+ rx_prio0_pause_transition: Optional[int] = None
+ rx_prio1_pause_transition: Optional[int] = None
+ rx_prio2_pause_transition: Optional[int] = None
+ rx_prio3_pause_transition: Optional[int] = None
+ rx_prio4_pause_transition: Optional[int] = None
+ rx_prio5_pause_transition: Optional[int] = None
+ rx_prio6_pause_transition: Optional[int] = None
+ rx_prio7_pause_transition: Optional[int] = None
+ tx_pause_storm_warning_events: Optional[int] = None
+ tx_pause_storm_error_events: Optional[int] = None
+ module_unplug: Optional[int] = None
+ module_bus_stuck: Optional[int] = None
+ module_high_temp: Optional[int] = None
+ module_bad_shorted: Optional[int] = None
+ ipsec_rx_drop_pkts: Optional[int] = None
+ ipsec_rx_drop_bytes: Optional[int] = None
+ ipsec_rx_drop_mismatch_sa_sel: Optional[int] = None
+ ipsec_tx_drop_pkts: Optional[int] = None
+ ipsec_tx_drop_bytes: Optional[int] = None
+ ipsec_rx_drop_sp_alloc: Optional[int] = None
+ ipsec_rx_drop_sadb_miss: Optional[int] = None
+ ipsec_rx_drop_syndrome: Optional[int] = None
+ ipsec_tx_drop_bundle: Optional[int] = None
+ ipsec_tx_drop_no_state: Optional[int] = None
+ ipsec_tx_drop_not_ip: Optional[int] = None
+ ipsec_tx_drop_trailer: Optional[int] = None
+ rx_prio0_buf_discard: Optional[int] = None
+ rx_prio0_cong_discard: Optional[int] = None
+ rx_prio1_buf_discard: Optional[int] = None
+ rx_prio1_cong_discard: Optional[int] = None
+ rx_prio2_buf_discard: Optional[int] = None
+ rx_prio2_cong_discard: Optional[int] = None
+ rx_prio3_buf_discard: Optional[int] = None
+ rx_prio3_cong_discard: Optional[int] = None
+ rx_prio4_buf_discard: Optional[int] = None
+ rx_prio4_cong_discard: Optional[int] = None
+ rx_prio5_buf_discard: Optional[int] = None
+ rx_prio5_cong_discard: Optional[int] = None
+ rx_prio6_buf_discard: Optional[int] = None
+ rx_prio6_cong_discard: Optional[int] = None
+ rx_prio7_buf_discard: Optional[int] = None
+ rx_prio7_cong_discard: Optional[int] = None
+
+ error_fields: ClassVar[list[str]] = [
+ "rx_xdp_drop",
+ "rx_xdp_tx_err",
+ "tx_queue_dropped",
+ "tx_cqe_err",
+ "tx_xdp_err",
+ "rx_wqe_err",
+ "rx_oversize_pkts_sw_drop",
+ "rx_buff_alloc_err",
+ "rx_arfs_err",
+ "rx_tls_err",
+ "rx_xsk_xdp_drop",
+ "rx_xsk_wqe_err",
+ "rx_xsk_oversize_pkts_sw_drop",
+ "rx_xsk_buff_alloc_err",
+ "tx_xsk_err",
+ "rx_out_of_buffer",
+ "rx_if_down_packets",
+ "rx_steer_missed_packets",
+ "rx_oversize_pkts_buffer",
+ "rx_crc_errors_phy",
+ "rx_in_range_len_errors_phy",
+ "rx_out_of_range_len_phy",
+ "rx_oversize_pkts_phy",
+ "rx_symbol_err_phy",
+ "rx_unsupported_op_phy",
+ "tx_pause_ctrl_phy",
+ "rx_discards_phy",
+ "tx_discards_phy",
+ "tx_errors_phy",
+ "rx_undersize_pkts_phy",
+ "rx_fragments_phy",
+ "rx_jabbers_phy",
+ "link_down_events_phy",
+ "rx_pcs_symbol_err_phy",
+ "rx_pci_signal_integrity",
+ "tx_pci_signal_integrity",
+ "outbound_pci_stalled_rd",
+ "outbound_pci_stalled_wr",
+ "outbound_pci_stalled_rd_events",
+ "outbound_pci_stalled_wr_events",
+ "rx_prio0_discards",
+ "rx_prio1_discards",
+ "rx_prio2_discards",
+ "rx_prio3_discards",
+ "rx_prio4_discards",
+ "rx_prio5_discards",
+ "rx_prio6_discards",
+ "rx_prio7_discards",
+ "tx_global_pause",
+ "tx_prio0_pause",
+ "tx_prio1_pause",
+ "tx_prio2_pause",
+ "tx_prio3_pause",
+ "tx_prio4_pause",
+ "tx_prio5_pause",
+ "tx_prio6_pause",
+ "tx_prio7_pause",
+ "tx_global_pause_duration",
+ "tx_prio0_pause_duration",
+ "tx_prio1_pause_duration",
+ "tx_prio2_pause_duration",
+ "tx_prio3_pause_duration",
+ "tx_prio4_pause_duration",
+ "tx_prio5_pause_duration",
+ "tx_prio6_pause_duration",
+ "tx_prio7_pause_duration",
+ "tx_pause_storm_warning_events",
+ "tx_pause_storm_error_events",
+ "module_unplug",
+ "module_bus_stuck",
+ "module_high_temp",
+ "module_bad_shorted",
+ "ipsec_rx_drop_pkts",
+ "ipsec_rx_drop_bytes",
+ "ipsec_rx_drop_mismatch_sa_sel",
+ "ipsec_tx_drop_pkts",
+ "ipsec_tx_drop_bytes",
+ "ipsec_rx_drop_sp_alloc",
+ "ipsec_rx_drop_sadb_miss",
+ "ipsec_rx_drop_syndrome",
+ "ipsec_tx_drop_bundle",
+ "ipsec_tx_drop_no_state",
+ "ipsec_tx_drop_not_ip",
+ "ipsec_tx_drop_trailer",
+ "rx_prio0_buf_discard",
+ "rx_prio0_cong_discard",
+ "rx_prio1_buf_discard",
+ "rx_prio1_cong_discard",
+ "rx_prio2_buf_discard",
+ "rx_prio2_cong_discard",
+ "rx_prio3_buf_discard",
+ "rx_prio3_cong_discard",
+ "rx_prio4_buf_discard",
+ "rx_prio4_cong_discard",
+ "rx_prio5_buf_discard",
+ "rx_prio5_cong_discard",
+ "rx_prio6_buf_discard",
+ "rx_prio6_cong_discard",
+ "rx_prio7_buf_discard",
+ "rx_prio7_cong_discard",
+ ]
+
+ warning_fields: ClassVar[list[str]] = [
+ "rx_pause_ctrl_phy",
+ "rx_global_pause",
+ "rx_prio0_pause",
+ "rx_prio1_pause",
+ "rx_prio2_pause",
+ "rx_prio3_pause",
+ "rx_prio4_pause",
+ "rx_prio5_pause",
+ "rx_prio6_pause",
+ "rx_prio7_pause",
+ "rx_global_pause_transition",
+ "rx_prio0_pause_transition",
+ "rx_prio1_pause_transition",
+ "rx_prio2_pause_transition",
+ "rx_prio3_pause_transition",
+ "rx_prio4_pause_transition",
+ "rx_prio5_pause_transition",
+ "rx_prio6_pause_transition",
+ "rx_prio7_pause_transition",
+ "rx_global_pause_duration",
+ "rx_prio0_pause_duration",
+ "rx_prio1_pause_duration",
+ "rx_prio2_pause_duration",
+ "rx_prio3_pause_duration",
+ "rx_prio4_pause_duration",
+ "rx_prio5_pause_duration",
+ "rx_prio6_pause_duration",
+ "rx_prio7_pause_duration",
+ ]
+
+
+VendorEthtoolStatisticsModel = Union[
+ PollaraEthtoolStatistics,
+ Thor2EthtoolStatistics,
+ Cx7EthtoolStatistics,
+]
+
+VendorEthtoolStatisticsCls = Union[
+ type[PollaraEthtoolStatistics],
+ type[Thor2EthtoolStatistics],
+ type[Cx7EthtoolStatistics],
+]
+
+
+# Map RDMA ifname prefixes to vendor-specific statistics models.
+# If the netdev is ens, use Cx7 (RDMA ifname prefix "mlx").
+# If the netdev is benic, the RDMA ifname prefix ("ionic" or "bnxt") selects Pollara or Thor2.
+VENDOR_PREFIX_MAP: dict[str, VendorEthtoolStatisticsCls] = {
+ "ionic": PollaraEthtoolStatistics,
+ "bnxt": Thor2EthtoolStatistics,
+ "mlx": Cx7EthtoolStatistics,
+}
+
+
+class EthtoolStatistics(BaseModel):
+    """Per-netdev ethtool -S row with optional vendor-parsed counters."""
+
+    netdev: Optional[str] = None
+    rdma_ifname: Optional[str] = Field(
+        default=None,
+        description="RDMA interface name from 'rdma link -j' used for vendor prefix selection",
+    )
+    vendor_statistics: Optional[VendorEthtoolStatisticsModel] = None
+
+    @model_validator(mode="after")
+    def validate_atleast_one_field(self) -> Self:
+        if not self.model_fields_set:  # nothing passed explicitly (an explicit None counts as set)
+            raise ValueError("At least one field must be set in EthtoolStatistics")
+        return self
diff --git a/nodescraper/plugins/inband/network/network_analyzer.py b/nodescraper/plugins/inband/network/network_analyzer.py
index dbd39fc8..27280c37 100644
--- a/nodescraper/plugins/inband/network/network_analyzer.py
+++ b/nodescraper/plugins/inband/network/network_analyzer.py
@@ -61,17 +61,16 @@ class NetworkAnalyzer(RegexAnalyzer[NetworkDataModel, NetworkAnalyzerArgs]):
def analyze_data(
self, data: NetworkDataModel, args: Optional[NetworkAnalyzerArgs] = None
) -> TaskResult:
- """Analyze network statistics for non-zero error counters.
- Currently only checks ethtool -S statistics.
+ """Analyze ethtool -S statistics: regex-based (per interface) and vendor-based (RDMA-scoped).
Args:
- data: Network data model with ethtool_info containing interface statistics.
+ data: Network data model with ethtool_info and/or rdma_ethtool_statistics.
args: Optional analyzer arguments with custom error regex support.
Returns:
- TaskResult with status OK if no errors, ERROR if any error counter > 0.
+ TaskResult with OK, WARNING (no data or vendor warning counters only), or ERROR.
"""
- if not data.ethtool_info:
+ if not data.ethtool_info and not data.rdma_ethtool_statistics:
self.result.message = "No network devices found"
self.result.status = ExecutionStatus.WARNING
return self.result
@@ -81,26 +80,23 @@ def analyze_data(
final_error_regex = self._convert_and_extend_error_regex(args.error_regex, self.ERROR_REGEX)
- error_state = False
+ regex_error = False
for interface_name, ethtool_info in data.ethtool_info.items():
- errors_on_interface = [] # (error_field, value)
- # Loop through all statistics in the ethtool statistics dict
+ errors_on_interface: list[tuple[str, int]] = []
for stat_name, stat_value in ethtool_info.statistics.items():
- # Check if this statistic matches any error field pattern
for error_regex_obj in final_error_regex:
if error_regex_obj.regex.match(stat_name):
- # Try to convert string value to int
try:
value = int(stat_value)
except (ValueError, TypeError):
- break # Skip non-numeric values
+ break
if value > 0:
errors_on_interface.append((stat_name, value))
- break # Stop checking patterns once we find a match
+ break
if errors_on_interface:
- error_state = True
+ regex_error = True
error_names = [e[0] for e in errors_on_interface]
errors_data = {field: value for field, value in errors_on_interface}
self._log_event(
@@ -114,9 +110,49 @@ def analyze_data(
console_log=True,
)
- if error_state:
+ vendor_error = False
+ vendor_warning = False
+ for stat in data.rdma_ethtool_statistics:
+ if stat.vendor_statistics is None:
+ continue
+
+ vs = stat.vendor_statistics
+ error_fields = vs.error_fields
+ warning_fields = vs.warning_fields
+
+ for field_name in error_fields + warning_fields:
+ error_value = getattr(vs, field_name, None)
+ if error_value is not None and error_value > 0:
+ is_warning_tier = field_name in warning_fields
+ priority = EventPriority.WARNING if is_warning_tier else EventPriority.ERROR
+ if is_warning_tier:
+ vendor_warning = True
+ else:
+ vendor_error = True
+ desc = (
+ f"Ethtool warning detected: {field_name}"
+ if is_warning_tier
+ else f"Ethtool error detected: {field_name}"
+ )
+ self._log_event(
+ category=EventCategory.NETWORK,
+ description=desc,
+ data={
+ "netdev": stat.netdev,
+ "rdma_ifname": stat.rdma_ifname,
+ "error_field": field_name,
+ "error_count": error_value,
+ },
+ priority=priority,
+ console_log=True,
+ )
+
+ if regex_error or vendor_error:
self.result.message = "Network errors detected in statistics"
self.result.status = ExecutionStatus.ERROR
+ elif vendor_warning:
+ self.result.message = "Network vendor ethtool warning counters non-zero"
+ self.result.status = ExecutionStatus.WARNING
else:
self.result.message = "No network errors detected in statistics"
self.result.status = ExecutionStatus.OK
diff --git a/nodescraper/plugins/inband/network/network_collector.py b/nodescraper/plugins/inband/network/network_collector.py
index d530bd98..7e5e4a39 100644
--- a/nodescraper/plugins/inband/network/network_collector.py
+++ b/nodescraper/plugins/inband/network/network_collector.py
@@ -23,15 +23,24 @@
# SOFTWARE.
#
###############################################################################
+import json
import re
from typing import Dict, List, Optional, Tuple
+from pydantic import ValidationError
+
from nodescraper.base import InBandDataCollector
from nodescraper.connection.inband import TextFileArtifact
from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily
from nodescraper.models import TaskResult
+from nodescraper.utils import get_exception_traceback
from .collector_args import NetworkCollectorArgs
+from .ethtool_vendor import (
+ VENDOR_PREFIX_MAP,
+ EthtoolStatistics,
+ VendorEthtoolStatisticsModel,
+)
from .networkdata import (
EthtoolInfo,
IpAddress,
@@ -53,6 +62,7 @@ class NetworkCollector(InBandDataCollector[NetworkDataModel, NetworkCollectorArg
CMD_NEIGHBOR = "ip neighbor show"
CMD_ETHTOOL_TEMPLATE = "ethtool {interface}"
CMD_ETHTOOL_S_TEMPLATE = "ethtool -S {interface}"
+ CMD_RDMA_LINK_JSON = "rdma link -j"
CMD_PING = "ping"
CMD_WGET = "wget"
CMD_CURL = "curl"
@@ -519,6 +529,151 @@ def _collect_ethtool_info(self, interfaces: List[NetworkInterface]) -> Dict[str,
return ethtool_data
+ def _collect_rdma_link_json(self) -> Optional[list[dict]]:
+ """Parse JSON from `rdma link -j`. Returns None on failure, [] when no links."""
+ res = self._run_sut_cmd(self.CMD_RDMA_LINK_JSON)
+ if res.exit_code != 0:
+ self._log_event(
+ category=EventCategory.NETWORK,
+ description="rdma link -j failed (RDMA-scoped ethtool collection skipped)",
+ data={
+ "command": self.CMD_RDMA_LINK_JSON,
+ "exit_code": res.exit_code,
+ "stderr": res.stderr,
+ },
+ priority=EventPriority.WARNING,
+ )
+ return None
+ if not res.stdout.strip():
+ return []
+ try:
+ parsed = json.loads(res.stdout)
+ except json.JSONDecodeError as e:
+ self._log_event(
+ category=EventCategory.NETWORK,
+ description="Failed to parse rdma link -j JSON",
+ data={"exception": get_exception_traceback(e)},
+ priority=EventPriority.WARNING,
+ )
+ return None
+ if not isinstance(parsed, list):
+ self._log_event(
+ category=EventCategory.NETWORK,
+ description="Unexpected rdma link -j JSON type",
+ data={"data_type": type(parsed).__name__},
+ priority=EventPriority.WARNING,
+ )
+ return None
+ return parsed
+
+ def _collect_rdma_scoped_ethtool_statistic(
+ self, netdev: str, ifname: str
+ ) -> Optional[EthtoolStatistics]:
+ """Run `ethtool -S` for netdev and attach vendor-parsed stats (prefix from RDMA ifname)."""
+ cmd_s = f"ethtool -S {netdev}"
+ res = self._run_sut_cmd(cmd_s, sudo=True)
+ if res.exit_code != 0:
+ self._log_event(
+ category=EventCategory.NETWORK,
+ description=f"Error executing ethtool -S for device {netdev}",
+ data={
+ "command": cmd_s,
+ "exit_code": res.exit_code,
+ "stderr": res.stderr,
+ },
+ priority=EventPriority.ERROR,
+ console_log=True,
+ )
+ return None
+
+ if res.stdout:
+ self.result.artifacts.append(
+ TextFileArtifact(
+ filename=f"rdma-ethtool-{netdev}.log",
+ contents=res.stdout,
+ )
+ )
+ stats_dict = self._parse_ethtool_statistics(res.stdout, netdev)
+
+ vendor_stats: Optional[VendorEthtoolStatisticsModel] = None
+ for prefix, vendor_cls in VENDOR_PREFIX_MAP.items():
+ if ifname.startswith(prefix):
+ vendor_fields = set(vendor_cls.model_fields.keys())
+ stat_fields = set(stats_dict.keys()) - {"netdev"}
+
+ missing_fields = vendor_fields - stat_fields
+ if missing_fields:
+ sorted_missing = sorted(missing_fields)
+ self._log_event(
+ category=EventCategory.NETWORK,
+ description=f"Missing fields in ethtool statistic for {netdev}",
+ data={
+ "netdev": netdev,
+ "ifname": ifname,
+ "missing_fields_count": len(sorted_missing),
+ "missing_fields": sorted_missing[:50],
+ },
+ priority=EventPriority.WARNING,
+ )
+
+ filtered_stats = {k: v for k, v in stats_dict.items() if k in vendor_fields}
+ try:
+ vendor_stats = vendor_cls.model_validate(filtered_stats)
+ except ValidationError as ve:
+ self._log_event(
+ category=EventCategory.NETWORK,
+ description=f"Failed to build vendor ethtool model for {netdev}",
+ data={"exception": get_exception_traceback(ve)},
+ priority=EventPriority.WARNING,
+ )
+ break
+
+ return EthtoolStatistics(
+ netdev=netdev,
+ rdma_ifname=ifname or None,
+ vendor_statistics=vendor_stats,
+ )
+
+ def _collect_rdma_scoped_ethtool(self) -> tuple[List[str], List[EthtoolStatistics]]:
+ """Collect ethtool -S for netdevs listed on RDMA links (error-scraper EthtoolCollector parity)."""
+ netdev_list: List[str] = []
+ statistics_list: List[EthtoolStatistics] = []
+
+ link_data = self._collect_rdma_link_json()
+ if link_data is None:
+ return netdev_list, statistics_list
+
+ for link in link_data:
+ if not isinstance(link, dict):
+ self._log_event(
+ category=EventCategory.NETWORK,
+ description="Invalid data type for RDMA link entry",
+ data={"data_type": type(link).__name__},
+ priority=EventPriority.WARNING,
+ )
+ continue
+
+ netdev = link.get("netdev") or ""
+ ifname = link.get("ifname") or ""
+
+ if netdev:
+ netdev_list.append(netdev)
+ stat = self._collect_rdma_scoped_ethtool_statistic(netdev, ifname)
+ if stat is not None:
+ statistics_list.append(stat)
+
+ if netdev_list:
+ self._log_event(
+ category=EventCategory.NETWORK,
+ description=(
+ f"Collected RDMA-scoped ethtool -S for {len(statistics_list)}/"
+ f"{len(netdev_list)} netdev(s) from rdma link"
+ ),
+ priority=EventPriority.INFO,
+ )
+
+ return netdev_list, statistics_list
+
def _collect_lldp_info(self) -> None:
"""Collect LLDP information using lldpcli and lldpctl commands."""
# Run lldpcli show neighbor
@@ -618,6 +773,8 @@ def collect_data(
neighbors = []
ethtool_data = {}
network_accessible: Optional[bool] = None
+ rdma_ethtool_netdevs: List[str] = []
+ rdma_ethtool_statistics: List[EthtoolStatistics] = []
# Check network connectivity if URL is provided
if args and args.url:
@@ -662,6 +819,9 @@ def collect_data(
priority=EventPriority.INFO,
)
+ if self.system_info.os_family == OSFamily.LINUX:
+ rdma_ethtool_netdevs, rdma_ethtool_statistics = self._collect_rdma_scoped_ethtool()
+
# Collect routing table
res_route = self._run_sut_cmd(self.CMD_ROUTE)
if res_route.exit_code == 0:
@@ -724,6 +884,8 @@ def collect_data(
rules=rules,
neighbors=neighbors,
ethtool_info=ethtool_data,
+ rdma_ethtool_netdevs=rdma_ethtool_netdevs,
+ rdma_ethtool_statistics=rdma_ethtool_statistics,
accessible=network_accessible,
)
self.result.status = ExecutionStatus.OK
diff --git a/nodescraper/plugins/inband/network/networkdata.py b/nodescraper/plugins/inband/network/networkdata.py
index 20caaeca..c90a1fc1 100644
--- a/nodescraper/plugins/inband/network/networkdata.py
+++ b/nodescraper/plugins/inband/network/networkdata.py
@@ -29,6 +29,8 @@
from nodescraper.models import DataModel
+from .ethtool_vendor import EthtoolStatistics
+
class IpAddress(BaseModel):
"""Individual IP address on an interface"""
@@ -117,4 +119,7 @@ class NetworkDataModel(DataModel):
ethtool_info: Dict[str, EthtoolInfo] = Field(
default_factory=dict
) # Interface name -> EthtoolInfo mapping
+ # RDMA-scoped ethtool -S: netdevs from `rdma link -j` with vendor-parsed counters
+ rdma_ethtool_netdevs: List[str] = Field(default_factory=list)
+ rdma_ethtool_statistics: List[EthtoolStatistics] = Field(default_factory=list)
accessible: Optional[bool] = None # Network accessibility check via ping
diff --git a/nodescraper/plugins/inband/nic/nic_collector.py b/nodescraper/plugins/inband/nic/nic_collector.py
index 021fa3e3..c9c0b606 100644
--- a/nodescraper/plugins/inband/nic/nic_collector.py
+++ b/nodescraper/plugins/inband/nic/nic_collector.py
@@ -354,7 +354,9 @@ class NicCollector(InBandDataCollector[NicDataModel, NicCollectorArgs]):
CMD_NICCLI_QOS_TEMPLATE_LEGACY,
]
# New (> v233): double-dash options and qos --ets --show
- CMD_NICCLI_SUPPORT_RDMA_TEMPLATE_NEW = "niccli --dev {device_num} nvm --getoption support_rdma"
+ CMD_NICCLI_SUPPORT_RDMA_TEMPLATE_NEW = (
+ "niccli --dev {device_num} nvm --getoption support_rdma --scope 0"
+ )
CMD_NICCLI_PERFORMANCE_PROFILE_TEMPLATE_NEW = (
"niccli --dev {device_num} nvm --getoption performance_profile"
)
@@ -471,6 +473,19 @@ def collect_data(
card_ids = [c.id for c in legacy_cards]
card_list_from_text = [c.model_dump() for c in legacy_cards]
+ if custom_commands is None and not device_nums and not card_ids:
+ self._log_event(
+ category=EventCategory.NETWORK,
+ description="No Broadcom (niccli) or Pensando (nicctl) NIC hardware detected",
+ priority=EventPriority.INFO,
+ )
+ self.result.status = ExecutionStatus.NOT_RAN
+ self.result.message = (
+ "No Broadcom (niccli) or Pensando (nicctl) NIC hardware detected; "
+ "NIC collection skipped"
+ )
+ return self.result, None
+
# Build full command list (expand placeholders)
if custom_commands is not None:
commands_to_run: List[str] = []
@@ -486,18 +501,19 @@ def collect_data(
else:
commands_to_run = []
# niccli list already stored
- per_device_templates = _get_niccli_per_device_templates(niccli_version)
- for tpl in per_device_templates:
- for d in device_nums:
- commands_to_run.append(tpl.format(device_num=d))
- # nicctl global (card discovery already done via CMD_NICCTL_CARD_TEXT)
- for c in NicCollector.CMD_NICCTL_GLOBAL:
- commands_to_run.append(c)
- for tpl in NicCollector.CMD_NICCTL_PER_CARD:
- for cid in card_ids:
- commands_to_run.append(tpl.format(card_id=cid))
- for cmd in NicCollector.CMD_NICCTL_LEGACY_TEXT:
- commands_to_run.append(cmd)
+ if device_nums:
+ per_device_templates = _get_niccli_per_device_templates(niccli_version)
+ for tpl in per_device_templates:
+ for d in device_nums:
+ commands_to_run.append(tpl.format(device_num=d))
+ if card_ids:
+ for c in NicCollector.CMD_NICCTL_GLOBAL:
+ commands_to_run.append(c)
+ for tpl in NicCollector.CMD_NICCTL_PER_CARD:
+ for cid in card_ids:
+ commands_to_run.append(tpl.format(card_id=cid))
+ for cmd in NicCollector.CMD_NICCTL_LEGACY_TEXT:
+ commands_to_run.append(cmd)
# Run each command and store (artifact-only commands are not added to results / data model).
for cmd in commands_to_run:
diff --git a/nodescraper/plugins/inband/pcie/pcie_analyzer.py b/nodescraper/plugins/inband/pcie/pcie_analyzer.py
index 7d9a7e58..43bf0213 100755
--- a/nodescraper/plugins/inband/pcie/pcie_analyzer.py
+++ b/nodescraper/plugins/inband/pcie/pcie_analyzer.py
@@ -53,6 +53,9 @@
T_CAP = TypeVar("T_CAP", bound=PcieCapStructure)
+_AMD_PCIE_BRIDGE_DEVICE_IDS = frozenset({0x1500, 0x1501})
+_PCI_BASE_CLASS_BRIDGE = 0x06
+
class PcieAnalyzerInputModel(BaseModel):
"""
@@ -870,6 +873,20 @@ def filter_pcie_data_by_device_id(
new_cfg_space_dict[bdf] = pcie_data
return new_cfg_space_dict
+ @staticmethod
+ def _is_amd_gpu_pcie_endpoint(cfg_space: PcieCfgSpace, vendorid_ep: int) -> bool:
+ """True if this config space is an AMD GPU/accelerator endpoint, not a bridge."""
+ t0 = cfg_space.type_0_configuration
+ if t0.vendor_id.val != vendorid_ep:
+ return False
+ device_id = t0.device_id.val
+ if device_id in _AMD_PCIE_BRIDGE_DEVICE_IDS:
+ return False
+ base_class = t0.class_code.val
+ if base_class == _PCI_BASE_CLASS_BRIDGE:
+ return False
+ return True
+
def check_gpu_count(
self,
pcie_data: PcieDataModel,
@@ -888,10 +905,15 @@ def check_gpu_count(
return
gpu_count_from_pcie = 0
+ bridge_count = 0
for cfg_space in pcie_data.pcie_cfg_space.values():
- vendor_id = cfg_space.type_0_configuration.vendor_id.val
- if vendor_id == self.system_info.vendorid_ep:
+ t0 = cfg_space.type_0_configuration
+ if t0.vendor_id.val != self.system_info.vendorid_ep:
+ continue
+ if self._is_amd_gpu_pcie_endpoint(cfg_space, self.system_info.vendorid_ep):
gpu_count_from_pcie += 1
+ else:
+ bridge_count += 1
if gpu_count_from_pcie != expected_gpu_count:
self._log_event(
@@ -900,6 +922,7 @@ def check_gpu_count(
priority=EventPriority.ERROR,
data={
"gpu_count_from_pcie": gpu_count_from_pcie,
+ "amd_pcie_bridge_count_excluded": bridge_count,
"expected_gpu_count": expected_gpu_count,
},
)
@@ -910,6 +933,7 @@ def check_gpu_count(
priority=EventPriority.INFO,
data={
"gpu_count": gpu_count_from_pcie,
+ "amd_pcie_bridge_count_excluded": bridge_count,
},
)
diff --git a/nodescraper/plugins/inband/rocm/rocm_collector.py b/nodescraper/plugins/inband/rocm/rocm_collector.py
index 7b910a69..27a6c4f5 100644
--- a/nodescraper/plugins/inband/rocm/rocm_collector.py
+++ b/nodescraper/plugins/inband/rocm/rocm_collector.py
@@ -25,6 +25,8 @@
###############################################################################
from typing import Optional
+from pydantic import ValidationError
+
from nodescraper.base import InBandDataCollector
from nodescraper.connection.inband import TextFileArtifact
from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus, OSFamily
@@ -41,7 +43,7 @@ class RocmCollector(InBandDataCollector[RocmDataModel, RocmCollectorArgs]):
SUPPORTED_OS_FAMILY: set[OSFamily] = {OSFamily.LINUX}
DATA_MODEL = RocmDataModel
- CMD_ROCM_SUB_VERSIONS_TMPL = "grep . -r {rocm_path}/.info/*"
+ CMD_ROCM_SUB_VERSIONS_TMPL = "grep . -H -r -i {rocm_path}/.info/*"
CMD_ROCMINFO_TMPL = "{rocm_path}/bin/rocminfo"
CMD_ROCM_LATEST_TMPL = "ls -v -d {rocm_path}-[3-7]* | tail -1"
CMD_ROCM_DIRS_TMPL = "ls -v -d {rocm_path}*"
@@ -99,7 +101,7 @@ def collect_data(
self.result.message = f"ROCm version: {rocm_data.rocm_version}"
self.result.status = ExecutionStatus.OK
break
- except ValueError as e:
+ except (ValueError, ValidationError) as e:
self._log_event(
category=EventCategory.OS,
description=f"Invalid ROCm version format: {res.stdout}",
diff --git a/nodescraper/plugins/inband/rocm/rocmdata.py b/nodescraper/plugins/inband/rocm/rocmdata.py
index c7e75608..cd1b0537 100644
--- a/nodescraper/plugins/inband/rocm/rocmdata.py
+++ b/nodescraper/plugins/inband/rocm/rocmdata.py
@@ -24,14 +24,18 @@
#
###############################################################################
import re
-from typing import List
+from typing import ClassVar, List, Optional
-from pydantic import field_validator
+from pydantic import computed_field, field_validator
from nodescraper.models import DataModel
+_ROCM_VERSION_RE = re.compile(r"^(\d+(?:\.\d+){0,3})(?:-(\d+)(?:-gfx\w+(?:;gfx\w+)*)?)?$")
+
class RocmDataModel(DataModel):
+ ROCM_VERSION_FILENAME: ClassVar[str] = "version-rocm"
+
rocm_version: str
rocm_sub_versions: dict[str, str] = {}
rocminfo: List[str] = []
@@ -43,21 +47,28 @@ class RocmDataModel(DataModel):
clinfo: List[str] = []
kfd_proc: List[str] = []
+ @staticmethod
+ def _validate_version_string(version: str) -> str:
+ if not _ROCM_VERSION_RE.match(version):
+ raise ValueError(f"ROCm version has invalid format: {version}")
+ return version
+
@field_validator("rocm_version")
@classmethod
def validate_rocm_version(cls, rocm_version: str) -> str:
- """
- Validate the ROCm version format.
+ return cls._validate_version_string(rocm_version)
- Args:
- rocm_version (str): The ROCm version string to validate.
-
- Raises:
- ValueError: If the ROCm version does not match the expected format.
+ @field_validator("rocm_sub_versions")
+ @classmethod
+ def validate_rocm_sub_versions(cls, sub_versions: dict[str, str]) -> dict[str, str]:
+ for version in sub_versions.values():
+ cls._validate_version_string(version)
+ return sub_versions
- Returns:
- str: The validated ROCm version string.
- """
- if not re.match(r"^\d+(?:\.\d+){0,3}(-\d+)?$", rocm_version):
- raise ValueError(f"ROCm version has invalid format: {rocm_version}")
- return rocm_version
+ @computed_field
+ def build_number(self) -> Optional[str]:
+ """Build tag from version-rocm sub-version, or rocm_version when absent."""
+ rocm_version = self.rocm_sub_versions.get(self.ROCM_VERSION_FILENAME, self.rocm_version)
+ if "-" in rocm_version:
+ return rocm_version.split("-")[1]
+ return None
diff --git a/nodescraper/plugins/ooband/redfish_endpoint/collector_args.py b/nodescraper/plugins/ooband/redfish_endpoint/collector_args.py
index 662caffa..55bb4269 100644
--- a/nodescraper/plugins/ooband/redfish_endpoint/collector_args.py
+++ b/nodescraper/plugins/ooband/redfish_endpoint/collector_args.py
@@ -55,6 +55,16 @@ class RedfishEndpointCollectorArgs(BaseModel):
le=32,
description="Max concurrent GETs (1=sequential). Use >1 for async endpoint fetches.",
)
+ follow_next_link: bool = Field(
+ default=False,
+ description="If True, follow Members@odata.nextLink pagination for each URI and merge all pages into a single response.",
+ )
+ max_pages: int = Field(
+ default=200,
+ ge=1,
+ le=10_000, # Some arbitrary value - may need to be revisited
+ description="When follow_next_link is True: safety cap on the number of pages to follow per URI (default 200).",
+ )
@field_validator("uris", mode="before")
@classmethod
diff --git a/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py b/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py
index 0a1305a2..e0878c1a 100644
--- a/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py
+++ b/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py
@@ -29,16 +29,18 @@
from urllib.parse import urlparse
from nodescraper.base import RedfishDataCollector
-from nodescraper.connection.redfish import RedfishConnection, RedfishGetResult
+from nodescraper.connection.redfish import (
+ RF_MEMBERS,
+ RF_ODATA_ID,
+ RedfishConnection,
+ RedfishGetResult,
+)
from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus
from nodescraper.models import TaskResult
from .collector_args import RedfishEndpointCollectorArgs
from .endpoint_data import RedfishEndpointDataModel
-ODATA_ID = "@odata.id"
-MEMBERS = "Members"
-
def _normalize_path(odata_id: str, api_root: str) -> str:
"""Convert @odata.id value (URL or path) to a normalized path under api_root."""
@@ -61,17 +63,17 @@ def _extract_odata_ids(obj: Any) -> list[str]:
"""Recursively extract all @odata.id values from a Redfish JSON body."""
out: list[str] = []
if isinstance(obj, dict):
- if ODATA_ID in obj and isinstance(obj[ODATA_ID], str):
- out.append(obj[ODATA_ID])
+ if RF_ODATA_ID in obj and isinstance(obj[RF_ODATA_ID], str):
+ out.append(obj[RF_ODATA_ID])
for k, v in obj.items():
- if k == MEMBERS and isinstance(v, list):
+ if k == RF_MEMBERS and isinstance(v, list):
for item in v:
if (
isinstance(item, dict)
- and ODATA_ID in item
- and isinstance(item[ODATA_ID], str)
+ and RF_ODATA_ID in item
+ and isinstance(item[RF_ODATA_ID], str)
):
- out.append(item[ODATA_ID])
+ out.append(item[RF_ODATA_ID])
elif isinstance(v, dict):
out.extend(_extract_odata_ids(v))
elif isinstance(v, list):
@@ -136,6 +138,13 @@ def _fetch_one(connection_copy: RedfishConnection, path: str) -> RedfishGetResul
return connection_copy.run_get(path)
+def _fetch_one_paged(
+ connection_copy: RedfishConnection, path: str, max_pages: int
+) -> RedfishGetResult:
+ """Run a paged GET on a connection copy, following Members@odata.nextLink (used from worker threads)."""
+ return connection_copy.run_get_paged(path, max_pages=max_pages)
+
+
class RedfishEndpointCollector(
RedfishDataCollector[RedfishEndpointDataModel, RedfishEndpointCollectorArgs]
):
@@ -199,12 +208,17 @@ def collect_data(
max_workers = getattr(args, "max_workers", 1) if args else 1
max_workers = min(max_workers, len(paths))
+ follow_next_link = getattr(args, "follow_next_link", False) is True
+ max_pages = getattr(args, "max_pages", 200) if args else 200
if max_workers <= 1:
# Sequential
responses = {}
for path in paths:
- res = self._run_redfish_get(path, log_artifact=True)
+ if follow_next_link:
+ res = self._run_redfish_get_paged(path, max_pages=max_pages, log_artifact=True)
+ else:
+ res = self._run_redfish_get(path, log_artifact=True)
if res.success and res.data is not None:
responses[res.path] = res.data
else:
@@ -222,7 +236,10 @@ def collect_data(
futures = {}
for path in paths:
conn = self.connection.copy()
- futures[executor.submit(_fetch_one, conn, path)] = path
+ if follow_next_link:
+ futures[executor.submit(_fetch_one_paged, conn, path, max_pages)] = path
+ else:
+ futures[executor.submit(_fetch_one, conn, path)] = path
for future in as_completed(futures):
path = futures[future]
try:
diff --git a/nodescraper/serialization.py b/nodescraper/serialization.py
new file mode 100644
index 00000000..64d4f19e
--- /dev/null
+++ b/nodescraper/serialization.py
@@ -0,0 +1,72 @@
+###############################################################################
+#
+# MIT License
+#
+# Copyright (C) 2026 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+###############################################################################
+
+from __future__ import annotations
+
+import json
+from typing import Any, cast
+
+from pydantic import BaseModel
+
+__all__ = ["safe_dump_to_json_dict"]
+
+
+def safe_dump_to_json_dict(
+ model: BaseModel,
+ *,
+ exclude: set[str] | frozenset[str] | None = None,
+ by_alias: bool = True,
+) -> dict[str, Any]:
+ """Best-effort JSON-like ``dict`` from a Pydantic model.
+
+ Args:
+ model: Model instance to export.
+ exclude: Field names to omit (same shape as Pydantic ``exclude`` for sets).
+ by_alias: When ``True``, use field aliases in the output.
+
+ Returns:
+ A plain ``dict``: the parsed ``model_dump_json`` output, or a python-mode ``model_dump`` result if that fails.
+ """
+ ex: set[str] | frozenset[str] | None = exclude
+ ex_inc = cast(Any, ex)
+ try:
+ raw = model.model_dump_json(
+ by_alias=by_alias,
+ exclude=ex_inc,
+ serialize_as_any=True,
+ )
+ return json.loads(raw)
+ except Exception as first_exc:
+ try:
+ dumped = model.model_dump(
+ mode="python",
+ by_alias=by_alias,
+ exclude=ex_inc,
+ serialize_as_any=True,
+ )
+ except Exception as second_exc:
+ raise second_exc from first_exc
+ return dumped
diff --git a/nodescraper/utils.py b/nodescraper/utils.py
index 3b9edf34..e7a201b8 100644
--- a/nodescraper/utils.py
+++ b/nodescraper/utils.py
@@ -53,18 +53,20 @@ def _generate_next_value_(name, start, count, last_values):
return name
-def get_exception_traceback(exception: Exception) -> dict:
+def get_exception_traceback(exception: BaseException) -> dict:
"""get traceback and exception type from an exception
Args:
- exception (Exception): exception
+ exception (BaseException): exception
Returns:
- dict: exception details dict
+ dict: exception details dict ("traceback" holds the full format_exception output lines, not only the stack frames)
"""
return {
"exception_type": type(exception).__name__,
- "traceback": traceback.format_tb(exception.__traceback__),
+ "traceback": traceback.format_exception(
+ type(exception), exception, exception.__traceback__
+ ),
}
diff --git a/test/unit/connection/redfish/test_redfish_connection_paging.py b/test/unit/connection/redfish/test_redfish_connection_paging.py
new file mode 100644
index 00000000..f922709c
--- /dev/null
+++ b/test/unit/connection/redfish/test_redfish_connection_paging.py
@@ -0,0 +1,160 @@
+###############################################################################
+#
+# MIT License
+#
+# Copyright (C) 2026 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+###############################################################################
+from unittest.mock import patch
+
+import pytest
+
+from nodescraper.connection.redfish import (
+ RF_MEMBERS,
+ RF_MEMBERS_COUNT,
+ RF_MEMBERS_NEXT_LINK,
+ RedfishConnection,
+ RedfishGetResult,
+)
+
+
+@pytest.fixture
+def rf_conn() -> RedfishConnection:
+ return RedfishConnection(
+ base_url="https://bmc.example",
+ username="u",
+ password="p",
+ verify_ssl=False,
+ )
+
+
+def test_run_get_paged_no_next_link_returns_first_unchanged(rf_conn: RedfishConnection) -> None:
+ first_body = {RF_MEMBERS: [{"x": 1}], "Name": "Col"}
+ first = RedfishGetResult(
+ path="/redfish/v1/Systems",
+ success=True,
+ data=first_body,
+ status_code=200,
+ )
+ with patch.object(rf_conn, "run_get", return_value=first) as mock_get:
+ out = rf_conn.run_get_paged("/redfish/v1/Systems")
+
+ mock_get.assert_called_once()
+ assert out.success is True
+ assert out.data == first_body
+ assert RF_MEMBERS_NEXT_LINK not in out.data
+
+
+def test_run_get_paged_merges_members_and_strips_next_link(rf_conn: RedfishConnection) -> None:
+ page1 = {
+ RF_MEMBERS: [{"@odata.id": "/1"}],
+ RF_MEMBERS_NEXT_LINK: "/redfish/v1/Systems?$skip=1",
+ f"{RF_MEMBERS}@odata.count": 99,
+ }
+ page2 = {
+ RF_MEMBERS: [{"@odata.id": "/2"}],
+ }
+
+ def fake_get(path: str) -> RedfishGetResult:
+ p = str(path).strip()
+ if not p.startswith("/"):
+ p = "/" + p
+ if "skip" not in p:
+ return RedfishGetResult(
+ path="/redfish/v1/Systems", success=True, data=page1, status_code=200
+ )
+ return RedfishGetResult(path=p, success=True, data=page2, status_code=200)
+
+ with patch.object(rf_conn, "run_get", side_effect=fake_get):
+ out = rf_conn.run_get_paged("/redfish/v1/Systems", max_pages=10)
+
+ assert out.success is True
+ assert out.path == "/redfish/v1/Systems"
+ assert out.data is not None
+ assert out.data[RF_MEMBERS] == [{"@odata.id": "/1"}, {"@odata.id": "/2"}]
+ assert out.data[RF_MEMBERS_COUNT] == 2
+ assert RF_MEMBERS_NEXT_LINK not in out.data
+
+
+def test_run_get_paged_stops_on_followup_failure_keeps_partial_merge(
+ rf_conn: RedfishConnection,
+) -> None:
+ page1 = {
+ RF_MEMBERS: [{"@odata.id": "/1"}],
+ RF_MEMBERS_NEXT_LINK: "/next",
+ }
+
+ def fake_get(path: str) -> RedfishGetResult:
+ ps = str(path)
+ if "next" not in ps:
+ return RedfishGetResult(path="/col", success=True, data=page1, status_code=200)
+ return RedfishGetResult(path="/next", success=False, error="timeout", status_code=None)
+
+ with patch.object(rf_conn, "run_get", side_effect=fake_get):
+ out = rf_conn.run_get_paged("/col")
+
+ assert out.success is True
+ assert out.data is not None
+ assert out.data[RF_MEMBERS] == [{"@odata.id": "/1"}]
+ assert RF_MEMBERS_NEXT_LINK not in out.data
+
+
+def test_run_get_paged_respects_max_pages(rf_conn: RedfishConnection) -> None:
+ """max_pages=2 allows initial GET plus one nextLink follow only."""
+
+ def body_with_next(mid: str) -> dict:
+ return {
+ RF_MEMBERS: [{"id": mid}],
+ RF_MEMBERS_NEXT_LINK: "/page2",
+ }
+
+ calls: list[str] = []
+
+ def fake_get(path: str) -> RedfishGetResult:
+ calls.append(str(path))
+ ps = str(path)
+ if len(calls) == 1:
+ return RedfishGetResult(
+ path="/start", success=True, data=body_with_next("a"), status_code=200
+ )
+ return RedfishGetResult(path=ps, success=True, data=body_with_next("b"), status_code=200)
+
+ with patch.object(rf_conn, "run_get", side_effect=fake_get):
+ out = rf_conn.run_get_paged("/start", max_pages=2)
+
+ assert len(calls) == 2
+ assert out.data is not None
+ assert len(out.data[RF_MEMBERS]) == 2
+ assert RF_MEMBERS_NEXT_LINK not in out.data
+
+
+def test_run_get_paged_first_request_failure_passthrough(rf_conn: RedfishConnection) -> None:
+ err = RedfishGetResult(
+ path="/redfish/v1/Bad",
+ success=False,
+ error="nope",
+ status_code=404,
+ )
+ with patch.object(rf_conn, "run_get", return_value=err):
+ out = rf_conn.run_get_paged("/redfish/v1/Bad")
+
+ assert out is err
+ assert out.success is False
diff --git a/test/unit/framework/common/shared_utils.py b/test/unit/framework/common/shared_utils.py
index 1ffda40d..11e6f541 100644
--- a/test/unit/framework/common/shared_utils.py
+++ b/test/unit/framework/common/shared_utils.py
@@ -26,6 +26,7 @@
from typing import Optional
from unittest.mock import MagicMock
+from nodescraper.constants import DEFAULT_EVENT_REPORTER
from nodescraper.enums import ExecutionStatus
from nodescraper.interfaces import ConnectionManager, PluginInterface
from nodescraper.models import AnalyzerArgs, PluginResult, TaskResult
@@ -43,6 +44,9 @@ def __init__(
parent=None,
task_result_hooks=None,
connection_args=None,
+ event_reporter: str = DEFAULT_EVENT_REPORTER,
+ session_id: Optional[str] = None,
+ **kwargs,
):
super().__init__(
system_info=system_info,
@@ -50,6 +54,9 @@ def __init__(
parent=parent,
task_result_hooks=task_result_hooks,
connection_args=connection_args,
+ event_reporter=event_reporter,
+ session_id=session_id,
+ **kwargs,
)
# Use the class variable if available, otherwise create a new MagicMock
self.connection = (
diff --git a/test/unit/framework/test_datacollector.py b/test/unit/framework/test_datacollector.py
index 30fde48f..410b4d85 100644
--- a/test/unit/framework/test_datacollector.py
+++ b/test/unit/framework/test_datacollector.py
@@ -154,6 +154,17 @@ class RestrictedSkuCollector(DummyCollector):
assert res.status == ExecutionStatus.OK
+def test_supported_skus_coerce_non_string_items(conn_mock):
+    """An int entry in SUPPORTED_SKUS (42) must still accept the string SKU "42"."""
+
+    class IntSkuCollector(DummyCollector):
+        SUPPORTED_SKUS = {42}
+
+    system = SystemInfo(name="h", sku="42", platform="X", os_family=1)
+    outcome, _ = IntSkuCollector(system, conn_mock).collect_data()
+    assert outcome.status == ExecutionStatus.OK
+
+
def test_missing_data_model():
with pytest.raises(TypeError, match="No data model set for DummyCollector1"):
diff --git a/test/unit/framework/test_dataplugin.py b/test/unit/framework/test_dataplugin.py
index c6e5cb48..e88f8cc5 100644
--- a/test/unit/framework/test_dataplugin.py
+++ b/test/unit/framework/test_dataplugin.py
@@ -23,10 +23,12 @@
# SOFTWARE.
#
###############################################################################
+import json
+from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
-from common.shared_utils import MockConnectionManager
+from framework.common.shared_utils import MockConnectionManager
from nodescraper.enums import EventPriority, ExecutionStatus, SystemInteractionLevel
from nodescraper.interfaces.dataanalyzertask import DataAnalyzer
@@ -151,6 +153,8 @@ def test_collect_creates_connection_manager(self, plugin, conn_mock, system_info
logger=plugin.logger,
parent=plugin.__class__.__name__,
task_result_hooks=plugin.task_result_hooks,
+ event_reporter=plugin.event_reporter,
+ session_id=plugin.session_id,
)
mock_collect.assert_called_once()
assert result.status == ExecutionStatus.OK
@@ -403,3 +407,139 @@ def test_analyze_no_data_available(self, plugin_with_conn):
assert result.status == ExecutionStatus.NOT_RAN
assert "No data available" in result.message
+
+
+class ContentModel(StandardDataModel):  # test model whose compare content is its value
+    def get_compare_content(self) -> str:
+        return self.value
+
+
+class ErrMatchAnalyzer(DataAnalyzer):  # stub analyzer: always OK, fixed error matches
+    DATA_MODEL = ContentModel
+
+    def analyze_data(self, data, args=None):
+        return TaskResult(status=ExecutionStatus.OK)
+
+    @staticmethod
+    def get_error_matches(content: str) -> list[str]:
+        return ["z", "a"] if content else []  # unsorted on purpose: tests expect ["a", "z"]
+
+
+class ContentCollector(DataCollector):  # stub collector: always OK with ContentModel(value="x")
+    DATA_MODEL = ContentModel
+
+    def collect_data(self, args=None):
+        return TaskResult(status=ExecutionStatus.OK), ContentModel(value="x")
+
+
+class ExtractPlugin(DataPlugin):  # plugin under test, wired to the stub collector/analyzer
+    DATA_MODEL = ContentModel
+    CONNECTION_TYPE = MockConnectionManager
+    COLLECTOR = ContentCollector
+    ANALYZER = ErrMatchAnalyzer
+
+
+class LogImportModel(ContentModel):
+    @classmethod
+    def import_model(cls, model_input):
+        if not (isinstance(model_input, str) and model_input.endswith(".log")):
+            return super().import_model(model_input)
+        return cls(value=Path(model_input).read_text(encoding="utf-8"))  # raw log text
+
+
+class LogImportPlugin(DataPlugin):  # plugin whose data model can import ".log" files
+    DATA_MODEL = LogImportModel
+    CONNECTION_TYPE = MockConnectionManager
+    COLLECTOR = ContentCollector
+    ANALYZER = ErrMatchAnalyzer
+
+
+class TestDataPluginRunPaths:
+    """Locating, loading and summarising datamodel files from saved runs or direct paths."""
+
+    @staticmethod
+    def _make_run(root: Path, result_text: str, model_text: str) -> None:
+        """Write result.json and contentmodel.json under extract_plugin/content_collector."""
+        collector_dir = root / "extract_plugin" / "content_collector"
+        collector_dir.mkdir(parents=True)
+        (collector_dir / "result.json").write_text(result_text, encoding="utf-8")
+        (collector_dir / "contentmodel.json").write_text(model_text, encoding="utf-8")
+
+    def test_find_datamodel_path_requires_directory(self) -> None:
+        """A run path that does not exist yields no datamodel path."""
+        assert CoreDataPlugin.find_datamodel_path_in_run("/no/such/run") is None
+
+    def test_find_datamodel_path_success(self, tmp_path: Path) -> None:
+        """A collector dir whose result.json names this plugin as parent is found."""
+        result_text = json.dumps({"parent": "ExtractPlugin"})
+        self._make_run(tmp_path, result_text, json.dumps({"value": "from_run"}))
+
+        located = ExtractPlugin.find_datamodel_path_in_run(str(tmp_path))
+        assert located is not None
+        assert located.endswith("contentmodel.json")
+
+    def test_find_datamodel_path_wrong_parent(self, tmp_path: Path) -> None:
+        """A result.json recorded for a different parent plugin is not matched."""
+        self._make_run(tmp_path, json.dumps({"parent": "OtherPlugin"}), "{}")
+        assert ExtractPlugin.find_datamodel_path_in_run(str(tmp_path)) is None
+
+    def test_find_datamodel_path_invalid_result_json(self, tmp_path: Path) -> None:
+        """An unparseable result.json yields None."""
+        self._make_run(tmp_path, "{not json", "{}")
+        assert ExtractPlugin.find_datamodel_path_in_run(str(tmp_path)) is None
+
+    def test_load_datamodel_from_path_json(self, tmp_path: Path) -> None:
+        """A JSON file is loaded into the plugin's data model."""
+        model_file = tmp_path / "dm.json"
+        model_file.write_text(json.dumps({"value": "file"}), encoding="utf-8")
+        loaded = ExtractPlugin.load_datamodel_from_path(str(model_file))
+        assert isinstance(loaded, ContentModel)
+        assert loaded.value == "file"
+
+    def test_load_datamodel_from_path_missing(self) -> None:
+        """A nonexistent file path yields None."""
+        assert ExtractPlugin.load_datamodel_from_path("/nonexistent/x.json") is None
+
+    def test_load_datamodel_from_path_log_with_custom_import(self, tmp_path: Path) -> None:
+        """A .log file loads when the data model overrides import_model to read it."""
+        log_file = tmp_path / "capture.log"
+        log_file.write_text("from_log", encoding="utf-8")
+        loaded = LogImportPlugin.load_datamodel_from_path(str(log_file))
+        assert isinstance(loaded, LogImportModel)
+        assert loaded.value == "from_log"
+
+    def test_load_datamodel_from_path_log_without_override_returns_none(
+        self, tmp_path: Path
+    ) -> None:
+        """A .log file yields None when the data model has no import_model override."""
+        log_file = tmp_path / "plain.log"
+        log_file.write_text("{}", encoding="utf-8")
+        assert ExtractPlugin.load_datamodel_from_path(str(log_file)) is None
+
+    def test_get_extracted_errors_sorted(self) -> None:
+        """Analyzer matches come back sorted (the stub emits them as "z", "a")."""
+        model = ContentModel(value="has text")
+        extracted = ExtractPlugin.get_extracted_errors(model)
+        assert extracted == ["a", "z"]
+
+    def test_get_extracted_errors_without_hooks(self) -> None:
+        """CoreDataPlugin with a plain StandardDataModel yields None."""
+        assert CoreDataPlugin.get_extracted_errors(StandardDataModel()) is None
+
+    def test_load_run_data_from_run_dir(self, tmp_path: Path) -> None:
+        """Loading from a run directory returns model fields plus extracted_errors."""
+        result_text = json.dumps({"parent": "ExtractPlugin"})
+        self._make_run(tmp_path, result_text, json.dumps({"value": "run"}))
+
+        run_data = ExtractPlugin.load_run_data(str(tmp_path))
+        assert run_data is not None
+        assert run_data["value"] == "run"
+        assert run_data["extracted_errors"] == ["a", "z"]
+
+    def test_load_run_data_direct_file(self, tmp_path: Path) -> None:
+        """A datamodel file path can be passed directly instead of a run directory."""
+        direct = tmp_path / "direct.json"
+        direct.write_text(json.dumps({"value": "direct"}), encoding="utf-8")
+        run_data = ExtractPlugin.load_run_data(str(direct))
+        assert run_data is not None
+        assert run_data["value"] == "direct"
diff --git a/test/unit/framework/test_plugin_executor.py b/test/unit/framework/test_plugin_executor.py
index 7ed75b93..fe9a8954 100644
--- a/test/unit/framework/test_plugin_executor.py
+++ b/test/unit/framework/test_plugin_executor.py
@@ -24,7 +24,7 @@
#
###############################################################################
import pytest
-from common.shared_utils import DummyDataModel, MockConnectionManager
+from framework.common.shared_utils import DummyDataModel, MockConnectionManager
from pydantic import BaseModel
from nodescraper.enums import ExecutionStatus
@@ -103,6 +103,11 @@ def test_config_merge(input_configs: list[PluginConfig], output_config: PluginCo
assert PluginExecutor.merge_configs(input_configs) == output_config
+def test_plugin_executor_rejects_invalid_session_id():
+    with pytest.raises(ValueError, match="session_id must be a valid UUID"):
+        PluginExecutor(plugin_configs=[], session_id="not-a-uuid")  # must raise on construction
+
+
def test_plugin_queue(plugin_registry):
executor = PluginExecutor(
plugin_configs=[PluginConfig(global_args={"test_arg": "abc"}, plugins={"TestPluginB": {}})],
diff --git a/test/unit/plugin/test_device_enumeration_collector.py b/test/unit/plugin/test_device_enumeration_collector.py
index 795611a6..50335f1f 100644
--- a/test/unit/plugin/test_device_enumeration_collector.py
+++ b/test/unit/plugin/test_device_enumeration_collector.py
@@ -66,7 +66,11 @@ def test_collect_linux(system_info, device_enumeration_collector):
exit_code=0,
stdout="8",
stderr="",
- command="lspci -d 1002: | grep -i 'VGA\\|Display\\|3D' | wc -l",
+ command=(
+ "lspci -d 1002: | grep -iE "
+ "'VGA|Display|3D|Processing accelerators|Co-processor|Accelerator' | "
+ "grep -vi 'Virtual Function' | wc -l"
+ ),
),
MagicMock(
exit_code=0,
@@ -142,7 +146,11 @@ def test_collect_error(system_info, device_enumeration_collector):
exit_code=1,
stdout="some output",
stderr="command failed",
- command="lspci -d 1002: | grep -i 'VGA\\|Display\\|3D' | wc -l",
+ command=(
+ "lspci -d 1002: | grep -iE "
+ "'VGA|Display|3D|Processing accelerators|Co-processor|Accelerator' | "
+ "grep -vi 'Virtual Function' | wc -l"
+ ),
),
MagicMock(
exit_code=1,
diff --git a/test/unit/plugin/test_network_analyzer.py b/test/unit/plugin/test_network_analyzer.py
index e886b765..6b7aeff3 100644
--- a/test/unit/plugin/test_network_analyzer.py
+++ b/test/unit/plugin/test_network_analyzer.py
@@ -27,6 +27,10 @@
from nodescraper.enums import EventPriority, ExecutionStatus
from nodescraper.plugins.inband.network.analyzer_args import NetworkAnalyzerArgs
+from nodescraper.plugins.inband.network.ethtool_vendor import (
+ EthtoolStatistics,
+ Thor2EthtoolStatistics,
+)
from nodescraper.plugins.inband.network.network_analyzer import NetworkAnalyzer
from nodescraper.plugins.inband.network.networkdata import (
EthtoolInfo,
@@ -158,13 +162,55 @@ def test_multiple_interfaces_with_errors(network_analyzer):
def test_empty_ethtool_info(network_analyzer):
- """Test with empty ethtool_info: WARNING and message logged."""
+ """Test with empty ethtool_info and no RDMA ethtool: WARNING and message logged."""
model = NetworkDataModel(ethtool_info={})
result = network_analyzer.analyze_data(model)
assert result.status == ExecutionStatus.WARNING
assert result.message == "No network devices found"
+def test_rdma_ethtool_vendor_error_only(network_analyzer):
+    """A non-zero error-tier vendor counter (tx_pfc_frames) on an RDMA netdev yields ERROR."""
+    vendor = Thor2EthtoolStatistics(tx_pfc_frames=4)
+    row = EthtoolStatistics(netdev="eth0", rdma_ifname="bnxt0", vendor_statistics=vendor)
+    model = NetworkDataModel(ethtool_info={}, rdma_ethtool_statistics=[row])
+
+    result = network_analyzer.analyze_data(model)
+
+    assert result.status == ExecutionStatus.ERROR
+    assert "Network errors detected" in result.message
+    assert len(result.events) == 1
+    event = result.events[0]
+    assert event.data["error_field"] == "tx_pfc_frames"
+    assert event.data["error_count"] == 4
+    assert event.priority == EventPriority.ERROR
+
+
+def test_rdma_ethtool_vendor_warning_only(network_analyzer):
+    """Only a warning-tier vendor counter (rx_pause_frames) is non-zero -> WARNING status."""
+    vendor = Thor2EthtoolStatistics(rx_pause_frames=2)
+    row = EthtoolStatistics(netdev="eth0", rdma_ifname="bnxt0", vendor_statistics=vendor)
+    model = NetworkDataModel(ethtool_info={}, rdma_ethtool_statistics=[row])
+
+    result = network_analyzer.analyze_data(model)
+
+    assert result.status == ExecutionStatus.WARNING
+    assert "warning counters" in result.message
+    assert len(result.events) == 1
+    event = result.events[0]
+    assert event.data["error_field"] == "rx_pause_frames"
+    assert event.priority == EventPriority.WARNING
+
+
+def test_rdma_ethtool_no_vendor_model_ok(network_analyzer):
+    """An RDMA row whose vendor_statistics is None produces OK and no events."""
+    bare_row = EthtoolStatistics(netdev="eth0", rdma_ifname="unknown0", vendor_statistics=None)
+    model = NetworkDataModel(ethtool_info={}, rdma_ethtool_statistics=[bare_row])
+    outcome = network_analyzer.analyze_data(model)
+    assert outcome.status == ExecutionStatus.OK
+    assert len(outcome.events) == 0
+
+
def test_regex_patterns_priority_numbers(network_analyzer):
"""Test that regex patterns match various priority numbers (0-7 and beyond)."""
ethtool = EthtoolInfo(
diff --git a/test/unit/plugin/test_network_collector.py b/test/unit/plugin/test_network_collector.py
index 6382adeb..a7b1faae 100644
--- a/test/unit/plugin/test_network_collector.py
+++ b/test/unit/plugin/test_network_collector.py
@@ -648,3 +648,44 @@ def run_sut_cmd_side_effect(cmd, **kwargs):
result, accessible = collector.check_network_accessibility()
assert result.status == ExecutionStatus.ERRORS_DETECTED
assert accessible is False
+
+
+def test_collect_data_includes_rdma_ethtool(collector, conn_mock):
+    """RDMA-scoped ethtool -S is stored on NetworkDataModel when rdma link succeeds."""
+    import json
+
+    collector.system_info.os_family = OSFamily.LINUX
+
+    # Payload returned for "rdma link -j": one RDMA device (bnxt0) on netdev eth0.
+    rdma_link_json = json.dumps([{"netdev": "eth0", "ifname": "bnxt0"}])
+    ethtool_s_bnxt = "NIC statistics:\n tx_pfc_frames: 0\n rx_pause_frames: 0\n"
+
+    def run_sut_cmd_side_effect(cmd, **kwargs):
+        # First matching rule wins; any other command (plain ethtool, lldpcli/lldpctl,
+        # or anything unknown) fails with exit code 1 and empty stdout.
+        canned_outputs = (
+            ("addr show" in cmd, IP_ADDR_OUTPUT),
+            ("route show" in cmd, IP_ROUTE_OUTPUT),
+            ("rule show" in cmd, IP_RULE_OUTPUT),
+            ("neighbor show" in cmd, IP_NEIGHBOR_OUTPUT),
+            ("rdma link -j" in cmd, rdma_link_json),
+            ("ethtool -S" in cmd and "eth0" in cmd, ethtool_s_bnxt),
+        )
+        for matched, stdout in canned_outputs:
+            if matched:
+                return MagicMock(exit_code=0, stdout=stdout, command=cmd)
+        return MagicMock(exit_code=1, stdout="", command=cmd)
+
+    collector._run_sut_cmd = MagicMock(side_effect=run_sut_cmd_side_effect)
+
+    result, data = collector.collect_data()
+
+    assert result.status == ExecutionStatus.OK
+    assert data is not None
+    assert "eth0" in data.rdma_ethtool_netdevs
+    assert len(data.rdma_ethtool_statistics) == 1
+    # The single RDMA row ties netdev eth0 to RDMA device bnxt0 and has vendor statistics.
+    rdma_stat = data.rdma_ethtool_statistics[0]
+    assert rdma_stat.netdev == "eth0"
+    assert rdma_stat.rdma_ifname == "bnxt0"
+    assert rdma_stat.vendor_statistics is not None
diff --git a/test/unit/plugin/test_niccli_collector.py b/test/unit/plugin/test_niccli_collector.py
index c4e5adef..709ed8a3 100644
--- a/test/unit/plugin/test_niccli_collector.py
+++ b/test/unit/plugin/test_niccli_collector.py
@@ -242,6 +242,44 @@ def test_nic_data_model_with_pensando_nic(collector):
assert data.pensando_nic_cards[1].serial_number == "FPL253710E5"
+def test_collect_data_not_ran_when_no_nic_hardware(collector, conn_mock):
+    """When every discovery command fails, collection is reported as NOT_RAN with no data."""
+    collector.system_info.os_family = OSFamily.LINUX
+    failed_cmd = MagicMock(exit_code=1, stdout="", stderr="not found", command="")
+    collector._run_sut_cmd = MagicMock(return_value=failed_cmd)
+
+    result, data = collector.collect_data()
+
+    assert result.status == ExecutionStatus.NOT_RAN
+    assert data is None
+    assert "skipped" in result.message.lower()
+    # The collector must give up after a handful of commands (at most 4 here).
+    assert collector._run_sut_cmd.call_count <= 4
+
+
+def test_collect_data_skips_nicctl_commands_when_no_pensando_cards(collector, conn_mock):
+    """When "nicctl show card" finds no cards, nicctl flash/version commands must not run."""
+    collector.system_info.os_family = OSFamily.LINUX
+    issued: list[str] = []
+
+    def run_sut_cmd_side_effect(cmd, **kwargs):
+        issued.append(cmd)
+        list_flags = ("--list", "--list_devices", "--listdev")
+        if "niccli" in cmd and any(flag in cmd for flag in list_flags):
+            return MagicMock(exit_code=0, stdout=NICCLI_LISTDEV_OUTPUT, stderr="", command=cmd)
+        if cmd.strip() == "nicctl show card":
+            return MagicMock(exit_code=1, stdout="", stderr="no card", command=cmd)
+        return MagicMock(exit_code=0, stdout="", stderr="", command=cmd)
+
+    collector._run_sut_cmd = MagicMock(side_effect=run_sut_cmd_side_effect)
+    result, data = collector.collect_data()
+
+    assert result.status == ExecutionStatus.OK
+    assert data is not None
+    assert not any(c.startswith("nicctl show card flash") for c in issued)
+    assert "nicctl --version" not in issued
+
+
def test_collect_data_success(collector, conn_mock):
"""Test successful collection of niccli/nicctl data."""
collector.system_info.os_family = OSFamily.LINUX
diff --git a/test/unit/plugin/test_redfish_endpoint_collector.py b/test/unit/plugin/test_redfish_endpoint_collector.py
index 7a786409..501a0d8b 100644
--- a/test/unit/plugin/test_redfish_endpoint_collector.py
+++ b/test/unit/plugin/test_redfish_endpoint_collector.py
@@ -237,6 +237,116 @@ def test_fetch_one_calls_run_get():
assert out.path == "/redfish/v1"
+def test_fetch_one_paged_calls_run_get_paged():
+    """_fetch_one_paged forwards the URI and max_pages to the connection's run_get_paged."""
+    uri = "/redfish/v1/Systems"
+    canned = RedfishGetResult(path=uri, success=True, data={"Members": []}, status_code=200)
+    conn = MagicMock()
+    conn.run_get_paged.return_value = canned
+
+    fetched = ec._fetch_one_paged(conn, uri, max_pages=42)
+
+    conn.run_get_paged.assert_called_once_with(uri, max_pages=42)
+    assert fetched.success is True
+
+
+def test_run_redfish_get_paged_appends_artifact_when_enabled(
+    redfish_endpoint_collector, redfish_conn_mock
+):
+    """With log_artifact=True a successful paged GET adds exactly one artifact."""
+    uri = "/redfish/v1/Systems"
+    redfish_conn_mock.run_get_paged.return_value = RedfishGetResult(
+        path=uri, success=True, data={"Members": []}, status_code=200
+    )
+    # Start from an empty artifact list so the count below is unambiguous.
+    redfish_endpoint_collector.result.artifacts.clear()
+
+    res = redfish_endpoint_collector._run_redfish_get_paged(uri, max_pages=5, log_artifact=True)
+
+    redfish_conn_mock.run_get_paged.assert_called_once_with(uri, max_pages=5)
+    assert res.success is True
+    assert len(redfish_endpoint_collector.result.artifacts) == 1
+
+
+def test_run_redfish_get_paged_skips_artifact_when_disabled(
+    redfish_endpoint_collector, redfish_conn_mock
+):
+    """With log_artifact=False a successful paged GET leaves the artifact list empty."""
+    redfish_conn_mock.run_get_paged.return_value = RedfishGetResult(
+        path="/x", success=True, data={"Members": []}, status_code=200
+    )
+    redfish_endpoint_collector.result.artifacts.clear()
+
+    redfish_endpoint_collector._run_redfish_get_paged("/x", log_artifact=False)
+
+    assert len(redfish_endpoint_collector.result.artifacts) == 0
+
+
+def test_collect_follow_next_link_sequential_uses_run_get_paged(
+    redfish_endpoint_collector, redfish_conn_mock
+):
+    """follow_next_link=True with one URI goes through run_get_paged, never run_get."""
+    uri = "/redfish/v1/Systems"
+    body = {"Members": [{"@odata.id": "/redfish/v1/Systems/1"}], "Members@odata.count": 1}
+    redfish_conn_mock.run_get_paged.return_value = RedfishGetResult(
+        path=uri, success=True, data=body, status_code=200
+    )
+    collector_args = RedfishEndpointCollectorArgs(
+        uris=[uri], follow_next_link=True, max_pages=50
+    )
+
+    result, data = redfish_endpoint_collector.collect_data(args=collector_args)
+
+    assert result.status == ExecutionStatus.OK
+    assert data is not None
+    assert data.responses[uri]["Members@odata.count"] == 1
+    # The paged helper receives the configured page limit; the plain GET is never used.
+    redfish_conn_mock.run_get_paged.assert_called_once_with(uri, max_pages=50)
+    redfish_conn_mock.run_get.assert_not_called()
+
+
+def test_collect_follow_next_link_concurrent_uses_connection_copy(
+    redfish_endpoint_collector, redfish_conn_mock
+):
+    """With max_workers=2 each URI is fetched via run_get_paged on its own connection copy."""
+
+    def paged_ok(path: str) -> RedfishGetResult:
+        """Build a successful, empty-collection result for the given path."""
+        return RedfishGetResult(path=path, success=True, data={"Members": []}, status_code=200)
+
+    copy_a = MagicMock()
+    copy_a.run_get_paged.return_value = paged_ok("/a")
+    copy_b = MagicMock()
+    copy_b.run_get_paged.return_value = paged_ok("/b")
+    pending_copies = [copy_a, copy_b]
+
+    def next_copy():
+        # Hand out the prepared copies in order; any extra request gets a fresh mock.
+        if pending_copies:
+            return pending_copies.pop(0)
+        return MagicMock()
+
+    redfish_conn_mock.copy.side_effect = next_copy
+
+    collector_args = RedfishEndpointCollectorArgs(
+        uris=["/a", "/b"],
+        max_workers=2,
+        follow_next_link=True,
+        max_pages=10,
+    )
+
+    result, data = redfish_endpoint_collector.collect_data(args=collector_args)
+
+    assert result.status == ExecutionStatus.OK
+    assert data is not None
+    assert len(data.responses) == 2
+    # NOTE(review): this assumes copies are taken in URI order - confirm in the collector.
+    copy_a.run_get_paged.assert_called_once_with("/a", max_pages=10)
+    copy_b.run_get_paged.assert_called_once_with("/b", max_pages=10)
+    assert redfish_conn_mock.copy.call_count == 2
+    assert len(result.artifacts) == 2
+
+
def test_discover_tree_single_root():
conn = MagicMock()
conn.run_get.return_value = RedfishGetResult(
diff --git a/test/unit/plugin/test_rocm_collector.py b/test/unit/plugin/test_rocm_collector.py
index d1d1c09b..0cb8523c 100644
--- a/test/unit/plugin/test_rocm_collector.py
+++ b/test/unit/plugin/test_rocm_collector.py
@@ -32,6 +32,48 @@
from nodescraper.enums.systeminteraction import SystemInteractionLevel
from nodescraper.plugins.inband.rocm.rocm_collector import RocmCollector
+ROCM_SUB_VERSIONS_GREP_CMD = "grep . -H -r -i /opt/rocm/.info/*"
+# ROCm 7.13-style sample "version-build-gfx targets": gfx942 = CDNA3/MI300, gfx950 = CDNA4/MI350
+ROCM_7_13_GFX_VERSION = "7.13.0-123-gfx942;gfx950"
+
+ROCM_6_4_SUB_VERSIONS_STDOUT = (  # canned grep -H output: one "path:version" line per file
+    "/opt/rocm/.info/version:6.4.0-47\n"
+    "/opt/rocm/.info/version-hip-libraries:6.4.0-47\n"
+    "/opt/rocm/.info/version-hiprt:6.4.0-47\n"
+    "/opt/rocm/.info/version-hiprt-devel:6.4.0-47\n"
+    "/opt/rocm/.info/version-hip-sdk:6.4.0-47\n"
+    "/opt/rocm/.info/version-lrt:6.4.0-47\n"
+    "/opt/rocm/.info/version-ml-libraries:6.4.0-47\n"
+    "/opt/rocm/.info/version-ml-sdk:6.4.0-47\n"
+    "/opt/rocm/.info/version-oclrt:6.4.0-47\n"
+    "/opt/rocm/.info/version-ocl-sdk:6.4.0-47\n"
+    "/opt/rocm/.info/version-openmp-sdk:6.4.0-47\n"
+    "/opt/rocm/.info/version-rocm:6.4.0-47\n"
+    "/opt/rocm/.info/version-rocm-developer-tools:6.4.0-47\n"
+    "/opt/rocm/.info/version-utils:6.4.0-47\n"
+)
+
+ROCM_6_4_EXPECTED_SUB_VERSIONS = {  # expected parse of the stdout above: basename -> version
+    "version": "6.4.0-47",
+    "version-hip-libraries": "6.4.0-47",
+    "version-hiprt": "6.4.0-47",
+    "version-hiprt-devel": "6.4.0-47",
+    "version-hip-sdk": "6.4.0-47",
+    "version-lrt": "6.4.0-47",
+    "version-ml-libraries": "6.4.0-47",
+    "version-ml-sdk": "6.4.0-47",
+    "version-oclrt": "6.4.0-47",
+    "version-ocl-sdk": "6.4.0-47",
+    "version-openmp-sdk": "6.4.0-47",
+    "version-rocm": "6.4.0-47",
+    "version-rocm-developer-tools": "6.4.0-47",
+    "version-utils": "6.4.0-47",
+}
+
+
+def _optional_collection_failures(count: int = 8):  # failed results for the optional commands
+    return [MagicMock(exit_code=1, stdout="") for _ in range(count)]
+
@pytest.fixture
def collector(system_info, conn_mock):
@@ -64,7 +106,7 @@ def test_collect_rocm_version_fallback(collector):
"""Test fallback to version file when version-rocm fails"""
collector._run_sut_cmd = MagicMock(
side_effect=[
- # Sub-versions (grep . -r /opt/rocm/.info/*)
+ # Sub-versions (grep . -H -r -i /opt/rocm/.info/*)
MagicMock(exit_code=0, stdout=""),
# First path: version-rocm (fails)
MagicMock(exit_code=1, stdout="", command="grep . /opt/rocm/.info/version-rocm"),
@@ -113,7 +155,7 @@ def test_collect_all_rocm_data(collector):
# Mock all command outputs in sequence (order must match collector's call order)
collector._run_sut_cmd = MagicMock(
side_effect=[
- # Sub-versions (grep . -r /opt/rocm/.info/*)
+ # Sub-versions (grep . -H -r -i /opt/rocm/.info/*)
MagicMock(exit_code=0, stdout="/opt/rocm/.info/version-rocm:6.2.0-66"),
# ROCm version (grep . /opt/rocm/.info/version-rocm)
MagicMock(exit_code=0, stdout="6.2.0-66"),
@@ -159,6 +201,7 @@ def test_collect_all_rocm_data(collector):
# Verify ROCm version
assert data.rocm_version == "6.2.0-66"
+ assert data.build_number == "66"
# Verify ROCm latest path
assert data.rocm_latest_versioned_path == "/opt/rocm-1.1.0"
@@ -206,7 +249,7 @@ def test_collect_with_clinfo_failure(collector):
"""Test that clinfo failure is handled gracefully and captured in artifact"""
collector._run_sut_cmd = MagicMock(
side_effect=[
- # Sub-versions (grep . -r /opt/rocm/.info/*)
+ # Sub-versions (grep . -H -r -i /opt/rocm/.info/*)
MagicMock(exit_code=0, stdout="/opt/rocm/.info/version-rocm:6.2.0-66"),
# ROCm version (grep . /opt/rocm/.info/version-rocm)
MagicMock(exit_code=0, stdout="6.2.0-66"),
@@ -249,7 +292,7 @@ def test_collect_minimal_data(collector):
"""Test collection when only version is available"""
collector._run_sut_cmd = MagicMock(
side_effect=[
- # Sub-versions (grep . -r /opt/rocm/.info/*)
+ # Sub-versions (grep . -H -r -i /opt/rocm/.info/*)
MagicMock(exit_code=0, stdout=""),
# ROCm version (grep . /opt/rocm/.info/version-rocm)
MagicMock(exit_code=0, stdout="6.2.0-66"),
@@ -282,13 +325,28 @@ def test_collect_minimal_data(collector):
assert data.kfd_proc == []
+def test_sub_versions_grep_uses_h_flag(collector):
+    """Sub-version discovery must use grep -H.
+
+    With -H, output for a single matching file still includes the filename.
+    """
+    # Three failing results are queued; only the first issued command is inspected below.
+    failures = [MagicMock(exit_code=1, stdout="") for _ in range(3)]
+    collector._run_sut_cmd = MagicMock(side_effect=failures)
+
+    collector.collect_data()
+
+    first_call = collector._run_sut_cmd.call_args_list[0]
+    assert first_call.args[0] == ROCM_SUB_VERSIONS_GREP_CMD
+
+
def test_invalid_rocm_version_format(collector):
"""Test that invalid ROCm version format is handled gracefully"""
collector._run_sut_cmd = MagicMock(
- return_value=MagicMock(
- exit_code=0,
- stdout="invalid_version_format",
- )
+ side_effect=[
+ MagicMock(exit_code=0, stdout=""),
+ MagicMock(exit_code=0, stdout="invalid_version_format"),
+ ]
)
result, data = collector.collect_data()
@@ -296,57 +354,36 @@ def test_invalid_rocm_version_format(collector):
assert result.status == ExecutionStatus.ERROR
assert data is None
assert len(result.events) >= 1
+ assert any(
+ event.category == EventCategory.OS.value
+ and "Invalid ROCm version format" in event.description
+ for event in result.events
+ )
-def test_collect_rocm_sub_versions(collector):
- """Test collection of ROCm version and multiple sub-versions."""
- sub_versions_stdout = (
- "/opt/rocm/.info/version:6.4.0-47\n"
- "/opt/rocm/.info/version-hip-libraries:6.4.0-47\n"
- "/opt/rocm/.info/version-hiprt:6.4.0-47\n"
- "/opt/rocm/.info/version-hiprt-devel:6.4.0-47\n"
- "/opt/rocm/.info/version-hip-sdk:6.4.0-47\n"
- "/opt/rocm/.info/version-lrt:6.4.0-47\n"
- "/opt/rocm/.info/version-ml-libraries:6.4.0-47\n"
- "/opt/rocm/.info/version-ml-sdk:6.4.0-47\n"
- "/opt/rocm/.info/version-oclrt:6.4.0-47\n"
- "/opt/rocm/.info/version-ocl-sdk:6.4.0-47\n"
- "/opt/rocm/.info/version-openmp-sdk:6.4.0-47\n"
- "/opt/rocm/.info/version-rocm:6.4.0-47\n"
- "/opt/rocm/.info/version-rocm-developer-tools:6.4.0-47\n"
- "/opt/rocm/.info/version-utils:6.4.0-47\n"
+def test_collect_invalid_sub_version_format(collector):
+ """Invalid sub-version values fail model validation during collection."""
+ collector._run_sut_cmd = MagicMock(
+ side_effect=[
+ MagicMock(exit_code=0, stdout="/opt/rocm/.info/version-rocm:not-a-version\n"),
+ MagicMock(exit_code=0, stdout="6.2.0-66"),
+ ]
)
- expected_sub_versions = {
- "version": "6.4.0-47",
- "version-hip-libraries": "6.4.0-47",
- "version-hiprt": "6.4.0-47",
- "version-hiprt-devel": "6.4.0-47",
- "version-hip-sdk": "6.4.0-47",
- "version-lrt": "6.4.0-47",
- "version-ml-libraries": "6.4.0-47",
- "version-ml-sdk": "6.4.0-47",
- "version-oclrt": "6.4.0-47",
- "version-ocl-sdk": "6.4.0-47",
- "version-openmp-sdk": "6.4.0-47",
- "version-rocm": "6.4.0-47",
- "version-rocm-developer-tools": "6.4.0-47",
- "version-utils": "6.4.0-47",
- }
+
+ result, data = collector.collect_data()
+
+ assert result.status == ExecutionStatus.ERROR
+ assert data is None
+ assert any("Invalid ROCm version format" in event.description for event in result.events)
+
+
+def test_collect_rocm_sub_versions(collector):
+ """Test collection of ROCm version and multiple sub-versions (error-scraper test_run_new_version)."""
collector._run_sut_cmd = MagicMock(
side_effect=[
- # First: grep . -r /opt/rocm/.info/* (sub-versions)
- MagicMock(exit_code=0, stdout=sub_versions_stdout),
- # Second: grep . /opt/rocm/.info/version-rocm (main version)
+ MagicMock(exit_code=0, stdout=ROCM_6_4_SUB_VERSIONS_STDOUT),
MagicMock(exit_code=0, stdout="6.4.0-47"),
- # Optional data (all fail for minimal test)
- MagicMock(exit_code=1, stdout=""), # latest path
- MagicMock(exit_code=1, stdout=""), # all paths
- MagicMock(exit_code=1, stdout=""), # rocminfo
- MagicMock(exit_code=1, stdout=""), # ld.so.conf
- MagicMock(exit_code=1, stdout=""), # rocm_libs
- MagicMock(exit_code=1, stdout=""), # env_vars
- MagicMock(exit_code=1, stdout=""), # clinfo
- MagicMock(exit_code=1, stdout=""), # kfd_proc
+ *_optional_collection_failures(),
]
)
@@ -355,6 +392,51 @@ def test_collect_rocm_sub_versions(collector):
assert result.status == ExecutionStatus.OK
assert data is not None
assert data.rocm_version == "6.4.0-47"
- assert data.rocm_sub_versions == expected_sub_versions
+ assert data.rocm_sub_versions == ROCM_6_4_EXPECTED_SUB_VERSIONS
+ assert data.build_number == "47"
assert any(event.category == "ROCM_VERSION_READ" for event in result.events)
assert "ROCm version: 6.4.0-47" in result.message
+
+
+def test_collect_rocm_version_with_gfx_suffix(collector):
+    """ROCm 7.13+ version strings may include build and gfx target suffixes."""
+    gfx_version = ROCM_7_13_GFX_VERSION
+    grep_stdout = f"/opt/rocm/.info/version-rocm:{gfx_version}\n"
+
+    # Responses are consumed in call order.
+    responses = [
+        MagicMock(exit_code=0, stdout=grep_stdout),  # sub-version grep
+        MagicMock(exit_code=0, stdout=gfx_version),  # main version read
+        *_optional_collection_failures(),
+    ]
+    collector._run_sut_cmd = MagicMock(side_effect=responses)
+
+    result, data = collector.collect_data()
+
+    assert result.status == ExecutionStatus.OK
+    assert data is not None
+    # The suffixed string is kept verbatim; the build number is the "123" segment.
+    assert data.rocm_version == gfx_version
+    assert data.rocm_sub_versions["version-rocm"] == gfx_version
+    assert data.build_number == "123"
+
+
+def test_collect_sub_versions_skips_lines_without_filename(collector):
+    """Lines without a filename prefix are ignored (grep without -H can produce these)."""
+    # The first line has no "path:" prefix and must be dropped; the second is well-formed.
+    grep_stdout = "6.4.0-47\n/opt/rocm/.info/version-rocm:6.4.0-47\n"
+
+    # Responses are consumed in call order.
+    responses = [
+        MagicMock(exit_code=0, stdout=grep_stdout),  # sub-version grep
+        MagicMock(exit_code=0, stdout="6.4.0-47"),  # main version read
+        *_optional_collection_failures(),
+    ]
+    collector._run_sut_cmd = MagicMock(side_effect=responses)
+
+    result, data = collector.collect_data()
+
+    assert result.status == ExecutionStatus.OK
+    assert data is not None
+    assert data.rocm_sub_versions == {"version-rocm": "6.4.0-47"}
+    assert data.build_number == "47"
diff --git a/test/unit/test_serialization.py b/test/unit/test_serialization.py
new file mode 100644
index 00000000..a4e009b3
--- /dev/null
+++ b/test/unit/test_serialization.py
@@ -0,0 +1,75 @@
+###############################################################################
+#
+# MIT License
+#
+# Copyright (C) 2026 Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+###############################################################################
+
+import pytest
+from pydantic import BaseModel
+
+from nodescraper.serialization import safe_dump_to_json_dict
+
+
+class _Sample(BaseModel):  # minimal pydantic model exercised by the serialization tests
+ a: int  # required field (no default)
+ b: str = "x"  # optional field; defaults to "x"
+
+
+def test_safe_dump_to_json_dict_round_trip() -> None:
+    """A model dumps to a plain dict that includes its defaulted field."""
+    dumped = safe_dump_to_json_dict(_Sample(a=7))
+    assert dumped == {"a": 7, "b": "x"}
+
+
+def test_safe_dump_to_json_dict_exclude() -> None:
+    """Fields named in ``exclude`` are omitted from the dumped dict."""
+    dumped = safe_dump_to_json_dict(_Sample(a=7), exclude={"b"})
+    assert dumped == {"a": 7}
+
+
+def test_safe_dump_falls_back_when_model_dump_json_fails(monkeypatch: pytest.MonkeyPatch) -> None:
+    """A failing ``model_dump_json`` still yields the expected dict."""
+
+    def _raise_on_json(self, **kwargs):
+        raise RuntimeError("json path failed")
+
+    sample = _Sample(a=3)
+    monkeypatch.setattr(_Sample, "model_dump_json", _raise_on_json)
+    assert safe_dump_to_json_dict(sample) == {"a": 3, "b": "x"}
+
+
+def test_safe_dump_chains_when_both_fail(monkeypatch: pytest.MonkeyPatch) -> None:
+    """If both dump methods raise, the later error surfaces chained to the earlier one."""
+
+    def _json_fails(self, **kwargs):
+        raise RuntimeError("first")
+
+    def _dump_fails(self, **kwargs):
+        raise RuntimeError("second")
+
+    sample = _Sample(a=1)
+    monkeypatch.setattr(_Sample, "model_dump_json", _json_fails)
+    monkeypatch.setattr(_Sample, "model_dump", _dump_fails)
+    with pytest.raises(RuntimeError, match="second") as caught:
+        safe_dump_to_json_dict(sample)
+    assert caught.value.__cause__ is not None and "first" in str(caught.value.__cause__)