From 5d2bbb214c08998d9a402e0074cf6f90266cb8d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20F=C4=85ferek?= Date: Thu, 19 Feb 2026 21:42:52 +0100 Subject: [PATCH 1/3] feat: add fault storm script and debounce demo config - fault_storm.py: injects 9 noise faults + 1 real (MOTOR_OVERHEAT) - medkit_params_debounce.yaml: confirmation_threshold=-3, healing enabled - docker-compose.article3.yml: compose override for debounce config - run-demo-debounce.sh: convenience script for debounce mode Closes #34 --- .../config/medkit_params_debounce.yaml | 93 ++++++++++ .../docker-compose.article3.yml | 21 +++ .../run-demo-debounce.sh | 105 +++++++++++ .../scripts/fault_storm.py | 167 ++++++++++++++++++ 4 files changed, 386 insertions(+) create mode 100644 demos/turtlebot3_integration/config/medkit_params_debounce.yaml create mode 100644 demos/turtlebot3_integration/docker-compose.article3.yml create mode 100755 demos/turtlebot3_integration/run-demo-debounce.sh create mode 100644 demos/turtlebot3_integration/scripts/fault_storm.py diff --git a/demos/turtlebot3_integration/config/medkit_params_debounce.yaml b/demos/turtlebot3_integration/config/medkit_params_debounce.yaml new file mode 100644 index 0000000..d931df3 --- /dev/null +++ b/demos/turtlebot3_integration/config/medkit_params_debounce.yaml @@ -0,0 +1,93 @@ +# ros2_medkit gateway configuration for TurtleBot3 demo +# ARTICLE 3 - DEBOUNCE VERSION +# Differences from default: +# confirmation_threshold: -3 (was 0) — requires 3 sustained FAILED events +# healing_enabled: true (was false) — auto-heal after PASSED events +# +# Node runs under /diagnostics namespace, so we need to match that here +diagnostics: + ros2_medkit_gateway: + ros__parameters: + server: + # Bind to all interfaces for Docker networking + host: "0.0.0.0" + port: 8080 + + refresh_interval_ms: 10000 # 10 seconds (default), reduces log spam + + cors: + allowed_origins: ["*"] + allowed_methods: ["GET", "PUT", "POST", "DELETE", "OPTIONS"] + allowed_headers: ["Content-Type", "Accept"] + allow_credentials: false + max_age_seconds: 86400 + + max_parallel_topic_samples: 10 + + # Discovery configuration + discovery_mode: "hybrid" # runtime_only, manifest_only, or hybrid + manifest_path: "" # Will be set via launch argument + manifest_strict_validation: true + + discovery: + runtime: + create_synthetic_components: false # Manifest defines components + +# Fault Manager configuration (runs in root namespace) +fault_manager: + ros__parameters: + # Storage configuration + storage_type: "sqlite" + database_path: "/var/lib/ros2_medkit/faults.db" + + # === DEBOUNCE CONFIGURATION (Article 3) === + confirmation_threshold: -3 # Need 3 sustained FAILED events to confirm + healing_enabled: true # Auto-heal when problem resolves + healing_threshold: 3 # Need 3 PASSED events to heal + auto_confirm_after_sec: 0.0 # Disabled + + # Snapshot configuration (freeze frames) + snapshots: + enabled: true + background_capture: true # Non-blocking capture + timeout_sec: 2.0 + max_message_size: 131072 # 128KB max per message + + # Topics to capture for all faults + default_topics: + - /odom + - /amcl_pose + - /scan + - /tf + - /navigate_to_pose/_action/status + + # Rosbag recording configuration + rosbag: + enabled: true + duration_sec: 10.0 # Record 10 seconds before fault confirmation + duration_after_sec: 2.0 # Record 2 seconds after confirmation + lazy_start: false # Always recording (ring buffer) + format: "mcap" # MCAP format (recommended for cross-platform) + storage_path: "/var/lib/ros2_medkit/rosbags" + max_bag_size_mb: 100 # Max size per rosbag file + max_total_storage_mb: 1000 # 1GB total storage limit + auto_cleanup: true # Cleanup rosbags on fault clear + + # Topics to record (use 'config' or 'all') + topics: "config" # Use include/exclude lists below + include_topics: + - /odom + - /amcl_pose + - /scan + - /cmd_vel + - /tf + - /tf_static + - /navigate_to_pose/_action/status + - /navigate_to_pose/_action/feedback + - /local_costmap/costmap + - /global_costmap/costmap + - /plan + - /diagnostics + exclude_topics: + - /rosout + - /parameter_events diff --git a/demos/turtlebot3_integration/docker-compose.article3.yml b/demos/turtlebot3_integration/docker-compose.article3.yml new file mode 100644 index 0000000..5ba2f56 --- /dev/null +++ b/demos/turtlebot3_integration/docker-compose.article3.yml @@ -0,0 +1,21 @@ +# Docker Compose override for Article 3 (Debounce Demo) +# +# Usage: +# STORM (no debounce, default config): +# docker compose --profile cpu up -d +# +# DEBOUNCE (with filtering): +# docker compose --profile cpu -f docker-compose.yml -f docker-compose.article3.yml up -d +# +# The override mounts the debounce config over the default one inside the container. +# colcon build --symlink-install means the installed config points to the source, +# so mounting over the source path works. + +services: + turtlebot3-demo: + volumes: + - ./config/medkit_params_debounce.yaml:/root/demo_ws/src/turtlebot3_medkit_demo/config/medkit_params.yaml:ro + + turtlebot3-demo-nvidia: + volumes: + - ./config/medkit_params_debounce.yaml:/root/demo_ws/src/turtlebot3_medkit_demo/config/medkit_params.yaml:ro diff --git a/demos/turtlebot3_integration/run-demo-debounce.sh b/demos/turtlebot3_integration/run-demo-debounce.sh new file mode 100755 index 0000000..3499d46 --- /dev/null +++ b/demos/turtlebot3_integration/run-demo-debounce.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# TurtleBot3 + ros2_medkit Demo Runner — DEBOUNCE MODE (Article 3) +# +# Same as run-demo.sh but uses debounce config: +# confirmation_threshold: -3 (requires sustained failure) +# healing_enabled: true (auto-heal on recovery) +# +# Compare with: +# ./run-demo.sh → STORM (no debounce, threshold 0) +# ./run-demo-debounce.sh → CALM (debounce, threshold -3) + +set -eu + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +echo "🛡️ TurtleBot3 + ros2_medkit — DEBOUNCE MODE" +echo "==============================================" +echo " confirmation_threshold: -3 (need 3 sustained FAILED events)" +echo " healing_enabled: true (auto-heal after 3 PASSED events)" +echo "" + +# Set TurtleBot3 environment variables +export TURTLEBOT3_MODEL=${TURTLEBOT3_MODEL:-burger} +export GAZEBO_MODEL_PATH=${GAZEBO_MODEL_PATH:-}:/opt/ros/jazzy/share/turtlebot3_gazebo/models + +# Check for Docker +if ! command -v docker &> /dev/null; then + echo "Error: Docker is not installed" + exit 1 +fi + +# Setup X11 forwarding for GUI (Gazebo) +echo "Setting up X11 forwarding..." +xhost +local:docker 2>/dev/null || { + echo " Warning: xhost failed. GUI may not work." +} + +# Cleanup function +cleanup() { + echo "" + echo "Cleaning up..." + xhost -local:docker 2>/dev/null || true + echo "Done!" +} +trap cleanup EXIT + +# Parse arguments +HEADLESS_MODE="false" +DETACH_MODE="true" +PROFILE="cpu" + +while [[ $# -gt 0 ]]; do + case "$1" in + --nvidia) PROFILE="nvidia" ;; + --headless) HEADLESS_MODE="true" ;; + --attached) DETACH_MODE="false" ;; + *) echo "Unknown option: $1"; exit 1 ;; + esac + shift +done + +export HEADLESS=$HEADLESS_MODE + +DETACH_FLAG="" +if [[ "$DETACH_MODE" == "true" ]]; then + DETACH_FLAG="-d" +fi + +echo "Building and starting demo (debounce mode)..." +echo "" +echo "🌐 REST API: http://localhost:8080/api/v1/" +echo "🌐 Web UI: http://localhost:3000/" +echo "" + +# Use docker-compose override to mount debounce config +if docker compose version &> /dev/null; then + docker compose --profile "$PROFILE" \ + -f docker-compose.yml \ + -f docker-compose.article3.yml \ + build && \ + docker compose --profile "$PROFILE" \ + -f docker-compose.yml \ + -f docker-compose.article3.yml \ + up ${DETACH_FLAG} +else + docker-compose --profile "$PROFILE" \ + -f docker-compose.yml \ + -f docker-compose.article3.yml \ + build && \ + docker-compose --profile "$PROFILE" \ + -f docker-compose.yml \ + -f docker-compose.article3.yml \ + up ${DETACH_FLAG} +fi + +if [[ "$DETACH_MODE" == "true" ]]; then + echo "" + echo "✅ Demo started in DEBOUNCE mode!" + echo "" + echo "Run the article 3 demo sequence:" + echo " ./article3-demo.sh" + echo "" + echo "🛑 To stop: ./stop-demo.sh" +fi diff --git a/demos/turtlebot3_integration/scripts/fault_storm.py b/demos/turtlebot3_integration/scripts/fault_storm.py new file mode 100644 index 0000000..2974c48 --- /dev/null +++ b/demos/turtlebot3_integration/scripts/fault_storm.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python3 +"""Fault storm injector for ros2_medkit debounce demo. + +Fires 9 noise faults (1 report each) + 1 real problem (5 sustained reports), +interleaved so the real fault is buried in the noise. + + Storm (threshold=0): all 10 CONFIRMED - wall of warnings, real issue invisible + Debounce (threshold=-3): 9 PREFAILED + 1 CONFIRMED - real issue pops out + +Requires ros2_medkit built in your workspace. + +Usage (standalone): + python3 fault_storm.py + +Usage (Docker demo): + docker exec turtlebot3_medkit_demo bash -c \ + "source /opt/ros/jazzy/setup.bash && \ + source /root/demo_ws/install/setup.bash && \ + python3 /root/demo_ws/src/turtlebot3_medkit_demo/scripts/fault_storm.py" +""" + +import rclpy +from rclpy.node import Node +from ros2_medkit_msgs.srv import ReportFault +import time + + +# === THE REAL PROBLEM (sustained, confirms even with debounce) === +REAL_FAULT = { + "fault_code": "MOTOR_OVERHEAT", + "severity": 1, # WARN (same as noise — invisible in storm mode) + "description": "Drive motor temperature 92C (limit: 80C)", + "source_id": "/drive/thermal_monitor", + "burst": 5, +} + +# === NOISE: 9 faults across 3 categories → wall of similar alerts === +NOISE_FAULTS = [ + # Sensor noise cluster (4 faults) + { + "fault_code": "SENSOR_NOISE_LIDAR", + "severity": 1, + "description": "LiDAR std dev: 0.12m (limit: 0.03m)", + "source_id": "/sensors/lidar", + }, + { + "fault_code": "SENSOR_NOISE_IMU", + "severity": 1, + "description": "IMU gyro drift: 1.8 deg/s (limit: 0.5 deg/s)", + "source_id": "/sensors/imu", + }, + { + "fault_code": "SENSOR_NOISE_CAMERA", + "severity": 1, + "description": "Depth camera: 3 frames dropped in 1s", + "source_id": "/sensors/camera", + }, + { + "fault_code": "SENSOR_NOISE_ODOM", + "severity": 1, + "description": "Wheel odometry jitter: 0.12 m/s (limit: 0.05 m/s)", + "source_id": "/sensors/odom", + }, + # Nav timeout cluster (4 faults) + { + "fault_code": "NAV_TIMEOUT_CONTROLLER", + "severity": 1, + "description": "Controller loop: 62ms (limit: 50ms)", + "source_id": "/nav/controller", + }, + { + "fault_code": "NAV_TIMEOUT_PLANNER", + "severity": 1, + "description": "Planner response: 320ms (limit: 200ms)", + "source_id": "/nav/planner", + }, + { + "fault_code": "NAV_TIMEOUT_COSTMAP", + "severity": 1, + "description": "Costmap update: 250ms (limit: 100ms)", + "source_id": "/nav/costmap", + }, + { + "fault_code": "NAV_TIMEOUT_TF", + "severity": 1, + "description": "TF lookup: 85ms (limit: 50ms)", + "source_id": "/nav/tf_monitor", + }, + # Comms cluster (1 fault) + { + "fault_code": "COMM_LATENCY_WIFI", + "severity": 1, + "description": "WiFi round-trip: 230ms (limit: 100ms)", + "source_id": "/network/wifi_monitor", + }, +] + + +class FaultStormNode(Node): + def __init__(self): + super().__init__("fault_storm_injector") + self.client = self.create_client( + ReportFault, "/fault_manager/report_fault" + ) + self.get_logger().info("Waiting for fault_manager...") + self.client.wait_for_service(timeout_sec=5.0) + self.get_logger().info("Connected") + + def fire(self, fault_code, severity, description, source_id): + req = ReportFault.Request() + req.fault_code = fault_code + req.event_type = ReportFault.Request.EVENT_FAILED + req.severity = severity + req.description = description + req.source_id = source_id + future = self.client.call_async(req) + rclpy.spin_until_future_complete(self, future, timeout_sec=2.0) + return future.result() and future.result().accepted + + def run_storm(self): + self.get_logger().info("=== STORM ===") + n = 0 + + # Interleave: noise, noise, REAL, noise, noise, REAL, ... + # MOTOR_OVERHEAT buried among SENSOR_NOISE, NAV_TIMEOUT, and COMM faults + real_sent = 0 + noise_idx = 0 + sequence = [ + "noise", "noise", "REAL", # lidar, imu, MOTOR #1 + "noise", "noise", "REAL", # camera, odom, MOTOR #2 + "noise", "noise", "REAL", # controller, planner, MOTOR #3 + "noise", "noise", "noise", # costmap, tf, wifi + "REAL", "REAL", # MOTOR #4, #5 + ] + + for step in sequence: + if step == "REAL" and real_sent < REAL_FAULT["burst"]: + f = REAL_FAULT + real_sent += 1 + label = f"MOTOR_OVERHEAT #{real_sent}" + elif step == "noise" and noise_idx < len(NOISE_FAULTS): + f = NOISE_FAULTS[noise_idx] + noise_idx += 1 + label = f["fault_code"] + else: + continue + + ok = self.fire(f["fault_code"], f["severity"], + f["description"], f["source_id"]) + n += 1 + self.get_logger().info( + f" [{n:2d}] {label} -> {'OK' if ok else 'FAIL'}") + time.sleep(0.4) + + self.get_logger().info(f"=== DONE: {n} events ===") + + +def main(): + rclpy.init() + node = FaultStormNode() + node.run_storm() + node.destroy_node() + rclpy.shutdown() + + +if __name__ == "__main__": + main() From 57266189286f9e5db5317636c5587c4b8bf75bc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20F=C4=85ferek?= Date: Thu, 19 Feb 2026 21:46:19 +0100 Subject: [PATCH 2/3] refactor: rename docker-compose.article3.yml to docker-compose.debounce.yml Remove internal article references from compose override and run-demo-debounce.sh. --- ...e.article3.yml => docker-compose.debounce.yml} | 4 ++-- demos/turtlebot3_integration/run-demo-debounce.sh | 15 ++++++++------- 2 files changed, 10 insertions(+), 9 deletions(-) rename demos/turtlebot3_integration/{docker-compose.article3.yml => docker-compose.debounce.yml} (89%) diff --git a/demos/turtlebot3_integration/docker-compose.article3.yml b/demos/turtlebot3_integration/docker-compose.debounce.yml similarity index 89% rename from demos/turtlebot3_integration/docker-compose.article3.yml rename to demos/turtlebot3_integration/docker-compose.debounce.yml index 5ba2f56..9585713 100644 --- a/demos/turtlebot3_integration/docker-compose.article3.yml +++ b/demos/turtlebot3_integration/docker-compose.debounce.yml @@ -1,11 +1,11 @@ -# Docker Compose override for Article 3 (Debounce Demo) +# Docker Compose override for debounce mode # # Usage: # STORM (no debounce, default config): # docker compose --profile cpu up -d # # DEBOUNCE (with filtering): -# docker compose --profile cpu -f docker-compose.yml -f docker-compose.article3.yml up -d +# docker compose --profile cpu -f docker-compose.yml -f docker-compose.debounce.yml up -d # # The override mounts the debounce config over the default one inside the container. # colcon build --symlink-install means the installed config points to the source, diff --git a/demos/turtlebot3_integration/run-demo-debounce.sh b/demos/turtlebot3_integration/run-demo-debounce.sh index 3499d46..ff834d5 100755 --- a/demos/turtlebot3_integration/run-demo-debounce.sh +++ b/demos/turtlebot3_integration/run-demo-debounce.sh @@ -1,5 +1,5 @@ #!/bin/bash -# TurtleBot3 + ros2_medkit Demo Runner — DEBOUNCE MODE (Article 3) +# TurtleBot3 + ros2_medkit Demo Runner - DEBOUNCE MODE # # Same as run-demo.sh but uses debounce config: # confirmation_threshold: -3 (requires sustained failure) @@ -77,20 +77,20 @@ echo "" if docker compose version &> /dev/null; then docker compose --profile "$PROFILE" \ -f docker-compose.yml \ - -f docker-compose.article3.yml \ + -f docker-compose.debounce.yml \ build && \ docker compose --profile "$PROFILE" \ -f docker-compose.yml \ - -f docker-compose.article3.yml \ + -f docker-compose.debounce.yml \ up ${DETACH_FLAG} else docker-compose --profile "$PROFILE" \ -f docker-compose.yml \ - -f docker-compose.article3.yml \ + -f docker-compose.debounce.yml \ build && \ docker-compose --profile "$PROFILE" \ -f docker-compose.yml \ - -f docker-compose.article3.yml \ + -f docker-compose.debounce.yml \ up ${DETACH_FLAG} fi @@ -98,8 +98,9 @@ if [[ "$DETACH_MODE" == "true" ]]; then echo "" echo "✅ Demo started in DEBOUNCE mode!" echo "" - echo "Run the article 3 demo sequence:" - echo " ./article3-demo.sh" + echo "Fire fault storm to see debounce in action:" + echo " docker exec turtlebot3_medkit_demo bash -c \\" + echo " 'source /opt/ros/jazzy/setup.bash && source /root/demo_ws/install/setup.bash && python3 /root/demo_ws/src/turtlebot3_medkit_demo/scripts/fault_storm.py'" echo "" echo "🛑 To stop: ./stop-demo.sh" fi From 451ab8e84af906e2462b631202ae79b4184a5302 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20F=C4=85ferek?= Date: Thu, 19 Feb 2026 21:57:10 +0100 Subject: [PATCH 3/3] fix: address PR review comments - Add Apache 2.0 license header to fault_storm.py - Handle wait_for_service() timeout with error instead of silent continue - Document nvidia container name variant in docstring - Use correct container name based on profile in run-demo-debounce.sh --- .../run-demo-debounce.sh | 7 +++++- .../scripts/fault_storm.py | 23 ++++++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/demos/turtlebot3_integration/run-demo-debounce.sh b/demos/turtlebot3_integration/run-demo-debounce.sh index ff834d5..eb541db 100755 --- a/demos/turtlebot3_integration/run-demo-debounce.sh +++ b/demos/turtlebot3_integration/run-demo-debounce.sh @@ -95,11 +95,16 @@ else fi if [[ "$DETACH_MODE" == "true" ]]; then + if [[ "$PROFILE" == "nvidia" ]]; then + CONTAINER="turtlebot3_medkit_demo_nvidia" + else + CONTAINER="turtlebot3_medkit_demo" + fi echo "" echo "✅ Demo started in DEBOUNCE mode!" echo "" echo "Fire fault storm to see debounce in action:" - echo " docker exec turtlebot3_medkit_demo bash -c \\" + echo " docker exec $CONTAINER bash -c \\" echo " 'source /opt/ros/jazzy/setup.bash && source /root/demo_ws/install/setup.bash && python3 /root/demo_ws/src/turtlebot3_medkit_demo/scripts/fault_storm.py'" echo "" echo "🛑 To stop: ./stop-demo.sh" diff --git a/demos/turtlebot3_integration/scripts/fault_storm.py b/demos/turtlebot3_integration/scripts/fault_storm.py index 2974c48..11bdd5e 100644 --- a/demos/turtlebot3_integration/scripts/fault_storm.py +++ b/demos/turtlebot3_integration/scripts/fault_storm.py @@ -1,4 +1,18 @@ #!/usr/bin/env python3 +# Copyright 2026 selfpatch +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Fault storm injector for ros2_medkit debounce demo. Fires 9 noise faults (1 report each) + 1 real problem (5 sustained reports), @@ -13,10 +27,13 @@ python3 fault_storm.py Usage (Docker demo): + # CPU profile (default): docker exec turtlebot3_medkit_demo bash -c \ "source /opt/ros/jazzy/setup.bash && \ source /root/demo_ws/install/setup.bash && \ python3 /root/demo_ws/src/turtlebot3_medkit_demo/scripts/fault_storm.py" + # NVIDIA profile: + docker exec turtlebot3_medkit_demo_nvidia bash -c ... """ import rclpy @@ -103,7 +120,11 @@ def __init__(self): ReportFault, "/fault_manager/report_fault" ) self.get_logger().info("Waiting for fault_manager...") - self.client.wait_for_service(timeout_sec=5.0) + if not self.client.wait_for_service(timeout_sec=5.0): + self.get_logger().error( + "fault_manager service not available after 5s; aborting." + ) + raise RuntimeError("fault_manager service not available") self.get_logger().info("Connected") def fire(self, fault_code, severity, description, source_id):