Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/ros2_medkit_fault_manager/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ if(BUILD_TESTING)
find_package(ament_lint_auto REQUIRED)
find_package(ament_cmake_gtest REQUIRED)
find_package(launch_testing_ament_cmake REQUIRED)
find_package(std_msgs REQUIRED)

set(ament_cmake_clang_format_CONFIG_FILE "${CMAKE_CURRENT_SOURCE_DIR}/../../.clang-format")
list(APPEND AMENT_LINT_AUTO_EXCLUDE ament_cmake_uncrustify ament_cmake_cpplint ament_cmake_clang_tidy)
Expand All @@ -138,7 +139,7 @@ if(BUILD_TESTING)
# in the same CTest invocation and launch their own fault_manager_node subprocess.
ament_add_gtest(test_fault_manager test/test_fault_manager.cpp)
target_link_libraries(test_fault_manager fault_manager_lib)
medkit_target_dependencies(test_fault_manager rclcpp ros2_medkit_msgs)
medkit_target_dependencies(test_fault_manager rclcpp ros2_medkit_msgs std_msgs)
medkit_set_test_domain(test_fault_manager)

# SQLite storage tests
Expand All @@ -154,7 +155,7 @@ if(BUILD_TESTING)
# Snapshot capture tests
ament_add_gtest(test_snapshot_capture test/test_snapshot_capture.cpp)
target_link_libraries(test_snapshot_capture fault_manager_lib)
medkit_target_dependencies(test_snapshot_capture rclcpp ros2_medkit_msgs)
medkit_target_dependencies(test_snapshot_capture rclcpp ros2_medkit_msgs std_msgs)
medkit_set_test_domain(test_snapshot_capture)

# Rosbag capture tests
Expand Down
3 changes: 3 additions & 0 deletions src/ros2_medkit_fault_manager/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ ros2 service call /fault_manager/clear_fault ros2_medkit_msgs/srv/ClearFault \
- **Persistent storage**: SQLite backend ensures faults survive node restarts
- **Debounce filtering** (optional): AUTOSAR DEM-style counter-based fault confirmation with per-entity threshold overrides
- **Snapshot capture**: Captures topic data when faults are confirmed for debugging (snapshots are deleted when fault is cleared)
- **Freeze-frame retention**: One compact JSON freeze-frame per fault code, retained across `clear_fault` (see below)
- **Fault correlation** (optional): Root cause analysis with symptom muting and auto-clear
- **Tamper-evident audit log** (optional): Append-only, hash-chained record of fault state transitions for verifiable history

Expand All @@ -71,6 +72,8 @@ ros2 service call /fault_manager/clear_fault ros2_medkit_msgs/srv/ClearFault \

Snapshots capture topic data when faults are confirmed for post-mortem debugging.

Each confirm also writes a **freeze-frame**: a single compact JSON object mapping every captured topic to its value at confirmation time, keyed by fault code. It differs from per-topic snapshots in two ways: snapshots are deleted when the fault is cleared, while the freeze-frame is retained across `clear_fault` (once the snapshots are gone, `~/get_fault` serves the retained frame so the confirmed-state record stays available after acknowledgement); and a re-confirm that captures nothing (e.g. source publishers down) never overwrites an existing non-empty frame. A fault code with no configured capture set gets no freeze-frame row; a configured capture that samples nothing on its first run records an empty `{}` frame. Freeze-frame storage is bounded by the number of distinct fault codes (one row per code, replaced in place) and rows are never evicted.

Under a fault storm, captures are bounded by a worker pool (`capture_pool_size`) draining a bounded queue (`capture_queue_depth`); excess captures are dropped per `capture_queue_full_policy` and logged (throttled). The pool is shared and is created when snapshots **or** rosbag is enabled, so these parameters bound both. `capture_pool_size` parallelizes freeze-frame snapshot capture only - rosbag is single-writer and records one fault at a time regardless of pool size.

| Parameter | Type | Default | Description |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,18 @@ struct SnapshotData {
int64_t captured_at_ns{0};
};

/// Compact freeze-frame captured when a fault confirms: a single JSON object mapping
/// each captured topic to its latest value at confirmation time. Unlike per-topic
/// snapshots, a freeze-frame is keyed by fault_code (one row per code) and is RETAINED
/// across clear_fault, so the confirmed-state record persists after acknowledgement.
/// A row exists only for fault codes with a configured capture set; a fault code with
/// no capture configured gets no row at all (lookup returns nullopt, never an empty {}).
struct FreezeFrameData {
std::string fault_code;
std::string data; ///< Compact JSON object: {"<topic>": <value>, ...}
int64_t captured_at_ns{0};
};

/// Rosbag file metadata for time-window recording
struct RosbagFileInfo {
std::string fault_code;
Expand Down Expand Up @@ -189,6 +201,21 @@ class FaultStorage {
virtual std::vector<SnapshotData> get_snapshots(const std::string & fault_code,
const std::string & topic_filter = "") const = 0;

/// Store the compact freeze-frame captured for a fault (JSON dict of topic values).
/// Keyed by fault_code: a later capture for the same code replaces the frame. The frame
/// is retained across clear_fault so the confirmed-state record survives acknowledgement.
/// Storage is bounded by the number of distinct fault codes (one row per code, replaced
/// in place); rows are never evicted. Faults themselves are never deleted (clear_fault
/// only flips status), so there is currently no delete hook to tie eviction to.
/// @param frame The freeze-frame to store
virtual void store_freeze_frame(const FreezeFrameData & frame) = 0;

/// Get the freeze-frame captured for a fault, if any.
/// @param fault_code The fault code to look up
/// @return The freeze-frame if one was captured, nullopt otherwise (including fault
/// codes with no capture configured, which never get a row)
virtual std::optional<FreezeFrameData> get_freeze_frame(const std::string & fault_code) const = 0;
Comment thread
mfaferek93 marked this conversation as resolved.

/// Store rosbag file metadata for a fault
/// @param info The rosbag file info to store (replaces any existing entry for fault_code)
virtual void store_rosbag_file(const RosbagFileInfo & info) = 0;
Expand Down Expand Up @@ -267,6 +294,9 @@ class InMemoryFaultStorage : public FaultStorage {
std::vector<SnapshotData> get_snapshots(const std::string & fault_code,
const std::string & topic_filter = "") const override;

void store_freeze_frame(const FreezeFrameData & frame) override;
std::optional<FreezeFrameData> get_freeze_frame(const std::string & fault_code) const override;

void store_rosbag_file(const RosbagFileInfo & info) override;
std::optional<RosbagFileInfo> get_rosbag_file(const std::string & fault_code) const override;
bool delete_rosbag_file(const std::string & fault_code) override;
Expand All @@ -283,7 +313,8 @@ class InMemoryFaultStorage : public FaultStorage {
mutable std::mutex mutex_;
std::map<std::string, FaultState> faults_;
std::vector<SnapshotData> snapshots_;
std::map<std::string, RosbagFileInfo> rosbag_files_; ///< fault_code -> rosbag info
std::map<std::string, FreezeFrameData> freeze_frames_; ///< fault_code -> freeze-frame (retained across clear)
std::map<std::string, RosbagFileInfo> rosbag_files_; ///< fault_code -> rosbag info
DebounceConfig config_;
size_t max_snapshots_per_fault_{0}; ///< 0 = unlimited
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include <string>
#include <vector>

#include <nlohmann/json.hpp>

#include "rclcpp/rclcpp.hpp"
#include "ros2_medkit_fault_manager/fault_storage.hpp"

Expand Down Expand Up @@ -147,6 +149,10 @@ class SnapshotCapture {
SnapshotCapture & operator=(SnapshotCapture &&) = delete;

/// Capture snapshots for a fault that was just confirmed
///
/// If the fault code resolves to no capture set (not in fault_specific, no pattern
/// match, no default_topics), capture returns early: no freeze_frames row is written
/// (no empty {} row) and FaultStorage::get_freeze_frame() returns nullopt for it.
/// @param fault_code The fault code that was confirmed
void capture(const std::string & fault_code);

Expand All @@ -166,12 +172,16 @@ class SnapshotCapture {
std::vector<std::string> resolve_topics(const std::string & fault_code) const;

/// Capture a single topic on-demand (creates temporary subscription)
/// On success also records the captured value into @p freeze_frame under the topic key.
/// @return true if capture was successful
bool capture_topic_on_demand(const std::string & fault_code, const std::string & topic);
bool capture_topic_on_demand(const std::string & fault_code, const std::string & topic,
nlohmann::json & freeze_frame);

/// Capture a topic from background cache
/// On success also records the cached value into @p freeze_frame under the topic key.
/// @return true if data was available in cache
bool capture_topic_from_cache(const std::string & fault_code, const std::string & topic);
bool capture_topic_from_cache(const std::string & fault_code, const std::string & topic,
nlohmann::json & freeze_frame);

/// Initialize background subscriptions for all configured topics
void init_background_subscriptions();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ class SqliteFaultStorage : public FaultStorage {
std::vector<SnapshotData> get_snapshots(const std::string & fault_code,
const std::string & topic_filter = "") const override;

void store_freeze_frame(const FreezeFrameData & frame) override;
std::optional<FreezeFrameData> get_freeze_frame(const std::string & fault_code) const override;

void store_rosbag_file(const RosbagFileInfo & info) override;
std::optional<RosbagFileInfo> get_rosbag_file(const std::string & fault_code) const override;
bool delete_rosbag_file(const std::string & fault_code) override;
Expand Down
16 changes: 16 additions & 0 deletions src/ros2_medkit_fault_manager/src/fault_manager_node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -988,6 +988,22 @@ void FaultManagerNode::handle_get_fault(const std::shared_ptr<ros2_medkit_msgs::
response->environment_data.snapshots.push_back(snapshot);
}

// Per-topic snapshots are deleted on clear_fault, but the compact freeze-frame row is
// retained. When no snapshots remain, serve the retained frame so the confirmed-state
// record stays observable after acknowledgement.
if (stored_snapshots.empty()) {
auto frame = storage_->get_freeze_frame(request->fault_code);
if (frame) {
ros2_medkit_msgs::msg::Snapshot snapshot;
snapshot.type = ros2_medkit_msgs::msg::Snapshot::TYPE_FREEZE_FRAME;
snapshot.name = "freeze_frame";
snapshot.data = frame->data;
// topic/message_type left empty: the frame aggregates values from multiple topics
snapshot.captured_at_ns = frame->captured_at_ns;
response->environment_data.snapshots.push_back(snapshot);
}
}

// Get rosbag info if available
auto rosbag_info = storage_->get_rosbag_file(request->fault_code);
if (rosbag_info) {
Expand Down
14 changes: 14 additions & 0 deletions src/ros2_medkit_fault_manager/src/fault_storage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,20 @@ std::vector<SnapshotData> InMemoryFaultStorage::get_snapshots(const std::string
return result;
}

void InMemoryFaultStorage::store_freeze_frame(const FreezeFrameData & frame) {
std::lock_guard<std::mutex> lock(mutex_);
freeze_frames_[frame.fault_code] = frame;
}

std::optional<FreezeFrameData> InMemoryFaultStorage::get_freeze_frame(const std::string & fault_code) const {
std::lock_guard<std::mutex> lock(mutex_);
auto it = freeze_frames_.find(fault_code);
if (it == freeze_frames_.end()) {
return std::nullopt;
}
return it->second;
}

void InMemoryFaultStorage::store_rosbag_file(const RosbagFileInfo & info) {
std::lock_guard<std::mutex> lock(mutex_);

Expand Down
49 changes: 45 additions & 4 deletions src/ros2_medkit_fault_manager/src/snapshot_capture.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,20 +124,26 @@ void SnapshotCapture::capture(const std::string & fault_code) {

auto topics = resolve_topics(fault_code);
if (topics.empty()) {
// Unconfigured fault code: resolve_topics() yielded nothing, so capture returns early.
// No freeze_frames row is written (we never persist an empty {} row) and no
// subscriptions are created. A freeze-frame lookup for such a fault returns not-found
// (get_freeze_frame() == nullopt); absence of a row means "no capture configured".
RCLCPP_DEBUG(node_->get_logger(), "No topics configured for fault '%s'", fault_code.c_str());
return;
Comment thread
mfaferek93 marked this conversation as resolved.
}

RCLCPP_INFO(node_->get_logger(), "Capturing snapshots for fault '%s' (%zu topics)", fault_code.c_str(),
topics.size());

// Accumulate a compact dict of captured topic values alongside the per-topic snapshots.
nlohmann::json freeze_frame = nlohmann::json::object();
size_t captured_count = 0;
for (const auto & topic : topics) {
bool success = false;
if (config_.background_capture) {
success = capture_topic_from_cache(fault_code, topic);
success = capture_topic_from_cache(fault_code, topic, freeze_frame);
} else {
success = capture_topic_on_demand(fault_code, topic);
success = capture_topic_on_demand(fault_code, topic, freeze_frame);
}

if (success) {
Expand All @@ -147,6 +153,27 @@ void SnapshotCapture::capture(const std::string & fault_code) {

RCLCPP_INFO(node_->get_logger(), "Captured %zu/%zu snapshots for fault '%s'", captured_count, topics.size(),
fault_code.c_str());

// Persist the compact freeze-frame keyed by fault_code (retained across clear_fault).
// Only reached for faults with a configured capture set (unconfigured codes returned
// early above and get no row). If nothing was publishing at confirmation time the row
// holds an empty object, recording that a configured capture ran but sampled nothing.
// An empty capture must never clobber an existing non-empty frame: a re-confirm while
// the source publishers are down would otherwise replace the retained record with {}.
if (captured_count == 0) {
auto existing = storage_->get_freeze_frame(fault_code);
if (existing.has_value() && existing->data != "{}") {
RCLCPP_WARN(node_->get_logger(), "Nothing captured for fault '%s'; keeping previously retained freeze-frame",
fault_code.c_str());
return;
}
}

FreezeFrameData frame;
frame.fault_code = fault_code;
frame.data = freeze_frame.dump();
frame.captured_at_ns = get_wall_clock_ns();
storage_->store_freeze_frame(frame);
Comment thread
mfaferek93 marked this conversation as resolved.
}

std::vector<std::string> SnapshotCapture::resolve_topics(const std::string & fault_code) const {
Expand Down Expand Up @@ -174,7 +201,8 @@ std::vector<std::string> SnapshotCapture::resolve_topics(const std::string & fau
return {};
}

bool SnapshotCapture::capture_topic_on_demand(const std::string & fault_code, const std::string & topic) {
bool SnapshotCapture::capture_topic_on_demand(const std::string & fault_code, const std::string & topic,
nlohmann::json & freeze_frame) {
// Get topic type
std::string msg_type = get_topic_type(topic);
if (msg_type.empty()) {
Expand Down Expand Up @@ -286,6 +314,9 @@ bool SnapshotCapture::capture_topic_on_demand(const std::string & fault_code, co

storage_->store_snapshot(snapshot);

// Record the value into the compact freeze-frame dict under the topic key.
freeze_frame[topic] = std::move(json_data);

RCLCPP_DEBUG(node_->get_logger(), "Captured snapshot from '%s' for fault '%s'", topic.c_str(), fault_code.c_str());
return true;

Expand All @@ -300,7 +331,8 @@ bool SnapshotCapture::capture_topic_on_demand(const std::string & fault_code, co
return false;
}

bool SnapshotCapture::capture_topic_from_cache(const std::string & fault_code, const std::string & topic) {
bool SnapshotCapture::capture_topic_from_cache(const std::string & fault_code, const std::string & topic,
nlohmann::json & freeze_frame) {
std::lock_guard<std::mutex> lock(cache_mutex_);

auto it = message_cache_.find(topic);
Expand All @@ -321,6 +353,15 @@ bool SnapshotCapture::capture_topic_from_cache(const std::string & fault_code, c

storage_->store_snapshot(snapshot);

// Record the cached value into the compact freeze-frame dict. The cache holds a serialized
// JSON string; parse it back so the frame nests structured values (fall back to the raw
// string only if it somehow fails to parse).
try {
freeze_frame[topic] = nlohmann::json::parse(cached.data);
} catch (const nlohmann::json::parse_error &) {
freeze_frame[topic] = cached.data;
}

RCLCPP_DEBUG(node_->get_logger(), "Captured snapshot from cache for '%s' (fault '%s')", topic.c_str(),
fault_code.c_str());
return true;
Expand Down
50 changes: 50 additions & 0 deletions src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,23 @@ void SqliteFaultStorage::initialize_schema() {
throw std::runtime_error("Failed to create snapshots table: " + error);
}

// Create freeze_frames table: one compact JSON dict of captured topic values per fault
// code. Unlike snapshots, freeze frames are keyed by fault_code and are NOT removed on
// clear_fault, so the confirmed-state record is retained after acknowledgement.
const char * create_freeze_frames_table_sql = R"(
CREATE TABLE IF NOT EXISTS freeze_frames (
fault_code TEXT PRIMARY KEY,
data TEXT NOT NULL,
captured_at_ns INTEGER NOT NULL
);
)";

if (sqlite3_exec(db_, create_freeze_frames_table_sql, nullptr, nullptr, &err_msg) != SQLITE_OK) {
std::string error = err_msg ? err_msg : "Unknown error";
sqlite3_free(err_msg);
throw std::runtime_error("Failed to create freeze_frames table: " + error);
}

// Create rosbag_files table for storing time-window bag file metadata
const char * create_rosbag_files_table_sql = R"(
CREATE TABLE IF NOT EXISTS rosbag_files (
Expand Down Expand Up @@ -795,6 +812,39 @@ std::vector<SnapshotData> SqliteFaultStorage::get_snapshots(const std::string &
return result;
}

void SqliteFaultStorage::store_freeze_frame(const FreezeFrameData & frame) {
std::lock_guard<std::mutex> lock(mutex_);

// Keyed by fault_code (PRIMARY KEY): a re-confirm replaces the previous frame.
SqliteStatement stmt(db_,
"INSERT OR REPLACE INTO freeze_frames (fault_code, data, captured_at_ns) "
"VALUES (?, ?, ?)");
stmt.bind_text(1, frame.fault_code);
stmt.bind_text(2, frame.data);
stmt.bind_int64(3, frame.captured_at_ns);

if (stmt.step() != SQLITE_DONE) {
throw std::runtime_error(std::string("Failed to store freeze frame: ") + sqlite3_errmsg(db_));
}
}

std::optional<FreezeFrameData> SqliteFaultStorage::get_freeze_frame(const std::string & fault_code) const {
std::lock_guard<std::mutex> lock(mutex_);

SqliteStatement stmt(db_, "SELECT fault_code, data, captured_at_ns FROM freeze_frames WHERE fault_code = ?");
stmt.bind_text(1, fault_code);

if (stmt.step() != SQLITE_ROW) {
return std::nullopt;
}

FreezeFrameData frame;
frame.fault_code = stmt.column_text(0);
frame.data = stmt.column_text(1);
frame.captured_at_ns = stmt.column_int64(2);
return frame;
}

void SqliteFaultStorage::store_rosbag_file(const RosbagFileInfo & info) {
std::lock_guard<std::mutex> lock(mutex_);

Expand Down
Loading
Loading