From 7344d686c765a004a202013898261b0fd7eba5fa Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Tue, 30 Jun 2026 18:32:17 +0200 Subject: [PATCH 1/9] Add tamper-evident append-only fault audit log Append-only, hash-chained audit log of fault state transitions in the fault manager. Each transition appends one immutable row with record_hash = sha256(prev_hash + canonical(event)) (OpenSSL EVP SHA-256), a persisted chain head that resumes across restarts, a verify routine, a read API, and retention that seals a segment anchor before pruning so the surviving tail stays verifiable. Off by default. Refs #483 --- src/ros2_medkit_fault_manager/CHANGELOG.rst | 4 + src/ros2_medkit_fault_manager/CMakeLists.txt | 8 + src/ros2_medkit_fault_manager/README.md | 18 + .../config/fault_manager.yaml | 14 + .../fault_audit_log.hpp | 144 +++++ .../fault_manager_node.hpp | 17 + src/ros2_medkit_fault_manager/package.xml | 1 + .../src/fault_audit_log.cpp | 522 ++++++++++++++++++ .../src/fault_manager_node.cpp | 108 +++- .../test/test_fault_audit_log.cpp | 270 +++++++++ .../test/test_fault_manager.cpp | 119 ++++ 11 files changed, 1224 insertions(+), 1 deletion(-) create mode 100644 src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp create mode 100644 src/ros2_medkit_fault_manager/src/fault_audit_log.cpp create mode 100644 src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp diff --git a/src/ros2_medkit_fault_manager/CHANGELOG.rst b/src/ros2_medkit_fault_manager/CHANGELOG.rst index a6130582..902b8447 100644 --- a/src/ros2_medkit_fault_manager/CHANGELOG.rst +++ b/src/ros2_medkit_fault_manager/CHANGELOG.rst @@ -2,6 +2,10 @@ Changelog for package ros2_medkit_fault_manager ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Forthcoming +----------- +* Optional tamper-evident, append-only audit log of fault state transitions: each transition appends one immutable, hash-chained row (``record_hash = sha256(prev_hash + canonical(event))`` via OpenSSL EVP SHA-256) with a persisted chain head, a ``verify`` routine, a read API, and retention that seals a segment anchor before pruning. Off by default (`#483 `_) + 0.6.0 (2026-06-22) ------------------ * Bounded concurrent snapshot capture under fault storms with a ``CaptureThreadPool`` and configurable capture pool / queue / overflow-policy parameters. The rosbag leg is serialized and the cooldown map is bounded, so a burst of simultaneous faults can no longer exhaust capture threads or grow memory without limit (`#456 `_) diff --git a/src/ros2_medkit_fault_manager/CMakeLists.txt b/src/ros2_medkit_fault_manager/CMakeLists.txt index 23c4924c..9421ed7c 100644 --- a/src/ros2_medkit_fault_manager/CMakeLists.txt +++ b/src/ros2_medkit_fault_manager/CMakeLists.txt @@ -41,6 +41,8 @@ find_package(ros2_medkit_msgs REQUIRED) find_package(ros2_medkit_serialization REQUIRED) find_package(SQLite3 REQUIRED) find_package(nlohmann_json REQUIRED) +# OpenSSL EVP SHA-256 for the tamper-evident audit log hash chain +find_package(OpenSSL REQUIRED) # yaml-cpp is required as transitive dependency from ros2_medkit_serialization medkit_find_yaml_cpp() # rosbag2 for time-window snapshot recording @@ -55,6 +57,7 @@ add_library(fault_manager_lib STATIC src/fault_manager_node.cpp src/fault_storage.cpp src/sqlite_fault_storage.cpp + src/fault_audit_log.cpp src/snapshot_capture.cpp src/rosbag_capture.cpp src/correlation/types.cpp @@ -81,6 +84,7 @@ target_link_libraries(fault_manager_lib PUBLIC SQLite::SQLite3 nlohmann_json::nlohmann_json yaml-cpp::yaml-cpp + OpenSSL::Crypto ) medkit_apply_compat_defs(fault_manager_lib) @@ -143,6 +147,10 @@ if(BUILD_TESTING) medkit_target_dependencies(test_sqlite_storage rclcpp ros2_medkit_msgs) medkit_set_test_domain(test_sqlite_storage) + # Fault audit log tests (hash chain, verify, rotation, reopen) + ament_add_gtest(test_fault_audit_log test/test_fault_audit_log.cpp) + target_link_libraries(test_fault_audit_log fault_manager_lib) + # Snapshot capture tests ament_add_gtest(test_snapshot_capture test/test_snapshot_capture.cpp) target_link_libraries(test_snapshot_capture fault_manager_lib) diff --git a/src/ros2_medkit_fault_manager/README.md b/src/ros2_medkit_fault_manager/README.md index aaaeb20e..908f6d5b 100644 --- a/src/ros2_medkit_fault_manager/README.md +++ b/src/ros2_medkit_fault_manager/README.md @@ -53,6 +53,7 @@ ros2 service call /fault_manager/clear_fault ros2_medkit_msgs/srv/ClearFault \ - **Debounce filtering** (optional): AUTOSAR DEM-style counter-based fault confirmation with per-entity threshold overrides - **Snapshot capture**: Captures topic data when faults are confirmed for debugging (snapshots are deleted when fault is cleared) - **Fault correlation** (optional): Root cause analysis with symptom muting and auto-clear +- **Tamper-evident audit log** (optional): Append-only, hash-chained record of fault state transitions for verifiable history ## Parameters @@ -109,6 +110,23 @@ patterns: **Memory**: Faults are stored in memory only. Useful for testing or when persistence is not required. +## Advanced: Tamper-Evident Audit Log + +An optional append-only, hash-chained audit log records every fault state transition (`occurred`, `confirmed`, `cleared`) so the fault history is verifiable and any later edit or deletion is detectable. It is **off by default** because it adds a write and storage cost per transition. + +Each transition appends one immutable row holding `record_hash = sha256(prev_hash + canonical(event))` (OpenSSL EVP SHA-256), the `prev_hash` it links to, and a monotonic `seq`. The hash is computed once at insert and never recomputed. A persisted chain head lets the chain resume across restarts. The log is stored in its own SQLite database (separate from the fault store) and is treated as append-only - the manager only ever inserts rows. + +`verify()` walks the persisted chain oldest-first and recomputes every link: editing a row breaks its `record_hash`, deleting a row breaks the next row's `prev_hash` linkage, and deleting the newest row is caught by the persisted-head check. + +**Retention/rotation**: when more than `audit_log.retention_max_records` rows are retained, the oldest segment is *sealed* (its final `seq` + hash are persisted as an anchor) and then pruned. The surviving tail still verifies because the oldest retained row links back to the sealed anchor. + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `audit_log.enabled` | bool | `false` | Enable the tamper-evident audit log | +| `audit_log.transitions` | string | `"all"` | Which transitions to record: `"all"` or `"confirmed_only"` | +| `audit_log.database_path` | string | `""` | SQLite path. Empty => sibling `fault_audit.db` next to the fault DB (or `:memory:` for in-memory fault stores) | +| `audit_log.retention_max_records` | int | `0` | Seal + prune the oldest segment beyond this many retained records (0 = unlimited) | + ## Usage ### Launch diff --git a/src/ros2_medkit_fault_manager/config/fault_manager.yaml b/src/ros2_medkit_fault_manager/config/fault_manager.yaml index d421c0fd..3f94e71c 100644 --- a/src/ros2_medkit_fault_manager/config/fault_manager.yaml +++ b/src/ros2_medkit_fault_manager/config/fault_manager.yaml @@ -27,3 +27,17 @@ fault_manager: # snapshots.capture_pool_size: 2 # max concurrent capture threads (>= 1) # snapshots.capture_queue_depth: 16 # max pending captures (>= 1) # snapshots.capture_queue_full_policy: reject_newest # reject_newest | drop_oldest + + # Tamper-evident, append-only audit log of fault state transitions + # (occurred/confirmed/cleared). OFF by default: it adds a write + storage cost + # per transition. When enabled, each transition appends one immutable, + # hash-chained row so any later edit or deletion is detectable via verify. + audit_log.enabled: false + # Which transitions to record: "all" or "confirmed_only". + # audit_log.transitions: all + # SQLite path. Empty => sibling "fault_audit.db" next to the fault DB + # (or :memory: when the fault store is in-memory). + # audit_log.database_path: "" + # Seal + prune the oldest segment beyond this many retained records + # (0 = unlimited). A sealed anchor keeps the surviving tail verifiable. + # audit_log.retention_max_records: 0 diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp new file mode 100644 index 00000000..59cca09d --- /dev/null +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp @@ -0,0 +1,144 @@ +// Copyright 2026 mfaferek93, bburda +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include + +namespace ros2_medkit_fault_manager { + +/// A single fault state-transition to record in the audit log. +/// +/// `transition` is one of the kTransition* constants below. The remaining +/// fields describe the fault at the moment of the transition; all of them feed +/// the canonical serialization that the hash chain is computed over, so any +/// later edit to a stored row is detectable. +struct AuditEvent { + std::string fault_code; + std::string transition; ///< occurred | confirmed | cleared | ack + uint8_t severity{0}; ///< severity at the time of the transition + std::string status; ///< resulting fault status (e.g. CONFIRMED) + std::string source_id; ///< reporting source that drove the transition + std::string description; ///< human-readable description + int64_t occurred_at_ns{0}; ///< wall-clock timestamp of the transition +}; + +/// Canonical transition kinds. Stored verbatim, so they are part of the hash. +constexpr const char * kTransitionOccurred = "occurred"; +constexpr const char * kTransitionConfirmed = "confirmed"; +constexpr const char * kTransitionCleared = "cleared"; +constexpr const char * kTransitionAck = "ack"; + +/// One immutable, hash-chained row read back from the audit log. +struct AuditRecord { + int64_t seq{0}; + AuditEvent event; + std::string prev_hash; + std::string record_hash; +}; + +/// Persisted head of the hash chain. +struct ChainHead { + int64_t seq{0}; ///< 0 when the chain is empty + std::string record_hash; ///< genesis hash when the chain is empty +}; + +/// Result of verifying the persisted chain. +struct AuditVerifyResult { + bool ok{true}; + int64_t checked{0}; ///< number of records walked + int64_t bad_seq{0}; ///< seq of the first offending record (0 if ok) + std::string error; ///< human-readable reason when !ok +}; + +/// Append-only, hash-chained audit log of fault state transitions. +/// +/// Each appended row stores `record_hash = sha256(prev_hash + canonical(event))` +/// using OpenSSL's EVP SHA-256 (the same primitive the gateway links). The hash +/// is computed once at insert and never recomputed. A persisted chain head lets +/// the chain resume across process restarts, and rotation seals a segment by +/// persisting an anchor (its final seq + hash) before pruning so the surviving +/// history stays verifiable. +/// +/// The table is treated as append-only: this class only ever INSERTs rows (and, +/// on rotation, deletes a sealed prefix). It never UPDATEs an existing record. +class FaultAuditLog { + public: + /// Open (or create) the audit log database. + /// @param db_path SQLite path. Use ":memory:" for an in-memory log. + /// @param retention_max_records Max records to retain before rotation seals + /// and prunes the oldest segment. 0 disables rotation (unlimited). + /// @throws std::runtime_error if the database cannot be opened or initialized. + explicit FaultAuditLog(const std::string & db_path, int64_t retention_max_records = 0); + + ~FaultAuditLog(); + + FaultAuditLog(const FaultAuditLog &) = delete; + FaultAuditLog & operator=(const FaultAuditLog &) = delete; + FaultAuditLog(FaultAuditLog &&) = delete; + FaultAuditLog & operator=(FaultAuditLog &&) = delete; + + /// Append one transition. Computes the chained hash, inserts the row, and + /// advances the persisted head, atomically. + /// @return the monotonic seq assigned to the new record. + int64_t append(const AuditEvent & event); + + /// Walk the persisted chain oldest-first and validate every link. + AuditVerifyResult verify() const; + + /// Read records oldest-first. + /// @param limit Max records to return (0 = all). + /// @param after_seq Only return records with seq > after_seq (0 = from start). + std::vector read(int64_t limit = 0, int64_t after_seq = 0) const; + + /// Current persisted chain head. + ChainHead head() const; + + /// Number of records currently retained (excludes pruned/sealed rows). + int64_t record_count() const; + + /// Deterministic canonical serialization of an event at a given seq. + /// Stable field order so verify is reproducible across processes. + static std::string canonicalize(int64_t seq, const AuditEvent & event); + + /// Genesis hash used as prev_hash for the very first record. + static std::string genesis_hash(); + + /// SHA-256 of `data` as a lowercase hex string (OpenSSL EVP). + static std::string sha256_hex(const std::string & data); + + const std::string & db_path() const { + return db_path_; + } + + private: + void initialize_schema(); + ChainHead load_head_locked() const; + void store_head_locked(const ChainHead & head_record); + /// Seal + prune the oldest segment if the retained count exceeds the limit. + void rotate_if_needed_locked(); + + std::string db_path_; + int64_t retention_max_records_{0}; + sqlite3 * db_{nullptr}; + mutable std::mutex mutex_; + ChainHead head_; ///< cached head, kept in sync with the head table +}; + +} // namespace ros2_medkit_fault_manager diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp index 9af908a7..7c603232 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp @@ -23,6 +23,7 @@ #include "ros2_medkit_fault_manager/capture_thread_pool.hpp" #include "ros2_medkit_fault_manager/correlation/correlation_engine.hpp" #include "ros2_medkit_fault_manager/entity_threshold_resolver.hpp" +#include "ros2_medkit_fault_manager/fault_audit_log.hpp" #include "ros2_medkit_fault_manager/fault_storage.hpp" #include "ros2_medkit_fault_manager/rosbag_capture.hpp" #include "ros2_medkit_fault_manager/snapshot_capture.hpp" @@ -148,6 +149,18 @@ class FaultManagerNode : public rclcpp::Node { /// Falls back to global config if no entity-specific overrides match. DebounceConfig resolve_config(const std::string & source_id) const; + /// Create the tamper-evident audit log from parameters (nullptr if disabled). + std::unique_ptr create_audit_log(); + + /// Append a fault state-transition to the audit log when enabled. No-op when + /// the log is off, or when only confirmations are logged and this is not one. + /// @param transition One of the kTransition* constants. + /// @param fault The fault state at the time of the transition. + /// @param source_id The reporting source that drove the transition. + /// @param occurred_at_ns Wall-clock timestamp of the transition. + void audit_transition(const char * transition, const ros2_medkit_msgs::msg::Fault & fault, + const std::string & source_id, int64_t occurred_at_ns); + std::string storage_type_; std::string database_path_; int32_t confirmation_threshold_{-1}; @@ -162,6 +175,10 @@ class FaultManagerNode : public rclcpp::Node { std::unique_ptr storage_; std::unique_ptr threshold_resolver_; ///< Per-entity threshold overrides + /// Tamper-evident audit log of fault transitions (nullptr when disabled). + std::unique_ptr audit_log_; + bool audit_confirmed_only_{false}; ///< When true, only "confirmed" transitions are logged + rclcpp::Service::SharedPtr report_fault_srv_; rclcpp::Service::SharedPtr list_faults_srv_; rclcpp::Service::SharedPtr get_fault_srv_; diff --git a/src/ros2_medkit_fault_manager/package.xml b/src/ros2_medkit_fault_manager/package.xml index 1658c527..723c7fed 100644 --- a/src/ros2_medkit_fault_manager/package.xml +++ b/src/ros2_medkit_fault_manager/package.xml @@ -16,6 +16,7 @@ ros2_medkit_serialization libsqlite3-dev nlohmann-json-dev + libssl-dev rosbag2_cpp rosbag2_storage diff --git a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp new file mode 100644 index 00000000..8799eddb --- /dev/null +++ b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp @@ -0,0 +1,522 @@ +// Copyright 2026 mfaferek93, bburda +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "ros2_medkit_fault_manager/fault_audit_log.hpp" + +#include + +#include +#include +#include +#include +#include + +namespace ros2_medkit_fault_manager { + +namespace { + +/// 64 hex zeros: prev_hash of the first record. A fixed, well-known anchor so a +/// verifier can confirm the chain starts where it claims to. +constexpr const char * kGenesisHash = "0000000000000000000000000000000000000000000000000000000000000000"; + +/// RAII wrapper for a prepared SQLite statement (audit-log local copy). +class Stmt { + public: + Stmt(sqlite3 * db, const char * sql) : db_(db) { + if (sqlite3_prepare_v2(db, sql, -1, &stmt_, nullptr) != SQLITE_OK) { + throw std::runtime_error(std::string("audit: failed to prepare: ") + sqlite3_errmsg(db)); + } + } + + ~Stmt() { + if (stmt_) { + sqlite3_finalize(stmt_); + } + } + + Stmt(const Stmt &) = delete; + Stmt & operator=(const Stmt &) = delete; + + void bind_text(int index, const std::string & value) { + if (value.size() > static_cast(std::numeric_limits::max())) { + throw std::runtime_error("audit: text too large to bind"); + } + if (sqlite3_bind_text(stmt_, index, value.c_str(), static_cast(value.size()), SQLITE_TRANSIENT) != SQLITE_OK) { + throw std::runtime_error(std::string("audit: failed to bind text: ") + sqlite3_errmsg(db_)); + } + } + + void bind_int64(int index, int64_t value) { + if (sqlite3_bind_int64(stmt_, index, value) != SQLITE_OK) { + throw std::runtime_error(std::string("audit: failed to bind int64: ") + sqlite3_errmsg(db_)); + } + } + + int step() { + return sqlite3_step(stmt_); + } + + std::string column_text(int index) { + const auto * text = reinterpret_cast(sqlite3_column_text(stmt_, index)); + return text ? std::string(text) : std::string(); + } + + int64_t column_int64(int index) { + return sqlite3_column_int64(stmt_, index); + } + + private: + sqlite3 * db_; + sqlite3_stmt * stmt_{nullptr}; +}; + +void exec_or_throw(sqlite3 * db, const char * sql, const char * what) { + char * err_msg = nullptr; + if (sqlite3_exec(db, sql, nullptr, nullptr, &err_msg) != SQLITE_OK) { + std::string error = err_msg ? err_msg : "unknown error"; + sqlite3_free(err_msg); + throw std::runtime_error(std::string("audit: ") + what + ": " + error); + } +} + +} // namespace + +FaultAuditLog::FaultAuditLog(const std::string & db_path, int64_t retention_max_records) + : db_path_(db_path), retention_max_records_(retention_max_records < 0 ? 0 : retention_max_records) { + int flags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE | SQLITE_OPEN_FULLMUTEX; + if (sqlite3_open_v2(db_path.c_str(), &db_, flags, nullptr) != SQLITE_OK) { + std::string error = db_ ? sqlite3_errmsg(db_) : "unknown error"; + if (db_) { + sqlite3_close(db_); + db_ = nullptr; + } + throw std::runtime_error("audit: failed to open '" + db_path + "': " + error); + } + + exec_or_throw(db_, "PRAGMA journal_mode=WAL;", "enable WAL"); + sqlite3_busy_timeout(db_, 5000); + + initialize_schema(); + head_ = load_head_locked(); +} + +FaultAuditLog::~FaultAuditLog() { + if (db_) { + sqlite3_close(db_); + } +} + +std::string FaultAuditLog::genesis_hash() { + return kGenesisHash; +} + +std::string FaultAuditLog::sha256_hex(const std::string & data) { + std::array md{}; + unsigned int md_len = 0; + + EVP_MD_CTX * ctx = EVP_MD_CTX_new(); + if (ctx == nullptr) { + throw std::runtime_error("audit: EVP_MD_CTX_new failed"); + } + const bool ok = EVP_DigestInit_ex(ctx, EVP_sha256(), nullptr) == 1 && + EVP_DigestUpdate(ctx, data.data(), data.size()) == 1 && + EVP_DigestFinal_ex(ctx, md.data(), &md_len) == 1; + EVP_MD_CTX_free(ctx); + if (!ok) { + throw std::runtime_error("audit: SHA-256 digest failed"); + } + + static constexpr char kHex[] = "0123456789abcdef"; + std::string out; + out.reserve(static_cast(md_len) * 2); + for (unsigned int i = 0; i < md_len; ++i) { + out.push_back(kHex[md[i] >> 4]); + out.push_back(kHex[md[i] & 0x0F]); + } + return out; +} + +std::string FaultAuditLog::canonicalize(int64_t seq, const AuditEvent & event) { + // Deterministic field order with JSON-escaped string values. Numeric fields + // render in fixed decimal form. This is the exact byte sequence the hash is + // taken over, so it must be stable across processes and re-reads. + std::string out; + out.reserve(192); + out += "{\"seq\":"; + out += std::to_string(seq); + out += ",\"ts\":"; + out += std::to_string(event.occurred_at_ns); + out += ",\"code\":"; + out += nlohmann::json(event.fault_code).dump(); + out += ",\"transition\":"; + out += nlohmann::json(event.transition).dump(); + out += ",\"severity\":"; + out += std::to_string(static_cast(event.severity)); + out += ",\"status\":"; + out += nlohmann::json(event.status).dump(); + out += ",\"source\":"; + out += nlohmann::json(event.source_id).dump(); + out += ",\"description\":"; + out += nlohmann::json(event.description).dump(); + out += "}"; + return out; +} + +void FaultAuditLog::initialize_schema() { + // Immutable transition rows. seq is the monotonic chain index. + exec_or_throw(db_, + R"( + CREATE TABLE IF NOT EXISTS audit_log ( + seq INTEGER PRIMARY KEY, + occurred_at_ns INTEGER NOT NULL, + fault_code TEXT NOT NULL, + transition TEXT NOT NULL, + severity INTEGER NOT NULL, + status TEXT NOT NULL, + source_id TEXT NOT NULL, + description TEXT NOT NULL, + prev_hash TEXT NOT NULL, + record_hash TEXT NOT NULL + ); + CREATE INDEX IF NOT EXISTS idx_audit_log_fault_code ON audit_log(fault_code); + )", + "create audit_log table"); + + // Single-row persisted chain head, so the chain resumes across restarts. + exec_or_throw(db_, + R"( + CREATE TABLE IF NOT EXISTS audit_chain_head ( + id INTEGER PRIMARY KEY CHECK (id = 1), + seq INTEGER NOT NULL, + record_hash TEXT NOT NULL + ); + )", + "create audit_chain_head table"); + + // Sealed-segment anchors written before pruning. Each captures the final + // (seq, hash) of a pruned prefix so the surviving tail stays verifiable. + exec_or_throw(db_, + R"( + CREATE TABLE IF NOT EXISTS audit_anchors ( + last_seq INTEGER PRIMARY KEY, + sealed_at_ns INTEGER NOT NULL, + last_hash TEXT NOT NULL + ); + )", + "create audit_anchors table"); +} + +ChainHead FaultAuditLog::load_head_locked() const { + // Prefer the persisted head row. + { + Stmt stmt(db_, "SELECT seq, record_hash FROM audit_chain_head WHERE id = 1"); + if (stmt.step() == SQLITE_ROW) { + ChainHead head_record; + head_record.seq = stmt.column_int64(0); + head_record.record_hash = stmt.column_text(1); + return head_record; + } + } + + // No head row. Recover from the last retained record if any exist (defensive: + // a crash between INSERT and head update would land here on reopen). + { + Stmt stmt(db_, "SELECT seq, record_hash FROM audit_log ORDER BY seq DESC LIMIT 1"); + if (stmt.step() == SQLITE_ROW) { + ChainHead head_record; + head_record.seq = stmt.column_int64(0); + head_record.record_hash = stmt.column_text(1); + return head_record; + } + } + + return ChainHead{0, genesis_hash()}; +} + +void FaultAuditLog::store_head_locked(const ChainHead & head_record) { + Stmt stmt(db_, + "INSERT INTO audit_chain_head (id, seq, record_hash) VALUES (1, ?, ?) " + "ON CONFLICT(id) DO UPDATE SET seq = excluded.seq, record_hash = excluded.record_hash"); + stmt.bind_int64(1, head_record.seq); + stmt.bind_text(2, head_record.record_hash); + if (stmt.step() != SQLITE_DONE) { + throw std::runtime_error(std::string("audit: failed to store head: ") + sqlite3_errmsg(db_)); + } +} + +int64_t FaultAuditLog::append(const AuditEvent & event) { + std::lock_guard lock(mutex_); + + const int64_t new_seq = head_.seq + 1; + const std::string prev_hash = head_.record_hash; + const std::string canonical = canonicalize(new_seq, event); + const std::string record_hash = sha256_hex(prev_hash + canonical); + + exec_or_throw(db_, "BEGIN IMMEDIATE", "begin append"); + try { + Stmt insert(db_, + "INSERT INTO audit_log (seq, occurred_at_ns, fault_code, transition, severity, status, " + "source_id, description, prev_hash, record_hash) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"); + insert.bind_int64(1, new_seq); + insert.bind_int64(2, event.occurred_at_ns); + insert.bind_text(3, event.fault_code); + insert.bind_text(4, event.transition); + insert.bind_int64(5, static_cast(event.severity)); + insert.bind_text(6, event.status); + insert.bind_text(7, event.source_id); + insert.bind_text(8, event.description); + insert.bind_text(9, prev_hash); + insert.bind_text(10, record_hash); + if (insert.step() != SQLITE_DONE) { + throw std::runtime_error(std::string("audit: failed to insert record: ") + sqlite3_errmsg(db_)); + } + + store_head_locked(ChainHead{new_seq, record_hash}); + exec_or_throw(db_, "COMMIT", "commit append"); + } catch (...) { + exec_or_throw(db_, "ROLLBACK", "rollback append"); + throw; + } + + head_ = ChainHead{new_seq, record_hash}; + + rotate_if_needed_locked(); + return new_seq; +} + +void FaultAuditLog::rotate_if_needed_locked() { + if (retention_max_records_ <= 0) { + return; + } + + // Count retained rows and find the oldest surviving seq. + int64_t count = 0; + int64_t min_seq = 0; + { + Stmt stmt(db_, "SELECT COUNT(*), COALESCE(MIN(seq), 0) FROM audit_log"); + if (stmt.step() == SQLITE_ROW) { + count = stmt.column_int64(0); + min_seq = stmt.column_int64(1); + } + } + if (count <= retention_max_records_) { + return; + } + + // Prune the oldest (count - retention_max_records_) rows. The boundary row is + // the highest seq being pruned; its hash becomes the sealed anchor that the + // first surviving row's prev_hash links back to. + const int64_t prune_count = count - retention_max_records_; + const int64_t boundary_seq = min_seq + prune_count - 1; + + std::string boundary_hash; + int64_t sealed_at_ns = 0; + { + Stmt stmt(db_, "SELECT record_hash, occurred_at_ns FROM audit_log WHERE seq = ?"); + stmt.bind_int64(1, boundary_seq); + if (stmt.step() != SQLITE_ROW) { + // Should not happen given the count; leave the log intact rather than + // prune without a valid anchor. + return; + } + boundary_hash = stmt.column_text(0); + sealed_at_ns = stmt.column_int64(1); + } + + exec_or_throw(db_, "BEGIN IMMEDIATE", "begin rotate"); + try { + Stmt anchor(db_, + "INSERT INTO audit_anchors (last_seq, sealed_at_ns, last_hash) VALUES (?, ?, ?) " + "ON CONFLICT(last_seq) DO NOTHING"); + anchor.bind_int64(1, boundary_seq); + anchor.bind_int64(2, sealed_at_ns); + anchor.bind_text(3, boundary_hash); + if (anchor.step() != SQLITE_DONE) { + throw std::runtime_error(std::string("audit: failed to write anchor: ") + sqlite3_errmsg(db_)); + } + + Stmt del(db_, "DELETE FROM audit_log WHERE seq <= ?"); + del.bind_int64(1, boundary_seq); + if (del.step() != SQLITE_DONE) { + throw std::runtime_error(std::string("audit: failed to prune records: ") + sqlite3_errmsg(db_)); + } + exec_or_throw(db_, "COMMIT", "commit rotate"); + } catch (...) { + exec_or_throw(db_, "ROLLBACK", "rollback rotate"); + throw; + } +} + +std::vector FaultAuditLog::read(int64_t limit, int64_t after_seq) const { + std::lock_guard lock(mutex_); + + std::string sql = + "SELECT seq, occurred_at_ns, fault_code, transition, severity, status, source_id, description, " + "prev_hash, record_hash FROM audit_log WHERE seq > ? ORDER BY seq ASC"; + if (limit > 0) { + sql += " LIMIT ?"; + } + + Stmt stmt(db_, sql.c_str()); + stmt.bind_int64(1, after_seq); + if (limit > 0) { + stmt.bind_int64(2, limit); + } + + std::vector result; + while (stmt.step() == SQLITE_ROW) { + AuditRecord rec; + rec.seq = stmt.column_int64(0); + rec.event.occurred_at_ns = stmt.column_int64(1); + rec.event.fault_code = stmt.column_text(2); + rec.event.transition = stmt.column_text(3); + rec.event.severity = static_cast(stmt.column_int64(4)); + rec.event.status = stmt.column_text(5); + rec.event.source_id = stmt.column_text(6); + rec.event.description = stmt.column_text(7); + rec.prev_hash = stmt.column_text(8); + rec.record_hash = stmt.column_text(9); + result.push_back(std::move(rec)); + } + return result; +} + +ChainHead FaultAuditLog::head() const { + std::lock_guard lock(mutex_); + return head_; +} + +int64_t FaultAuditLog::record_count() const { + std::lock_guard lock(mutex_); + Stmt stmt(db_, "SELECT COUNT(*) FROM audit_log"); + if (stmt.step() != SQLITE_ROW) { + return 0; + } + return stmt.column_int64(0); +} + +AuditVerifyResult FaultAuditLog::verify() const { + std::lock_guard lock(mutex_); + + AuditVerifyResult result; + + // Walk every retained row oldest-first, recomputing each link. + Stmt stmt(db_, + "SELECT seq, occurred_at_ns, fault_code, transition, severity, status, source_id, description, " + "prev_hash, record_hash FROM audit_log ORDER BY seq ASC"); + + bool first = true; + int64_t expected_seq = 0; + std::string expected_prev; + + while (stmt.step() == SQLITE_ROW) { + AuditRecord rec; + rec.seq = stmt.column_int64(0); + rec.event.occurred_at_ns = stmt.column_int64(1); + rec.event.fault_code = stmt.column_text(2); + rec.event.transition = stmt.column_text(3); + rec.event.severity = static_cast(stmt.column_int64(4)); + rec.event.status = stmt.column_text(5); + rec.event.source_id = stmt.column_text(6); + rec.event.description = stmt.column_text(7); + rec.prev_hash = stmt.column_text(8); + rec.record_hash = stmt.column_text(9); + + if (first) { + first = false; + // The first retained row must link to genesis (seq 1) or to a sealed + // anchor whose last_seq == rec.seq - 1. + if (rec.seq == 1) { + if (rec.prev_hash != genesis_hash()) { + result.ok = false; + result.bad_seq = rec.seq; + result.error = "first record does not link to genesis"; + return result; + } + } else { + Stmt anchor(db_, "SELECT last_hash FROM audit_anchors WHERE last_seq = ?"); + anchor.bind_int64(1, rec.seq - 1); + if (anchor.step() != SQLITE_ROW) { + result.ok = false; + result.bad_seq = rec.seq; + result.error = "no sealed anchor for the oldest retained record (history truncated)"; + return result; + } + const std::string anchor_hash = anchor.column_text(0); + if (rec.prev_hash != anchor_hash) { + result.ok = false; + result.bad_seq = rec.seq; + result.error = "oldest retained record does not link to its sealed anchor"; + return result; + } + } + expected_seq = rec.seq; + expected_prev = rec.prev_hash; + } else { + // Subsequent rows must be contiguous and chain to the previous record. + if (rec.seq != expected_seq) { + result.ok = false; + result.bad_seq = rec.seq; + result.error = "non-contiguous seq (record deleted or reordered)"; + return result; + } + if (rec.prev_hash != expected_prev) { + result.ok = false; + result.bad_seq = rec.seq; + result.error = "prev_hash does not match previous record_hash (chain broken)"; + return result; + } + } + + const std::string recomputed = sha256_hex(rec.prev_hash + canonicalize(rec.seq, rec.event)); + if (recomputed != rec.record_hash) { + result.ok = false; + result.bad_seq = rec.seq; + result.error = "record_hash mismatch (record tampered)"; + return result; + } + + ++result.checked; + expected_prev = rec.record_hash; + expected_seq = rec.seq + 1; + } + + // The persisted head must match the last retained record (catches deletion of + // the newest row, which the row walk alone cannot see). + if (result.checked > 0) { + if (head_.seq != expected_seq - 1 || head_.record_hash != expected_prev) { + result.ok = false; + result.bad_seq = head_.seq; + result.error = "persisted head does not match the last retained record"; + return result; + } + } else { + // Empty retained log: head must be either genesis (never written) or point + // at a sealed anchor (everything pruned). + if (head_.seq != 0) { + Stmt anchor(db_, "SELECT last_hash FROM audit_anchors WHERE last_seq = ?"); + anchor.bind_int64(1, head_.seq); + if (anchor.step() != SQLITE_ROW || anchor.column_text(0) != head_.record_hash) { + result.ok = false; + result.bad_seq = head_.seq; + result.error = "head references a record that is neither retained nor sealed"; + return result; + } + } + } + + return result; +} + +} // namespace ros2_medkit_fault_manager diff --git a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp index 462c6c21..782aec41 100644 --- a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp @@ -151,6 +151,9 @@ FaultManagerNode::FaultManagerNode(const rclcpp::NodeOptions & options) : Node(" // Create storage backend storage_ = create_storage(); + // Create the tamper-evident fault audit log (disabled by default) + audit_log_ = create_audit_log(); + // Apply snapshot limit to storage if (max_snapshots > 0) { storage_->set_max_snapshots_per_fault(static_cast(max_snapshots)); @@ -385,6 +388,91 @@ std::unique_ptr FaultManagerNode::create_storage() { return std::make_unique(); } +std::unique_ptr FaultManagerNode::create_audit_log() { + const bool enabled = declare_parameter("audit_log.enabled", false); + + // Which transitions to record: "all" (occurred/confirmed/cleared) or + // "confirmed_only". + const std::string transitions = declare_parameter("audit_log.transitions", "all"); + audit_confirmed_only_ = (transitions == "confirmed_only"); + if (transitions != "all" && transitions != "confirmed_only") { + RCLCPP_WARN(get_logger(), "audit_log.transitions '%s' invalid, using 'all'", transitions.c_str()); + audit_confirmed_only_ = false; + } + + // Retention: seal + prune the oldest segment beyond this many records (0 = off). + auto retention = declare_parameter("audit_log.retention_max_records", 0); + if (retention < 0) { + RCLCPP_WARN(get_logger(), "audit_log.retention_max_records < 0, disabling rotation"); + retention = 0; + } + + // Path: explicit override, else a sibling of the fault DB, else :memory:. + std::string audit_path = declare_parameter("audit_log.database_path", ""); + + if (!enabled) { + return nullptr; // No table, no file, no write overhead when off. + } + + if (audit_path.empty()) { + if (database_path_ == ":memory:" || storage_type_ != "sqlite") { + audit_path = ":memory:"; + } else { + std::filesystem::path base(database_path_); + audit_path = (base.parent_path() / "fault_audit.db").string(); + } + } + + if (audit_path != ":memory:") { + std::filesystem::path p(audit_path); + auto parent = p.parent_path(); + if (!parent.empty() && !std::filesystem::exists(parent)) { + try { + std::filesystem::create_directories(parent); + } catch (const std::filesystem::filesystem_error & e) { + RCLCPP_ERROR(get_logger(), "Failed to create audit log directory '%s': %s", parent.string().c_str(), e.what()); + throw; + } + } + } + + try { + auto log = std::make_unique(audit_path, retention); + RCLCPP_INFO(get_logger(), "Fault audit log enabled: %s (transitions=%s, retention=%ld, resume_seq=%ld)", + audit_path.c_str(), audit_confirmed_only_ ? "confirmed_only" : "all", retention, log->head().seq); + return log; + } catch (const std::exception & e) { + RCLCPP_ERROR(get_logger(), "Failed to open fault audit log '%s': %s", audit_path.c_str(), e.what()); + throw; + } +} + +void FaultManagerNode::audit_transition(const char * transition, const ros2_medkit_msgs::msg::Fault & fault, + const std::string & source_id, int64_t occurred_at_ns) { + if (!audit_log_) { + return; + } + if (audit_confirmed_only_ && std::string(transition) != kTransitionConfirmed) { + return; + } + + AuditEvent event; + event.fault_code = fault.fault_code; + event.transition = transition; + event.severity = fault.severity; + event.status = fault.status; + event.source_id = source_id; + event.description = fault.description; + event.occurred_at_ns = occurred_at_ns; + + try { + audit_log_->append(event); + } catch (const std::exception & e) { + RCLCPP_ERROR(get_logger(), "Failed to append audit record for '%s' (%s): %s", fault.fault_code.c_str(), transition, + e.what()); + } +} + void FaultManagerNode::handle_report_fault( const std::shared_ptr & request, const std::shared_ptr & response) { @@ -428,9 +516,10 @@ void FaultManagerNode::handle_report_fault( auto resolved_config = resolve_config(request->source_id); // Report the fault event (use wall clock time, not sim time, for proper timestamps) + const rclcpp::Time event_time = get_wall_clock_time(); bool is_new = storage_->report_fault_event(request->fault_code, request->event_type, request->severity, request->description, - request->source_id, get_wall_clock_time(), resolved_config); + request->source_id, event_time, resolved_config); response->accepted = true; @@ -487,6 +576,16 @@ void FaultManagerNode::handle_report_fault( } // Note: PREFAILED/PREPASSED status changes don't emit events (debounce in progress) + // Append tamper-evident audit records for the transitions that just happened. + // Recorded regardless of correlation muting: muting affects display, not the + // fact that the state transition occurred. + if (is_new) { + audit_transition(kTransitionOccurred, *fault_after, request->source_id, event_time.nanoseconds()); + } + if (just_confirmed) { + audit_transition(kTransitionConfirmed, *fault_after, request->source_id, event_time.nanoseconds()); + } + // Capture snapshots/rosbag when a fault confirms via the bounded pool (issue #441). // handle_report_fault runs on the single-threaded executor, so confirmations are // already serialized; last_capture_mutex_ only guards last_capture_times_ itself @@ -695,6 +794,12 @@ void FaultManagerNode::handle_clear_fault( // Auto-clear correlated symptoms for (const auto & symptom_code : auto_cleared_codes) { storage_->clear_fault(symptom_code); + if (audit_log_) { + auto symptom = storage_->get_fault(symptom_code); + if (symptom) { + audit_transition(kTransitionCleared, *symptom, "clear_service", get_wall_clock_time().nanoseconds()); + } + } RCLCPP_DEBUG(get_logger(), "Auto-cleared symptom: %s (root cause: %s)", symptom_code.c_str(), request->fault_code.c_str()); // Also cleanup rosbag for auto-cleared faults @@ -722,6 +827,7 @@ void FaultManagerNode::handle_clear_fault( auto fault = storage_->get_fault(request->fault_code); if (fault) { publish_fault_event(ros2_medkit_msgs::msg::FaultEvent::EVENT_CLEARED, *fault, auto_cleared_codes); + audit_transition(kTransitionCleared, *fault, "clear_service", get_wall_clock_time().nanoseconds()); } } else { response->message = "Fault not found: " + request->fault_code; diff --git a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp new file mode 100644 index 00000000..21fd3448 --- /dev/null +++ b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp @@ -0,0 +1,270 @@ +// Copyright 2026 mfaferek93, bburda +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include +#include +#include +#include + +#include "ros2_medkit_fault_manager/fault_audit_log.hpp" + +using ros2_medkit_fault_manager::AuditEvent; +using ros2_medkit_fault_manager::FaultAuditLog; +using ros2_medkit_fault_manager::kTransitionCleared; +using ros2_medkit_fault_manager::kTransitionConfirmed; +using ros2_medkit_fault_manager::kTransitionOccurred; + +namespace { + +AuditEvent make_event(const std::string & code, const char * transition, int64_t ts) { + AuditEvent e; + e.fault_code = code; + e.transition = transition; + e.severity = 2; + e.status = "CONFIRMED"; + e.source_id = "/robot/source"; + e.description = "pump pressure low"; + e.occurred_at_ns = ts; + return e; +} + +/// Run a single SQL statement directly against the audit DB file (used to +/// simulate tampering an immutable row). +void raw_exec(const std::string & db_path, const std::string & sql) { + sqlite3 * db = nullptr; + ASSERT_EQ(sqlite3_open(db_path.c_str(), &db), SQLITE_OK); + char * err = nullptr; + int rc = sqlite3_exec(db, sql.c_str(), nullptr, nullptr, &err); + std::string err_str = err ? err : ""; + sqlite3_free(err); + sqlite3_close(db); + ASSERT_EQ(rc, SQLITE_OK) << err_str; +} + +} // namespace + +class FaultAuditLogTest : public ::testing::Test { + protected: + void SetUp() override { + std::random_device rd; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution dist; + path_ = (std::filesystem::temp_directory_path() / ("test_audit_" + std::to_string(dist(gen)) + ".db")).string(); + } + + void TearDown() override { + std::error_code ec; + std::filesystem::remove(path_, ec); + std::filesystem::remove(path_ + "-wal", ec); + std::filesystem::remove(path_ + "-shm", ec); + } + + std::string path_; +}; + +// Each transition appends a chained row with a monotonic seq and linked hashes. +TEST_F(FaultAuditLogTest, AppendsChainedRowPerTransition) { + FaultAuditLog log(path_); + + EXPECT_EQ(log.append(make_event("F1", kTransitionOccurred, 100)), 1); + EXPECT_EQ(log.append(make_event("F1", kTransitionConfirmed, 200)), 2); + EXPECT_EQ(log.append(make_event("F1", kTransitionCleared, 300)), 3); + + auto records = log.read(); + ASSERT_EQ(records.size(), 3u); + + // First row links to genesis; each subsequent prev_hash equals the prior hash. + EXPECT_EQ(records[0].seq, 1); + EXPECT_EQ(records[0].prev_hash, FaultAuditLog::genesis_hash()); + EXPECT_EQ(records[1].prev_hash, records[0].record_hash); + EXPECT_EQ(records[2].prev_hash, records[1].record_hash); + + // record_hash is the sha256 of prev_hash + canonical(event). + const std::string expected = + FaultAuditLog::sha256_hex(records[0].prev_hash + FaultAuditLog::canonicalize(1, records[0].event)); + EXPECT_EQ(records[0].record_hash, expected); + + EXPECT_EQ(log.record_count(), 3); + EXPECT_EQ(log.head().seq, 3); + EXPECT_EQ(log.head().record_hash, records[2].record_hash); +} + +// Verify confirms an untampered chain. +TEST_F(FaultAuditLogTest, VerifyUntamperedChain) { + FaultAuditLog log(path_); + for (int i = 0; i < 10; ++i) { + log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); + } + auto result = log.verify(); + EXPECT_TRUE(result.ok) << result.error; + EXPECT_EQ(result.checked, 10); +} + +// Known SHA-256 vector proves the EVP wiring (sha256("") == e3b0c442...). +TEST_F(FaultAuditLogTest, Sha256KnownVector) { + EXPECT_EQ(FaultAuditLog::sha256_hex(""), + "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"); +} + +// Editing a past row makes verify fail. +TEST_F(FaultAuditLogTest, EditingPastRowFailsVerify) { + { + FaultAuditLog log(path_); + log.append(make_event("F1", kTransitionOccurred, 100)); + log.append(make_event("F2", kTransitionOccurred, 200)); + log.append(make_event("F3", kTransitionOccurred, 300)); + ASSERT_TRUE(log.verify().ok); + } + + // Tamper: change a stored field without recomputing the hash. + raw_exec(path_, "UPDATE audit_log SET description = 'forged' WHERE seq = 2"); + + FaultAuditLog reopened(path_); + auto result = reopened.verify(); + EXPECT_FALSE(result.ok); + EXPECT_EQ(result.bad_seq, 2); +} + +// Deleting a middle row makes verify fail (chain gap). +TEST_F(FaultAuditLogTest, DeletingMiddleRowFailsVerify) { + { + FaultAuditLog log(path_); + log.append(make_event("F1", kTransitionOccurred, 100)); + log.append(make_event("F2", kTransitionOccurred, 200)); + log.append(make_event("F3", kTransitionOccurred, 300)); + } + + raw_exec(path_, "DELETE FROM audit_log WHERE seq = 2"); + + FaultAuditLog reopened(path_); + EXPECT_FALSE(reopened.verify().ok); +} + +// Deleting the newest row is caught by the persisted head check. +TEST_F(FaultAuditLogTest, DeletingNewestRowFailsVerify) { + { + FaultAuditLog log(path_); + log.append(make_event("F1", kTransitionOccurred, 100)); + log.append(make_event("F2", kTransitionOccurred, 200)); + log.append(make_event("F3", kTransitionOccurred, 300)); + } + + // Drop the last row but leave the head pointing at seq 3. + raw_exec(path_, "DELETE FROM audit_log WHERE seq = 3"); + + FaultAuditLog reopened(path_); + EXPECT_FALSE(reopened.verify().ok); +} + +// The chain head persists across a reopen and the chain resumes from it. +TEST_F(FaultAuditLogTest, HeadPersistsAcrossReopen) { + std::string head_hash; + { + FaultAuditLog log(path_); + log.append(make_event("F1", kTransitionOccurred, 100)); + log.append(make_event("F2", kTransitionConfirmed, 200)); + head_hash = log.head().record_hash; + EXPECT_EQ(log.head().seq, 2); + } + + FaultAuditLog reopened(path_); + EXPECT_EQ(reopened.head().seq, 2); + EXPECT_EQ(reopened.head().record_hash, head_hash); + + // The next append continues the same chain. + EXPECT_EQ(reopened.append(make_event("F3", kTransitionCleared, 300)), 3); + auto records = reopened.read(); + ASSERT_EQ(records.size(), 3u); + EXPECT_EQ(records[2].prev_hash, head_hash); + EXPECT_TRUE(reopened.verify().ok); +} + +// Rotation seals a segment and prunes it, leaving the tail verifiable. +TEST_F(FaultAuditLogTest, RotationSealsAndPrunesButStaysVerifiable) { + FaultAuditLog log(path_, /*retention_max_records=*/5); + for (int i = 1; i <= 12; ++i) { + log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); + } + + // Only the most recent 5 rows are retained. + EXPECT_EQ(log.record_count(), 5); + auto records = log.read(); + ASSERT_EQ(records.size(), 5u); + EXPECT_EQ(records.front().seq, 8); + EXPECT_EQ(records.back().seq, 12); + EXPECT_EQ(log.head().seq, 12); + + // The surviving tail still verifies via the sealed anchor. + auto result = log.verify(); + EXPECT_TRUE(result.ok) << result.error; + EXPECT_EQ(result.checked, 5); +} + +// Tampering a row inside a rotated tail is still detected. +TEST_F(FaultAuditLogTest, RotationThenTamperFails) { + { + FaultAuditLog log(path_, /*retention_max_records=*/5); + for (int i = 1; i <= 12; ++i) { + log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); + } + } + raw_exec(path_, "UPDATE audit_log SET source_id = 'forged' WHERE seq = 10"); + + FaultAuditLog reopened(path_, 5); + auto result = reopened.verify(); + EXPECT_FALSE(result.ok); + EXPECT_EQ(result.bad_seq, 10); +} + +// Removing the sealed anchor breaks the link for the oldest retained row. +TEST_F(FaultAuditLogTest, MissingAnchorFailsVerify) { + { + FaultAuditLog log(path_, /*retention_max_records=*/5); + for (int i = 1; i <= 12; ++i) { + log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); + } + } + raw_exec(path_, "DELETE FROM audit_anchors"); + + FaultAuditLog reopened(path_, 5); + EXPECT_FALSE(reopened.verify().ok); +} + +// canonicalize is deterministic and order-stable. +TEST_F(FaultAuditLogTest, CanonicalizeDeterministic) { + auto e = make_event("F1", kTransitionConfirmed, 12345); + EXPECT_EQ(FaultAuditLog::canonicalize(7, e), FaultAuditLog::canonicalize(7, e)); + // seq is part of the canonical form, so a different seq changes the bytes. + EXPECT_NE(FaultAuditLog::canonicalize(7, e), FaultAuditLog::canonicalize(8, e)); +} + +// read(after_seq) returns only newer records, oldest-first. +TEST_F(FaultAuditLogTest, ReadAfterSeq) { + FaultAuditLog log(path_); + for (int i = 1; i <= 5; ++i) { + log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); + } + auto tail = log.read(/*limit=*/0, /*after_seq=*/3); + ASSERT_EQ(tail.size(), 2u); + EXPECT_EQ(tail[0].seq, 4); + EXPECT_EQ(tail[1].seq, 5); +} + +int main(int argc, char ** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp index e2b8fe39..c00e3b7c 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp @@ -16,12 +16,15 @@ #include #include +#include #include #include +#include #include #include #include "rclcpp/rclcpp.hpp" +#include "ros2_medkit_fault_manager/fault_audit_log.hpp" #include "ros2_medkit_fault_manager/fault_manager_node.hpp" #include "ros2_medkit_fault_manager/fault_storage.hpp" #include "ros2_medkit_msgs/msg/fault.hpp" @@ -1516,6 +1519,122 @@ TEST_F(SnapshotCooldownTest, CooldownPreventsRapidRecapture) { SUCCEED(); } +// End-to-end check that the node hooks the audit log on the fault write path. +// Reports a CRITICAL fault (immediate confirm => occurred + confirmed), then +// clears it (=> cleared), and inspects the persisted audit DB by reopening it. +class FaultAuditIntegrationTest : public ::testing::Test { + protected: + void SetUp() override { + std::random_device rd; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution dist; + audit_path_ = (std::filesystem::temp_directory_path() / ("test_node_audit_" + std::to_string(dist(gen)) + ".db")) + .string(); + ns_ = "/test_audit_" + std::to_string(dist(gen)); + + rclcpp::NodeOptions fm_options; + fm_options.parameter_overrides({ + {"storage_type", "memory"}, + {"confirmation_threshold", -1}, + {"audit_log.enabled", true}, + {"audit_log.database_path", audit_path_}, + }); + fm_options.arguments({"--ros-args", "-r", "__ns:=" + ns_}); + fault_manager_ = std::make_shared(fm_options); + + rclcpp::NodeOptions test_options; + test_options.arguments({"--ros-args", "-r", "__ns:=" + ns_}); + test_node_ = std::make_shared("test_audit_client", test_options); + + report_client_ = test_node_->create_client(ns_ + "/fault_manager/report_fault"); + clear_client_ = test_node_->create_client(ns_ + "/fault_manager/clear_fault"); + ASSERT_TRUE(report_client_->wait_for_service(std::chrono::seconds(5))); + ASSERT_TRUE(clear_client_->wait_for_service(std::chrono::seconds(5))); + } + + void TearDown() override { + report_client_.reset(); + clear_client_.reset(); + test_node_.reset(); + fault_manager_.reset(); + std::error_code ec; + std::filesystem::remove(audit_path_, ec); + std::filesystem::remove(audit_path_ + "-wal", ec); + std::filesystem::remove(audit_path_ + "-shm", ec); + } + + template + bool spin_until_ready(FutureT & future) { + auto start = std::chrono::steady_clock::now(); + while (std::chrono::steady_clock::now() - start < std::chrono::seconds(2)) { + rclcpp::spin_some(fault_manager_); + rclcpp::spin_some(test_node_); + if (future.wait_for(std::chrono::milliseconds(0)) == std::future_status::ready) { + return true; + } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + return false; + } + + std::string audit_path_; + std::string ns_; + std::shared_ptr fault_manager_; + std::shared_ptr test_node_; + rclcpp::Client::SharedPtr report_client_; + rclcpp::Client::SharedPtr clear_client_; +}; + +TEST_F(FaultAuditIntegrationTest, TransitionsAppendVerifiableChain) { + auto report = std::make_shared(); + report->fault_code = "AUDIT_FAULT"; + report->event_type = ReportFault::Request::EVENT_FAILED; + report->severity = Fault::SEVERITY_CRITICAL; // immediate confirm + report->description = "overpressure"; + report->source_id = "/plc/pump"; + auto rf = report_client_->async_send_request(report); + ASSERT_TRUE(spin_until_ready(rf)); + ASSERT_TRUE(rf.get()->accepted); + + auto clear = std::make_shared(); + clear->fault_code = "AUDIT_FAULT"; + auto cf = clear_client_->async_send_request(clear); + ASSERT_TRUE(spin_until_ready(cf)); + ASSERT_TRUE(cf.get()->success); + + // Reopen the audit DB independently and inspect the persisted chain. + ros2_medkit_fault_manager::FaultAuditLog audit(audit_path_); + auto records = audit.read(); + ASSERT_EQ(records.size(), 3u); + EXPECT_EQ(records[0].event.transition, ros2_medkit_fault_manager::kTransitionOccurred); + EXPECT_EQ(records[1].event.transition, ros2_medkit_fault_manager::kTransitionConfirmed); + EXPECT_EQ(records[2].event.transition, ros2_medkit_fault_manager::kTransitionCleared); + EXPECT_EQ(records[0].event.fault_code, "AUDIT_FAULT"); + EXPECT_EQ(records[1].event.source_id, "/plc/pump"); + + auto result = audit.verify(); + EXPECT_TRUE(result.ok) << result.error; + EXPECT_EQ(result.checked, 3); +} + +TEST(FaultAuditDisabledTest, NoAuditFileWhenDisabled) { + std::random_device rd; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution dist; + const std::string audit_path = + (std::filesystem::temp_directory_path() / ("test_audit_off_" + std::to_string(dist(gen)) + ".db")).string(); + + rclcpp::NodeOptions options; + options.parameter_overrides({ + {"storage_type", "memory"}, + {"audit_log.database_path", audit_path}, // default enabled=false + }); + auto node = std::make_shared(options); + + // With the feature off, no audit database file is created. + EXPECT_FALSE(std::filesystem::exists(audit_path)); +} + int main(int argc, char ** argv) { rclcpp::init(argc, argv); ::testing::InitGoogleTest(&argc, argv); From d0b51247aaf37fccdc046f5eadf604c96f41a22d Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Tue, 30 Jun 2026 16:36:35 +0000 Subject: [PATCH 2/9] style: apply clang-format-18 --- .../fault_audit_log.hpp | 20 +++++++++---------- .../src/fault_audit_log.cpp | 2 +- .../src/fault_manager_node.cpp | 5 ++--- .../test/test_fault_audit_log.cpp | 3 +-- .../test/test_fault_manager.cpp | 7 +++---- 5 files changed, 17 insertions(+), 20 deletions(-) diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp index 59cca09d..039a12e5 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp @@ -31,11 +31,11 @@ namespace ros2_medkit_fault_manager { /// later edit to a stored row is detectable. struct AuditEvent { std::string fault_code; - std::string transition; ///< occurred | confirmed | cleared | ack - uint8_t severity{0}; ///< severity at the time of the transition - std::string status; ///< resulting fault status (e.g. CONFIRMED) - std::string source_id; ///< reporting source that drove the transition - std::string description; ///< human-readable description + std::string transition; ///< occurred | confirmed | cleared | ack + uint8_t severity{0}; ///< severity at the time of the transition + std::string status; ///< resulting fault status (e.g. CONFIRMED) + std::string source_id; ///< reporting source that drove the transition + std::string description; ///< human-readable description int64_t occurred_at_ns{0}; ///< wall-clock timestamp of the transition }; @@ -55,16 +55,16 @@ struct AuditRecord { /// Persisted head of the hash chain. struct ChainHead { - int64_t seq{0}; ///< 0 when the chain is empty - std::string record_hash; ///< genesis hash when the chain is empty + int64_t seq{0}; ///< 0 when the chain is empty + std::string record_hash; ///< genesis hash when the chain is empty }; /// Result of verifying the persisted chain. struct AuditVerifyResult { bool ok{true}; - int64_t checked{0}; ///< number of records walked - int64_t bad_seq{0}; ///< seq of the first offending record (0 if ok) - std::string error; ///< human-readable reason when !ok + int64_t checked{0}; ///< number of records walked + int64_t bad_seq{0}; ///< seq of the first offending record (0 if ok) + std::string error; ///< human-readable reason when !ok }; /// Append-only, hash-chained audit log of fault state transitions. diff --git a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp index 8799eddb..8f146704 100644 --- a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp @@ -93,7 +93,7 @@ void exec_or_throw(sqlite3 * db, const char * sql, const char * what) { } // namespace FaultAuditLog::FaultAuditLog(const std::string & db_path, int64_t retention_max_records) - : db_path_(db_path), retention_max_records_(retention_max_records < 0 ? 0 : retention_max_records) { + : db_path_(db_path), retention_max_records_(retention_max_records < 0 ? 0 : retention_max_records) { int flags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE | SQLITE_OPEN_FULLMUTEX; if (sqlite3_open_v2(db_path.c_str(), &db_, flags, nullptr) != SQLITE_OK) { std::string error = db_ ? sqlite3_errmsg(db_) : "unknown error"; diff --git a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp index 782aec41..b1665fbd 100644 --- a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp @@ -517,9 +517,8 @@ void FaultManagerNode::handle_report_fault( // Report the fault event (use wall clock time, not sim time, for proper timestamps) const rclcpp::Time event_time = get_wall_clock_time(); - bool is_new = - storage_->report_fault_event(request->fault_code, request->event_type, request->severity, request->description, - request->source_id, event_time, resolved_config); + bool is_new = storage_->report_fault_event(request->fault_code, request->event_type, request->severity, + request->description, request->source_id, event_time, resolved_config); response->accepted = true; diff --git a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp index 21fd3448..b0ac54b9 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp @@ -116,8 +116,7 @@ TEST_F(FaultAuditLogTest, VerifyUntamperedChain) { // Known SHA-256 vector proves the EVP wiring (sha256("") == e3b0c442...). TEST_F(FaultAuditLogTest, Sha256KnownVector) { - EXPECT_EQ(FaultAuditLog::sha256_hex(""), - "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"); + EXPECT_EQ(FaultAuditLog::sha256_hex(""), "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"); } // Editing a past row makes verify fail. diff --git a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp index c00e3b7c..27ada404 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp @@ -1528,8 +1528,8 @@ class FaultAuditIntegrationTest : public ::testing::Test { std::random_device rd; std::mt19937_64 gen(rd()); std::uniform_int_distribution dist; - audit_path_ = (std::filesystem::temp_directory_path() / ("test_node_audit_" + std::to_string(dist(gen)) + ".db")) - .string(); + audit_path_ = + (std::filesystem::temp_directory_path() / ("test_node_audit_" + std::to_string(dist(gen)) + ".db")).string(); ns_ = "/test_audit_" + std::to_string(dist(gen)); rclcpp::NodeOptions fm_options; @@ -1626,8 +1626,7 @@ TEST(FaultAuditDisabledTest, NoAuditFileWhenDisabled) { rclcpp::NodeOptions options; options.parameter_overrides({ - {"storage_type", "memory"}, - {"audit_log.database_path", audit_path}, // default enabled=false + {"storage_type", "memory"}, {"audit_log.database_path", audit_path}, // default enabled=false }); auto node = std::make_shared(options); From 3423ccdf6de94715472eb86814466ec63a231531 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Tue, 30 Jun 2026 19:10:24 +0200 Subject: [PATCH 3/9] fix(fault-audit): close truncation bypass and audit timer confirmations verify() now reads audit_chain_head directly and fails when the head row is missing on a non-empty log, so deleting the newest row plus the head is caught. Timer-driven PREFAILED->CONFIRMED confirmations now call audit_transition. Adds append-only triggers (hardening, not a security boundary) and corrects the unkeyed/single-file threat model in the docs. Refs #483 --- src/ros2_medkit_fault_manager/CHANGELOG.rst | 2 +- src/ros2_medkit_fault_manager/README.md | 8 +- .../config/fault_manager.yaml | 8 +- .../fault_audit_log.hpp | 18 ++- .../fault_manager_node.hpp | 10 ++ .../fault_storage.hpp | 6 +- .../sqlite_fault_storage.hpp | 2 +- .../src/fault_audit_log.cpp | 114 ++++++++++++------ .../src/fault_manager_node.cpp | 16 ++- .../src/fault_storage.cpp | 10 +- .../src/sqlite_fault_storage.cpp | 24 +++- .../test/test_fault_audit_log.cpp | 79 +++++++++++- .../test/test_fault_manager.cpp | 63 +++++++++- .../test/test_sqlite_storage.cpp | 9 +- 14 files changed, 293 insertions(+), 76 deletions(-) diff --git a/src/ros2_medkit_fault_manager/CHANGELOG.rst b/src/ros2_medkit_fault_manager/CHANGELOG.rst index 902b8447..1e884c91 100644 --- a/src/ros2_medkit_fault_manager/CHANGELOG.rst +++ b/src/ros2_medkit_fault_manager/CHANGELOG.rst @@ -4,7 +4,7 @@ Changelog for package ros2_medkit_fault_manager Forthcoming ----------- -* Optional tamper-evident, append-only audit log of fault state transitions: each transition appends one immutable, hash-chained row (``record_hash = sha256(prev_hash + canonical(event))`` via OpenSSL EVP SHA-256) with a persisted chain head, a ``verify`` routine, a read API, and retention that seals a segment anchor before pruning. Off by default (`#483 `_) +* Optional append-only, hash-chained audit log of fault state transitions: each transition appends one immutable row (``record_hash = sha256(prev_hash + canonical(event))`` via OpenSSL EVP SHA-256) with a persisted chain head, a ``verify`` routine, a read API, and retention that seals a segment anchor before pruning. Time-based (PREFAILED->CONFIRMED) auto-confirmations are also audited. ``verify`` reads the chain head directly from the database, so deleting the newest row together with the head row is reported as tampering instead of silently recovering. ``BEFORE UPDATE`` / ``BEFORE DELETE`` triggers reject out-of-band edits as defense-in-depth. The chain is unkeyed and stored in a single writable file, so ``verify`` detects edits/deletions that did not recompute the chain (casual or accidental tampering); it is not a defence against an attacker who can rewrite the whole file. Off by default (`#483 `_) 0.6.0 (2026-06-22) ------------------ diff --git a/src/ros2_medkit_fault_manager/README.md b/src/ros2_medkit_fault_manager/README.md index 908f6d5b..3e827aea 100644 --- a/src/ros2_medkit_fault_manager/README.md +++ b/src/ros2_medkit_fault_manager/README.md @@ -112,11 +112,13 @@ patterns: ## Advanced: Tamper-Evident Audit Log -An optional append-only, hash-chained audit log records every fault state transition (`occurred`, `confirmed`, `cleared`) so the fault history is verifiable and any later edit or deletion is detectable. It is **off by default** because it adds a write and storage cost per transition. +An optional append-only, hash-chained audit log records every fault state transition (`occurred`, `confirmed`, `cleared`) so the fault history is independently verifiable. It is **off by default** because it adds a write and storage cost per transition. -Each transition appends one immutable row holding `record_hash = sha256(prev_hash + canonical(event))` (OpenSSL EVP SHA-256), the `prev_hash` it links to, and a monotonic `seq`. The hash is computed once at insert and never recomputed. A persisted chain head lets the chain resume across restarts. The log is stored in its own SQLite database (separate from the fault store) and is treated as append-only - the manager only ever inserts rows. +Each transition appends one immutable row holding `record_hash = sha256(prev_hash + canonical(event))` (OpenSSL EVP SHA-256), the `prev_hash` it links to, and a monotonic `seq`. The hash is computed once at insert and never recomputed. A persisted chain head lets the chain resume across restarts. The log is stored in its own SQLite database (separate from the fault store) and is treated as append-only: the manager only ever inserts rows, and `BEFORE UPDATE` / `BEFORE DELETE` triggers reject out-of-band edits (the guarded rotation prune excepted). -`verify()` walks the persisted chain oldest-first and recomputes every link: editing a row breaks its `record_hash`, deleting a row breaks the next row's `prev_hash` linkage, and deleting the newest row is caught by the persisted-head check. +`verify()` walks the persisted chain oldest-first and recomputes every link: editing a row breaks its `record_hash`, deleting a row breaks the next row's `prev_hash` linkage, and deleting the newest row (the head row is read straight from the DB) is caught by the persisted-head check. + +**Threat model (read this).** The chain is **unkeyed**, and the head and segment anchors live in the **same writable SQLite file** as the rows. `verify()` therefore catches edits or deletions that did **not** also recompute the chain - that is, casual or accidental tampering, and the bookkeeping bugs that would otherwise lose records. It does **not** stop an attacker with write access to the file: such an attacker can drop the triggers and recompute the entire chain (and head and anchors) to forge a self-consistent history. The append-only triggers are defense-in-depth, **not** a security boundary. True tamper-*proofing* requires a key or signature over the head (so it cannot be recomputed without the key) or external anchoring of the head hash to an append-only store you do not control; both are out of scope here and belong to the audit-log exporter / signing follow-up. **Retention/rotation**: when more than `audit_log.retention_max_records` rows are retained, the oldest segment is *sealed* (its final `seq` + hash are persisted as an anchor) and then pruned. The surviving tail still verifies because the oldest retained row links back to the sealed anchor. diff --git a/src/ros2_medkit_fault_manager/config/fault_manager.yaml b/src/ros2_medkit_fault_manager/config/fault_manager.yaml index 3f94e71c..13d1f57a 100644 --- a/src/ros2_medkit_fault_manager/config/fault_manager.yaml +++ b/src/ros2_medkit_fault_manager/config/fault_manager.yaml @@ -28,10 +28,14 @@ fault_manager: # snapshots.capture_queue_depth: 16 # max pending captures (>= 1) # snapshots.capture_queue_full_policy: reject_newest # reject_newest | drop_oldest - # Tamper-evident, append-only audit log of fault state transitions + # Append-only, hash-chained audit log of fault state transitions # (occurred/confirmed/cleared). OFF by default: it adds a write + storage cost # per transition. When enabled, each transition appends one immutable, - # hash-chained row so any later edit or deletion is detectable via verify. + # hash-chained row, so verify() detects edits or deletions that did NOT also + # recompute the chain (casual/accidental tampering). The chain is unkeyed and + # lives in a single writable file, so it is NOT proof against an attacker who + # can rewrite the whole file; true tamper-proofing needs a signed head or + # external anchoring (out of scope here). See README "Threat model". audit_log.enabled: false # Which transitions to record: "all" or "confirmed_only". # audit_log.transitions: all diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp index 039a12e5..457bca9e 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -27,8 +28,8 @@ namespace ros2_medkit_fault_manager { /// /// `transition` is one of the kTransition* constants below. The remaining /// fields describe the fault at the moment of the transition; all of them feed -/// the canonical serialization that the hash chain is computed over, so any -/// later edit to a stored row is detectable. +/// the canonical serialization that the hash chain is computed over, so an edit +/// to a stored row that does not also recompute the chain breaks verify(). struct AuditEvent { std::string fault_code; std::string transition; ///< occurred | confirmed | cleared | ack @@ -78,6 +79,15 @@ struct AuditVerifyResult { /// /// The table is treated as append-only: this class only ever INSERTs rows (and, /// on rotation, deletes a sealed prefix). It never UPDATEs an existing record. +/// BEFORE UPDATE / BEFORE DELETE triggers reject out-of-band edits (the guarded +/// rotation prune excepted) as defense-in-depth. +/// +/// Threat model: the hash chain is UNKEYED and the head/anchors live in the same +/// writable file. verify() catches edits or deletions that did not also recompute +/// the chain (casual or accidental tampering), but anyone with write access to the +/// file can recompute the whole chain (and drop the triggers) and forge a +/// consistent history. True tamper-proofing needs a key/signature over the head or +/// external anchoring; that is out of scope here. class FaultAuditLog { public: /// Open (or create) the audit log database. @@ -129,6 +139,10 @@ class FaultAuditLog { private: void initialize_schema(); + /// Read the persisted chain head row (audit_chain_head id=1) straight from the + /// DB. Returns nullopt when the row is absent. verify() relies on this so a + /// deleted head row is treated as tampering rather than silently recovered. + std::optional read_head_row_locked() const; ChainHead load_head_locked() const; void store_head_locked(const ChainHead & head_record); /// Seal + prune the oldest segment if the retained count exceeds the limit. diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp index 7c603232..61cf227c 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp @@ -62,6 +62,16 @@ class FaultManagerNode : public rclcpp::Node { return *storage_; } + /// Get mutable access to fault storage (for testing only). + FaultStorage & get_storage_for_test() { + return *storage_; + } + + /// Get the tamper-evident audit log (nullptr when disabled), for testing only. + const FaultAuditLog * get_audit_log_for_test() const { + return audit_log_.get(); + } + /// Get the storage type being used const std::string & get_storage_type() const { return storage_type_; diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp index 34feffc7..2f376eb9 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp @@ -171,8 +171,8 @@ class FaultStorage { /// Check and confirm PREFAILED faults that have been pending too long (time-based confirmation) /// @param current_time Current timestamp for age calculation - /// @return Number of faults that were confirmed - virtual size_t check_time_based_confirmation(const rclcpp::Time & current_time) = 0; + /// @return Fault codes that were confirmed by this call (so the caller can audit each). + virtual std::vector check_time_based_confirmation(const rclcpp::Time & current_time) = 0; /// Set maximum snapshots per fault code (0 = unlimited) virtual void set_max_snapshots_per_fault(size_t /*max_count*/) { @@ -259,7 +259,7 @@ class InMemoryFaultStorage : public FaultStorage { bool contains(const std::string & fault_code) const override; - size_t check_time_based_confirmation(const rclcpp::Time & current_time) override; + std::vector check_time_based_confirmation(const rclcpp::Time & current_time) override; void set_max_snapshots_per_fault(size_t max_count) override; diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/sqlite_fault_storage.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/sqlite_fault_storage.hpp index 7f2239e0..fb4006c9 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/sqlite_fault_storage.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/sqlite_fault_storage.hpp @@ -59,7 +59,7 @@ class SqliteFaultStorage : public FaultStorage { bool contains(const std::string & fault_code) const override; - size_t check_time_based_confirmation(const rclcpp::Time & current_time) override; + std::vector check_time_based_confirmation(const rclcpp::Time & current_time) override; void set_max_snapshots_per_fault(size_t max_count) override; diff --git a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp index 8f146704..88710a53 100644 --- a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -215,32 +216,54 @@ void FaultAuditLog::initialize_schema() { ); )", "create audit_anchors table"); + + // Append-only enforcement at the DB layer (defense-in-depth, NOT a security + // boundary: anyone able to write the file can also DROP these triggers). The + // single-row guard lets the in-process rotation prune delete a sealed prefix + // while every other UPDATE/DELETE on audit_log is rejected. + exec_or_throw(db_, + R"( + CREATE TABLE IF NOT EXISTS audit_prune_guard ( + id INTEGER PRIMARY KEY CHECK (id = 1), + enabled INTEGER NOT NULL DEFAULT 0 + ); + INSERT INTO audit_prune_guard (id, enabled) VALUES (1, 0) + ON CONFLICT(id) DO NOTHING; + CREATE TRIGGER IF NOT EXISTS audit_log_no_update + BEFORE UPDATE ON audit_log + BEGIN + SELECT RAISE(ABORT, 'audit_log is append-only'); + END; + CREATE TRIGGER IF NOT EXISTS audit_log_no_delete + BEFORE DELETE ON audit_log + WHEN (SELECT enabled FROM audit_prune_guard WHERE id = 1) IS NOT 1 + BEGIN + SELECT RAISE(ABORT, 'audit_log is append-only'); + END; + )", + "create append-only triggers"); } -ChainHead FaultAuditLog::load_head_locked() const { - // Prefer the persisted head row. - { - Stmt stmt(db_, "SELECT seq, record_hash FROM audit_chain_head WHERE id = 1"); - if (stmt.step() == SQLITE_ROW) { - ChainHead head_record; - head_record.seq = stmt.column_int64(0); - head_record.record_hash = stmt.column_text(1); - return head_record; - } +std::optional FaultAuditLog::read_head_row_locked() const { + Stmt stmt(db_, "SELECT seq, record_hash FROM audit_chain_head WHERE id = 1"); + if (stmt.step() == SQLITE_ROW) { + ChainHead head_record; + head_record.seq = stmt.column_int64(0); + head_record.record_hash = stmt.column_text(1); + return head_record; } + return std::nullopt; +} - // No head row. Recover from the last retained record if any exist (defensive: - // a crash between INSERT and head update would land here on reopen). - { - Stmt stmt(db_, "SELECT seq, record_hash FROM audit_log ORDER BY seq DESC LIMIT 1"); - if (stmt.step() == SQLITE_ROW) { - ChainHead head_record; - head_record.seq = stmt.column_int64(0); - head_record.record_hash = stmt.column_text(1); - return head_record; - } +ChainHead FaultAuditLog::load_head_locked() const { + // Resume strictly from the persisted head row. There is deliberately no + // MAX(seq) fallback: append+head-update are written in one transaction, so a + // missing head row while rows exist means tampering, not a recoverable crash. + // verify() reports that case as a failure rather than silently fabricating a + // head from the surviving rows. + if (auto head_record = read_head_row_locked()) { + return *head_record; } - return ChainHead{0, genesis_hash()}; } @@ -346,11 +369,18 @@ void FaultAuditLog::rotate_if_needed_locked() { throw std::runtime_error(std::string("audit: failed to write anchor: ") + sqlite3_errmsg(db_)); } + // Open the prune guard so the BEFORE DELETE trigger allows this sealed-prefix + // delete. Both the guard flip and the delete are in this transaction, so a + // ROLLBACK restores the guard to its closed state. + exec_or_throw(db_, "UPDATE audit_prune_guard SET enabled = 1 WHERE id = 1", "open prune guard"); + Stmt del(db_, "DELETE FROM audit_log WHERE seq <= ?"); del.bind_int64(1, boundary_seq); if (del.step() != SQLITE_DONE) { throw std::runtime_error(std::string("audit: failed to prune records: ") + sqlite3_errmsg(db_)); } + + exec_or_throw(db_, "UPDATE audit_prune_guard SET enabled = 0 WHERE id = 1", "close prune guard"); exec_or_throw(db_, "COMMIT", "commit rotate"); } catch (...) { exec_or_throw(db_, "ROLLBACK", "rollback rotate"); @@ -492,27 +522,37 @@ AuditVerifyResult FaultAuditLog::verify() const { expected_seq = rec.seq + 1; } - // The persisted head must match the last retained record (catches deletion of - // the newest row, which the row walk alone cannot see). + // Read the persisted head row DIRECTLY from the DB (not the cached head_). A + // missing head row cannot be silently recovered from MAX(seq), so deleting the + // newest record together with the head row is reported as tampering instead of + // verifying clean. + const std::optional persisted = read_head_row_locked(); + if (result.checked > 0) { - if (head_.seq != expected_seq - 1 || head_.record_hash != expected_prev) { + // A non-empty log must carry its head row, and it must match the last record. + if (!persisted) { result.ok = false; - result.bad_seq = head_.seq; + result.bad_seq = expected_seq - 1; + result.error = "audit_chain_head row missing while audit_log is non-empty (head deleted / truncated)"; + return result; + } + if (persisted->seq != expected_seq - 1 || persisted->record_hash != expected_prev) { + result.ok = false; + result.bad_seq = persisted->seq; result.error = "persisted head does not match the last retained record"; return result; } - } else { - // Empty retained log: head must be either genesis (never written) or point - // at a sealed anchor (everything pruned). - if (head_.seq != 0) { - Stmt anchor(db_, "SELECT last_hash FROM audit_anchors WHERE last_seq = ?"); - anchor.bind_int64(1, head_.seq); - if (anchor.step() != SQLITE_ROW || anchor.column_text(0) != head_.record_hash) { - result.ok = false; - result.bad_seq = head_.seq; - result.error = "head references a record that is neither retained nor sealed"; - return result; - } + } else if (persisted && persisted->seq != 0) { + // Empty retained log with a head past genesis: everything was pruned, so the + // head must point at a sealed anchor. (No head row, or a genesis head, on an + // empty log is the never-written case and is fine.) + Stmt anchor(db_, "SELECT last_hash FROM audit_anchors WHERE last_seq = ?"); + anchor.bind_int64(1, persisted->seq); + if (anchor.step() != SQLITE_ROW || anchor.column_text(0) != persisted->record_hash) { + result.ok = false; + result.bad_seq = persisted->seq; + result.error = "head references a record that is neither retained nor sealed"; + return result; } } diff --git a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp index b1665fbd..1b27cad0 100644 --- a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp @@ -327,10 +327,20 @@ FaultManagerNode::FaultManagerNode(const rclcpp::NodeOptions & options) : Node(" // Create auto-confirmation timer if enabled if (auto_confirm_after_sec_ > 0.0) { auto_confirm_timer_ = create_wall_timer(std::chrono::seconds(1), [this]() { - size_t confirmed = storage_->check_time_based_confirmation(get_wall_clock_time()); - if (confirmed > 0) { - RCLCPP_INFO(get_logger(), "Auto-confirmed %zu PREFAILED fault(s) due to time threshold", confirmed); + const auto confirmed = storage_->check_time_based_confirmation(get_wall_clock_time()); + if (confirmed.empty()) { + return; } + // Audit every timer-driven PREFAILED->CONFIRMED transition. Without this the + // confirmations are invisible to the audit log's verify(). + const int64_t confirmed_at_ns = get_wall_clock_time().nanoseconds(); + for (const auto & fault_code : confirmed) { + auto fault = storage_->get_fault(fault_code); + if (fault) { + audit_transition(kTransitionConfirmed, *fault, "auto_confirm_timer", confirmed_at_ns); + } + } + RCLCPP_INFO(get_logger(), "Auto-confirmed %zu PREFAILED fault(s) due to time threshold", confirmed.size()); }); RCLCPP_INFO(get_logger(), "FaultManager node started (storage=%s, confirmation_threshold=%d, " diff --git a/src/ros2_medkit_fault_manager/src/fault_storage.cpp b/src/ros2_medkit_fault_manager/src/fault_storage.cpp index 6f61b00d..14b5e41e 100644 --- a/src/ros2_medkit_fault_manager/src/fault_storage.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_storage.cpp @@ -305,14 +305,14 @@ bool InMemoryFaultStorage::contains(const std::string & fault_code) const { return faults_.find(fault_code) != faults_.end(); } -size_t InMemoryFaultStorage::check_time_based_confirmation(const rclcpp::Time & current_time) { +std::vector InMemoryFaultStorage::check_time_based_confirmation(const rclcpp::Time & current_time) { std::lock_guard lock(mutex_); + std::vector confirmed; if (config_.auto_confirm_after_sec <= 0.0) { - return 0; // Time-based confirmation disabled + return confirmed; // Time-based confirmation disabled } - size_t confirmed_count = 0; const double threshold_ns = config_.auto_confirm_after_sec * 1e9; for (auto & [code, state] : faults_) { @@ -320,12 +320,12 @@ size_t InMemoryFaultStorage::check_time_based_confirmation(const rclcpp::Time & const int64_t age_ns = (current_time - state.last_failed_time).nanoseconds(); if (static_cast(age_ns) >= threshold_ns) { state.status = ros2_medkit_msgs::msg::Fault::STATUS_CONFIRMED; - ++confirmed_count; + confirmed.push_back(code); } } } - return confirmed_count; + return confirmed; } void InMemoryFaultStorage::set_max_snapshots_per_fault(size_t max_count) { diff --git a/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp b/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp index 8cc75695..bd9d6d82 100644 --- a/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp +++ b/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp @@ -670,17 +670,35 @@ bool SqliteFaultStorage::contains(const std::string & fault_code) const { return stmt.step() == SQLITE_ROW; } -size_t SqliteFaultStorage::check_time_based_confirmation(const rclcpp::Time & current_time) { +std::vector SqliteFaultStorage::check_time_based_confirmation(const rclcpp::Time & current_time) { std::lock_guard lock(mutex_); + std::vector confirmed; if (config_.auto_confirm_after_sec <= 0.0) { - return 0; // Time-based confirmation disabled + return confirmed; // Time-based confirmation disabled } int64_t current_ns = current_time.nanoseconds(); int64_t threshold_ns = static_cast(config_.auto_confirm_after_sec * 1e9); int64_t cutoff_ns = current_ns - threshold_ns; + // Collect the codes that will flip first so the caller can audit each one. The + // SELECT predicate mirrors the UPDATE exactly, and both run under the same lock, + // so the returned list matches the rows actually confirmed below. + { + SqliteStatement select_stmt( + db_, "SELECT fault_code FROM faults WHERE status = ? AND last_failed_ns <= ? AND last_failed_ns > 0"); + select_stmt.bind_text(1, ros2_medkit_msgs::msg::Fault::STATUS_PREFAILED); + select_stmt.bind_int64(2, cutoff_ns); + while (select_stmt.step() == SQLITE_ROW) { + confirmed.push_back(select_stmt.column_text(0)); + } + } + + if (confirmed.empty()) { + return confirmed; + } + SqliteStatement update_stmt( db_, "UPDATE faults SET status = ? WHERE status = ? AND last_failed_ns <= ? AND last_failed_ns > 0"); update_stmt.bind_text(1, ros2_medkit_msgs::msg::Fault::STATUS_CONFIRMED); @@ -691,7 +709,7 @@ size_t SqliteFaultStorage::check_time_based_confirmation(const rclcpp::Time & cu throw std::runtime_error(std::string("Failed to confirm faults: ") + sqlite3_errmsg(db_)); } - return static_cast(sqlite3_changes(db_)); + return confirmed; } void SqliteFaultStorage::set_max_snapshots_per_fault(size_t max_count) { diff --git a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp index b0ac54b9..6a959b8e 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp @@ -42,8 +42,8 @@ AuditEvent make_event(const std::string & code, const char * transition, int64_t return e; } -/// Run a single SQL statement directly against the audit DB file (used to -/// simulate tampering an immutable row). +/// Run SQL directly against the audit DB file (used to simulate tampering an +/// immutable row). Asserts success. void raw_exec(const std::string & db_path, const std::string & sql) { sqlite3 * db = nullptr; ASSERT_EQ(sqlite3_open(db_path.c_str(), &db), SQLITE_OK); @@ -55,6 +55,22 @@ void raw_exec(const std::string & db_path, const std::string & sql) { ASSERT_EQ(rc, SQLITE_OK) << err_str; } +/// Run SQL directly and return the raw result code (does not assert), so a test +/// can confirm the append-only triggers reject a write. +int raw_exec_rc(const std::string & db_path, const std::string & sql) { + sqlite3 * db = nullptr; + EXPECT_EQ(sqlite3_open(db_path.c_str(), &db), SQLITE_OK); + int rc = sqlite3_exec(db, sql.c_str(), nullptr, nullptr, nullptr); + sqlite3_close(db); + return rc; +} + +/// Drop the DB-level append-only triggers so a raw tamper write can land. This +/// mimics an attacker who bypassed the (defense-in-depth, not security boundary) +/// triggers; verify() must still detect the recompute-free edit afterwards. +const char * const kDropTriggers = + "DROP TRIGGER IF EXISTS audit_log_no_update; DROP TRIGGER IF EXISTS audit_log_no_delete; "; + } // namespace class FaultAuditLogTest : public ::testing::Test { @@ -130,7 +146,7 @@ TEST_F(FaultAuditLogTest, EditingPastRowFailsVerify) { } // Tamper: change a stored field without recomputing the hash. - raw_exec(path_, "UPDATE audit_log SET description = 'forged' WHERE seq = 2"); + raw_exec(path_, std::string(kDropTriggers) + "UPDATE audit_log SET description = 'forged' WHERE seq = 2"); FaultAuditLog reopened(path_); auto result = reopened.verify(); @@ -147,7 +163,7 @@ TEST_F(FaultAuditLogTest, DeletingMiddleRowFailsVerify) { log.append(make_event("F3", kTransitionOccurred, 300)); } - raw_exec(path_, "DELETE FROM audit_log WHERE seq = 2"); + raw_exec(path_, std::string(kDropTriggers) + "DELETE FROM audit_log WHERE seq = 2"); FaultAuditLog reopened(path_); EXPECT_FALSE(reopened.verify().ok); @@ -163,12 +179,34 @@ TEST_F(FaultAuditLogTest, DeletingNewestRowFailsVerify) { } // Drop the last row but leave the head pointing at seq 3. - raw_exec(path_, "DELETE FROM audit_log WHERE seq = 3"); + raw_exec(path_, std::string(kDropTriggers) + "DELETE FROM audit_log WHERE seq = 3"); FaultAuditLog reopened(path_); EXPECT_FALSE(reopened.verify().ok); } +// Truncation bypass: deleting the newest row AND the head row must still FAIL. +// The head row is read directly from the DB, so a missing head on a non-empty +// log is treated as tampering rather than silently rebuilt from MAX(seq). +TEST_F(FaultAuditLogTest, DeletingNewestRowAndHeadFailsVerify) { + { + FaultAuditLog log(path_); + log.append(make_event("F1", kTransitionOccurred, 100)); + log.append(make_event("F2", kTransitionOccurred, 200)); + log.append(make_event("F3", kTransitionOccurred, 300)); + ASSERT_TRUE(log.verify().ok); + } + + // Drop the newest row and the persisted head row together. + raw_exec(path_, std::string(kDropTriggers) + + "DELETE FROM audit_log WHERE seq = 3; DELETE FROM audit_chain_head WHERE id = 1;"); + + FaultAuditLog reopened(path_); + auto result = reopened.verify(); + EXPECT_FALSE(result.ok); + EXPECT_EQ(result.bad_seq, 2); // last surviving row +} + // The chain head persists across a reopen and the chain resumes from it. TEST_F(FaultAuditLogTest, HeadPersistsAcrossReopen) { std::string head_hash; @@ -221,7 +259,7 @@ TEST_F(FaultAuditLogTest, RotationThenTamperFails) { log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); } } - raw_exec(path_, "UPDATE audit_log SET source_id = 'forged' WHERE seq = 10"); + raw_exec(path_, std::string(kDropTriggers) + "UPDATE audit_log SET source_id = 'forged' WHERE seq = 10"); FaultAuditLog reopened(path_, 5); auto result = reopened.verify(); @@ -263,6 +301,35 @@ TEST_F(FaultAuditLogTest, ReadAfterSeq) { EXPECT_EQ(tail[1].seq, 5); } +// Defense-in-depth: the append-only triggers reject an out-of-band UPDATE or +// DELETE on audit_log while the in-process rotation prune still works. +TEST_F(FaultAuditLogTest, AppendOnlyTriggersRejectOutOfBandWrites) { + { + FaultAuditLog log(path_); + log.append(make_event("F1", kTransitionOccurred, 100)); + log.append(make_event("F2", kTransitionOccurred, 200)); + } + + // Both a raw UPDATE and a raw DELETE are aborted by the triggers. + EXPECT_NE(raw_exec_rc(path_, "UPDATE audit_log SET description = 'forged' WHERE seq = 1"), SQLITE_OK); + EXPECT_NE(raw_exec_rc(path_, "DELETE FROM audit_log WHERE seq = 1"), SQLITE_OK); + + // Rows are intact and the chain still verifies. + FaultAuditLog reopened(path_); + EXPECT_EQ(reopened.record_count(), 2); + EXPECT_TRUE(reopened.verify().ok); +} + +// The guarded rotation prune is exempt from the append-only delete trigger. +TEST_F(FaultAuditLogTest, RotationPrunePassesAppendOnlyTrigger) { + FaultAuditLog log(path_, /*retention_max_records=*/3); + for (int i = 1; i <= 8; ++i) { + log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); + } + EXPECT_EQ(log.record_count(), 3); + EXPECT_TRUE(log.verify().ok); +} + int main(int argc, char ** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp index 27ada404..589cffd9 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp @@ -532,9 +532,9 @@ TEST_F(FaultStorageTest, TimeBasedConfirmationDisabledByDefault) { // Advance time and check - should not auto-confirm (auto_confirm_after_sec = 0) auto future_time = rclcpp::Time(clock.now().nanoseconds() + static_cast(20e9)); - size_t confirmed = storage_.check_time_based_confirmation(future_time); + auto confirmed = storage_.check_time_based_confirmation(future_time); - EXPECT_EQ(confirmed, 0u); + EXPECT_TRUE(confirmed.empty()); auto fault = storage_.get_fault("FAULT_1"); ASSERT_TRUE(fault.has_value()); @@ -554,13 +554,14 @@ TEST_F(FaultStorageTest, TimeBasedConfirmationWhenEnabled) { // Check before timeout - should not confirm auto before_timeout = rclcpp::Time(now.nanoseconds() + static_cast(5e9)); - size_t confirmed_early = storage_.check_time_based_confirmation(before_timeout); - EXPECT_EQ(confirmed_early, 0u); + auto confirmed_early = storage_.check_time_based_confirmation(before_timeout); + EXPECT_TRUE(confirmed_early.empty()); // Check after timeout - should confirm auto after_timeout = rclcpp::Time(now.nanoseconds() + static_cast(15e9)); - size_t confirmed = storage_.check_time_based_confirmation(after_timeout); - EXPECT_EQ(confirmed, 1u); + auto confirmed = storage_.check_time_based_confirmation(after_timeout); + ASSERT_EQ(confirmed.size(), 1u); + EXPECT_EQ(confirmed[0], "FAULT_1"); auto fault = storage_.get_fault("FAULT_1"); ASSERT_TRUE(fault.has_value()); @@ -1634,6 +1635,56 @@ TEST(FaultAuditDisabledTest, NoAuditFileWhenDisabled) { EXPECT_FALSE(std::filesystem::exists(audit_path)); } +// Timer-driven (PREFAILED->CONFIRMED) auto-confirmations must be audited, not +// silently applied. Sets auto_confirm_after_sec and asserts a "confirmed" audit +// row appears after the 1 Hz timer fires. +TEST(FaultAuditTimerTest, TimerConfirmationAppendsConfirmedAuditRow) { + rclcpp::NodeOptions options; + options.parameter_overrides({ + {"storage_type", "memory"}, + {"confirmation_threshold", -3}, // keep the fault PREFAILED so only the timer confirms it + {"auto_confirm_after_sec", 0.2}, + {"audit_log.enabled", true}, // in-memory audit DB (memory storage) + }); + auto node = std::make_shared(options); + + const auto * audit = node->get_audit_log_for_test(); + ASSERT_NE(audit, nullptr); + + // Land a fault in PREFAILED directly in storage; the node's auto-confirm timer + // must later flip it to CONFIRMED and append the audit row. + DebounceConfig config; + config.confirmation_threshold = -3; + config.auto_confirm_after_sec = 0.2; + rclcpp::Clock clock(RCL_SYSTEM_TIME); + node->get_storage_for_test().report_fault_event("AUTO_CONF_1", ReportFault::Request::EVENT_FAILED, + Fault::SEVERITY_ERROR, "stuck", "/robot/src", clock.now(), config); + ASSERT_EQ(node->get_storage().get_fault("AUTO_CONF_1")->status, Fault::STATUS_PREFAILED); + + // Spin until a confirmed audit row appears or the budget expires (the wall + // timer fires once per second). + bool saw_confirmed = false; + auto start = std::chrono::steady_clock::now(); + while (std::chrono::steady_clock::now() - start < std::chrono::seconds(5)) { + rclcpp::spin_some(node); + for (const auto & rec : audit->read()) { + if (rec.event.fault_code == "AUTO_CONF_1" && + rec.event.transition == ros2_medkit_fault_manager::kTransitionConfirmed) { + saw_confirmed = true; + break; + } + } + if (saw_confirmed) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + + EXPECT_TRUE(saw_confirmed) << "timer-driven confirmation was not audited"; + EXPECT_EQ(node->get_storage().get_fault("AUTO_CONF_1")->status, Fault::STATUS_CONFIRMED); + EXPECT_TRUE(audit->verify().ok); +} + int main(int argc, char ** argv) { rclcpp::init(argc, argv); ::testing::InitGoogleTest(&argc, argv); diff --git a/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp b/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp index d0c5c3c0..bd588013 100644 --- a/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp +++ b/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp @@ -725,13 +725,14 @@ TEST_F(SqliteFaultStorageTest, TimeBasedConfirmationWhenEnabled) { // Check before timeout - should not confirm auto before_timeout = rclcpp::Time(now.nanoseconds() + static_cast(5e9)); - size_t confirmed_early = storage_->check_time_based_confirmation(before_timeout); - EXPECT_EQ(confirmed_early, 0u); + auto confirmed_early = storage_->check_time_based_confirmation(before_timeout); + EXPECT_TRUE(confirmed_early.empty()); // Check after timeout - should confirm auto after_timeout = rclcpp::Time(now.nanoseconds() + static_cast(15e9)); - size_t confirmed = storage_->check_time_based_confirmation(after_timeout); - EXPECT_EQ(confirmed, 1u); + auto confirmed = storage_->check_time_based_confirmation(after_timeout); + ASSERT_EQ(confirmed.size(), 1u); + EXPECT_EQ(confirmed[0], "FAULT_1"); auto fault = storage_->get_fault("FAULT_1"); ASSERT_TRUE(fault.has_value()); From 3828901693e297fec2156969cc62b5a35e1a46d1 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Tue, 30 Jun 2026 19:43:52 +0200 Subject: [PATCH 4/9] fix(fault-audit): guard null DB handle and use PRId64 for int64 logs raw_exec_rc no longer runs sqlite3_exec on a failed sqlite3_open; it records the failure, closes the handle and returns early. The audit-log enabled RCLCPP_INFO now formats int64_t retention/seq with PRId64 () to satisfy -Werror=format=2. Refs #483 --- src/ros2_medkit_fault_manager/src/fault_manager_node.cpp | 4 +++- src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp index 1b27cad0..28ef88e3 100644 --- a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -448,7 +449,8 @@ std::unique_ptr FaultManagerNode::create_audit_log() { try { auto log = std::make_unique(audit_path, retention); - RCLCPP_INFO(get_logger(), "Fault audit log enabled: %s (transitions=%s, retention=%ld, resume_seq=%ld)", + RCLCPP_INFO(get_logger(), + "Fault audit log enabled: %s (transitions=%s, retention=%" PRId64 ", resume_seq=%" PRId64 ")", audit_path.c_str(), audit_confirmed_only_ ? "confirmed_only" : "all", retention, log->head().seq); return log; } catch (const std::exception & e) { diff --git a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp index 6a959b8e..581ba006 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp @@ -59,7 +59,11 @@ void raw_exec(const std::string & db_path, const std::string & sql) { /// can confirm the append-only triggers reject a write. int raw_exec_rc(const std::string & db_path, const std::string & sql) { sqlite3 * db = nullptr; - EXPECT_EQ(sqlite3_open(db_path.c_str(), &db), SQLITE_OK); + if (sqlite3_open(db_path.c_str(), &db) != SQLITE_OK) { + ADD_FAILURE() << "sqlite3_open failed: " << (db ? sqlite3_errmsg(db) : "out of memory"); + sqlite3_close(db); // sqlite3 allows close on a failed-open handle (incl. nullptr). + return SQLITE_ERROR; + } int rc = sqlite3_exec(db, sql.c_str(), nullptr, nullptr, nullptr); sqlite3_close(db); return rc; From f9bbdc23dee404d57af1113851d5d478cfd701a0 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Tue, 30 Jun 2026 22:33:57 +0200 Subject: [PATCH 5/9] fix(fault-audit): audit auto-heal and stop silent append-failure gaps Record auto-recovery as a distinct "healed" row (source auto_heal) so a fault's end is in the chain. Append failures now bump a dropped-writes health counter and clear an audit-healthy flag; audit_log.fail_closed makes them a hard error. Drop the never-written "ack" kind (clear == ack), protect audit_prune_guard with a trigger, add logging activate/deactivate markers, and tighten the threat-model README. Refs #483 --- src/ros2_medkit_fault_manager/README.md | 9 +- .../config/fault_manager.yaml | 19 +- .../fault_audit_log.hpp | 16 +- .../fault_manager_node.hpp | 27 ++- .../src/fault_audit_log.cpp | 36 ++++ .../src/fault_manager_node.cpp | 72 +++++++- .../test/test_fault_audit_log.cpp | 39 +++++ .../test/test_fault_manager.cpp | 165 +++++++++++++++++- 8 files changed, 358 insertions(+), 25 deletions(-) diff --git a/src/ros2_medkit_fault_manager/README.md b/src/ros2_medkit_fault_manager/README.md index 3e827aea..b03a22cc 100644 --- a/src/ros2_medkit_fault_manager/README.md +++ b/src/ros2_medkit_fault_manager/README.md @@ -112,22 +112,25 @@ patterns: ## Advanced: Tamper-Evident Audit Log -An optional append-only, hash-chained audit log records every fault state transition (`occurred`, `confirmed`, `cleared`) so the fault history is independently verifiable. It is **off by default** because it adds a write and storage cost per transition. +An optional append-only, hash-chained audit log records every fault state transition (`occurred`, `confirmed`, `healed`, `cleared`) so the fault history is independently verifiable. Auto-recovery (a fault reaching the healing threshold via PASSED events) is recorded as a distinct `healed` row with source `auto_heal`, so the fault's END is in the timeline and is not confused with a manual `cleared`. The manager has no acknowledge action separate from clearing, so `~/clear_fault` is recorded as `cleared` (clear == ack); there is no `ack` kind. The log also records its own lifecycle with `logging_activated` / `logging_deactivated` markers (CIR (EU) 2024/2690 sec. 3.2) at start and stop. It is **off by default** because it adds a write and storage cost per transition. Each transition appends one immutable row holding `record_hash = sha256(prev_hash + canonical(event))` (OpenSSL EVP SHA-256), the `prev_hash` it links to, and a monotonic `seq`. The hash is computed once at insert and never recomputed. A persisted chain head lets the chain resume across restarts. The log is stored in its own SQLite database (separate from the fault store) and is treated as append-only: the manager only ever inserts rows, and `BEFORE UPDATE` / `BEFORE DELETE` triggers reject out-of-band edits (the guarded rotation prune excepted). +**Completeness is an integrity property.** `verify()` proves nothing was *deleted* from the chain, but it cannot prove a transition that was *never appended*. So a silently dropped append is a hole `verify()` can never see. Every transition on the write path is therefore audited (occurred, timer/threshold confirmations, auto-heal, and clears), and an append failure is never swallowed silently: it increments a dropped-writes health counter and clears an "audit healthy" flag. With `audit_log.fail_closed` set, an append failure is a **hard error** that aborts the operation, so a compliance-strict deployment learns the audit broke instead of losing records unnoticed. The default (`fail_closed=false`) keeps fault processing running but still surfaces the gap via the health counter. + `verify()` walks the persisted chain oldest-first and recomputes every link: editing a row breaks its `record_hash`, deleting a row breaks the next row's `prev_hash` linkage, and deleting the newest row (the head row is read straight from the DB) is caught by the persisted-head check. -**Threat model (read this).** The chain is **unkeyed**, and the head and segment anchors live in the **same writable SQLite file** as the rows. `verify()` therefore catches edits or deletions that did **not** also recompute the chain - that is, casual or accidental tampering, and the bookkeeping bugs that would otherwise lose records. It does **not** stop an attacker with write access to the file: such an attacker can drop the triggers and recompute the entire chain (and head and anchors) to forge a self-consistent history. The append-only triggers are defense-in-depth, **not** a security boundary. True tamper-*proofing* requires a key or signature over the head (so it cannot be recomputed without the key) or external anchoring of the head hash to an append-only store you do not control; both are out of scope here and belong to the audit-log exporter / signing follow-up. +**Threat model (read this).** The chain is **unkeyed**, and the head and segment anchors live in the **same writable SQLite file** as the rows. `verify()` therefore catches edits or deletions that did **not** also recompute the chain - that is, casual or accidental tampering, and the bookkeeping bugs that would otherwise lose records. The append-only triggers are defense-in-depth: `audit_log` rejects out-of-band UPDATE/DELETE, and the rotation-prune guard (`audit_prune_guard`) is itself protected by a trigger so an external writer cannot simply flip it open and then delete a prefix - that flip is only permitted from the in-process connection that holds a per-connection temp marker. The single-row chain head (`audit_chain_head`) is intentionally **not** trigger-protected (a trigger there would block the legitimate head update inside the append transaction); a casual edit or delete of the head is instead caught by `verify()` via the seq/hash/head-mismatch checks. None of this stops an attacker with write access to the file: such an attacker can create the same temp marker or drop the triggers, and recompute the entire chain (head and anchors included) to forge a self-consistent history. The triggers are **not** a security boundary - this is tamper-**evident**, not tamper-**proof**. True tamper-*proofing* requires a key or signature over the head (so it cannot be recomputed without the key) or external anchoring of the head hash to an append-only store you do not control; both are out of scope here and belong to the audit-log exporter / signing follow-up. **Retention/rotation**: when more than `audit_log.retention_max_records` rows are retained, the oldest segment is *sealed* (its final `seq` + hash are persisted as an anchor) and then pruned. The surviving tail still verifies because the oldest retained row links back to the sealed anchor. | Parameter | Type | Default | Description | |-----------|------|---------|-------------| | `audit_log.enabled` | bool | `false` | Enable the tamper-evident audit log | -| `audit_log.transitions` | string | `"all"` | Which transitions to record: `"all"` or `"confirmed_only"` | +| `audit_log.transitions` | string | `"all"` | Which transitions to record: `"all"` (occurred/confirmed/healed/cleared) or `"confirmed_only"`. Lifecycle markers are always recorded. | | `audit_log.database_path` | string | `""` | SQLite path. Empty => sibling `fault_audit.db` next to the fault DB (or `:memory:` for in-memory fault stores) | | `audit_log.retention_max_records` | int | `0` | Seal + prune the oldest segment beyond this many retained records (0 = unlimited) | +| `audit_log.fail_closed` | bool | `false` | When `true`, an audit append failure is a hard error that aborts the operation (compliance-strict). When `false`, the failure is logged and counted but fault processing continues. Either way the gap is visible via the dropped-writes health counter. | ## Usage diff --git a/src/ros2_medkit_fault_manager/config/fault_manager.yaml b/src/ros2_medkit_fault_manager/config/fault_manager.yaml index 13d1f57a..fde100d2 100644 --- a/src/ros2_medkit_fault_manager/config/fault_manager.yaml +++ b/src/ros2_medkit_fault_manager/config/fault_manager.yaml @@ -29,13 +29,13 @@ fault_manager: # snapshots.capture_queue_full_policy: reject_newest # reject_newest | drop_oldest # Append-only, hash-chained audit log of fault state transitions - # (occurred/confirmed/cleared). OFF by default: it adds a write + storage cost - # per transition. When enabled, each transition appends one immutable, - # hash-chained row, so verify() detects edits or deletions that did NOT also - # recompute the chain (casual/accidental tampering). The chain is unkeyed and - # lives in a single writable file, so it is NOT proof against an attacker who - # can rewrite the whole file; true tamper-proofing needs a signed head or - # external anchoring (out of scope here). See README "Threat model". + # (occurred/confirmed/healed/cleared). OFF by default: it adds a write + + # storage cost per transition. When enabled, each transition appends one + # immutable, hash-chained row, so verify() detects edits or deletions that did + # NOT also recompute the chain (casual/accidental tampering). The chain is + # unkeyed and lives in a single writable file, so it is NOT proof against an + # attacker who can rewrite the whole file; true tamper-proofing needs a signed + # head or external anchoring (out of scope here). See README "Threat model". audit_log.enabled: false # Which transitions to record: "all" or "confirmed_only". # audit_log.transitions: all @@ -45,3 +45,8 @@ fault_manager: # Seal + prune the oldest segment beyond this many retained records # (0 = unlimited). A sealed anchor keeps the surviving tail verifiable. # audit_log.retention_max_records: 0 + # Fail-closed (compliance-strict): when true, an audit append failure is a hard + # error that aborts the operation instead of being logged and dropped. Default + # false keeps fault processing running; either way a dropped-writes health + # counter makes the gap visible. + # audit_log.fail_closed: false diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp index 457bca9e..3a954159 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp @@ -32,7 +32,7 @@ namespace ros2_medkit_fault_manager { /// to a stored row that does not also recompute the chain breaks verify(). struct AuditEvent { std::string fault_code; - std::string transition; ///< occurred | confirmed | cleared | ack + std::string transition; ///< one of the kTransition* constants below uint8_t severity{0}; ///< severity at the time of the transition std::string status; ///< resulting fault status (e.g. CONFIRMED) std::string source_id; ///< reporting source that drove the transition @@ -44,7 +44,19 @@ struct AuditEvent { constexpr const char * kTransitionOccurred = "occurred"; constexpr const char * kTransitionConfirmed = "confirmed"; constexpr const char * kTransitionCleared = "cleared"; -constexpr const char * kTransitionAck = "ack"; +/// Auto-recovery: a fault reached the healing threshold via PASSED events. Kept +/// distinct from kTransitionCleared so an automatic recovery is not mistaken for +/// a manual clear in the timeline. +constexpr const char * kTransitionHealed = "healed"; +/// Audit-log lifecycle markers (CIR (EU) 2024/2690 sec. 3.2: activation / +/// deactivation of logging). Appended directly, independent of the per-fault +/// transition filter, so the log records its own start and stop. +constexpr const char * kTransitionLoggingActivated = "logging_activated"; +constexpr const char * kTransitionLoggingDeactivated = "logging_deactivated"; +// NOTE: there is deliberately no "ack" kind. The open fault_manager has no +// acknowledge action separate from clearing: ~/clear_fault IS the acknowledge, +// and it is recorded as kTransitionCleared (clear == ack). A separate "ack" kind +// would never be written, so defining it would only mislead readers of the log. /// One immutable, hash-chained row read back from the audit log. struct AuditRecord { diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp index 61cf227c..a2226537 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp @@ -14,7 +14,9 @@ #pragma once +#include #include +#include #include #include #include @@ -72,6 +74,26 @@ class FaultManagerNode : public rclcpp::Node { return audit_log_.get(); } + /// Health signal: number of audit transitions that failed to append (and were + /// therefore lost from the chain). 0 when the audit is healthy or disabled. + uint64_t audit_dropped_writes() const { + return audit_dropped_writes_.load(std::memory_order_relaxed); + } + + /// Health signal: false once any audit append has failed. A compliance-strict + /// deployment should treat a false here (or a non-zero audit_dropped_writes()) + /// as the audit chain being incomplete. + bool audit_healthy() const { + return audit_healthy_.load(std::memory_order_relaxed); + } + + /// Test-only: drive one audit append through the same failure-handling path the + /// service handlers use. Throws when audit_log.fail_closed is set and the append + /// fails; otherwise observe audit_dropped_writes()/audit_healthy(). + void audit_transition_for_test(const char * transition, const ros2_medkit_msgs::msg::Fault & fault) { + audit_transition(transition, fault, "test", 0); + } + /// Get the storage type being used const std::string & get_storage_type() const { return storage_type_; @@ -187,7 +209,10 @@ class FaultManagerNode : public rclcpp::Node { /// Tamper-evident audit log of fault transitions (nullptr when disabled). std::unique_ptr audit_log_; - bool audit_confirmed_only_{false}; ///< When true, only "confirmed" transitions are logged + bool audit_confirmed_only_{false}; ///< When true, only "confirmed" transitions are logged + bool audit_fail_closed_{false}; ///< When true, an audit append failure aborts the operation + std::atomic audit_dropped_writes_{0}; ///< Count of audit appends that failed (lost rows) + std::atomic audit_healthy_{true}; ///< Cleared on the first failed audit append rclcpp::Service::SharedPtr report_fault_srv_; rclcpp::Service::SharedPtr list_faults_srv_; diff --git a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp index 88710a53..78b47c81 100644 --- a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp @@ -91,6 +91,15 @@ void exec_or_throw(sqlite3 * db, const char * sql, const char * what) { } } +/// SQL function registered ONLY on this in-process connection. The prune-guard +/// protection trigger calls it, so an out-of-band connection (which never +/// registered it) cannot flip the guard: preparing the UPDATE fails on the +/// unknown function. Connection-scoped, hence not a security boundary - a +/// write-capable adversary can register the same function or drop the trigger. +void audit_prune_authorized(sqlite3_context * ctx, int /*argc*/, sqlite3_value ** /*argv*/) { + sqlite3_result_int(ctx, 1); +} + } // namespace FaultAuditLog::FaultAuditLog(const std::string & db_path, int64_t retention_max_records) @@ -108,6 +117,15 @@ FaultAuditLog::FaultAuditLog(const std::string & db_path, int64_t retention_max_ exec_or_throw(db_, "PRAGMA journal_mode=WAL;", "enable WAL"); sqlite3_busy_timeout(db_, 5000); + // Authorize this connection for the prune-guard protection trigger (see below). + if (sqlite3_create_function_v2(db_, "audit_prune_authorized", 0, SQLITE_UTF8, nullptr, &audit_prune_authorized, + nullptr, nullptr, nullptr) != SQLITE_OK) { + std::string error = sqlite3_errmsg(db_); + sqlite3_close(db_); + db_ = nullptr; + throw std::runtime_error("audit: failed to register prune-authorization function: " + error); + } + initialize_schema(); head_ = load_head_locked(); } @@ -242,6 +260,24 @@ void FaultAuditLog::initialize_schema() { END; )", "create append-only triggers"); + + // Protect the prune guard itself. Without this, an external writer could simply + // `UPDATE audit_prune_guard SET enabled = 1` and then DELETE rows past the + // append-only delete trigger. The guard may only be flipped by a connection on + // which audit_prune_authorized() is registered (this in-process connection); an + // out-of-band UPDATE fails because that function is unknown to it. This is still + // defense-in-depth, not a security boundary: a write-capable adversary can + // register the same function or DROP this trigger. verify() remains the backstop. + exec_or_throw(db_, + R"( + CREATE TRIGGER IF NOT EXISTS audit_prune_guard_protect + BEFORE UPDATE ON audit_prune_guard + WHEN audit_prune_authorized() IS NOT 1 + BEGIN + SELECT RAISE(ABORT, 'audit_prune_guard is protected (in-process prune only)'); + END; + )", + "create prune-guard protection trigger"); } std::optional FaultAuditLog::read_head_row_locked() const { diff --git a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp index 28ef88e3..a4579a4d 100644 --- a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp @@ -365,6 +365,24 @@ FaultManagerNode::~FaultManagerNode() { if (rosbag_capture_) { rosbag_capture_->stop(); } + + // Close the chain with a "logging deactivated" marker (CIR (EU) 2024/2690 + // sec. 3.2). Best-effort and appended directly (recorded even in confirmed_only + // mode); a failure here must not throw out of the destructor. + if (audit_log_) { + try { + AuditEvent marker; + marker.fault_code = "__audit__"; + marker.transition = kTransitionLoggingDeactivated; + marker.status = "INACTIVE"; + marker.source_id = "fault_manager"; + marker.description = "audit logging deactivated"; + marker.occurred_at_ns = get_wall_clock_time().nanoseconds(); + audit_log_->append(marker); + } catch (const std::exception & e) { + RCLCPP_WARN(get_logger(), "Failed to append audit 'logging_deactivated' marker: %s", e.what()); + } + } } std::unique_ptr FaultManagerNode::create_storage() { @@ -418,6 +436,11 @@ std::unique_ptr FaultManagerNode::create_audit_log() { retention = 0; } + // Fail-closed: when set, an append failure is a hard error (the operation + // aborts and the audit is marked unhealthy) instead of being logged and + // silently dropped. Off by default so the audit never blocks fault processing. + audit_fail_closed_ = declare_parameter("audit_log.fail_closed", false); + // Path: explicit override, else a sibling of the fault DB, else :memory:. std::string audit_path = declare_parameter("audit_log.database_path", ""); @@ -450,8 +473,26 @@ std::unique_ptr FaultManagerNode::create_audit_log() { try { auto log = std::make_unique(audit_path, retention); RCLCPP_INFO(get_logger(), - "Fault audit log enabled: %s (transitions=%s, retention=%" PRId64 ", resume_seq=%" PRId64 ")", - audit_path.c_str(), audit_confirmed_only_ ? "confirmed_only" : "all", retention, log->head().seq); + "Fault audit log enabled: %s (transitions=%s, retention=%" PRId64 + ", fail_closed=%s, resume_seq=%" PRId64 ")", + audit_path.c_str(), audit_confirmed_only_ ? "confirmed_only" : "all", retention, + audit_fail_closed_ ? "true" : "false", log->head().seq); + + // Record a "logging activated" marker so the chain documents its own start + // (CIR (EU) 2024/2690 sec. 3.2). Appended directly, so it is recorded even in + // confirmed_only mode. Best-effort: a failure here does not abort startup. + AuditEvent marker; + marker.fault_code = "__audit__"; + marker.transition = kTransitionLoggingActivated; + marker.status = "ACTIVE"; + marker.source_id = "fault_manager"; + marker.description = "audit logging activated"; + marker.occurred_at_ns = get_wall_clock_time().nanoseconds(); + try { + log->append(marker); + } catch (const std::exception & e) { + RCLCPP_WARN(get_logger(), "Failed to append audit 'logging_activated' marker: %s", e.what()); + } return log; } catch (const std::exception & e) { RCLCPP_ERROR(get_logger(), "Failed to open fault audit log '%s': %s", audit_path.c_str(), e.what()); @@ -480,8 +521,22 @@ void FaultManagerNode::audit_transition(const char * transition, const ros2_medk try { audit_log_->append(event); } catch (const std::exception & e) { - RCLCPP_ERROR(get_logger(), "Failed to append audit record for '%s' (%s): %s", fault.fault_code.c_str(), transition, - e.what()); + // A dropped append is a hole in the chain that verify() can never see (it + // proves nothing was deleted, not that everything was written). Always make + // it loud and visible via the health counter so the gap is observable. + audit_dropped_writes_.fetch_add(1, std::memory_order_relaxed); + audit_healthy_.store(false, std::memory_order_relaxed); + const uint64_t dropped = audit_dropped_writes_.load(std::memory_order_relaxed); + RCLCPP_ERROR(get_logger(), "Failed to append audit record for '%s' (%s): %s [audit dropped_writes=%" PRIu64 "]", + fault.fault_code.c_str(), transition, e.what(), dropped); + if (audit_fail_closed_) { + // Compliance-strict: refuse to proceed as if nothing happened. Abort the + // operation so the broken audit cannot be silently outlived. + RCLCPP_FATAL(get_logger(), + "audit_log.fail_closed is set: aborting operation because the audit append failed for '%s' (%s)", + fault.fault_code.c_str(), transition); + throw; + } } } @@ -587,6 +642,12 @@ void FaultManagerNode::handle_report_fault( } // Note: PREFAILED/PREPASSED status changes don't emit events (debounce in progress) + // A PASSED event can drive an existing fault past the healing threshold to + // HEALED. That is the fault's END and must be audited (with a distinct kind + // and source) so the timeline is complete, not just occurred+confirmed. + const bool just_healed = !is_new && status_before != ros2_medkit_msgs::msg::Fault::STATUS_HEALED && + fault_after->status == ros2_medkit_msgs::msg::Fault::STATUS_HEALED; + // Append tamper-evident audit records for the transitions that just happened. // Recorded regardless of correlation muting: muting affects display, not the // fact that the state transition occurred. @@ -596,6 +657,9 @@ void FaultManagerNode::handle_report_fault( if (just_confirmed) { audit_transition(kTransitionConfirmed, *fault_after, request->source_id, event_time.nanoseconds()); } + if (just_healed) { + audit_transition(kTransitionHealed, *fault_after, "auto_heal", event_time.nanoseconds()); + } // Capture snapshots/rosbag when a fault confirms via the bounded pool (issue #441). // handle_report_fault runs on the single-threaded executor, so confirmations are diff --git a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp index 581ba006..d6e6ed06 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp @@ -26,6 +26,7 @@ using ros2_medkit_fault_manager::AuditEvent; using ros2_medkit_fault_manager::FaultAuditLog; using ros2_medkit_fault_manager::kTransitionCleared; using ros2_medkit_fault_manager::kTransitionConfirmed; +using ros2_medkit_fault_manager::kTransitionHealed; using ros2_medkit_fault_manager::kTransitionOccurred; namespace { @@ -334,6 +335,44 @@ TEST_F(FaultAuditLogTest, RotationPrunePassesAppendOnlyTrigger) { EXPECT_TRUE(log.verify().ok); } +// A "healed" auto-recovery row chains and verifies like any other transition, and +// stays distinct from a "cleared" so the timeline can tell them apart. +TEST_F(FaultAuditLogTest, HealedTransitionChainVerifies) { + FaultAuditLog log(path_); + log.append(make_event("F1", kTransitionOccurred, 100)); + log.append(make_event("F1", kTransitionConfirmed, 200)); + log.append(make_event("F1", kTransitionHealed, 300)); + + auto records = log.read(); + ASSERT_EQ(records.size(), 3u); + EXPECT_EQ(records[2].event.transition, kTransitionHealed); + EXPECT_NE(records[2].event.transition, std::string(kTransitionCleared)); + EXPECT_TRUE(log.verify().ok); +} + +// Defense-in-depth (item 4): the prune guard itself is protected. An out-of-band +// connection cannot flip audit_prune_guard open, so it cannot then delete a +// prefix past the append-only delete trigger. The in-process prune still works. +TEST_F(FaultAuditLogTest, GuardProtectTriggerBlocksExternalGuardFlip) { + { + FaultAuditLog log(path_, /*retention_max_records=*/3); + for (int i = 1; i <= 8; ++i) { + log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); + } + EXPECT_EQ(log.record_count(), 3); // in-process prune succeeded despite the protect trigger + EXPECT_TRUE(log.verify().ok); + } + + // An external writer (no in-process temp unlock marker) cannot open the guard. + EXPECT_NE(raw_exec_rc(path_, "UPDATE audit_prune_guard SET enabled = 1 WHERE id = 1"), SQLITE_OK); + // With the guard still closed, a raw DELETE remains blocked by the delete trigger. + EXPECT_NE(raw_exec_rc(path_, "DELETE FROM audit_log"), SQLITE_OK); + + FaultAuditLog reopened(path_, 3); + EXPECT_EQ(reopened.record_count(), 3); + EXPECT_TRUE(reopened.verify().ok); +} + int main(int argc, char ** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp index 589cffd9..14133999 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include @@ -20,6 +21,7 @@ #include #include #include +#include #include #include @@ -1537,6 +1539,8 @@ class FaultAuditIntegrationTest : public ::testing::Test { fm_options.parameter_overrides({ {"storage_type", "memory"}, {"confirmation_threshold", -1}, + {"healing_enabled", true}, + {"healing_threshold", 1}, // counter >= 1 heals (two PASSED after one FAILED) {"audit_log.enabled", true}, {"audit_log.database_path", audit_path_}, }); @@ -1603,19 +1607,66 @@ TEST_F(FaultAuditIntegrationTest, TransitionsAppendVerifiableChain) { ASSERT_TRUE(spin_until_ready(cf)); ASSERT_TRUE(cf.get()->success); - // Reopen the audit DB independently and inspect the persisted chain. + // Reopen the audit DB independently and inspect the persisted chain. The chain + // opens with a "logging_activated" lifecycle marker (seq 1), then the three + // fault transitions. The "logging_deactivated" marker is only appended when the + // node is destroyed (TearDown), so it is not visible to this read. ros2_medkit_fault_manager::FaultAuditLog audit(audit_path_); auto records = audit.read(); - ASSERT_EQ(records.size(), 3u); - EXPECT_EQ(records[0].event.transition, ros2_medkit_fault_manager::kTransitionOccurred); - EXPECT_EQ(records[1].event.transition, ros2_medkit_fault_manager::kTransitionConfirmed); - EXPECT_EQ(records[2].event.transition, ros2_medkit_fault_manager::kTransitionCleared); - EXPECT_EQ(records[0].event.fault_code, "AUDIT_FAULT"); - EXPECT_EQ(records[1].event.source_id, "/plc/pump"); + ASSERT_EQ(records.size(), 4u); + EXPECT_EQ(records[0].event.transition, ros2_medkit_fault_manager::kTransitionLoggingActivated); + EXPECT_EQ(records[1].event.transition, ros2_medkit_fault_manager::kTransitionOccurred); + EXPECT_EQ(records[2].event.transition, ros2_medkit_fault_manager::kTransitionConfirmed); + EXPECT_EQ(records[3].event.transition, ros2_medkit_fault_manager::kTransitionCleared); + EXPECT_EQ(records[1].event.fault_code, "AUDIT_FAULT"); + EXPECT_EQ(records[2].event.source_id, "/plc/pump"); + + auto result = audit.verify(); + EXPECT_TRUE(result.ok) << result.error; + EXPECT_EQ(result.checked, 4); +} + +// Completeness: an auto-healed fault must record its END. One FAILED confirms the +// fault (occurred + confirmed); two PASSED drive the debounce counter to the +// healing threshold, which must append a distinct "healed" row (source auto_heal), +// and the full occurred -> confirmed -> healed chain must verify. +TEST_F(FaultAuditIntegrationTest, AutoHealAppendsHealedRow) { + auto send_report = [&](uint8_t event_type) { + auto req = std::make_shared(); + req->fault_code = "HEAL_FAULT"; + req->event_type = event_type; + req->severity = Fault::SEVERITY_ERROR; // not CRITICAL: goes through debounce + req->description = "intermittent sensor"; + req->source_id = "/robot/sensor"; + auto fut = report_client_->async_send_request(req); + ASSERT_TRUE(spin_until_ready(fut)); + ASSERT_TRUE(fut.get()->accepted); + }; + + send_report(ReportFault::Request::EVENT_FAILED); // counter -1, threshold -1 => CONFIRMED + send_report(ReportFault::Request::EVENT_PASSED); // counter 0 (hysteresis): stays CONFIRMED + send_report(ReportFault::Request::EVENT_PASSED); // counter 1 >= healing_threshold => HEALED + + ros2_medkit_fault_manager::FaultAuditLog audit(audit_path_); + std::vector transitions; + std::string heal_source; + for (const auto & rec : audit.read()) { + if (rec.event.fault_code == "HEAL_FAULT") { + transitions.push_back(rec.event.transition); + if (rec.event.transition == ros2_medkit_fault_manager::kTransitionHealed) { + heal_source = rec.event.source_id; + } + } + } + + ASSERT_EQ(transitions.size(), 3u) << "expected occurred, confirmed, healed"; + EXPECT_EQ(transitions[0], ros2_medkit_fault_manager::kTransitionOccurred); + EXPECT_EQ(transitions[1], ros2_medkit_fault_manager::kTransitionConfirmed); + EXPECT_EQ(transitions[2], ros2_medkit_fault_manager::kTransitionHealed); + EXPECT_EQ(heal_source, "auto_heal"); auto result = audit.verify(); EXPECT_TRUE(result.ok) << result.error; - EXPECT_EQ(result.checked, 3); } TEST(FaultAuditDisabledTest, NoAuditFileWhenDisabled) { @@ -1685,6 +1736,104 @@ TEST(FaultAuditTimerTest, TimerConfirmationAppendsConfirmedAuditRow) { EXPECT_TRUE(audit->verify().ok); } +namespace { + +/// Force the node's next audit append to fail by inserting a row at the seq the +/// node will try next (MAX(seq)+1), so its INSERT collides on the seq PRIMARY KEY. +/// Done from a separate connection; the append-only triggers do not block INSERT. +void poison_next_audit_seq(const std::string & db_path) { + sqlite3 * db = nullptr; + ASSERT_EQ(sqlite3_open(db_path.c_str(), &db), SQLITE_OK); + const char * sql = + "INSERT INTO audit_log (seq, occurred_at_ns, fault_code, transition, severity, status, source_id, " + "description, prev_hash, record_hash) " + "SELECT COALESCE(MAX(seq), 0) + 1, 0, 'x', 'x', 0, 'x', 'x', 'x', 'x', 'x' FROM audit_log;"; + char * err = nullptr; + int rc = sqlite3_exec(db, sql, nullptr, nullptr, &err); + std::string err_str = err ? err : ""; + sqlite3_free(err); + sqlite3_close(db); + ASSERT_EQ(rc, SQLITE_OK) << err_str; +} + +Fault make_failclosed_fault() { + Fault f; + f.fault_code = "FAILCLOSED"; + f.severity = Fault::SEVERITY_ERROR; + f.status = "CONFIRMED"; + f.description = "injected"; + return f; +} + +std::string make_temp_audit_path(const char * prefix) { + std::random_device rd; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution dist; + return (std::filesystem::temp_directory_path() / (std::string(prefix) + std::to_string(dist(gen)) + ".db")).string(); +} + +void remove_audit_files(const std::string & path) { + std::error_code ec; + std::filesystem::remove(path, ec); + std::filesystem::remove(path + "-wal", ec); + std::filesystem::remove(path + "-shm", ec); +} + +} // namespace + +// Silent-gap guard, default behaviour: when fail_closed is false, an append +// failure must NOT abort the operation, but it must be VISIBLE - the dropped +// counter increments and the audit is flagged unhealthy (not silently lost). +TEST(FaultAuditFailClosedTest, FailOpenFlagsButDoesNotThrow) { + const std::string audit_path = make_temp_audit_path("test_audit_failopen_"); + { + rclcpp::NodeOptions options; + options.parameter_overrides({ + {"storage_type", "memory"}, + {"audit_log.enabled", true}, + {"audit_log.database_path", audit_path}, + {"audit_log.fail_closed", false}, + }); + auto node = std::make_shared(options); + ASSERT_TRUE(node->audit_healthy()); + EXPECT_EQ(node->audit_dropped_writes(), 0u); + + poison_next_audit_seq(audit_path); + + EXPECT_NO_THROW( + node->audit_transition_for_test(ros2_medkit_fault_manager::kTransitionConfirmed, make_failclosed_fault())); + EXPECT_EQ(node->audit_dropped_writes(), 1u); + EXPECT_FALSE(node->audit_healthy()); + } + remove_audit_files(audit_path); +} + +// Silent-gap guard, compliance-strict: with fail_closed true, an injected append +// failure must abort the operation (throw) rather than silently proceed, and the +// same health signals must fire. +TEST(FaultAuditFailClosedTest, FailClosedAbortsAndFlags) { + const std::string audit_path = make_temp_audit_path("test_audit_failclosed_"); + { + rclcpp::NodeOptions options; + options.parameter_overrides({ + {"storage_type", "memory"}, + {"audit_log.enabled", true}, + {"audit_log.database_path", audit_path}, + {"audit_log.fail_closed", true}, + }); + auto node = std::make_shared(options); + + poison_next_audit_seq(audit_path); + + EXPECT_THROW( + node->audit_transition_for_test(ros2_medkit_fault_manager::kTransitionConfirmed, make_failclosed_fault()), + std::exception); + EXPECT_EQ(node->audit_dropped_writes(), 1u); + EXPECT_FALSE(node->audit_healthy()); + } + remove_audit_files(audit_path); +} + int main(int argc, char ** argv) { rclcpp::init(argc, argv); ::testing::InitGoogleTest(&argc, argv); From d05c12e634681b42edb6f3ed0f59635be1827c81 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Wed, 1 Jul 2026 21:32:03 +0200 Subject: [PATCH 6/9] fix(fault-audit): total canonicalize, RAII db handle, bounded anchors canonicalize uses a non-throwing dump so non-UTF-8 content never breaks append; the sqlite handle is RAII so a throwing ctor cannot leak; rotation is best-effort after commit and never fails a durable append; anchors are pruned to the current boundary and guard-gated. Docs: fail_closed is not a rollback, health signals are in-process only, tail/prefix truncations are cheap; dropped the CIR citation. Refs #483 --- src/ros2_medkit_fault_manager/README.md | 12 +- .../fault_audit_log.hpp | 51 +++-- .../fault_manager_node.hpp | 14 +- .../src/fault_audit_log.cpp | 185 ++++++++++++------ .../src/fault_manager_node.cpp | 39 ++-- .../test/test_fault_audit_log.cpp | 123 +++++++++++- 6 files changed, 331 insertions(+), 93 deletions(-) diff --git a/src/ros2_medkit_fault_manager/README.md b/src/ros2_medkit_fault_manager/README.md index b03a22cc..f376f8dd 100644 --- a/src/ros2_medkit_fault_manager/README.md +++ b/src/ros2_medkit_fault_manager/README.md @@ -112,17 +112,17 @@ patterns: ## Advanced: Tamper-Evident Audit Log -An optional append-only, hash-chained audit log records every fault state transition (`occurred`, `confirmed`, `healed`, `cleared`) so the fault history is independently verifiable. Auto-recovery (a fault reaching the healing threshold via PASSED events) is recorded as a distinct `healed` row with source `auto_heal`, so the fault's END is in the timeline and is not confused with a manual `cleared`. The manager has no acknowledge action separate from clearing, so `~/clear_fault` is recorded as `cleared` (clear == ack); there is no `ack` kind. The log also records its own lifecycle with `logging_activated` / `logging_deactivated` markers (CIR (EU) 2024/2690 sec. 3.2) at start and stop. It is **off by default** because it adds a write and storage cost per transition. +An optional append-only, hash-chained audit log records every fault state transition (`occurred`, `confirmed`, `healed`, `cleared`) so the fault history is independently verifiable. Auto-recovery (a fault reaching the healing threshold via PASSED events) is recorded as a distinct `healed` row with source `auto_heal`, so the fault's END is in the timeline and is not confused with a manual `cleared`. The manager has no acknowledge action separate from clearing, so `~/clear_fault` is recorded as `cleared` (clear == ack); there is no `ack` kind. The log also records its own lifecycle with `logging_activated` / `logging_deactivated` markers at start and stop. It is **off by default** because it adds a write and storage cost per transition. Each transition appends one immutable row holding `record_hash = sha256(prev_hash + canonical(event))` (OpenSSL EVP SHA-256), the `prev_hash` it links to, and a monotonic `seq`. The hash is computed once at insert and never recomputed. A persisted chain head lets the chain resume across restarts. The log is stored in its own SQLite database (separate from the fault store) and is treated as append-only: the manager only ever inserts rows, and `BEFORE UPDATE` / `BEFORE DELETE` triggers reject out-of-band edits (the guarded rotation prune excepted). -**Completeness is an integrity property.** `verify()` proves nothing was *deleted* from the chain, but it cannot prove a transition that was *never appended*. So a silently dropped append is a hole `verify()` can never see. Every transition on the write path is therefore audited (occurred, timer/threshold confirmations, auto-heal, and clears), and an append failure is never swallowed silently: it increments a dropped-writes health counter and clears an "audit healthy" flag. With `audit_log.fail_closed` set, an append failure is a **hard error** that aborts the operation, so a compliance-strict deployment learns the audit broke instead of losing records unnoticed. The default (`fail_closed=false`) keeps fault processing running but still surfaces the gap via the health counter. +**Completeness is an integrity property.** `verify()` proves nothing was *deleted* from the chain, but it cannot prove a transition that was *never appended*. So a silently dropped append is a hole `verify()` can never see. Every transition on the write path is therefore audited (occurred, timer/threshold confirmations, auto-heal, and clears), and an append failure is never swallowed silently: it increments a dropped-writes counter and clears an "audit healthy" flag. **These are in-process signals only** (C++ getters on the node). This revision exposes no service/REST/health endpoint that surfaces audit health or lets an operator run `verify()` at runtime, so the signals are **not operator-observable at runtime yet** - a runtime read/verify/health surface is future work. With `audit_log.fail_closed` set, an append failure is re-raised as a **fail-FAST** error so a compliance-strict deployment learns the audit broke. This does **not** roll back the fault-state change that already committed: the audit log and the fault store are **separate SQLite databases**, so there is no cross-DB atomicity, and `fail_closed` is a broken-audit alarm requiring operator action, not a rollback. The default (`fail_closed=false`) keeps fault processing running; either way the in-process signals record the gap. -`verify()` walks the persisted chain oldest-first and recomputes every link: editing a row breaks its `record_hash`, deleting a row breaks the next row's `prev_hash` linkage, and deleting the newest row (the head row is read straight from the DB) is caught by the persisted-head check. +`verify()` walks the persisted chain oldest-first and recomputes every link: editing a row breaks its `record_hash`, and deleting a row breaks the next row's `prev_hash` linkage. Deleting the newest row *while leaving the head untouched* is caught by the persisted-head check (the head is read straight from the DB). However, deleting the newest row(s) **and** repointing the head with a single `UPDATE audit_chain_head SET seq=..., record_hash=...` to the prior row's values costs no more than any other casual edit - it is **not** the "recompute the entire chain" the threat model below might suggest - and the truncated chain still verifies. There is no external record that a later `seq` ever existed, so this tail-truncation is undetectable by design. -**Threat model (read this).** The chain is **unkeyed**, and the head and segment anchors live in the **same writable SQLite file** as the rows. `verify()` therefore catches edits or deletions that did **not** also recompute the chain - that is, casual or accidental tampering, and the bookkeeping bugs that would otherwise lose records. The append-only triggers are defense-in-depth: `audit_log` rejects out-of-band UPDATE/DELETE, and the rotation-prune guard (`audit_prune_guard`) is itself protected by a trigger so an external writer cannot simply flip it open and then delete a prefix - that flip is only permitted from the in-process connection that holds a per-connection temp marker. The single-row chain head (`audit_chain_head`) is intentionally **not** trigger-protected (a trigger there would block the legitimate head update inside the append transaction); a casual edit or delete of the head is instead caught by `verify()` via the seq/hash/head-mismatch checks. None of this stops an attacker with write access to the file: such an attacker can create the same temp marker or drop the triggers, and recompute the entire chain (head and anchors included) to forge a self-consistent history. The triggers are **not** a security boundary - this is tamper-**evident**, not tamper-**proof**. True tamper-*proofing* requires a key or signature over the head (so it cannot be recomputed without the key) or external anchoring of the head hash to an append-only store you do not control; both are out of scope here and belong to the audit-log exporter / signing follow-up. +**Threat model (read this).** The chain is **unkeyed**, and the head and segment anchors live in the **same writable SQLite file** as the rows. `verify()` therefore catches edits or deletions that did **not** also recompute the chain - that is, casual or accidental tampering, and the bookkeeping bugs that would otherwise lose records. The append-only triggers are defense-in-depth: `audit_log` rejects out-of-band UPDATE/DELETE, `audit_anchors` carries the same guard-gated triggers so an out-of-band INSERT/UPDATE/DELETE of an anchor is rejected too, and the rotation-prune guard (`audit_prune_guard`) is itself protected by a trigger so an external writer cannot simply flip it open and then delete a prefix (or forge an anchor) - that flip is only permitted from the in-process connection that holds a per-connection temp marker. The single-row chain head (`audit_chain_head`) is intentionally **not** trigger-protected (a trigger there would block the legitimate head update inside the append transaction); a casual edit or delete of the head is instead caught by `verify()` via the seq/hash/head-mismatch checks. None of this stops an attacker with write access to the file: such an attacker can create the same temp marker or drop the triggers, and recompute the entire chain (head and anchors included) to forge a self-consistent history - and cheaper still, the tail-truncation above and the forged prefix-truncation below need no recompute at all. The triggers are **not** a security boundary - this is tamper-**evident**, not tamper-**proof**. True tamper-*proofing* requires a key or signature over the head (so it cannot be recomputed without the key) or external anchoring of the head hash to an append-only store you do not control; both are out of scope here and belong to the audit-log exporter / signing follow-up. -**Retention/rotation**: when more than `audit_log.retention_max_records` rows are retained, the oldest segment is *sealed* (its final `seq` + hash are persisted as an anchor) and then pruned. The surviving tail still verifies because the oldest retained row links back to the sealed anchor. +**Retention/rotation**: when more than `audit_log.retention_max_records` rows are retained, the oldest segment is *sealed* (its final `seq` + hash are persisted as an anchor) and then pruned. The surviving tail still verifies because the oldest retained row links back to the sealed anchor. Only the anchor at the current prune boundary is kept - the same rotation drops older anchors - so `audit_anchors` stays bounded (one row) instead of growing one row per rotation. Because `verify()` treats any matching sealed anchor as a valid tail root, a **forged** prefix-truncation (an out-of-band actor deletes a prefix and inserts a matching anchor) is **indistinguishable** from legitimate pruning: "the surviving tail still verifies" therefore covers a forged truncation exactly as well as a real one. The guard-gated `audit_anchors` triggers raise the bar for this (casual/accidental only) but, like every trigger here, a write-capable adversary can drop them - so this stays tamper-**evident**, not tamper-**proof**. | Parameter | Type | Default | Description | |-----------|------|---------|-------------| @@ -130,7 +130,7 @@ Each transition appends one immutable row holding `record_hash = sha256(prev_has | `audit_log.transitions` | string | `"all"` | Which transitions to record: `"all"` (occurred/confirmed/healed/cleared) or `"confirmed_only"`. Lifecycle markers are always recorded. | | `audit_log.database_path` | string | `""` | SQLite path. Empty => sibling `fault_audit.db` next to the fault DB (or `:memory:` for in-memory fault stores) | | `audit_log.retention_max_records` | int | `0` | Seal + prune the oldest segment beyond this many retained records (0 = unlimited) | -| `audit_log.fail_closed` | bool | `false` | When `true`, an audit append failure is a hard error that aborts the operation (compliance-strict). When `false`, the failure is logged and counted but fault processing continues. Either way the gap is visible via the dropped-writes health counter. | +| `audit_log.fail_closed` | bool | `false` | When `true`, an audit append failure is re-raised as a fail-FAST error signalling the audit chain is broken and needs operator action. It does **not** roll back the already-committed fault-state change (the fault store is a separate DB - no cross-DB atomicity). When `false`, the failure is logged and counted and fault processing continues. Either way the gap is recorded via the in-process dropped-writes / audit-healthy signals (not operator-observable at runtime yet). | ## Usage diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp index 3a954159..96709069 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_audit_log.hpp @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -24,6 +25,19 @@ namespace ros2_medkit_fault_manager { +/// Deleter that closes an owned sqlite3 handle. sqlite3_close(nullptr) is a +/// no-op, and unique_ptr never invokes the deleter on a null pointer anyway. +struct SqliteDeleter { + void operator()(sqlite3 * db) const noexcept { + sqlite3_close(db); + } +}; + +/// Owning handle for the connection. RAII so the handle is always closed, +/// including on a throw during construction (a raw pointer would leak because +/// the destructor does not run when the constructor throws). +using SqliteHandle = std::unique_ptr; + /// A single fault state-transition to record in the audit log. /// /// `transition` is one of the kTransition* constants below. The remaining @@ -48,9 +62,9 @@ constexpr const char * kTransitionCleared = "cleared"; /// distinct from kTransitionCleared so an automatic recovery is not mistaken for /// a manual clear in the timeline. constexpr const char * kTransitionHealed = "healed"; -/// Audit-log lifecycle markers (CIR (EU) 2024/2690 sec. 3.2: activation / -/// deactivation of logging). Appended directly, independent of the per-fault -/// transition filter, so the log records its own start and stop. +/// Audit-log lifecycle markers: activation / deactivation of logging. Appended +/// directly, independent of the per-fault transition filter, so the log records +/// its own start and stop. constexpr const char * kTransitionLoggingActivated = "logging_activated"; constexpr const char * kTransitionLoggingDeactivated = "logging_deactivated"; // NOTE: there is deliberately no "ack" kind. The open fault_manager has no @@ -95,11 +109,16 @@ struct AuditVerifyResult { /// rotation prune excepted) as defense-in-depth. /// /// Threat model: the hash chain is UNKEYED and the head/anchors live in the same -/// writable file. verify() catches edits or deletions that did not also recompute -/// the chain (casual or accidental tampering), but anyone with write access to the -/// file can recompute the whole chain (and drop the triggers) and forge a -/// consistent history. True tamper-proofing needs a key/signature over the head or -/// external anchoring; that is out of scope here. +/// writable file, so this is tamper-EVIDENT, not tamper-PROOF. verify() catches +/// edits or deletions that did not also recompute the chain (casual or accidental +/// tampering). It does NOT stop a write-capable adversary, and two truncations are +/// cheap and undetectable by design: (a) dropping the newest row(s) and repointing +/// audit_chain_head with a single UPDATE to the prior (seq, hash) still verifies - +/// there is no external record that a later seq ever existed; (b) a forged +/// prefix-truncation (delete a prefix + insert a matching anchor) is +/// indistinguishable from legitimate rotation, so a surviving-tail-that-verifies +/// does not prove the prefix was never present. True tamper-proofing needs a +/// key/signature over the head or external anchoring; that is out of scope here. class FaultAuditLog { public: /// Open (or create) the audit log database. @@ -117,7 +136,10 @@ class FaultAuditLog { FaultAuditLog & operator=(FaultAuditLog &&) = delete; /// Append one transition. Computes the chained hash, inserts the row, and - /// advances the persisted head, atomically. + /// advances the persisted head in a single transaction (atomic within THIS + /// database). Once that transaction commits the append is durable; any + /// retention rotation that follows is best-effort maintenance and never fails + /// the committed append (its failures are counted via rotation_failures()). /// @return the monotonic seq assigned to the new record. int64_t append(const AuditEvent & event); @@ -135,6 +157,12 @@ class FaultAuditLog { /// Number of records currently retained (excludes pruned/sealed rows). int64_t record_count() const; + /// Number of best-effort rotation cycles that failed AFTER a durable append. + /// A rotation failure (e.g. checkpoint busy, disk-full on the prune) is retried + /// on the next append and never turns the already-committed append into a + /// reported drop; this counter makes such maintenance failures observable. + int64_t rotation_failures() const; + /// Deterministic canonical serialization of an event at a given seq. /// Stable field order so verify is reproducible across processes. static std::string canonicalize(int64_t seq, const AuditEvent & event); @@ -162,9 +190,10 @@ class FaultAuditLog { std::string db_path_; int64_t retention_max_records_{0}; - sqlite3 * db_{nullptr}; + SqliteHandle db_; ///< owning connection; closed by RAII even if the ctor throws mutable std::mutex mutex_; - ChainHead head_; ///< cached head, kept in sync with the head table + ChainHead head_; ///< cached head, kept in sync with the head table + int64_t rotation_failures_{0}; ///< best-effort rotation failures (guarded by mutex_) }; } // namespace ros2_medkit_fault_manager diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp index a2226537..0f7a5cc1 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_manager_node.hpp @@ -76,13 +76,17 @@ class FaultManagerNode : public rclcpp::Node { /// Health signal: number of audit transitions that failed to append (and were /// therefore lost from the chain). 0 when the audit is healthy or disabled. + /// IN-PROCESS ONLY: this is a C++ getter, not exposed over any service/REST/ + /// health surface yet, so an operator cannot read it at runtime in this + /// revision (a runtime read/verify/health surface is future work). uint64_t audit_dropped_writes() const { return audit_dropped_writes_.load(std::memory_order_relaxed); } /// Health signal: false once any audit append has failed. A compliance-strict /// deployment should treat a false here (or a non-zero audit_dropped_writes()) - /// as the audit chain being incomplete. + /// as the audit chain being incomplete. IN-PROCESS ONLY (see above): not yet + /// observable at runtime by an operator. bool audit_healthy() const { return audit_healthy_.load(std::memory_order_relaxed); } @@ -209,8 +213,12 @@ class FaultManagerNode : public rclcpp::Node { /// Tamper-evident audit log of fault transitions (nullptr when disabled). std::unique_ptr audit_log_; - bool audit_confirmed_only_{false}; ///< When true, only "confirmed" transitions are logged - bool audit_fail_closed_{false}; ///< When true, an audit append failure aborts the operation + bool audit_confirmed_only_{false}; ///< When true, only "confirmed" transitions are logged + /// When true, an audit append failure is re-raised as a fail-FAST signal that + /// the chain is broken (operator must act). It does NOT roll back the already- + /// committed fault-state change (the fault store is a separate DB); it is not + /// atomicity. + bool audit_fail_closed_{false}; std::atomic audit_dropped_writes_{0}; ///< Count of audit appends that failed (lost rows) std::atomic audit_healthy_{true}; ///< Cleared on the first failed audit append diff --git a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp index 78b47c81..11411851 100644 --- a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp @@ -31,6 +31,19 @@ namespace { /// verifier can confirm the chain starts where it claims to. constexpr const char * kGenesisHash = "0000000000000000000000000000000000000000000000000000000000000000"; +/// JSON-encode a string value for the canonical form. Audit content +/// (description / source_id / fault_code / ...) is UNVALIDATED request data and +/// may contain arbitrary bytes. nlohmann's default (strict) UTF-8 handler THROWS +/// on any non-UTF-8 byte; that would make canonicalize() -> append() throw and +/// leave a silent, un-verifiable gap in the chain. The `replace` handler +/// substitutes U+FFFD for invalid bytes deterministically, so any byte sequence +/// canonicalizes reproducibly and append never throws on content. Output for +/// valid UTF-8 is byte-for-byte identical to the old strict dump, so chains +/// written before this change still verify. +std::string json_encode(const std::string & value) { + return nlohmann::json(value).dump(-1, ' ', false, nlohmann::json::error_handler_t::replace); +} + /// RAII wrapper for a prepared SQLite statement (audit-log local copy). class Stmt { public: @@ -105,36 +118,33 @@ void audit_prune_authorized(sqlite3_context * ctx, int /*argc*/, sqlite3_value * FaultAuditLog::FaultAuditLog(const std::string & db_path, int64_t retention_max_records) : db_path_(db_path), retention_max_records_(retention_max_records < 0 ? 0 : retention_max_records) { int flags = SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE | SQLITE_OPEN_FULLMUTEX; - if (sqlite3_open_v2(db_path.c_str(), &db_, flags, nullptr) != SQLITE_OK) { - std::string error = db_ ? sqlite3_errmsg(db_) : "unknown error"; - if (db_) { - sqlite3_close(db_); - db_ = nullptr; - } + sqlite3 * raw = nullptr; + if (sqlite3_open_v2(db_path.c_str(), &raw, flags, nullptr) != SQLITE_OK) { + std::string error = raw ? sqlite3_errmsg(raw) : "unknown error"; + sqlite3_close(raw); // no-op on nullptr; releases a failed-open handle. throw std::runtime_error("audit: failed to open '" + db_path + "': " + error); } + // Take ownership immediately. From here on ANY throw (including out of the + // PRAGMA / schema / head setup below) runs db_'s destructor and closes the + // handle: a member unique_ptr is destroyed even when the constructor throws, + // whereas a raw pointer whose only cleanup lived in ~FaultAuditLog() would leak. + db_.reset(raw); - exec_or_throw(db_, "PRAGMA journal_mode=WAL;", "enable WAL"); - sqlite3_busy_timeout(db_, 5000); + exec_or_throw(db_.get(), "PRAGMA journal_mode=WAL;", "enable WAL"); + sqlite3_busy_timeout(db_.get(), 5000); // Authorize this connection for the prune-guard protection trigger (see below). - if (sqlite3_create_function_v2(db_, "audit_prune_authorized", 0, SQLITE_UTF8, nullptr, &audit_prune_authorized, + if (sqlite3_create_function_v2(db_.get(), "audit_prune_authorized", 0, SQLITE_UTF8, nullptr, &audit_prune_authorized, nullptr, nullptr, nullptr) != SQLITE_OK) { - std::string error = sqlite3_errmsg(db_); - sqlite3_close(db_); - db_ = nullptr; - throw std::runtime_error("audit: failed to register prune-authorization function: " + error); + throw std::runtime_error(std::string("audit: failed to register prune-authorization function: ") + + sqlite3_errmsg(db_.get())); } initialize_schema(); head_ = load_head_locked(); } -FaultAuditLog::~FaultAuditLog() { - if (db_) { - sqlite3_close(db_); - } -} +FaultAuditLog::~FaultAuditLog() = default; std::string FaultAuditLog::genesis_hash() { return kGenesisHash; @@ -169,7 +179,8 @@ std::string FaultAuditLog::sha256_hex(const std::string & data) { std::string FaultAuditLog::canonicalize(int64_t seq, const AuditEvent & event) { // Deterministic field order with JSON-escaped string values. Numeric fields // render in fixed decimal form. This is the exact byte sequence the hash is - // taken over, so it must be stable across processes and re-reads. + // taken over, so it must be stable across processes and re-reads (and total + // for arbitrary bytes - see json_encode). std::string out; out.reserve(192); out += "{\"seq\":"; @@ -177,24 +188,24 @@ std::string FaultAuditLog::canonicalize(int64_t seq, const AuditEvent & event) { out += ",\"ts\":"; out += std::to_string(event.occurred_at_ns); out += ",\"code\":"; - out += nlohmann::json(event.fault_code).dump(); + out += json_encode(event.fault_code); out += ",\"transition\":"; - out += nlohmann::json(event.transition).dump(); + out += json_encode(event.transition); out += ",\"severity\":"; out += std::to_string(static_cast(event.severity)); out += ",\"status\":"; - out += nlohmann::json(event.status).dump(); + out += json_encode(event.status); out += ",\"source\":"; - out += nlohmann::json(event.source_id).dump(); + out += json_encode(event.source_id); out += ",\"description\":"; - out += nlohmann::json(event.description).dump(); + out += json_encode(event.description); out += "}"; return out; } void FaultAuditLog::initialize_schema() { // Immutable transition rows. seq is the monotonic chain index. - exec_or_throw(db_, + exec_or_throw(db_.get(), R"( CREATE TABLE IF NOT EXISTS audit_log ( seq INTEGER PRIMARY KEY, @@ -213,7 +224,7 @@ void FaultAuditLog::initialize_schema() { "create audit_log table"); // Single-row persisted chain head, so the chain resumes across restarts. - exec_or_throw(db_, + exec_or_throw(db_.get(), R"( CREATE TABLE IF NOT EXISTS audit_chain_head ( id INTEGER PRIMARY KEY CHECK (id = 1), @@ -225,7 +236,7 @@ void FaultAuditLog::initialize_schema() { // Sealed-segment anchors written before pruning. Each captures the final // (seq, hash) of a pruned prefix so the surviving tail stays verifiable. - exec_or_throw(db_, + exec_or_throw(db_.get(), R"( CREATE TABLE IF NOT EXISTS audit_anchors ( last_seq INTEGER PRIMARY KEY, @@ -239,7 +250,7 @@ void FaultAuditLog::initialize_schema() { // boundary: anyone able to write the file can also DROP these triggers). The // single-row guard lets the in-process rotation prune delete a sealed prefix // while every other UPDATE/DELETE on audit_log is rejected. - exec_or_throw(db_, + exec_or_throw(db_.get(), R"( CREATE TABLE IF NOT EXISTS audit_prune_guard ( id INTEGER PRIMARY KEY CHECK (id = 1), @@ -268,7 +279,7 @@ void FaultAuditLog::initialize_schema() { // out-of-band UPDATE fails because that function is unknown to it. This is still // defense-in-depth, not a security boundary: a write-capable adversary can // register the same function or DROP this trigger. verify() remains the backstop. - exec_or_throw(db_, + exec_or_throw(db_.get(), R"( CREATE TRIGGER IF NOT EXISTS audit_prune_guard_protect BEFORE UPDATE ON audit_prune_guard @@ -278,10 +289,41 @@ void FaultAuditLog::initialize_schema() { END; )", "create prune-guard protection trigger"); + + // Guard audit_anchors with the SAME in-process prune guard as audit_log. + // verify() links the surviving tail back to a sealed anchor, so an out-of-band + // writer that could freely INSERT a fake anchor (or DELETE a real one) could + // forge a prefix-truncation that is indistinguishable from legitimate pruning. + // Anchors are never UPDATEd by this class, and the in-process rotation INSERT + + // prune both run with the guard open, so those pass. Defense-in-depth only, NOT + // a security boundary: a write-capable adversary can flip the guard or DROP + // these triggers; verify() remains the backstop and cannot see a forged + // truncation performed this way. + exec_or_throw(db_.get(), + R"( + CREATE TRIGGER IF NOT EXISTS audit_anchors_no_update + BEFORE UPDATE ON audit_anchors + BEGIN + SELECT RAISE(ABORT, 'audit_anchors is not updatable'); + END; + CREATE TRIGGER IF NOT EXISTS audit_anchors_no_insert + BEFORE INSERT ON audit_anchors + WHEN (SELECT enabled FROM audit_prune_guard WHERE id = 1) IS NOT 1 + BEGIN + SELECT RAISE(ABORT, 'audit_anchors sealing is in-process only'); + END; + CREATE TRIGGER IF NOT EXISTS audit_anchors_no_delete + BEFORE DELETE ON audit_anchors + WHEN (SELECT enabled FROM audit_prune_guard WHERE id = 1) IS NOT 1 + BEGIN + SELECT RAISE(ABORT, 'audit_anchors pruning is in-process only'); + END; + )", + "create audit_anchors protection triggers"); } std::optional FaultAuditLog::read_head_row_locked() const { - Stmt stmt(db_, "SELECT seq, record_hash FROM audit_chain_head WHERE id = 1"); + Stmt stmt(db_.get(), "SELECT seq, record_hash FROM audit_chain_head WHERE id = 1"); if (stmt.step() == SQLITE_ROW) { ChainHead head_record; head_record.seq = stmt.column_int64(0); @@ -304,13 +346,13 @@ ChainHead FaultAuditLog::load_head_locked() const { } void FaultAuditLog::store_head_locked(const ChainHead & head_record) { - Stmt stmt(db_, + Stmt stmt(db_.get(), "INSERT INTO audit_chain_head (id, seq, record_hash) VALUES (1, ?, ?) " "ON CONFLICT(id) DO UPDATE SET seq = excluded.seq, record_hash = excluded.record_hash"); stmt.bind_int64(1, head_record.seq); stmt.bind_text(2, head_record.record_hash); if (stmt.step() != SQLITE_DONE) { - throw std::runtime_error(std::string("audit: failed to store head: ") + sqlite3_errmsg(db_)); + throw std::runtime_error(std::string("audit: failed to store head: ") + sqlite3_errmsg(db_.get())); } } @@ -322,9 +364,9 @@ int64_t FaultAuditLog::append(const AuditEvent & event) { const std::string canonical = canonicalize(new_seq, event); const std::string record_hash = sha256_hex(prev_hash + canonical); - exec_or_throw(db_, "BEGIN IMMEDIATE", "begin append"); + exec_or_throw(db_.get(), "BEGIN IMMEDIATE", "begin append"); try { - Stmt insert(db_, + Stmt insert(db_.get(), "INSERT INTO audit_log (seq, occurred_at_ns, fault_code, transition, severity, status, " "source_id, description, prev_hash, record_hash) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"); insert.bind_int64(1, new_seq); @@ -338,19 +380,29 @@ int64_t FaultAuditLog::append(const AuditEvent & event) { insert.bind_text(9, prev_hash); insert.bind_text(10, record_hash); if (insert.step() != SQLITE_DONE) { - throw std::runtime_error(std::string("audit: failed to insert record: ") + sqlite3_errmsg(db_)); + throw std::runtime_error(std::string("audit: failed to insert record: ") + sqlite3_errmsg(db_.get())); } store_head_locked(ChainHead{new_seq, record_hash}); - exec_or_throw(db_, "COMMIT", "commit append"); + exec_or_throw(db_.get(), "COMMIT", "commit append"); } catch (...) { - exec_or_throw(db_, "ROLLBACK", "rollback append"); + exec_or_throw(db_.get(), "ROLLBACK", "rollback append"); throw; } head_ = ChainHead{new_seq, record_hash}; - rotate_if_needed_locked(); + // The append is durable now that COMMIT succeeded. Retention rotation is + // best-effort maintenance (seal + prune of the oldest segment); a rotation + // failure (SQLITE_BUSY on a WAL checkpoint, disk-full on the prune DELETE) + // must NOT turn a committed append into a reported dropped write. Swallow it, + // count it so the maintenance failure stays observable, and let the next + // append retry the rotation. + try { + rotate_if_needed_locked(); + } catch (const std::exception &) { + ++rotation_failures_; + } return new_seq; } @@ -363,7 +415,7 @@ void FaultAuditLog::rotate_if_needed_locked() { int64_t count = 0; int64_t min_seq = 0; { - Stmt stmt(db_, "SELECT COUNT(*), COALESCE(MIN(seq), 0) FROM audit_log"); + Stmt stmt(db_.get(), "SELECT COUNT(*), COALESCE(MIN(seq), 0) FROM audit_log"); if (stmt.step() == SQLITE_ROW) { count = stmt.column_int64(0); min_seq = stmt.column_int64(1); @@ -382,7 +434,7 @@ void FaultAuditLog::rotate_if_needed_locked() { std::string boundary_hash; int64_t sealed_at_ns = 0; { - Stmt stmt(db_, "SELECT record_hash, occurred_at_ns FROM audit_log WHERE seq = ?"); + Stmt stmt(db_.get(), "SELECT record_hash, occurred_at_ns FROM audit_log WHERE seq = ?"); stmt.bind_int64(1, boundary_seq); if (stmt.step() != SQLITE_ROW) { // Should not happen given the count; leave the log intact rather than @@ -393,33 +445,45 @@ void FaultAuditLog::rotate_if_needed_locked() { sealed_at_ns = stmt.column_int64(1); } - exec_or_throw(db_, "BEGIN IMMEDIATE", "begin rotate"); + exec_or_throw(db_.get(), "BEGIN IMMEDIATE", "begin rotate"); try { - Stmt anchor(db_, + // Open the prune guard FIRST so every guarded write below - the anchor + // INSERT, the sealed-prefix audit_log DELETE, and the old-anchor DELETE - + // passes its guard-gated trigger. Both the flip and the writes are in this + // transaction, so a ROLLBACK restores the guard to its closed state. + exec_or_throw(db_.get(), "UPDATE audit_prune_guard SET enabled = 1 WHERE id = 1", "open prune guard"); + + Stmt anchor(db_.get(), "INSERT INTO audit_anchors (last_seq, sealed_at_ns, last_hash) VALUES (?, ?, ?) " "ON CONFLICT(last_seq) DO NOTHING"); anchor.bind_int64(1, boundary_seq); anchor.bind_int64(2, sealed_at_ns); anchor.bind_text(3, boundary_hash); if (anchor.step() != SQLITE_DONE) { - throw std::runtime_error(std::string("audit: failed to write anchor: ") + sqlite3_errmsg(db_)); + throw std::runtime_error(std::string("audit: failed to write anchor: ") + sqlite3_errmsg(db_.get())); } - // Open the prune guard so the BEFORE DELETE trigger allows this sealed-prefix - // delete. Both the guard flip and the delete are in this transaction, so a - // ROLLBACK restores the guard to its closed state. - exec_or_throw(db_, "UPDATE audit_prune_guard SET enabled = 1 WHERE id = 1", "open prune guard"); - - Stmt del(db_, "DELETE FROM audit_log WHERE seq <= ?"); + Stmt del(db_.get(), "DELETE FROM audit_log WHERE seq <= ?"); del.bind_int64(1, boundary_seq); if (del.step() != SQLITE_DONE) { - throw std::runtime_error(std::string("audit: failed to prune records: ") + sqlite3_errmsg(db_)); + throw std::runtime_error(std::string("audit: failed to prune records: ") + sqlite3_errmsg(db_.get())); + } + + // Bound audit_anchors: verify() only ever needs the anchor at the current + // prune boundary (the oldest retained row links to it, and an all-pruned log + // links its head to it). Older anchors describe already-forgotten prefixes + // and are never read again, so drop them - otherwise audit_anchors grows one + // row per rotation without bound while audit_log stays at the cap. + Stmt prune_anchors(db_.get(), "DELETE FROM audit_anchors WHERE last_seq < ?"); + prune_anchors.bind_int64(1, boundary_seq); + if (prune_anchors.step() != SQLITE_DONE) { + throw std::runtime_error(std::string("audit: failed to prune anchors: ") + sqlite3_errmsg(db_.get())); } - exec_or_throw(db_, "UPDATE audit_prune_guard SET enabled = 0 WHERE id = 1", "close prune guard"); - exec_or_throw(db_, "COMMIT", "commit rotate"); + exec_or_throw(db_.get(), "UPDATE audit_prune_guard SET enabled = 0 WHERE id = 1", "close prune guard"); + exec_or_throw(db_.get(), "COMMIT", "commit rotate"); } catch (...) { - exec_or_throw(db_, "ROLLBACK", "rollback rotate"); + exec_or_throw(db_.get(), "ROLLBACK", "rollback rotate"); throw; } } @@ -434,7 +498,7 @@ std::vector FaultAuditLog::read(int64_t limit, int64_t after_seq) c sql += " LIMIT ?"; } - Stmt stmt(db_, sql.c_str()); + Stmt stmt(db_.get(), sql.c_str()); stmt.bind_int64(1, after_seq); if (limit > 0) { stmt.bind_int64(2, limit); @@ -465,20 +529,25 @@ ChainHead FaultAuditLog::head() const { int64_t FaultAuditLog::record_count() const { std::lock_guard lock(mutex_); - Stmt stmt(db_, "SELECT COUNT(*) FROM audit_log"); + Stmt stmt(db_.get(), "SELECT COUNT(*) FROM audit_log"); if (stmt.step() != SQLITE_ROW) { return 0; } return stmt.column_int64(0); } +int64_t FaultAuditLog::rotation_failures() const { + std::lock_guard lock(mutex_); + return rotation_failures_; +} + AuditVerifyResult FaultAuditLog::verify() const { std::lock_guard lock(mutex_); AuditVerifyResult result; // Walk every retained row oldest-first, recomputing each link. - Stmt stmt(db_, + Stmt stmt(db_.get(), "SELECT seq, occurred_at_ns, fault_code, transition, severity, status, source_id, description, " "prev_hash, record_hash FROM audit_log ORDER BY seq ASC"); @@ -511,7 +580,7 @@ AuditVerifyResult FaultAuditLog::verify() const { return result; } } else { - Stmt anchor(db_, "SELECT last_hash FROM audit_anchors WHERE last_seq = ?"); + Stmt anchor(db_.get(), "SELECT last_hash FROM audit_anchors WHERE last_seq = ?"); anchor.bind_int64(1, rec.seq - 1); if (anchor.step() != SQLITE_ROW) { result.ok = false; @@ -582,7 +651,7 @@ AuditVerifyResult FaultAuditLog::verify() const { // Empty retained log with a head past genesis: everything was pruned, so the // head must point at a sealed anchor. (No head row, or a genesis head, on an // empty log is the never-written case and is fine.) - Stmt anchor(db_, "SELECT last_hash FROM audit_anchors WHERE last_seq = ?"); + Stmt anchor(db_.get(), "SELECT last_hash FROM audit_anchors WHERE last_seq = ?"); anchor.bind_int64(1, persisted->seq); if (anchor.step() != SQLITE_ROW || anchor.column_text(0) != persisted->record_hash) { result.ok = false; diff --git a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp index a4579a4d..d8a72717 100644 --- a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp @@ -366,9 +366,9 @@ FaultManagerNode::~FaultManagerNode() { rosbag_capture_->stop(); } - // Close the chain with a "logging deactivated" marker (CIR (EU) 2024/2690 - // sec. 3.2). Best-effort and appended directly (recorded even in confirmed_only - // mode); a failure here must not throw out of the destructor. + // Close the chain with a "logging deactivated" marker. Best-effort and appended + // directly (recorded even in confirmed_only mode); a failure here must not throw + // out of the destructor. if (audit_log_) { try { AuditEvent marker; @@ -436,9 +436,12 @@ std::unique_ptr FaultManagerNode::create_audit_log() { retention = 0; } - // Fail-closed: when set, an append failure is a hard error (the operation - // aborts and the audit is marked unhealthy) instead of being logged and - // silently dropped. Off by default so the audit never blocks fault processing. + // Fail-closed: when set, an audit append failure is re-raised as a hard error + // (fail-FAST) so a broken audit chain cannot be silently outlived. It does NOT + // roll back the fault-state change that triggered the transition - that change + // has already committed to the SEPARATE fault-store DB and cross-DB atomicity + // is not achievable here - it only signals the broken audit so an operator must + // act. Off by default so the audit never blocks fault processing. audit_fail_closed_ = declare_parameter("audit_log.fail_closed", false); // Path: explicit override, else a sibling of the fault DB, else :memory:. @@ -478,9 +481,9 @@ std::unique_ptr FaultManagerNode::create_audit_log() { audit_path.c_str(), audit_confirmed_only_ ? "confirmed_only" : "all", retention, audit_fail_closed_ ? "true" : "false", log->head().seq); - // Record a "logging activated" marker so the chain documents its own start - // (CIR (EU) 2024/2690 sec. 3.2). Appended directly, so it is recorded even in - // confirmed_only mode. Best-effort: a failure here does not abort startup. + // Record a "logging activated" marker so the chain documents its own start. + // Appended directly, so it is recorded even in confirmed_only mode. + // Best-effort: a failure here does not abort startup. AuditEvent marker; marker.fault_code = "__audit__"; marker.transition = kTransitionLoggingActivated; @@ -522,18 +525,26 @@ void FaultManagerNode::audit_transition(const char * transition, const ros2_medk audit_log_->append(event); } catch (const std::exception & e) { // A dropped append is a hole in the chain that verify() can never see (it - // proves nothing was deleted, not that everything was written). Always make - // it loud and visible via the health counter so the gap is observable. + // proves nothing was deleted, not that everything was written). The fault- + // state transition that triggered this has ALREADY committed to the fault + // store, which is a SEPARATE SQLite database: there is no cross-DB atomicity, + // so this failure does NOT (and cannot) roll that change back. All we can do + // is make the broken audit loud and observable via the health signals. audit_dropped_writes_.fetch_add(1, std::memory_order_relaxed); audit_healthy_.store(false, std::memory_order_relaxed); const uint64_t dropped = audit_dropped_writes_.load(std::memory_order_relaxed); RCLCPP_ERROR(get_logger(), "Failed to append audit record for '%s' (%s): %s [audit dropped_writes=%" PRIu64 "]", fault.fault_code.c_str(), transition, e.what(), dropped); if (audit_fail_closed_) { - // Compliance-strict: refuse to proceed as if nothing happened. Abort the - // operation so the broken audit cannot be silently outlived. + // Explicit fail-FAST: surface the broken audit to the caller so a + // compliance-strict deployment learns the chain is now incomplete and an + // operator must act. Rethrowing propagates that signal ONLY - the fault- + // state change already committed (see above) and is NOT rolled back; this + // is not atomicity. RCLCPP_FATAL(get_logger(), - "audit_log.fail_closed is set: aborting operation because the audit append failed for '%s' (%s)", + "audit_log.fail_closed is set: audit append failed for '%s' (%s); the audit chain is now " + "incomplete and requires operator action. The already-committed fault-state change is NOT " + "rolled back (the fault store is a separate database).", fault.fault_code.c_str(), transition); throw; } diff --git a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp index d6e6ed06..79ec5192 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp @@ -76,6 +76,28 @@ int raw_exec_rc(const std::string & db_path, const std::string & sql) { const char * const kDropTriggers = "DROP TRIGGER IF EXISTS audit_log_no_update; DROP TRIGGER IF EXISTS audit_log_no_delete; "; +/// Drop the guard-gated audit_anchors triggers so a raw anchor write can land +/// (same attacker model as kDropTriggers, for the anchor table). +const char * const kDropAnchorTriggers = + "DROP TRIGGER IF EXISTS audit_anchors_no_update; DROP TRIGGER IF EXISTS audit_anchors_no_insert; " + "DROP TRIGGER IF EXISTS audit_anchors_no_delete; "; + +/// Count rows in a table via a fresh connection (used to assert audit_anchors +/// stays bounded across many rotations). +int64_t count_table_rows(const std::string & db_path, const std::string & table) { + sqlite3 * db = nullptr; + EXPECT_EQ(sqlite3_open(db_path.c_str(), &db), SQLITE_OK); + sqlite3_stmt * stmt = nullptr; + const std::string sql = "SELECT COUNT(*) FROM " + table; + int64_t rows = -1; + if (sqlite3_prepare_v2(db, sql.c_str(), -1, &stmt, nullptr) == SQLITE_OK && sqlite3_step(stmt) == SQLITE_ROW) { + rows = sqlite3_column_int64(stmt, 0); + } + sqlite3_finalize(stmt); + sqlite3_close(db); + return rows; +} + } // namespace class FaultAuditLogTest : public ::testing::Test { @@ -280,7 +302,7 @@ TEST_F(FaultAuditLogTest, MissingAnchorFailsVerify) { log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); } } - raw_exec(path_, "DELETE FROM audit_anchors"); + raw_exec(path_, std::string(kDropAnchorTriggers) + "DELETE FROM audit_anchors"); FaultAuditLog reopened(path_, 5); EXPECT_FALSE(reopened.verify().ok); @@ -373,6 +395,105 @@ TEST_F(FaultAuditLogTest, GuardProtectTriggerBlocksExternalGuardFlip) { EXPECT_TRUE(reopened.verify().ok); } +// Non-UTF-8 bytes in unvalidated content (description / source_id come straight +// from the ReportFault request) must NOT make canonicalize -> append throw and +// leave a silent gap. Canonicalization is total: any byte sequence hashes +// reproducibly and the chain still verifies. +TEST_F(FaultAuditLogTest, NonUtf8ContentAppendsAndVerifies) { + FaultAuditLog log(path_); + + AuditEvent e = make_event("F1", kTransitionOccurred, 100); + e.description = "pressure \xff\xfe low"; // invalid UTF-8 bytes + e.source_id = "src\x80\x81"; // invalid UTF-8 continuation bytes + + int64_t seq = 0; + ASSERT_NO_THROW(seq = log.append(e)); + EXPECT_EQ(seq, 1); + + // A plain append still chains on top of the non-UTF-8 record. + ASSERT_NO_THROW(log.append(make_event("F2", kTransitionConfirmed, 200))); + + // canonicalize is deterministic on the same arbitrary bytes. + EXPECT_EQ(FaultAuditLog::canonicalize(1, e), FaultAuditLog::canonicalize(1, e)); + + auto result = log.verify(); + EXPECT_TRUE(result.ok) << result.error; + EXPECT_EQ(result.checked, 2); +} + +// Many rotations must not grow audit_anchors without bound: verify() only ever +// needs the anchor at the current prune boundary, so older anchors are pruned in +// the same rotation. audit_log stays at the cap and audit_anchors stays bounded. +TEST_F(FaultAuditLogTest, ManyRotationsDoNotGrowAnchorsUnbounded) { + FaultAuditLog log(path_, /*retention_max_records=*/3); + for (int i = 1; i <= 100; ++i) { + log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); + } + EXPECT_EQ(log.record_count(), 3); + // Without pruning old anchors this would be ~97 rows; it must stay bounded. + EXPECT_LE(count_table_rows(path_, "audit_anchors"), 1); + + // Pruning old anchors must not break verify: the surviving tail still links to + // the current boundary anchor. + auto result = log.verify(); + EXPECT_TRUE(result.ok) << result.error; + EXPECT_EQ(result.checked, 3); +} + +// A rotation failure AFTER a durable append must not fail the append: COMMIT +// already made the record durable, so a broken rotation is counted (retried next +// time), never surfaced to the caller as a failed/dropped write. +TEST_F(FaultAuditLogTest, RotationFailureDoesNotFailDurableAppend) { + FaultAuditLog log(path_, /*retention_max_records=*/3); + for (int i = 1; i <= 5; ++i) { + log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); + } + ASSERT_EQ(log.record_count(), 3); + ASSERT_EQ(log.rotation_failures(), 0); + + // Sabotage rotation out-of-band: drop audit_anchors so the next rotation's + // anchor INSERT fails INSIDE rotate_if_needed_locked, after the append COMMIT. + raw_exec(path_, std::string(kDropAnchorTriggers) + "DROP TABLE audit_anchors;"); + + int64_t seq = 0; + EXPECT_NO_THROW(seq = log.append(make_event("F6", kTransitionOccurred, 600))); + EXPECT_EQ(seq, 6); + EXPECT_EQ(log.rotation_failures(), 1); + + // The append is durable: the record is present and is the newest. + auto tail = log.read(/*limit=*/0, /*after_seq=*/5); + ASSERT_EQ(tail.size(), 1u); + EXPECT_EQ(tail.front().seq, 6); + EXPECT_EQ(log.head().seq, 6); +} + +// Defense-in-depth (item 7): audit_anchors carries the same guard-gated triggers +// as audit_log, so an out-of-band INSERT/UPDATE/DELETE of an anchor is rejected +// while the guard is closed. The in-process rotation (guard open) still seals and +// prunes anchors normally. +TEST_F(FaultAuditLogTest, AnchorTriggersRejectOutOfBandWrites) { + { + FaultAuditLog log(path_, /*retention_max_records=*/3); + for (int i = 1; i <= 8; ++i) { + log.append(make_event("F" + std::to_string(i), kTransitionOccurred, 100 + i)); + } + EXPECT_EQ(log.record_count(), 3); + EXPECT_TRUE(log.verify().ok); + } + + // A forged anchor INSERT, an UPDATE of the real anchor, and a DELETE are all + // rejected out-of-band (guard closed), so a forged prefix-truncation cannot be + // planted without first dropping the triggers. + EXPECT_NE(raw_exec_rc(path_, "INSERT INTO audit_anchors (last_seq, sealed_at_ns, last_hash) VALUES (999, 0, 'x')"), + SQLITE_OK); + EXPECT_NE(raw_exec_rc(path_, "UPDATE audit_anchors SET last_hash = 'forged'"), SQLITE_OK); + EXPECT_NE(raw_exec_rc(path_, "DELETE FROM audit_anchors"), SQLITE_OK); + + // The real anchor survived and the tail still verifies. + FaultAuditLog reopened(path_, 3); + EXPECT_TRUE(reopened.verify().ok); +} + int main(int argc, char ** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); From 9935a11346ab2ddfdf2b7a6a169b779ee3b69dad Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Wed, 1 Jul 2026 21:42:30 +0200 Subject: [PATCH 7/9] fix(fault-audit): read full column bytes to preserve embedded NULs sqlite3_column_text is NUL-terminated, so std::string(text) truncated audit content at the first embedded NUL while bind_text stored the full bytes, making verify() falsely report a record_hash mismatch on untampered records. Read with sqlite3_column_bytes so the write/read round-trip stays byte-exact and the chain still verifies. Refs #483 --- .../src/fault_audit_log.cpp | 10 +++++- .../test/test_fault_audit_log.cpp | 33 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp index 11411851..65d855bd 100644 --- a/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_audit_log.cpp @@ -82,8 +82,16 @@ class Stmt { } std::string column_text(int index) { + // sqlite3_column_text yields a NUL-terminated pointer, so std::string(text) + // would truncate content at the first embedded NUL. bind_text stores the full + // bytes via value.size(), so reading with the real length keeps the write/read + // round-trip byte-exact and the recomputed hash matching. sqlite3_column_bytes + // must be read AFTER sqlite3_column_text so it reports the UTF-8 byte count. const auto * text = reinterpret_cast(sqlite3_column_text(stmt_, index)); - return text ? std::string(text) : std::string(); + if (text == nullptr) { + return std::string(); + } + return std::string(text, static_cast(sqlite3_column_bytes(stmt_, index))); } int64_t column_int64(int index) { diff --git a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp index 79ec5192..2107aab7 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_audit_log.cpp @@ -421,6 +421,39 @@ TEST_F(FaultAuditLogTest, NonUtf8ContentAppendsAndVerifies) { EXPECT_EQ(result.checked, 2); } +// Embedded NUL bytes in unvalidated content (description / source_id come from +// the ReportFault request) must survive the write/read round-trip byte-exact. +// bind_text stores the full length, so the read path must too - reading a +// NUL-terminated const char* would truncate at the first NUL, change the +// canonical form on re-read, and make verify() falsely report tampering on +// legitimate content. +TEST_F(FaultAuditLogTest, EmbeddedNulContentRoundTripsAndVerifies) { + FaultAuditLog log(path_); + + AuditEvent e = make_event("F1", kTransitionOccurred, 100); + e.description = std::string("pressure\0low", 12); // embedded NUL, bytes after it + e.source_id = std::string("src\0id", 6); // embedded NUL, bytes after it + e.status = std::string("CON\0FIRMED", 10); // embedded NUL, bytes after it + + int64_t seq = 0; + ASSERT_NO_THROW(seq = log.append(e)); + EXPECT_EQ(seq, 1); + + // A plain append still chains on top of the embedded-NUL record. + ASSERT_NO_THROW(log.append(make_event("F2", kTransitionConfirmed, 200))); + + auto result = log.verify(); + EXPECT_TRUE(result.ok) << result.error; + EXPECT_EQ(result.checked, 2); + + // The stored bytes read back whole, not truncated at the first NUL. + auto records = log.read(); + ASSERT_EQ(records.size(), 2u); + EXPECT_EQ(records[0].event.description, e.description); + EXPECT_EQ(records[0].event.source_id, e.source_id); + EXPECT_EQ(records[0].event.status, e.status); +} + // Many rotations must not grow audit_anchors without bound: verify() only ever // needs the anchor at the current prune boundary, so older anchors are pruned in // the same rotation. audit_log stays at the cap and audit_anchors stays bounded. From 3aeb2ff99e113f7a6b412f84aaa454cd2775a09a Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Wed, 1 Jul 2026 23:13:50 +0200 Subject: [PATCH 8/9] test(fault-audit): cover latch and single-event heal audit semantics Assert exactly one healed row under the HEALED latch, no duplicate confirmed rows on latched re-confirmation, and a healing_threshold == 0 single-PASSED heal audited exactly once. Refs #483 --- .../test/test_fault_manager.cpp | 87 ++++++++++++++++++- 1 file changed, 85 insertions(+), 2 deletions(-) diff --git a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp index 14133999..2de2ec66 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp @@ -1527,6 +1527,11 @@ TEST_F(SnapshotCooldownTest, CooldownPreventsRapidRecapture) { // clears it (=> cleared), and inspects the persisted audit DB by reopening it. class FaultAuditIntegrationTest : public ::testing::Test { protected: + /// Healing threshold for the node under test; override per-fixture. + virtual int healing_threshold() const { + return 1; // counter >= 1 heals (two PASSED after one FAILED) + } + void SetUp() override { std::random_device rd; std::mt19937_64 gen(rd()); @@ -1540,7 +1545,7 @@ class FaultAuditIntegrationTest : public ::testing::Test { {"storage_type", "memory"}, {"confirmation_threshold", -1}, {"healing_enabled", true}, - {"healing_threshold", 1}, // counter >= 1 heals (two PASSED after one FAILED) + {"healing_threshold", healing_threshold()}, {"audit_log.enabled", true}, {"audit_log.database_path", audit_path_}, }); @@ -1647,6 +1652,11 @@ TEST_F(FaultAuditIntegrationTest, AutoHealAppendsHealedRow) { send_report(ReportFault::Request::EVENT_PASSED); // counter 0 (hysteresis): stays CONFIRMED send_report(ReportFault::Request::EVENT_PASSED); // counter 1 >= healing_threshold => HEALED + // HEALED is latched and the counter is clamped at the healing threshold: further + // PASSED events keep status HEALED and must NOT append duplicate "healed" rows. + send_report(ReportFault::Request::EVENT_PASSED); + send_report(ReportFault::Request::EVENT_PASSED); + ros2_medkit_fault_manager::FaultAuditLog audit(audit_path_); std::vector transitions; std::string heal_source; @@ -1659,7 +1669,7 @@ TEST_F(FaultAuditIntegrationTest, AutoHealAppendsHealedRow) { } } - ASSERT_EQ(transitions.size(), 3u) << "expected occurred, confirmed, healed"; + ASSERT_EQ(transitions.size(), 3u) << "expected exactly occurred, confirmed, healed (no duplicates under latch)"; EXPECT_EQ(transitions[0], ros2_medkit_fault_manager::kTransitionOccurred); EXPECT_EQ(transitions[1], ros2_medkit_fault_manager::kTransitionConfirmed); EXPECT_EQ(transitions[2], ros2_medkit_fault_manager::kTransitionHealed); @@ -1669,6 +1679,79 @@ TEST_F(FaultAuditIntegrationTest, AutoHealAppendsHealedRow) { EXPECT_TRUE(result.ok) << result.error; } +// The CONFIRMED latch must not create duplicate "confirmed" audit rows: a PASSED +// event on a confirmed fault keeps it CONFIRMED (latched), so the following FAILED +// event is an update, not a re-confirmation. Exactly one confirmed row per episode. +TEST_F(FaultAuditIntegrationTest, ConfirmedLatchNoDuplicateConfirmedRows) { + auto send_report = [&](uint8_t event_type) { + auto req = std::make_shared(); + req->fault_code = "LATCH_FAULT"; + req->event_type = event_type; + req->severity = Fault::SEVERITY_ERROR; + req->description = "flapping sensor"; + req->source_id = "/robot/sensor"; + auto fut = report_client_->async_send_request(req); + ASSERT_TRUE(spin_until_ready(fut)); + ASSERT_TRUE(fut.get()->accepted); + }; + + send_report(ReportFault::Request::EVENT_FAILED); // counter -1 <= threshold -1 => CONFIRMED + send_report(ReportFault::Request::EVENT_PASSED); // counter 0: latch keeps CONFIRMED + send_report(ReportFault::Request::EVENT_FAILED); // counter -1 again: still CONFIRMED, no new row + + ros2_medkit_fault_manager::FaultAuditLog audit(audit_path_); + size_t confirmed_rows = 0; + for (const auto & rec : audit.read()) { + if (rec.event.fault_code == "LATCH_FAULT" && + rec.event.transition == ros2_medkit_fault_manager::kTransitionConfirmed) { + ++confirmed_rows; + } + } + EXPECT_EQ(confirmed_rows, 1u) << "latched re-confirmation must not append a duplicate confirmed row"; + EXPECT_TRUE(audit.verify().ok); +} + +// healing_threshold == 0 means a single PASSED event heals. That single-event heal +// must be audited as exactly one "healed" row, and further PASSED events (counter +// clamped at 0, status latched HEALED) must not append more. +class FaultAuditSingleEventHealTest : public FaultAuditIntegrationTest { + protected: + int healing_threshold() const override { + return 0; + } +}; + +TEST_F(FaultAuditSingleEventHealTest, SinglePassedHealAuditedOnce) { + auto send_report = [&](uint8_t event_type) { + auto req = std::make_shared(); + req->fault_code = "FAST_HEAL"; + req->event_type = event_type; + req->severity = Fault::SEVERITY_ERROR; + req->description = "transient glitch"; + req->source_id = "/robot/imu"; + auto fut = report_client_->async_send_request(req); + ASSERT_TRUE(spin_until_ready(fut)); + ASSERT_TRUE(fut.get()->accepted); + }; + + send_report(ReportFault::Request::EVENT_FAILED); // counter -1 => CONFIRMED + send_report(ReportFault::Request::EVENT_PASSED); // counter 0 >= threshold 0 => HEALED + send_report(ReportFault::Request::EVENT_PASSED); // clamped at 0, latched HEALED: no new row + + ros2_medkit_fault_manager::FaultAuditLog audit(audit_path_); + std::vector transitions; + for (const auto & rec : audit.read()) { + if (rec.event.fault_code == "FAST_HEAL") { + transitions.push_back(rec.event.transition); + } + } + ASSERT_EQ(transitions.size(), 3u) << "expected exactly occurred, confirmed, healed"; + EXPECT_EQ(transitions[0], ros2_medkit_fault_manager::kTransitionOccurred); + EXPECT_EQ(transitions[1], ros2_medkit_fault_manager::kTransitionConfirmed); + EXPECT_EQ(transitions[2], ros2_medkit_fault_manager::kTransitionHealed); + EXPECT_TRUE(audit.verify().ok); +} + TEST(FaultAuditDisabledTest, NoAuditFileWhenDisabled) { std::random_device rd; std::mt19937_64 gen(rd()); From 92ef4b380d8c9a17dd306fa7be12ccdc5cb5744c Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Wed, 1 Jul 2026 23:31:41 +0200 Subject: [PATCH 9/9] fix(fault-audit): audit startup HEALED->CLEARED reclassification reclassify_healed_as_cleared() now returns the flipped fault codes on both backends so the node can audit each transition (source startup_reclassify) instead of silently mutating storage. Refs #483 --- .../fault_storage.hpp | 8 +- .../sqlite_fault_storage.hpp | 2 +- .../src/fault_manager_node.cpp | 20 +++- .../src/fault_storage.cpp | 8 +- .../src/sqlite_fault_storage.cpp | 20 +++- .../test/test_fault_manager.cpp | 99 ++++++++++++++++++- .../test/test_sqlite_storage.cpp | 5 +- 7 files changed, 145 insertions(+), 17 deletions(-) diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp index 2f376eb9..68b2f994 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/fault_storage.hpp @@ -223,9 +223,9 @@ class FaultStorage { /// One-time startup cleanup: reclassify HEALED faults as CLEARED. Called when healing is disabled, /// so a HEALED row left by a previous (healing-enabled) run does not behave inconsistently under /// the latch. Default is a no-op (in-memory storage starts empty). - /// @return number of faults reclassified - virtual size_t reclassify_healed_as_cleared() { - return 0; + /// @return fault codes of the reclassified faults, so the caller can audit each transition + virtual std::vector reclassify_healed_as_cleared() { + return {}; } protected: @@ -274,7 +274,7 @@ class InMemoryFaultStorage : public FaultStorage { std::vector get_all_rosbag_files() const override; std::vector list_rosbags_for_entity(const std::string & entity_fqn) const override; std::vector get_all_faults() const override; - size_t reclassify_healed_as_cleared() override; + std::vector reclassify_healed_as_cleared() override; private: /// Update fault status based on debounce counter and given config diff --git a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/sqlite_fault_storage.hpp b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/sqlite_fault_storage.hpp index fb4006c9..2db560fa 100644 --- a/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/sqlite_fault_storage.hpp +++ b/src/ros2_medkit_fault_manager/include/ros2_medkit_fault_manager/sqlite_fault_storage.hpp @@ -74,7 +74,7 @@ class SqliteFaultStorage : public FaultStorage { std::vector get_all_rosbag_files() const override; std::vector list_rosbags_for_entity(const std::string & entity_fqn) const override; std::vector get_all_faults() const override; - size_t reclassify_healed_as_cleared() override; + std::vector reclassify_healed_as_cleared() override; /// Get the database path const std::string & db_path() const { diff --git a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp index d8a72717..7053258f 100644 --- a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp @@ -177,11 +177,23 @@ FaultManagerNode::FaultManagerNode(const rclcpp::NodeOptions & options) : Node(" storage_->set_debounce_config(global_config_); // One-time cleanup: when healing is disabled, a HEALED row left by a previous (healing-enabled) run - // would behave inconsistently under the latch, so reclassify it as CLEARED at startup. + // would behave inconsistently under the latch, so reclassify it as CLEARED at startup. Each flipped + // fault is audited (the audit log is already constructed above); without this the reclassification + // would be invisible to the audit log's verify(). if (!global_config_.healing_enabled) { - size_t reclassified = storage_->reclassify_healed_as_cleared(); - if (reclassified > 0) { - RCLCPP_INFO(get_logger(), "Healing disabled: reclassified %zu stale HEALED fault(s) as CLEARED", reclassified); + const auto reclassified = storage_->reclassify_healed_as_cleared(); + if (!reclassified.empty()) { + if (audit_log_) { + const int64_t reclassified_at_ns = get_wall_clock_time().nanoseconds(); + for (const auto & fault_code : reclassified) { + auto fault = storage_->get_fault(fault_code); + if (fault) { + audit_transition(kTransitionCleared, *fault, "startup_reclassify", reclassified_at_ns); + } + } + } + RCLCPP_INFO(get_logger(), "Healing disabled: reclassified %zu stale HEALED fault(s) as CLEARED", + reclassified.size()); } } diff --git a/src/ros2_medkit_fault_manager/src/fault_storage.cpp b/src/ros2_medkit_fault_manager/src/fault_storage.cpp index 14b5e41e..e8956484 100644 --- a/src/ros2_medkit_fault_manager/src/fault_storage.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_storage.cpp @@ -467,16 +467,16 @@ std::vector InMemoryFaultStorage::get_all_faults() return result; } -size_t InMemoryFaultStorage::reclassify_healed_as_cleared() { +std::vector InMemoryFaultStorage::reclassify_healed_as_cleared() { std::lock_guard lock(mutex_); - size_t changed = 0; + std::vector reclassified; for (auto & [code, state] : faults_) { if (state.status == ros2_medkit_msgs::msg::Fault::STATUS_HEALED) { state.status = ros2_medkit_msgs::msg::Fault::STATUS_CLEARED; - ++changed; + reclassified.push_back(code); } } - return changed; + return reclassified; } } // namespace ros2_medkit_fault_manager diff --git a/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp b/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp index bd9d6d82..76482987 100644 --- a/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp +++ b/src/ros2_medkit_fault_manager/src/sqlite_fault_storage.cpp @@ -629,9 +629,25 @@ bool SqliteFaultStorage::clear_fault(const std::string & fault_code) { return sqlite3_changes(db_) > 0; } -size_t SqliteFaultStorage::reclassify_healed_as_cleared() { +std::vector SqliteFaultStorage::reclassify_healed_as_cleared() { std::lock_guard lock(mutex_); + // Collect the codes that will flip first so the caller can audit each one. The + // SELECT predicate mirrors the UPDATE exactly, and both run under the same lock, + // so the returned list matches the rows actually reclassified below. + std::vector reclassified; + { + SqliteStatement select_stmt(db_, "SELECT fault_code FROM faults WHERE status = ?"); + select_stmt.bind_text(1, ros2_medkit_msgs::msg::Fault::STATUS_HEALED); + while (select_stmt.step() == SQLITE_ROW) { + reclassified.push_back(select_stmt.column_text(0)); + } + } + + if (reclassified.empty()) { + return reclassified; + } + // Drop snapshots for the affected faults so a reclassified row matches CLEARED semantics. SqliteStatement del(db_, "DELETE FROM snapshots WHERE fault_code IN (SELECT fault_code FROM faults WHERE status = ?)"); @@ -646,7 +662,7 @@ size_t SqliteFaultStorage::reclassify_healed_as_cleared() { if (stmt.step() != SQLITE_DONE) { throw std::runtime_error(std::string("Failed to reclassify HEALED faults: ") + sqlite3_errmsg(db_)); } - return static_cast(sqlite3_changes(db_)); + return reclassified; } size_t SqliteFaultStorage::size() const { diff --git a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp index 2de2ec66..3eb477b2 100644 --- a/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp +++ b/src/ros2_medkit_fault_manager/test/test_fault_manager.cpp @@ -29,6 +29,7 @@ #include "ros2_medkit_fault_manager/fault_audit_log.hpp" #include "ros2_medkit_fault_manager/fault_manager_node.hpp" #include "ros2_medkit_fault_manager/fault_storage.hpp" +#include "ros2_medkit_fault_manager/sqlite_fault_storage.hpp" #include "ros2_medkit_msgs/msg/fault.hpp" #include "ros2_medkit_msgs/msg/fault_event.hpp" #include "ros2_medkit_msgs/srv/clear_fault.hpp" @@ -717,8 +718,11 @@ TEST_F(FaultStorageTest, ReclassifyHealedAsCleared) { } ASSERT_EQ(storage_.get_fault("F")->status, Fault::STATUS_HEALED); - EXPECT_EQ(storage_.reclassify_healed_as_cleared(), 1u); + const auto reclassified = storage_.reclassify_healed_as_cleared(); + ASSERT_EQ(reclassified.size(), 1u); + EXPECT_EQ(reclassified[0], "F"); EXPECT_EQ(storage_.get_fault("F")->status, Fault::STATUS_CLEARED); + EXPECT_TRUE(storage_.reclassify_healed_as_cleared().empty()); } // Unit tests for the shared debounce helpers. @@ -1821,6 +1825,99 @@ TEST(FaultAuditTimerTest, TimerConfirmationAppendsConfirmedAuditRow) { namespace { +/// Leave a HEALED fault in a sqlite fault DB, as a previous healing-enabled run would. +void seed_healed_fault(const std::string & db_path, const std::string & fault_code) { + ros2_medkit_fault_manager::SqliteFaultStorage seed(db_path); + DebounceConfig config; + config.healing_enabled = true; + config.healing_threshold = 3; + rclcpp::Clock clock; + seed.report_fault_event(fault_code, ReportFault::Request::EVENT_FAILED, Fault::SEVERITY_ERROR, "stale", "/robot/src", + clock.now(), config); + for (int i = 0; i < 4; ++i) { + seed.report_fault_event(fault_code, ReportFault::Request::EVENT_PASSED, 0, "", "/robot/src", clock.now(), config); + } + ASSERT_EQ(seed.get_fault(fault_code)->status, Fault::STATUS_HEALED); +} + +std::filesystem::path make_temp_dir(const char * prefix) { + std::random_device rd; + std::mt19937_64 gen(rd()); + std::uniform_int_distribution dist; + auto dir = std::filesystem::temp_directory_path() / (std::string(prefix) + std::to_string(dist(gen))); + std::filesystem::create_directories(dir); + return dir; +} + +} // namespace + +// Startup reclassification (HEALED->CLEARED when healing is disabled) flips faults +// directly in storage, so each one must be audited or the transition is invisible +// to the audit log's verify(). Seeds a HEALED row, restarts with healing disabled +// and the audit on, and expects exactly one "cleared" row with the startup source. +TEST(FaultAuditStartupReclassifyTest, StartupReclassifyAppendsClearedRow) { + const auto dir = make_temp_dir("test_startup_reclassify_"); + const std::string db_path = (dir / "faults.db").string(); + const std::string audit_path = (dir / "audit.db").string(); + seed_healed_fault(db_path, "STALE_HEALED"); + + { + rclcpp::NodeOptions options; + options.parameter_overrides({ + {"storage_type", "sqlite"}, + {"database_path", db_path}, + {"healing_enabled", false}, + {"audit_log.enabled", true}, + {"audit_log.database_path", audit_path}, + }); + auto node = std::make_shared(options); + + EXPECT_EQ(node->get_storage().get_fault("STALE_HEALED")->status, Fault::STATUS_CLEARED); + + const auto * audit = node->get_audit_log_for_test(); + ASSERT_NE(audit, nullptr); + size_t cleared_rows = 0; + for (const auto & rec : audit->read()) { + if (rec.event.fault_code == "STALE_HEALED") { + EXPECT_EQ(rec.event.transition, ros2_medkit_fault_manager::kTransitionCleared); + EXPECT_EQ(rec.event.source_id, "startup_reclassify"); + EXPECT_EQ(rec.event.status, Fault::STATUS_CLEARED); + ++cleared_rows; + } + } + EXPECT_EQ(cleared_rows, 1u) << "expected exactly one audited startup reclassification"; + + auto result = audit->verify(); + EXPECT_TRUE(result.ok) << result.error; + } + std::filesystem::remove_all(dir); +} + +// With the audit log disabled, startup reclassification must behave exactly as +// before: the fault is flipped to CLEARED and no audit database is created. +TEST(FaultAuditStartupReclassifyTest, DisabledAuditStillReclassifies) { + const auto dir = make_temp_dir("test_startup_reclassify_off_"); + const std::string db_path = (dir / "faults.db").string(); + seed_healed_fault(db_path, "STALE_HEALED"); + + { + rclcpp::NodeOptions options; + options.parameter_overrides({ + {"storage_type", "sqlite"}, + {"database_path", db_path}, + {"healing_enabled", false}, // default audit_log.enabled=false + }); + auto node = std::make_shared(options); + + EXPECT_EQ(node->get_storage().get_fault("STALE_HEALED")->status, Fault::STATUS_CLEARED); + EXPECT_EQ(node->get_audit_log_for_test(), nullptr); + EXPECT_FALSE(std::filesystem::exists(dir / "fault_audit.db")); + } + std::filesystem::remove_all(dir); +} + +namespace { + /// Force the node's next audit append to fail by inserting a row at the seq the /// node will try next (MAX(seq)+1), so its INSERT collides on the seq PRIMARY KEY. /// Done from a separate connection; the append-only triggers do not block INSERT. diff --git a/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp b/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp index bd588013..736a9dd5 100644 --- a/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp +++ b/src/ros2_medkit_fault_manager/test/test_sqlite_storage.cpp @@ -687,8 +687,11 @@ TEST_F(SqliteFaultStorageTest, ReclassifyHealedAsCleared) { } ASSERT_EQ(storage_->get_fault("F")->status, Fault::STATUS_HEALED); - EXPECT_EQ(storage_->reclassify_healed_as_cleared(), 1u); + const auto reclassified = storage_->reclassify_healed_as_cleared(); + ASSERT_EQ(reclassified.size(), 1u); + EXPECT_EQ(reclassified[0], "F"); EXPECT_EQ(storage_->get_fault("F")->status, Fault::STATUS_CLEARED); + EXPECT_TRUE(storage_->reclassify_healed_as_cleared().empty()); } TEST_F(SqliteFaultStorageTest, HealingWhenEnabled) {