From 28e006ecb8a09fd4d536819eedeb8e6797a51556 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Thu, 14 May 2026 13:00:51 +0200 Subject: [PATCH 1/4] fix(gateway,#395): scope per-entity fault GET/DELETE by hosted-app FQNs Per-entity fault routes used the addressed entity's namespace_path as a transport-level prefix filter. When namespace_path was empty (host-derived or synthetic components, manifest components without an explicit `namespace` field, Areas, Functions), the filter short-circuited: - GET /{entity}/faults/{code} returned faults reported by apps that lived in entirely different entities. - DELETE /{entity}/faults/{code} cleared them, since ClearFault.srv has no scope argument and the handler did not check ownership first. Both handlers now resolve the entity to the set of FQNs of every hosted app via the new HandlerContext::resolve_entity_source_fqns helper and require the fault's reporting_sources to intersect that set. Empty set means "nothing in scope" -> 404. DELETE additionally fetches the fault and verifies scope before issuing the clear. The GetFault.srv / ClearFault.srv contracts are unchanged. --- src/ros2_medkit_gateway/CHANGELOG.rst | 1 + .../http/handlers/handler_context.hpp | 25 ++++ .../src/http/handlers/fault_handlers.cpp | 78 ++++++++++- .../src/http/handlers/handler_context.cpp | 53 +++++++ .../test/test_handler_context.cpp | 108 +++++++++++++++ .../test_faults_scope_isolation.test.py | 130 ++++++++++++++++++ 6 files changed, 391 insertions(+), 4 deletions(-) create mode 100644 src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py diff --git a/src/ros2_medkit_gateway/CHANGELOG.rst b/src/ros2_medkit_gateway/CHANGELOG.rst index 8744997e..e48b4f8e 100644 --- a/src/ros2_medkit_gateway/CHANGELOG.rst +++ b/src/ros2_medkit_gateway/CHANGELOG.rst @@ -7,6 +7,7 @@ Unreleased **Fixes:** +* ``GET /api/v1/{entity-path}/faults/{fault_code}`` and ``DELETE /api/v1/{entity-path}/faults/{fault_code}`` no longer leak faults across entities. The previous implementation relied on a prefix match against the entity's ``namespace_path`` and silently disabled the scope filter when the entity had an empty ``namespace_path`` (host-derived / synthetic components, manifest components without a ``namespace`` field, Areas, Functions), exposing - and on ``DELETE``, clearing - faults reported by apps that belonged to entirely different entities. Both handlers now resolve the addressed entity to its hosted-app FQN set and return ``404 Resource Not Found`` when the fault's ``reporting_sources`` does not intersect that set. The underlying ``GetFault.srv`` / ``ClearFault.srv`` contract is unchanged. Any client that previously received a ``200`` / ``204`` for a fault outside an entity's scope will now get a ``404`` (`#395 `_) * ``GET /api/v1/updates/{id}/status`` no longer returns ``404`` for a registered-but-idle package; ``POST /api/v1/updates`` now seeds a ``pending`` status, so the endpoint returns ``200 {"status": "pending"}`` immediately after registration. ``404`` is reserved for packages that are not registered. Clients that used ``404`` as a signal for "registered but nothing started yet" must adapt (`#378 `_) **Features:** diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp index ab019d86..96be3fc8 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -339,6 +340,30 @@ class HandlerContext { static std::vector resolve_app_host_fqns(const ThreadSafeEntityCache & cache, const std::vector & app_ids); + /** + * @brief Resolve an entity to the set of source FQNs it owns. + * + * Returns the FQNs of every ROS 2 node within the entity's scope. Used by + * fault handlers to filter `reporting_sources` so that per-entity routes + * never expose faults reported from outside the addressed entity. + * + * Mapping per entity type: + * - App: the app's `effective_fqn()` (single entry, or empty set if unbound) + * - Component: `effective_fqn()` of every hosted app via `get_apps_for_component()` + * - Area: `effective_fqn()` of every app in every component under the area + * - Function: `node_fqn` of every entry in the function's aggregation configuration + * - Unknown: empty set + * + * An empty result means "no apps are in scope" and callers must treat any + * fault as out-of-scope (i.e., return 404), never as "no filter". + * + * @param cache Entity cache for lookups + * @param entity Resolved entity info (from `validate_entity_for_route`) + * @return Set of FQNs that uniquely scopes faults to this entity + */ + static std::set resolve_entity_source_fqns(const ThreadSafeEntityCache & cache, + const EntityInfo & entity); + private: GatewayNode * node_; CorsConfig cors_config_; diff --git a/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp b/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp index 4c60328a..6fa29c80 100644 --- a/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp +++ b/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp @@ -40,6 +40,32 @@ namespace handlers { namespace { +/// Check if a single fault's reporting_sources intersects the given prefix set. +/// Returns true when any reporting_source starts with any prefix. An empty +/// prefix set always returns false ("nothing is in scope"), so callers must +/// supply the entity's own FQN set explicitly - a fault is in scope only when +/// the entity owns at least one app that reported it. +bool fault_in_source_scope(const json & fault, const std::set & source_prefixes) { + if (source_prefixes.empty()) { + return false; + } + if (!fault.contains("reporting_sources") || !fault["reporting_sources"].is_array()) { + return false; + } + for (const auto & src : fault["reporting_sources"]) { + if (!src.is_string()) { + continue; + } + const auto src_str = src.get(); + for (const auto & prefix : source_prefixes) { + if (src_str.rfind(prefix, 0) == 0) { + return true; + } + } + } + return false; +} + /// Helper to filter faults JSON array by a set of namespace prefixes /// Keeps faults where any reporting_source starts with any of the given prefixes json filter_faults_by_sources(const json & faults_array, const std::set & source_prefixes) { @@ -672,17 +698,30 @@ void FaultHandlers::handle_get_fault(const httplib::Request & req, httplib::Resp return; } - std::string namespace_path = entity_info.namespace_path; - auto fault_mgr = ctx_.node()->get_fault_manager(); - // Use get_fault_with_env to get fault with environment data - auto result = fault_mgr->get_fault_with_env(fault_code, namespace_path); + // Fetch the fault without manager-level scope filter. The transport's + // `source_id` prefix match silently disables itself when the entity has + // an empty namespace_path (synthetic / host-derived components, manifest + // components without a `namespace` field, Areas, Functions), so we + // re-scope here against the actual FQNs the entity owns. See #395. + auto result = fault_mgr->get_fault_with_env(fault_code, ""); if (result.success) { // Build SOVD-compliant response from the transport-supplied JSON shape. // The transport handed back `result.data = { "fault": {...}, "environment_data": {...} }`. const auto & fault_json = result.data.value("fault", json::object()); + + const auto & cache = ctx_.node()->get_thread_safe_cache(); + auto source_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); + if (!fault_in_source_scope(fault_json, source_fqns)) { + HandlerContext::send_error(res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", + {{"details", "Fault is not reported by any app within this entity's scope"}, + {entity_info.id_field, entity_id}, + {"fault_code", fault_code}}); + return; + } + const auto & env_data_json = result.data.value("environment_data", json::object()); auto response = build_sovd_fault_response(fault_json, env_data_json, entity_path_info->entity_path); @@ -769,6 +808,37 @@ void FaultHandlers::handle_clear_fault(const httplib::Request & req, httplib::Re } auto fault_mgr = ctx_.node()->get_fault_manager(); + + // Verify the fault is in this entity's scope BEFORE clearing. The + // underlying ClearFault.srv has no scope argument, so without this + // check a DELETE on one entity's route would clear faults owned by + // any other entity that happens to share the fault_code. See #395. + auto get_result = fault_mgr->get_fault_with_env(fault_code, ""); + if (!get_result.success) { + if (get_result.error_message.find("not found") != std::string::npos || + get_result.error_message.find("Fault not found") != std::string::npos) { + HandlerContext::send_error( + res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", + {{"details", get_result.error_message}, {entity_info.id_field, entity_id}, {"fault_code", fault_code}}); + } else { + HandlerContext::send_error( + res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to clear fault", + {{"details", get_result.error_message}, {entity_info.id_field, entity_id}, {"fault_code", fault_code}}); + } + return; + } + + const auto & cache = ctx_.node()->get_thread_safe_cache(); + auto source_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); + const auto & fault_json = get_result.data.value("fault", json::object()); + if (!fault_in_source_scope(fault_json, source_fqns)) { + HandlerContext::send_error(res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", + {{"details", "Fault is not reported by any app within this entity's scope"}, + {entity_info.id_field, entity_id}, + {"fault_code", fault_code}}); + return; + } + auto result = fault_mgr->clear_fault(fault_code); if (result.success) { diff --git a/src/ros2_medkit_gateway/src/http/handlers/handler_context.cpp b/src/ros2_medkit_gateway/src/http/handlers/handler_context.cpp index 40508a47..301c5a37 100644 --- a/src/ros2_medkit_gateway/src/http/handlers/handler_context.cpp +++ b/src/ros2_medkit_gateway/src/http/handlers/handler_context.cpp @@ -347,6 +347,59 @@ void HandlerContext::send_json(httplib::Response & res, const json & data) { res.set_content(data.dump(2), "application/json"); } +namespace { + +void collect_app_fqn(const ThreadSafeEntityCache & cache, const std::string & app_id, std::set & out) { + auto app = cache.get_app(app_id); + if (!app) { + return; + } + auto fqn = app->effective_fqn(); + if (fqn.empty()) { + return; + } + out.insert(std::move(fqn)); +} + +} // namespace + +std::set HandlerContext::resolve_entity_source_fqns(const ThreadSafeEntityCache & cache, + const EntityInfo & entity) { + std::set fqns; + switch (entity.type) { + case EntityType::APP: { + collect_app_fqn(cache, entity.id, fqns); + break; + } + case EntityType::COMPONENT: { + for (const auto & app_id : cache.get_apps_for_component(entity.id)) { + collect_app_fqn(cache, app_id, fqns); + } + break; + } + case EntityType::AREA: { + for (const auto & comp_id : cache.get_components_for_area(entity.id)) { + for (const auto & app_id : cache.get_apps_for_component(comp_id)) { + collect_app_fqn(cache, app_id, fqns); + } + } + break; + } + case EntityType::FUNCTION: { + auto agg_configs = cache.get_entity_configurations(entity.id); + for (const auto & node : agg_configs.nodes) { + if (!node.node_fqn.empty()) { + fqns.insert(node.node_fqn); + } + } + break; + } + case EntityType::UNKNOWN: + break; + } + return fqns; +} + std::vector HandlerContext::resolve_app_host_fqns(const ThreadSafeEntityCache & cache, const std::vector & app_ids) { // Order-preserving dedup: two app_ids that resolve to the same effective_fqn diff --git a/src/ros2_medkit_gateway/test/test_handler_context.cpp b/src/ros2_medkit_gateway/test/test_handler_context.cpp index bc864269..c2221c42 100644 --- a/src/ros2_medkit_gateway/test/test_handler_context.cpp +++ b/src/ros2_medkit_gateway/test/test_handler_context.cpp @@ -1231,6 +1231,114 @@ TEST(ResolveAppHostFqnsTest, DuplicateEffectiveFqnsAreDeduplicated) { EXPECT_EQ(fqns[1], "/powertrain/engine/rpm_sensor"); } +// ============================================================================= +// resolve_entity_source_fqns tests +// +// The helper underpins #395's per-entity fault scope check. It must reject +// faults that come from outside the entity (empty result -> caller returns +// 404) AND must collect the FQNs of every hosted ROS node so that +// reporting-source matching covers all the apps the entity actually owns. +// ============================================================================= + +namespace { + +EntityInfo make_entity_info(EntityType t, const std::string & id) { + EntityInfo ei; + ei.type = t; + ei.id = id; + return ei; +} + +App make_owned_app(const std::string & app_id, const std::string & component_id, const std::string & node_name, + const std::string & ns) { + App a = make_app_with_binding(app_id, node_name, ns); + a.component_id = component_id; + return a; +} + +} // namespace + +TEST(ResolveEntitySourceFqnsTest, AppReturnsItsOwnFqn) { + ThreadSafeEntityCache cache; + cache.update_apps({make_app_with_binding("temp_sensor", "temp_sensor", "/powertrain/engine")}); + + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::APP, "temp_sensor")); + EXPECT_EQ(fqns, std::set{"/powertrain/engine/temp_sensor"}); +} + +TEST(ResolveEntitySourceFqnsTest, AppMissingFromCacheYieldsEmptySet) { + ThreadSafeEntityCache cache; + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::APP, "gone")); + EXPECT_TRUE(fqns.empty()); +} + +TEST(ResolveEntitySourceFqnsTest, ComponentAggregatesHostedAppFqns) { + ThreadSafeEntityCache cache; + cache.update_apps({ + make_owned_app("temp-sensor", "temp-hw", "temp_sensor", "/powertrain/engine"), + make_owned_app("rpm-sensor", "rpm-hw", "rpm_sensor", "/powertrain/engine"), + make_owned_app("lidar-sensor", "lidar-unit", "lidar_sensor", "/perception/lidar"), + }); + + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::COMPONENT, "temp-hw")); + // The lidar app must not appear: the bug under #395 was that an empty + // namespace_path on the addressed component silently disabled the scope + // filter and exposed faults from unrelated apps. + EXPECT_EQ(fqns, std::set{"/powertrain/engine/temp_sensor"}); +} + +TEST(ResolveEntitySourceFqnsTest, ComponentWithoutHostedAppsReturnsEmptySet) { + ThreadSafeEntityCache cache; + cache.update_apps({make_owned_app("temp-sensor", "temp-hw", "temp_sensor", "/powertrain/engine")}); + + // Querying a different component yields an empty set, NOT "no filter". + // Callers (fault handlers) treat empty as out-of-scope -> 404. + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::COMPONENT, "lidar-unit")); + EXPECT_TRUE(fqns.empty()); +} + +TEST(ResolveEntitySourceFqnsTest, AreaCollectsAppsFromAllComponentsInArea) { + ThreadSafeEntityCache cache; + Component comp_a; + comp_a.id = "temp-hw"; + comp_a.area = "engine"; + Component comp_b; + comp_b.id = "rpm-hw"; + comp_b.area = "engine"; + Component comp_off_area; + comp_off_area.id = "lidar-unit"; + comp_off_area.area = "perception"; + cache.update_components({comp_a, comp_b, comp_off_area}); + cache.update_apps({ + make_owned_app("temp-sensor", "temp-hw", "temp_sensor", "/powertrain/engine"), + make_owned_app("rpm-sensor", "rpm-hw", "rpm_sensor", "/powertrain/engine"), + make_owned_app("lidar-sensor", "lidar-unit", "lidar_sensor", "/perception/lidar"), + }); + + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::AREA, "engine")); + std::set expected{"/powertrain/engine/temp_sensor", "/powertrain/engine/rpm_sensor"}; + EXPECT_EQ(fqns, expected); +} + +TEST(ResolveEntitySourceFqnsTest, UnknownEntityTypeReturnsEmptySet) { + ThreadSafeEntityCache cache; + cache.update_apps({make_app_with_binding("temp_sensor", "temp_sensor", "/powertrain/engine")}); + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::UNKNOWN, "anything")); + EXPECT_TRUE(fqns.empty()); +} + +TEST(ResolveEntitySourceFqnsTest, AppsWithEmptyEffectiveFqnAreSkipped) { + ThreadSafeEntityCache cache; + App no_binding; + no_binding.id = "no_binding"; + no_binding.component_id = "comp-x"; + App ok = make_owned_app("ok", "comp-x", "ok_node", "/ns"); + cache.update_apps({no_binding, ok}); + + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::COMPONENT, "comp-x")); + EXPECT_EQ(fqns, std::set{"/ns/ok_node"}); +} + int main(int argc, char ** argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py b/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py new file mode 100644 index 00000000..bf4210f5 --- /dev/null +++ b/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py @@ -0,0 +1,130 @@ +#!/usr/bin/env python3 +# Copyright 2026 bburda +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Regression tests for cross-entity fault scope leak (#395). + +Per-entity fault routes (`GET` / `DELETE +/components/{id}/faults/{fault_code}`) must filter by the apps the entity +owns. Without this, components whose `namespace_path` is empty (synthetic, +host-derived, or manifest components without an explicit ``namespace`` +field) would short-circuit the transport-level prefix filter and expose +faults reported by apps that belong to entirely different components. + +These tests use the demo manifest in hybrid mode. Manifest components +in that file declare ``area`` but not ``namespace``, so every component +sees the bug pre-fix. +""" + +import os +import unittest + +from ament_index_python.packages import get_package_share_directory +import launch_testing +import launch_testing.actions +import requests + +from ros2_medkit_test_utils.constants import ALLOWED_EXIT_CODES +from ros2_medkit_test_utils.gateway_test_case import GatewayTestCase +from ros2_medkit_test_utils.launch_helpers import create_test_launch + + +LIDAR_FAULT_CODE = 'LIDAR_RANGE_INVALID' +LIDAR_OWNER_COMPONENT = 'lidar-unit' +OTHER_COMPONENT = 'temp-sensor-hw' + + +def generate_test_description(): + pkg_share = get_package_share_directory('ros2_medkit_gateway') + manifest_path = os.path.join( + pkg_share, 'config', 'examples', 'demo_nodes_manifest.yaml' + ) + return create_test_launch( + demo_nodes=['lidar_sensor', 'temp_sensor'], + fault_manager=True, + lidar_faulty=True, + fault_manager_params={'confirmation_threshold': -2}, + gateway_params={ + 'discovery.mode': 'hybrid', + 'discovery.manifest_path': manifest_path, + 'discovery.manifest_strict_validation': False, + 'discovery.merge_pipeline.gap_fill.allow_heuristic_apps': True, + }, + ) + + +class TestFaultsScopeIsolation(GatewayTestCase): + """Per-entity fault routes must not leak faults across components.""" + + MIN_EXPECTED_APPS = 2 + REQUIRED_APPS = {'lidar-sensor'} + + def setUp(self): + super().setUp() + # The lidar app reports LIDAR_RANGE_INVALID deterministically when + # launched with faulty params. Wait until the gateway sees it on the + # owning app before exercising the per-component routes. + self.wait_for_fault('/apps/lidar-sensor', LIDAR_FAULT_CODE) + + def test_get_fault_returns_404_on_unrelated_component(self): + """GET on a different component must not return another entity's fault. + + @verifies REQ_INTEROP_013 + """ + response = requests.get( + f'{self.BASE_URL}/components/{OTHER_COMPONENT}/faults/{LIDAR_FAULT_CODE}', + timeout=10, + ) + self.assertEqual( + response.status_code, 404, + f'Expected 404, got {response.status_code} body={response.text}', + ) + + def test_get_fault_returns_200_on_owning_component(self): + """GET on the component that hosts the reporting app must return the fault.""" + data = self.get_json( + f'/components/{LIDAR_OWNER_COMPONENT}/faults/{LIDAR_FAULT_CODE}' + ) + self.assertEqual(data.get('item', {}).get('code'), LIDAR_FAULT_CODE) + + def test_clear_fault_returns_404_on_unrelated_component(self): + """DELETE on a different component must not clear another entity's fault.""" + response = requests.delete( + f'{self.BASE_URL}/components/{OTHER_COMPONENT}/faults/{LIDAR_FAULT_CODE}', + timeout=10, + ) + self.assertEqual( + response.status_code, 404, + f'Expected 404, got {response.status_code} body={response.text}', + ) + + # Verify the fault is still present on the owning app afterwards. + listing = self.get_json('/apps/lidar-sensor/faults') + codes = {item.get('fault_code') for item in listing.get('items', [])} + self.assertIn( + LIDAR_FAULT_CODE, codes, + 'Unrelated DELETE must not clear faults outside its scope', + ) + + +@launch_testing.post_shutdown_test() +class TestShutdown(unittest.TestCase): + + def test_exit_codes(self, proc_info): + """Check all processes exited cleanly (SIGTERM allowed).""" + for info in proc_info: + self.assertIn( + info.returncode, ALLOWED_EXIT_CODES, + f'{info.process_name} exited with code {info.returncode}' + ) From 0340852555e820c5cfcaa0763e81180da0eb0471 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Thu, 14 May 2026 13:42:55 +0200 Subject: [PATCH 2/4] fix(gateway,#395): scope app-level list/clear-all by FQN set too The first pass on #395 only fixed the per-fault GET/DELETE routes. The collection routes (handle_list_faults, handle_clear_all_faults) still passed entity_info.namespace_path as the transport-level prefix filter for App entities. Apps with a wildcard ros_binding.namespace_pattern yield an empty effective_fqn, the transport silently disabled the filter, and the list/clear-all routes leaked - or cleared - every fault in the system. Both App branches now route through resolve_entity_source_fqns plus filter_faults_by_sources, matching how Component / Area / Function are already handled. Also: - Move the #395 CHANGELOG entry to Breaking Changes (the 200 -> 404 flip is observable to clients). - Document 404 on GET /{entity}/faults/{fault_code} in docs/api/rest.rst. - Add @verifies REQ_INTEROP_013 / REQ_INTEROP_015 tags to the scope-isolation tests so the generated verification matrix picks them up. - Add a DELETE happy-path test that clears the fault on the owning component and asserts 204. - Add unit coverage for resolve_entity_source_fqns on FUNCTION entities (aggregation + skip-unbound-hosts cases). - Note the wildcard-App limitation in the helper doxygen. - Drop unused launch_testing.actions import in the integration test. --- docs/api/rest.rst | 8 +++- src/ros2_medkit_gateway/CHANGELOG.rst | 4 +- .../http/handlers/handler_context.hpp | 8 +++- .../src/http/handlers/fault_handlers.cpp | 33 ++++++++++++----- .../test/test_handler_context.cpp | 37 +++++++++++++++++++ .../test_faults_scope_isolation.test.py | 31 ++++++++++++++-- 6 files changed, 103 insertions(+), 18 deletions(-) diff --git a/docs/api/rest.rst b/docs/api/rest.rst index f2b1bcfb..986dfffb 100644 --- a/docs/api/rest.rst +++ b/docs/api/rest.rst @@ -707,11 +707,17 @@ Query and manage faults. - ``freeze_frame``: Topic data captured at fault confirmation - ``rosbag``: Recording file available via bulk-data endpoint + **Response codes:** + + - **200:** Fault details + - **404:** Fault not found, or reported by an app outside this entity's scope + - **503:** Fault manager unavailable + ``DELETE /api/v1/components/{id}/faults/{fault_code}`` Clear a fault. - **204:** Fault cleared - - **404:** Fault not found + - **404:** Fault not found, or reported by an app outside this entity's scope ``DELETE /api/v1/components/{id}/faults`` Clear all faults for an entity. diff --git a/src/ros2_medkit_gateway/CHANGELOG.rst b/src/ros2_medkit_gateway/CHANGELOG.rst index e48b4f8e..c5a77eef 100644 --- a/src/ros2_medkit_gateway/CHANGELOG.rst +++ b/src/ros2_medkit_gateway/CHANGELOG.rst @@ -5,9 +5,9 @@ Changelog for package ros2_medkit_gateway Unreleased ---------- -**Fixes:** +**Breaking Changes:** -* ``GET /api/v1/{entity-path}/faults/{fault_code}`` and ``DELETE /api/v1/{entity-path}/faults/{fault_code}`` no longer leak faults across entities. The previous implementation relied on a prefix match against the entity's ``namespace_path`` and silently disabled the scope filter when the entity had an empty ``namespace_path`` (host-derived / synthetic components, manifest components without a ``namespace`` field, Areas, Functions), exposing - and on ``DELETE``, clearing - faults reported by apps that belonged to entirely different entities. Both handlers now resolve the addressed entity to its hosted-app FQN set and return ``404 Resource Not Found`` when the fault's ``reporting_sources`` does not intersect that set. The underlying ``GetFault.srv`` / ``ClearFault.srv`` contract is unchanged. Any client that previously received a ``200`` / ``204`` for a fault outside an entity's scope will now get a ``404`` (`#395 `_) +* Per-entity fault routes are now correctly scoped to the entity's hosted apps. ``GET /api/v1/{entity-path}/faults/{fault_code}``, ``DELETE /api/v1/{entity-path}/faults/{fault_code}``, ``GET /api/v1/{entity-path}/faults``, and ``DELETE /api/v1/{entity-path}/faults`` previously fell back to a prefix match against the entity's ``namespace_path``; when that was empty (host-derived / synthetic components, manifest components without a ``namespace`` field, Areas, Functions, and Apps with a wildcard ``ros_binding.namespace_pattern``) the scope filter was silently disabled and the routes exposed - and on ``DELETE``, cleared - faults reported by apps that belonged to entirely different entities. All four handlers now resolve the addressed entity to its hosted-app FQN set (via the new ``HandlerContext::resolve_entity_source_fqns`` helper) and apply that set as a positive scope filter. Per-fault routes return ``404 Resource Not Found`` when the fault's ``reporting_sources`` does not intersect the set; collection routes return an empty ``items`` array. The underlying ``GetFault.srv`` / ``ClearFault.srv`` contracts are unchanged. Any client that previously received a ``200`` / ``204`` (single fault) or a populated ``items`` array for faults outside an entity's scope will now get ``404`` or an empty list (`#395 `_) * ``GET /api/v1/updates/{id}/status`` no longer returns ``404`` for a registered-but-idle package; ``POST /api/v1/updates`` now seeds a ``pending`` status, so the endpoint returns ``200 {"status": "pending"}`` immediately after registration. ``404`` is reserved for packages that are not registered. Clients that used ``404`` as a signal for "registered but nothing started yet" must adapt (`#378 `_) **Features:** diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp index 96be3fc8..7b863842 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp @@ -348,10 +348,14 @@ class HandlerContext { * never expose faults reported from outside the addressed entity. * * Mapping per entity type: - * - App: the app's `effective_fqn()` (single entry, or empty set if unbound) + * - App: the app's `effective_fqn()` (single entry, or empty set if unbound + * or if `ros_binding.namespace_pattern` is a wildcard - by design + * `effective_fqn()` returns "" for those, so the scope filter treats them + * as having no addressable node and any fault read becomes a 404) * - Component: `effective_fqn()` of every hosted app via `get_apps_for_component()` * - Area: `effective_fqn()` of every app in every component under the area - * - Function: `node_fqn` of every entry in the function's aggregation configuration + * - Function: non-empty `node_fqn` of every entry in the function's aggregation + * configuration (entries with empty `node_fqn` are skipped) * - Unknown: empty set * * An empty result means "no apps are in scope" and callers must treat any diff --git a/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp b/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp index 6fa29c80..8233874a 100644 --- a/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp +++ b/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp @@ -595,20 +595,27 @@ void FaultHandlers::handle_list_faults(const httplib::Request & req, httplib::Re return; } - // For Apps, use namespace_path filtering - std::string namespace_path = entity_info.namespace_path; - auto result = - fault_mgr->list_faults(namespace_path, filter.include_pending, filter.include_confirmed, filter.include_cleared, - filter.include_healed, include_muted, include_clusters); + // For Apps, scope by the app's effective FQN set. We can't reuse the + // transport-level prefix filter (`list_faults(namespace_path, ...)`) + // because an App with a wildcard `ros_binding.namespace_pattern` produces + // an empty effective_fqn, the transport silently disables the filter, and + // every fault in the system would be returned. Same bug class as #395 for + // Components - fix it the same way. + const auto & cache = ctx_.node()->get_thread_safe_cache(); + auto app_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); + auto result = fault_mgr->list_faults("", filter.include_pending, filter.include_confirmed, filter.include_cleared, + filter.include_healed, include_muted, include_clusters); if (result.success) { + json filtered_faults = filter_faults_by_sources(result.data["faults"], app_fqns); + // Format: items array at top level - json response = {{"items", result.data["faults"]}}; + json response = {{"items", filtered_faults}}; // x-medkit extension for ros2_medkit-specific fields XMedkit ext; ext.entity_id(entity_id); - ext.add("source_id", namespace_path); + ext.add("source_id", entity_info.namespace_path); // Include detailed correlation data if requested and present if (result.data.contains("muted_faults")) { @@ -1006,14 +1013,20 @@ void FaultHandlers::handle_clear_all_faults(const httplib::Request & req, httpli faults_to_clear = filter_faults_by_sources(result.data["faults"], app_fqns); } else { - // Apps: use namespace_path filtering directly - auto result = fault_mgr->list_faults(entity_info.namespace_path); + // Apps: scope by the app's effective FQN. Cannot use the transport-level + // prefix filter (`list_faults(namespace_path)`) directly because Apps + // with a wildcard `ros_binding.namespace_pattern` produce an empty + // effective_fqn, the transport silently disables filtering, and we + // would clear every fault in the system. Same bug class as #395. + auto result = fault_mgr->list_faults(""); if (!result.success) { HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to retrieve faults", {{"details", result.error_message}, {entity_info.id_field, entity_id}}); return; } - faults_to_clear = result.data["faults"]; + const auto & cache = ctx_.node()->get_thread_safe_cache(); + auto app_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); + faults_to_clear = filter_faults_by_sources(result.data["faults"], app_fqns); } // Clear each matching fault diff --git a/src/ros2_medkit_gateway/test/test_handler_context.cpp b/src/ros2_medkit_gateway/test/test_handler_context.cpp index c2221c42..1c64037c 100644 --- a/src/ros2_medkit_gateway/test/test_handler_context.cpp +++ b/src/ros2_medkit_gateway/test/test_handler_context.cpp @@ -1339,6 +1339,43 @@ TEST(ResolveEntitySourceFqnsTest, AppsWithEmptyEffectiveFqnAreSkipped) { EXPECT_EQ(fqns, std::set{"/ns/ok_node"}); } +TEST(ResolveEntitySourceFqnsTest, FunctionAggregatesHostedAppFqns) { + ThreadSafeEntityCache cache; + cache.update_apps({ + make_app_with_binding("nav-planner", "planner", "/perception/nav"), + make_app_with_binding("nav-localizer", "localizer", "/perception/nav"), + make_app_with_binding("unrelated", "telemetry", "/telemetry"), + }); + Function autonomy; + autonomy.id = "autonomous-navigation"; + autonomy.name = "Autonomous Navigation"; + autonomy.hosts = {"nav-planner", "nav-localizer"}; + cache.update_functions({autonomy}); + + auto fqns = HandlerContext::resolve_entity_source_fqns( + cache, make_entity_info(EntityType::FUNCTION, "autonomous-navigation")); + std::set expected{"/perception/nav/planner", "/perception/nav/localizer"}; + EXPECT_EQ(fqns, expected); +} + +TEST(ResolveEntitySourceFqnsTest, FunctionWithUnboundHostsReturnsOnlyResolvedFqns) { + // Function.hosts referencing an app that does not produce an effective_fqn + // (no bound_fqn and no ros_binding) must be skipped, not return empty FQNs + // - they would prefix-match everything in `fault_in_source_scope` and the + // scope check would let any fault through. + ThreadSafeEntityCache cache; + App unbound; + unbound.id = "unbound"; + cache.update_apps({unbound, make_app_with_binding("planner", "planner", "/perception/nav")}); + Function f; + f.id = "func-x"; + f.hosts = {"unbound", "planner"}; + cache.update_functions({f}); + + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::FUNCTION, "func-x")); + EXPECT_EQ(fqns, std::set{"/perception/nav/planner"}); +} + int main(int argc, char ** argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py b/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py index bf4210f5..170dc2a3 100644 --- a/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py +++ b/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py @@ -32,7 +32,6 @@ from ament_index_python.packages import get_package_share_directory import launch_testing -import launch_testing.actions import requests from ros2_medkit_test_utils.constants import ALLOWED_EXIT_CODES @@ -92,14 +91,20 @@ def test_get_fault_returns_404_on_unrelated_component(self): ) def test_get_fault_returns_200_on_owning_component(self): - """GET on the component that hosts the reporting app must return the fault.""" + """GET on the component that hosts the reporting app must return the fault. + + @verifies REQ_INTEROP_013 + """ data = self.get_json( f'/components/{LIDAR_OWNER_COMPONENT}/faults/{LIDAR_FAULT_CODE}' ) self.assertEqual(data.get('item', {}).get('code'), LIDAR_FAULT_CODE) def test_clear_fault_returns_404_on_unrelated_component(self): - """DELETE on a different component must not clear another entity's fault.""" + """DELETE on a different component must not clear another entity's fault. + + @verifies REQ_INTEROP_015 + """ response = requests.delete( f'{self.BASE_URL}/components/{OTHER_COMPONENT}/faults/{LIDAR_FAULT_CODE}', timeout=10, @@ -117,6 +122,26 @@ def test_clear_fault_returns_404_on_unrelated_component(self): 'Unrelated DELETE must not clear faults outside its scope', ) + def test_zzz_clear_fault_returns_204_on_owning_component(self): + """DELETE on the owning component clears the fault. + + Named `_zzz_` so unittest's alphabetic ordering runs this last - it is + destructive (clears the lidar fault from the global FaultManager). The + lidar_sensor demo re-reports LIDAR_RANGE_INVALID continuously, so this + does not break other test runs, but in-suite siblings rely on the fault + being present via the setUp `wait_for_fault` poll. + + @verifies REQ_INTEROP_015 + """ + response = requests.delete( + f'{self.BASE_URL}/components/{LIDAR_OWNER_COMPONENT}/faults/{LIDAR_FAULT_CODE}', + timeout=10, + ) + self.assertEqual( + response.status_code, 204, + f'Expected 204, got {response.status_code} body={response.text}', + ) + @launch_testing.post_shutdown_test() class TestShutdown(unittest.TestCase): From 37693373329d2c05a0554ffc9723a1a29ee199b7 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Thu, 14 May 2026 14:34:38 +0200 Subject: [PATCH 3/4] fix(gateway,#395): tighter scope check, area recursion, function components Address the eight review findings from the post-push self-review pass: - Source matching now enforces a path boundary: a node is in scope iff it equals one of the entity's owned FQNs or is a strict path-child (`/<...>`). Pre-fix, `rfind(prefix, 0) == 0` let sibling nodes like `/ns/node_extra` pass for an entity owning `/ns/node`. - `fault_in_source_scope` now requires ALL reporting sources to be in scope. The old "any match" semantic exposed two cross-entity paths: the GET response carries the fault's full `reporting_sources` and environment data; the DELETE issues an unscoped `ClearFault.srv` that clears the aggregated record across every source for the code. `filter_faults_by_sources` inherits the same semantic so per-entity collection routes and per-fault routes agree on what counts as in scope. - `resolve_entity_source_fqns` AREA branch walks `get_subareas()` recursively. The manifest model nests areas (e.g. `engine` under `powertrain`) and components attach to subareas, so a top-level area query previously returned an empty set. - `resolve_entity_source_fqns` FUNCTION branch handles host IDs that are components (Function.hosts is documented as "App or Component IDs"), expanding component hosts to their apps. The cache's `function_to_apps_` index only resolves app hosts. - `fault_in_source_scope` is now a public static method on FaultHandlers so it can be unit-tested directly without a launch test. Added test_fault_handlers cases for exact match, path-child match, prefix collision, multi-source all-in-scope, multi-source partial, empty-scope, missing field, malformed entry, and root-FQN edge. - Added test_handler_context cases for nested-subarea recursion and function-hosting-component expansion. - Integration test setUp now verifies both the owning and the unrelated component were actually discovered before asserting 404. Without it, a 404 from an undiscovered entity is indistinguishable from the 404 the scope filter produces, so the regression would silently pass even with the fix reverted. - Doxygen for `resolve_entity_source_fqns` reworded to be generic about what callers do with an empty result (per-fault routes 404, list 200 with empty `items`, clear-all 204). --- .../http/handlers/fault_handlers.hpp | 29 ++++++ .../http/handlers/handler_context.hpp | 28 ++++-- .../src/http/handlers/fault_handlers.cpp | 86 +++++++++--------- .../src/http/handlers/handler_context.cpp | 78 +++++++++++----- .../test/test_fault_handlers.cpp | 89 +++++++++++++++++++ .../test/test_handler_context.cpp | 62 +++++++++++++ .../test_faults_scope_isolation.test.py | 14 +++ 7 files changed, 312 insertions(+), 74 deletions(-) diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/fault_handlers.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/fault_handlers.hpp index 014fdf0f..87bf9d5b 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/fault_handlers.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/fault_handlers.hpp @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include "ros2_medkit_gateway/http/handlers/handler_context.hpp" @@ -105,6 +106,34 @@ class FaultHandlers { const nlohmann::json & env_data_json, const std::string & entity_path); + /** + * @brief Scope check used by per-entity fault routes. + * + * Returns true iff every entry in `fault["reporting_sources"]` is in scope + * for `source_fqns`. A source matches a scope FQN when it equals the FQN + * or is a strict path-child (i.e. `/<...>`), so similarly named nodes + * like `/ns/node` and `/ns/node_extra` are not conflated. + * + * The "all sources must match" semantic (rather than "any source") is + * deliberate: it blocks two cross-entity escalation paths. + * + * 1. GET would otherwise return a response whose `reporting_sources` and + * environment data carry identities of nodes the caller has no + * business reading. + * 2. DELETE would otherwise let a viewer of entity A clear the aggregated + * fault record for a fault that entity B also reports, because the + * underlying `ClearFault.srv` has no scope argument. + * + * An empty scope set, an empty `reporting_sources` array, a missing + * `reporting_sources` field, or any non-string source entry all return + * false - there is no vacuous "all match" case. + * + * Public for direct unit testing; called by `handle_get_fault`, + * `handle_clear_fault`, and indirectly via `filter_faults_by_sources` by + * the collection routes. + */ + static bool fault_in_source_scope(const nlohmann::json & fault, const std::set & source_fqns); + private: HandlerContext & ctx_; }; diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp index 7b863842..c150a3e9 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/http/handlers/handler_context.hpp @@ -350,16 +350,26 @@ class HandlerContext { * Mapping per entity type: * - App: the app's `effective_fqn()` (single entry, or empty set if unbound * or if `ros_binding.namespace_pattern` is a wildcard - by design - * `effective_fqn()` returns "" for those, so the scope filter treats them - * as having no addressable node and any fault read becomes a 404) - * - Component: `effective_fqn()` of every hosted app via `get_apps_for_component()` - * - Area: `effective_fqn()` of every app in every component under the area - * - Function: non-empty `node_fqn` of every entry in the function's aggregation - * configuration (entries with empty `node_fqn` are skipped) - * - Unknown: empty set + * `effective_fqn()` returns "" for those, so the entity simply has no + * addressable ROS node and the scope check excludes every fault). + * - Component: `effective_fqn()` of every hosted app via + * `get_apps_for_component()`. + * - Area: `effective_fqn()` of every app under the area, walking + * `get_subareas()` recursively so nested areas (e.g. ``powertrain -> + * engine``) resolve to the union of their descendants' apps. + * - Function: `effective_fqn()` of every app the function hosts directly + * plus, for hosts that are component IDs, the apps inside those + * components. + * - Unknown: empty set. * - * An empty result means "no apps are in scope" and callers must treat any - * fault as out-of-scope (i.e., return 404), never as "no filter". + * Apps whose `effective_fqn()` is empty are silently dropped from the set + * so the scope check cannot match arbitrary FQNs against an empty prefix. + * + * An empty result means "no apps are in scope" and callers must NEVER + * interpret that as "no filter" - any fault must be treated as out of + * scope. The exact response (404 for per-fault routes, an empty `items` + * array for collection lists, 204 for collection clears) is up to the + * caller. * * @param cache Entity cache for lookups * @param entity Resolved entity info (from `validate_entity_for_route`) diff --git a/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp b/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp index 8233874a..6d1273b1 100644 --- a/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp +++ b/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp @@ -40,55 +40,33 @@ namespace handlers { namespace { -/// Check if a single fault's reporting_sources intersects the given prefix set. -/// Returns true when any reporting_source starts with any prefix. An empty -/// prefix set always returns false ("nothing is in scope"), so callers must -/// supply the entity's own FQN set explicitly - a fault is in scope only when -/// the entity owns at least one app that reported it. -bool fault_in_source_scope(const json & fault, const std::set & source_prefixes) { - if (source_prefixes.empty()) { - return false; - } - if (!fault.contains("reporting_sources") || !fault["reporting_sources"].is_array()) { - return false; - } - for (const auto & src : fault["reporting_sources"]) { - if (!src.is_string()) { - continue; +/// Check if a ROS node FQN falls within the entity's source FQN set. +/// +/// A node is in scope iff it equals one of the entity's owned FQNs, OR is a +/// strict path-child of one (i.e., `/<...>`). We deliberately do +/// NOT use raw `rfind(prefix, 0)` because that would let `/ns/node_extra` +/// pass for an entity owning `/ns/node`. Path boundary is enforced by +/// requiring the byte after the prefix to be `/`. +bool source_matches_scope(const std::string & src, const std::set & scope_fqns) { + for (const auto & fqn : scope_fqns) { + if (src == fqn) { + return true; } - const auto src_str = src.get(); - for (const auto & prefix : source_prefixes) { - if (src_str.rfind(prefix, 0) == 0) { - return true; - } + if (src.size() > fqn.size() && src.compare(0, fqn.size(), fqn) == 0 && src[fqn.size()] == '/') { + return true; } } return false; } -/// Helper to filter faults JSON array by a set of namespace prefixes -/// Keeps faults where any reporting_source starts with any of the given prefixes -json filter_faults_by_sources(const json & faults_array, const std::set & source_prefixes) { +/// Filter a faults JSON array down to faults whose every reporting source is +/// in the entity's scope. Shares the same all-sources / path-boundary +/// semantics as `FaultHandlers::fault_in_source_scope` so per-entity +/// collection routes and per-fault routes agree on what counts as "in scope". +json filter_faults_by_sources(const json & faults_array, const std::set & source_fqns) { json filtered = json::array(); for (const auto & fault : faults_array) { - if (!fault.contains("reporting_sources")) { - continue; - } - const auto & sources = fault["reporting_sources"]; - bool matches = false; - for (const auto & src : sources) { - const std::string src_str = src.get(); - for (const auto & prefix : source_prefixes) { - if (src_str.rfind(prefix, 0) == 0) { - matches = true; - break; - } - } - if (matches) { - break; - } - } - if (matches) { + if (FaultHandlers::fault_in_source_scope(fault, source_fqns)) { filtered.push_back(fault); } } @@ -192,6 +170,28 @@ json extract_primary_value(const std::string & message_type, const json & full_d } // namespace +bool FaultHandlers::fault_in_source_scope(const json & fault, const std::set & source_fqns) { + if (source_fqns.empty()) { + return false; + } + if (!fault.contains("reporting_sources") || !fault["reporting_sources"].is_array()) { + return false; + } + const auto & sources = fault["reporting_sources"]; + if (sources.empty()) { + return false; + } + for (const auto & src : sources) { + if (!src.is_string()) { + return false; + } + if (!source_matches_scope(src.get(), source_fqns)) { + return false; + } + } + return true; +} + // Static method: Build SOVD-compliant fault response from transport-supplied JSON. // // The transport adapter performs ros2_medkit_msgs -> JSON translation; the @@ -721,7 +721,7 @@ void FaultHandlers::handle_get_fault(const httplib::Request & req, httplib::Resp const auto & cache = ctx_.node()->get_thread_safe_cache(); auto source_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); - if (!fault_in_source_scope(fault_json, source_fqns)) { + if (!FaultHandlers::fault_in_source_scope(fault_json, source_fqns)) { HandlerContext::send_error(res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", {{"details", "Fault is not reported by any app within this entity's scope"}, {entity_info.id_field, entity_id}, @@ -838,7 +838,7 @@ void FaultHandlers::handle_clear_fault(const httplib::Request & req, httplib::Re const auto & cache = ctx_.node()->get_thread_safe_cache(); auto source_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); const auto & fault_json = get_result.data.value("fault", json::object()); - if (!fault_in_source_scope(fault_json, source_fqns)) { + if (!FaultHandlers::fault_in_source_scope(fault_json, source_fqns)) { HandlerContext::send_error(res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", {{"details", "Fault is not reported by any app within this entity's scope"}, {entity_info.id_field, entity_id}, diff --git a/src/ros2_medkit_gateway/src/http/handlers/handler_context.cpp b/src/ros2_medkit_gateway/src/http/handlers/handler_context.cpp index 301c5a37..8a4562dc 100644 --- a/src/ros2_medkit_gateway/src/http/handlers/handler_context.cpp +++ b/src/ros2_medkit_gateway/src/http/handlers/handler_context.cpp @@ -361,39 +361,73 @@ void collect_app_fqn(const ThreadSafeEntityCache & cache, const std::string & ap out.insert(std::move(fqn)); } +void collect_component_app_fqns(const ThreadSafeEntityCache & cache, const std::string & comp_id, + std::set & out) { + for (const auto & app_id : cache.get_apps_for_component(comp_id)) { + collect_app_fqn(cache, app_id, out); + } +} + +void collect_area_app_fqns(const ThreadSafeEntityCache & cache, const std::string & area_id, + std::set & out) { + // BFS over (area, subareas...) so a top-level area whose components live in + // nested subareas still resolves to the union of every descendant's apps. + // Without the recursion, e.g. `/areas/powertrain/...` returns an empty set + // when components are attached to `engine` (subarea of `powertrain`). + std::vector pending = {area_id}; + std::set visited; + while (!pending.empty()) { + auto current = std::move(pending.back()); + pending.pop_back(); + if (!visited.insert(current).second) { + continue; + } + for (const auto & comp_id : cache.get_components_for_area(current)) { + collect_component_app_fqns(cache, comp_id, out); + } + for (const auto & sub_id : cache.get_subareas(current)) { + pending.push_back(sub_id); + } + } +} + +void collect_function_app_fqns(const ThreadSafeEntityCache & cache, const std::string & function_id, + std::set & out) { + // Function.hosts can contain either App IDs or Component IDs; the indexed + // lookups in the cache only resolve the App-host case (function_to_apps_), + // so we walk the raw `hosts` list and dispatch per host kind ourselves. + auto func = cache.get_function(function_id); + if (!func) { + return; + } + for (const auto & host_id : func->hosts) { + if (cache.get_app(host_id)) { + collect_app_fqn(cache, host_id, out); + } else if (cache.get_component(host_id)) { + collect_component_app_fqns(cache, host_id, out); + } + // Unknown host - silently skip; it would have been flagged by manifest validation. + } +} + } // namespace std::set HandlerContext::resolve_entity_source_fqns(const ThreadSafeEntityCache & cache, const EntityInfo & entity) { std::set fqns; switch (entity.type) { - case EntityType::APP: { + case EntityType::APP: collect_app_fqn(cache, entity.id, fqns); break; - } - case EntityType::COMPONENT: { - for (const auto & app_id : cache.get_apps_for_component(entity.id)) { - collect_app_fqn(cache, app_id, fqns); - } + case EntityType::COMPONENT: + collect_component_app_fqns(cache, entity.id, fqns); break; - } - case EntityType::AREA: { - for (const auto & comp_id : cache.get_components_for_area(entity.id)) { - for (const auto & app_id : cache.get_apps_for_component(comp_id)) { - collect_app_fqn(cache, app_id, fqns); - } - } + case EntityType::AREA: + collect_area_app_fqns(cache, entity.id, fqns); break; - } - case EntityType::FUNCTION: { - auto agg_configs = cache.get_entity_configurations(entity.id); - for (const auto & node : agg_configs.nodes) { - if (!node.node_fqn.empty()) { - fqns.insert(node.node_fqn); - } - } + case EntityType::FUNCTION: + collect_function_app_fqns(cache, entity.id, fqns); break; - } case EntityType::UNKNOWN: break; } diff --git a/src/ros2_medkit_gateway/test/test_fault_handlers.cpp b/src/ros2_medkit_gateway/test/test_fault_handlers.cpp index c0b5a5e0..450f0eb3 100644 --- a/src/ros2_medkit_gateway/test/test_fault_handlers.cpp +++ b/src/ros2_medkit_gateway/test/test_fault_handlers.cpp @@ -387,3 +387,92 @@ TEST_F(FaultHandlersTest, BuildSovdFaultResponseMixedSnapshots) { EXPECT_EQ(snap1["type"], "rosbag"); EXPECT_EQ(snap1["bulk_data_uri"], "/components/motor/bulk-data/rosbags/MIXED_FAULT"); } + +// ============================================================================= +// fault_in_source_scope tests (#395) +// +// Direct coverage for the scope-check logic that backs per-entity GET/DELETE +// and the collection-route filter. Integration tests pin the HTTP behavior; +// these tests pin the boundary semantics that decide what counts as "in +// scope" - especially the cases that motivated the post-review tightening: +// prefix-colliding FQNs and multi-source faults. +// ============================================================================= + +namespace { + +json make_fault(std::vector reporting_sources, const std::string & code = "F1") { + json f; + f["fault_code"] = code; + f["reporting_sources"] = std::move(reporting_sources); + return f; +} + +} // namespace + +TEST(FaultInSourceScopeTest, SingleSourceExactMatchInScope) { + EXPECT_TRUE(FaultHandlers::fault_in_source_scope(make_fault({"/perception/lidar/lidar_sensor"}), + {"/perception/lidar/lidar_sensor"})); +} + +TEST(FaultInSourceScopeTest, SubpathOfScopedFqnIsInScope) { + // A node nested under the entity's own FQN must still match - e.g. an app + // FQN `/perception/lidar/lidar_sensor` and a reporter at + // `/perception/lidar/lidar_sensor/diagnostic_updater` should be the same + // app's sub-node, not a stranger. + EXPECT_TRUE(FaultHandlers::fault_in_source_scope(make_fault({"/perception/lidar/lidar_sensor/diagnostic_updater"}), + {"/perception/lidar/lidar_sensor"})); +} + +TEST(FaultInSourceScopeTest, PrefixCollidingNameIsNotInScope) { + // `/ns/node_extra` shares the `/ns/node` prefix but is a distinct ROS node. + // The pre-review check used `rfind(prefix, 0) == 0` and silently let it + // through - this test pins the path-boundary fix. + EXPECT_FALSE(FaultHandlers::fault_in_source_scope(make_fault({"/ns/node_extra"}), {"/ns/node"})); +} + +TEST(FaultInSourceScopeTest, MultiSourceAllInScopeIsInScope) { + std::set scope{"/perception/lidar/lidar_sensor", "/perception/rgb/rgb_camera"}; + EXPECT_TRUE(FaultHandlers::fault_in_source_scope( + make_fault({"/perception/lidar/lidar_sensor", "/perception/rgb/rgb_camera"}), scope)); +} + +TEST(FaultInSourceScopeTest, MultiSourcePartiallyInScopeIsOutOfScope) { + // The all-sources semantic blocks a cross-entity DELETE escalation: an + // entity that owns only `/perception/lidar/lidar_sensor` must not be able + // to clear (or read in detail) a fault that another entity in + // `/telemetry/...` also reported. + std::set scope{"/perception/lidar/lidar_sensor"}; + EXPECT_FALSE(FaultHandlers::fault_in_source_scope( + make_fault({"/perception/lidar/lidar_sensor", "/telemetry/telemetry_node"}), scope)); +} + +TEST(FaultInSourceScopeTest, EmptyScopeSetIsOutOfScope) { + EXPECT_FALSE(FaultHandlers::fault_in_source_scope(make_fault({"/perception/lidar/lidar_sensor"}), {})); +} + +TEST(FaultInSourceScopeTest, EmptyReportingSourcesIsOutOfScope) { + EXPECT_FALSE(FaultHandlers::fault_in_source_scope(make_fault({}), {"/perception/lidar/lidar_sensor"})); +} + +TEST(FaultInSourceScopeTest, MissingReportingSourcesFieldIsOutOfScope) { + json fault; + fault["fault_code"] = "F1"; + EXPECT_FALSE(FaultHandlers::fault_in_source_scope(fault, {"/perception/lidar/lidar_sensor"})); +} + +TEST(FaultInSourceScopeTest, NonStringSourceEntryIsOutOfScope) { + json fault; + fault["fault_code"] = "F1"; + fault["reporting_sources"] = json::array(); + fault["reporting_sources"].push_back("/perception/lidar/lidar_sensor"); + fault["reporting_sources"].push_back(42); // Malformed entry + EXPECT_FALSE(FaultHandlers::fault_in_source_scope(fault, {"/perception/lidar/lidar_sensor"})); +} + +TEST(FaultInSourceScopeTest, RootFqnEdgeCase) { + // Boundary check still works with "/" prefix candidates (artificial but + // exercises the index arithmetic): "/" itself can't match "/anything" + // unless the scope set actually contains "/" - which our resolver never + // emits. This pin catches a future regression that special-cased "/". + EXPECT_FALSE(FaultHandlers::fault_in_source_scope(make_fault({"/something"}), {"/other"})); +} diff --git a/src/ros2_medkit_gateway/test/test_handler_context.cpp b/src/ros2_medkit_gateway/test/test_handler_context.cpp index 1c64037c..f06f7792 100644 --- a/src/ros2_medkit_gateway/test/test_handler_context.cpp +++ b/src/ros2_medkit_gateway/test/test_handler_context.cpp @@ -1358,6 +1358,68 @@ TEST(ResolveEntitySourceFqnsTest, FunctionAggregatesHostedAppFqns) { EXPECT_EQ(fqns, expected); } +TEST(ResolveEntitySourceFqnsTest, AreaWalksNestedSubareasRecursively) { + // Real manifests put components under subareas (e.g. demo manifest attaches + // components to `engine` which is a subarea of `powertrain`). A query for + // the top-level area must walk subareas, otherwise the demo would 404 on + // every `/areas/powertrain/...` fault read. + ThreadSafeEntityCache cache; + Area powertrain; + powertrain.id = "powertrain"; + powertrain.namespace_path = "/powertrain"; + Area engine; + engine.id = "engine"; + engine.namespace_path = "/powertrain/engine"; + engine.parent_area_id = "powertrain"; + Area perception; + perception.id = "perception"; + perception.namespace_path = "/perception"; + cache.update_areas({powertrain, engine, perception}); + + Component engine_ecu; + engine_ecu.id = "engine-ecu"; + engine_ecu.area = "engine"; // attached to subarea, NOT to top-level + Component lidar_unit; + lidar_unit.id = "lidar-unit"; + lidar_unit.area = "perception"; + cache.update_components({engine_ecu, lidar_unit}); + + cache.update_apps({ + make_owned_app("engine-temp-sensor", "engine-ecu", "temp_sensor", "/powertrain/engine"), + make_owned_app("lidar-sensor", "lidar-unit", "lidar_sensor", "/perception/lidar"), + }); + + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::AREA, "powertrain")); + EXPECT_EQ(fqns, std::set{"/powertrain/engine/temp_sensor"}); +} + +TEST(ResolveEntitySourceFqnsTest, FunctionHostingComponentExpandsToComponentApps) { + // Function.hosts can carry component IDs as well as app IDs (see + // function.hpp). The cache's function_to_apps_ index only resolves + // app-host entries, so the helper must reach into Function.hosts itself + // and expand component hosts to the apps they own. + ThreadSafeEntityCache cache; + cache.update_components({Component{}}); // touch to ensure indexes settle + Component drive_ecu; + drive_ecu.id = "drive-ecu"; + cache.update_components({drive_ecu}); + + cache.update_apps({ + make_owned_app("planner", "drive-ecu", "planner_node", "/drive"), + make_owned_app("localizer", "drive-ecu", "localizer_node", "/drive"), + make_owned_app("standalone", "", "standalone_node", "/misc"), + }); + + Function autonomy; + autonomy.id = "autonomy"; + autonomy.hosts = {"drive-ecu", "standalone"}; // mix of component + app host + cache.update_functions({autonomy}); + + auto fqns = HandlerContext::resolve_entity_source_fqns(cache, make_entity_info(EntityType::FUNCTION, "autonomy")); + std::set expected{"/drive/planner_node", "/drive/localizer_node", "/misc/standalone_node"}; + EXPECT_EQ(fqns, expected); +} + TEST(ResolveEntitySourceFqnsTest, FunctionWithUnboundHostsReturnsOnlyResolvedFqns) { // Function.hosts referencing an app that does not produce an effective_fqn // (no bound_fqn and no ros_binding) must be skipped, not return empty FQNs diff --git a/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py b/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py index 170dc2a3..786c6c47 100644 --- a/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py +++ b/src/ros2_medkit_integration_tests/test/features/test_faults_scope_isolation.test.py @@ -71,6 +71,20 @@ class TestFaultsScopeIsolation(GatewayTestCase): def setUp(self): super().setUp() + # Verify both components exist before running scope-leak assertions. + # Without this, a 404 from an undiscovered entity would be + # indistinguishable from a 404 produced by the scope filter, and the + # regression test would silently pass even if the fix was reverted. + for comp_id in (LIDAR_OWNER_COMPONENT, OTHER_COMPONENT): + response = requests.get( + f'{self.BASE_URL}/components/{comp_id}', timeout=10, + ) + self.assertEqual( + response.status_code, 200, + f'Precondition failed: component {comp_id} was not discovered ' + f'(GET returned {response.status_code}); the 404 assertions ' + f'below would pass for the wrong reason.', + ) # The lidar app reports LIDAR_RANGE_INVALID deterministically when # launched with faulty params. Wait until the gateway sees it on the # owning app before exercising the per-component routes. From 3e8673740f1e4faa0425f1029d58f13580634727 Mon Sep 17 00:00:00 2001 From: Bartosz Burda Date: Thu, 14 May 2026 15:45:24 +0200 Subject: [PATCH 4/4] fix(gateway,#395): address N1-N5 from second self-review pass N1: Per-entity App fault list response no longer emits the global muted_count / cluster_count / muted_faults / clusters correlation metadata. include_muted / include_clusters URL params are inert on per-entity routes - they remain on the global GET /api/v1/faults route where the aggregates are meaningful. N2: ClearFault.srv gains a `skip_correlation_auto_clear` request flag. fault_manager_node honors it: when true, the correlation engine is NOT asked to compute the auto-clear cascade for the cleared fault code, and auto_cleared_codes in the response is empty. Per-entity DELETE routes (both single-fault and clear-all) set the flag to true so a scoped clear cannot reach faults reported by apps in other entities via auto_clear_with_root rules. Global DELETE /api/v1/faults keeps the flag false to preserve cluster-wide clearing. N3: handle_list_faults and handle_clear_all_faults Function/Area branches now route through HandlerContext::resolve_entity_source_fqns. Previously the Function branch used cache.get_entity_configurations() which only resolves App hosts (so a Function whose `hosts` list carried a Component ID returned an empty FQN set), and the Area branch walked cache.get_components_for_area() without recursing into subareas (so a top-level area like `powertrain` whose components lived in the `engine` subarea returned an empty set). Both classes are fixed via the shared helper. N4: 404 messages on per-fault GET/DELETE now describe the actual all-sources rule: a fault must have every reporting_source in the entity's scope; mixed-source faults that include any out-of-entity reporter are rejected. The previous wording said "not reported by any app within scope" which was wrong for the mixed-source case. N5: CHANGELOG behavior-change description now matches the implementation - the rule is "every reporting_source must be in scope", not just "reporting_sources must intersect". Documents the include_muted / include_clusters drop and the new ClearFault flag for per-entity DELETE. Test changes: - MockFaultServiceTransport in test_fault_manager_routing tracks the new skip_correlation_auto_clear arg. - Two new FaultManagerRoutingTest cases pin the default (false, for global routes) and the per-entity routing (true, for scoped DELETE). --- .../src/fault_manager_node.cpp | 7 +- src/ros2_medkit_gateway/CHANGELOG.rst | 2 +- .../core/managers/fault_manager.hpp | 6 +- .../transports/fault_service_transport.hpp | 7 +- .../ros2_fault_service_transport.hpp | 2 +- .../src/core/managers/fault_manager.cpp | 4 +- .../src/http/handlers/fault_handlers.cpp | 269 +++++------------- .../ros2_fault_service_transport.cpp | 3 +- .../test/test_fault_manager_routing.cpp | 31 +- src/ros2_medkit_msgs/srv/ClearFault.srv | 8 + 10 files changed, 130 insertions(+), 209 deletions(-) diff --git a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp index e8880992..a9c8d742 100644 --- a/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp +++ b/src/ros2_medkit_fault_manager/src/fault_manager_node.cpp @@ -530,9 +530,12 @@ void FaultManagerNode::handle_clear_fault( return; } - // Process through correlation engine first (to get auto-clear list) + // Process through correlation engine first (to get auto-clear list). + // `skip_correlation_auto_clear` lets the caller opt out of cascade-clearing + // correlated symptom fault codes. Per-entity DELETE routes set it to true + // so they cannot reach across entity boundaries via the correlation graph. std::vector auto_cleared_codes; - if (correlation_engine_) { + if (correlation_engine_ && !request->skip_correlation_auto_clear) { auto clear_result = correlation_engine_->process_clear(request->fault_code); auto_cleared_codes = clear_result.auto_cleared_codes; } diff --git a/src/ros2_medkit_gateway/CHANGELOG.rst b/src/ros2_medkit_gateway/CHANGELOG.rst index c5a77eef..4d6928d4 100644 --- a/src/ros2_medkit_gateway/CHANGELOG.rst +++ b/src/ros2_medkit_gateway/CHANGELOG.rst @@ -7,7 +7,7 @@ Unreleased **Breaking Changes:** -* Per-entity fault routes are now correctly scoped to the entity's hosted apps. ``GET /api/v1/{entity-path}/faults/{fault_code}``, ``DELETE /api/v1/{entity-path}/faults/{fault_code}``, ``GET /api/v1/{entity-path}/faults``, and ``DELETE /api/v1/{entity-path}/faults`` previously fell back to a prefix match against the entity's ``namespace_path``; when that was empty (host-derived / synthetic components, manifest components without a ``namespace`` field, Areas, Functions, and Apps with a wildcard ``ros_binding.namespace_pattern``) the scope filter was silently disabled and the routes exposed - and on ``DELETE``, cleared - faults reported by apps that belonged to entirely different entities. All four handlers now resolve the addressed entity to its hosted-app FQN set (via the new ``HandlerContext::resolve_entity_source_fqns`` helper) and apply that set as a positive scope filter. Per-fault routes return ``404 Resource Not Found`` when the fault's ``reporting_sources`` does not intersect the set; collection routes return an empty ``items`` array. The underlying ``GetFault.srv`` / ``ClearFault.srv`` contracts are unchanged. Any client that previously received a ``200`` / ``204`` (single fault) or a populated ``items`` array for faults outside an entity's scope will now get ``404`` or an empty list (`#395 `_) +* Per-entity fault routes are now correctly scoped to the entity's hosted apps. ``GET /api/v1/{entity-path}/faults/{fault_code}``, ``DELETE /api/v1/{entity-path}/faults/{fault_code}``, ``GET /api/v1/{entity-path}/faults``, and ``DELETE /api/v1/{entity-path}/faults`` previously fell back to a prefix match against the entity's ``namespace_path``; when that was empty (host-derived / synthetic components, manifest components without a ``namespace`` field, Areas, Functions, and Apps with a wildcard ``ros_binding.namespace_pattern``) the scope filter was silently disabled and the routes exposed - and on ``DELETE``, cleared - faults reported by apps that belonged to entirely different entities. All four handlers now resolve the addressed entity to its hosted-app FQN set (via the new ``HandlerContext::resolve_entity_source_fqns`` helper) and apply a strict all-sources scope check: a fault counts as in scope only when **every** entry in its ``reporting_sources`` is owned by the entity (exact FQN match, or strict path-child via ``/...``). Per-fault routes return ``404 Resource Not Found`` for any fault that fails the check; collection routes return an empty ``items`` array. The underlying ``GetFault.srv`` contract is unchanged; ``ClearFault.srv`` gains a new ``skip_correlation_auto_clear`` request flag so per-entity DELETE can opt out of cascade-clearing correlated symptom fault codes that may live in other entities. Per-entity collection responses no longer include the global ``muted_count`` / ``cluster_count`` / ``muted_faults`` / ``clusters`` correlation metadata; those remain on the global ``GET /api/v1/faults`` route. Behavior changes visible to clients: (a) faults reported by apps outside the addressed entity are no longer returned or cleared via that entity's route, (b) **mixed-source** faults that include at least one out-of-entity reporter are likewise rejected with ``404`` on per-fault routes and excluded from per-entity collection responses (use the global ``GET /api/v1/faults`` to see them), (c) per-entity DELETE no longer cascade-clears correlated symptoms outside the entity (`#395 `_) * ``GET /api/v1/updates/{id}/status`` no longer returns ``404`` for a registered-but-idle package; ``POST /api/v1/updates`` now seeds a ``pending`` status, so the endpoint returns ``200 {"status": "pending"}`` immediately after registration. ``404`` is reserved for packages that are not registered. Clients that used ``404`` as a signal for "registered but nothing started yet" must adapt (`#378 `_) **Features:** diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/core/managers/fault_manager.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/core/managers/fault_manager.hpp index 85798df2..2b6627b7 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/core/managers/fault_manager.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/core/managers/fault_manager.hpp @@ -79,8 +79,10 @@ class FaultManager { /// Get a specific fault by code (JSON result - "fault" body only). FaultResult get_fault(const std::string & fault_code, const std::string & source_id = ""); - /// Clear a fault. - FaultResult clear_fault(const std::string & fault_code); + /// Clear a fault. When `skip_correlation_auto_clear` is true the fault + /// manager will not cascade-clear correlated symptom fault codes - per-entity + /// DELETE routes set this to keep their clear inside the entity boundary. + FaultResult clear_fault(const std::string & fault_code, bool skip_correlation_auto_clear = false); /// Get snapshots for a fault (optional topic filter). FaultResult get_snapshots(const std::string & fault_code, const std::string & topic = ""); diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/core/transports/fault_service_transport.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/core/transports/fault_service_transport.hpp index 63963d08..8a4c1288 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/core/transports/fault_service_transport.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/core/transports/fault_service_transport.hpp @@ -47,7 +47,12 @@ class FaultServiceTransport { virtual FaultResult get_fault(const std::string & fault_code, const std::string & source_id) = 0; - virtual FaultResult clear_fault(const std::string & fault_code) = 0; + /// Clear a fault by its fault_code. + /// `skip_correlation_auto_clear`, when true, asks the fault manager to NOT + /// cascade-clear correlated symptom fault codes. Per-entity DELETE routes + /// set this to true so the clear cannot reach faults reported by apps + /// outside the addressed entity via the correlation graph. + virtual FaultResult clear_fault(const std::string & fault_code, bool skip_correlation_auto_clear = false) = 0; virtual FaultResult get_snapshots(const std::string & fault_code, const std::string & topic) = 0; diff --git a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/ros2/transports/ros2_fault_service_transport.hpp b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/ros2/transports/ros2_fault_service_transport.hpp index 7d8ddf96..d6766b22 100644 --- a/src/ros2_medkit_gateway/include/ros2_medkit_gateway/ros2/transports/ros2_fault_service_transport.hpp +++ b/src/ros2_medkit_gateway/include/ros2_medkit_gateway/ros2/transports/ros2_fault_service_transport.hpp @@ -70,7 +70,7 @@ class Ros2FaultServiceTransport : public FaultServiceTransport { FaultResult get_fault(const std::string & fault_code, const std::string & source_id) override; - FaultResult clear_fault(const std::string & fault_code) override; + FaultResult clear_fault(const std::string & fault_code, bool skip_correlation_auto_clear) override; FaultResult get_snapshots(const std::string & fault_code, const std::string & topic) override; diff --git a/src/ros2_medkit_gateway/src/core/managers/fault_manager.cpp b/src/ros2_medkit_gateway/src/core/managers/fault_manager.cpp index 68f22b20..cb02ef22 100644 --- a/src/ros2_medkit_gateway/src/core/managers/fault_manager.cpp +++ b/src/ros2_medkit_gateway/src/core/managers/fault_manager.cpp @@ -41,8 +41,8 @@ FaultResult FaultManager::get_fault(const std::string & fault_code, const std::s return transport_->get_fault(fault_code, source_id); } -FaultResult FaultManager::clear_fault(const std::string & fault_code) { - return transport_->clear_fault(fault_code); +FaultResult FaultManager::clear_fault(const std::string & fault_code, bool skip_correlation_auto_clear) { + return transport_->clear_fault(fault_code, skip_correlation_auto_clear); } FaultResult FaultManager::get_snapshots(const std::string & fault_code, const std::string & topic) { diff --git a/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp b/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp index 6d1273b1..e16740e6 100644 --- a/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp +++ b/src/ros2_medkit_gateway/src/http/handlers/fault_handlers.cpp @@ -431,9 +431,12 @@ void FaultHandlers::handle_list_faults(const httplib::Request & req, httplib::Re return; } - // Parse correlation query parameters - bool include_muted = req.get_param_value("include_muted") == "true"; - bool include_clusters = req.get_param_value("include_clusters") == "true"; + // Note: include_muted / include_clusters URL params are intentionally + // ignored on per-entity routes - the underlying service returns + // correlation metadata computed across the entire fault manager, so + // emitting it on a scoped response would leak cross-entity data + // (review finding N1). The global `GET /faults` route is the only place + // these flags are honored. auto fault_mgr = ctx_.node()->get_fault_manager(); @@ -441,10 +444,10 @@ void FaultHandlers::handle_list_faults(const httplib::Request & req, httplib::Re // Functions don't have a single namespace_path (it is always empty in EntityInfo) // because they host apps from potentially different namespaces. // Instead, we collect the FQNs of all host apps and filter by reporting_source. - if (entity_info.type == EntityType::FUNCTION) { + if (entity_info.type == EntityType::FUNCTION || entity_info.type == EntityType::COMPONENT) { // Get all faults (no namespace filter) auto result = fault_mgr->list_faults("", filter.include_pending, filter.include_confirmed, filter.include_cleared, - filter.include_healed, include_muted, include_clusters); + filter.include_healed, /*include_muted=*/false, /*include_clusters=*/false); if (!result.success) { HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to get faults", @@ -452,30 +455,26 @@ void FaultHandlers::handle_list_faults(const httplib::Request & req, httplib::Re return; } - // Collect host app FQNs for filtering + // Resolve FQN set via the shared helper so subareas (Areas) and + // component-hosted Functions are handled the same way as the per-fault + // routes - the previous Function branch only walked + // `get_entity_configurations` and dropped component hosts; the previous + // Component branch was correct but is now consolidated for parity. const auto & cache = ctx_.node()->get_thread_safe_cache(); - auto agg_configs = cache.get_entity_configurations(entity_id); - std::set host_fqns; - for (const auto & node : agg_configs.nodes) { - if (!node.node_fqn.empty()) { - host_fqns.insert(node.node_fqn); - } - } + auto source_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); - // Filter faults to only those from function's host apps - json filtered_faults = filter_faults_by_sources(result.data["faults"], host_fqns); + json filtered_faults = filter_faults_by_sources(result.data["faults"], source_fqns); - // Build response json response = {{"items", filtered_faults}}; XMedkit ext; ext.entity_id(entity_id); - ext.add("aggregation_level", "function"); + const bool is_function = entity_info.type == EntityType::FUNCTION; + ext.add("aggregation_level", is_function ? "function" : "component"); ext.add("aggregated", true); - ext.add("host_count", host_fqns.size()); - // Include source app IDs for cross-referencing aggregated results + ext.add(is_function ? "host_count" : "app_count", source_fqns.size()); json source_ids = json::array(); - for (const auto & fqn : host_fqns) { + for (const auto & fqn : source_fqns) { source_ids.push_back(fqn); } ext.add("aggregation_sources", source_ids); @@ -487,64 +486,11 @@ void FaultHandlers::handle_list_faults(const httplib::Request & req, httplib::Re return; } - // For Components, aggregate faults from all hosted apps - // Components group Apps, so we filter by the apps' FQNs rather than namespace (which is too broad) - if (entity_info.type == EntityType::COMPONENT) { - // Get all faults (no namespace filter) - auto result = fault_mgr->list_faults("", filter.include_pending, filter.include_confirmed, filter.include_cleared, - filter.include_healed, include_muted, include_clusters); - - if (!result.success) { - HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to get faults", - {{"details", result.error_message}, {entity_info.id_field, entity_id}}); - return; - } - - // Collect hosted app FQNs for filtering - const auto & cache = ctx_.node()->get_thread_safe_cache(); - auto app_ids = cache.get_apps_for_component(entity_id); - std::set app_fqns; - for (const auto & app_id : app_ids) { - auto app = cache.get_app(app_id); - if (app) { - auto fqn = app->effective_fqn(); - if (!fqn.empty()) { - app_fqns.insert(std::move(fqn)); - } - } - } - - // Filter faults to only those from component's hosted apps - json filtered_faults = filter_faults_by_sources(result.data["faults"], app_fqns); - - // Build response - json response = {{"items", filtered_faults}}; - - XMedkit ext; - ext.entity_id(entity_id); - ext.add("aggregation_level", "component"); - ext.add("aggregated", true); - ext.add("app_count", app_fqns.size()); - // Include source app FQNs for cross-referencing aggregated results - json source_fqns = json::array(); - for (const auto & fqn : app_fqns) { - source_fqns.push_back(fqn); - } - ext.add("aggregation_sources", source_fqns); - - merge_peer_items(ctx_.aggregation_manager(), req, response, ext); - ext.add("count", response["items"].size()); - response["x-medkit"] = ext.build(); - HandlerContext::send_json(res, response); - return; - } - // For Areas, aggregate faults from all apps in all components within the area // This is an x-medkit extension - SOVD spec does not define fault collections for Areas if (entity_info.type == EntityType::AREA) { - // Get all faults (no namespace filter) auto result = fault_mgr->list_faults("", filter.include_pending, filter.include_confirmed, filter.include_cleared, - filter.include_healed, include_muted, include_clusters); + filter.include_healed, /*include_muted=*/false, /*include_clusters=*/false); if (!result.success) { HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to get faults", @@ -552,36 +498,23 @@ void FaultHandlers::handle_list_faults(const httplib::Request & req, httplib::Re return; } - // Collect FQNs from all apps in all components belonging to this area + // Resolve via the shared helper so the BFS over `get_subareas()` reaches + // components attached to nested subareas (the demo manifest puts every + // component on a subarea, so a direct `get_components_for_area` walk + // would resolve to the empty set and silently 404 every area route). const auto & cache = ctx_.node()->get_thread_safe_cache(); - auto comp_ids = cache.get_components_for_area(entity_id); - std::set app_fqns; - for (const auto & comp_id : comp_ids) { - auto app_ids = cache.get_apps_for_component(comp_id); - for (const auto & app_id : app_ids) { - auto app = cache.get_app(app_id); - if (app) { - auto fqn = app->effective_fqn(); - if (!fqn.empty()) { - app_fqns.insert(std::move(fqn)); - } - } - } - } + auto app_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); - // Filter faults to only those from the area's apps json filtered_faults = filter_faults_by_sources(result.data["faults"], app_fqns); - // Build response json response = {{"items", filtered_faults}}; XMedkit ext; ext.entity_id(entity_id); ext.add("aggregation_level", "area"); ext.add("aggregated", true); - ext.add("component_count", comp_ids.size()); + ext.add("component_count", cache.get_components_for_area(entity_id).size()); ext.add("app_count", app_fqns.size()); - // Include source app FQNs for cross-referencing aggregated results json area_source_fqns = json::array(); for (const auto & fqn : app_fqns) { area_source_fqns.push_back(fqn); @@ -603,8 +536,15 @@ void FaultHandlers::handle_list_faults(const httplib::Request & req, httplib::Re // Components - fix it the same way. const auto & cache = ctx_.node()->get_thread_safe_cache(); auto app_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); + // include_muted / include_clusters intentionally NOT forwarded: the + // service returns muted/cluster aggregates computed across the entire + // fault manager, not against the entity's app FQN set, so emitting them + // on a per-entity response would leak cross-entity correlation metadata + // (review finding N1). Clients that need system-wide correlation data + // use the global `GET /faults` route. auto result = fault_mgr->list_faults("", filter.include_pending, filter.include_confirmed, filter.include_cleared, - filter.include_healed, include_muted, include_clusters); + filter.include_healed, + /*include_muted=*/false, /*include_clusters=*/false); if (result.success) { json filtered_faults = filter_faults_by_sources(result.data["faults"], app_fqns); @@ -617,18 +557,8 @@ void FaultHandlers::handle_list_faults(const httplib::Request & req, httplib::Re ext.entity_id(entity_id); ext.add("source_id", entity_info.namespace_path); - // Include detailed correlation data if requested and present - if (result.data.contains("muted_faults")) { - ext.add("muted_faults", result.data["muted_faults"]); - } - if (result.data.contains("clusters")) { - ext.add("clusters", result.data["clusters"]); - } - merge_peer_items(ctx_.aggregation_manager(), req, response, ext); ext.add("count", response["items"].size()); - ext.add("muted_count", result.data["muted_count"]); - ext.add("cluster_count", result.data["cluster_count"]); response["x-medkit"] = ext.build(); HandlerContext::send_json(res, response); } else { @@ -722,10 +652,14 @@ void FaultHandlers::handle_get_fault(const httplib::Request & req, httplib::Resp const auto & cache = ctx_.node()->get_thread_safe_cache(); auto source_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); if (!FaultHandlers::fault_in_source_scope(fault_json, source_fqns)) { - HandlerContext::send_error(res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", - {{"details", "Fault is not reported by any app within this entity's scope"}, - {entity_info.id_field, entity_id}, - {"fault_code", fault_code}}); + HandlerContext::send_error( + res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", + {{"details", + "Fault is not in scope for this entity: every reporting source must be one of the entity's " + "owned apps, and a mixed-source fault that includes any out-of-entity reporter is rejected " + "to prevent cross-entity disclosure"}, + {entity_info.id_field, entity_id}, + {"fault_code", fault_code}}); return; } @@ -839,14 +773,22 @@ void FaultHandlers::handle_clear_fault(const httplib::Request & req, httplib::Re auto source_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); const auto & fault_json = get_result.data.value("fault", json::object()); if (!FaultHandlers::fault_in_source_scope(fault_json, source_fqns)) { - HandlerContext::send_error(res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", - {{"details", "Fault is not reported by any app within this entity's scope"}, - {entity_info.id_field, entity_id}, - {"fault_code", fault_code}}); + HandlerContext::send_error( + res, 404, ERR_RESOURCE_NOT_FOUND, "Fault not found", + {{"details", + "Fault is not in scope for this entity: every reporting source must be one of the entity's " + "owned apps, and a mixed-source fault that includes any out-of-entity reporter is rejected " + "to prevent cross-entity disclosure"}, + {entity_info.id_field, entity_id}, + {"fault_code", fault_code}}); return; } - auto result = fault_mgr->clear_fault(fault_code); + // Pass `skip_correlation_auto_clear=true` so this scoped DELETE cannot + // cascade-clear correlated symptom fault codes that may be reported by + // apps in other entities. The cluster-wide clear behaviour is preserved + // on the global `DELETE /api/v1/faults` route below. + auto result = fault_mgr->clear_fault(fault_code, /*skip_correlation_auto_clear=*/true); if (result.success) { // Format: return 204 No Content on successful delete @@ -944,97 +886,28 @@ void FaultHandlers::handle_clear_all_faults(const httplib::Request & req, httpli auto fault_mgr = ctx_.node()->get_fault_manager(); - // For non-App entities (Functions, Components, Areas), namespace_path is - // either empty or too broad for accurate filtering. Use the same FQN-based - // approach as handle_list_faults: collect host app FQNs and filter by - // reporting_source match. - json faults_to_clear; - - if (entity_info.type == EntityType::FUNCTION) { - auto result = fault_mgr->list_faults(""); - if (!result.success) { - HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to retrieve faults", - {{"details", result.error_message}, {entity_info.id_field, entity_id}}); - return; - } - const auto & cache = ctx_.node()->get_thread_safe_cache(); - auto agg_configs = cache.get_entity_configurations(entity_id); - std::set host_fqns; - for (const auto & node : agg_configs.nodes) { - if (!node.node_fqn.empty()) { - host_fqns.insert(node.node_fqn); - } - } - faults_to_clear = filter_faults_by_sources(result.data["faults"], host_fqns); - - } else if (entity_info.type == EntityType::COMPONENT) { - auto result = fault_mgr->list_faults(""); - if (!result.success) { - HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to retrieve faults", - {{"details", result.error_message}, {entity_info.id_field, entity_id}}); - return; - } - const auto & cache = ctx_.node()->get_thread_safe_cache(); - auto app_ids = cache.get_apps_for_component(entity_id); - std::set app_fqns; - for (const auto & app_id : app_ids) { - auto app = cache.get_app(app_id); - if (app) { - auto fqn = app->effective_fqn(); - if (!fqn.empty()) { - app_fqns.insert(std::move(fqn)); - } - } - } - faults_to_clear = filter_faults_by_sources(result.data["faults"], app_fqns); - - } else if (entity_info.type == EntityType::AREA) { - auto result = fault_mgr->list_faults(""); - if (!result.success) { - HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to retrieve faults", - {{"details", result.error_message}, {entity_info.id_field, entity_id}}); - return; - } - const auto & cache = ctx_.node()->get_thread_safe_cache(); - auto comp_ids = cache.get_components_for_area(entity_id); - std::set app_fqns; - for (const auto & comp_id : comp_ids) { - auto app_ids_inner = cache.get_apps_for_component(comp_id); - for (const auto & app_id : app_ids_inner) { - auto app = cache.get_app(app_id); - if (app) { - auto fqn = app->effective_fqn(); - if (!fqn.empty()) { - app_fqns.insert(std::move(fqn)); - } - } - } - } - faults_to_clear = filter_faults_by_sources(result.data["faults"], app_fqns); - - } else { - // Apps: scope by the app's effective FQN. Cannot use the transport-level - // prefix filter (`list_faults(namespace_path)`) directly because Apps - // with a wildcard `ros_binding.namespace_pattern` produce an empty - // effective_fqn, the transport silently disables filtering, and we - // would clear every fault in the system. Same bug class as #395. - auto result = fault_mgr->list_faults(""); - if (!result.success) { - HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to retrieve faults", - {{"details", result.error_message}, {entity_info.id_field, entity_id}}); - return; - } - const auto & cache = ctx_.node()->get_thread_safe_cache(); - auto app_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); - faults_to_clear = filter_faults_by_sources(result.data["faults"], app_fqns); + // Same scope rules as `handle_list_faults` and `handle_get_fault`: every + // entity type resolves through `HandlerContext::resolve_entity_source_fqns` + // so the area BFS, function-hosting-component expansion, and wildcard-app + // empty-set behavior stay consistent across all four fault routes. + auto result = fault_mgr->list_faults(""); + if (!result.success) { + HandlerContext::send_error(res, 503, ERR_SERVICE_UNAVAILABLE, "Failed to retrieve faults", + {{"details", result.error_message}, {entity_info.id_field, entity_id}}); + return; } + const auto & cache = ctx_.node()->get_thread_safe_cache(); + auto entity_fqns = HandlerContext::resolve_entity_source_fqns(cache, entity_info); + json faults_to_clear = filter_faults_by_sources(result.data["faults"], entity_fqns); - // Clear each matching fault + // Clear each matching fault. Use `skip_correlation_auto_clear=true` for + // the same reason as the single-fault DELETE: keep this entity's clear + // from cascading into correlated symptoms reported by other entities. if (faults_to_clear.is_array()) { for (const auto & fault : faults_to_clear) { if (fault.contains("fault_code")) { std::string fault_code = fault["fault_code"].get(); - auto clear_result = fault_mgr->clear_fault(fault_code); + auto clear_result = fault_mgr->clear_fault(fault_code, /*skip_correlation_auto_clear=*/true); if (!clear_result.success) { RCLCPP_WARN(HandlerContext::logger(), "Failed to clear fault '%s' for entity '%s': %s", fault_code.c_str(), entity_id.c_str(), clear_result.error_message.c_str()); diff --git a/src/ros2_medkit_gateway/src/ros2/transports/ros2_fault_service_transport.cpp b/src/ros2_medkit_gateway/src/ros2/transports/ros2_fault_service_transport.cpp index 1e531d63..a48dfcf2 100644 --- a/src/ros2_medkit_gateway/src/ros2/transports/ros2_fault_service_transport.cpp +++ b/src/ros2_medkit_gateway/src/ros2/transports/ros2_fault_service_transport.cpp @@ -281,7 +281,7 @@ FaultResult Ros2FaultServiceTransport::get_fault(const std::string & fault_code, return result; } -FaultResult Ros2FaultServiceTransport::clear_fault(const std::string & fault_code) { +FaultResult Ros2FaultServiceTransport::clear_fault(const std::string & fault_code, bool skip_correlation_auto_clear) { std::lock_guard lock(clear_mutex_); FaultResult result; @@ -294,6 +294,7 @@ FaultResult Ros2FaultServiceTransport::clear_fault(const std::string & fault_cod auto request = std::make_shared(); request->fault_code = fault_code; + request->skip_correlation_auto_clear = skip_correlation_auto_clear; auto future = clear_fault_client_->async_send_request(request); diff --git a/src/ros2_medkit_gateway/test/test_fault_manager_routing.cpp b/src/ros2_medkit_gateway/test/test_fault_manager_routing.cpp index 59fd6c5d..a30ba927 100644 --- a/src/ros2_medkit_gateway/test/test_fault_manager_routing.cpp +++ b/src/ros2_medkit_gateway/test/test_fault_manager_routing.cpp @@ -88,8 +88,9 @@ class MockFaultServiceTransport : public FaultServiceTransport { return r; } - FaultResult clear_fault(const std::string & fault_code) override { + FaultResult clear_fault(const std::string & fault_code, bool skip_correlation_auto_clear) override { last_clear_code_ = fault_code; + last_clear_skip_correlation_ = skip_correlation_auto_clear; ++clear_calls_; FaultResult r; r.success = clear_success_; @@ -180,6 +181,7 @@ class MockFaultServiceTransport : public FaultServiceTransport { json clear_data_ = json{{"success", true}, {"message", "ok"}}; std::string clear_error_; std::string last_clear_code_; + bool last_clear_skip_correlation_ = false; int clear_calls_ = 0; bool snapshots_success_ = true; @@ -316,6 +318,33 @@ TEST(FaultManagerRoutingTest, ClearFaultDelegatesAndReturnsAutoClearedCodes) { EXPECT_EQ(r.data["auto_cleared_codes"].size(), 2u); } +TEST(FaultManagerRoutingTest, ClearFaultDefaultsToCorrelationAutoClear) { + // Global `DELETE /api/v1/faults` and any unscoped caller should preserve + // the legacy auto-clear-with-root behaviour (skip flag defaults to false). + auto mock = std::make_shared(); + mock->clear_success_ = true; + FaultManager mgr(mock); + + mgr.clear_fault("ROOT"); + + EXPECT_EQ(mock->last_clear_code_, "ROOT"); + EXPECT_FALSE(mock->last_clear_skip_correlation_); +} + +TEST(FaultManagerRoutingTest, ClearFaultForwardsSkipCorrelationFlag) { + // Per-entity DELETE routes call `clear_fault(code, /*skip=*/true)` so the + // fault manager does NOT cascade-clear correlated symptoms reported by + // apps outside the addressed entity. Pin the routing here. + auto mock = std::make_shared(); + mock->clear_success_ = true; + FaultManager mgr(mock); + + mgr.clear_fault("ROOT", /*skip_correlation_auto_clear=*/true); + + EXPECT_EQ(mock->last_clear_code_, "ROOT"); + EXPECT_TRUE(mock->last_clear_skip_correlation_); +} + TEST(FaultManagerRoutingTest, GetSnapshotsRoutesTopicFilter) { auto mock = std::make_shared(); mock->snapshots_data_ = {{"topics", json::object()}}; diff --git a/src/ros2_medkit_msgs/srv/ClearFault.srv b/src/ros2_medkit_msgs/srv/ClearFault.srv index c8f3f883..cfa0b778 100644 --- a/src/ros2_medkit_msgs/srv/ClearFault.srv +++ b/src/ros2_medkit_msgs/srv/ClearFault.srv @@ -21,6 +21,14 @@ # Request fields string fault_code # The fault_code to clear + +# Disable auto-cleanup of correlated symptom faults. When false (default), +# clearing a root-cause fault also clears every symptom the correlation engine +# attributes to it (auto_clear_with_root rules). When true, only the requested +# fault_code is cleared and `auto_cleared_codes` in the response is empty. +# Set this from scoped per-entity DELETE routes so a viewer of one entity +# cannot cascade-clear symptom faults reported by apps in other entities. +bool skip_correlation_auto_clear --- # Response fields bool success # True if the fault was found and cleared