From 9d0fbaee77d417998335fb529b240154f517893f Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sun, 28 Jun 2026 21:05:27 +0200 Subject: [PATCH 01/11] feat(gateway): add secure-by-default field profile and hardening checklist Add gateway_params.secure.yaml turning on the existing JWT auth, TLS, restricted CORS and rate limiting for appliance / plant-network deployments, plus design/hardening.rst (checklist + credential/cert provisioning). The appliance image ships the preset; the demo params now point to it. --- .../config/gateway_params.secure.yaml | 117 ++++++++++++++++++ src/ros2_medkit_gateway/design/hardening.rst | 104 ++++++++++++++++ .../docker/Dockerfile.gateway | 6 + .../docker/gateway_params.yaml | 3 + 4 files changed, 230 insertions(+) create mode 100644 src/ros2_medkit_gateway/config/gateway_params.secure.yaml create mode 100644 src/ros2_medkit_gateway/design/hardening.rst diff --git a/src/ros2_medkit_gateway/config/gateway_params.secure.yaml b/src/ros2_medkit_gateway/config/gateway_params.secure.yaml new file mode 100644 index 000000000..8dc739dab --- /dev/null +++ b/src/ros2_medkit_gateway/config/gateway_params.secure.yaml @@ -0,0 +1,117 @@ +# ROS 2 Medkit Gateway - secure field profile +# +# Hardened parameter preset for on-prem / plant-network (appliance) +# deployments. It turns ON every control that the default development config +# leaves OFF: JWT auth, TLS, restricted CORS, and rate limiting. Use this file +# instead of gateway_params.yaml for any deployment reachable from an +# untrusted network. +# +# ros2 run ros2_medkit_gateway gateway_node \ +# --ros-args --params-file gateway_params.secure.yaml +# +# Items marked "REQUIRED" below must be set before the gateway will start / +# accept clients. See design/hardening.rst for the full checklist and +# credential / certificate provisioning steps. + +ros2_medkit_gateway: + ros__parameters: + server: + # Appliance binds all interfaces; TLS + auth below protect the surface. + # Narrow to a management interface IP when the deployment allows it. + host: "0.0.0.0" + port: 8443 + + # TLS/HTTPS - REQUIRED. Provision a real certificate (see + # scripts/generate_dev_certs.sh for the dev-only equivalent). The key + # file must be chmod 600 and owned by the gateway service user. + tls: + enabled: true + cert_file: "/etc/ros2_medkit/certs/cert.pem" # REQUIRED + key_file: "/etc/ros2_medkit/certs/key.pem" # REQUIRED (chmod 600) + ca_file: "" + # 1.3 preferred on a controlled fleet; drop to 1.2 only for legacy + # clients that cannot negotiate 1.3. + min_version: "1.3" + + # CORS - restrict to the explicit origins that serve the operator UI. + # Never use ["*"] with credentials. Empty list disables CORS entirely + # (correct for API-only appliances with no browser UI). + cors: + allowed_origins: ["https://medkit-ui.local"] # REQUIRED if a browser UI is used; else [""] + allowed_methods: ["GET", "PUT", "POST", "DELETE", "OPTIONS"] + allowed_headers: ["Content-Type", "Accept", "Authorization"] + allow_credentials: true + max_age_seconds: 600 + + # Authentication (JWT + RBAC) - REQUIRED on. + auth: + enabled: true + # HS256 shared secret (>= 32 chars) or, for RS256, the private key path. + # REQUIRED - inject from a secret store / env at deploy time; do NOT + # commit a real secret to source control. + jwt_secret: "" # REQUIRED + jwt_public_key: "" + jwt_algorithm: "HS256" + token_expiry_seconds: 3600 + refresh_token_expiry_seconds: 86400 + # "all" forces auth on every request (reads + writes). Use "write" only + # when unauthenticated reads are explicitly acceptable on this network. + require_auth_for: "all" + issuer: "ros2_medkit_gateway" + # Provision the minimum set of role-scoped clients. Format: + # "client_id:client_secret:role" (roles: viewer/operator/configurator/admin). + # REQUIRED - replace with real, rotated credentials. + clients: [""] # REQUIRED + + # Rate limiting - ON to bound abuse / runaway clients. + rate_limiting: + enabled: true + global_requests_per_minute: 600 + client_requests_per_minute: 120 + # Tighten mutating endpoints (operations / data writes). + endpoint_limits: ["/api/v1/*/operations/*:30"] + client_cleanup_interval_seconds: 300 + client_max_idle_seconds: 600 + + # Diagnostic scripts - disabled by default on an appliance. Enable + # deliberately and keep uploads off (manifest-defined scripts only). + scripts: + scripts_dir: "" + allow_uploads: false + max_file_size_mb: 10 + max_concurrent_executions: 5 + default_timeout_sec: 300 + max_execution_history: 100 + + # Bulk data uploads - cap the payload size; raise only if the deployment + # genuinely needs large uploads. + bulk_data: + storage_dir: "/var/lib/ros2_medkit/bulk_data" + max_upload_size: 26214400 # 25 MiB + categories: [""] + + # SOVD resource locking on, so concurrent operators cannot stamp on each + # other's mutations. + locking: + enabled: true + default_max_expiration: 3600 + cleanup_interval: 30 + defaults: + components: + lock_required_scopes: ["operations"] + breakable: true + apps: + lock_required_scopes: ["operations"] + breakable: true + + # OpenAPI /docs endpoints off on a hardened appliance (reduce surface). + docs: + enabled: false + + # Peer aggregation: if used across hosts, require TLS and do NOT forward + # client tokens to mDNS-discovered peers unless every peer is trusted. + aggregation: + enabled: false + require_tls: true + forward_auth: false + peer_scheme: "https" diff --git a/src/ros2_medkit_gateway/design/hardening.rst b/src/ros2_medkit_gateway/design/hardening.rst new file mode 100644 index 000000000..c1815e09e --- /dev/null +++ b/src/ros2_medkit_gateway/design/hardening.rst @@ -0,0 +1,104 @@ +Gateway hardening (secure field profile) +======================================== + +The gateway ships every transport and access control needed for a hardened +deployment - JWT authentication with RBAC, TLS/HTTPS, restricted CORS, and +token-bucket rate limiting - but they are **disabled by default** so local +development works out of the box. A gateway exposed on a plant network with the +defaults is wide open: unauthenticated reads and writes over cleartext HTTP. + +For any deployment reachable from an untrusted network, start from the secure +field profile preset ``config/gateway_params.secure.yaml`` instead of +``config/gateway_params.yaml``: + +.. code-block:: bash + + ros2 run ros2_medkit_gateway gateway_node \ + --ros-args --params-file gateway_params.secure.yaml \ + -p auth.jwt_secret:="$MEDKIT_JWT_SECRET" \ + -p 'auth.clients:=["operator:'"$OP_SECRET"':operator"]' + +What the secure profile turns on +-------------------------------- + +============================ ============== =========================================== +Control Default Secure profile +============================ ============== =========================================== +``auth.enabled`` false true +``auth.require_auth_for`` write all (auth on reads + writes) +``server.tls.enabled`` false true (HTTPS, min TLS 1.3) +``cors.allowed_origins`` ``[""]`` explicit origin list (no wildcard) +``rate_limiting.enabled`` false true (global + per-client + per-endpoint) +``scripts.allow_uploads`` true false (manifest-defined scripts only) +``docs.enabled`` true false (reduced surface) +``bulk_data.max_upload`` 100 MiB 25 MiB +``locking`` on operations none lock required before mutation +============================ ============== =========================================== + +Credential and certificate provisioning +---------------------------------------- + +1. **TLS certificate.** Provision a real server certificate + private key and + point ``server.tls.cert_file`` / ``server.tls.key_file`` at them. The key + file must be ``chmod 600`` and owned by the gateway service user. For a + dev/test box only, ``scripts/generate_dev_certs.sh`` emits a self-signed + ``cert.pem`` / ``key.pem`` / ``ca.pem`` (never use these in production). + +2. **JWT secret.** Generate a high-entropy secret of at least 32 characters + (HS256) or provision an RS256 key pair. Inject it at deploy time from a + secret store or environment variable - do not commit it to source control. + +3. **Role-scoped clients.** Create the minimum set of clients in + ``auth.clients`` (``client_id:client_secret:role``). Roles, least to most + privileged: ``viewer`` (read), ``operator`` (+ trigger ops / ack faults / + publish), ``configurator`` (+ modify configs), ``admin`` (+ auth + management). Rotate secrets periodically. + +4. **Obtain a token** and call the API over HTTPS: + + .. code-block:: bash + + curl -sk -X POST https://gateway:8443/api/v1/auth/authorize \ + -H 'Content-Type: application/json' \ + -d '{"client_id":"operator","client_secret":"...","grant_type":"client_credentials"}' + # use the returned access_token: + curl -sk https://gateway:8443/api/v1/faults -H "Authorization: Bearer $TOKEN" + +Hardening checklist +------------------- + +Before exposing a gateway on a shared / plant network, confirm: + +- [ ] ``auth.enabled: true`` and ``auth.require_auth_for`` is ``all`` (or + ``write`` only if unauthenticated reads are explicitly acceptable). +- [ ] ``auth.jwt_secret`` is set to a >= 32-char secret injected from a secret + store (not the placeholder, not in version control). +- [ ] ``auth.clients`` lists only the role-scoped clients you need; default / + example credentials removed; secrets rotated. +- [ ] ``server.tls.enabled: true`` with a real certificate; private key is + ``chmod 600``; ``min_version`` is ``1.3`` (or ``1.2`` only for legacy + clients). +- [ ] ``cors.allowed_origins`` is an explicit list (no ``*``); ``*`` is never + combined with ``allow_credentials: true``. +- [ ] ``rate_limiting.enabled: true`` with per-client and mutating-endpoint + limits tuned to the deployment. +- [ ] ``scripts.allow_uploads: false`` unless remote script upload is a + required, reviewed capability. +- [ ] ``bulk_data.max_upload_size`` bounded to what the deployment needs. +- [ ] If peer aggregation is used: ``aggregation.require_tls: true`` and + ``forward_auth`` only enabled when every peer is trusted. +- [ ] Bind ``server.host`` to a management interface where the network layout + allows, and place the gateway behind the plant firewall / segmentation. +- [ ] Back the gateway with persistent storage on a volume with restricted + permissions (faults DB, triggers DB, rosbag snapshots). + +OPC-UA plugin (southbound) hardening +------------------------------------ + +The gateway controls the northbound REST surface; the OPC-UA plugin controls +the southbound connection to the PLC. Harden both. The plugin supports +SecurityPolicy (Basic256Sha256 / Aes128 / Aes256), MessageSecurityMode +(Sign / SignAndEncrypt), a client application-instance certificate, a server +trust store with reject-untrusted, and user identity (anonymous / +username-password / X.509). See ``ros2_medkit_opcua`` README, section +"OPC-UA client security". diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/Dockerfile.gateway b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/Dockerfile.gateway index bb8d68a78..2887336b1 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/Dockerfile.gateway +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/Dockerfile.gateway @@ -64,5 +64,11 @@ COPY --from=builder ${COLCON_WS}/install/ ${COLCON_WS}/install/ RUN mkdir -p /config COPY src/ros2_medkit_plugins/ros2_medkit_opcua/config/tank_demo_nodes.yaml /config/tank_nodes.yaml COPY src/ros2_medkit_plugins/ros2_medkit_opcua/docker/gateway_params.yaml /config/gateway_params.yaml +# Secure field profile for appliance deployments (auth + TLS + restricted CORS +# + rate limiting on). The demo gateway_params.yaml above is wide open and is +# for local testing only. Appliance / plant-network deployments must launch +# with --params-file /config/gateway_params.secure.yaml and provision the +# REQUIRED secrets + certificate (see ros2_medkit_gateway design/hardening.rst). +COPY src/ros2_medkit_gateway/config/gateway_params.secure.yaml /config/gateway_params.secure.yaml EXPOSE 8080 diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/gateway_params.yaml b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/gateway_params.yaml index 9ea92f6d3..a331166cb 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/gateway_params.yaml +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/gateway_params.yaml @@ -1,3 +1,6 @@ +# DEMO ONLY - wide open (no auth, no TLS, CORS "*"). For appliance / plant +# deployments use /config/gateway_params.secure.yaml instead (auth + TLS + +# restricted CORS + rate limiting). See ros2_medkit_gateway design/hardening.rst. ros2_medkit_gateway: ros__parameters: server: From 407f675310f6bb678ceb10357d90476299bc92db Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sun, 28 Jun 2026 21:08:58 +0200 Subject: [PATCH 02/11] feat(opcua): add OPC-UA client security (policy, certs, user auth) Extend OpcuaClientConfig + connect() with SecurityPolicy, MessageSecurityMode, a client application-instance certificate, a server trust store with reject-untrusted, and user identity (anonymous / username-password / X.509). Compile in the OpenSSL encryption backend. Anonymous + None stay the default. --- .../ros2_medkit_opcua/CMakeLists.txt | 6 + .../ros2_medkit_opcua/opcua_client.hpp | 78 +++++- .../ros2_medkit_opcua/opcua_plugin.hpp | 4 + .../ros2_medkit_opcua/src/opcua_client.cpp | 243 +++++++++++++++++- .../ros2_medkit_opcua/src/opcua_plugin.cpp | 155 ++++++++++- .../test/test_opcua_client.cpp | 75 ++++++ 6 files changed, 554 insertions(+), 7 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt b/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt index d66c3ecce..54e54c463 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt @@ -55,6 +55,12 @@ fetchcontent_declare( set(UAPP_INTERNAL_OPEN62541 ON CACHE BOOL "" FORCE) set(CMAKE_POSITION_INDEPENDENT_CODE ON CACHE BOOL "" FORCE) +# Enable the OpenSSL encryption backend so the client can negotiate signed / +# encrypted SecureChannels (SecurityPolicy Basic256Sha256 etc.) and present a +# client application-instance certificate. Without this open62541 is built +# SecurityPolicy=None only and the encrypted ClientConfig overloads compile out. +set(UA_ENABLE_ENCRYPTION "OPENSSL" CACHE STRING "" FORCE) + # open62541 + open62541pp are third-party code pulled via FetchContent. # ROS2MedkitWarnings promotes -Wswitch-enum, -Wnull-dereference, and other # warnings to errors project-wide, and those fire on upstream C sources that diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp index ce058ad77..41ecc0d8b 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp @@ -42,12 +42,62 @@ struct ReadResult { /// Callback for subscription data changes using DataChangeCallback = std::function; +/// OPC-UA SecurityPolicy selector. Maps to the policy URI sent during the +/// SecureChannel handshake. ``None`` is the only value that works without a +/// client application-instance certificate; everything else requires +/// ``client_cert_path`` + ``client_key_path`` and an encryption-enabled build. +enum class SecurityPolicy { None, Basic256Sha256, Aes128Sha256RsaOaep, Aes256Sha256RsaPss }; + +/// OPC-UA MessageSecurityMode. ``None`` = cleartext; ``Sign`` = signed but not +/// encrypted; ``SignAndEncrypt`` = signed + encrypted. Independent of the +/// SecurityPolicy selector above (the server endpoint must offer the pair). +enum class SecurityMode { None, Sign, SignAndEncrypt }; + +/// Session user identity token type. ``Anonymous`` needs no credentials; +/// ``UsernamePassword`` uses ``username``/``password``; ``X509`` presents a +/// user certificate (``user_cert_path``). +enum class UserAuthMode { Anonymous, UsernamePassword, X509 }; + /// Configuration for OPC-UA connection struct OpcuaClientConfig { std::string endpoint_url = "opc.tcp://localhost:4840"; std::chrono::milliseconds connect_timeout{5000}; std::chrono::milliseconds reconnect_interval{3000}; - // TODO: OPC-UA security (certificates, Basic256Sha256) + + // --- SecureChannel security (opt-in; defaults reproduce the legacy + // anonymous + SecurityPolicy=None behaviour) --- + SecurityPolicy security_policy{SecurityPolicy::None}; + SecurityMode security_mode{SecurityMode::None}; + + /// Client application-instance certificate (X.509 v3, DER-encoded) and its + /// private key (PEM-encoded). Required for any SecurityPolicy other than + /// None. Empty when running unsecured. + std::string client_cert_path; + std::string client_key_path; + + /// Application URI advertised by the client. MUST match the URI entry in + /// the certificate's SubjectAltName, otherwise the server rejects the + /// SecureChannel with BadCertificateUriInvalid. Empty leaves the + /// open62541 default ("urn:open62541.client.application"). + std::string application_uri; + + /// Trusted server / CA certificates (DER-encoded) forming the trust store. + /// Used to validate the server certificate when ``reject_untrusted`` is + /// true. + std::vector trust_list_paths; + + /// When true (default) the server certificate must chain to an entry in + /// ``trust_list_paths``; an untrusted server is rejected. When false the + /// client accepts any server certificate (lab / trust-on-first-use only). + bool reject_untrusted{true}; + + // --- Session user identity --- + UserAuthMode user_auth_mode{UserAuthMode::Anonymous}; + std::string username; + std::string password; + /// User X.509 token certificate (DER-encoded), used when + /// ``user_auth_mode == X509``. + std::string user_cert_path; }; /// RAII wrapper around open62541pp::Client with auto-reconnect @@ -213,6 +263,32 @@ class OpcuaClient { /// Get server description string (for status endpoint) std::string server_description() const; + // --- Security config parsing helpers (pure, unit-testable without a + // server). Case-insensitive; unknown input falls back to the safe default + // and sets ``*ok = false`` when provided. --- + + /// Parse a SecurityPolicy name ("None", "Basic256Sha256", + /// "Aes128Sha256RsaOaep", "Aes256Sha256RsaPss"). Falls back to None. + static SecurityPolicy parse_security_policy(const std::string & name, bool * ok = nullptr); + + /// Map a SecurityPolicy to its OPC-UA policy URI + /// (e.g. "http://opcfoundation.org/UA/SecurityPolicy#Basic256Sha256"). + static std::string security_policy_uri(SecurityPolicy policy); + + /// Parse a MessageSecurityMode name ("None", "Sign", "SignAndEncrypt"). + /// Falls back to None. + static SecurityMode parse_security_mode(const std::string & name, bool * ok = nullptr); + + /// Parse a user-identity mode ("Anonymous", "Username"/"UsernamePassword", + /// "X509"/"Certificate"). Falls back to Anonymous. + static UserAuthMode parse_user_auth_mode(const std::string & name, bool * ok = nullptr); + + /// True when the config requests a secured SecureChannel (any + /// SecurityPolicy other than None, or any MessageSecurityMode other than + /// None). Username/password or X.509 identity alone does NOT imply an + /// encrypted channel. + static bool requires_secure_channel(const OpcuaClientConfig & config); + private: struct Impl; std::unique_ptr impl_; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_plugin.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_plugin.hpp index 5141243d7..e655d49d1 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_plugin.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_plugin.hpp @@ -126,6 +126,10 @@ class OpcuaPlugin : public ros2_medkit_gateway::GatewayPlugin, // Publish PLC values to ROS 2 topics (called after each poll) void publish_values(const PollSnapshot & snap); + // Log the effective OPC-UA security profile (policy / mode / user auth) at + // startup; warns when running unsecured. + void log_security_profile() const; + // Build JSON response for data endpoint nlohmann::json build_data_response(const std::string & entity_id) const; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp index f26a13e50..16a9c72e0 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp @@ -16,8 +16,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -26,6 +28,10 @@ #include #include +#include +#ifdef UA_ENABLE_ENCRYPTION +#include +#endif #include #include @@ -134,6 +140,26 @@ OpcuaValue variant_to_value(const opcua::Variant & var) { return std::string(""); } +// Lower-case copy for case-insensitive config parsing. +std::string to_lower(const std::string & s) { + std::string out = s; + std::transform(out.begin(), out.end(), out.begin(), [](unsigned char c) { + return static_cast(std::tolower(c)); + }); + return out; +} + +// Read a whole file into an opcua::ByteString (binary-safe). Returns an empty +// ByteString on failure; the caller treats empty cert/key as a config error. +opcua::ByteString read_file_bytes(const std::string & path) { + std::ifstream f(path, std::ios::binary); + if (!f) { + return opcua::ByteString{}; + } + std::string data((std::istreambuf_iterator(f)), std::istreambuf_iterator()); + return opcua::ByteString(std::string_view(data)); +} + opcua::Variant value_to_variant(const OpcuaValue & val) { return std::visit( [](auto && v) -> opcua::Variant { @@ -199,6 +225,212 @@ OpcuaClient::~OpcuaClient() { disconnect(); } +// --- Security config helpers (pure / unit-testable) --- + +SecurityPolicy OpcuaClient::parse_security_policy(const std::string & name, bool * ok) { + if (ok) { + *ok = true; + } + const std::string n = to_lower(name); + if (n.empty() || n == "none") { + return SecurityPolicy::None; + } + if (n == "basic256sha256") { + return SecurityPolicy::Basic256Sha256; + } + if (n == "aes128sha256rsaoaep" || n == "aes128_sha256_rsaoaep") { + return SecurityPolicy::Aes128Sha256RsaOaep; + } + if (n == "aes256sha256rsapss" || n == "aes256_sha256_rsapss") { + return SecurityPolicy::Aes256Sha256RsaPss; + } + if (ok) { + *ok = false; + } + return SecurityPolicy::None; +} + +std::string OpcuaClient::security_policy_uri(SecurityPolicy policy) { + switch (policy) { + case SecurityPolicy::None: + return "http://opcfoundation.org/UA/SecurityPolicy#None"; + case SecurityPolicy::Basic256Sha256: + return "http://opcfoundation.org/UA/SecurityPolicy#Basic256Sha256"; + case SecurityPolicy::Aes128Sha256RsaOaep: + return "http://opcfoundation.org/UA/SecurityPolicy#Aes128_Sha256_RsaOaep"; + case SecurityPolicy::Aes256Sha256RsaPss: + return "http://opcfoundation.org/UA/SecurityPolicy#Aes256_Sha256_RsaPss"; + } + return "http://opcfoundation.org/UA/SecurityPolicy#None"; +} + +SecurityMode OpcuaClient::parse_security_mode(const std::string & name, bool * ok) { + if (ok) { + *ok = true; + } + const std::string n = to_lower(name); + if (n.empty() || n == "none") { + return SecurityMode::None; + } + if (n == "sign") { + return SecurityMode::Sign; + } + if (n == "signandencrypt" || n == "sign_and_encrypt") { + return SecurityMode::SignAndEncrypt; + } + if (ok) { + *ok = false; + } + return SecurityMode::None; +} + +UserAuthMode OpcuaClient::parse_user_auth_mode(const std::string & name, bool * ok) { + if (ok) { + *ok = true; + } + const std::string n = to_lower(name); + if (n.empty() || n == "anonymous") { + return UserAuthMode::Anonymous; + } + if (n == "username" || n == "usernamepassword" || n == "username_password" || n == "password") { + return UserAuthMode::UsernamePassword; + } + if (n == "x509" || n == "certificate" || n == "cert") { + return UserAuthMode::X509; + } + if (ok) { + *ok = false; + } + return UserAuthMode::Anonymous; +} + +bool OpcuaClient::requires_secure_channel(const OpcuaClientConfig & config) { + return config.security_policy != SecurityPolicy::None || config.security_mode != SecurityMode::None; +} + +namespace { + +#ifdef UA_ENABLE_ENCRYPTION +opcua::MessageSecurityMode to_ua_security_mode(SecurityMode mode) { + switch (mode) { + case SecurityMode::None: + return opcua::MessageSecurityMode::None; + case SecurityMode::Sign: + return opcua::MessageSecurityMode::Sign; + case SecurityMode::SignAndEncrypt: + return opcua::MessageSecurityMode::SignAndEncrypt; + } + return opcua::MessageSecurityMode::None; +} +#endif + +// Rebuild ``client`` with the SecureChannel + user-identity profile requested +// by ``cfg``. A fresh ClientConfig is mandatory for secured connections +// because the application-instance certificate and trust list can only be +// supplied at construction (UA_ClientConfig_setDefaultEncryption). Returns +// false on a fatal configuration error (already logged); the caller then +// reports the connect as failed without contacting the server. +bool apply_security_config(opcua::Client & client, const OpcuaClientConfig & cfg) { + const bool secure = OpcuaClient::requires_secure_channel(cfg); + + if (!secure) { + // Unsecured channel (SecurityPolicy=None, MessageSecurityMode=None). A + // username/password or X.509 identity may still be applied below. + client = opcua::Client(); + } else { +#ifndef UA_ENABLE_ENCRYPTION + RCLCPP_ERROR(opcua_client_logger(), + "OPC-UA SecurityPolicy/MessageSecurityMode requested but this build has no " + "encryption support (UA_ENABLE_ENCRYPTION is off)"); + return false; +#else + if (cfg.client_cert_path.empty() || cfg.client_key_path.empty()) { + RCLCPP_ERROR(opcua_client_logger(), "OPC-UA secured connection requires client_cert_path and client_key_path"); + return false; + } + auto cert = read_file_bytes(cfg.client_cert_path); + auto key = read_file_bytes(cfg.client_key_path); + if (cert.empty() || key.empty()) { + RCLCPP_ERROR(opcua_client_logger(), "OPC-UA: failed to read client cert/key ('%s' / '%s')", + cfg.client_cert_path.c_str(), cfg.client_key_path.c_str()); + return false; + } + + std::vector trust; + trust.reserve(cfg.trust_list_paths.size()); + for (const auto & p : cfg.trust_list_paths) { + auto b = read_file_bytes(p); + if (b.empty()) { + RCLCPP_WARN(opcua_client_logger(), "OPC-UA: skipping unreadable trust-list entry '%s'", p.c_str()); + continue; + } + trust.push_back(std::move(b)); + } + + opcua::ClientConfig cc(cert, key, trust, {}); + + // Force the requested SecurityPolicy. An empty URI lets open62541 pick the + // highest the server offers; we want explicit selection so a downgraded + // server endpoint cannot silently weaken the channel. + const std::string policy_uri = OpcuaClient::security_policy_uri(cfg.security_policy); + if (cfg.security_policy != SecurityPolicy::None && !policy_uri.empty()) { + UA_String_clear(&cc.handle()->securityPolicyUri); + cc.handle()->securityPolicyUri = UA_STRING_ALLOC(policy_uri.c_str()); + } + + cc.setSecurityMode(to_ua_security_mode(cfg.security_mode)); + + // The application URI must match the URI SAN in the client certificate or + // the server rejects the channel with BadCertificateUriInvalid. + if (!cfg.application_uri.empty()) { + UA_String_clear(&cc.handle()->clientDescription.applicationUri); + cc.handle()->clientDescription.applicationUri = UA_STRING_ALLOC(cfg.application_uri.c_str()); + } + + // Trust handling: by default validate the server certificate against the + // trust list. When reject_untrusted is false, accept any server + // certificate (lab / trust-on-first-use only). + if (!cfg.reject_untrusted) { + auto & cv = cc.handle()->certificateVerification; + if (cv.clear) { + cv.clear(&cv); + } + UA_CertificateVerification_AcceptAll(&cv); + } + + client = opcua::Client(std::move(cc)); +#endif + } + + // User identity token (applied regardless of channel encryption). + switch (cfg.user_auth_mode) { + case UserAuthMode::Anonymous: + client.config().setUserIdentityToken(opcua::AnonymousIdentityToken{}); + break; + case UserAuthMode::UsernamePassword: + client.config().setUserIdentityToken(opcua::UserNameIdentityToken(cfg.username, cfg.password)); + break; + case UserAuthMode::X509: { +#ifdef UA_ENABLE_ENCRYPTION + auto ucert = read_file_bytes(cfg.user_cert_path); + if (ucert.empty()) { + RCLCPP_ERROR(opcua_client_logger(), "OPC-UA X.509 user auth: failed to read user_cert_path '%s'", + cfg.user_cert_path.c_str()); + return false; + } + client.config().setUserIdentityToken(opcua::X509IdentityToken(std::move(ucert))); +#else + RCLCPP_ERROR(opcua_client_logger(), "OPC-UA X.509 user auth requires an encryption-enabled build"); + return false; +#endif + break; + } + } + return true; +} + +} // namespace + bool OpcuaClient::connect(const OpcuaClientConfig & config) { std::lock_guard lock(impl_->client_mutex); impl_->config = config; @@ -208,6 +440,14 @@ bool OpcuaClient::connect(const OpcuaClientConfig & config) { impl_->connected = true; return true; } + + // (Re)build the client with the requested security profile before every + // connect so reconnects re-apply the same SecureChannel settings. + if (!apply_security_config(impl_->client, config)) { + impl_->connected = false; + return false; + } + impl_->client.config().setTimeout(static_cast(config.connect_timeout.count())); impl_->client.connect(config.endpoint_url); impl_->connected = true; @@ -223,7 +463,8 @@ bool OpcuaClient::connect(const OpcuaClientConfig & config) { } return true; - } catch (const opcua::BadStatus &) { + } catch (const opcua::BadStatus & e) { + RCLCPP_WARN(opcua_client_logger(), "OPC-UA connect to '%s' failed: %s", config.endpoint_url.c_str(), e.what()); impl_->connected = false; return false; } diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp index e4960df78..9e2f1401f 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp @@ -115,8 +115,49 @@ void OpcuaPlugin::configure(const nlohmann::json & config) { if (config.contains("endpoint_url")) { client_config_.endpoint_url = config["endpoint_url"].get(); } - // NOTE: OPC-UA security (username/password, certificates) not yet implemented. - // Connect uses Anonymous auth with SecurityPolicy=None. + + // OPC-UA SecureChannel security + user identity. All opt-in; defaults keep + // the legacy anonymous + SecurityPolicy=None behaviour. + if (config.contains("security_policy")) { + client_config_.security_policy = OpcuaClient::parse_security_policy(config["security_policy"].get()); + } + if (config.contains("security_mode") || config.contains("message_security_mode")) { + const std::string mode_str = config.contains("security_mode") ? config["security_mode"].get() + : config["message_security_mode"].get(); + client_config_.security_mode = OpcuaClient::parse_security_mode(mode_str); + } + if (config.contains("client_cert_path")) { + client_config_.client_cert_path = config["client_cert_path"].get(); + } + if (config.contains("client_key_path")) { + client_config_.client_key_path = config["client_key_path"].get(); + } + if (config.contains("application_uri")) { + client_config_.application_uri = config["application_uri"].get(); + } + if (config.contains("trust_list_paths") && config["trust_list_paths"].is_array()) { + client_config_.trust_list_paths.clear(); + for (const auto & p : config["trust_list_paths"]) { + if (p.is_string() && !p.get().empty()) { + client_config_.trust_list_paths.push_back(p.get()); + } + } + } + if (config.contains("reject_untrusted")) { + client_config_.reject_untrusted = config["reject_untrusted"].get(); + } + if (config.contains("user_auth_mode")) { + client_config_.user_auth_mode = OpcuaClient::parse_user_auth_mode(config["user_auth_mode"].get()); + } + if (config.contains("username")) { + client_config_.username = config["username"].get(); + } + if (config.contains("password")) { + client_config_.password = config["password"].get(); + } + if (config.contains("user_cert_path")) { + client_config_.user_cert_path = config["user_cert_path"].get(); + } if (config.contains("node_map_path")) { node_map_path_ = config["node_map_path"].get(); @@ -140,6 +181,58 @@ void OpcuaPlugin::configure(const nlohmann::json & config) { if (auto * env = std::getenv("OPCUA_NODE_MAP_PATH")) { node_map_path_ = env; } + // Security env overrides (for Docker / appliance deployments). + if (auto * env = std::getenv("OPCUA_SECURITY_POLICY")) { + client_config_.security_policy = OpcuaClient::parse_security_policy(env); + } + if (auto * env = std::getenv("OPCUA_SECURITY_MODE")) { + client_config_.security_mode = OpcuaClient::parse_security_mode(env); + } + if (auto * env = std::getenv("OPCUA_CLIENT_CERT")) { + client_config_.client_cert_path = env; + } + if (auto * env = std::getenv("OPCUA_CLIENT_KEY")) { + client_config_.client_key_path = env; + } + if (auto * env = std::getenv("OPCUA_APPLICATION_URI")) { + client_config_.application_uri = env; + } + if (auto * env = std::getenv("OPCUA_TRUST_LIST")) { + // Colon-separated list of DER trust-store paths. + client_config_.trust_list_paths.clear(); + std::string list = env; + size_t start = 0; + while (start <= list.size()) { + size_t sep = list.find(':', start); + std::string path = list.substr(start, sep == std::string::npos ? std::string::npos : sep - start); + if (!path.empty()) { + client_config_.trust_list_paths.push_back(path); + } + if (sep == std::string::npos) { + break; + } + start = sep + 1; + } + } + if (auto * env = std::getenv("OPCUA_REJECT_UNTRUSTED")) { + std::string v = env; + std::transform(v.begin(), v.end(), v.begin(), [](unsigned char c) { + return static_cast(std::tolower(c)); + }); + client_config_.reject_untrusted = !(v == "0" || v == "false" || v == "no"); + } + if (auto * env = std::getenv("OPCUA_USER_AUTH")) { + client_config_.user_auth_mode = OpcuaClient::parse_user_auth_mode(env); + } + if (auto * env = std::getenv("OPCUA_USERNAME")) { + client_config_.username = env; + } + if (auto * env = std::getenv("OPCUA_PASSWORD")) { + client_config_.password = env; + } + if (auto * env = std::getenv("OPCUA_USER_CERT")) { + client_config_.user_cert_path = env; + } if (!node_map_path_.empty()) { if (!node_map_.load(node_map_path_)) { @@ -177,9 +270,7 @@ void OpcuaPlugin::set_context(PluginContext & context) { } } - log_warn( - "OPC-UA security: Anonymous auth, SecurityPolicy=None. " - "Not suitable for untrusted networks."); + log_security_profile(); if (client_->connect(client_config_)) { log_info("Connected to OPC-UA server: " + client_config_.endpoint_url); @@ -660,6 +751,60 @@ void OpcuaPlugin::publish_values(const PollSnapshot & snap) { } } +void OpcuaPlugin::log_security_profile() const { + auto policy_name = [](SecurityPolicy p) -> std::string { + switch (p) { + case SecurityPolicy::None: + return "None"; + case SecurityPolicy::Basic256Sha256: + return "Basic256Sha256"; + case SecurityPolicy::Aes128Sha256RsaOaep: + return "Aes128Sha256RsaOaep"; + case SecurityPolicy::Aes256Sha256RsaPss: + return "Aes256Sha256RsaPss"; + } + return "None"; + }; + auto mode_name = [](SecurityMode m) -> std::string { + switch (m) { + case SecurityMode::None: + return "None"; + case SecurityMode::Sign: + return "Sign"; + case SecurityMode::SignAndEncrypt: + return "SignAndEncrypt"; + } + return "None"; + }; + auto auth_name = [](UserAuthMode a) -> std::string { + switch (a) { + case UserAuthMode::Anonymous: + return "Anonymous"; + case UserAuthMode::UsernamePassword: + return "Username/Password"; + case UserAuthMode::X509: + return "X.509"; + } + return "Anonymous"; + }; + + const std::string profile = "OPC-UA security: SecurityPolicy=" + policy_name(client_config_.security_policy) + + ", MessageSecurityMode=" + mode_name(client_config_.security_mode) + + ", user=" + auth_name(client_config_.user_auth_mode); + + const bool unsecured = + !OpcuaClient::requires_secure_channel(client_config_) && client_config_.user_auth_mode == UserAuthMode::Anonymous; + if (unsecured) { + log_warn(profile + ". Unsecured - not suitable for untrusted networks."); + } else { + if (!client_config_.reject_untrusted) { + log_warn(profile + ". WARNING: reject_untrusted=false (accepts any server certificate)."); + } else { + log_info(profile); + } + } +} + nlohmann::json OpcuaPlugin::build_data_response(const std::string & entity_id) const { auto entries = node_map_.entries_for_entity(entity_id); auto snap = poller_->snapshot(); diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp index 66eb59056..6f23b3d6b 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp @@ -92,6 +92,81 @@ TEST(OpcuaClientTest, WriteValueWithTypeHintDisconnected) { EXPECT_EQ(result.error().code, OpcuaClient::WriteError::NotConnected); } +// --------------------------------------------------------------------------- +// Issue #389: OPC-UA client security config parsing (pure helpers, no server). +// --------------------------------------------------------------------------- + +TEST(OpcuaClientSecurity, ParseSecurityPolicy) { + bool ok = false; + EXPECT_EQ(OpcuaClient::parse_security_policy("None", &ok), SecurityPolicy::None); + EXPECT_TRUE(ok); + EXPECT_EQ(OpcuaClient::parse_security_policy("", &ok), SecurityPolicy::None); + EXPECT_TRUE(ok); + EXPECT_EQ(OpcuaClient::parse_security_policy("basic256sha256"), SecurityPolicy::Basic256Sha256); + EXPECT_EQ(OpcuaClient::parse_security_policy("Basic256Sha256"), SecurityPolicy::Basic256Sha256); + EXPECT_EQ(OpcuaClient::parse_security_policy("Aes128Sha256RsaOaep"), SecurityPolicy::Aes128Sha256RsaOaep); + EXPECT_EQ(OpcuaClient::parse_security_policy("Aes256_Sha256_RsaPss"), SecurityPolicy::Aes256Sha256RsaPss); + // Unknown -> safe default + ok=false. + EXPECT_EQ(OpcuaClient::parse_security_policy("Bogus", &ok), SecurityPolicy::None); + EXPECT_FALSE(ok); +} + +TEST(OpcuaClientSecurity, SecurityPolicyUri) { + EXPECT_EQ(OpcuaClient::security_policy_uri(SecurityPolicy::None), "http://opcfoundation.org/UA/SecurityPolicy#None"); + EXPECT_EQ(OpcuaClient::security_policy_uri(SecurityPolicy::Basic256Sha256), + "http://opcfoundation.org/UA/SecurityPolicy#Basic256Sha256"); + EXPECT_EQ(OpcuaClient::security_policy_uri(SecurityPolicy::Aes128Sha256RsaOaep), + "http://opcfoundation.org/UA/SecurityPolicy#Aes128_Sha256_RsaOaep"); + EXPECT_EQ(OpcuaClient::security_policy_uri(SecurityPolicy::Aes256Sha256RsaPss), + "http://opcfoundation.org/UA/SecurityPolicy#Aes256_Sha256_RsaPss"); +} + +TEST(OpcuaClientSecurity, ParseSecurityMode) { + bool ok = false; + EXPECT_EQ(OpcuaClient::parse_security_mode("none", &ok), SecurityMode::None); + EXPECT_TRUE(ok); + EXPECT_EQ(OpcuaClient::parse_security_mode("Sign"), SecurityMode::Sign); + EXPECT_EQ(OpcuaClient::parse_security_mode("SignAndEncrypt"), SecurityMode::SignAndEncrypt); + EXPECT_EQ(OpcuaClient::parse_security_mode("sign_and_encrypt"), SecurityMode::SignAndEncrypt); + EXPECT_EQ(OpcuaClient::parse_security_mode("bogus", &ok), SecurityMode::None); + EXPECT_FALSE(ok); +} + +TEST(OpcuaClientSecurity, ParseUserAuthMode) { + EXPECT_EQ(OpcuaClient::parse_user_auth_mode("anonymous"), UserAuthMode::Anonymous); + EXPECT_EQ(OpcuaClient::parse_user_auth_mode(""), UserAuthMode::Anonymous); + EXPECT_EQ(OpcuaClient::parse_user_auth_mode("username"), UserAuthMode::UsernamePassword); + EXPECT_EQ(OpcuaClient::parse_user_auth_mode("UsernamePassword"), UserAuthMode::UsernamePassword); + EXPECT_EQ(OpcuaClient::parse_user_auth_mode("x509"), UserAuthMode::X509); + EXPECT_EQ(OpcuaClient::parse_user_auth_mode("certificate"), UserAuthMode::X509); +} + +TEST(OpcuaClientSecurity, RequiresSecureChannel) { + OpcuaClientConfig cfg; + EXPECT_FALSE(OpcuaClient::requires_secure_channel(cfg)); // defaults: None/None + cfg.user_auth_mode = UserAuthMode::UsernamePassword; + // Username/password alone does NOT imply an encrypted channel. + EXPECT_FALSE(OpcuaClient::requires_secure_channel(cfg)); + cfg.security_mode = SecurityMode::Sign; + EXPECT_TRUE(OpcuaClient::requires_secure_channel(cfg)); + cfg.security_mode = SecurityMode::None; + cfg.security_policy = SecurityPolicy::Basic256Sha256; + EXPECT_TRUE(OpcuaClient::requires_secure_channel(cfg)); +} + +TEST(OpcuaClientSecurity, ConnectSecureWithoutCertFailsFast) { + // A secured profile with no client cert/key must fail config-time, before + // touching the network, and leave the client disconnected. + OpcuaClient client; + OpcuaClientConfig cfg; + cfg.endpoint_url = "opc.tcp://localhost:49998"; + cfg.connect_timeout = std::chrono::milliseconds(500); + cfg.security_policy = SecurityPolicy::Basic256Sha256; + cfg.security_mode = SecurityMode::SignAndEncrypt; + EXPECT_FALSE(client.connect(cfg)); + EXPECT_FALSE(client.is_connected()); +} + TEST(OpcuaClientTest, CurrentConfigPersistence) { OpcuaClient client; OpcuaClientConfig cfg; From bd8dec16e5b84b9bd2725666fab9446eeeacd4e9 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Sun, 28 Jun 2026 21:09:21 +0200 Subject: [PATCH 03/11] feat(opcua): active-condition reconnect replay and multi-alarm mapping Replay already-active conditions on (re)subscribe with a configurable strategy (ConditionRefresh, read-based fallback that browses sources and reconciles the fault set, auto, off) so servers rejecting ConditionRefresh still recover their alarms. Route one source's events to distinct faults by condition identity (ConditionName / SourceNode / EventType) and enrich descriptions with the event Message plus configured associated values (SD_n). Allow event-alarm-only maps. --- .../ros2_medkit_opcua/README.md | 118 +++++- .../include/ros2_medkit_opcua/node_map.hpp | 58 ++- .../ros2_medkit_opcua/opcua_client.hpp | 25 ++ .../ros2_medkit_opcua/opcua_poller.hpp | 50 ++- .../ros2_medkit_opcua/src/node_map.cpp | 133 +++++- .../ros2_medkit_opcua/src/opcua_client.cpp | 90 ++++ .../ros2_medkit_opcua/src/opcua_plugin.cpp | 15 +- .../ros2_medkit_opcua/src/opcua_poller.cpp | 395 +++++++++++++----- .../ros2_medkit_opcua/test/test_node_map.cpp | 159 +++++++ .../test/test_opcua_client.cpp | 8 + .../test/test_opcua_plugin.cpp | 13 + 11 files changed, 930 insertions(+), 134 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/README.md b/src/ros2_medkit_plugins/ros2_medkit_opcua/README.md index 9f6f0ac2a..cb64c919d 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/README.md +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/README.md @@ -181,11 +181,41 @@ nodes: # CodeSys 3.5+, Rockwell via FactoryTalk Linx). Mutually exclusive per entry # with the threshold-based alarm form above. event_alarms: + # Simple form: one source -> one fault. - alarm_source: "ns=4;s=Alarms.Overpressure" entity_id: tank_process fault_code: PLC_OVERPRESSURE + + # Multi-alarm form (issue #389): one source emits many conditions (e.g. the + # Server object as a catch-all, or one Object owning several Program_Alarms) + # routed to distinct faults by condition identity. `mappings` are evaluated + # in order; the first whose non-empty match fields all equal the observed + # event wins, falling back to the source-level `fault_code` (if set). + - alarm_source: "ns=0;i=2253" # Server object (system-wide) + entity_id: line_1 + fault_code: PLC_GENERIC_ALARM # catch-all fallback (optional) + mappings: + - condition_name: "Overpressure" # match ConditionType.ConditionName + fault_code: PLC_OVERPRESSURE + severity_override: ERROR + message: "Tank overpressure" + - source_node: "ns=3;s=PumpA" # or match by event SourceNode + event_type: "ns=0;i=2915" # and/or EventType (AND-combined) + fault_code: PLC_PUMP_A_FAULT + # Append vendor associated values (Siemens Program_Alarm SD_1..SD_n) to the + # fault description as "label=value". + associated_values: + - "SD_1" + - name: "SD_2" + label: "Setpoint" ``` +Mapping precedence: a mapping with more (non-empty) match fields is matched +only when all of them equal the observed event; declaration order breaks ties. +An event matching no mapping uses the source-level `fault_code`; if that is also +empty the event is ignored. A mapping-level `severity_override` / `message` +overrides the source-level one; otherwise the source-level value is inherited. + The plugin auto-registers `acknowledge_fault` and `confirm_fault` operations on every entity that has at least one `event_alarms` entry. Invoke them with: @@ -216,6 +246,64 @@ ros2_medkit_gateway: | `poll_interval_ms` | `1000` | Polling interval in ms (clamped to [100, 60000]) | | `prefer_subscriptions` | `false` | Use OPC-UA subscriptions instead of polling | | `subscription_interval_ms` | `500` | Publishing interval for OPC-UA subscriptions when `prefer_subscriptions: true` | +| `condition_replay_strategy` | `auto` | Active-condition replay on reconnect: `method`, `read`, `auto`, `off` (see below) | + +### OPC-UA client security (SecurityPolicy, certificates, user auth) + +By default the client connects with `SecurityPolicy=None` and an `Anonymous` +user (unencrypted, unauthenticated). This is fine for an isolated lab LAN but +not for a hardened server. The following parameters enable a signed/encrypted +SecureChannel with a client application-instance certificate, a server trust +store, and a user identity. All are opt-in; leaving them unset reproduces the +legacy behaviour. + +```yaml +plugins.opcua.security_policy: "Basic256Sha256" # None | Basic256Sha256 | Aes128Sha256RsaOaep | Aes256Sha256RsaPss +plugins.opcua.security_mode: "SignAndEncrypt" # None | Sign | SignAndEncrypt +plugins.opcua.client_cert_path: "/etc/ros2_medkit/opcua/client_cert.der" # X.509 v3, DER +plugins.opcua.client_key_path: "/etc/ros2_medkit/opcua/client_key.pem" # private key, PEM +plugins.opcua.application_uri: "urn:selfpatch:medkit:opcua-client" # MUST match the cert SAN URI +plugins.opcua.trust_list_paths: ["/etc/ros2_medkit/opcua/server_cert.der"] # DER trust store +plugins.opcua.reject_untrusted: true # false = accept any server cert (lab/TOFU only) +plugins.opcua.user_auth_mode: "UsernamePassword" # Anonymous | UsernamePassword | X509 +plugins.opcua.username: "medkit" +plugins.opcua.password: "..." # inject from a secret store at deploy time +plugins.opcua.user_cert_path: "/etc/ros2_medkit/opcua/user_cert.der" # X509 user token only +``` + +Notes: +- A non-None `security_policy` requires `client_cert_path` + `client_key_path`; + a secured connection with no cert fails fast (before contacting the server). +- `application_uri` must equal the URI entry in the client certificate's + SubjectAltName or the server rejects the channel with + `BadCertificateUriInvalid`. +- The client cert (its public part) must be added to the server's trusted + client list, and the server cert (DER) added to `trust_list_paths`, unless + `reject_untrusted: false`. +- The encryption backend (OpenSSL) is compiled in; a build without it logs an + error and refuses any secured profile. + +### Active-condition replay on reconnect (issue #389) + +When the gateway (re)subscribes after a drop or restart, conditions that are +already active on the server would otherwise not be re-reported (only live +transitions flow). The standard recovery is OPC-UA `ConditionRefresh`, but some +servers (Siemens S7-1500) reject it with `BadNodeIdUnknown`. The +`condition_replay_strategy` parameter controls recovery: + +| Value | Behaviour | +|-------|-----------| +| `method` | Call `ConditionRefresh` only (warns if rejected) | +| `read` | Skip the method; browse each `alarm_source`, read current condition state, reconcile the fault set | +| `auto` (default) | Try `ConditionRefresh`; on rejection fall back to `read` | +| `off` | No replay (only live transitions surface) | + +The read fallback browses each `alarm_source` for child AlarmCondition +instances and reads their state, so it relies on the conditions being +browseable under the configured source (point `alarm_source` at the owning +Object, not the Server catch-all, for full restart recovery). It does not +recover the live `EventId`, so an operator ack/confirm may need the next live +event before it succeeds. Node map entries also support an optional `ros2_topic` field to override the auto-generated ROS 2 topic name for the PLC value bridge: @@ -241,6 +329,16 @@ Write operations use the `set_` prefix convention: |----------|-------------| | `OPCUA_ENDPOINT_URL` | OPC-UA server URL | | `OPCUA_NODE_MAP_PATH` | Path to node map YAML | +| `OPCUA_SECURITY_POLICY` | `None` / `Basic256Sha256` / `Aes128Sha256RsaOaep` / `Aes256Sha256RsaPss` | +| `OPCUA_SECURITY_MODE` | `None` / `Sign` / `SignAndEncrypt` | +| `OPCUA_CLIENT_CERT` / `OPCUA_CLIENT_KEY` | Client cert (DER) / key (PEM) paths | +| `OPCUA_APPLICATION_URI` | Client application URI (must match cert SAN) | +| `OPCUA_TRUST_LIST` | Colon-separated DER trust-store paths | +| `OPCUA_REJECT_UNTRUSTED` | `false`/`0`/`no` to accept any server cert (lab only) | +| `OPCUA_USER_AUTH` | `Anonymous` / `UsernamePassword` / `X509` | +| `OPCUA_USERNAME` / `OPCUA_PASSWORD` | Username-password identity | +| `OPCUA_USER_CERT` | X.509 user-token cert (DER) | +| `OPCUA_CONDITION_REPLAY` | `method` / `read` / `auto` / `off` | ## Hardware Deployment @@ -348,16 +446,22 @@ bash scripts/stop.sh | Error handling | 3 | Unknown entity, unknown operation, invalid JSON | | **Total** | **16** | | -## Security Limitations +## Security -**Current version uses Anonymous auth with SecurityPolicy=None.** All OPC-UA communication is unencrypted and unauthenticated. This is acceptable for isolated LANs and demo environments but NOT suitable for production networks exposed to untrusted traffic. +The client supports a signed/encrypted SecureChannel (`SecurityPolicy` +Basic256Sha256 / Aes128 / Aes256, `MessageSecurityMode` Sign / SignAndEncrypt), +a client application-instance certificate, a server trust store with +reject-untrusted, and user identity (Anonymous / Username-Password / X.509). +See "OPC-UA client security" above for configuration. -Planned for future versions: -- OPC-UA certificate-based authentication (Basic256Sha256) -- Username/password authentication -- Configurable security policy per connection +**The default remains `SecurityPolicy=None` + `Anonymous` (unencrypted, +unauthenticated)** for backward compatibility and isolated-LAN demos; enable a +secured profile for any network exposed to untrusted traffic. The plugin logs +the effective security profile at startup and warns when running unsecured or +with `reject_untrusted: false`. -The plugin logs a startup message indicating the auth mode in use. +Northbound (gateway REST) hardening is documented separately in the gateway's +`design/hardening.rst` and the `gateway_params.secure.yaml` field profile. Write operations include configurable `min_value`/`max_value` range validation to prevent out-of-range values being sent to PLC actuators. diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/node_map.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/node_map.hpp index 6b26fe8bc..d34ce6d6c 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/node_map.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/node_map.hpp @@ -33,6 +33,33 @@ struct AlarmConfig { bool above_threshold{true}; // true = alarm when value > threshold }; +/// Reference to one extra OPC-UA event field to select and surface as an +/// associated value (issue #389). Used for vendor associated values such as +/// Siemens Program_Alarm ``SD_1``..``SD_n``. ``label`` is what appears in the +/// fault description; ``name``/``namespace_index`` address the event field via +/// a SimpleAttributeOperand on BaseEventType. +struct AssociatedValueRef { + uint16_t namespace_index{0}; + std::string name; + std::string label; // defaults to ``name`` when unset +}; + +/// One condition-identity mapping inside an ``event_alarms`` source (issue +/// #389). A single OPC-UA source (e.g. the Server object as a catch-all, or a +/// real owning Object) can emit many distinct conditions; each mapping routes +/// a subset to its own SOVD fault. Match fields are AND-combined; an empty +/// match field is a wildcard. The first mapping whose non-empty match fields +/// all equal the observed event wins (declaration order = precedence). +struct AlarmMapping { + std::string match_condition_name; // ConditionType.ConditionName (empty = any) + std::string match_source_node; // event SourceNode id string (empty = any) + std::string match_event_type; // event EventType id string (empty = any) + + std::string fault_code; + std::string severity_override; + std::string message_override; +}; + /// Configuration for a native OPC-UA AlarmConditionType event subscription /// (issue #386). The plugin subscribes to events emitted from /// ``alarm_source`` and bridges them through ``AlarmStateMachine`` into @@ -48,7 +75,9 @@ struct AlarmEventConfig { /// SOVD entity that should host the resulting fault. std::string entity_id; - /// SOVD fault code (e.g. ``PLC_OVERPRESSURE``). + /// Source-level / fallback SOVD fault code (e.g. ``PLC_OVERPRESSURE``). + /// Used when no ``mappings`` entry matches an observed event. May be empty + /// when every alarm is routed through ``mappings``. std::string fault_code; /// Optional severity override. When empty, ``AlarmStateMachine`` derives @@ -59,6 +88,24 @@ struct AlarmEventConfig { /// Optional friendly message override; falls back to the event's /// ``Message`` field when empty. std::string message_override; + + /// Issue #389: per-condition-identity mappings (multi-alarm). Resolved in + /// declaration order; first match wins, falling back to the source-level + /// ``fault_code`` above. + std::vector mappings; + + /// Issue #389: extra event fields to append to the fault description. + std::vector associated_values; +}; + +/// Result of resolving an observed event against an ``AlarmEventConfig`` +/// (issue #389). ``matched`` is false when neither a mapping nor the +/// source-level fault_code applies (the event should be ignored). +struct ResolvedAlarm { + std::string fault_code; + std::string severity_override; + std::string message_override; + bool matched{false}; }; /// Mapping entry: OPC-UA NodeId -> SOVD entity data point @@ -127,9 +174,16 @@ class NodeMap { } /// Find an event-mode alarm by ``(entity_id, fault_code)`` (used by the - /// SOVD ``acknowledge_fault`` / ``confirm_fault`` operations). + /// SOVD ``acknowledge_fault`` / ``confirm_fault`` operations). Matches the + /// source-level fault_code or any of the entry's mapping fault_codes. const AlarmEventConfig * find_event_alarm(const std::string & entity_id, const std::string & fault_code) const; + /// Resolve an observed event against a config's mappings (issue #389). + /// First matching mapping wins; falls back to the source-level fault_code. + /// Pure / static so the precedence rules are unit-testable without a server. + static ResolvedAlarm resolve_alarm(const AlarmEventConfig & cfg, const std::string & condition_name, + const std::string & source_node, const std::string & event_type); + /// Get derived SOVD entity definitions const std::vector & entity_defs() const { return entity_defs_; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp index 41ecc0d8b..558e38179 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp @@ -260,6 +260,31 @@ class OpcuaClient { static tl::expected classify_call_result(uint32_t overall_status_code, const std::vector & arg_results); + /// Current state of one OPC-UA Condition instance, read directly from the + /// address space (issue #389 reconnect replay). Used as the read-based + /// fallback when ConditionRefresh is unavailable: instead of waiting for + /// the server to replay buffered events, the client browses the alarm + /// source for its condition instances and reads their state variables. + struct ConditionStateSnapshot { + opcua::NodeId condition_id; + std::string condition_name; // ConditionType.ConditionName (issue #389 mapping) + bool enabled_state{true}; + bool active_state{false}; + bool acked_state{true}; + bool confirmed_state{true}; + bool retain{false}; + uint16_t severity{0}; + std::string message; + }; + + /// Browse ``source_node`` for AlarmCondition instances and read their + /// current state (ActiveState/Id, AckedState/Id, ConfirmedState/Id, + /// EnabledState/Id, Retain, Severity, Message). Only immediate children + /// that expose an ``ActiveState`` node are treated as conditions; other + /// children are skipped. Returns an empty vector when disconnected or when + /// the source has no readable conditions. + std::vector read_source_conditions(const opcua::NodeId & source_node); + /// Get server description string (for status endpoint) std::string server_description() const; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp index cbb3a6fb9..bb0daf534 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -54,9 +55,10 @@ struct AlarmEventDelivery { std::string entity_id; SovdAlarmStatus next_status; AlarmAction action; - uint16_t severity{0}; // raw OPC-UA Severity 1-1000 - std::string message; // event Message field (or AlarmEventConfig override) - std::string condition_id; // string form of OPC-UA ConditionId + uint16_t severity{0}; // raw OPC-UA Severity 1-1000 + std::string severity_override; // resolved severity bucket override (issue #389), empty = derive from severity + std::string message; // event Message field (or resolved override) + std::string condition_id; // string form of OPC-UA ConditionId }; using EventAlarmCallback = std::function; @@ -76,12 +78,27 @@ struct ConditionRuntime { /// Callback after each poll cycle (for publishing values to ROS 2 topics) using PollCallback = std::function; +/// Strategy for replaying already-active OPC-UA conditions on (re)subscribe +/// (issue #389). Some servers (e.g. Siemens S7-1500) reject the standard +/// ConditionRefresh method with BadNodeIdUnknown, so the active alarm set is +/// otherwise lost across a reconnect / gateway restart. +/// - Method: call ConditionRefresh only (Part 9 §5.5.7 standard path). +/// - Read: skip ConditionRefresh; browse the alarm sources and read each +/// condition's current state, then reconcile the fault set. +/// - Auto: try ConditionRefresh first; on rejection fall back to Read. +/// - Off: no replay (only live transitions surface). +enum class ConditionReplayStrategy { Method, Read, Auto, Off }; + /// Configuration for the poller struct PollerConfig { bool prefer_subscriptions{false}; // poll mode by default (subscriptions need event loop) double subscription_interval_ms{500.0}; std::chrono::milliseconds poll_interval{1000}; std::chrono::milliseconds reconnect_interval{5000}; + /// Active-condition replay strategy on (re)subscribe (issue #389). + /// Default Auto: ConditionRefresh with a read-based fallback so hardened + /// servers that reject the method still recover their active alarms. + ConditionReplayStrategy condition_replay_strategy{ConditionReplayStrategy::Auto}; /// Optional warn-level log sink for operator-visible failures inside the /// poll thread. Set by the plugin owning the poller to its log_warn /// helper so events like ``ConditionRefresh failed`` reach the ROS 2 log @@ -136,6 +153,10 @@ class OpcuaPoller { /// currently active for the entity. std::optional lookup_condition(const std::string & entity_id, const std::string & fault_code) const; + /// Parse a replay-strategy name ("method", "read", "auto", "off"). + /// Case-insensitive; unknown input falls back to Auto. + static ConditionReplayStrategy parse_replay_strategy(const std::string & name); + private: void poll_loop(); void do_poll(); @@ -148,7 +169,28 @@ class OpcuaPoller { void on_event(const AlarmEventConfig & cfg, const std::vector & values, const opcua::NodeId & source_node, const opcua::NodeId & event_type, const opcua::NodeId & condition_id); - void condition_refresh(); + + /// Run the configured active-condition replay (issue #389). Dispatches to + /// ConditionRefresh and/or the read-based fallback per + /// ``condition_replay_strategy``. + void replay_active_conditions(); + /// Call OPC-UA ConditionRefresh. Returns true when the server accepted it. + bool try_condition_refresh(); + /// Read-based fallback: browse each alarm source, read current condition + /// state, drive the state machine, and reconcile the fault set. + void read_fallback_replay(); + /// Clear faults for conditions that were active before the replay but are + /// no longer present/active in the read scan (``seen`` = condition ids + /// observed this scan). + void reconcile_after_read(const std::set & seen); + + /// Apply one condition state observation (from a live event or a read scan) + /// to the tracked condition map + state machine, dispatching the resulting + /// fault action. ``event_id`` is the live EventId for ack/confirm (null for + /// read-based observations). + void apply_condition_state(const AlarmEventConfig & cfg, const opcua::NodeId & condition_id, + const AlarmEventInput & input, uint16_t severity, const std::string & message, + const opcua::ByteString * event_id); OpcuaClient & client_; const NodeMap & node_map_; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp index 755faafda..e86e910ef 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp @@ -175,29 +175,25 @@ bool NodeMap::load(const std::string & yaml_path) { auto_browse_ = root["auto_browse"].as(); } - // Node entries - auto nodes = root["nodes"]; - if (!nodes || !nodes.IsSequence()) { - // Clear any stale state from previous load - entries_.clear(); - entity_index_.clear(); - node_id_index_.clear(); - entity_defs_.clear(); - return false; - } - + // Node entries. The ``nodes:`` section is optional: a config may declare + // only ``event_alarms:`` (native AlarmCondition subscriptions with no + // scalar data points to poll). The final emptiness check below rejects a + // file that declares neither. entries_.clear(); entity_index_.clear(); node_id_index_.clear(); - if (nodes.size() > 10000) { + auto nodes = root["nodes"]; + const bool has_nodes = nodes && nodes.IsSequence(); + + if (has_nodes && nodes.size() > 10000) { RCLCPP_ERROR(rclcpp::get_logger("opcua.node_map"), "Node map has %zu entries (max 10000) - refusing to load to prevent resource exhaustion", nodes.size()); return false; } - for (size_t i = 0; i < nodes.size(); ++i) { + for (size_t i = 0; has_nodes && i < nodes.size(); ++i) { const auto & n = nodes[i]; // Validate required fields @@ -320,18 +316,64 @@ bool NodeMap::load(const std::string & yaml_path) { } for (size_t i = 0; i < event_alarms_node.size(); ++i) { const auto & a = event_alarms_node[i]; - if (!a["alarm_source"] || !a["entity_id"] || !a["fault_code"]) { + if (!a["alarm_source"] || !a["entity_id"]) { RCLCPP_WARN(rclcpp::get_logger("opcua.node_map"), - "event_alarms[%zu] missing alarm_source/entity_id/fault_code - skipping", i); + "event_alarms[%zu] missing alarm_source/entity_id - skipping", i); continue; } AlarmEventConfig cfg; cfg.source_node_id_str = a["alarm_source"].as(); cfg.source_node_id = parse_node_id(cfg.source_node_id_str); cfg.entity_id = a["entity_id"].as(); - cfg.fault_code = a["fault_code"].as(); + cfg.fault_code = a["fault_code"].as(""); cfg.severity_override = a["severity_override"].as(""); cfg.message_override = a["message"].as(""); + + // Issue #389: per-condition-identity mappings (multi-alarm). + if (a["mappings"] && a["mappings"].IsSequence()) { + for (const auto & m : a["mappings"]) { + if (!m["fault_code"]) { + RCLCPP_WARN(rclcpp::get_logger("opcua.node_map"), + "event_alarms[%zu] mapping missing fault_code - skipping", i); + continue; + } + AlarmMapping mapping; + mapping.match_condition_name = m["condition_name"].as(""); + mapping.match_source_node = m["source_node"].as(""); + mapping.match_event_type = m["event_type"].as(""); + mapping.fault_code = m["fault_code"].as(); + mapping.severity_override = m["severity_override"].as(""); + mapping.message_override = m["message"].as(""); + cfg.mappings.push_back(std::move(mapping)); + } + } + + // Issue #389: extra associated-value event fields (e.g. SD_1..SD_n). + if (a["associated_values"] && a["associated_values"].IsSequence()) { + for (const auto & v : a["associated_values"]) { + AssociatedValueRef ref; + if (v.IsScalar()) { + ref.name = v.as(); + } else { + ref.namespace_index = v["namespace_index"].as(0); + ref.name = v["name"].as(""); + ref.label = v["label"].as(""); + } + if (ref.name.empty()) { + continue; + } + if (ref.label.empty()) { + ref.label = ref.name; + } + cfg.associated_values.push_back(std::move(ref)); + } + } + + if (cfg.fault_code.empty() && cfg.mappings.empty()) { + RCLCPP_WARN(rclcpp::get_logger("opcua.node_map"), + "event_alarms[%zu] has neither fault_code nor mappings - skipping", i); + continue; + } event_alarms_.push_back(std::move(cfg)); } } @@ -364,7 +406,12 @@ bool NodeMap::load(const std::string & yaml_path) { { std::set> event_keys; for (const auto & cfg : event_alarms_) { - event_keys.emplace(cfg.entity_id, cfg.fault_code); + if (!cfg.fault_code.empty()) { + event_keys.emplace(cfg.entity_id, cfg.fault_code); + } + for (const auto & m : cfg.mappings) { + event_keys.emplace(cfg.entity_id, m.fault_code); + } } for (const auto & entry : entries_) { if (entry.alarm.has_value() && event_keys.count({entry.entity_id, entry.alarm->fault_code}) > 0) { @@ -377,6 +424,16 @@ bool NodeMap::load(const std::string & yaml_path) { } } + // A config that declares neither a 'nodes:' section nor any event alarms + // carries no mappings and is almost always a mistake (wrong file / typo). + // A present-but-empty 'nodes:' section is still a valid config. + if (!has_nodes && event_alarms_.empty()) { + RCLCPP_ERROR(rclcpp::get_logger("opcua.node_map"), + "Node map has neither 'nodes:' nor 'event_alarms:' entries - nothing to map"); + entity_defs_.clear(); + return false; + } + build_entity_defs(); return true; @@ -389,13 +446,53 @@ bool NodeMap::load(const std::string & yaml_path) { const AlarmEventConfig * NodeMap::find_event_alarm(const std::string & entity_id, const std::string & fault_code) const { for (const auto & cfg : event_alarms_) { - if (cfg.entity_id == entity_id && cfg.fault_code == fault_code) { + if (cfg.entity_id != entity_id) { + continue; + } + if (cfg.fault_code == fault_code) { return &cfg; } + for (const auto & m : cfg.mappings) { + if (m.fault_code == fault_code) { + return &cfg; + } + } } return nullptr; } +ResolvedAlarm NodeMap::resolve_alarm(const AlarmEventConfig & cfg, const std::string & condition_name, + const std::string & source_node, const std::string & event_type) { + ResolvedAlarm out; + // First mapping whose non-empty match fields all equal the observed event + // wins (declaration order = precedence). + for (const auto & m : cfg.mappings) { + if (!m.match_condition_name.empty() && m.match_condition_name != condition_name) { + continue; + } + if (!m.match_source_node.empty() && m.match_source_node != source_node) { + continue; + } + if (!m.match_event_type.empty() && m.match_event_type != event_type) { + continue; + } + out.fault_code = m.fault_code; + // A mapping-level override wins; otherwise inherit the source-level one. + out.severity_override = m.severity_override.empty() ? cfg.severity_override : m.severity_override; + out.message_override = m.message_override.empty() ? cfg.message_override : m.message_override; + out.matched = true; + return out; + } + // No mapping matched: fall back to the source-level fault_code (catch-all). + if (!cfg.fault_code.empty()) { + out.fault_code = cfg.fault_code; + out.severity_override = cfg.severity_override; + out.message_override = cfg.message_override; + out.matched = true; + } + return out; +} + std::vector NodeMap::entries_for_entity(const std::string & entity_id) const { std::vector result; auto it = entity_index_.find(entity_id); diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp index 16a9c72e0..5fbfb7cdf 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp @@ -797,6 +797,96 @@ std::string OpcuaClient::server_description() const { return impl_->server_desc; } +std::vector +OpcuaClient::read_source_conditions(const opcua::NodeId & source_node) { + std::lock_guard lock(impl_->client_mutex); + std::vector out; + if (!impl_->connected) { + return out; + } + + // Read a Boolean two-step state variable (e.g. ActiveState/Id). Returns the + // fallback when the path is absent or not Boolean. ``found`` reports whether + // the path resolved at all (used to decide if a child is a condition). + auto read_state_id = [](opcua::Node & cond, const char * state_name, bool fallback, + bool * found) -> bool { + try { + auto node = cond.browseChild({{0, state_name}, {0, "Id"}}); + auto val = node.readValue(); + if (found) { + *found = true; + } + if (val.isType()) { + return val.getScalarCopy(); + } + return fallback; + } catch (const opcua::BadStatus &) { + if (found) { + *found = false; + } + return fallback; + } + }; + + try { + opcua::Node src(impl_->client, source_node); + auto children = src.browseChildren(opcua::ReferenceTypeId::HierarchicalReferences, opcua::NodeClass::Object); + for (auto & child : children) { + ConditionStateSnapshot snap; + snap.condition_id = child.id(); + + // A child is only treated as an alarm condition when ActiveState/Id + // resolves; everything else under the source is ignored. + bool is_condition = false; + snap.active_state = read_state_id(child, "ActiveState", false, &is_condition); + if (!is_condition) { + continue; + } + + snap.enabled_state = read_state_id(child, "EnabledState", true, nullptr); + snap.acked_state = read_state_id(child, "AckedState", true, nullptr); + snap.confirmed_state = read_state_id(child, "ConfirmedState", true, nullptr); + + try { + auto retain = child.browseChild({{0, "Retain"}}).readValue(); + if (retain.isType()) { + snap.retain = retain.getScalarCopy(); + } + } catch (const opcua::BadStatus &) { + } + try { + auto sev = child.browseChild({{0, "Severity"}}).readValue(); + if (sev.isType()) { + snap.severity = sev.getScalarCopy(); + } + } catch (const opcua::BadStatus &) { + } + try { + auto cname = child.browseChild({{0, "ConditionName"}}).readValue(); + if (cname.isType()) { + snap.condition_name = std::string(cname.getScalarCopy()); + } + } catch (const opcua::BadStatus &) { + } + try { + auto msg = child.browseChild({{0, "Message"}}).readValue(); + if (msg.isType()) { + snap.message = std::string(msg.getScalarCopy().getText()); + } else if (msg.isType()) { + snap.message = std::string(msg.getScalarCopy()); + } + } catch (const opcua::BadStatus &) { + } + + out.push_back(std::move(snap)); + } + } catch (const opcua::BadStatus & e) { + maybe_mark_disconnected(impl_->connected, impl_->generation, e); + } + + return out; +} + // ---------------------------------------------------------------------------- // Issue #386: native OPC-UA AlarmCondition event subscription primitives. // open62541pp v0.16 has no native EventFilter / event subscription API, so the diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp index 9e2f1401f..e750052dc 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp @@ -173,6 +173,13 @@ void OpcuaPlugin::configure(const nlohmann::json & config) { auto ms = std::clamp(config["poll_interval_ms"].get(), 100, 60000); poller_config_.poll_interval = std::chrono::milliseconds(ms); } + if (config.contains("condition_replay_strategy")) { + poller_config_.condition_replay_strategy = + OpcuaPoller::parse_replay_strategy(config["condition_replay_strategy"].get()); + } + if (auto * env = std::getenv("OPCUA_CONDITION_REPLAY")) { + poller_config_.condition_replay_strategy = OpcuaPoller::parse_replay_strategy(env); + } // Environment variables override YAML config (for Docker) if (auto * env = std::getenv("OPCUA_ENDPOINT_URL")) { @@ -615,11 +622,11 @@ void OpcuaPlugin::on_event_alarm(const AlarmEventDelivery & delivery) { // Map raw OPC-UA Severity (1-1000) to SOVD severity bucket. // Selfpatch convention documented in design/index.rst; not from IEC 62682. - // Severity_override on the AlarmEventConfig wins when set. + // The resolved severity_override (mapping- or source-level, issue #389) + // wins when set. auto severity_str = [&]() { - auto * cfg = node_map_.find_event_alarm(delivery.entity_id, delivery.fault_code); - if (cfg && !cfg->severity_override.empty()) { - return cfg->severity_override; + if (!delivery.severity_override.empty()) { + return delivery.severity_override; } if (delivery.severity >= 801) { return std::string("CRITICAL"); diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp index c4b7213e6..5c6118ab1 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp @@ -33,12 +33,6 @@ inline rclcpp::Logger opcua_poller_logger() { static auto logger = rclcpp::get_logger("opcua.poller"); return logger; } - -inline bool poller_debug_enabled() { - // rcutils API for Humble compatibility; Jazzy+ adds rclcpp::Logger:: - // get_effective_level but Humble does not. - return rcutils_logging_logger_is_enabled_for("opcua.poller", RCUTILS_LOG_SEVERITY_DEBUG); -} } // namespace namespace { @@ -68,7 +62,11 @@ constexpr size_t kFieldAckedState = 5; constexpr size_t kFieldConfirmedState = 6; constexpr size_t kFieldShelvingState = 7; constexpr size_t kFieldBranchId = 8; -constexpr size_t kAlarmFieldCount = 9; +constexpr size_t kFieldConditionName = 9; // issue #389 (multi-alarm identity) +constexpr size_t kAlarmFieldCount = 9; // count of fixed alarm-state fields (indices 0-8) +// ConditionName is always appended at index kFieldConditionName; configured +// associated values follow at index kFieldConditionName + 1 onward. +constexpr size_t kFirstAssociatedValueField = kFieldConditionName + 1; // Standard NodeIds for the types that *directly* define each field (open62541 // servers reject SAOs whose BrowsePath is inherited rather than direct). @@ -77,11 +75,11 @@ constexpr uint32_t kConditionType = 2782; constexpr uint32_t kAcknowledgeableConditionType = 2881; constexpr uint32_t kAlarmConditionType = 2915; -std::vector build_alarm_event_select_specs() { +std::vector build_alarm_event_select_specs(const AlarmEventConfig & cfg) { // Each clause carries the type that *directly* defines its first browse // segment - inheritance traversal is not honored by the open62541 server // validator (verified against 1.4.6 with FULL ns0). - return { + std::vector specs = { {opcua::NodeId(0, kBaseEventType), {{0, "EventId"}}, UA_ATTRIBUTEID_VALUE}, {opcua::NodeId(0, kBaseEventType), {{0, "Severity"}}, UA_ATTRIBUTEID_VALUE}, {opcua::NodeId(0, kBaseEventType), {{0, "Message"}}, UA_ATTRIBUTEID_VALUE}, @@ -93,7 +91,65 @@ std::vector build_alarm_event_select_specs() { {{0, "ShelvingState"}, {0, "CurrentState"}, {0, "Id"}}, UA_ATTRIBUTEID_VALUE}, {opcua::NodeId(0, kConditionType), {{0, "BranchId"}}, UA_ATTRIBUTEID_VALUE}, + // Issue #389: ConditionName, used to route distinct conditions from one + // source to distinct faults. + {opcua::NodeId(0, kConditionType), {{0, "ConditionName"}}, UA_ATTRIBUTEID_VALUE}, }; + // Issue #389: configured associated values (e.g. Siemens SD_1..SD_n) as + // BaseEventType properties. + for (const auto & av : cfg.associated_values) { + specs.push_back({opcua::NodeId(0, kBaseEventType), {{av.namespace_index, av.name}}, UA_ATTRIBUTEID_VALUE}); + } + return specs; +} + +// Render an arbitrary scalar event field value to a short string for inclusion +// in a fault description (associated values can be of any builtin type). +std::string variant_to_display(const opcua::Variant & v) { + if (v.isEmpty()) { + return ""; + } + if (v.isType()) { + return std::string(v.getScalarCopy().getText()); + } + if (v.isType()) { + return std::string(v.getScalarCopy()); + } + if (v.isType()) { + return v.getScalarCopy() ? "true" : "false"; + } + if (v.isType()) { + return std::to_string(v.getScalarCopy()); + } + if (v.isType()) { + return std::to_string(v.getScalarCopy()); + } + if (v.isType()) { + return std::to_string(v.getScalarCopy()); + } + if (v.isType()) { + return std::to_string(v.getScalarCopy()); + } + if (v.isType()) { + return std::to_string(v.getScalarCopy()); + } + if (v.isType()) { + return std::to_string(v.getScalarCopy()); + } + if (v.isType()) { + return std::to_string(v.getScalarCopy()); + } + return ""; +} + +std::string variant_to_string_scalar(const opcua::Variant & v) { + if (v.isType()) { + return std::string(v.getScalarCopy()); + } + if (v.isType()) { + return std::string(v.getScalarCopy().getText()); + } + return ""; } // Extract a scalar of type T from a Variant, returning the default if absent. @@ -243,10 +299,12 @@ void OpcuaPoller::setup_event_subscriptions() { return; } - const auto select_specs = build_alarm_event_select_specs(); event_monitored_item_ids_.clear(); for (const auto & cfg : node_map_.event_alarms()) { + // Per-source select specs so each source can carry its own associated + // values (issue #389) in addition to the fixed alarm-state fields. + const auto select_specs = build_alarm_event_select_specs(cfg); // Capture cfg BY VALUE: even though range-for ``const auto & cfg`` binds // to a vector element that outlives the loop, an `&cfg`-by-reference // capture would chain through a local reference variable whose name @@ -266,61 +324,168 @@ void OpcuaPoller::setup_event_subscriptions() { } } - // Fire ConditionRefresh so the server pushes any conditions that fired - // before our subscription started (Part 9 §5.5.7 mandates the bracketing - // RefreshStartEvent / RefreshEndEvent which we treat as ordinary events - // tagged by EventType). + // Replay conditions that were already active before this (re)subscribe so a + // reconnect / restart does not drop the live fault set. Strategy-driven: + // ConditionRefresh (Part 9 §5.5.7) and/or a read-based fallback (issue + // #389) for servers that reject the method. if (!event_monitored_item_ids_.empty()) { - condition_refresh(); + replay_active_conditions(); + } +} + +ConditionReplayStrategy OpcuaPoller::parse_replay_strategy(const std::string & name) { + std::string n = name; + std::transform(n.begin(), n.end(), n.begin(), [](unsigned char c) { + return static_cast(std::tolower(c)); + }); + if (n == "method") { + return ConditionReplayStrategy::Method; + } + if (n == "read" || n == "read_fallback" || n == "readfallback") { + return ConditionReplayStrategy::Read; + } + if (n == "off" || n == "none" || n == "disabled") { + return ConditionReplayStrategy::Off; + } + return ConditionReplayStrategy::Auto; +} + +void OpcuaPoller::replay_active_conditions() { + // Shared warn sink (operator-visible) used by the strategy branches. + auto warn = [this](const std::string & msg) { + if (config_.log_warn) { + config_.log_warn(msg); + } else { + RCLCPP_WARN(opcua_poller_logger(), "%s", msg.c_str()); + } + }; + + switch (config_.condition_replay_strategy) { + case ConditionReplayStrategy::Off: + return; + case ConditionReplayStrategy::Method: + if (!try_condition_refresh() && !condition_refresh_warned_) { + warn( + "OPC-UA ConditionRefresh rejected and replay strategy is 'method'; active conditions " + "will NOT be replayed on reconnect with this server. Live transitions still flow. See issue #389."); + condition_refresh_warned_ = true; + } else if (condition_refresh_warned_) { + condition_refresh_warned_ = false; + } + return; + case ConditionReplayStrategy::Read: + read_fallback_replay(); + return; + case ConditionReplayStrategy::Auto: + if (try_condition_refresh()) { + condition_refresh_warned_ = false; + return; + } + if (!condition_refresh_warned_) { + warn("OPC-UA ConditionRefresh rejected; using read-based active-condition replay fallback (issue #389)."); + condition_refresh_warned_ = true; + } + read_fallback_replay(); + return; } } -void OpcuaPoller::condition_refresh() { +bool OpcuaPoller::try_condition_refresh() { // Server object NodeId (i=2253) hosts the ConditionRefresh method - // (i=3875 - per Part 9 §5.5.7). We use the standard NodeId; servers that - // diverge from the spec (rare) will return BadMethodInvalid which is - // logged but does not abort subscribing. + // (i=3875 - per Part 9 §5.5.7). Servers that reject it (BadMethodInvalid in + // open62541, BadNodeIdUnknown on Siemens S7-1500, etc.) return an error; + // the caller then decides whether to fall back to the read-based replay. static constexpr uint32_t kServerObjectId = 2253; static constexpr uint32_t kConditionRefreshMethodId = 3875; std::vector args; args.push_back(opcua::Variant::fromScalar(static_cast(event_subscription_id_))); auto result = client_.call_method(opcua::NodeId(0, kServerObjectId), opcua::NodeId(0, kConditionRefreshMethodId), args); - if (!result.has_value()) { - // Not fatal but operator-visible: when ConditionRefresh is rejected by - // the server (BadMethodInvalid in open62541 v1.4.x, BadNotImplemented - // on Siemens S7-1500, etc.) the gateway will not re-receive any active - // conditions on reconnect; only live transitions surface in /faults. - // Worth a single warn per connect so the operator knows their - // alarm-replay-on-reconnect contract is broken with this PLC. - if (!condition_refresh_warned_) { - const std::string msg = "OPC-UA ConditionRefresh rejected (" + result.error().message + - "); active conditions will NOT be replayed on reconnect with this server. " - "Live transitions still flow. See issue #389."; - if (config_.log_warn) { - config_.log_warn(msg); - } else { - // Fallback when the plugin did not wire log_warn through PollerConfig - // (e.g., direct unit tests). RCLCPP_WARN goes to /rosout instead of - // raw stderr so the warn integrates with normal log filtering. - RCLCPP_WARN(opcua_poller_logger(), "%s", msg.c_str()); + return result.has_value(); +} + +void OpcuaPoller::read_fallback_replay() { + // Browse each configured alarm source, read its conditions' current state, + // and drive the same state machine as live events. Conditions that are no + // longer active are reconciled (cleared) afterwards. + std::set seen; + for (const auto & cfg : node_map_.event_alarms()) { + auto conditions = client_.read_source_conditions(cfg.source_node_id); + for (const auto & snap : conditions) { + AlarmEventInput input; + input.enabled_state = snap.enabled_state; + input.active_state = snap.active_state; + input.acked_state = snap.acked_state; + input.confirmed_state = snap.confirmed_state; + // Read-based replay never observes a historical branch (we read the + // current branch state directly) and shelving is folded into the live + // event path; treat read snapshots as live, non-shelved state. + input.shelved = false; + input.branch_id_present = false; + + // Resolve identity to a specific fault. EventType is not available from + // a state read, so event_type-only mappings will not match here; routing + // by condition_name / source_node still works. + ResolvedAlarm resolved = + NodeMap::resolve_alarm(cfg, snap.condition_name, cfg.source_node_id_str, /*event_type=*/""); + if (!resolved.matched) { + continue; } - condition_refresh_warned_ = true; + AlarmEventConfig eff = cfg; + eff.fault_code = resolved.fault_code; + eff.severity_override = resolved.severity_override; + eff.message_override = resolved.message_override; + + seen.insert(snap.condition_id.toString()); + apply_condition_state(eff, snap.condition_id, input, snap.severity, snap.message, /*event_id=*/nullptr); + } + } + reconcile_after_read(seen); +} + +void OpcuaPoller::reconcile_after_read(const std::set & seen) { + std::vector clears; + { + std::unique_lock lock(conditions_mutex_); + for (auto & [cid, runtime] : conditions_) { + const bool was_active = + (runtime.last_status == SovdAlarmStatus::Confirmed || runtime.last_status == SovdAlarmStatus::Healed); + if (was_active && seen.find(cid) == seen.end()) { + // Tracked as active but absent from the read scan -> the alarm + // cleared while we were offline. Clear the SOVD fault. + runtime.last_status = SovdAlarmStatus::Cleared; + AlarmEventDelivery d; + d.fault_code = runtime.fault_code; + d.entity_id = runtime.entity_id; + d.next_status = SovdAlarmStatus::Cleared; + d.action = AlarmAction::ClearFault; + d.condition_id = cid; + clears.push_back(std::move(d)); + } + } + } + if (clears.empty()) { + return; + } + EventAlarmCallback cb_copy; + { + std::lock_guard cb_lock(event_alarm_callback_mutex_); + cb_copy = event_alarm_callback_; + } + if (cb_copy) { + for (const auto & d : clears) { + cb_copy(d); } - } else { - // Reset the throttle: a successful refresh means the server is - // cooperating again, so the next failure (e.g., after a restart of a - // server with a different config) earns a fresh warn. - condition_refresh_warned_ = false; } } void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector & values, - const opcua::NodeId & /*source_node*/, const opcua::NodeId & event_type, + const opcua::NodeId & source_node, const opcua::NodeId & event_type, const opcua::NodeId & condition_id) { - RCLCPP_DEBUG_STREAM(opcua_poller_logger(), - "on_event fault=" << cfg.fault_code << " event_type=" << event_type.toString() - << " condition=" << condition_id.toString() << " values=" << values.size()); + RCLCPP_DEBUG_STREAM(opcua_poller_logger(), "on_event source_fault=" << cfg.fault_code + << " event_type=" << event_type.toString() + << " condition=" << condition_id.toString() + << " values=" << values.size()); // Detect ConditionRefresh bracketing per Part 9 §5.5.7. The flag is for // diagnostics only; the state machine itself does not need to know // because RefreshStart / RefreshEnd notifications carry no condition @@ -380,14 +545,69 @@ void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector(values[kFieldSeverity], 0); + std::string message = variant_to_localized_text(values[kFieldMessage]); + + // Issue #389: resolve this event's identity (ConditionName / SourceNode / + // EventType) to a specific fault via the config's mappings. This lets one + // OPC-UA source emit many distinct conditions that surface as distinct + // SOVD faults with their own text. + std::string condition_name; + if (values.size() > kFieldConditionName) { + condition_name = variant_to_string_scalar(values[kFieldConditionName]); + } + ResolvedAlarm resolved = NodeMap::resolve_alarm(cfg, condition_name, source_node.toString(), event_type.toString()); + if (!resolved.matched) { + // No mapping and no source-level fault_code applies to this condition. + RCLCPP_DEBUG_STREAM(opcua_poller_logger(), + "on_event: no fault mapping for condition_name='" << condition_name << "' - ignoring"); + return; + } - ConditionRuntime runtime_snapshot; - SovdAlarmStatus prev_status; + // Append configured associated values (e.g. SD_1..SD_n) to the description. + for (size_t i = 0; i < cfg.associated_values.size(); ++i) { + const size_t idx = kFirstAssociatedValueField + i; + if (idx >= values.size()) { + break; + } + const std::string rendered = variant_to_display(values[idx]); + if (rendered.empty()) { + continue; + } + if (!message.empty()) { + message += "; "; + } + message += cfg.associated_values[i].label + "=" + rendered; + } + + // Build the effective config for this specific event (resolved fault_code + + // overrides) so apply_condition_state tracks the right fault_code / entity. + AlarmEventConfig eff = cfg; + eff.fault_code = resolved.fault_code; + eff.severity_override = resolved.severity_override; + eff.message_override = resolved.message_override; + + // Capture the live EventId for spec-compliant Acknowledge / Confirm. + opcua::ByteString event_id; + bool have_event_id = false; + if (values[kFieldEventId].isType()) { + event_id = values[kFieldEventId].getScalarCopy(); + have_event_id = true; + } + + apply_condition_state(eff, condition_id, input, severity, message, have_event_id ? &event_id : nullptr); +} + +void OpcuaPoller::apply_condition_state(const AlarmEventConfig & cfg, const opcua::NodeId & condition_id, + const AlarmEventInput & input, uint16_t severity, const std::string & message, + const opcua::ByteString * event_id) { + // Key the runtime map on the ConditionId string form so distinct condition + // instances within the same event source remain separate (Part 9 + // §5.5.2.13). Shared by the live event path and the read-based replay. + const std::string condition_id_str = condition_id.toString(); + + AlarmEventDelivery delivery; + bool dispatch = false; { std::unique_lock lock(conditions_mutex_); auto it = conditions_.find(condition_id_str); @@ -397,27 +617,12 @@ void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vectorsecond.last_status; - - // Track the latest EventId for spec-compliant Acknowledge calls. - if (values[kFieldEventId].isType()) { - it->second.latest_event_id = values[kFieldEventId].getScalarCopy(); - if (poller_debug_enabled()) { - std::ostringstream hex_oss; - const auto * bytes = it->second.latest_event_id.data(); - for (size_t i = 0; i < std::min(it->second.latest_event_id.length(), 16); ++i) { - char buf[3]; - std::snprintf(buf, sizeof(buf), "%02x", static_cast(bytes[i]) & 0xffu); - hex_oss << buf; - } - RCLCPP_DEBUG_STREAM(opcua_poller_logger(), - "captured EventId len=" << it->second.latest_event_id.length() << " hex=" << hex_oss.str()); - } - } else { - RCLCPP_DEBUG(opcua_poller_logger(), "EventId field not a ByteString"); + const SovdAlarmStatus prev_status = it->second.last_status; + + if (event_id != nullptr) { + it->second.latest_event_id = *event_id; } auto outcome = AlarmStateMachine::compute(prev_status, input); @@ -428,42 +633,34 @@ void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector source-level catch-all. + auto r3 = NodeMap::resolve_alarm(cfg, "Unknown", "ns=2;s=Tank", "ns=0;i=2915"); + EXPECT_TRUE(r3.matched); + EXPECT_EQ(r3.fault_code, "PLC_CATCHALL"); +} + +TEST(ResolveAlarmTest, MatchBySourceNodeAndEventType) { + AlarmEventConfig cfg; + cfg.entity_id = "tank"; + cfg.mappings.push_back({"", "ns=2;s=PumpA", "ns=0;i=2915", "PLC_PUMP_A", "", ""}); + // SourceNode mismatch -> unmatched (no fallback fault_code). + EXPECT_FALSE(NodeMap::resolve_alarm(cfg, "X", "ns=2;s=PumpB", "ns=0;i=2915").matched); + // Both match. + auto r = NodeMap::resolve_alarm(cfg, "X", "ns=2;s=PumpA", "ns=0;i=2915"); + EXPECT_TRUE(r.matched); + EXPECT_EQ(r.fault_code, "PLC_PUMP_A"); +} + +TEST(ResolveAlarmTest, MappingInheritsSourceLevelOverridesWhenUnset) { + AlarmEventConfig cfg; + cfg.entity_id = "tank"; + cfg.severity_override = "CRITICAL"; + cfg.message_override = "src-msg"; + cfg.mappings.push_back({"Cond", "", "", "PLC_X", "", ""}); // no per-mapping overrides + auto r = NodeMap::resolve_alarm(cfg, "Cond", "s", "e"); + EXPECT_EQ(r.severity_override, "CRITICAL"); + EXPECT_EQ(r.message_override, "src-msg"); +} + +TEST_F(NodeMapTest, LoadsMappingsAndAssociatedValues) { + std::string path = "/tmp/test_node_map_mappings.yaml"; + std::ofstream f(path); + f << R"( +area_id: test +component_id: test +event_alarms: + - alarm_source: "ns=0;i=2253" + entity_id: plant + fault_code: PLC_GENERIC + mappings: + - condition_name: "Overpressure" + fault_code: PLC_OVERPRESSURE + severity_override: ERROR + message: "Tank overpressure" + - condition_name: "LowLevel" + fault_code: PLC_LOW_LEVEL + associated_values: + - "SD_1" + - name: "SD_2" + label: "Setpoint" +)"; + f.close(); + + NodeMap map; + ASSERT_TRUE(map.load(path)); + ASSERT_EQ(map.event_alarms().size(), 1u); + const auto & cfg = map.event_alarms()[0]; + EXPECT_EQ(cfg.fault_code, "PLC_GENERIC"); + ASSERT_EQ(cfg.mappings.size(), 2u); + EXPECT_EQ(cfg.mappings[0].match_condition_name, "Overpressure"); + EXPECT_EQ(cfg.mappings[0].fault_code, "PLC_OVERPRESSURE"); + ASSERT_EQ(cfg.associated_values.size(), 2u); + EXPECT_EQ(cfg.associated_values[0].name, "SD_1"); + EXPECT_EQ(cfg.associated_values[0].label, "SD_1"); // defaults to name + EXPECT_EQ(cfg.associated_values[1].label, "Setpoint"); + + // find_event_alarm resolves a mapping fault_code, not just the source code. + EXPECT_NE(map.find_event_alarm("plant", "PLC_LOW_LEVEL"), nullptr); + EXPECT_NE(map.find_event_alarm("plant", "PLC_GENERIC"), nullptr); +} + +TEST_F(NodeMapTest, MappingOnlyEntryNeedsNoSourceFaultCode) { + std::string path = "/tmp/test_node_map_mapping_only.yaml"; + std::ofstream f(path); + f << R"( +area_id: test +component_id: test +event_alarms: + - alarm_source: "ns=0;i=2253" + entity_id: plant + mappings: + - condition_name: "A" + fault_code: PLC_A +)"; + f.close(); + + NodeMap map; + ASSERT_TRUE(map.load(path)); + ASSERT_EQ(map.event_alarms().size(), 1u); + EXPECT_TRUE(map.event_alarms()[0].fault_code.empty()); + EXPECT_EQ(map.event_alarms()[0].mappings.size(), 1u); +} + +TEST_F(NodeMapTest, RejectsMappingFaultCodeCollisionWithThreshold) { + // A mapping fault_code that collides with a threshold alarm fault_code on + // the same entity must be rejected, same as a source-level collision. + std::string path = "/tmp/test_node_map_mapping_collision.yaml"; + std::ofstream f(path); + f << R"( +area_id: test +component_id: test +nodes: + - node_id: "ns=1;i=1" + entity_id: tank + data_name: pressure + alarm: + fault_code: PLC_OVERPRESSURE + threshold: 90.0 +event_alarms: + - alarm_source: "ns=0;i=2253" + entity_id: tank + mappings: + - condition_name: "Overpressure" + fault_code: PLC_OVERPRESSURE +)"; + f.close(); + + NodeMap map; + EXPECT_FALSE(map.load(path)); +} + } // namespace ros2_medkit_gateway diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp index 6f23b3d6b..78592fad3 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp @@ -59,6 +59,14 @@ TEST(OpcuaClientTest, ReadWhenDisconnected) { EXPECT_FALSE(result.good); } +// Issue #389 (read-based active-condition replay): browsing for conditions on a +// disconnected client returns nothing rather than throwing. +TEST(OpcuaClientTest, ReadSourceConditionsWhenDisconnected) { + OpcuaClient client; + auto conds = client.read_source_conditions(opcua::NodeId(0, UA_NS0ID_SERVER)); + EXPECT_TRUE(conds.empty()); +} + TEST(OpcuaClientTest, WriteWhenDisconnected) { OpcuaClient client; EXPECT_FALSE(client.write_value({1, "SomeNode"}, 42.0)); diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_plugin.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_plugin.cpp index bda73f832..6c17c8ae2 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_plugin.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_plugin.cpp @@ -294,4 +294,17 @@ TEST_F(OpcuaPluginTest, GetFaultNotFound) { EXPECT_EQ(result.error().code, FaultProviderError::FaultNotFound); } +// Issue #389: condition-replay strategy parsing (used by the poller's +// reconnect / restart active-condition replay). +TEST(ConditionReplayStrategyParse, KnownValues) { + EXPECT_EQ(OpcuaPoller::parse_replay_strategy("method"), ConditionReplayStrategy::Method); + EXPECT_EQ(OpcuaPoller::parse_replay_strategy("read"), ConditionReplayStrategy::Read); + EXPECT_EQ(OpcuaPoller::parse_replay_strategy("read_fallback"), ConditionReplayStrategy::Read); + EXPECT_EQ(OpcuaPoller::parse_replay_strategy("off"), ConditionReplayStrategy::Off); + EXPECT_EQ(OpcuaPoller::parse_replay_strategy("auto"), ConditionReplayStrategy::Auto); + EXPECT_EQ(OpcuaPoller::parse_replay_strategy("AUTO"), ConditionReplayStrategy::Auto); + // Unknown -> safe default (Auto: method with read fallback). + EXPECT_EQ(OpcuaPoller::parse_replay_strategy("bogus"), ConditionReplayStrategy::Auto); +} + } // namespace ros2_medkit_gateway From 584915ee902f67beaa9f2b11a5de2454774837b3 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Tue, 30 Jun 2026 19:20:33 +0200 Subject: [PATCH 04/11] fix(opcua,gateway): harden secure profile and replay reconcile Fail closed when auth/TLS is enabled but its config is invalid (throw instead of serving unauthenticated/plaintext). Warn loudly when OPC-UA credentials would cross an unencrypted channel. Stop the read-based replay from false-clearing active faults on a failed/disconnected scan, and keep conditions whose ActiveState read fails transiently. Refs #477 #478 #479 #480 --- src/ros2_medkit_gateway/src/gateway_node.cpp | 22 ++++-- .../test/test_gateway_node.cpp | 41 +++++++++++ .../ros2_medkit_opcua/opcua_client.hpp | 30 +++++++- .../ros2_medkit_opcua/opcua_poller.hpp | 20 +++++- .../ros2_medkit_opcua/src/opcua_client.cpp | 69 ++++++++++++++----- .../ros2_medkit_opcua/src/opcua_plugin.cpp | 20 +++--- .../ros2_medkit_opcua/src/opcua_poller.cpp | 53 +++++++++++--- .../test/test_opcua_client.cpp | 40 +++++++++++ .../test/test_opcua_plugin.cpp | 35 ++++++++++ 9 files changed, 286 insertions(+), 44 deletions(-) diff --git a/src/ros2_medkit_gateway/src/gateway_node.cpp b/src/ros2_medkit_gateway/src/gateway_node.cpp index 716b5e8ea..e87071b4a 100644 --- a/src/ros2_medkit_gateway/src/gateway_node.cpp +++ b/src/ros2_medkit_gateway/src/gateway_node.cpp @@ -496,8 +496,15 @@ GatewayNode::GatewayNode(const rclcpp::NodeOptions & options) : Node("ros2_medki .build(); // Note: HttpServerManager will log TLS configuration details } catch (const std::exception & e) { - RCLCPP_ERROR(get_logger(), "Invalid TLS configuration: %s. TLS disabled.", e.what()); - tls_config_ = TlsConfig{}; // Disabled + // Fail closed: TLS was explicitly requested but could not be built (e.g. + // missing/invalid cert or key). Refuse to start rather than silently + // serving plaintext HTTP. Disabling here would expose the API in the + // clear under a configuration that asked for encryption. + RCLCPP_FATAL(get_logger(), + "TLS is enabled (server.tls.enabled=true) but the TLS configuration is invalid: %s. " + "Refusing to start in plaintext - fix the certificate/key configuration or disable TLS.", + e.what()); + throw std::runtime_error(std::string("Invalid TLS configuration: ") + e.what()); } } else { RCLCPP_INFO(get_logger(), "TLS/HTTPS: disabled"); @@ -551,8 +558,15 @@ GatewayNode::GatewayNode(const rclcpp::NodeOptions & options) : Node("ros2_medki algorithm_to_string(auth_config_.jwt_algorithm).c_str(), get_parameter("auth.require_auth_for").as_string().c_str()); } catch (const std::exception & e) { - RCLCPP_ERROR(get_logger(), "Invalid authentication configuration: %s. Authentication disabled.", e.what()); - auth_config_ = AuthConfig{}; // Disabled + // Fail closed: authentication was explicitly requested but could not be + // built (e.g. empty jwt_secret). Refuse to start rather than silently + // serving an unauthenticated API under a configuration that asked for + // auth. + RCLCPP_FATAL(get_logger(), + "Authentication is enabled (auth.enabled=true) but the auth configuration is invalid: %s. " + "Refusing to start unauthenticated - fix the auth configuration or disable auth.", + e.what()); + throw std::runtime_error(std::string("Invalid authentication configuration: ") + e.what()); } } else { RCLCPP_INFO(get_logger(), "Authentication: disabled"); diff --git a/src/ros2_medkit_gateway/test/test_gateway_node.cpp b/src/ros2_medkit_gateway/test/test_gateway_node.cpp index e467c7849..702210149 100644 --- a/src/ros2_medkit_gateway/test/test_gateway_node.cpp +++ b/src/ros2_medkit_gateway/test/test_gateway_node.cpp @@ -908,6 +908,47 @@ TEST_F(TestGatewayNode, test_invalid_function_id_bad_request) { EXPECT_EQ(res->status, 400); } +// ============================================================================= +// Secure-profile fail-closed tests (issue #477): when auth/TLS is explicitly +// enabled but its configuration is invalid, the node must refuse to start +// rather than silently fall back to an unauthenticated / plaintext server. +// ============================================================================= + +TEST_F(TestGatewayNode, auth_enabled_with_empty_secret_fails_closed) { + // auth.enabled=true with an empty HS256 secret: the AuthConfigBuilder rejects + // it, and the node must propagate that as a fatal (throw) instead of disabling + // auth and serving unauthenticated. + node_.reset(); + int free_port = reserve_free_port(); + ASSERT_NE(free_port, 0); + + rclcpp::NodeOptions options; + options.parameter_overrides({ + rclcpp::Parameter("server.port", free_port), + rclcpp::Parameter("auth.enabled", true), + rclcpp::Parameter("auth.jwt_secret", std::string("")), + rclcpp::Parameter("auth.jwt_algorithm", std::string("HS256")), + }); + EXPECT_THROW({ auto n = std::make_shared(options); }, std::exception); +} + +TEST_F(TestGatewayNode, tls_enabled_with_missing_cert_fails_closed) { + // server.tls.enabled=true with no cert/key: the TlsConfigBuilder rejects it, + // and the node must refuse to start in plaintext. + node_.reset(); + int free_port = reserve_free_port(); + ASSERT_NE(free_port, 0); + + rclcpp::NodeOptions options; + options.parameter_overrides({ + rclcpp::Parameter("server.port", free_port), + rclcpp::Parameter("server.tls.enabled", true), + rclcpp::Parameter("server.tls.cert_file", std::string("")), + rclcpp::Parameter("server.tls.key_file", std::string("")), + }); + EXPECT_THROW({ auto n = std::make_shared(options); }, std::exception); +} + // ============================================================================= // Entity type path extraction tests // ============================================================================= diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp index 558e38179..3b1a548a8 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp @@ -275,15 +275,33 @@ class OpcuaClient { bool retain{false}; uint16_t severity{0}; std::string message; + /// True when the condition's ActiveState/Id node resolved but its value + /// could not be read this scan (transient read failure). The caller must + /// treat the condition as still present (do not clear it) even though its + /// state fields fell back to their defaults. + bool state_read_failed{false}; }; /// Browse ``source_node`` for AlarmCondition instances and read their /// current state (ActiveState/Id, AckedState/Id, ConfirmedState/Id, /// EnabledState/Id, Retain, Severity, Message). Only immediate children /// that expose an ``ActiveState`` node are treated as conditions; other - /// children are skipped. Returns an empty vector when disconnected or when - /// the source has no readable conditions. - std::vector read_source_conditions(const opcua::NodeId & source_node); + /// children are skipped. + /// + /// ``scan_ok`` (when provided) distinguishes a successful empty scan from a + /// failed one: it is set true only when the source browse completed, and + /// false when disconnected or when the browse raised a Bad status. Callers + /// must NOT reconcile (clear) active faults for a source whose scan failed, + /// otherwise a transient disconnect would falsely clear live alarms. + /// + /// NOTE: this read-based fallback assumes each AlarmCondition instance is + /// browseable as an immediate child of its alarm source via + /// HierarchicalReferences. That holds for the open62541 reference server, + /// but has NOT yet been validated against a real Siemens S7-1500 (whose + /// address-space layout for condition instances must be confirmed before + /// this path can be claimed to work there). + std::vector read_source_conditions(const opcua::NodeId & source_node, + bool * scan_ok = nullptr); /// Get server description string (for status endpoint) std::string server_description() const; @@ -314,6 +332,12 @@ class OpcuaClient { /// encrypted channel. static bool requires_secure_channel(const OpcuaClientConfig & config); + /// True when user credentials (username/password or X.509) would be sent + /// over an unencrypted SecureChannel: a non-anonymous identity combined with + /// no secured channel. Such credentials can be intercepted on the wire and + /// the caller is expected to warn loudly. + static bool credentials_sent_in_clear(const OpcuaClientConfig & config); + private: struct Impl; std::unique_ptr impl_; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp index bb0daf534..eac345dd5 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp @@ -72,6 +72,10 @@ struct ConditionRuntime { opcua::ByteString latest_event_id; std::string entity_id; std::string fault_code; + /// String form of the alarm source NodeId that owns this condition. Used by + /// the read-based reconcile to skip clearing conditions whose source scan + /// failed (issue #479). + std::string source_id; SovdAlarmStatus last_status{SovdAlarmStatus::Suppressed}; }; @@ -157,6 +161,16 @@ class OpcuaPoller { /// Case-insensitive; unknown input falls back to Auto. static ConditionReplayStrategy parse_replay_strategy(const std::string & name); + /// Decide whether a tracked condition should be cleared after a read-based + /// replay scan (issue #479). A condition is cleared only when it was active, + /// was NOT observed this scan (``seen``), and its owning source scan + /// succeeded. If the source is in ``failed_sources`` the condition is left + /// untouched - its absence from ``seen`` means "not scanned", not "gone". + /// Pure and static so the false-clear guard is unit-testable without a server. + static bool should_clear_condition(SovdAlarmStatus last_status, const std::string & condition_id, + const std::string & source_id, const std::set & seen, + const std::set & failed_sources); + private: void poll_loop(); void do_poll(); @@ -181,8 +195,10 @@ class OpcuaPoller { void read_fallback_replay(); /// Clear faults for conditions that were active before the replay but are /// no longer present/active in the read scan (``seen`` = condition ids - /// observed this scan). - void reconcile_after_read(const std::set & seen); + /// observed this scan). Conditions whose owning source is in + /// ``failed_sources`` (its browse failed this scan) are left untouched so a + /// transient disconnect cannot falsely clear live alarms (issue #479). + void reconcile_after_read(const std::set & seen, const std::set & failed_sources); /// Apply one condition state observation (from a live event or a read scan) /// to the tracked condition map + state machine, dispatching the resulting diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp index 5fbfb7cdf..435013517 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp @@ -308,6 +308,10 @@ bool OpcuaClient::requires_secure_channel(const OpcuaClientConfig & config) { return config.security_policy != SecurityPolicy::None || config.security_mode != SecurityMode::None; } +bool OpcuaClient::credentials_sent_in_clear(const OpcuaClientConfig & config) { + return config.user_auth_mode != UserAuthMode::Anonymous && !requires_secure_channel(config); +} + namespace { #ifdef UA_ENABLE_ENCRYPTION @@ -797,32 +801,49 @@ std::string OpcuaClient::server_description() const { return impl_->server_desc; } -std::vector -OpcuaClient::read_source_conditions(const opcua::NodeId & source_node) { +std::vector OpcuaClient::read_source_conditions(const opcua::NodeId & source_node, + bool * scan_ok) { std::lock_guard lock(impl_->client_mutex); std::vector out; + // Default to "scan failed" so an early return (disconnected) or a thrown + // browse never looks like a clean "no conditions" result to the caller. + if (scan_ok) { + *scan_ok = false; + } if (!impl_->connected) { return out; } // Read a Boolean two-step state variable (e.g. ActiveState/Id). Returns the - // fallback when the path is absent or not Boolean. ``found`` reports whether - // the path resolved at all (used to decide if a child is a condition). - auto read_state_id = [](opcua::Node & cond, const char * state_name, bool fallback, - bool * found) -> bool { + // fallback when the path is absent or not Boolean. ``resolved`` reports + // whether the ``state_name/Id`` path browsed successfully (used to decide if + // a child is a condition); ``read_failed`` reports that the path resolved but + // its value could not be read this scan (transient failure, distinct from the + // path simply not existing). + auto read_state_id = [](opcua::Node & cond, const char * state_name, bool fallback, bool * resolved, + bool * read_failed) -> bool { try { auto node = cond.browseChild({{0, state_name}, {0, "Id"}}); - auto val = node.readValue(); - if (found) { - *found = true; + if (resolved) { + *resolved = true; } - if (val.isType()) { - return val.getScalarCopy(); + try { + auto val = node.readValue(); + if (val.isType()) { + return val.getScalarCopy(); + } + return fallback; + } catch (const opcua::BadStatus &) { + // Path exists but the read failed transiently: keep the condition. + if (read_failed) { + *read_failed = true; + } + return fallback; } - return fallback; } catch (const opcua::BadStatus &) { - if (found) { - *found = false; + // Path absent: this child is not an alarm condition. + if (resolved) { + *resolved = false; } return fallback; } @@ -836,16 +857,20 @@ OpcuaClient::read_source_conditions(const opcua::NodeId & source_node) { snap.condition_id = child.id(); // A child is only treated as an alarm condition when ActiveState/Id - // resolves; everything else under the source is ignored. + // resolves; everything else under the source is ignored. A transient + // read failure on a resolved ActiveState keeps the condition (flagged so + // the caller does not drop/clear it) instead of silently discarding it. bool is_condition = false; - snap.active_state = read_state_id(child, "ActiveState", false, &is_condition); + bool active_read_failed = false; + snap.active_state = read_state_id(child, "ActiveState", false, &is_condition, &active_read_failed); if (!is_condition) { continue; } + snap.state_read_failed = active_read_failed; - snap.enabled_state = read_state_id(child, "EnabledState", true, nullptr); - snap.acked_state = read_state_id(child, "AckedState", true, nullptr); - snap.confirmed_state = read_state_id(child, "ConfirmedState", true, nullptr); + snap.enabled_state = read_state_id(child, "EnabledState", true, nullptr, nullptr); + snap.acked_state = read_state_id(child, "AckedState", true, nullptr, nullptr); + snap.confirmed_state = read_state_id(child, "ConfirmedState", true, nullptr, nullptr); try { auto retain = child.browseChild({{0, "Retain"}}).readValue(); @@ -880,8 +905,14 @@ OpcuaClient::read_source_conditions(const opcua::NodeId & source_node) { out.push_back(std::move(snap)); } + // The source browse completed without throwing: this is a trustworthy + // scan (possibly with zero conditions), safe to reconcile against. + if (scan_ok) { + *scan_ok = true; + } } catch (const opcua::BadStatus & e) { maybe_mark_disconnected(impl_->connected, impl_->generation, e); + // scan_ok stays false: a browse failure must not be read as "no conditions". } return out; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp index e750052dc..142591d72 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp @@ -799,16 +799,20 @@ void OpcuaPlugin::log_security_profile() const { ", MessageSecurityMode=" + mode_name(client_config_.security_mode) + ", user=" + auth_name(client_config_.user_auth_mode); - const bool unsecured = - !OpcuaClient::requires_secure_channel(client_config_) && client_config_.user_auth_mode == UserAuthMode::Anonymous; - if (unsecured) { + const bool secure_channel = OpcuaClient::requires_secure_channel(client_config_); + + if (OpcuaClient::credentials_sent_in_clear(client_config_)) { + // Username/password or X.509 credentials sent without a Sign/SignAndEncrypt + // channel travel in the clear and can be intercepted on the wire. + log_warn(profile + + ". WARNING: user credentials cross an unencrypted OPC-UA channel (MessageSecurityMode=None) - " + "they can be intercepted. Use Sign or SignAndEncrypt on an untrusted network."); + } else if (!secure_channel) { log_warn(profile + ". Unsecured - not suitable for untrusted networks."); + } else if (!client_config_.reject_untrusted) { + log_warn(profile + ". WARNING: reject_untrusted=false (accepts any server certificate)."); } else { - if (!client_config_.reject_untrusted) { - log_warn(profile + ". WARNING: reject_untrusted=false (accepts any server certificate)."); - } else { - log_info(profile); - } + log_info(profile); } } diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp index 5c6118ab1..3349cddd9 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp @@ -407,11 +407,33 @@ bool OpcuaPoller::try_condition_refresh() { void OpcuaPoller::read_fallback_replay() { // Browse each configured alarm source, read its conditions' current state, // and drive the same state machine as live events. Conditions that are no - // longer active are reconciled (cleared) afterwards. + // longer active are reconciled (cleared) afterwards - but only for sources + // whose scan succeeded, so a transient disconnect never falsely clears. + // + // This fallback assumes each AlarmCondition instance is browseable as an + // immediate child of its alarm source. Validated against the open62541 + // reference server; NOT yet validated against a real Siemens S7-1500, whose + // condition-instance address-space layout must be confirmed before this path + // can be relied on there. std::set seen; + std::set failed_sources; for (const auto & cfg : node_map_.event_alarms()) { - auto conditions = client_.read_source_conditions(cfg.source_node_id); + bool scan_ok = false; + auto conditions = client_.read_source_conditions(cfg.source_node_id, &scan_ok); + if (!scan_ok) { + // Browse failed (disconnect / Bad status). Record the source so reconcile + // does not clear its still-tracked conditions on an empty/partial scan. + failed_sources.insert(cfg.source_node_id_str); + continue; + } for (const auto & snap : conditions) { + if (snap.state_read_failed) { + // Condition is present but its ActiveState read failed transiently. + // Keep it in ``seen`` (so reconcile does not clear it) without feeding + // an unreliable state into the state machine. + seen.insert(snap.condition_id.toString()); + continue; + } AlarmEventInput input; input.enabled_state = snap.enabled_state; input.active_state = snap.active_state; @@ -440,18 +462,32 @@ void OpcuaPoller::read_fallback_replay() { apply_condition_state(eff, snap.condition_id, input, snap.severity, snap.message, /*event_id=*/nullptr); } } - reconcile_after_read(seen); + reconcile_after_read(seen, failed_sources); +} + +bool OpcuaPoller::should_clear_condition(SovdAlarmStatus last_status, const std::string & condition_id, + const std::string & source_id, const std::set & seen, + const std::set & failed_sources) { + const bool was_active = (last_status == SovdAlarmStatus::Confirmed || last_status == SovdAlarmStatus::Healed); + if (!was_active) { + return false; + } + // Never clear a condition whose source scan failed this cycle: its absence + // from ``seen`` means "not scanned", not "no longer active". + if (failed_sources.find(source_id) != failed_sources.end()) { + return false; + } + return seen.find(condition_id) == seen.end(); } -void OpcuaPoller::reconcile_after_read(const std::set & seen) { +void OpcuaPoller::reconcile_after_read(const std::set & seen, + const std::set & failed_sources) { std::vector clears; { std::unique_lock lock(conditions_mutex_); for (auto & [cid, runtime] : conditions_) { - const bool was_active = - (runtime.last_status == SovdAlarmStatus::Confirmed || runtime.last_status == SovdAlarmStatus::Healed); - if (was_active && seen.find(cid) == seen.end()) { - // Tracked as active but absent from the read scan -> the alarm + if (should_clear_condition(runtime.last_status, cid, runtime.source_id, seen, failed_sources)) { + // Tracked as active but absent from a successful read scan -> the alarm // cleared while we were offline. Clear the SOVD fault. runtime.last_status = SovdAlarmStatus::Cleared; AlarmEventDelivery d; @@ -616,6 +652,7 @@ void OpcuaPoller::apply_condition_state(const AlarmEventConfig & cfg, const opcu fresh.condition_id = condition_id; fresh.entity_id = cfg.entity_id; fresh.fault_code = cfg.fault_code; + fresh.source_id = cfg.source_node_id_str; fresh.last_status = SovdAlarmStatus::Suppressed; it = conditions_.emplace(condition_id_str, std::move(fresh)).first; } diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp index 78592fad3..5ff83c686 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_client.cpp @@ -67,6 +67,16 @@ TEST(OpcuaClientTest, ReadSourceConditionsWhenDisconnected) { EXPECT_TRUE(conds.empty()); } +// Issue #479: a disconnected scan must report scan_ok=false so the poller does +// NOT treat the empty result as "no active conditions" and falsely clear faults. +TEST(OpcuaClientTest, ReadSourceConditionsReportsScanFailureWhenDisconnected) { + OpcuaClient client; + bool scan_ok = true; // must be flipped to false + auto conds = client.read_source_conditions(opcua::NodeId(0, UA_NS0ID_SERVER), &scan_ok); + EXPECT_TRUE(conds.empty()); + EXPECT_FALSE(scan_ok); +} + TEST(OpcuaClientTest, WriteWhenDisconnected) { OpcuaClient client; EXPECT_FALSE(client.write_value({1, "SomeNode"}, 42.0)); @@ -162,6 +172,36 @@ TEST(OpcuaClientSecurity, RequiresSecureChannel) { EXPECT_TRUE(OpcuaClient::requires_secure_channel(cfg)); } +// Issue #478: non-anonymous credentials on an unencrypted channel must be +// flagged as cleartext so the plugin can warn loudly. +TEST(OpcuaClientSecurity, CredentialsSentInClear) { + OpcuaClientConfig cfg; + // Anonymous on a plain channel: nothing to leak. + EXPECT_FALSE(OpcuaClient::credentials_sent_in_clear(cfg)); + + // Username/password with no secured channel: credentials travel in clear. + cfg.user_auth_mode = UserAuthMode::UsernamePassword; + EXPECT_TRUE(OpcuaClient::credentials_sent_in_clear(cfg)); + + // X.509 identity with no secured channel: still cleartext. + cfg.user_auth_mode = UserAuthMode::X509; + EXPECT_TRUE(OpcuaClient::credentials_sent_in_clear(cfg)); + + // Once the channel is signed, credentials are protected. + cfg.security_mode = SecurityMode::Sign; + EXPECT_FALSE(OpcuaClient::credentials_sent_in_clear(cfg)); + + // Encryption via the policy alone (mode None) also protects them. + cfg.security_mode = SecurityMode::None; + cfg.security_policy = SecurityPolicy::Basic256Sha256; + EXPECT_FALSE(OpcuaClient::credentials_sent_in_clear(cfg)); + + // Anonymous is never cleartext regardless of channel. + cfg.user_auth_mode = UserAuthMode::Anonymous; + cfg.security_policy = SecurityPolicy::None; + EXPECT_FALSE(OpcuaClient::credentials_sent_in_clear(cfg)); +} + TEST(OpcuaClientSecurity, ConnectSecureWithoutCertFailsFast) { // A secured profile with no client cert/key must fail config-time, before // touching the network, and leave the client disconnected. diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_plugin.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_plugin.cpp index 6c17c8ae2..44790af24 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_plugin.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_plugin.cpp @@ -307,4 +307,39 @@ TEST(ConditionReplayStrategyParse, KnownValues) { EXPECT_EQ(OpcuaPoller::parse_replay_strategy("bogus"), ConditionReplayStrategy::Auto); } +// -- Issue #479: read-replay reconcile must not falsely clear faults --------- + +TEST(ReconcileShouldClearCondition, ClearsActiveConditionAbsentFromSuccessfulScan) { + // An active condition that the (successful) scan did not observe is gone. + std::set seen; // condition not seen this scan + std::set failed_sources; // its source scanned fine + EXPECT_TRUE(OpcuaPoller::should_clear_condition(SovdAlarmStatus::Confirmed, "cond-1", "src-1", seen, failed_sources)); + EXPECT_TRUE(OpcuaPoller::should_clear_condition(SovdAlarmStatus::Healed, "cond-1", "src-1", seen, failed_sources)); +} + +TEST(ReconcileShouldClearCondition, KeepsConditionStillSeen) { + std::set seen{"cond-1"}; + std::set failed_sources; + EXPECT_FALSE( + OpcuaPoller::should_clear_condition(SovdAlarmStatus::Confirmed, "cond-1", "src-1", seen, failed_sources)); +} + +TEST(ReconcileShouldClearCondition, SkipsClearWhenSourceScanFailed) { + // The core false-clear guard: source scan failed (browse error / disconnect), + // so the condition's absence from ``seen`` must NOT clear it. + std::set seen; // not observed... + std::set failed_sources{"src-1"}; // ...because its source failed + EXPECT_FALSE( + OpcuaPoller::should_clear_condition(SovdAlarmStatus::Confirmed, "cond-1", "src-1", seen, failed_sources)); + EXPECT_FALSE(OpcuaPoller::should_clear_condition(SovdAlarmStatus::Healed, "cond-1", "src-1", seen, failed_sources)); +} + +TEST(ReconcileShouldClearCondition, NeverClearsInactiveCondition) { + std::set seen; + std::set failed_sources; + EXPECT_FALSE(OpcuaPoller::should_clear_condition(SovdAlarmStatus::Cleared, "cond-1", "src-1", seen, failed_sources)); + EXPECT_FALSE( + OpcuaPoller::should_clear_condition(SovdAlarmStatus::Suppressed, "cond-1", "src-1", seen, failed_sources)); +} + } // namespace ros2_medkit_gateway From c02e60315dc72f8bfcbec6027fad6729eeb88837 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Tue, 30 Jun 2026 19:54:52 +0200 Subject: [PATCH 05/11] fix(opcua,gateway,docs): address review and CI findings Fail loud on unrecognized OPC-UA security_policy/security_mode/user_auth instead of silently defaulting to an insecure profile. Keep read-replay observed-but-unmapped conditions in the seen set so reconcile does not clear them, and reject a non-sequence nodes: section in the node map. Drop unused locals in EXPECT_THROW, fix the hardening table syntax and bulk_data.max_upload_size name, and add hardening to the design toctree. Refs #477 #478 #479 #480 --- src/ros2_medkit_gateway/design/hardening.rst | 26 +++++----- src/ros2_medkit_gateway/design/index.rst | 1 + .../test/test_gateway_node.cpp | 4 +- .../ros2_medkit_opcua/src/node_map.cpp | 9 ++++ .../ros2_medkit_opcua/src/opcua_plugin.cpp | 51 ++++++++++++++++--- .../ros2_medkit_opcua/src/opcua_poller.cpp | 6 ++- 6 files changed, 75 insertions(+), 22 deletions(-) diff --git a/src/ros2_medkit_gateway/design/hardening.rst b/src/ros2_medkit_gateway/design/hardening.rst index c1815e09e..c572eb05f 100644 --- a/src/ros2_medkit_gateway/design/hardening.rst +++ b/src/ros2_medkit_gateway/design/hardening.rst @@ -21,19 +21,19 @@ field profile preset ``config/gateway_params.secure.yaml`` instead of What the secure profile turns on -------------------------------- -============================ ============== =========================================== -Control Default Secure profile -============================ ============== =========================================== -``auth.enabled`` false true -``auth.require_auth_for`` write all (auth on reads + writes) -``server.tls.enabled`` false true (HTTPS, min TLS 1.3) -``cors.allowed_origins`` ``[""]`` explicit origin list (no wildcard) -``rate_limiting.enabled`` false true (global + per-client + per-endpoint) -``scripts.allow_uploads`` true false (manifest-defined scripts only) -``docs.enabled`` true false (reduced surface) -``bulk_data.max_upload`` 100 MiB 25 MiB -``locking`` on operations none lock required before mutation -============================ ============== =========================================== +================================ ============== =========================================== +Control Default Secure profile +================================ ============== =========================================== +``auth.enabled`` false true +``auth.require_auth_for`` write all (auth on reads + writes) +``server.tls.enabled`` false true (HTTPS, min TLS 1.3) +``cors.allowed_origins`` ``[""]`` explicit origin list (no wildcard) +``rate_limiting.enabled`` false true (global + per-client + per-endpoint) +``scripts.allow_uploads`` true false (manifest-defined scripts only) +``docs.enabled`` true false (reduced surface) +``bulk_data.max_upload_size`` 100 MiB 25 MiB +``locking`` on operations none lock required before mutation +================================ ============== =========================================== Credential and certificate provisioning ---------------------------------------- diff --git a/src/ros2_medkit_gateway/design/index.rst b/src/ros2_medkit_gateway/design/index.rst index 09bb0c204..a78928315 100644 --- a/src/ros2_medkit_gateway/design/index.rst +++ b/src/ros2_medkit_gateway/design/index.rst @@ -674,6 +674,7 @@ Additional Design Documents aggregation dto_contract entity_cache_architecture + hardening lifecycle plugin_entity_notifications ros2_subscription_architecture diff --git a/src/ros2_medkit_gateway/test/test_gateway_node.cpp b/src/ros2_medkit_gateway/test/test_gateway_node.cpp index 702210149..d0451a58a 100644 --- a/src/ros2_medkit_gateway/test/test_gateway_node.cpp +++ b/src/ros2_medkit_gateway/test/test_gateway_node.cpp @@ -929,7 +929,7 @@ TEST_F(TestGatewayNode, auth_enabled_with_empty_secret_fails_closed) { rclcpp::Parameter("auth.jwt_secret", std::string("")), rclcpp::Parameter("auth.jwt_algorithm", std::string("HS256")), }); - EXPECT_THROW({ auto n = std::make_shared(options); }, std::exception); + EXPECT_THROW(std::make_shared(options), std::exception); } TEST_F(TestGatewayNode, tls_enabled_with_missing_cert_fails_closed) { @@ -946,7 +946,7 @@ TEST_F(TestGatewayNode, tls_enabled_with_missing_cert_fails_closed) { rclcpp::Parameter("server.tls.cert_file", std::string("")), rclcpp::Parameter("server.tls.key_file", std::string("")), }); - EXPECT_THROW({ auto n = std::make_shared(options); }, std::exception); + EXPECT_THROW(std::make_shared(options), std::exception); } // ============================================================================= diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp index e86e910ef..ebd7a76c3 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp @@ -184,6 +184,15 @@ bool NodeMap::load(const std::string & yaml_path) { node_id_index_.clear(); auto nodes = root["nodes"]; + // A ``nodes:`` key that is present but neither null nor a sequence (e.g. a + // scalar or a map) is a config error, not an absent section. Fail loudly + // instead of silently dropping every node mapping. An explicitly empty + // ``nodes:`` (null) stays valid and is handled by the emptiness check below. + if (nodes && !nodes.IsNull() && !nodes.IsSequence()) { + RCLCPP_ERROR(rclcpp::get_logger("opcua.node_map"), + "'nodes:' must be a sequence of node entries - refusing to load"); + return false; + } const bool has_nodes = nodes && nodes.IsSequence(); if (has_nodes && nodes.size() > 10000) { diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp index 142591d72..396d148bf 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp @@ -29,6 +29,7 @@ #include #include #include +#include namespace ros2_medkit_gateway { @@ -48,6 +49,44 @@ inline bool plugin_debug_enabled() { return rcutils_logging_logger_is_enabled_for("opcua.plugin", RCUTILS_LOG_SEVERITY_DEBUG); } +// Security-config parse wrappers. An unrecognized value must NOT silently fall +// back to the insecure default (SecurityPolicy::None / SecurityMode::None / +// anonymous auth): a typo would quietly downgrade a hardened link. Fail loud and +// refuse - the thrown exception disables the plugin in PluginManager::configure_plugins. +SecurityPolicy require_security_policy(const std::string & value) { + bool ok = true; + const auto parsed = OpcuaClient::parse_security_policy(value, &ok); + if (!ok) { + RCLCPP_ERROR(opcua_plugin_logger(), + "Unrecognized OPC-UA security_policy '%s'; refusing to start with an insecure fallback", + value.c_str()); + throw std::invalid_argument("opcua: unrecognized security_policy '" + value + "'"); + } + return parsed; +} + +SecurityMode require_security_mode(const std::string & value) { + bool ok = true; + const auto parsed = OpcuaClient::parse_security_mode(value, &ok); + if (!ok) { + RCLCPP_ERROR(opcua_plugin_logger(), + "Unrecognized OPC-UA security_mode '%s'; refusing to start with an insecure fallback", value.c_str()); + throw std::invalid_argument("opcua: unrecognized security_mode '" + value + "'"); + } + return parsed; +} + +UserAuthMode require_user_auth_mode(const std::string & value) { + bool ok = true; + const auto parsed = OpcuaClient::parse_user_auth_mode(value, &ok); + if (!ok) { + RCLCPP_ERROR(opcua_plugin_logger(), + "Unrecognized OPC-UA user_auth_mode '%s'; refusing to start with an insecure fallback", value.c_str()); + throw std::invalid_argument("opcua: unrecognized user_auth_mode '" + value + "'"); + } + return parsed; +} + /// Parse a JSON "value" field, coerce to the node's declared data_type, and /// validate against the optional min/max range. Shared by handle_plc_operations, /// DataProvider::write_data, and OperationProvider::execute_operation to keep @@ -119,12 +158,12 @@ void OpcuaPlugin::configure(const nlohmann::json & config) { // OPC-UA SecureChannel security + user identity. All opt-in; defaults keep // the legacy anonymous + SecurityPolicy=None behaviour. if (config.contains("security_policy")) { - client_config_.security_policy = OpcuaClient::parse_security_policy(config["security_policy"].get()); + client_config_.security_policy = require_security_policy(config["security_policy"].get()); } if (config.contains("security_mode") || config.contains("message_security_mode")) { const std::string mode_str = config.contains("security_mode") ? config["security_mode"].get() : config["message_security_mode"].get(); - client_config_.security_mode = OpcuaClient::parse_security_mode(mode_str); + client_config_.security_mode = require_security_mode(mode_str); } if (config.contains("client_cert_path")) { client_config_.client_cert_path = config["client_cert_path"].get(); @@ -147,7 +186,7 @@ void OpcuaPlugin::configure(const nlohmann::json & config) { client_config_.reject_untrusted = config["reject_untrusted"].get(); } if (config.contains("user_auth_mode")) { - client_config_.user_auth_mode = OpcuaClient::parse_user_auth_mode(config["user_auth_mode"].get()); + client_config_.user_auth_mode = require_user_auth_mode(config["user_auth_mode"].get()); } if (config.contains("username")) { client_config_.username = config["username"].get(); @@ -190,10 +229,10 @@ void OpcuaPlugin::configure(const nlohmann::json & config) { } // Security env overrides (for Docker / appliance deployments). if (auto * env = std::getenv("OPCUA_SECURITY_POLICY")) { - client_config_.security_policy = OpcuaClient::parse_security_policy(env); + client_config_.security_policy = require_security_policy(env); } if (auto * env = std::getenv("OPCUA_SECURITY_MODE")) { - client_config_.security_mode = OpcuaClient::parse_security_mode(env); + client_config_.security_mode = require_security_mode(env); } if (auto * env = std::getenv("OPCUA_CLIENT_CERT")) { client_config_.client_cert_path = env; @@ -229,7 +268,7 @@ void OpcuaPlugin::configure(const nlohmann::json & config) { client_config_.reject_untrusted = !(v == "0" || v == "false" || v == "no"); } if (auto * env = std::getenv("OPCUA_USER_AUTH")) { - client_config_.user_auth_mode = OpcuaClient::parse_user_auth_mode(env); + client_config_.user_auth_mode = require_user_auth_mode(env); } if (auto * env = std::getenv("OPCUA_USERNAME")) { client_config_.username = env; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp index 3349cddd9..6902360fe 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp @@ -434,6 +434,11 @@ void OpcuaPoller::read_fallback_replay() { seen.insert(snap.condition_id.toString()); continue; } + // Mark every successfully observed condition as seen, even if it matches + // no mapping below. Otherwise reconcile_after_read would treat a live (but + // unmapped) condition as "cleared while offline" and wrongly clear it. + seen.insert(snap.condition_id.toString()); + AlarmEventInput input; input.enabled_state = snap.enabled_state; input.active_state = snap.active_state; @@ -458,7 +463,6 @@ void OpcuaPoller::read_fallback_replay() { eff.severity_override = resolved.severity_override; eff.message_override = resolved.message_override; - seen.insert(snap.condition_id.toString()); apply_condition_state(eff, snap.condition_id, input, snap.severity, snap.message, /*event_id=*/nullptr); } } From 35a8cf9d348f8b8ffcef668838a5e55a4469696f Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Tue, 30 Jun 2026 21:32:59 +0200 Subject: [PATCH 06/11] fix(opcua): harden active-condition replay correctness Call ConditionRefresh on ConditionType i=2782 per Part 9 5.5.7, not the Server object i=2253 (root cause of BadNodeIdUnknown). Read-fallback now never clears faults for a source not positively known to expose condition instance nodes (EventNotifier-only servers like S7-1500 are preserved), filters by Retain, excludes Disabled conditions, follows HasCondition i=9006, and keeps conditions on a transient browse failure. Adds optional require_confirm_for_clear (default unchanged) for Confirm-less servers. Refs #478 --- .../ros2_medkit_opcua/README.md | 50 +++++-- .../ros2_medkit_opcua/design/index.rst | 70 ++++++++-- .../ros2_medkit_opcua/alarm_state_machine.hpp | 19 ++- .../ros2_medkit_opcua/opcua_poller.hpp | 62 ++++++++- .../ros2_medkit_opcua/src/opcua_client.cpp | 78 ++++++++++- .../ros2_medkit_opcua/src/opcua_plugin.cpp | 10 ++ .../ros2_medkit_opcua/src/opcua_poller.cpp | 131 +++++++++++++----- .../test/test_alarm_state_machine.cpp | 42 ++++++ .../test/test_opcua_plugin.cpp | 101 ++++++++++++-- 9 files changed, 474 insertions(+), 89 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/README.md b/src/ros2_medkit_plugins/ros2_medkit_opcua/README.md index cb64c919d..7e684c01a 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/README.md +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/README.md @@ -247,6 +247,7 @@ ros2_medkit_gateway: | `prefer_subscriptions` | `false` | Use OPC-UA subscriptions instead of polling | | `subscription_interval_ms` | `500` | Publishing interval for OPC-UA subscriptions when `prefer_subscriptions: true` | | `condition_replay_strategy` | `auto` | Active-condition replay on reconnect: `method`, `read`, `auto`, `off` (see below) | +| `require_confirm_for_clear` | `true` | Require both Acknowledge AND Confirm before a native alarm auto-clears. Set `false` for Confirm-less servers (e.g. Siemens S7-1500) so alarms clear on Acknowledge alone (see below) | ### OPC-UA client security (SecurityPolicy, certificates, user auth) @@ -283,13 +284,18 @@ Notes: - The encryption backend (OpenSSL) is compiled in; a build without it logs an error and refuses any secured profile. -### Active-condition replay on reconnect (issue #389) +### Active-condition replay on reconnect (issue #389/#478) When the gateway (re)subscribes after a drop or restart, conditions that are already active on the server would otherwise not be re-reported (only live -transitions flow). The standard recovery is OPC-UA `ConditionRefresh`, but some -servers (Siemens S7-1500) reject it with `BadNodeIdUnknown`. The -`condition_replay_strategy` parameter controls recovery: +transitions flow). The standard recovery is OPC-UA `ConditionRefresh`. Per +Part 9 §5.5.7 it is a method of the **ConditionType** node (`i=2782`), so the +Call is issued there (not the Server object) - calling it on the Server object +was the root cause of `BadNodeIdUnknown` rejections, including on Siemens +S7-1500, which documents `ConditionRefresh` as supported (only the optional +`ConditionRefresh2` is not). With the corrected target, `ConditionRefresh` is +expected to work on conformant servers and the read fallback should rarely be +needed. The `condition_replay_strategy` parameter controls recovery: | Value | Behaviour | |-------|-----------| @@ -298,12 +304,35 @@ servers (Siemens S7-1500) reject it with `BadNodeIdUnknown`. The | `auto` (default) | Try `ConditionRefresh`; on rejection fall back to `read` | | `off` | No replay (only live transitions surface) | -The read fallback browses each `alarm_source` for child AlarmCondition -instances and reads their state, so it relies on the conditions being -browseable under the configured source (point `alarm_source` at the owning -Object, not the Server catch-all, for full restart recovery). It does not -recover the live `EventId`, so an operator ack/confirm may need the next live -event before it succeeds. +The read fallback browses each `alarm_source` for AlarmCondition instances - +both hierarchical Object children and targets of the `HasCondition` reference +(`i=9006`), recursing one level - and reads their state. It mirrors +`ConditionRefresh` semantics: only conditions with `Retain==true` form the +active set, a `Disabled` condition (`EnabledState/Id=false`) is never treated +as active, and a transient read failure keeps the condition rather than +dropping it. + +**Safety guarantee (#478):** the read fallback NEVER clears a tracked fault for +a source that has not positively exposed at least one Condition instance node. +An EventNotifier-only server (e.g. S7-1500, which models no per-condition +instance nodes) makes the browse return zero conditions; rather than wiping the +active alarm set, the gateway preserves the last-known faults, logs an operator +warning that read-fallback is unsupported on this server, and keeps live +transitions flowing. Use `ConditionRefresh` (`method`/`auto`) on such servers. +The read fallback does not recover the live `EventId`, so an operator +ack/confirm may need the next live event before it succeeds. + +### Confirm-less servers and `require_confirm_for_clear` (issue #478) + +By default a native alarm auto-clears only after the operator has both +Acknowledged AND Confirmed it (OPC-UA Part 9 §5.7). Some servers do not +implement the optional `Confirm` transition - Siemens S7-1500 supports +`Acknowledge` / `ConditionRefresh` / `AddComment` but not `Confirm`, so +`ConfirmedState` never becomes true and an inactive alarm would latch forever. +Set `require_confirm_for_clear: false` (or `OPCUA_REQUIRE_CONFIRM_FOR_CLEAR=0`) +so the alarm clears on `Acknowledge` alone. The default (`true`) is unchanged +and spec-strict; the relaxed path still requires acknowledgement and needs +real-S7-1500 validation. Node map entries also support an optional `ros2_topic` field to override the auto-generated ROS 2 topic name for the PLC value bridge: @@ -339,6 +368,7 @@ Write operations use the `set_` prefix convention: | `OPCUA_USERNAME` / `OPCUA_PASSWORD` | Username-password identity | | `OPCUA_USER_CERT` | X.509 user-token cert (DER) | | `OPCUA_CONDITION_REPLAY` | `method` / `read` / `auto` / `off` | +| `OPCUA_REQUIRE_CONFIRM_FOR_CLEAR` | `0`/`false`/`no`/`off` to clear native alarms on Acknowledge alone (Confirm-less servers) | ## Hardware Deployment diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/design/index.rst b/src/ros2_medkit_plugins/ros2_medkit_opcua/design/index.rst index 77ecfa33b..994d49280 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/design/index.rst +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/design/index.rst @@ -249,15 +249,27 @@ Decision order, first match wins: | | | below) | +-----+--------------------------------------+---------------------------------------+ | 5b | ``ActiveState == false``, | ``CLEARED`` | -| | ``Acked == true``, | | -| | ``Confirmed == true`` | | +| | ``Acked == true`` and | | +| | (``Confirmed == true`` or | | +| | not ``require_confirm_for_clear``) | | +-----+--------------------------------------+---------------------------------------+ -``Retain`` is intentionally NOT used for state determination. Per Part 9 -§5.5.2.10 it controls visibility during ``ConditionRefresh`` bursts only; -lifecycle is driven entirely by Active / Acked / Confirmed. The SOVD -``PREFAILED`` state has no native equivalent and is reserved for the -threshold-polling pre-trigger path. +``require_confirm_for_clear`` (default ``true``) gates rule 5b on both +Acknowledge and Confirm. Some servers do not implement the optional ``Confirm`` +transition - Siemens S7-1500 supports ``Acknowledge`` / ``ConditionRefresh`` / +``AddComment`` but not ``Confirm`` - so ``ConfirmedState`` never becomes true +and the alarm would latch in ``HEALED`` forever. Setting the flag ``false`` +(YAML ``require_confirm_for_clear`` / env ``OPCUA_REQUIRE_CONFIRM_FOR_CLEAR``) +clears such an alarm on ``Acknowledge`` alone. The default is unchanged +(spec-strict, no regression) and the relaxed path still requires +acknowledgement; it needs real-S7-1500 validation (issue #478). + +``Retain`` is NOT used by this live state-machine table (lifecycle is driven by +Active / Acked / Confirmed; per Part 9 §5.5.2.10 ``Retain`` controls visibility +during ``ConditionRefresh`` bursts). It IS used in the read-based reconnect +replay path (below) as the interesting-state filter, mirroring what +``ConditionRefresh`` would replay. The SOVD ``PREFAILED`` state has no native +equivalent and is reserved for the threshold-polling pre-trigger path. .. note:: @@ -298,15 +310,45 @@ ConditionRefresh ---------------- After creating event monitored items the plugin invokes ``ConditionRefresh`` -(Server object ``i=2253``, method ``i=3875``) so the server pushes any -condition that fired before the subscription started. The same call fires -on every successful reconnect. +so the server pushes any condition that fired before the subscription started. +The same call fires on every successful reconnect. + +Per Part 9 §5.5.7 ``ConditionRefresh`` is a Method of the **ConditionType**, so +the Call's ObjectId is the well-known ConditionType node ``i=2782`` (method +``i=3875``), NOT the Server object ``i=2253``. Calling it on the Server object +was the root cause of ``BadNodeIdUnknown`` rejections on conformant servers, +including Siemens S7-1500, which documents ``ConditionRefresh`` as supported +(only the optional ``ConditionRefresh2`` is not). With the corrected target the +method is expected to succeed on Siemens, making the read fallback unnecessary +there (issue #478). The bracketing ``RefreshStartEventType`` (i=2787) and ``RefreshEndEventType`` (i=2788) are recognized and used to set a diagnostic flag; live notifications arriving during the burst are applied normally because the state machine is driven by per-condition ``ConditionId`` and runs idempotently. +Read-based replay fallback +-------------------------- + +When ``ConditionRefresh`` is rejected (``condition_replay_strategy`` = +``read`` / ``auto``) the plugin browses each ``alarm_source`` for AlarmCondition +instances - hierarchical Object children plus targets of the ``HasCondition`` +reference (``i=9006``), recursing one level - reads their current state, and +drives the same state machine. It mirrors ``ConditionRefresh`` semantics: + +- only conditions with ``Retain == true`` form the active set; +- a ``Disabled`` condition (``EnabledState/Id = false``) is never active; +- a transient browse/read failure keeps the condition (it is not dropped and + must not be reconciled away). + +**Safety gate (issue #478).** The reconcile that clears faults for conditions +no longer present NEVER fires for a source that has not positively exposed at +least one Condition instance node. An EventNotifier-only server (S7-1500 models +no per-condition instance nodes) returns a successful but empty browse; rather +than wiping the tracked active alarms, the plugin preserves them, logs an +operator warning, and relies on ``ConditionRefresh`` / live transitions. This +prevents the net-negative outcome where a reconnect erased a still-active alarm. + Acknowledge / Confirm round-trip -------------------------------- @@ -336,7 +378,13 @@ Vendor matrix | Vendor / runtime | AlarmConditionType | Notes | +======================+=====================+===============================+ | Siemens S7-1500 | yes (FW V2.9+) | ProDiag, Program_Alarm, | -| | | system diagnostics | +| | | system diagnostics. | +| | | ConditionRefresh + | +| | | Acknowledge supported, NO | +| | | Confirm (set | +| | | require_confirm_for_clear | +| | | false). EventNotifier-only: | +| | | no condition instance nodes | +----------------------+---------------------+-------------------------------+ | Beckhoff TwinCAT 3 | yes (TF6100) | ``Confirm`` propagates to | | | | PLC code; ``Ack`` does not | diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/alarm_state_machine.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/alarm_state_machine.hpp index 3b7119c0a..37579cd30 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/alarm_state_machine.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/alarm_state_machine.hpp @@ -86,6 +86,15 @@ struct AlarmEventInput { /// 5. ActiveState == false -> Healed or Cleared based on /// Acked + Confirmed /// +/// ``require_confirm_for_clear`` (default true) controls rule 5: a fully +/// cleared alarm needs both Acked AND Confirmed. Some servers do NOT implement +/// the optional Confirm transition (Siemens S7-1500 supports Acknowledge / +/// ConditionRefresh / AddComment but not Confirm), so ConfirmedState never +/// becomes true and the alarm would latch in Healed forever. Setting the flag +/// false lets such a deployment clear on Acknowledge alone. Default true keeps +/// the spec-strict behaviour for conformant servers (no regression). Needs +/// real-S7-1500 validation. +/// /// Retain is intentionally not modeled by this state machine and does not /// affect ``compute()``. Per Part 9 §5.5.2.10 it controls visibility during /// ConditionRefresh bursts rather than the lifecycle mapping implemented @@ -100,7 +109,8 @@ class AlarmStateMachine { AlarmAction action; }; - static Outcome compute(SovdAlarmStatus prev_status, const AlarmEventInput & in) { + static Outcome compute(SovdAlarmStatus prev_status, const AlarmEventInput & in, + bool require_confirm_for_clear = true) { // Rule 1: branch events are recorded in the fault_manager event log // (caller's responsibility) but never advance the primary lifecycle. if (in.branch_id_present) { @@ -141,8 +151,11 @@ class AlarmStateMachine { // Rule 5: ActiveState=false. Cleared only when both Acked AND Confirmed // have been completed by the operator; until then the alarm is latched - // (HEALED) so the operator sees the unfinished workflow item. - if (in.acked_state && in.confirmed_state) { + // (HEALED) so the operator sees the unfinished workflow item. When + // ``require_confirm_for_clear`` is false (Confirm-less server, e.g. S7-1500) + // the Confirmed gate is dropped so Acknowledge alone clears the alarm. + const bool clear_ready = in.acked_state && (in.confirmed_state || !require_confirm_for_clear); + if (clear_ready) { if (prev_status == SovdAlarmStatus::Cleared || prev_status == SovdAlarmStatus::Suppressed) { return {SovdAlarmStatus::Cleared, AlarmAction::NoOp}; } diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp index eac345dd5..81cf4dad1 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp @@ -103,6 +103,14 @@ struct PollerConfig { /// Default Auto: ConditionRefresh with a read-based fallback so hardened /// servers that reject the method still recover their active alarms. ConditionReplayStrategy condition_replay_strategy{ConditionReplayStrategy::Auto}; + /// When true (default) a native AlarmCondition auto-clears only after BOTH + /// Acknowledge and Confirm (OPC-UA Part 9 §5.7). Set false for servers that + /// do not implement the optional Confirm transition (e.g. Siemens S7-1500, + /// which supports Acknowledge / ConditionRefresh / AddComment but not + /// Confirm) so the alarm clears on Acknowledge alone instead of latching + /// forever. Default true preserves the spec-strict behaviour. Issue #478; + /// needs real-S7-1500 validation. + bool require_confirm_for_clear{true}; /// Optional warn-level log sink for operator-visible failures inside the /// poll thread. Set by the plugin owning the poller to its log_warn /// helper so events like ``ConditionRefresh failed`` reach the ROS 2 log @@ -161,15 +169,36 @@ class OpcuaPoller { /// Case-insensitive; unknown input falls back to Auto. static ConditionReplayStrategy parse_replay_strategy(const std::string & name); + /// How a read-scan condition snapshot participates in the read-based replay. + /// - Feed: interesting + reliable -> mark seen and drive the state + /// machine (mirrors a ConditionRefresh replay event). + /// - KeepOnly: present but unreliable (transient read/browse failure) -> + /// mark seen so reconcile does not clear it, but do NOT feed an + /// unreliable state into the state machine. + /// - Skip: not interesting (Disabled, or Retain=false) -> neither + /// marked seen nor fed, so a stale tracked fault can reconcile. + enum class ReadReplayDisposition { Feed, KeepOnly, Skip }; + + /// Classify one read-scan condition snapshot for reconnect replay (issue + /// #478). Mirrors ConditionRefresh's interesting-state filter: a Disabled + /// condition (EnabledState/Id=false) is never active, and only conditions + /// with Retain==true are replayed; a transient read failure is kept but not + /// trusted. Pure / static so the filter is unit-testable without a server. + static ReadReplayDisposition classify_read_snapshot(const OpcuaClient::ConditionStateSnapshot & snap); + /// Decide whether a tracked condition should be cleared after a read-based - /// replay scan (issue #479). A condition is cleared only when it was active, - /// was NOT observed this scan (``seen``), and its owning source scan - /// succeeded. If the source is in ``failed_sources`` the condition is left - /// untouched - its absence from ``seen`` means "not scanned", not "gone". + /// replay scan (issue #479 / #478). A condition is cleared only when it was + /// active, was NOT observed this scan (``seen``), its owning source scan + /// succeeded (not in ``failed_sources``), AND its source is positively known + /// to model Condition instances as address-space nodes (in + /// ``modeled_sources``). The last gate is the #478 safety guarantee: an + /// EventNotifier-only server (e.g. S7-1500) exposes no per-condition nodes, + /// so its empty read scan must never wipe the tracked fault set. /// Pure and static so the false-clear guard is unit-testable without a server. static bool should_clear_condition(SovdAlarmStatus last_status, const std::string & condition_id, const std::string & source_id, const std::set & seen, - const std::set & failed_sources); + const std::set & failed_sources, + const std::set & modeled_sources); private: void poll_loop(); @@ -198,7 +227,15 @@ class OpcuaPoller { /// observed this scan). Conditions whose owning source is in /// ``failed_sources`` (its browse failed this scan) are left untouched so a /// transient disconnect cannot falsely clear live alarms (issue #479). - void reconcile_after_read(const std::set & seen, const std::set & failed_sources); + /// ``modeled_sources`` lists the sources positively known to expose Condition + /// instance nodes; conditions whose source is absent from it are never + /// cleared, so an EventNotifier-only server cannot wipe live alarms (#478). + void reconcile_after_read(const std::set & seen, const std::set & failed_sources, + const std::set & modeled_sources); + + /// Emit an operator-visible warning via the configured ``log_warn`` sink, + /// falling back to the poller's ROS 2 logger. + void warn_operator(const std::string & msg); /// Apply one condition state observation (from a live event or a read scan) /// to the tracked condition map + state machine, dispatching the resulting @@ -236,6 +273,19 @@ class OpcuaPoller { mutable std::shared_mutex conditions_mutex_; std::unordered_map conditions_; // ConditionId stringForm -> runtime + // Issue #478 read-fallback safety state. Touched only from the replay path + // (start() before the poll thread exists, then the poll thread on reconnect), + // never concurrently, so no extra lock is needed. + // + // ``read_modeled_sources_``: alarm sources positively known to expose + // Condition instances as address-space nodes (a read scan yielded >=1 + // ActiveState/Id condition). Only such sources are eligible for read-based + // clearing; EventNotifier-only servers (S7-1500) never enter this set, so + // their tracked faults survive an empty read scan. + std::set read_modeled_sources_; + // Sources already warned about as read-fallback-unsupported (warn once each). + std::set read_unsupported_warned_sources_; + // ConditionRefresh bracketing state. open62541 sends the buffered // historical condition burst between RefreshStartEvent and // RefreshEndEvent; we apply each event during the burst as normal but diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp index 435013517..ebec809fa 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -814,14 +815,26 @@ std::vector OpcuaClient::read_source_condit return out; } + // A browse failure during state-variable resolution is "path absent" only + // for the explicit not-found status codes. Anything else (timeout, channel + // drop, server busy) is a transient failure that must NOT be read as "this + // child is not a condition" - otherwise a momentary glitch silently drops a + // live condition and lets reconcile clear its fault. + auto is_path_absent_code = [](UA_StatusCode code) { + return code == UA_STATUSCODE_BADNOMATCH || code == UA_STATUSCODE_BADNODEIDUNKNOWN || + code == UA_STATUSCODE_BADNOTFOUND || code == UA_STATUSCODE_BADBROWSENAMEINVALID; + }; + // Read a Boolean two-step state variable (e.g. ActiveState/Id). Returns the // fallback when the path is absent or not Boolean. ``resolved`` reports // whether the ``state_name/Id`` path browsed successfully (used to decide if // a child is a condition); ``read_failed`` reports that the path resolved but // its value could not be read this scan (transient failure, distinct from the - // path simply not existing). - auto read_state_id = [](opcua::Node & cond, const char * state_name, bool fallback, bool * resolved, - bool * read_failed) -> bool { + // path simply not existing). A transient BROWSE failure (as opposed to a + // genuine path-absent code) is also treated as resolved+read_failed so the + // condition is kept and flagged rather than dropped (issue #478). + auto read_state_id = [&is_path_absent_code](opcua::Node & cond, const char * state_name, bool fallback, + bool * resolved, bool * read_failed) -> bool { try { auto node = cond.browseChild({{0, state_name}, {0, "Id"}}); if (resolved) { @@ -840,18 +853,69 @@ std::vector OpcuaClient::read_source_condit } return fallback; } - } catch (const opcua::BadStatus &) { - // Path absent: this child is not an alarm condition. + } catch (const opcua::BadStatus & e) { + if (is_path_absent_code(e.code())) { + // Path genuinely absent: this child is not an alarm condition. + if (resolved) { + *resolved = false; + } + return fallback; + } + // Transient browse failure: conservatively keep the condition and flag it + // so the caller does not drop it (and let reconcile wrongly clear it). if (resolved) { - *resolved = false; + *resolved = true; + } + if (read_failed) { + *read_failed = true; } return fallback; } }; + // Collect candidate condition nodes from the source. AlarmCondition + // instances may be linked as hierarchical Object children OR via the + // non-hierarchical HasCondition reference (i=9006, Part 9 §5.5.3); we follow + // both and recurse one level through HasCondition so conditions owned by an + // intermediate Object are still found. De-duplicated by NodeId string. + constexpr uint32_t kHasConditionRefId = 9006; + auto collect_candidates = [&](opcua::Node & src, std::vector> & out_nodes, + std::set & seen_ids) { + auto add = [&](opcua::Node n) { + if (seen_ids.insert(n.id().toString()).second) { + out_nodes.push_back(std::move(n)); + } + }; + auto hier = src.browseChildren(opcua::ReferenceTypeId::HierarchicalReferences, opcua::NodeClass::Object); + for (auto & c : hier) { + add(c); + } + try { + auto linked = src.browseChildren(opcua::NodeId(0, kHasConditionRefId), opcua::NodeClass::Object); + for (auto & c : linked) { + add(c); + } + } catch (const opcua::BadStatus &) { + // HasCondition browse unsupported on this server: hierarchical children + // already collected, continue. + } + // One-level recursion through HasCondition from each hierarchical child. + for (auto & c : hier) { + try { + auto linked = c.browseChildren(opcua::NodeId(0, kHasConditionRefId), opcua::NodeClass::Object); + for (auto & gc : linked) { + add(gc); + } + } catch (const opcua::BadStatus &) { + } + } + }; + try { opcua::Node src(impl_->client, source_node); - auto children = src.browseChildren(opcua::ReferenceTypeId::HierarchicalReferences, opcua::NodeClass::Object); + std::vector> children; + std::set candidate_ids; + collect_candidates(src, children, candidate_ids); for (auto & child : children) { ConditionStateSnapshot snap; snap.condition_id = child.id(); diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp index 396d148bf..99aee3254 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_plugin.cpp @@ -219,6 +219,16 @@ void OpcuaPlugin::configure(const nlohmann::json & config) { if (auto * env = std::getenv("OPCUA_CONDITION_REPLAY")) { poller_config_.condition_replay_strategy = OpcuaPoller::parse_replay_strategy(env); } + // Issue #478: drop the Confirm gate for servers that do not implement the + // optional Confirm transition (e.g. Siemens S7-1500) so alarms clear on + // Acknowledge alone. Default true keeps the spec-strict behaviour. + if (config.contains("require_confirm_for_clear")) { + poller_config_.require_confirm_for_clear = config["require_confirm_for_clear"].get(); + } + if (auto * env = std::getenv("OPCUA_REQUIRE_CONFIRM_FOR_CLEAR")) { + const std::string v = env; + poller_config_.require_confirm_for_clear = !(v == "0" || v == "false" || v == "no" || v == "off"); + } // Environment variables override YAML config (for Docker) if (auto * env = std::getenv("OPCUA_ENDPOINT_URL")) { diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp index 6902360fe..d038e5a68 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp @@ -350,22 +350,21 @@ ConditionReplayStrategy OpcuaPoller::parse_replay_strategy(const std::string & n return ConditionReplayStrategy::Auto; } -void OpcuaPoller::replay_active_conditions() { - // Shared warn sink (operator-visible) used by the strategy branches. - auto warn = [this](const std::string & msg) { - if (config_.log_warn) { - config_.log_warn(msg); - } else { - RCLCPP_WARN(opcua_poller_logger(), "%s", msg.c_str()); - } - }; +void OpcuaPoller::warn_operator(const std::string & msg) { + if (config_.log_warn) { + config_.log_warn(msg); + } else { + RCLCPP_WARN(opcua_poller_logger(), "%s", msg.c_str()); + } +} +void OpcuaPoller::replay_active_conditions() { switch (config_.condition_replay_strategy) { case ConditionReplayStrategy::Off: return; case ConditionReplayStrategy::Method: if (!try_condition_refresh() && !condition_refresh_warned_) { - warn( + warn_operator( "OPC-UA ConditionRefresh rejected and replay strategy is 'method'; active conditions " "will NOT be replayed on reconnect with this server. Live transitions still flow. See issue #389."); condition_refresh_warned_ = true; @@ -382,7 +381,8 @@ void OpcuaPoller::replay_active_conditions() { return; } if (!condition_refresh_warned_) { - warn("OPC-UA ConditionRefresh rejected; using read-based active-condition replay fallback (issue #389)."); + warn_operator( + "OPC-UA ConditionRefresh rejected; using read-based active-condition replay fallback (issue #389)."); condition_refresh_warned_ = true; } read_fallback_replay(); @@ -391,30 +391,57 @@ void OpcuaPoller::replay_active_conditions() { } bool OpcuaPoller::try_condition_refresh() { - // Server object NodeId (i=2253) hosts the ConditionRefresh method - // (i=3875 - per Part 9 §5.5.7). Servers that reject it (BadMethodInvalid in - // open62541, BadNodeIdUnknown on Siemens S7-1500, etc.) return an error; - // the caller then decides whether to fall back to the read-based replay. - static constexpr uint32_t kServerObjectId = 2253; + // Per Part 9 §5.5.7 ConditionRefresh is a Method of the ConditionType, so the + // Call's ObjectId MUST be the well-known ConditionType node (i=2782), NOT the + // Server object (i=2253). Calling it on the Server object is the root cause of + // BadNodeIdUnknown / BadMethodInvalid rejections on conformant servers + // (including Siemens S7-1500, which documents ConditionRefresh as supported - + // only ConditionRefresh2 is unsupported). The method itself is i=3875 + // (ConditionType.ConditionRefresh) and takes the SubscriptionId argument. + static constexpr uint32_t kConditionTypeId = 2782; static constexpr uint32_t kConditionRefreshMethodId = 3875; std::vector args; args.push_back(opcua::Variant::fromScalar(static_cast(event_subscription_id_))); auto result = - client_.call_method(opcua::NodeId(0, kServerObjectId), opcua::NodeId(0, kConditionRefreshMethodId), args); + client_.call_method(opcua::NodeId(0, kConditionTypeId), opcua::NodeId(0, kConditionRefreshMethodId), args); return result.has_value(); } +OpcuaPoller::ReadReplayDisposition +OpcuaPoller::classify_read_snapshot(const OpcuaClient::ConditionStateSnapshot & snap) { + // Transient read/browse failure: present but unreliable. Keep it (so reconcile + // does not clear) but do not feed an untrustworthy state into the machine. + if (snap.state_read_failed) { + return ReadReplayDisposition::KeepOnly; + } + // EnabledState=false: a Disabled condition is never active and is never + // replayed by ConditionRefresh (Part 9 §5.7.2). Exclude it entirely so a + // stale tracked fault can reconcile away. + if (!snap.enabled_state) { + return ReadReplayDisposition::Skip; + } + // Retain mirrors ConditionRefresh's interesting-state filter (Part 9 + // §5.5.2): only Retain==true conditions are replayed. A condition that has + // gone quiet (Retain==false) is no longer of interest. + if (!snap.retain) { + return ReadReplayDisposition::Skip; + } + return ReadReplayDisposition::Feed; +} + void OpcuaPoller::read_fallback_replay() { // Browse each configured alarm source, read its conditions' current state, // and drive the same state machine as live events. Conditions that are no - // longer active are reconciled (cleared) afterwards - but only for sources - // whose scan succeeded, so a transient disconnect never falsely clears. + // longer active/interesting are reconciled (cleared) afterwards - but only + // for sources whose scan succeeded AND that positively expose Condition + // instance nodes, so neither a transient disconnect nor an EventNotifier-only + // server (S7-1500) can falsely clear live alarms (issue #478). // - // This fallback assumes each AlarmCondition instance is browseable as an - // immediate child of its alarm source. Validated against the open62541 - // reference server; NOT yet validated against a real Siemens S7-1500, whose - // condition-instance address-space layout must be confirmed before this path - // can be relied on there. + // This fallback assumes each AlarmCondition instance is browseable from its + // alarm source (hierarchical child or via HasCondition). Validated against + // the open62541 reference server; NOT yet validated against a real Siemens + // S7-1500, whose condition-instance address-space layout must be confirmed + // before this path can be relied on there (use ConditionRefresh on Siemens). std::set seen; std::set failed_sources; for (const auto & cfg : node_map_.event_alarms()) { @@ -426,18 +453,40 @@ void OpcuaPoller::read_fallback_replay() { failed_sources.insert(cfg.source_node_id_str); continue; } + if (!conditions.empty()) { + // The source exposes Condition instances as nodes -> read-fallback is + // viable here and reconcile may clear stale faults for it. + read_modeled_sources_.insert(cfg.source_node_id_str); + read_unsupported_warned_sources_.erase(cfg.source_node_id_str); + } else if (read_modeled_sources_.find(cfg.source_node_id_str) == read_modeled_sources_.end()) { + // Successful scan, zero Condition instance nodes, and this source has + // never yielded any. Almost certainly an EventNotifier-only server + // (e.g. S7-1500). Do NOT clear its tracked faults; warn once. + if (read_unsupported_warned_sources_.insert(cfg.source_node_id_str).second) { + warn_operator( + "OPC-UA read-based active-condition replay found no Condition instance nodes under source '" + + cfg.source_node_id_str + + "'. This server appears to expose alarms via EventNotifier only (e.g. Siemens S7-1500), so the " + "read fallback cannot recover the active set; tracked faults are preserved (not cleared) and live " + "transitions still flow. Prefer ConditionRefresh ('method' or 'auto'). See issue #478."); + } + } for (const auto & snap : conditions) { - if (snap.state_read_failed) { - // Condition is present but its ActiveState read failed transiently. - // Keep it in ``seen`` (so reconcile does not clear it) without feeding - // an unreliable state into the state machine. - seen.insert(snap.condition_id.toString()); + const std::string cid = snap.condition_id.toString(); + const ReadReplayDisposition disp = classify_read_snapshot(snap); + if (disp == ReadReplayDisposition::Skip) { + // Not interesting (Disabled or Retain=false): leave out of ``seen`` so + // any stale tracked fault reconciles away. continue; } - // Mark every successfully observed condition as seen, even if it matches + // Mark every interesting/observed condition as seen, even if it matches // no mapping below. Otherwise reconcile_after_read would treat a live (but // unmapped) condition as "cleared while offline" and wrongly clear it. - seen.insert(snap.condition_id.toString()); + seen.insert(cid); + if (disp == ReadReplayDisposition::KeepOnly) { + // Present but unreliable (transient read failure): kept, not fed. + continue; + } AlarmEventInput input; input.enabled_state = snap.enabled_state; @@ -466,12 +515,13 @@ void OpcuaPoller::read_fallback_replay() { apply_condition_state(eff, snap.condition_id, input, snap.severity, snap.message, /*event_id=*/nullptr); } } - reconcile_after_read(seen, failed_sources); + reconcile_after_read(seen, failed_sources, read_modeled_sources_); } bool OpcuaPoller::should_clear_condition(SovdAlarmStatus last_status, const std::string & condition_id, const std::string & source_id, const std::set & seen, - const std::set & failed_sources) { + const std::set & failed_sources, + const std::set & modeled_sources) { const bool was_active = (last_status == SovdAlarmStatus::Confirmed || last_status == SovdAlarmStatus::Healed); if (!was_active) { return false; @@ -481,16 +531,23 @@ bool OpcuaPoller::should_clear_condition(SovdAlarmStatus last_status, const std: if (failed_sources.find(source_id) != failed_sources.end()) { return false; } + // Safety gate (issue #478): only clear when the source is positively known to + // model Condition instances as address-space nodes. An EventNotifier-only + // server (S7-1500) yields zero condition nodes, so an empty read scan must + // never wipe its tracked faults - this is the single most important guard. + if (modeled_sources.find(source_id) == modeled_sources.end()) { + return false; + } return seen.find(condition_id) == seen.end(); } -void OpcuaPoller::reconcile_after_read(const std::set & seen, - const std::set & failed_sources) { +void OpcuaPoller::reconcile_after_read(const std::set & seen, const std::set & failed_sources, + const std::set & modeled_sources) { std::vector clears; { std::unique_lock lock(conditions_mutex_); for (auto & [cid, runtime] : conditions_) { - if (should_clear_condition(runtime.last_status, cid, runtime.source_id, seen, failed_sources)) { + if (should_clear_condition(runtime.last_status, cid, runtime.source_id, seen, failed_sources, modeled_sources)) { // Tracked as active but absent from a successful read scan -> the alarm // cleared while we were offline. Clear the SOVD fault. runtime.last_status = SovdAlarmStatus::Cleared; @@ -666,7 +723,7 @@ void OpcuaPoller::apply_condition_state(const AlarmEventConfig & cfg, const opcu it->second.latest_event_id = *event_id; } - auto outcome = AlarmStateMachine::compute(prev_status, input); + auto outcome = AlarmStateMachine::compute(prev_status, input, config_.require_confirm_for_clear); RCLCPP_DEBUG_STREAM(opcua_poller_logger(), "state machine: enabled=" << input.enabled_state << " active=" << input.active_state << " acked=" << input.acked_state diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_alarm_state_machine.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_alarm_state_machine.cpp index 918a68d40..207634c7c 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_alarm_state_machine.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_alarm_state_machine.cpp @@ -314,4 +314,46 @@ TEST(AlarmStateMachineTest, InactiveUnackedFromSuppressedReportsHealed) { EXPECT_EQ(out.action, AlarmAction::ReportHealed); } +// -- Issue #478: require_confirm_for_clear (Siemens Confirm-less clear path) -- + +TEST(AlarmStateMachineTest, AckedNotConfirmedLatchesWhenConfirmRequiredDefault) { + // Default behaviour (require_confirm_for_clear=true): an acked-but-not- + // confirmed inactive alarm stays latched (HEALED). No regression. + AlarmEventInput in; + in.enabled_state = true; + in.active_state = false; + in.acked_state = true; + in.confirmed_state = false; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Confirmed, in); // default true + EXPECT_EQ(out.next_status, SovdAlarmStatus::Healed); + EXPECT_EQ(out.action, AlarmAction::ReportHealed); +} + +TEST(AlarmStateMachineTest, AckedClearsWhenConfirmNotRequired) { + // Confirm-less server (S7-1500): with require_confirm_for_clear=false an + // acknowledged inactive alarm clears even though ConfirmedState never + // becomes true. + AlarmEventInput in; + in.enabled_state = true; + in.active_state = false; + in.acked_state = true; + in.confirmed_state = false; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Confirmed, in, /*require_confirm_for_clear=*/false); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Cleared); + EXPECT_EQ(out.action, AlarmAction::ClearFault); +} + +TEST(AlarmStateMachineTest, UnackedStillLatchesWhenConfirmNotRequired) { + // Even with the Confirm gate dropped, an UN-acknowledged inactive alarm must + // still latch (HEALED): acknowledgement is the minimum operator action. + AlarmEventInput in; + in.enabled_state = true; + in.active_state = false; + in.acked_state = false; + in.confirmed_state = false; + auto out = AlarmStateMachine::compute(SovdAlarmStatus::Confirmed, in, /*require_confirm_for_clear=*/false); + EXPECT_EQ(out.next_status, SovdAlarmStatus::Healed); + EXPECT_EQ(out.action, AlarmAction::ReportHealed); +} + } // namespace ros2_medkit_gateway diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_plugin.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_plugin.cpp index 44790af24..fac369463 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_plugin.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_plugin.cpp @@ -307,39 +307,110 @@ TEST(ConditionReplayStrategyParse, KnownValues) { EXPECT_EQ(OpcuaPoller::parse_replay_strategy("bogus"), ConditionReplayStrategy::Auto); } -// -- Issue #479: read-replay reconcile must not falsely clear faults --------- +// -- Issue #479/#478: read-replay reconcile must not falsely clear faults ----- TEST(ReconcileShouldClearCondition, ClearsActiveConditionAbsentFromSuccessfulScan) { - // An active condition that the (successful) scan did not observe is gone. - std::set seen; // condition not seen this scan - std::set failed_sources; // its source scanned fine - EXPECT_TRUE(OpcuaPoller::should_clear_condition(SovdAlarmStatus::Confirmed, "cond-1", "src-1", seen, failed_sources)); - EXPECT_TRUE(OpcuaPoller::should_clear_condition(SovdAlarmStatus::Healed, "cond-1", "src-1", seen, failed_sources)); + // An active condition that the (successful) scan did not observe is gone - + // but only when its source is positively known to model condition nodes. + std::set seen; // condition not seen this scan + std::set failed_sources; // its source scanned fine + std::set modeled_sources{"src-1"}; // source exposes condition nodes + EXPECT_TRUE(OpcuaPoller::should_clear_condition(SovdAlarmStatus::Confirmed, "cond-1", "src-1", seen, failed_sources, + modeled_sources)); + EXPECT_TRUE(OpcuaPoller::should_clear_condition(SovdAlarmStatus::Healed, "cond-1", "src-1", seen, failed_sources, + modeled_sources)); } TEST(ReconcileShouldClearCondition, KeepsConditionStillSeen) { std::set seen{"cond-1"}; std::set failed_sources; - EXPECT_FALSE( - OpcuaPoller::should_clear_condition(SovdAlarmStatus::Confirmed, "cond-1", "src-1", seen, failed_sources)); + std::set modeled_sources{"src-1"}; + EXPECT_FALSE(OpcuaPoller::should_clear_condition(SovdAlarmStatus::Confirmed, "cond-1", "src-1", seen, failed_sources, + modeled_sources)); } TEST(ReconcileShouldClearCondition, SkipsClearWhenSourceScanFailed) { - // The core false-clear guard: source scan failed (browse error / disconnect), + // The false-clear guard: source scan failed (browse error / disconnect), // so the condition's absence from ``seen`` must NOT clear it. std::set seen; // not observed... std::set failed_sources{"src-1"}; // ...because its source failed - EXPECT_FALSE( - OpcuaPoller::should_clear_condition(SovdAlarmStatus::Confirmed, "cond-1", "src-1", seen, failed_sources)); - EXPECT_FALSE(OpcuaPoller::should_clear_condition(SovdAlarmStatus::Healed, "cond-1", "src-1", seen, failed_sources)); + std::set modeled_sources{"src-1"}; + EXPECT_FALSE(OpcuaPoller::should_clear_condition(SovdAlarmStatus::Confirmed, "cond-1", "src-1", seen, failed_sources, + modeled_sources)); + EXPECT_FALSE(OpcuaPoller::should_clear_condition(SovdAlarmStatus::Healed, "cond-1", "src-1", seen, failed_sources, + modeled_sources)); +} + +// Issue #478 safety-gate: an empty scan from a source that has NEVER yielded a +// condition instance node (EventNotifier-only server, e.g. S7-1500) must NOT +// clear the still-active tracked fault. This is the single most important +// outcome of the hardening. +TEST(ReconcileShouldClearCondition, NeverClearsWhenSourceNotPositivelyModeled) { + std::set seen; // condition not seen (server has no nodes) + std::set failed_sources; // the browse itself succeeded (empty) + std::set modeled_sources; // src-1 has never yielded a condition node + EXPECT_FALSE(OpcuaPoller::should_clear_condition(SovdAlarmStatus::Confirmed, "cond-1", "src-1", seen, failed_sources, + modeled_sources)); + EXPECT_FALSE(OpcuaPoller::should_clear_condition(SovdAlarmStatus::Healed, "cond-1", "src-1", seen, failed_sources, + modeled_sources)); } TEST(ReconcileShouldClearCondition, NeverClearsInactiveCondition) { std::set seen; std::set failed_sources; - EXPECT_FALSE(OpcuaPoller::should_clear_condition(SovdAlarmStatus::Cleared, "cond-1", "src-1", seen, failed_sources)); - EXPECT_FALSE( - OpcuaPoller::should_clear_condition(SovdAlarmStatus::Suppressed, "cond-1", "src-1", seen, failed_sources)); + std::set modeled_sources{"src-1"}; + EXPECT_FALSE(OpcuaPoller::should_clear_condition(SovdAlarmStatus::Cleared, "cond-1", "src-1", seen, failed_sources, + modeled_sources)); + EXPECT_FALSE(OpcuaPoller::should_clear_condition(SovdAlarmStatus::Suppressed, "cond-1", "src-1", seen, failed_sources, + modeled_sources)); +} + +// -- Issue #478: read-scan snapshot classification (Retain / EnabledState / +// transient-keep filter that mirrors ConditionRefresh semantics). -------------- + +namespace { +OpcuaClient::ConditionStateSnapshot make_snap(bool enabled, bool active, bool retain, bool read_failed = false) { + OpcuaClient::ConditionStateSnapshot s; + s.enabled_state = enabled; + s.active_state = active; + s.retain = retain; + s.state_read_failed = read_failed; + return s; +} +} // namespace + +TEST(ReadSnapshotClassify, RetainTrueActiveIsFed) { + auto d = OpcuaPoller::classify_read_snapshot(make_snap(/*enabled=*/true, /*active=*/true, /*retain=*/true)); + EXPECT_EQ(d, OpcuaPoller::ReadReplayDisposition::Feed); +} + +TEST(ReadSnapshotClassify, RetainTrueInactiveStillInterestingIsFed) { + // ActiveState=false but Retain=true => unacked/unconfirmed, still of interest + // (ConditionRefresh would replay it). Fed so the HEALED latch is preserved. + auto d = OpcuaPoller::classify_read_snapshot(make_snap(/*enabled=*/true, /*active=*/false, /*retain=*/true)); + EXPECT_EQ(d, OpcuaPoller::ReadReplayDisposition::Feed); +} + +TEST(ReadSnapshotClassify, RetainFalseIsSkipped) { + // Retain=false => no longer interesting; not seen so a stale fault reconciles. + auto d = OpcuaPoller::classify_read_snapshot(make_snap(/*enabled=*/true, /*active=*/false, /*retain=*/false)); + EXPECT_EQ(d, OpcuaPoller::ReadReplayDisposition::Skip); +} + +TEST(ReadSnapshotClassify, DisabledIsSkippedEvenIfActiveAndRetained) { + // EnabledState=false must never be treated as active in the read path, even + // when ActiveState/Retain would otherwise mark it interesting. + auto d = OpcuaPoller::classify_read_snapshot(make_snap(/*enabled=*/false, /*active=*/true, /*retain=*/true)); + EXPECT_EQ(d, OpcuaPoller::ReadReplayDisposition::Skip); +} + +TEST(ReadSnapshotClassify, TransientReadFailureIsKeptNotFed) { + // A transient read/browse failure (flagged) keeps the condition (so reconcile + // does not clear it) but does not feed an unreliable state into the machine. + // KeepOnly takes precedence over every other field. + auto d = OpcuaPoller::classify_read_snapshot( + make_snap(/*enabled=*/false, /*active=*/false, /*retain=*/false, /*read_failed=*/true)); + EXPECT_EQ(d, OpcuaPoller::ReadReplayDisposition::KeepOnly); } } // namespace ros2_medkit_gateway From e502a1ec4acd7dc61cd90d5f7ede9be22d0a3138 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Tue, 30 Jun 2026 22:50:39 +0200 Subject: [PATCH 07/11] test(opcua): add secured A&C integration test over a real SecureChannel Negotiates an encrypted Basic256Sha256 + SignAndEncrypt channel with a username token against an encryption-enabled test_alarm_server, covering apply_security_config()'s success path: secured read, CONFIRMED alarm, and wrong-password fail-closed. CTest 77 skip unless required; new CI job. Refs #477 --- .github/workflows/opcua-plugin.yml | 86 ++++ .../ros2_medkit_opcua/CMakeLists.txt | 34 +- .../docker/test_alarm_server/Dockerfile | 8 +- .../test_alarm_server/test_alarm_server.cpp | 201 +++++++- .../test/integration/gen_test_certs.sh | 75 +++ .../integration/test_opcua_secured.test.py | 429 ++++++++++++++++++ 6 files changed, 827 insertions(+), 6 deletions(-) create mode 100755 src/ros2_medkit_plugins/ros2_medkit_opcua/test/integration/gen_test_certs.sh create mode 100755 src/ros2_medkit_plugins/ros2_medkit_opcua/test/integration/test_opcua_secured.test.py diff --git a/.github/workflows/opcua-plugin.yml b/.github/workflows/opcua-plugin.yml index 398084079..43cf4a39d 100644 --- a/.github/workflows/opcua-plugin.yml +++ b/.github/workflows/opcua-plugin.yml @@ -256,3 +256,89 @@ jobs: echo "=== ${c} logs ===" docker logs "${c}" 2>&1 | tail -80 || true done + + integration-secured: + name: Integration (Secured A&C) + # Issue #477: proves the OPC-UA client security code's + # apply_security_config() SUCCESS path - a real Basic256Sha256 + + # SignAndEncrypt SecureChannel with a username token against the + # encryption-enabled test_alarm_server fixture. Needs fault_manager_node + # built alongside the gateway, openssl for cert generation, and + # ROS2_MEDKIT_OPCUA_SECURE_REQUIRE=1 so the test runs for real instead of + # skipping with CTest code 77. + runs-on: ubuntu-latest + container: + image: ubuntu:noble + timeout-minutes: 60 + defaults: + run: + shell: bash + steps: + - name: Install Git + run: | + apt-get update + apt-get install -y git + + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Pre-install ROS 2 apt source + uses: ./.github/actions/ros-apt-source + + - name: Set up ROS 2 jazzy + uses: ros-tooling/setup-ros@v0.7 + with: + required-ros-distributions: jazzy + + - name: Install ccache + run: apt-get install -y ccache + + - name: Cache ccache + uses: actions/cache@v4 + with: + path: /root/.cache/ccache + key: ccache-opcua-secured-${{ github.sha }} + restore-keys: | + ccache-opcua-secured- + + - name: Install dependencies + run: | + apt-get update + # openssl: cert generation. libssl-dev: encryption backend for both + # the plugin's open62541pp and the fixture's static open62541. + apt-get install -y ros-jazzy-test-msgs libyaml-cpp-dev libssl-dev openssl + source /opt/ros/jazzy/setup.bash + rosdep update + rosdep install --from-paths src --ignore-src -y \ + --skip-keys='nav2_msgs ament_cmake_clang_format ament_cmake_clang_tidy' + + - name: Build gateway + OPC-UA plugin + fault_manager + env: + CCACHE_DIR: /root/.cache/ccache + CCACHE_MAXSIZE: 500M + CCACHE_SLOPPINESS: pch_defines,time_macros + run: | + source /opt/ros/jazzy/setup.bash + # fault_manager_node is a sibling package the secured test launches; + # it is not a build dependency of the plugin, so name it explicitly. + colcon build --symlink-install \ + --packages-up-to ros2_medkit_opcua ros2_medkit_fault_manager \ + --cmake-args -DCMAKE_BUILD_TYPE=Release \ + --event-handlers console_direct+ + ccache -s + + - name: Run secured A&C integration test + timeout-minutes: 15 + env: + ROS2_MEDKIT_OPCUA_SECURE_REQUIRE: '1' + run: | + source /opt/ros/jazzy/setup.bash + source install/setup.bash + colcon test --return-code-on-test-failure \ + --packages-select ros2_medkit_opcua \ + --ctest-args -R test_opcua_secured \ + --event-handlers console_direct+ + + - name: Show test results + if: always() + run: colcon test-result --verbose diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt b/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt index 54e54c463..8407a6790 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/CMakeLists.txt @@ -268,6 +268,11 @@ if(BUILD_TESTING) -DBUILD_SHARED_LIBS=OFF -DUA_ENABLE_SUBSCRIPTIONS_EVENTS=ON -DUA_ENABLE_SUBSCRIPTIONS_ALARMS_CONDITIONS=ON + # Secured A&C test (issue #477): the fixture also serves an + # encryption-only endpoint (Basic256Sha256 Sign/SignAndEncrypt) so the + # gateway's apply_security_config() success path can be exercised + # against a real SecureChannel. Requires the OpenSSL backend. + -DUA_ENABLE_ENCRYPTION=OPENSSL -DUA_NAMESPACE_ZERO=FULL -DUA_BUILD_EXAMPLES=OFF -DUA_BUILD_TOOLS=OFF @@ -291,8 +296,10 @@ if(BUILD_TESTING) add_dependencies(test_alarm_server alarm_open62541_ep) target_include_directories(test_alarm_server SYSTEM PRIVATE "${_alarm_o62_include}") + # The static open62541 above is built with UA_ENABLE_ENCRYPTION=OPENSSL, so + # the fixture links OpenSSL for the secured-endpoint code path. target_link_libraries(test_alarm_server PRIVATE - "${_alarm_o62_lib}" Threads::Threads) + "${_alarm_o62_lib}" OpenSSL::SSL OpenSSL::Crypto Threads::Threads) set_target_properties(test_alarm_server PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}") target_compile_options(test_alarm_server PRIVATE -w) @@ -319,6 +326,31 @@ if(BUILD_TESTING) LABELS "integration" SKIP_RETURN_CODE 77 TIMEOUT 60) + + # ---- Secured A&C integration test (issue #477) ------------------------ + # Drives the gateway's apply_security_config() success path against the + # encryption-enabled fixture above: negotiates a Basic256Sha256 + + # SignAndEncrypt SecureChannel with a username token, asserts a secured + # read + a CONFIRMED alarm, and that a wrong password fails closed. + # + # Skips with CTest exit 77 unless ROS2_MEDKIT_OPCUA_SECURE_REQUIRE is set + # (mirrors the asyncua smoke-test skip). It also needs gateway_node + + # fault_manager_node + openssl on PATH, so the default unit-tests job (which + # does not build fault_manager) skips cleanly while the dedicated secured CI + # job runs it for real. + install(PROGRAMS + test/integration/gen_test_certs.sh + test/integration/test_opcua_secured.test.py + DESTINATION lib/${PROJECT_NAME}) + add_test(NAME test_opcua_secured + COMMAND "${Python3_EXECUTABLE}" + "${CMAKE_CURRENT_SOURCE_DIR}/test/integration/test_opcua_secured.test.py" + "${CMAKE_BINARY_DIR}/test_alarm_server" + "${CMAKE_CURRENT_SOURCE_DIR}/test/integration/gen_test_certs.sh") + set_tests_properties(test_opcua_secured PROPERTIES + LABELS "integration" + SKIP_RETURN_CODE 77 + TIMEOUT 300) endif() ros2_medkit_relax_vendor_warnings() diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile index 775a6477f..e21c6a5ce 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/docker/test_alarm_server/Dockerfile @@ -13,7 +13,7 @@ FROM ubuntu:24.04 AS builder ENV DEBIAN_FRONTEND=noninteractive RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential cmake git python3 ca-certificates \ + build-essential cmake git python3 ca-certificates libssl-dev \ && rm -rf /var/lib/apt/lists/* # Pin matches the SHA used by the plugin's open62541pp dependency @@ -34,6 +34,7 @@ RUN cmake -S /src/open62541 -B /tmp/o62-build \ -DBUILD_SHARED_LIBS=OFF \ -DUA_ENABLE_SUBSCRIPTIONS_EVENTS=ON \ -DUA_ENABLE_SUBSCRIPTIONS_ALARMS_CONDITIONS=ON \ + -DUA_ENABLE_ENCRYPTION=OPENSSL \ -DUA_NAMESPACE_ZERO=FULL \ -DUA_BUILD_EXAMPLES=OFF \ -DUA_BUILD_TOOLS=OFF \ @@ -42,15 +43,16 @@ RUN cmake -S /src/open62541 -B /tmp/o62-build \ -DCMAKE_C_FLAGS=-w \ && cmake --build /tmp/o62-build -j"$(nproc)" --target install +# The static lib is built with the OpenSSL encryption backend, so link OpenSSL. RUN g++ -O2 -std=c++17 -w \ -I/opt/open62541/include \ /src/test_alarm_server/test_alarm_server.cpp \ /opt/open62541/lib/libopen62541.a \ - -lpthread -o /opt/test_alarm_server + -lssl -lcrypto -lpthread -o /opt/test_alarm_server FROM ubuntu:24.04 RUN apt-get update && apt-get install -y --no-install-recommends \ - libstdc++6 ca-certificates \ + libstdc++6 ca-certificates libssl3 openssl \ && rm -rf /var/lib/apt/lists/* COPY --from=builder /opt/test_alarm_server /usr/local/bin/test_alarm_server EXPOSE 4842 diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/test_alarm_server.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/test_alarm_server.cpp index 765394592..1cd65c32e 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/test_alarm_server.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/fixtures/test_alarm_server/test_alarm_server.cpp @@ -22,6 +22,7 @@ // "ERR " to stdout for the test harness to poll. See // docs in the header comment of fire(), clear(), ack(), etc. below. +#include #include #include #include @@ -31,7 +32,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -136,6 +139,23 @@ UA_StatusCode add_condition(UA_Server * server, const std::string & name, UA_UIn return rc; } +// A single readable Double variable (``ns=;s=Tank.Level`` = 42.0). The +// secured integration test maps this node and reads it back over the encrypted +// SecureChannel to prove a value read - not just an event subscription - +// traverses the Basic256Sha256 channel. +UA_StatusCode add_variable(UA_Server * server, UA_UInt16 ns) { + UA_VariableAttributes attr = UA_VariableAttributes_default; + UA_Double value = 42.0; + UA_Variant_setScalar(&attr.value, &value, &UA_TYPES[UA_TYPES_DOUBLE]); + attr.displayName = UA_LOCALIZEDTEXT(const_cast("en"), const_cast("TankLevel")); + attr.accessLevel = UA_ACCESSLEVELMASK_READ; + UA_NodeId node_id = UA_NODEID_STRING(ns, const_cast("Tank.Level")); + UA_QualifiedName qname = UA_QUALIFIEDNAME(ns, const_cast("TankLevel")); + return UA_Server_addVariableNode(server, node_id, UA_NODEID_NUMERIC(0, UA_NS0ID_OBJECTSFOLDER), + UA_NODEID_NUMERIC(0, UA_NS0ID_ORGANIZES), qname, + UA_NODEID_NUMERIC(0, UA_NS0ID_BASEDATAVARIABLETYPE), attr, nullptr, nullptr); +} + UA_StatusCode set_two_state(UA_Server * server, const UA_NodeId & cond, const char * field, UA_Boolean value) { UA_Variant v; UA_Variant_setScalar(&v, &value, &UA_TYPES[UA_TYPES_BOOLEAN]); @@ -300,6 +320,147 @@ UA_StatusCode handle_enable(UA_Server * server, const Condition & c, bool enable return set_two_state(server, c.node, "EnabledState", enable); } +#ifdef UA_ENABLE_ENCRYPTION + +// Read a whole file into a heap-allocated UA_ByteString. Returns an empty +// ByteString (length 0) on failure; the caller treats that as a fatal config +// error. open62541's OpenSSL backend auto-detects DER (0x30 0x82 magic) vs PEM +// for both certificates and keys, so the test harness can hand us a DER cert +// and a PEM key without us caring about the encoding here. +UA_ByteString load_file(const std::string & path) { + UA_ByteString out = UA_BYTESTRING_NULL; + std::ifstream f(path, std::ios::binary); + if (!f) { + return out; + } + std::string data((std::istreambuf_iterator(f)), std::istreambuf_iterator()); + out.length = data.size(); + out.data = static_cast(UA_malloc(out.length)); + if (!out.data) { + out.length = 0; + return out; + } + std::memcpy(out.data, data.data(), out.length); + return out; +} + +// Chained ActivateSession callback. open62541's UA_AccessControl_default owns +// the real user-token check; we wrap it only to emit a deterministic, greppable +// line proving which SecurityPolicy + MessageSecurityMode the SecureChannel +// actually negotiated. The test harness asserts on ``securityMode=3`` (= +// SignAndEncrypt) so the suite cannot pass against a downgraded / None channel. +using ActivateSessionFn = UA_StatusCode (*)(UA_Server *, UA_AccessControl *, const UA_EndpointDescription *, + const UA_ByteString *, const UA_NodeId *, const UA_ExtensionObject *, + void **); +ActivateSessionFn g_orig_activate_session = nullptr; + +UA_StatusCode logging_activate_session(UA_Server * server, UA_AccessControl * ac, + const UA_EndpointDescription * endpoint, const UA_ByteString * channel_cert, + const UA_NodeId * session_id, const UA_ExtensionObject * user_token, + void ** session_ctx) { + std::string policy_uri; + int mode = -1; + if (endpoint) { + policy_uri.assign(reinterpret_cast(endpoint->securityPolicyUri.data), + endpoint->securityPolicyUri.length); + mode = static_cast(endpoint->securityMode); + } + std::cout << "SECURE_SESSION securityPolicyUri=" << policy_uri << " securityMode=" << mode + << " (1=None,2=Sign,3=SignAndEncrypt)" << std::endl; + UA_StatusCode rc = g_orig_activate_session ? g_orig_activate_session(server, ac, endpoint, channel_cert, session_id, + user_token, session_ctx) + : UA_STATUSCODE_BADINTERNALERROR; + std::cout << "SECURE_SESSION activate rc=" << UA_StatusCode_name(rc) << std::endl; + return rc; +} + +// Build a secured server config: Basic256Sha256 with Sign AND SignAndEncrypt, +// the client app-instance cert in the trust list, and username/password access +// control with anonymous login DISABLED. We use +// UA_ServerConfig_setDefaultWithSecurityPolicies, which (per OPC UA discovery) +// also exposes a SecurityPolicy=None endpoint - open62541's connect-by-URL +// client opens a transient None channel purely to GetEndpoints, then opens the +// real encrypted channel for the session. The negotiated session security is +// proven by the SECURE_SESSION log line (securityMode=3), not by the absence of +// a None endpoint. Returns a UA status code; on success the caller adds the +// conditions exactly as in the insecure path. +UA_StatusCode configure_secure(UA_ServerConfig * config, UA_UInt16 port, const std::string & cert_path, + const std::string & key_path, const std::string & trust_path, + const std::string & app_uri, const std::string & username, + const std::string & password) { + UA_ByteString cert = load_file(cert_path); + UA_ByteString key = load_file(key_path); + UA_ByteString trust = load_file(trust_path); + if (cert.length == 0 || key.length == 0) { + UA_LOG_ERROR(UA_Log_Stdout, UA_LOGCATEGORY_USERLAND, "secure: failed to read cert '%s' / key '%s'", + cert_path.c_str(), key_path.c_str()); + UA_ByteString_clear(&cert); + UA_ByteString_clear(&key); + UA_ByteString_clear(&trust); + return UA_STATUSCODE_BADCONFIGURATIONERROR; + } + + const UA_ByteString * trust_list = (trust.length > 0) ? &trust : nullptr; + size_t trust_list_size = (trust.length > 0) ? 1 : 0; + UA_StatusCode rc = UA_ServerConfig_setDefaultWithSecurityPolicies(config, port, &cert, &key, trust_list, + trust_list_size, nullptr, 0, nullptr, 0); + if (rc != UA_STATUSCODE_GOOD) { + UA_ByteString_clear(&cert); + UA_ByteString_clear(&key); + UA_ByteString_clear(&trust); + return rc; + } + + // The server's applicationUri MUST equal the URI SAN in its own certificate + // (setDefaultWithSecurityPolicies defaults it to urn:open62541.server.*). + // open62541 stamps it into every endpoint at startup. + UA_String_clear(&config->applicationDescription.applicationUri); + config->applicationDescription.applicationUri = UA_STRING_ALLOC(app_uri.c_str()); + + // Replace the permissive default access control (anonymous allowed) with a + // username/password policy that rejects anonymous logins outright. + // UA_AccessControl_default clears the previous access control itself (it + // calls accessControl.clear at entry), so we must NOT clear it here first - + // a manual clear double-frees the context and segfaults on open62541 1.3. + UA_UsernamePasswordLogin login; + login.username = UA_STRING_ALLOC(username.c_str()); + login.password = UA_STRING_ALLOC(password.c_str()); + // The user-token policy advertises Basic256Sha256 so the password is + // encrypted at the token layer too (on top of the encrypted channel). A null + // URI is NOT portable: open62541 1.3 dereferences it unconditionally and + // segfaults. Non-owning UA_STRING is fine - the plugin copies it. + UA_String token_policy_uri = + UA_STRING(const_cast("http://opcfoundation.org/UA/SecurityPolicy#Basic256Sha256")); + // open62541 <= 1.3 (bundled by open62541pp v0.16.0, used by the CMake fixture + // build) takes an extra UA_CertificateVerification* (the x509 user-token + // validator) before userTokenPolicyUri; 1.4+ (the docker image pin) dropped + // it. We only use username/password, so pass null for it either way. +#if defined(UA_OPEN62541_VER_MAJOR) && UA_OPEN62541_VER_MAJOR == 1 && UA_OPEN62541_VER_MINOR < 4 + rc = UA_AccessControl_default(config, false /*allowAnonymous*/, nullptr /*verifyX509*/, &token_policy_uri, 1, &login); +#else + rc = UA_AccessControl_default(config, false /*allowAnonymous*/, &token_policy_uri, 1, &login); +#endif + UA_String_clear(&login.username); + UA_String_clear(&login.password); + if (rc != UA_STATUSCODE_GOOD) { + UA_ByteString_clear(&cert); + UA_ByteString_clear(&key); + UA_ByteString_clear(&trust); + return rc; + } + + // Install the logging wrapper around the freshly-built access control. + g_orig_activate_session = config->accessControl.activateSession; + config->accessControl.activateSession = logging_activate_session; + + UA_ByteString_clear(&cert); + UA_ByteString_clear(&key); + UA_ByteString_clear(&trust); + return rc; +} + +#endif // UA_ENABLE_ENCRYPTION + void cli_loop(UA_Server * server) { std::string line; while (g_running && std::getline(std::cin, line)) { @@ -392,17 +553,53 @@ int main(int argc, char ** argv) { signal(SIGTERM, stop_handler); UA_UInt16 port = 4842; + bool secure = false; + std::string cert_path, key_path, trust_path, username = "medkit", password = "secret"; + std::string app_uri = "urn:test:alarms:server"; for (int i = 1; i < argc; ++i) { if (std::strcmp(argv[i], "--port") == 0 && i + 1 < argc) { port = static_cast(std::atoi(argv[++i])); + } else if (std::strcmp(argv[i], "--secure") == 0) { + secure = true; + } else if (std::strcmp(argv[i], "--cert") == 0 && i + 1 < argc) { + cert_path = argv[++i]; + } else if (std::strcmp(argv[i], "--key") == 0 && i + 1 < argc) { + key_path = argv[++i]; + } else if (std::strcmp(argv[i], "--trust") == 0 && i + 1 < argc) { + trust_path = argv[++i]; + } else if (std::strcmp(argv[i], "--app-uri") == 0 && i + 1 < argc) { + app_uri = argv[++i]; + } else if (std::strcmp(argv[i], "--username") == 0 && i + 1 < argc) { + username = argv[++i]; + } else if (std::strcmp(argv[i], "--password") == 0 && i + 1 < argc) { + password = argv[++i]; } } UA_Server * server = UA_Server_new(); UA_ServerConfig * config = UA_Server_getConfig(server); - UA_ServerConfig_setMinimal(config, port, nullptr); + if (secure) { +#ifdef UA_ENABLE_ENCRYPTION + UA_StatusCode src = configure_secure(config, port, cert_path, key_path, trust_path, app_uri, username, password); + if (src != UA_STATUSCODE_GOOD) { + UA_LOG_ERROR(UA_Log_Stdout, UA_LOGCATEGORY_USERLAND, "secure config failed: %s", UA_StatusCode_name(src)); + UA_Server_delete(server); + return 1; + } +#else + UA_LOG_ERROR(UA_Log_Stdout, UA_LOGCATEGORY_USERLAND, + "--secure requested but this build has no UA_ENABLE_ENCRYPTION support"); + UA_Server_delete(server); + return 1; +#endif + } else { + UA_ServerConfig_setMinimal(config, port, nullptr); + } UA_UInt16 ns = UA_Server_addNamespace(server, NS_URI); + if (add_variable(server, ns) != UA_STATUSCODE_GOOD) { + UA_LOG_WARNING(UA_Log_Stdout, UA_LOGCATEGORY_USERLAND, "Failed to register Tank.Level variable"); + } Condition op, oh, sl; if (add_condition(server, "Overpressure", ns, op) != UA_STATUSCODE_GOOD || @@ -416,7 +613,7 @@ int main(int argc, char ** argv) { g_conditions[oh.name] = oh; g_conditions[sl.name] = sl; - std::cout << "READY port=" << port << " namespace=" << ns << std::endl; + std::cout << "READY port=" << port << " namespace=" << ns << " secure=" << (secure ? "true" : "false") << std::endl; std::thread cli(cli_loop, server); UA_StatusCode rc = UA_Server_run(server, reinterpret_cast(&g_running)); diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/integration/gen_test_certs.sh b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/integration/gen_test_certs.sh new file mode 100755 index 000000000..4cd840994 --- /dev/null +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/integration/gen_test_certs.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# Copyright 2026 mfaferek93 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Generate the throwaway OPC-UA application-instance certificates the secured +# A&C integration test (issue #477) needs. NOTHING is committed - the test +# regenerates these into a temp dir on every run. +# +# Produces, in the output directory: +# server_key.pem server_cert.der URI SAN urn:test:alarms:server +# client_key.pem client_cert.der URI SAN urn:selfpatch:medkit:opcua-client +# +# Certificates are DER (open62541 + the medkit client both auto-detect DER); +# keys are unencrypted RSA-2048 PEM. The server trust list gets the client +# cert; the gateway trust list gets the server cert. The URI SAN MUST equal the +# applicationUri each side advertises or open62541 rejects the channel with +# BadCertificateUriInvalid. +# +# Usage: gen_test_certs.sh [server-uri] [client-uri] + +set -euo pipefail + +OUT="${1:?usage: gen_test_certs.sh [server-uri] [client-uri]}" +SERVER_URI="${2:-urn:test:alarms:server}" +CLIENT_URI="${3:-urn:selfpatch:medkit:opcua-client}" + +mkdir -p "${OUT}" + +# Emit one self-signed application-instance cert (DER) + key (PEM). +# $1 basename $2 CN $3 URI SAN +gen_cert() { + local base="$1" cn="$2" uri="$3" + local cnf="${OUT}/${base}.cnf" + cat >"${cnf}" </dev/null 2>&1 + openssl x509 -in "${OUT}/${base}_cert.pem" -outform der -out "${OUT}/${base}_cert.der" +} + +gen_cert server "test-alarm-server" "${SERVER_URI}" +gen_cert client "selfpatch-medkit-opcua-client" "${CLIENT_URI}" + +echo "certs written to ${OUT}: server_cert.der client_cert.der (+ *_key.pem)" diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/integration/test_opcua_secured.test.py b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/integration/test_opcua_secured.test.py new file mode 100755 index 000000000..6254d6f59 --- /dev/null +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/integration/test_opcua_secured.test.py @@ -0,0 +1,429 @@ +#!/usr/bin/env python3 +# Copyright 2026 mfaferek93 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Secured OPC-UA Alarms & Conditions integration test (issue #477). + +This is the success-path counterpart to the unit tests around +``apply_security_config()``: instead of only exercising the parsers, it +negotiates a real encrypted SecureChannel against a live open62541 server and +drives the native A&C bridge through it. + +Topology (all on the loopback, ephemeral ports): + + test_alarm_server --secure (Basic256Sha256 Sign + SignAndEncrypt, + | username/password, anonymous DISABLED) + | encrypted SecureChannel + UserName token + gateway_node (ros2_medkit_opcua) -> fault_manager_node + +POSITIVE (good credentials): + * SOVD x-plc-status reports connected == true (channel established). + * SOVD x-plc-data returns the Tank.Level value (a real read over the + encrypted channel, not just an event subscription). + * Firing an alarm over the server's stdin surfaces a CONFIRMED fault in + /api/v1/faults (secured A&C event subscription end-to-end). + * The server logs ``SECURE_SESSION ... securityMode=3`` (SignAndEncrypt), + proving the channel is encrypted and not None. + +NEGATIVE (wrong password): + * The gateway never reaches connected == true: fail-closed. + +Skips with the CTest convention (exit 77) unless +``ROS2_MEDKIT_OPCUA_SECURE_REQUIRE`` is set, mirroring the asyncua smoke-test +skip. When the flag is set, a missing prerequisite is a hard failure so a CI +job that loses a dependency cannot silently bypass the secured check. + +Usage: test_opcua_secured.test.py +""" + +import json +import os +from pathlib import Path +import shutil +import socket +import subprocess +import sys +import tempfile +import time +import urllib.error +import urllib.request + +CTEST_SKIP = 77 +USERNAME = 'medkit' +PASSWORD = 'secret' +WRONG_PASSWORD = 'definitely-not-the-password' +APP_URI_SERVER = 'urn:test:alarms:server' +APP_URI_CLIENT = 'urn:selfpatch:medkit:opcua-client' +ROS_DOMAIN_ID = '228' + +ALARM_CODE = 'PLC_OVERPRESSURE' + + +def require_flag(): + """True when the suite must run for real (CI) instead of skipping.""" + return os.environ.get('ROS2_MEDKIT_OPCUA_SECURE_REQUIRE', '0') not in ('', '0') + + +def skip_or_fail(reason): + """Return the CTest skip code, or fail hard when the run is required.""" + if require_flag(): + print(f'REQUIRED but prerequisite unmet: {reason}', file=sys.stderr) + return 1 + print(f'SKIP (set ROS2_MEDKIT_OPCUA_SECURE_REQUIRE to force): {reason}') + return CTEST_SKIP + + +def free_port(): + """Grab an OS-assigned free TCP port on the loopback and release it.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(('127.0.0.1', 0)) + return s.getsockname()[1] + + +def find_plugin(): + """Locate libros2_medkit_opcua_plugin.so under AMENT_PREFIX_PATH.""" + for prefix in os.environ.get('AMENT_PREFIX_PATH', '').split(os.pathsep): + if not prefix: + continue + for root, _dirs, files in os.walk(prefix): + if 'libros2_medkit_opcua_plugin.so' in files: + return os.path.join(root, 'libros2_medkit_opcua_plugin.so') + return None + + +def have_executable(package, executable): + """True when ``ros2 run `` is resolvable.""" + try: + out = subprocess.run( + ['ros2', 'pkg', 'executables', package], + capture_output=True, text=True, timeout=30, check=False, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return False + return f'{package} {executable}' in out.stdout + + +def http_json(url, timeout=2): + """GET and parse JSON, or return None on any failure.""" + try: + with urllib.request.urlopen(url, timeout=timeout) as resp: + return json.loads(resp.read().decode()) + except (urllib.error.URLError, OSError, ValueError): + return None + + +def wait_json(url, predicate, deadline=60, period=2.0): + """Poll until predicate(json) is true; return the final json or None.""" + last = None + for _ in range(int(deadline / period) + 1): + last = http_json(url) + if last is not None: + try: + if predicate(last): + return last + except (KeyError, TypeError, ValueError): + pass + time.sleep(period) + return last + + +def wait_log(path, needle, deadline=30, period=0.5): + """Poll a log file until appears; return True/False.""" + for _ in range(int(deadline / period) + 1): + try: + if needle in Path(path).read_text(errors='replace'): + return True + except OSError: + pass + time.sleep(period) + return False + + +def write_node_map(path): + """Map the readable Tank.Level node + the 3 alarm sources into one entity.""" + path.write_text( + 'area_id: plc_systems\n' + 'component_id: alarm_test_runtime\n' + 'nodes:\n' + ' - node_id: "ns=2;s=Tank.Level"\n' + ' entity_id: tank_process\n' + ' data_name: tank_level\n' + ' data_type: float\n' + 'event_alarms:\n' + ' - alarm_source: "ns=2;s=Alarms.Overpressure"\n' + ' entity_id: tank_process\n' + ' fault_code: PLC_OVERPRESSURE\n' + ' - alarm_source: "ns=2;s=Alarms.Overheat"\n' + ' entity_id: tank_process\n' + ' fault_code: PLC_OVERHEAT\n' + ' - alarm_source: "ns=2;s=Alarms.SensorLost"\n' + ' entity_id: tank_process\n' + ' fault_code: PLC_SENSOR_LOST\n' + ) + + +def write_gateway_params(path, *, port, plugin, server_port, node_map, manifest, + certs, password): + """Render a gateway params file pointing the opcua plugin at the secure server.""" + path.write_text( + 'ros2_medkit_gateway:\n' + ' ros__parameters:\n' + ' server:\n' + ' host: "127.0.0.1"\n' + f' port: {port}\n' + ' plugins: ["opcua"]\n' + f' plugins.opcua.path: "{plugin}"\n' + f' plugins.opcua.endpoint_url: "opc.tcp://127.0.0.1:{server_port}"\n' + f' plugins.opcua.node_map_path: "{node_map}"\n' + ' plugins.opcua.poll_interval_ms: 500\n' + ' plugins.opcua.security_policy: "Basic256Sha256"\n' + ' plugins.opcua.security_mode: "SignAndEncrypt"\n' + f' plugins.opcua.client_cert_path: "{certs}/client_cert.der"\n' + f' plugins.opcua.client_key_path: "{certs}/client_key.pem"\n' + f' plugins.opcua.application_uri: "{APP_URI_CLIENT}"\n' + f' plugins.opcua.trust_list_paths: ["{certs}/server_cert.der"]\n' + ' plugins.opcua.reject_untrusted: true\n' + ' plugins.opcua.user_auth_mode: "UsernamePassword"\n' + f' plugins.opcua.username: "{USERNAME}"\n' + f' plugins.opcua.password: "{password}"\n' + ' discovery.mode: "hybrid"\n' + f' discovery.manifest_path: "{manifest}"\n' + ' discovery.manifest_strict_validation: false\n' + ) + + +def start_gateway(params_file, log_path, env): + """Launch a gateway_node bound to the given params file.""" + log = open(log_path, 'w') + proc = subprocess.Popen( + ['ros2', 'run', 'ros2_medkit_gateway', 'gateway_node', + '--ros-args', '--params-file', str(params_file)], + stdout=log, stderr=subprocess.STDOUT, env=env, + ) + proc._log = log # keep handle alive for close on teardown + return proc + + +def terminate(proc): + """Best-effort SIGTERM then SIGKILL of a child process.""" + if proc is None: + return + proc.terminate() + try: + proc.wait(timeout=8) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + log = getattr(proc, '_log', None) + if log is not None: + log.close() + + +def main(): + if len(sys.argv) < 3: + print('usage: test_opcua_secured.test.py ', + file=sys.stderr) + return 2 + server_bin = Path(sys.argv[1]).resolve() + gen_certs = Path(sys.argv[2]).resolve() + + # --- Prerequisite gating (skip 77 unless required) --------------------- + if shutil.which('openssl') is None: + return skip_or_fail('openssl CLI not available') + if shutil.which('ros2') is None: + return skip_or_fail('ros2 CLI not available') + if not (server_bin.is_file() and os.access(server_bin, os.X_OK)): + return skip_or_fail(f'secure fixture missing: {server_bin}') + if not gen_certs.is_file(): + return skip_or_fail(f'cert generator missing: {gen_certs}') + plugin = find_plugin() + if plugin is None: + return skip_or_fail('libros2_medkit_opcua_plugin.so not found') + if not have_executable('ros2_medkit_fault_manager', 'fault_manager_node'): + return skip_or_fail('ros2_medkit_fault_manager/fault_manager_node not built') + if not have_executable('ros2_medkit_gateway', 'gateway_node'): + return skip_or_fail('ros2_medkit_gateway/gateway_node not built') + + workdir = Path(tempfile.mkdtemp(prefix='opcua_secured_')) + certs = workdir / 'certs' + server_log = workdir / 'server.log' + env = dict(os.environ, ROS_DOMAIN_ID=ROS_DOMAIN_ID) + + server = fault_mgr = gw_good = gw_bad = None + try: + # --- Certificates (regenerated every run, never committed) --------- + subprocess.run(['bash', str(gen_certs), str(certs)], check=True, + capture_output=True, text=True) + + node_map = workdir / 'alarm_nodes.yaml' + manifest = workdir / 'manifest.yaml' + write_node_map(node_map) + manifest.write_text('manifest_version: "1.0"\n') + + server_port = free_port() + good_port = free_port() + bad_port = free_port() + + # --- Secure server fixture (stdin pipe for the alarm CLI) ---------- + slog = open(server_log, 'w') + server = subprocess.Popen( + [str(server_bin), '--secure', '--port', str(server_port), + '--cert', str(certs / 'server_cert.der'), + '--key', str(certs / 'server_key.pem'), + '--trust', str(certs / 'client_cert.der'), + '--app-uri', APP_URI_SERVER, + '--username', USERNAME, '--password', PASSWORD], + stdin=subprocess.PIPE, stdout=slog, stderr=subprocess.STDOUT, + text=True, + ) + if not wait_log(server_log, 'READY ', deadline=20): + print('secure server did not become READY', file=sys.stderr) + print(Path(server_log).read_text(errors='replace'), file=sys.stderr) + return 1 + + # --- fault_manager_node -------------------------------------------- + fault_mgr = subprocess.Popen( + ['ros2', 'run', 'ros2_medkit_fault_manager', 'fault_manager_node'], + stdout=open(workdir / 'fault_manager.log', 'w'), + stderr=subprocess.STDOUT, env=env, + ) + # Poll for the service the opcua plugin calls. + for _ in range(40): + out = subprocess.run(['ros2', 'service', 'list'], capture_output=True, + text=True, env=env, timeout=15, check=False) + if '/fault_manager/report_fault' in out.stdout: + break + time.sleep(0.5) + + # === POSITIVE: good credentials ==================================== + good_params = workdir / 'gateway_good.yaml' + write_gateway_params(good_params, port=good_port, plugin=plugin, + server_port=server_port, node_map=node_map, + manifest=manifest, certs=certs, password=PASSWORD) + gw_good = start_gateway(good_params, workdir / 'gateway_good.log', env) + + base = f'http://127.0.0.1:{good_port}/api/v1' + status = wait_json( + f'{base}/components/alarm_test_runtime/x-plc-status', + lambda j: j.get('connected') is True, deadline=70) + if not (status and status.get('connected') is True): + print('FAIL: gateway did not connect over the secure channel', file=sys.stderr) + print('server log:\n' + Path(server_log).read_text(errors='replace'), file=sys.stderr) + print('gateway log tail:\n' + Path(workdir / 'gateway_good.log').read_text(errors='replace')[-3000:], + file=sys.stderr) + return 1 + print(' OK secured connect: x-plc-status connected == true') + + # Proof the channel is encrypted (SignAndEncrypt = securityMode 3), + # emitted by the server on session activation. Fail hard on None. + if not wait_log(server_log, 'securityMode=3', deadline=20): + print('FAIL: server never logged an encrypted (SignAndEncrypt) session', file=sys.stderr) + print(Path(server_log).read_text(errors='replace'), file=sys.stderr) + return 1 + if 'securityPolicyUri=http://opcfoundation.org/UA/SecurityPolicy#Basic256Sha256' \ + not in Path(server_log).read_text(errors='replace'): + print('FAIL: negotiated SecurityPolicy was not Basic256Sha256', file=sys.stderr) + print(Path(server_log).read_text(errors='replace'), file=sys.stderr) + return 1 + print(' OK encrypted channel: server logged Basic256Sha256 securityMode=3 (SignAndEncrypt)') + + # Secured read: Tank.Level value over the encrypted channel. + data = wait_json( + f'{base}/apps/tank_process/x-plc-data', + lambda j: any(i.get('name') == 'tank_level' and i.get('value') is not None + for i in j.get('items', [])), deadline=30) + value = None + if data: + value = next((i.get('value') for i in data.get('items', []) + if i.get('name') == 'tank_level'), None) + if value is None: + print('FAIL: secured tag read returned no value', file=sys.stderr) + print(json.dumps(data), file=sys.stderr) + return 1 + print(f' OK secured read: tank_level = {value}') + + # Secured A&C: fire an alarm over the server stdin -> CONFIRMED fault. + server.stdin.write('fire Overpressure 750\n') + server.stdin.flush() + faults = wait_json( + f'{base}/faults', + lambda j: any(i.get('fault_code') == ALARM_CODE and i.get('status') == 'CONFIRMED' + for i in j.get('items', [])), deadline=40) + if not (faults and any(i.get('fault_code') == ALARM_CODE and i.get('status') == 'CONFIRMED' + for i in faults.get('items', []))): + print('FAIL: alarm did not surface as a CONFIRMED fault', file=sys.stderr) + print(json.dumps(faults), file=sys.stderr) + print('server log:\n' + Path(server_log).read_text(errors='replace'), file=sys.stderr) + return 1 + print(' OK secured A&C: PLC_OVERPRESSURE CONFIRMED via encrypted event subscription') + + terminate(gw_good) + gw_good = None + + # === NEGATIVE: wrong password must fail closed ===================== + bad_params = workdir / 'gateway_bad.yaml' + write_gateway_params(bad_params, port=bad_port, plugin=plugin, + server_port=server_port, node_map=node_map, + manifest=manifest, certs=certs, password=WRONG_PASSWORD) + bad_log = workdir / 'gateway_bad.log' + gw_bad = start_gateway(bad_params, bad_log, env) + + bad_base = f'http://127.0.0.1:{bad_port}/api/v1' + # Fail-closed = the server rejects the wrong password and NO secured read + # ever succeeds. We assert both: (1) the client logs BadUserAccessDenied + # (the session never activates), and (2) the tag value is never served. + # We key off the secured read rather than x-plc-status.connected because + # a rejected ActivateSession leaves the underlying SecureChannel briefly + # open, which can make the coarse connected flag flap; no authenticated + # read can ever complete over it, and that is the real security property. + rejected = wait_log(bad_log, 'BadUserAccessDenied', deadline=40) + read_leaked = False + for _ in range(10): + d = http_json(f'{bad_base}/apps/tank_process/x-plc-data') + if d and any(i.get('name') == 'tank_level' and i.get('value') is not None + for i in d.get('items', [])): + read_leaked = True + break + time.sleep(2) + if not rejected or read_leaked: + print('FAIL: wrong password did NOT fail closed ' + f'(auth_rejected={rejected}, read_leaked={read_leaked})', file=sys.stderr) + print('gateway(bad) log tail:\n' + + bad_log.read_text(errors='replace')[-3000:], file=sys.stderr) + return 1 + print(' OK fail-closed: wrong password rejected (BadUserAccessDenied), no secured read served') + + print('PASS: secured OPC-UA A&C integration test') + return 0 + finally: + terminate(gw_good) + terminate(gw_bad) + terminate(fault_mgr) + if server is not None: + try: + if server.stdin: + server.stdin.close() + except OSError: + pass + terminate(server) + try: + slog.close() + except (NameError, OSError): + pass + shutil.rmtree(workdir, ignore_errors=True) + + +if __name__ == '__main__': + sys.exit(main()) From 82a5e25c2e757269452611c00b6ed56c67448c21 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Wed, 1 Jul 2026 08:17:22 +0200 Subject: [PATCH 08/11] fix(opcua): make read-replay reconcile guard airtight for transient reads Gate the modeled-source insert on at least one reliably-read condition so a transient browse error cannot flip an EventNotifier-only source into modeled and wipe live event faults. Never Skip a reliably-active condition, and flag a transient Retain read failure so a retained condition is kept, not cleared. Refs #477 #478 --- .../ros2_medkit_opcua/src/opcua_client.cpp | 24 ++++++++++--- .../ros2_medkit_opcua/src/opcua_poller.cpp | 35 ++++++++++++++----- .../test/test_opcua_plugin.cpp | 18 ++++++++++ 3 files changed, 65 insertions(+), 12 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp index ebec809fa..b2e8264f4 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp @@ -937,11 +937,27 @@ std::vector OpcuaClient::read_source_condit snap.confirmed_state = read_state_id(child, "ConfirmedState", true, nullptr, nullptr); try { - auto retain = child.browseChild({{0, "Retain"}}).readValue(); - if (retain.isType()) { - snap.retain = retain.getScalarCopy(); + auto retain_node = child.browseChild({{0, "Retain"}}); + try { + auto retain = retain_node.readValue(); + if (retain.isType()) { + snap.retain = retain.getScalarCopy(); + } + } catch (const opcua::BadStatus &) { + // Retain node resolved but its value could not be read this scan: a + // transient failure. Do NOT let retain silently default to false + // (which would Skip an inactive-but-retained condition and let + // reconcile clear a still-tracked fault). Flag the snapshot unreliable + // so it is kept (KeepOnly), not dropped (issue #478). + snap.state_read_failed = true; + } + } catch (const opcua::BadStatus & e) { + // A genuinely absent Retain path (optional on some servers) leaves + // retain=false. A transient browse failure resolving it is unreliable, + // so keep-and-flag rather than trust the default. + if (!is_path_absent_code(e.code())) { + snap.state_read_failed = true; } - } catch (const opcua::BadStatus &) { } try { auto sev = child.browseChild({{0, "Severity"}}).readValue(); diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp index d038e5a68..e1a1f3c18 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp @@ -420,9 +420,18 @@ OpcuaPoller::classify_read_snapshot(const OpcuaClient::ConditionStateSnapshot & if (!snap.enabled_state) { return ReadReplayDisposition::Skip; } + // A reliably-read ActiveState=true condition is always interesting and must + // never be Skipped on the Retain check below (issue #478). Retain is an + // optional node that can read false by default when it is unreadable this + // scan; Skipping a live, reliably-active condition on that basis would drop + // it from ``seen`` and let reconcile clear a still-active fault on a modeled + // source. ConditionRefresh always replays an active condition, so mirror it. + if (snap.active_state) { + return ReadReplayDisposition::Feed; + } // Retain mirrors ConditionRefresh's interesting-state filter (Part 9 - // §5.5.2): only Retain==true conditions are replayed. A condition that has - // gone quiet (Retain==false) is no longer of interest. + // §5.5.2): only Retain==true conditions are replayed. An inactive condition + // that has gone quiet (Retain==false) is no longer of interest. if (!snap.retain) { return ReadReplayDisposition::Skip; } @@ -453,15 +462,25 @@ void OpcuaPoller::read_fallback_replay() { failed_sources.insert(cfg.source_node_id_str); continue; } - if (!conditions.empty()) { - // The source exposes Condition instances as nodes -> read-fallback is - // viable here and reconcile may clear stale faults for it. + const bool has_reliable_condition = + std::any_of(conditions.begin(), conditions.end(), [](const OpcuaClient::ConditionStateSnapshot & s) { + return !s.state_read_failed; + }); + if (has_reliable_condition) { + // The source positively exposes at least one RELIABLY-read Condition + // instance node -> read-fallback is viable here and reconcile may clear + // stale faults for it. read_modeled_sources_.insert(cfg.source_node_id_str); read_unsupported_warned_sources_.erase(cfg.source_node_id_str); } else if (read_modeled_sources_.find(cfg.source_node_id_str) == read_modeled_sources_.end()) { - // Successful scan, zero Condition instance nodes, and this source has - // never yielded any. Almost certainly an EventNotifier-only server - // (e.g. S7-1500). Do NOT clear its tracked faults; warn once. + // Either zero Condition instance nodes, or every node this scan was a + // transient (unreliable) read - and this source has never yielded a + // reliably-read condition. Treat an all-transient scan exactly like an + // empty/unsupported scan: an EventNotifier-only server (e.g. S7-1500) + // yields zero condition nodes, and a lone transient browse error on a + // non-condition child must NOT flip the source into 'modeled' (which + // would let reconcile wipe the live event-fault set, whose ConditionIds + // are never in ``seen``). Do NOT clear its tracked faults; warn once. if (read_unsupported_warned_sources_.insert(cfg.source_node_id_str).second) { warn_operator( "OPC-UA read-based active-condition replay found no Condition instance nodes under source '" + diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_plugin.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_plugin.cpp index fac369463..131712f25 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_plugin.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/test_opcua_plugin.cpp @@ -413,4 +413,22 @@ TEST(ReadSnapshotClassify, TransientReadFailureIsKeptNotFed) { EXPECT_EQ(d, OpcuaPoller::ReadReplayDisposition::KeepOnly); } +TEST(ReadSnapshotClassify, ActiveWithUnreadableRetainIsFedNotSkipped) { + // Regression (issue #478): Retain defaults false when its optional node is + // unreadable this scan. A reliably-active condition must be Fed (so it lands + // in ``seen``), never Skipped on the Retain check - otherwise reconcile would + // clear a still-active fault on a modeled source. + auto d = OpcuaPoller::classify_read_snapshot(make_snap(/*enabled=*/true, /*active=*/true, /*retain=*/false)); + EXPECT_EQ(d, OpcuaPoller::ReadReplayDisposition::Feed); +} + +TEST(ReadSnapshotClassify, ActiveButFlaggedUnreliableIsKeptNotFed) { + // KeepOnly still wins over the active fast-path: an active condition whose + // snapshot is flagged unreliable (e.g. a transient Retain read failure) is + // preserved via ``seen`` but not fed an untrustworthy state. + auto d = OpcuaPoller::classify_read_snapshot( + make_snap(/*enabled=*/true, /*active=*/true, /*retain=*/false, /*read_failed=*/true)); + EXPECT_EQ(d, OpcuaPoller::ReadReplayDisposition::KeepOnly); +} + } // namespace ros2_medkit_gateway From cea9d1505890c9e16bb06751125573a30a8d6871 Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Wed, 1 Jul 2026 17:33:00 +0200 Subject: [PATCH 09/11] test(opcua): fix flake8 lint in secured integration test Rephrase two docstrings to imperative mood (D401) and wrap four over-length lines (E501) in test_opcua_secured.test.py. Refs #477 --- .../test/integration/test_opcua_secured.test.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/integration/test_opcua_secured.test.py b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/integration/test_opcua_secured.test.py index 6254d6f59..fa6b889c2 100755 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/test/integration/test_opcua_secured.test.py +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/test/integration/test_opcua_secured.test.py @@ -72,7 +72,7 @@ def require_flag(): - """True when the suite must run for real (CI) instead of skipping.""" + """Return True when the suite must run for real (CI) instead of skipping.""" return os.environ.get('ROS2_MEDKIT_OPCUA_SECURE_REQUIRE', '0') not in ('', '0') @@ -104,7 +104,7 @@ def find_plugin(): def have_executable(package, executable): - """True when ``ros2 run `` is resolvable.""" + """Return True when ``ros2 run `` is resolvable.""" try: out = subprocess.run( ['ros2', 'pkg', 'executables', package], @@ -321,15 +321,16 @@ def main(): if not (status and status.get('connected') is True): print('FAIL: gateway did not connect over the secure channel', file=sys.stderr) print('server log:\n' + Path(server_log).read_text(errors='replace'), file=sys.stderr) - print('gateway log tail:\n' + Path(workdir / 'gateway_good.log').read_text(errors='replace')[-3000:], - file=sys.stderr) + gw_tail = Path(workdir / 'gateway_good.log').read_text(errors='replace')[-3000:] + print('gateway log tail:\n' + gw_tail, file=sys.stderr) return 1 print(' OK secured connect: x-plc-status connected == true') # Proof the channel is encrypted (SignAndEncrypt = securityMode 3), # emitted by the server on session activation. Fail hard on None. if not wait_log(server_log, 'securityMode=3', deadline=20): - print('FAIL: server never logged an encrypted (SignAndEncrypt) session', file=sys.stderr) + print('FAIL: server never logged an encrypted (SignAndEncrypt) session', + file=sys.stderr) print(Path(server_log).read_text(errors='replace'), file=sys.stderr) return 1 if 'securityPolicyUri=http://opcfoundation.org/UA/SecurityPolicy#Basic256Sha256' \ @@ -337,7 +338,7 @@ def main(): print('FAIL: negotiated SecurityPolicy was not Basic256Sha256', file=sys.stderr) print(Path(server_log).read_text(errors='replace'), file=sys.stderr) return 1 - print(' OK encrypted channel: server logged Basic256Sha256 securityMode=3 (SignAndEncrypt)') + print(' OK encrypted channel: Basic256Sha256 securityMode=3 (SignAndEncrypt)') # Secured read: Tank.Level value over the encrypted channel. data = wait_json( @@ -403,7 +404,7 @@ def main(): print('gateway(bad) log tail:\n' + bad_log.read_text(errors='replace')[-3000:], file=sys.stderr) return 1 - print(' OK fail-closed: wrong password rejected (BadUserAccessDenied), no secured read served') + print(' OK fail-closed: wrong password rejected (BadUserAccessDenied)') print('PASS: secured OPC-UA A&C integration test') return 0 From 80e2cfe53fa4ea4cf14f6621e112a7ec5c05697d Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Wed, 1 Jul 2026 22:37:51 +0200 Subject: [PATCH 10/11] fix(opcua): harden A&C replay, security config, and fault de-dup - reconcile tracked faults on ConditionRefresh RefreshEnd so an alarm that cleared while offline no longer stays latched Confirmed - reject contradictory SecurityPolicy/MessageSecurityMode pairings and contain event-callback exceptions at the open62541 C boundary - reject duplicate (entity_id, fault_code) across event_alarms to stop clear-by-code flapping; add fault_manager test_depend - docs: warn that auth.jwt_secret is readable over DDS and via ps; fix the CORS default; relabel accept-any trust as insecure (not TOFU) Refs #477 #478 #479 #480 --- src/ros2_medkit_gateway/design/hardening.rst | 22 +++++- .../ros2_medkit_opcua/opcua_client.hpp | 12 ++- .../ros2_medkit_opcua/opcua_poller.hpp | 45 +++++++++--- .../ros2_medkit_opcua/package.xml | 5 ++ .../ros2_medkit_opcua/src/node_map.cpp | 29 +++++++- .../ros2_medkit_opcua/src/opcua_client.cpp | 39 +++++++++- .../ros2_medkit_opcua/src/opcua_poller.cpp | 56 +++++++++++++- .../ros2_medkit_opcua/test/test_node_map.cpp | 73 +++++++++++++++++++ .../test/test_opcua_client.cpp | 33 +++++++++ .../test/test_opcua_plugin.cpp | 36 +++++++++ 10 files changed, 329 insertions(+), 21 deletions(-) diff --git a/src/ros2_medkit_gateway/design/hardening.rst b/src/ros2_medkit_gateway/design/hardening.rst index c572eb05f..16311c0f3 100644 --- a/src/ros2_medkit_gateway/design/hardening.rst +++ b/src/ros2_medkit_gateway/design/hardening.rst @@ -27,7 +27,7 @@ Control Default Secure profile ``auth.enabled`` false true ``auth.require_auth_for`` write all (auth on reads + writes) ``server.tls.enabled`` false true (HTTPS, min TLS 1.3) -``cors.allowed_origins`` ``[""]`` explicit origin list (no wildcard) +``cors.allowed_origins`` ``[]`` explicit origin list (no wildcard) ``rate_limiting.enabled`` false true (global + per-client + per-endpoint) ``scripts.allow_uploads`` true false (manifest-defined scripts only) ``docs.enabled`` true false (reduced surface) @@ -48,6 +48,26 @@ Credential and certificate provisioning (HS256) or provision an RS256 key pair. Inject it at deploy time from a secret store or environment variable - do not commit it to source control. + .. warning:: + + ``auth.jwt_secret`` is a plain readable ROS 2 parameter. Beyond source + control it is exposed on two planes: + + - **DDS control plane.** Any peer on the ROS 2 graph can read it with + ``ros2 param get / auth.jwt_secret`` - the parameter is + declared readable and the DDS domain is unauthenticated by default. + - **Process table.** Passing it inline (``-p auth.jwt_secret:=...`` as in + the launch example above) also leaks the value via ``ps`` and + ``/proc//cmdline``. + + Injecting from an environment variable / params file instead of an inline + ``-p`` closes the process-table leak, but the value still lands in a + readable parameter, so it remains exposed on the DDS plane. To close that, + lock down the control plane: ROS 2 security (SROS2) with an access-control + policy that denies parameter reads to untrusted participants, or a + dedicated / firewalled ``ROS_DOMAIN_ID`` (optionally with + ``ROS_LOCALHOST_ONLY=1``) that no untrusted peer can join. + 3. **Role-scoped clients.** Create the minimum set of clients in ``auth.clients`` (``client_id:client_secret:role``). Roles, least to most privileged: ``viewer`` (read), ``operator`` (+ trigger ops / ack faults / diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp index 3b1a548a8..3c46f69c2 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_client.hpp @@ -88,7 +88,9 @@ struct OpcuaClientConfig { /// When true (default) the server certificate must chain to an entry in /// ``trust_list_paths``; an untrusted server is rejected. When false the - /// client accepts any server certificate (lab / trust-on-first-use only). + /// client accepts any server certificate (accept-any: INSECURE, lab only). + /// This is NOT trust-on-first-use: nothing is pinned, so a later cert change + /// is not detected. bool reject_untrusted{true}; // --- Session user identity --- @@ -338,6 +340,14 @@ class OpcuaClient { /// the caller is expected to warn loudly. static bool credentials_sent_in_clear(const OpcuaClientConfig & config); + /// True when SecurityPolicy and MessageSecurityMode are set inconsistently: + /// exactly one of them is None. OPC-UA (Part 4 §7.37) requires them to move + /// together - a mode of Sign/SignAndEncrypt with SecurityPolicy=None would + /// leave the policy URI unpinned (silently allowing a downgraded channel), + /// and a policy without a mode is equally malformed. A conflicting config is + /// rejected before contacting the server. + static bool security_config_conflict(const OpcuaClientConfig & config); + private: struct Impl; std::unique_ptr impl_; diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp index 81cf4dad1..fbe7544fb 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/include/ros2_medkit_opcua/opcua_poller.hpp @@ -200,6 +200,16 @@ class OpcuaPoller { const std::set & failed_sources, const std::set & modeled_sources); + /// Decide whether a tracked condition should be cleared after a completed + /// ConditionRefresh burst (issue #480). Cleared only when it was active + /// (Confirmed/Healed) and its ConditionId was NOT among the ones the server + /// replayed during the burst (``seen``). A delivered RefreshEnd is an + /// authoritative, subscription-wide replay, so - unlike the read fallback - + /// no per-source modeling gate is needed. Pure and static so it is + /// unit-testable without a server. + static bool should_clear_after_refresh(SovdAlarmStatus last_status, const std::string & condition_id, + const std::set & seen); + private: void poll_loop(); void do_poll(); @@ -233,6 +243,22 @@ class OpcuaPoller { void reconcile_after_read(const std::set & seen, const std::set & failed_sources, const std::set & modeled_sources); + /// Reconcile the tracked fault set after a completed ConditionRefresh burst + /// (issue #480). ConditionRefresh (Part 9 §5.5.7) replays only Retain=true + /// conditions between RefreshStart and RefreshEnd, so a condition that fully + /// cleared while we were offline is silently absent from the burst and would + /// otherwise stay latched. ``seen`` is the set of ConditionIds the server + /// replayed during the burst; any tracked-active condition NOT in it cleared + /// offline and is cleared here. Unlike the read fallback this needs no + /// per-source modeling gate: a delivered RefreshEnd is an authoritative, + /// subscription-wide replay (works on EventNotifier-only servers too). + void reconcile_after_refresh(const std::set & seen); + + /// Dispatch a batch of ClearFault deliveries to the event-alarm callback. + /// Shared by the read- and refresh-based reconcile paths. Must be called with + /// no poller lock held (invokes the user callback). + void dispatch_condition_clears(const std::vector & clears); + /// Emit an operator-visible warning via the configured ``log_warn`` sink, /// falling back to the poller's ROS 2 logger. void warn_operator(const std::string & msg); @@ -286,16 +312,17 @@ class OpcuaPoller { // Sources already warned about as read-fallback-unsupported (warn once each). std::set read_unsupported_warned_sources_; - // ConditionRefresh bracketing state. open62541 sends the buffered - // historical condition burst between RefreshStartEvent and - // RefreshEndEvent; we apply each event during the burst as normal but - // use this flag in tests / diagnostics. Production note: per Part 9 - // §5.5.7 the spec also requires the client to ignore ConditionRefresh - // notifications carrying ``Retain=false`` for non-current branches; the - // state machine already drops branches via BranchId, and the trampoline - // cannot distinguish refresh-burst events from live events without - // tracking the EventType, which we do explicitly below. + // ConditionRefresh bracketing state. open62541 sends the buffered historical + // condition burst between RefreshStartEvent and RefreshEndEvent; we apply each + // event during the burst as normal. On RefreshStart we clear ``refresh_seen_`` + // and accumulate every replayed ConditionId there; on RefreshEnd we reconcile + // (clear tracked-active conditions the server did NOT replay - they cleared + // offline). Per Part 9 §5.5.7 ConditionRefresh replays only Retain=true / + // current-branch conditions; the state machine already drops non-current + // branches via BranchId. Touched only on the poll thread (the burst is + // delivered via run_iterate there), same convention as read_modeled_sources_. std::atomic condition_refresh_in_progress_{false}; + std::set refresh_seen_; // Throttle the warn-level log emitted from condition_refresh() failures. // Reset to false on each fresh subscribe in setup_event_subscriptions() diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/package.xml b/src/ros2_medkit_plugins/ros2_medkit_opcua/package.xml index d4ef796e2..a91716cb0 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/package.xml +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/package.xml @@ -31,6 +31,11 @@ ament_lint_common ament_cmake_clang_format ament_cmake_gtest + + ros2_medkit_fault_manager ament_cmake diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp index ebd7a76c3..3fc90e2cd 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/node_map.cpp @@ -414,12 +414,35 @@ bool NodeMap::load(const std::string & yaml_path) { // even well-defined. { std::set> event_keys; + // A given (entity_id, fault_code) must be produced by at most ONE event + // alarm mapping. Distinct OPC-UA conditions are tracked separately by + // ConditionId at runtime and each reports, but a ClearFault is keyed on + // fault_code alone: if two conditions resolve to the same code, one + // condition clearing wipes the shared fault while the other is still + // active, producing an undebuggable flap. Reject duplicates rather than + // silently deduping them into a set (bburda review on PR #387). + auto register_event_key = [&](const std::string & entity_id, const std::string & fault_code) -> bool { + if (fault_code.empty()) { + return true; + } + if (!event_keys.emplace(entity_id, fault_code).second) { + RCLCPP_ERROR(rclcpp::get_logger("opcua.node_map"), + "Duplicate (entity_id=%s, fault_code=%s) declared by more than one event_alarms " + "source/mapping - distinct conditions must not share a fault_code (a clear on one " + "would wipe the other)", + entity_id.c_str(), fault_code.c_str()); + return false; + } + return true; + }; for (const auto & cfg : event_alarms_) { - if (!cfg.fault_code.empty()) { - event_keys.emplace(cfg.entity_id, cfg.fault_code); + if (!register_event_key(cfg.entity_id, cfg.fault_code)) { + return false; } for (const auto & m : cfg.mappings) { - event_keys.emplace(cfg.entity_id, m.fault_code); + if (!register_event_key(cfg.entity_id, m.fault_code)) { + return false; + } } } for (const auto & entry : entries_) { diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp index b2e8264f4..c11b16476 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_client.cpp @@ -313,6 +313,12 @@ bool OpcuaClient::credentials_sent_in_clear(const OpcuaClientConfig & config) { return config.user_auth_mode != UserAuthMode::Anonymous && !requires_secure_channel(config); } +bool OpcuaClient::security_config_conflict(const OpcuaClientConfig & config) { + const bool policy_none = config.security_policy == SecurityPolicy::None; + const bool mode_none = config.security_mode == SecurityMode::None; + return policy_none != mode_none; +} + namespace { #ifdef UA_ENABLE_ENCRYPTION @@ -336,6 +342,20 @@ opcua::MessageSecurityMode to_ua_security_mode(SecurityMode mode) { // false on a fatal configuration error (already logged); the caller then // reports the connect as failed without contacting the server. bool apply_security_config(opcua::Client & client, const OpcuaClientConfig & cfg) { + // Reject a contradictory SecurityPolicy / MessageSecurityMode pairing before + // building anything. Accepting e.g. security_mode=Sign with + // security_policy=None would leave the policy URI unpinned and silently + // defeat the explicit-selection guarantee below (OPC-UA Part 4 §7.37). + if (OpcuaClient::security_config_conflict(cfg)) { + const bool policy_none = cfg.security_policy == SecurityPolicy::None; + RCLCPP_ERROR(opcua_client_logger(), + "OPC-UA security config is contradictory: %s. security_policy and " + "message_security_mode must both be None or both be set. Refusing to connect.", + policy_none ? "message_security_mode is Sign/SignAndEncrypt but security_policy is None" + : "security_policy is set but message_security_mode is None"); + return false; + } + const bool secure = OpcuaClient::requires_secure_channel(cfg); if (!secure) { @@ -393,8 +413,10 @@ bool apply_security_config(opcua::Client & client, const OpcuaClientConfig & cfg } // Trust handling: by default validate the server certificate against the - // trust list. When reject_untrusted is false, accept any server - // certificate (lab / trust-on-first-use only). + // trust list. When reject_untrusted is false, accept any server certificate + // (accept-any: INSECURE, lab only). This is NOT trust-on-first-use - nothing + // is pinned, so a substituted or expired server cert is accepted on every + // connect with no chain / hostname / URI-SAN check. if (!cfg.reject_untrusted) { auto & cv = cc.handle()->certificateVerification; if (cv.clear) { @@ -1120,7 +1142,18 @@ static void on_event_trampoline_c(UA_Client * /*client*/, UA_UInt32 sub_id, void } if (ctx->callback) { - ctx->callback(user_values, source_node, event_type, condition_id); + // The callback chain (on_event -> apply_condition_state -> fault report / + // clear) must never let an exception unwind through open62541's C frames: + // they are not exception-safe (leaks / inconsistent internal state; formally + // UB) and an escape past run_iterate would call std::terminate on the poll + // thread. Contain everything at the C boundary. + try { + ctx->callback(user_values, source_node, event_type, condition_id); + } catch (const std::exception & e) { + RCLCPP_ERROR(opcua_client_logger(), "OPC-UA event callback threw, dropping notification: %s", e.what()); + } catch (...) { + RCLCPP_ERROR(opcua_client_logger(), "OPC-UA event callback threw a non-std exception, dropping notification"); + } } } diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp index e1a1f3c18..f08d9862a 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/src/opcua_poller.cpp @@ -560,6 +560,15 @@ bool OpcuaPoller::should_clear_condition(SovdAlarmStatus last_status, const std: return seen.find(condition_id) == seen.end(); } +bool OpcuaPoller::should_clear_after_refresh(SovdAlarmStatus last_status, const std::string & condition_id, + const std::set & seen) { + const bool was_active = (last_status == SovdAlarmStatus::Confirmed || last_status == SovdAlarmStatus::Healed); + if (!was_active) { + return false; + } + return seen.find(condition_id) == seen.end(); +} + void OpcuaPoller::reconcile_after_read(const std::set & seen, const std::set & failed_sources, const std::set & modeled_sources) { std::vector clears; @@ -580,6 +589,33 @@ void OpcuaPoller::reconcile_after_read(const std::set & seen, const } } } + dispatch_condition_clears(clears); +} + +void OpcuaPoller::reconcile_after_refresh(const std::set & seen) { + std::vector clears; + { + std::unique_lock lock(conditions_mutex_); + for (auto & [cid, runtime] : conditions_) { + if (!should_clear_after_refresh(runtime.last_status, cid, seen)) { + continue; + } + // Tracked as active but the completed ConditionRefresh burst did not + // replay it -> the alarm cleared while we were offline. Clear the fault. + runtime.last_status = SovdAlarmStatus::Cleared; + AlarmEventDelivery d; + d.fault_code = runtime.fault_code; + d.entity_id = runtime.entity_id; + d.next_status = SovdAlarmStatus::Cleared; + d.action = AlarmAction::ClearFault; + d.condition_id = cid; + clears.push_back(std::move(d)); + } + } + dispatch_condition_clears(clears); +} + +void OpcuaPoller::dispatch_condition_clears(const std::vector & clears) { if (clears.empty()) { return; } @@ -602,22 +638,34 @@ void OpcuaPoller::on_event(const AlarmEventConfig & cfg, const std::vector the alarm cleared offline and its latched fault must reconcile away. + std::set seen; // server replayed nothing (no active conditions) + EXPECT_TRUE(OpcuaPoller::should_clear_after_refresh(SovdAlarmStatus::Confirmed, "cond-1", seen)); + EXPECT_TRUE(OpcuaPoller::should_clear_after_refresh(SovdAlarmStatus::Healed, "cond-1", seen)); +} + +TEST(ReconcileAfterRefresh, KeepsConditionReplayedInBurst) { + // The server replayed this ConditionId during the burst => still active, keep. + std::set seen{"cond-1"}; + EXPECT_FALSE(OpcuaPoller::should_clear_after_refresh(SovdAlarmStatus::Confirmed, "cond-1", seen)); + EXPECT_FALSE(OpcuaPoller::should_clear_after_refresh(SovdAlarmStatus::Healed, "cond-1", seen)); +} + +TEST(ReconcileAfterRefresh, NeverClearsInactiveCondition) { + // A condition that was never active (Suppressed/Cleared) is not a latched + // fault and must not be touched even when absent from the burst. + std::set seen; + EXPECT_FALSE(OpcuaPoller::should_clear_after_refresh(SovdAlarmStatus::Cleared, "cond-1", seen)); + EXPECT_FALSE(OpcuaPoller::should_clear_after_refresh(SovdAlarmStatus::Suppressed, "cond-1", seen)); +} + +TEST(ReconcileAfterRefresh, KeepsOtherActiveConditionsWhenOneReplayed) { + // Peer condition (same fault_code family, different ConditionId) replayed; + // the un-replayed one still clears. Guards the distinct-ConditionId keying. + std::set seen{"cond-keep"}; + EXPECT_FALSE(OpcuaPoller::should_clear_after_refresh(SovdAlarmStatus::Confirmed, "cond-keep", seen)); + EXPECT_TRUE(OpcuaPoller::should_clear_after_refresh(SovdAlarmStatus::Confirmed, "cond-gone", seen)); +} + // -- Issue #478: read-scan snapshot classification (Retain / EnabledState / // transient-keep filter that mirrors ConditionRefresh semantics). -------------- From b44ab2cc17c9f081b206af8909f7252d7a9bcf5f Mon Sep 17 00:00:00 2001 From: mfaferek93 Date: Wed, 1 Jul 2026 22:44:59 +0200 Subject: [PATCH 11/11] docs(opcua): drop misleading TOFU label from accept-any cert example The reject_untrusted: false path uses AcceptAll, which pins nothing and re-accepts any server cert on every connect. Calling it TOFU overstated its security and contradicted the corrected code comments and the env-var table. Relabel it as INSECURE, lab only, not trust-on-first-use. --- src/ros2_medkit_plugins/ros2_medkit_opcua/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ros2_medkit_plugins/ros2_medkit_opcua/README.md b/src/ros2_medkit_plugins/ros2_medkit_opcua/README.md index 7e684c01a..8da6efd33 100644 --- a/src/ros2_medkit_plugins/ros2_medkit_opcua/README.md +++ b/src/ros2_medkit_plugins/ros2_medkit_opcua/README.md @@ -265,7 +265,7 @@ plugins.opcua.client_cert_path: "/etc/ros2_medkit/opcua/client_cert.der" # X.50 plugins.opcua.client_key_path: "/etc/ros2_medkit/opcua/client_key.pem" # private key, PEM plugins.opcua.application_uri: "urn:selfpatch:medkit:opcua-client" # MUST match the cert SAN URI plugins.opcua.trust_list_paths: ["/etc/ros2_medkit/opcua/server_cert.der"] # DER trust store -plugins.opcua.reject_untrusted: true # false = accept any server cert (lab/TOFU only) +plugins.opcua.reject_untrusted: true # false = accept ANY server cert (INSECURE, lab only; not trust-on-first-use, nothing pinned) plugins.opcua.user_auth_mode: "UsernamePassword" # Anonymous | UsernamePassword | X509 plugins.opcua.username: "medkit" plugins.opcua.password: "..." # inject from a secret store at deploy time