From f5f7677430cca5827561fd51208014414e171fb2 Mon Sep 17 00:00:00 2001 From: ehsan shariati Date: Sat, 2 May 2026 09:56:17 -0400 Subject: [PATCH 1/6] Added offline download feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit | # | Step | Repo / files (primary) | Effort | Depends on | |---|---|---|---|---| | 0 | Verify pinning chain | (op-run; done) | done | — | | 1.2 | Add `bucket_lookup_h` field + SDK header + master populate-if-missing | `fula-api/crates/fula-core/src/metadata.rs`, `fula-cli/src/handlers/object.rs`, `fula-client/src/encryption.rs:3243`, new `fula-crypto` HKDF helper | ~100 LOC, 2-3 days | Step 0 | | 2.1 | Master-down detection (health gate) | `fula-client/src/encryption.rs` GET, `fula-cli/src/client.rs:319-371` | ~150 LOC, 2-3 days | independent of 1.2 | | 2.2 | Local block cache (redb LRU) | new `fula-client/src/block_cache.rs` | ~200 LOC, 3-4 days | — | | 2.3 | Multi-gateway race + dynamic priority + CID verification | new `fula-client/src/gateway_fetch.rs` | ~300 LOC, 4-5 days | 2.2 | | 2.4 | Wire warm-device offline GET | `fula-client/src/encryption.rs` GET, glue 2.1+2.2+2.3 | ~150 LOC, 2-3 days | 2.1, 2.2, 2.3 | --- Cargo.lock | 13 + Cargo.toml | 6 + crates/fula-cli/src/handlers/mod.rs | 1 + crates/fula-cli/src/handlers/object.rs | 69 +- .../src/handlers/users_index_publisher.rs | 1450 +++++++++++++++++ crates/fula-client/Cargo.toml | 8 + crates/fula-client/src/block_cache.rs | 628 +++++++ crates/fula-client/src/client.rs | 62 +- crates/fula-client/src/config.rs | 13 + crates/fula-client/src/encryption.rs | 41 +- crates/fula-client/src/error.rs | 8 + crates/fula-client/src/gateway_fetch.rs | 1306 +++++++++++++++ crates/fula-client/src/health_gate.rs | 240 +++ crates/fula-client/src/lib.rs | 5 + crates/fula-core/src/bucket.rs | 361 ++++ crates/fula-core/src/metadata.rs | 135 +- crates/fula-flutter/src/api/error.rs | 8 + 17 files changed, 4344 insertions(+), 10 deletions(-) create mode 100644 
crates/fula-cli/src/handlers/users_index_publisher.rs create mode 100644 crates/fula-client/src/block_cache.rs create mode 100644 crates/fula-client/src/gateway_fetch.rs create mode 100644 crates/fula-client/src/health_gate.rs diff --git a/Cargo.lock b/Cargo.lock index ca50b8f..2cabddd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1799,6 +1799,7 @@ dependencies = [ "blake3", "bytes", "chrono", + "cid 0.11.1", "dashmap 6.1.0", "dirs", "fs2", @@ -1806,10 +1807,13 @@ dependencies = [ "futures", "hex", "mime_guess", + "parking_lot", "quick-xml", + "redb", "reqwest", "serde", "serde_json", + "sha2", "tempfile", "thiserror 2.0.17", "tokio", @@ -4209,6 +4213,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "redb" +version = "2.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eca1e9d98d5a7e9002d0013e18d5a9b000aee942eb134883a82f06ebffb6c01" +dependencies = [ + "libc", +] + [[package]] name = "redox_syscall" version = "0.5.18" diff --git a/Cargo.toml b/Cargo.toml index e12ca5c..e835bbc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -169,6 +169,12 @@ lru = "0.12" semver = "1.0" bitvec = "1.0" +# Embedded persistent KV (block cache, Phase 2.2 of master-independent reads). +# Pinned to 2.6.x to avoid silent file-format drift in routine cargo update. +# A 2.x bump is a deliberate decision (verify file-format compatibility before +# upgrading; cache files in production may need migration handling). 
+redb = "~2.6" + # Testing criterion = "0.5" proptest = "1.5" diff --git a/crates/fula-cli/src/handlers/mod.rs b/crates/fula-cli/src/handlers/mod.rs index 585052e..91ea72d 100644 --- a/crates/fula-cli/src/handlers/mod.rs +++ b/crates/fula-cli/src/handlers/mod.rs @@ -8,6 +8,7 @@ pub mod multipart; pub mod object; pub mod service; pub mod tagging; +pub mod users_index_publisher; pub use admin::*; pub use batch::*; diff --git a/crates/fula-cli/src/handlers/object.rs b/crates/fula-cli/src/handlers/object.rs index 19b1381..e33594f 100644 --- a/crates/fula-cli/src/handlers/object.rs +++ b/crates/fula-cli/src/handlers/object.rs @@ -130,9 +130,16 @@ pub async fn put_object( metadata = metadata.with_content_type(ct); } - // Extract user metadata (x-amz-meta-*) + // Extract user metadata (x-amz-meta-*). + // Internal Fula control headers (consumed by the handler, not stored as + // object metadata) are filtered out — they would otherwise pollute every + // object's persisted metadata. + const FULA_CONTROL_HEADERS: &[&str] = &["fula-bucket-lookup-h"]; for (name, value) in headers.iter() { if let Some(key) = name.as_str().strip_prefix("x-amz-meta-") { + if FULA_CONTROL_HEADERS.contains(&key) { + continue; + } if let Ok(v) = value.to_str() { metadata = metadata.with_user_metadata(key, v); } @@ -145,7 +152,7 @@ pub async fn put_object( tracing::error!(error = %e, key = %key, "Failed to put object"); e })?; - + tracing::debug!("Flushing bucket"); let bucket_root_cid = bucket.flush().await .map_err(|e| { @@ -153,6 +160,64 @@ pub async fn put_object( e })?; + // Phase 1.2 of master-independent reads: if the SDK included + // `x-amz-meta-fula-bucket-lookup-h` (only set on the Phase 2 manifest + // root PUT in `save_sharded_hamt_forest`), populate the bucket-level + // `bucket_lookup_h` field if currently None. Idempotent — never + // overwrites. Gated by env so we can stage the rollout: SDK always + // sends the header (cheap); master only consumes it when ready. 
+ // + // Failures are non-fatal — bad/missing headers must not break uploads. + // Placement: AFTER bucket.flush() (so the flush has already replaced + // the DashMap entry) and BEFORE persist_registry_with_token (so the + // updated field gets serialized into the registry CBOR on this same + // request, no extra IPFS write). + let buckets_index_enabled = std::env::var("FULA_BUCKET_LOOKUP_H_ENABLED") + .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) + .unwrap_or(false); + if buckets_index_enabled { + if let Some(hex_str) = headers + .get("x-amz-meta-fula-bucket-lookup-h") + .and_then(|v| v.to_str().ok()) + { + match hex::decode(hex_str) { + Ok(bytes) if bytes.len() == 16 => { + let mut lookup_h = [0u8; 16]; + lookup_h.copy_from_slice(&bytes); + match state.bucket_manager.populate_lookup_h_if_missing( + &session.hashed_user_id, + &bucket_name, + lookup_h, + ) { + Ok(true) => tracing::debug!( + bucket = %bucket_name, + "Populated bucket_lookup_h (Phase 1.2)" + ), + Ok(false) => { /* already set; idempotent skip */ } + // BucketNotFound on a successful PUT to a real bucket + // is an internal-consistency violation — promote to + // error level so operators notice the signal. + Err(e) => tracing::error!( + error = %e, + bucket = %bucket_name, + user = %session.hashed_user_id, + "populate_lookup_h_if_missing failed on a bucket that just accepted a PUT" + ), + } + } + Ok(other) => tracing::warn!( + actual_len = other.len(), + "x-amz-meta-fula-bucket-lookup-h: expected 16-byte hex (32 chars), got {} bytes", + other.len() + ), + Err(e) => tracing::warn!( + error = %e, + "Failed to hex-decode x-amz-meta-fula-bucket-lookup-h" + ), + } + } + } + // Persist the bucket registry so the new root CID survives restarts. // This MUST succeed — otherwise the new tree root is lost on restart. // Use the user's JWT for pinning service authentication. 
diff --git a/crates/fula-cli/src/handlers/users_index_publisher.rs b/crates/fula-cli/src/handlers/users_index_publisher.rs new file mode 100644 index 0000000..448cd56 --- /dev/null +++ b/crates/fula-cli/src/handlers/users_index_publisher.rs @@ -0,0 +1,1450 @@ +//! Phase 3.2 master-side users-index publisher. +//! +//! Builds a global users-index CBOR mapping every active user's +//! `userKey` (= `hashed_user_id`) to that user's per-user +//! `bucketsIndex` CID, pins it via the existing pinning chain +//! (cluster), and publishes the new CID via IPNS for SDK clients to +//! resolve during master-down cold-starts. +//! +//! This module owns three responsibilities: +//! +//! 1. **State persistence** (this file, A1) — a tiny 3-line text file +//! that survives master restarts: `(latest_global_cid, sequence, +//! updated_at_unix)`. Crash safety mirrored from +//! `BucketManager::persist_registry_internal` (atomic write + +//! `.bak` backup). Sequence is monotonic; it only increments. +//! +//! 2. **Tick logic** (A2 — coming next) — snapshot +//! `BucketManager.buckets`, build per-user bucketsIndex CBORs +//! only for users whose state changed since the last tick (diff +//! cache), build the global users-index CBOR, pin both via cluster. +//! +//! 3. **IPNS publish + internal endpoints** (A3 — after A2) — call +//! kubo `/api/v0/name/publish`; expose `GET /_internal/users-index-state` +//! for the daily chain cron in `mainnet-reward-server`. +//! +//! Background-task lifecycle mirrors `handlers::locks::start_sweeper`: +//! one `tokio::spawn` from `server::run_server` after `AppState` is +//! wrapped in `Arc`. The task lives for the process lifetime. 
+ +#![allow(dead_code)] // A3 will consume `internal_token` + +use anyhow::Result as AnyResult; +use cid::Cid; +use fula_blockstore::{BlockStore, PinStore}; +use fula_core::{metadata::BucketMetadata, BucketManager}; +use parking_lot::{Mutex, RwLock}; +use serde::{Deserialize, Serialize}; +use std::collections::{BTreeMap, HashMap}; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +/// State that persists across master restarts. Single source of truth +/// for "what did we last successfully publish?". Written **after** a +/// successful pin + IPNS publish. Read on startup. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct PersistedState { + /// CID of the most recently pinned global users-index CBOR. + /// `None` = nothing has been published yet (fresh master). + pub global_cid: Option, + /// Monotonic sequence number embedded in the most recent global + /// users-index CBOR's payload. Always increments. SDK clients + /// reject responses with a regression as a replay defense. + pub sequence: u64, + /// Wall-clock seconds-since-epoch when the most recent publish + /// committed. Used for diagnostics and for the + /// `/_internal/users-index-state` HTTP response. + pub updated_at_unix: u64, +} + +impl Default for PersistedState { + fn default() -> Self { + Self { + global_cid: None, + sequence: 0, + updated_at_unix: 0, + } + } +} + +impl PersistedState { + /// Load state from `path`. Returns `Ok(default)` if the file + /// doesn't exist (fresh master). Returns an error on any other + /// I/O failure or parse problem — the caller surfaces this so + /// the operator can fix it (e.g., truncated file from a + /// half-completed write). 
+ /// + /// Format: 3 lines separated by `\n`: + /// line 1 = CID string (or empty for `None`) + /// line 2 = sequence (u64 decimal) + /// line 3 = updated_at_unix (u64 decimal); optional — older + /// two-line files parse to `updated_at_unix = 0` + pub fn load(path: &Path) -> Result { + let raw = match std::fs::read_to_string(path) { + Ok(s) => s, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + return Ok(Self::default()); + } + Err(e) => return Err(PersistError::Io(e)), + }; + Self::parse(&raw) + } + + fn parse(raw: &str) -> Result { + let mut lines = raw.lines(); + let cid_line = lines.next().unwrap_or("").trim(); + let seq_line = lines.next().unwrap_or("").trim(); + let ts_line = lines.next().unwrap_or("").trim(); + + let global_cid = if cid_line.is_empty() { + None + } else { + Some(cid_line.parse::().map_err(|e| { + PersistError::Parse(format!("invalid CID '{}': {}", cid_line, e)) + })?) + }; + + let sequence: u64 = if seq_line.is_empty() { + 0 + } else { + seq_line.parse().map_err(|e| { + PersistError::Parse(format!("invalid sequence '{}': {}", seq_line, e)) + })? + }; + + let updated_at_unix: u64 = if ts_line.is_empty() { + 0 + } else { + ts_line.parse().map_err(|e| { + PersistError::Parse(format!("invalid updated_at '{}': {}", ts_line, e)) + })? + }; + + Ok(Self { + global_cid, + sequence, + updated_at_unix, + }) + } + + fn serialize(&self) -> String { + format!( + "{}\n{}\n{}\n", + self.global_cid.map_or(String::new(), |c| c.to_string()), + self.sequence, + self.updated_at_unix + ) + } + + /// Atomically write to `path`. If `path` already exists, copy it + /// to `path.bak` first (mirrors `BucketManager::persist_registry_internal`'s + /// backup pattern). Tolerates missing parent directory by creating + /// it; tolerates missing existing file by skipping the backup. 
+ pub fn save(&self, path: &Path) -> Result<(), PersistError> { + if let Some(parent) = path.parent() { + if !parent.as_os_str().is_empty() { + std::fs::create_dir_all(parent).map_err(PersistError::Io)?; + } + } + + // Backup the previous state file before overwriting. This + // mirrors the fula-bucket-registry persistence pattern; if a + // crash interrupts the write, the operator can recover from + // the .bak. + if path.exists() { + let backup_path = with_bak_suffix(path); + // Best-effort backup; failure to back up should not block + // the main write (we'd rather lose the .bak than the + // primary). Surfaces only as a tracing log. + if let Err(e) = std::fs::copy(path, &backup_path) { + tracing::warn!( + error = %e, + backup_path = %backup_path.display(), + "users-index state-file backup failed; continuing with primary write" + ); + } + } + + // Atomic rename: write to a tmp sibling then rename onto the + // target. On most filesystems this is atomic; on Windows it + // requires the destination to be removable, which our + // backup-first step makes safe. + let tmp_path = path.with_extension("tmp"); + std::fs::write(&tmp_path, self.serialize()).map_err(PersistError::Io)?; + std::fs::rename(&tmp_path, path).map_err(PersistError::Io)?; + Ok(()) + } + + /// Compose the next state from a successful publish: + /// increment sequence, set new CID, refresh timestamp. 
+ pub fn next(&self, new_cid: Cid) -> Self { + Self { + global_cid: Some(new_cid), + sequence: self.sequence.saturating_add(1), + updated_at_unix: now_unix(), + } + } +} + +fn with_bak_suffix(path: &Path) -> PathBuf { + let mut s = path.as_os_str().to_owned(); + s.push(".bak"); + PathBuf::from(s) +} + +fn now_unix() -> u64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_secs()) + .unwrap_or(0) +} + +#[derive(Debug, thiserror::Error)] +pub enum PersistError { + #[error("io error: {0}")] + Io(#[from] std::io::Error), + + #[error("parse error: {0}")] + Parse(String), +} + +// ============================================================ +// CBOR data structures (Phase 3.2.a) +// ============================================================ + +/// Per-user `bucketsIndex` CBOR. Pinned per user; one CBOR per user +/// per snapshot if their state changed. Map keys are either: +/// - 32-hex BLAKE3-derived `bucketLookupH` (Phase 1.2 blinded form) +/// - plaintext bucket name (Phase 1.2 lazy-migration legacy form) +/// `legacy=true` distinguishes the latter so SDK cold-start can +/// dispatch correctly. +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct UserBucketsIndex { + pub v: u32, + /// `BTreeMap` for **deterministic** key ordering — same input + /// must produce byte-identical CBOR (and thus the same CID) + /// across master restarts and across hosts. dag-cbor sorts map + /// keys but using BTreeMap upstream is belt-and-suspenders. + pub buckets: BTreeMap, + pub updated_at_unix: u64, +} + +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct BucketEntry { + /// CID of the user's per-bucket forest manifest (Prolly Tree + /// root from `BucketMetadata.root_cid`). Stored as string so + /// the CBOR doesn't grow IPLD-link semantics that would change + /// the recursive-pin walk. 
+ pub manifest: String, + /// `true` ⇔ map key is plaintext `bucket_name` (Phase 1.2 hadn't + /// run for this bucket yet — i.e., user hasn't uploaded with a + /// Phase-1.2-aware client since the field was introduced). SDK + /// lookup falls through from blinded-key to legacy-name on miss. + pub legacy: bool, +} + +/// Global users-index CBOR. Master pins one per snapshot; the CID +/// is published via IPNS (every flush) and to the chain anchor +/// (every 12h). +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct GlobalUsersIndex { + pub v: u32, + /// Monotonic publisher sequence. Replay defense: SDK persists + /// `highest_seen_sequence`; rejects payloads with regression. + pub sequence: u64, + pub updated_at_unix: u64, + /// `userKey_hex` (32 hex chars = 16-byte hashed_user_id) → + /// per-user bucketsIndex CID (string). BTreeMap for determinism. + pub users: BTreeMap, +} + +// ============================================================ +// Per-user diff cache +// ============================================================ + +/// One row of the publisher's diff cache. The publisher uses +/// `content_hash` to detect "this user's bucket set changed since +/// the last tick" without re-pinning a brand-new CBOR every time. +/// +/// `content_hash` is BLAKE3 over a deterministic encoding of the +/// user's complete bucket set — see [`compute_user_content_hash`]. +/// Changing any bucket's name, root_cid, or bucket_lookup_h +/// triggers a rebuild on the next tick. +#[derive(Clone, Debug, PartialEq, Eq)] +pub(crate) struct PerUserDiffEntry { + pub content_hash: [u8; 32], + pub buckets_index_cid: Cid, +} + +/// Build a per-user `bucketsIndex` CBOR from that user's full +/// bucket list. Pure — no I/O. The caller pins the resulting CBOR +/// via `BlockStore::put_ipld` + `PinStore::pin_with_token`. 
+pub fn build_user_buckets_index( + buckets: &[BucketMetadata], + now_unix: u64, +) -> UserBucketsIndex { + let mut entries: BTreeMap = BTreeMap::new(); + for b in buckets { + let (key, legacy) = match b.bucket_lookup_h { + Some(h) => (hex::encode(h), false), + None => (b.name.clone(), true), + }; + entries.insert( + key, + BucketEntry { + manifest: b.root_cid.to_string(), + legacy, + }, + ); + } + UserBucketsIndex { + v: 2, + buckets: entries, + updated_at_unix: now_unix, + } +} + +/// Build the global users-index CBOR from a per-user CID map. +/// `entries` is `userKey_hex (32 hex) → bucketsIndexCid`. +pub fn build_global_users_index( + entries: &BTreeMap, + sequence: u64, + now_unix: u64, +) -> GlobalUsersIndex { + let users: BTreeMap = entries + .iter() + .map(|(uk, cid)| (uk.clone(), cid.to_string())) + .collect(); + GlobalUsersIndex { + v: 1, + sequence, + updated_at_unix: now_unix, + users, + } +} + +/// Compute a deterministic content hash over a user's full bucket +/// set. Used for diff-cache lookups: if this hash matches the +/// cached value, skip rebuilding+re-pinning the per-user CBOR. +/// +/// Encoding: each bucket contributes the byte-concatenation of +/// `name_bytes || 0x00 || root_cid_bytes || 0x00 || lookup_h_bytes_or_marker`. +/// Buckets are sorted by `name` first (BLAKE3 is itself +/// order-sensitive). Domain separator at the start defends against +/// cross-namespace collisions. 
+pub(crate) fn compute_user_content_hash(buckets: &[BucketMetadata]) -> [u8; 32] { + let mut sorted: Vec<&BucketMetadata> = buckets.iter().collect(); + sorted.sort_by(|a, b| a.name.cmp(&b.name)); + + let mut hasher = blake3::Hasher::new(); + hasher.update(b"fula:users-index-publisher:user-content-hash:v1"); + for b in &sorted { + hasher.update(b.name.as_bytes()); + hasher.update(&[0u8]); + hasher.update(&b.root_cid.to_bytes()); + hasher.update(&[0u8]); + match b.bucket_lookup_h { + Some(h) => { + hasher.update(b"H"); + hasher.update(&h); + } + None => { + hasher.update(b"N"); + } + } + hasher.update(&[0u8]); + } + let h = hasher.finalize(); + let mut out = [0u8; 32]; + out.copy_from_slice(h.as_bytes()); + out +} + +// ============================================================ +// Publisher configuration +// ============================================================ + +#[derive(Clone, Debug)] +pub struct PublisherConfig { + /// How often the publisher tick fires when there are changes. + /// Default 5 min — matches the user-facing latency expectation + /// for cross-device-fresh-data when using the IPNS path. + pub flush_interval: Duration, + /// Cap on the per-user pin operations the first tick fires per + /// second. The first tick after deploy has to pin every user's + /// bucketsIndex CBOR (cache is empty), so for large user sets + /// this can be tens of thousands of pin requests. Throttle to + /// avoid swamping the pinning-service. + pub first_publish_max_pins_per_sec: u32, + /// IPNS record lifetime. 36h gives a 24h margin over the 12h + /// chain-cron cadence — see plan section 3.2.b. + pub ipns_lifetime: Duration, + /// IPNS DHT cache TTL hint for resolvers. 15min keeps the SDK's + /// IPNS lookup latency low without aggressive re-fetch. + pub ipns_ttl: Duration, + /// Kubo IPNS key NAME (kubo's local label, e.g., + /// `fula-users-index`). Distinct from the IPNS NAME (libp2p + /// public-key hash) that clients use. See plan 3.2.b. 
+ pub ipns_key_name: String, + /// Path to the persisted `(global_cid, sequence, updated_at)` + /// state file. Mirrors the `registry_cid_path` pattern. + pub state_file_path: PathBuf, + /// Kubo HTTP API URL (e.g., `http://localhost:5001`). Used for + /// `/api/v0/name/publish`. + pub ipfs_api_url: String, + /// Internal-endpoint shared-secret token. Disabled (returns 503) + /// if not set. Required in production. + pub internal_token: Option, +} + +impl PublisherConfig { + pub fn default_for(state_file_path: PathBuf, ipfs_api_url: String) -> Self { + Self { + flush_interval: Duration::from_secs(300), + first_publish_max_pins_per_sec: 100, + ipns_lifetime: Duration::from_secs(36 * 3600), + ipns_ttl: Duration::from_secs(15 * 60), + ipns_key_name: "fula-users-index".to_string(), + state_file_path, + ipfs_api_url, + internal_token: None, + } + } +} + +// ============================================================ +// In-memory latest-published view (read by /_internal/users-index-state) +// ============================================================ + +/// Snapshot of the last-published state. Updated under a write lock +/// inside the publisher tick. Read by the internal HTTP endpoint +/// without blocking the publisher. +#[derive(Clone, Debug, Default)] +pub struct LatestPublished { + pub global_cid: Option, + pub sequence: u64, + pub updated_at_unix: u64, +} + +impl From<&PersistedState> for LatestPublished { + fn from(p: &PersistedState) -> Self { + Self { + global_cid: p.global_cid, + sequence: p.sequence, + updated_at_unix: p.updated_at_unix, + } + } +} + +// ============================================================ +// Publisher skeleton +// ============================================================ + +/// The publisher. Generic over the block store so tests can use +/// `MemoryBlockStore` while production uses `FlexibleBlockStore`. 
+pub struct UsersIndexPublisher { + config: PublisherConfig, + bucket_manager: Arc>, + block_store: Arc, + /// Per-user diff cache — owner_id → (content_hash, bucketsIndexCid). + /// `Mutex` (not `RwLock`) because the tick is the only writer and + /// the lock window is tiny (a HashMap insert). + diff_cache: Mutex>, + /// Mirror of the on-disk state, refreshed after every successful + /// publish. Read by the internal endpoint. + latest: RwLock, + /// Serializes `run_tick` invocations so a periodic firing and an + /// admin `publish-now` call (A3) never race the rename of the state + /// file or produce two competing `sequence` values for the same + /// underlying state. Tokio mutex (not parking_lot) because the tick + /// holds it across `await`s on the pin chain. + tick_lock: tokio::sync::Mutex<()>, +} + +/// Outcome of a single `run_tick` call. Useful for tests and for +/// observability counters. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct TickOutcome { + /// Number of distinct users whose per-user CBOR was rebuilt and + /// re-pinned this tick. Always equal to `total_users` on the + /// first tick (cache is empty). + pub changed_users: usize, + /// Total number of users in `BucketManager.buckets` at this tick. + pub total_users: usize, + /// CID of the global users-index CBOR pinned this tick. + pub global_cid: Cid, + /// Sequence number embedded in the global CBOR's payload. + pub sequence: u64, + /// `true` iff the global users-index actually changed (i.e., at + /// least one user changed OR the cache was empty). When `false` + /// the publisher could in principle skip the global rebuild — + /// but for simplicity the current implementation always rebuilds + /// the global CBOR. Field kept for future optimization. + pub global_rebuilt: bool, +} + +impl UsersIndexPublisher { + /// Construct from config + handles to the bucket manager and + /// block store. Loads existing state-file on-disk; fresh master + /// starts with `PersistedState::default()`. 
+ pub fn open( + config: PublisherConfig, + bucket_manager: Arc>, + block_store: Arc, + ) -> Result { + let persisted = PersistedState::load(&config.state_file_path)?; + let latest = LatestPublished::from(&persisted); + Ok(Self { + config, + bucket_manager, + block_store, + diff_cache: Mutex::new(HashMap::new()), + latest: RwLock::new(latest), + tick_lock: tokio::sync::Mutex::new(()), + }) + } + + /// Snapshot of the last successful publish. Cheap-clone via the + /// underlying RwLock read guard. + pub fn latest(&self) -> LatestPublished { + self.latest.read().clone() + } + + /// Read the on-disk persisted state directly (bypasses the + /// in-memory `latest` cache). Used by tests and by the startup + /// chain-cross-check (see plan 3.2.b advisor note). + pub fn read_persisted(&self) -> Result { + PersistedState::load(&self.config.state_file_path) + } + + /// Number of entries in the diff cache. Test-only accessor. + #[cfg(test)] + fn diff_cache_len(&self) -> usize { + self.diff_cache.lock().len() + } + + /// Atomically write the next state to disk and update the + /// in-memory `latest` mirror. Called by `run_tick` AFTER a + /// successful pin — the documented order is "pin → persist" + /// (IPNS publish lands in A3, between these two). A crash + /// between pin and persist leaks the orphan-pinned CBOR; + /// cluster GC reaps it; on-chain `require(newSequence > sequence)` + /// keeps sequence monotonic regardless. (Advisor note, plan 3.2.a.) + fn commit_state(&self, next: PersistedState) -> Result<(), PersistError> { + next.save(&self.config.state_file_path)?; + *self.latest.write() = LatestPublished::from(&next); + Ok(()) + } + + /// Run one publisher tick: snapshot the bucket manager, rebuild + /// per-user CBORs only for users whose `content_hash` changed + /// since the last tick, build the global users-index CBOR, pin + /// both via the `PinStore` (cluster), persist the new state. 
+ /// + /// IPNS publishing lands in A3 — this method does not call kubo's + /// `name/publish`. Tests assert the pin chain and the persisted + /// state; the IPNS step will plug in afterward without changing + /// the contract here. + /// + /// **Concurrency.** `BucketManager.buckets` is a `DashMap`; we + /// snapshot to a `Vec` in one synchronous block (no `await` while + /// the iterator is alive — that would be a shard-guard-deadlock + /// hazard). + pub async fn run_tick(&self) -> AnyResult { + // Single-tick-at-a-time. The periodic scheduler and the + // admin `publish-now` (A3) will both invoke run_tick; this + // ensures they never race the rename of the state file or + // emit two competing `sequence` values from the same + // starting state. + let _guard = self.tick_lock.lock().await; + + // 1. Snapshot every user's full bucket set. `list_buckets` + // iterates the DashMap and clones each value; drops the + // iterator before returning, so no shard guard survives + // into our subsequent `await`s. + let snapshot: Vec = self.bucket_manager.list_buckets(); + + // 2. Group by owner_id. + let mut by_user: HashMap> = HashMap::new(); + for b in snapshot { + by_user.entry(b.owner_id.clone()).or_default().push(b); + } + let total_users = by_user.len(); + let now = now_unix(); + + // 3. For each user: compute content_hash; if cache miss or + // diff, rebuild + pin per-user CBOR. 
+ let max_concurrent = self + .config + .first_publish_max_pins_per_sec + .max(1) as usize; + let to_rebuild: Vec<(String, Vec)> = { + let cache = self.diff_cache.lock(); + by_user + .iter() + .filter_map(|(owner_id, buckets)| { + let hash = compute_user_content_hash(buckets); + let unchanged = cache + .get(owner_id) + .map(|e| e.content_hash == hash) + .unwrap_or(false); + if unchanged { + None + } else { + Some((owner_id.clone(), buckets.clone())) + } + }) + .collect() + // cache guard drops here, before any `await` + }; + + // Buffer-unordered keeps at most `max_concurrent` pin ops in + // flight at any time (advisor's first-publish throttle). + let block_store = Arc::clone(&self.block_store); + let pin_results: Vec> = { + use futures::stream::{self, StreamExt}; + stream::iter(to_rebuild.into_iter().map(|(owner_id, buckets)| { + let bs = Arc::clone(&block_store); + async move { + let hash = compute_user_content_hash(&buckets); + let cbor = build_user_buckets_index(&buckets, now); + let cid = bs.put_ipld(&cbor).await?; + bs.pin(&cid, Some("fula-users-index-per-user")) + .await?; + Ok::<_, anyhow::Error>((owner_id, hash, cid)) + } + })) + .buffer_unordered(max_concurrent) + .collect() + .await + }; + + let mut changed_users = 0usize; + for r in pin_results { + let (owner_id, hash, cid) = r?; + self.diff_cache.lock().insert( + owner_id, + PerUserDiffEntry { + content_hash: hash, + buckets_index_cid: cid, + }, + ); + changed_users += 1; + } + + // Prune diff-cache rows for users who disappeared from + // `BucketManager` since the last tick (deleted account, + // user deleted all their buckets, etc.). Without this, the + // cache would grow forever AND — critically — a removed + // user would keep appearing in published globals because + // the early-return below would never fire a rebuild for a + // pure-deletion tick. We track `users_pruned` to fold + // deletions into the rebuild trigger. 
+ let users_pruned = { + let mut cache = self.diff_cache.lock(); + let before = cache.len(); + cache.retain(|owner_id, _| by_user.contains_key(owner_id)); + before - cache.len() + }; + + let prior = self.latest.read().clone(); + + // 4. Skip-if-no-change: every user's cache row matched AND + // no users were pruned AND we've already published at + // least once → tick is a no-op. Returning early avoids + // pin/unpin churn and keeps `sequence` from advancing + // for free, so the 12h chain cron sees the same + // `(cid, sequence)` and skips the on-chain publish. + // Including `users_pruned == 0` is load-bearing: a + // pure-deletion tick has `changed_users == 0` but MUST + // rebuild so the deleted user disappears from the + // published global. + if changed_users == 0 && users_pruned == 0 && prior.global_cid.is_some() { + return Ok(TickOutcome { + changed_users: 0, + total_users, + global_cid: prior.global_cid.expect("checked is_some"), + sequence: prior.sequence, + global_rebuilt: false, + }); + } + + // 5. Build the user → bucketsIndexCid map from the now-up-to-date + // cache. Iterating `by_user.keys()` ensures we include every + // user even if their cache row was already up to date. + let mut user_to_cid: BTreeMap = BTreeMap::new(); + let cache_snapshot = self.diff_cache.lock().clone(); + for owner_id in by_user.keys() { + if let Some(entry) = cache_snapshot.get(owner_id) { + user_to_cid.insert(owner_id.clone(), entry.buckets_index_cid); + } + } + + // 6. Build + pin global users-index CBOR. Sequence increments + // relative to the last persisted state; new state is committed + // only after the pin succeeds. + let next_sequence = prior.sequence.saturating_add(1); + let global = build_global_users_index(&user_to_cid, next_sequence, now); + let global_cid = self.block_store.put_ipld(&global).await?; + self.block_store + .pin(&global_cid, Some("fula-users-index-global")) + .await?; + + // 7. Best-effort unpin previous global. 
Failure is fine —
+        //    cluster GC will eventually reap it.
+        if let Some(prev) = prior.global_cid {
+            if prev != global_cid {
+                if let Err(e) = self.block_store.unpin(&prev).await {
+                    tracing::debug!(
+                        prev = %prev,
+                        error = %e,
+                        "users-index publisher: unpin previous global failed (best-effort; cluster GC will reap)"
+                    );
+                }
+            }
+        }
+
+        // 8. Persist new state. (A3 will insert IPNS publish between
+        //    pin and persist; commit_state stays last so a crash mid-
+        //    IPNS leaves us in a recoverable place.)
+        let next_state = PersistedState {
+            global_cid: Some(global_cid),
+            sequence: next_sequence,
+            updated_at_unix: now,
+        };
+        self.commit_state(next_state)?;
+
+        Ok(TickOutcome {
+            changed_users,
+            total_users,
+            global_cid,
+            sequence: next_sequence,
+            global_rebuilt: true,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use cid::multihash::Multihash;
+    use fula_blockstore::MemoryBlockStore;
+    use fula_core::metadata::Owner;
+    use tempfile::TempDir;
+
+    /// Deterministic test CID: 32 zero bytes with `seed` in byte 0,
+    /// wrapped as a blake3 (0x1e) multihash inside a dag-cbor (0x71)
+    /// CIDv1 — distinct seeds yield distinct, reproducible CIDs.
+    fn fixture_cid(seed: u8) -> Cid {
+        let mut bytes = [0u8; 32];
+        bytes[0] = seed;
+        let mh = Multihash::<64>::wrap(0x1e /* blake3 */, &bytes).unwrap();
+        Cid::new_v1(0x71 /* dag-cbor */, mh)
+    }
+
+    /// Build a synthetic `BucketMetadata` for the **pure** (no-IPFS)
+    /// builder + content-hash tests. Uses `BucketMetadata::new` so the
+    /// struct stays in sync with field additions. Real `run_tick`
+    /// integration tests use `create_bucket_for_user` instead so they
+    /// exercise the real DashMap insertion path.
+    fn bucket_meta(
+        owner_id: &str,
+        name: &str,
+        root_seed: u8,
+        lookup_h: Option<[u8; 16]>,
+    ) -> BucketMetadata {
+        let mut m = BucketMetadata::new(
+            name.to_string(),
+            owner_id.to_string(),
+            fixture_cid(root_seed),
+        );
+        m.bucket_lookup_h = lookup_h;
+        m
+    }
+
+    /// Construct a publisher backed by `MemoryBlockStore` for tests.
+    /// Returns `(publisher, store, manager)` so individual tests can
+    /// poke at the manager (insert buckets etc.) and inspect the
+    /// store (verify pins).
+ fn fixture_publisher( + path: PathBuf, + ) -> ( + UsersIndexPublisher, + Arc, + Arc>, + ) { + let store = Arc::new(MemoryBlockStore::new()); + let manager = Arc::new(BucketManager::new(Arc::clone(&store))); + let publisher = UsersIndexPublisher::open( + fixture_config(path), + Arc::clone(&manager), + Arc::clone(&store), + ) + .expect("open"); + (publisher, store, manager) + } + + // ============================================================ + // PersistedState round-trip + // ============================================================ + + #[test] + fn test_persisted_state_default_is_empty() { + let s = PersistedState::default(); + assert!(s.global_cid.is_none()); + assert_eq!(s.sequence, 0); + assert_eq!(s.updated_at_unix, 0); + } + + #[test] + fn test_load_missing_file_returns_default() { + let dir = TempDir::new().unwrap(); + let path = dir.path().join("nonexistent.state"); + let s = PersistedState::load(&path).expect("missing file is not an error"); + assert_eq!(s, PersistedState::default()); + } + + #[test] + fn test_save_then_load_roundtrip() { + let dir = TempDir::new().unwrap(); + let path = dir.path().join("state.txt"); + let cid = fixture_cid(0xab); + let s = PersistedState { + global_cid: Some(cid), + sequence: 42, + updated_at_unix: 1_700_000_000, + }; + s.save(&path).expect("save"); + let loaded = PersistedState::load(&path).expect("load"); + assert_eq!(loaded, s); + } + + #[test] + fn test_save_creates_parent_directory() { + // Mirrors `persist_registry_internal`'s parent-creation + // behavior — operators may configure a path under a missing + // directory; the publisher must not fail. 
+        let dir = TempDir::new().unwrap();
+        let nested = dir.path().join("sub").join("dir").join("state.txt");
+        let s = PersistedState::default();
+        s.save(&nested).expect("save");
+        assert!(nested.exists());
+    }
+
+    #[test]
+    fn test_save_creates_bak_on_overwrite() {
+        // Critical for crash recovery: the previous state file must
+        // be backed up to .bak before being overwritten, so a half-
+        // completed write doesn't lose the prior valid state.
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("state.txt");
+        let s1 = PersistedState {
+            global_cid: Some(fixture_cid(1)),
+            sequence: 1,
+            updated_at_unix: 100,
+        };
+        s1.save(&path).expect("save 1");
+
+        let s2 = PersistedState {
+            global_cid: Some(fixture_cid(2)),
+            sequence: 2,
+            updated_at_unix: 200,
+        };
+        s2.save(&path).expect("save 2");
+
+        // `with_bak_suffix` derives the sibling backup path
+        // (presumably `state.txt.bak` — confirm in the impl).
+        let bak = with_bak_suffix(&path);
+        assert!(bak.exists(), ".bak file must be created on overwrite");
+        let bak_loaded = PersistedState::load(&bak).expect("load bak");
+        assert_eq!(bak_loaded, s1, ".bak must hold the previous state");
+
+        let primary_loaded = PersistedState::load(&path).expect("load primary");
+        assert_eq!(primary_loaded, s2);
+    }
+
+    #[test]
+    fn test_first_save_does_not_create_bak() {
+        // No prior file → no .bak created. Avoids leaving a stray
+        // empty file on first write.
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("state.txt");
+        let s = PersistedState::default();
+        s.save(&path).expect("save");
+        let bak = with_bak_suffix(&path);
+        assert!(!bak.exists(), ".bak must NOT exist on first write");
+    }
+
+    #[test]
+    fn test_parse_two_line_legacy_format() {
+        // Forward-tolerant: an older two-line file (CID + sequence,
+        // no timestamp) must parse with `updated_at = 0`. This isn't
+        // a current production format, but the parser is permissive.
+        let cid = fixture_cid(7);
+        let raw = format!("{}\n5\n", cid);
+        let s = PersistedState::parse(&raw).expect("parse");
+        assert_eq!(s.global_cid, Some(cid));
+        assert_eq!(s.sequence, 5);
+        assert_eq!(s.updated_at_unix, 0);
+    }
+
+    #[test]
+    fn test_parse_empty_lines_are_treated_as_missing() {
+        // An empty-string CID line means "nothing published yet."
+        // An empty sequence line means seq=0. Tolerates the
+        // edge case where a pre-publish state file gets persisted.
+        let s = PersistedState::parse("\n\n\n").expect("parse");
+        assert_eq!(s, PersistedState::default());
+    }
+
+    #[test]
+    fn test_parse_corrupt_cid_returns_error() {
+        // Malformed CID text must surface as Parse, not a silent default.
+        let raw = "not-a-cid\n0\n";
+        let result = PersistedState::parse(raw);
+        assert!(matches!(result, Err(PersistError::Parse(_))));
+    }
+
+    #[test]
+    fn test_parse_corrupt_sequence_returns_error() {
+        let cid = fixture_cid(1);
+        let raw = format!("{}\nnot-a-number\n", cid);
+        let result = PersistedState::parse(&raw);
+        assert!(matches!(result, Err(PersistError::Parse(_))));
+    }
+
+    #[test]
+    fn test_next_increments_sequence() {
+        let s = PersistedState {
+            global_cid: Some(fixture_cid(1)),
+            sequence: 99,
+            updated_at_unix: 1_700_000_000,
+        };
+        let next_cid = fixture_cid(2);
+        let n = s.next(next_cid);
+        assert_eq!(n.global_cid, Some(next_cid));
+        assert_eq!(n.sequence, 100, "sequence must increment exactly once");
+        assert!(
+            n.updated_at_unix >= 1_700_000_000,
+            "timestamp must be monotonic-or-equal"
+        );
+    }
+
+    #[test]
+    fn test_next_from_default_starts_at_one() {
+        // First-ever publish: sequence transitions from 0 → 1.
+        let initial = PersistedState::default();
+        let n = initial.next(fixture_cid(0));
+        assert_eq!(n.sequence, 1);
+    }
+
+    #[test]
+    fn test_next_saturating_at_max() {
+        // Defensive: if sequence somehow reaches u64::MAX (impossible
+        // in practice but worth not panicking on), `saturating_add`
+        // keeps us from overflow.
+        let s = PersistedState {
+            global_cid: Some(fixture_cid(1)),
+            sequence: u64::MAX,
+            updated_at_unix: 0,
+        };
+        let n = s.next(fixture_cid(2));
+        assert_eq!(n.sequence, u64::MAX);
+    }
+
+    // ============================================================
+    // UsersIndexPublisher::open + commit_state
+    // ============================================================
+
+    /// Minimal publisher config for tests: state file at `state_path`,
+    /// endpoint string is a placeholder (these tests run against
+    /// `MemoryBlockStore`, so the URL is presumably never dialed —
+    /// NOTE(review): confirm).
+    fn fixture_config(state_path: PathBuf) -> PublisherConfig {
+        PublisherConfig::default_for(state_path, "http://localhost:5001".to_string())
+    }
+
+    #[test]
+    fn test_open_with_empty_state_starts_fresh() {
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("state.txt");
+        let (publisher, _store, _manager) = fixture_publisher(path);
+        let latest = publisher.latest();
+        assert!(latest.global_cid.is_none());
+        assert_eq!(latest.sequence, 0);
+    }
+
+    #[test]
+    fn test_open_with_existing_state_loads_it() {
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("state.txt");
+
+        // Write existing state, then open
+        let prior = PersistedState {
+            global_cid: Some(fixture_cid(0xaa)),
+            sequence: 17,
+            updated_at_unix: 1_700_000_000,
+        };
+        prior.save(&path).expect("seed");
+
+        let (publisher, _store, _manager) = fixture_publisher(path);
+        let latest = publisher.latest();
+        assert_eq!(latest.global_cid, Some(fixture_cid(0xaa)));
+        assert_eq!(latest.sequence, 17);
+        assert_eq!(latest.updated_at_unix, 1_700_000_000);
+    }
+
+    #[test]
+    fn test_commit_state_updates_disk_and_memory() {
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("state.txt");
+        let (publisher, _store, _manager) = fixture_publisher(path.clone());
+
+        let next = PersistedState {
+            global_cid: Some(fixture_cid(1)),
+            sequence: 1,
+            updated_at_unix: 1_700_000_001,
+        };
+        publisher.commit_state(next.clone()).expect("commit");
+
+        // In-memory `latest` reflects the commit.
+        let latest = publisher.latest();
+        assert_eq!(latest.global_cid, next.global_cid);
+        assert_eq!(latest.sequence, next.sequence);
+
+        // On-disk file matches.
+        let disk = PersistedState::load(&path).expect("reload");
+        assert_eq!(disk, next);
+    }
+
+    #[test]
+    fn test_commit_state_survives_subsequent_open() {
+        // The crash-recovery path: master commits state, then
+        // restarts. New publisher instance must see the committed
+        // state.
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("state.txt");
+
+        {
+            let (publisher, _store, _manager) = fixture_publisher(path.clone());
+            let next = PersistedState {
+                global_cid: Some(fixture_cid(0xee)),
+                sequence: 12,
+                updated_at_unix: 1_700_000_012,
+            };
+            publisher.commit_state(next).expect("commit");
+            // publisher drops here, simulating master restart
+        }
+
+        // Fresh instance against the same file must observe the commit.
+        let (publisher, _store, _manager) = fixture_publisher(path);
+        let latest = publisher.latest();
+        assert_eq!(latest.global_cid, Some(fixture_cid(0xee)));
+        assert_eq!(latest.sequence, 12);
+    }
+
+    #[test]
+    fn test_open_returns_error_on_corrupt_state_file() {
+        // Operator must be told if the state file is corrupt rather
+        // than silently starting with a default that would re-issue
+        // already-used sequence numbers.
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("state.txt");
+        // Both lines corrupt, so either parse failure path surfaces.
+        std::fs::write(&path, "not-a-cid\nnot-a-number\n").expect("seed");
+
+        let store = Arc::new(MemoryBlockStore::new());
+        let manager = Arc::new(BucketManager::new(Arc::clone(&store)));
+        let result = UsersIndexPublisher::open(fixture_config(path), manager, store);
+        assert!(matches!(result, Err(PersistError::Parse(_))));
+    }
+
+    // ============================================================
+    // Phase 3.2 A2 — pure CBOR builders + content-hash determinism
+    // ============================================================
+
+    #[test]
+    fn test_build_user_buckets_index_empty() {
+        let cbor = build_user_buckets_index(&[], 1_700_000_000);
+        assert_eq!(cbor.v, 2);
+        assert!(cbor.buckets.is_empty());
+        assert_eq!(cbor.updated_at_unix, 1_700_000_000);
+    }
+
+    #[test]
+    fn test_build_user_buckets_index_legacy_only() {
+        // Bucket with `bucket_lookup_h = None` → legacy plaintext key.
+        let buckets = vec![bucket_meta("alice", "photos", 1, None)];
+        let cbor = build_user_buckets_index(&buckets, 1_700_000_000);
+        assert_eq!(cbor.buckets.len(), 1);
+        let entry = cbor.buckets.get("photos").expect("photos under plaintext key");
+        assert!(entry.legacy, "missing lookup_h → must be legacy");
+        assert_eq!(entry.manifest, fixture_cid(1).to_string());
+    }
+
+    #[test]
+    fn test_build_user_buckets_index_blinded_only() {
+        let h = [0x42u8; 16];
+        let buckets = vec![bucket_meta("alice", "photos", 1, Some(h))];
+        let cbor = build_user_buckets_index(&buckets, 1_700_000_000);
+        assert_eq!(cbor.buckets.len(), 1);
+        let entry = cbor.buckets.get(&hex::encode(h)).expect("blinded key");
+        assert!(!entry.legacy, "lookup_h present → must NOT be legacy");
+        assert!(
+            !cbor.buckets.contains_key("photos"),
+            "blinded entry must not also leak under plaintext name"
+        );
+    }
+
+    #[test]
+    fn test_build_user_buckets_index_mixed_legacy_and_blinded() {
+        // One bucket migrated, one not.
Both appear in the CBOR
+        // under their respective key types (Phase 1.2 lazy-
+        // migration semantics).
+        let h = [0xaau8; 16];
+        let buckets = vec![
+            bucket_meta("alice", "photos", 1, Some(h)),
+            bucket_meta("alice", "tax-2024", 2, None),
+        ];
+        let cbor = build_user_buckets_index(&buckets, 1_700_000_000);
+        assert_eq!(cbor.buckets.len(), 2);
+        let blinded = cbor.buckets.get(&hex::encode(h)).expect("blinded entry");
+        assert!(!blinded.legacy);
+        let legacy = cbor
+            .buckets
+            .get("tax-2024")
+            .expect("legacy entry under plaintext name");
+        assert!(legacy.legacy);
+    }
+
+    #[test]
+    fn test_compute_user_content_hash_is_deterministic() {
+        // Same inputs in any iteration order must produce the same
+        // hash. Critical: dag-cbor maps + the diff cache both rely
+        // on this for determinism.
+        let h = [0x11u8; 16];
+        let a = vec![
+            bucket_meta("alice", "photos", 1, Some(h)),
+            bucket_meta("alice", "videos", 2, None),
+        ];
+        // Same two buckets, reversed order.
+        let b = vec![
+            bucket_meta("alice", "videos", 2, None),
+            bucket_meta("alice", "photos", 1, Some(h)),
+        ];
+        assert_eq!(compute_user_content_hash(&a), compute_user_content_hash(&b));
+    }
+
+    #[test]
+    fn test_compute_user_content_hash_differs_on_root_cid_change() {
+        // Same bucket name, different root_cid → different hash.
+        // This is what triggers a re-pin on the next tick.
+        let a = vec![bucket_meta("alice", "photos", 1, None)];
+        let b = vec![bucket_meta("alice", "photos", 2, None)];
+        assert_ne!(compute_user_content_hash(&a), compute_user_content_hash(&b));
+    }
+
+    #[test]
+    fn test_compute_user_content_hash_differs_on_lookup_h_change() {
+        // None → Some([..]) is the lazy-migration path. The
+        // content_hash MUST detect this so the publisher rebuilds
+        // the per-user CBOR (replacing legacy entry with blinded).
+        let a = vec![bucket_meta("alice", "photos", 1, None)];
+        let b = vec![bucket_meta("alice", "photos", 1, Some([0u8; 16]))];
+        assert_ne!(compute_user_content_hash(&a), compute_user_content_hash(&b));
+    }
+
+    #[test]
+    fn test_build_global_users_index_sorted_by_userkey() {
+        // BTreeMap ordering — same input produces same byte-output
+        // and same CID across master restarts/hosts.
+        // FIX(review): the `<String, Cid>` parameters were lost to
+        // angle-bracket stripping in the patch text; reconstructed
+        // from the insert calls below — confirm against the
+        // `build_global_users_index` signature.
+        let mut entries: BTreeMap<String, Cid> = BTreeMap::new();
+        entries.insert("zzz_user".to_string(), fixture_cid(1));
+        entries.insert("aaa_user".to_string(), fixture_cid(2));
+        let cbor = build_global_users_index(&entries, 5, 1_700_000_000);
+        assert_eq!(cbor.v, 1);
+        assert_eq!(cbor.sequence, 5);
+        // First key in the BTreeMap iteration is the lex-smallest.
+        let first = cbor.users.keys().next().expect("nonempty");
+        assert_eq!(first, "aaa_user");
+    }
+
+    // ============================================================
+    // Phase 3.2 A2 — run_tick orchestration tests
+    // ============================================================
+    //
+    // run_tick tests use the real `create_bucket_for_user` /
+    // `delete_bucket_for_user` / `populate_lookup_h_if_missing` API
+    // to seed `BucketManager` — no private-field reach-in. Root CIDs
+    // are whatever the freshly-built forest produces; tests assert
+    // *behavior* (sequence advance, pin/unpin, diff-cache state),
+    // not exact CID values.
+
+    /// Seed one bucket for `user_id` through the public manager API.
+    async fn create_user_bucket(
+        // FIX(review): `<MemoryBlockStore>` reconstructed (stripped in
+        // the patch text) to match `fixture_publisher`'s manager type.
+        manager: &BucketManager<MemoryBlockStore>,
+        user_id: &str,
+        bucket_name: &str,
+    ) {
+        manager
+            .create_bucket_for_user(
+                user_id,
+                bucket_name.to_string(),
+                Owner::new(user_id),
+            )
+            .await
+            .expect("create_bucket_for_user");
+    }
+
+    #[tokio::test]
+    async fn test_run_tick_first_publish_pins_global_and_per_user() {
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("state.txt");
+        let (publisher, store, manager) = fixture_publisher(path);
+
+        // Two users, three buckets total.
+        create_user_bucket(&manager, "alice", "photos").await;
+        create_user_bucket(&manager, "alice", "videos").await;
+        create_user_bucket(&manager, "bob", "docs").await;
+
+        let outcome = publisher.run_tick().await.expect("tick");
+        assert_eq!(outcome.total_users, 2);
+        assert_eq!(outcome.changed_users, 2);
+        assert!(outcome.global_rebuilt);
+        assert_eq!(outcome.sequence, 1);
+
+        // The global CBOR is pinned and retrievable.
+        assert!(store.is_pinned(&outcome.global_cid).await.unwrap());
+
+        // After the first tick, the persisted state mirrors the in-memory.
+        let persisted = publisher.read_persisted().expect("read");
+        assert_eq!(persisted.global_cid, Some(outcome.global_cid));
+        assert_eq!(persisted.sequence, 1);
+
+        // Decode the global CBOR and verify both users are present.
+        let global_cbor: GlobalUsersIndex =
+            store.get_ipld(&outcome.global_cid).await.expect("global");
+        assert_eq!(global_cbor.users.len(), 2);
+        assert!(global_cbor.users.contains_key("alice"));
+        assert!(global_cbor.users.contains_key("bob"));
+    }
+
+    #[tokio::test]
+    async fn test_run_tick_idempotent_skips_when_no_changes() {
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("state.txt");
+        let (publisher, _store, manager) = fixture_publisher(path);
+
+        create_user_bucket(&manager, "alice", "photos").await;
+        let first = publisher.run_tick().await.expect("first");
+        assert_eq!(first.sequence, 1);
+
+        // Second tick — nothing changed in the manager.
+        let second = publisher.run_tick().await.expect("second");
+        assert_eq!(second.changed_users, 0);
+        assert!(!second.global_rebuilt, "no-change tick must NOT rebuild");
+        assert_eq!(second.sequence, 1, "sequence must NOT advance on no-op");
+        assert_eq!(second.global_cid, first.global_cid);
+    }
+
+    #[tokio::test]
+    async fn test_run_tick_advances_sequence_on_real_change() {
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("state.txt");
+        let (publisher, _store, manager) = fixture_publisher(path);
+
+        create_user_bucket(&manager, "alice", "photos").await;
+        let first = publisher.run_tick().await.expect("first");
+
+        // Add a new bucket → user content_hash changes → re-pin.
+        create_user_bucket(&manager, "alice", "videos").await;
+        let second = publisher.run_tick().await.expect("second");
+
+        assert_eq!(second.changed_users, 1);
+        assert_eq!(second.sequence, 2, "sequence advances by exactly 1");
+        assert_ne!(second.global_cid, first.global_cid);
+    }
+
+    #[tokio::test]
+    async fn test_run_tick_diff_cache_prunes_deleted_users() {
+        // Pure-deletion tick: every surviving user's content_hash
+        // matches cache (changed_users == 0), but the global MUST
+        // still rebuild so the deleted user disappears from the
+        // published map. This guards against the early-return
+        // that previously fired on `changed_users == 0` alone.
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("state.txt");
+        let (publisher, store, manager) = fixture_publisher(path);
+
+        create_user_bucket(&manager, "alice", "photos").await;
+        create_user_bucket(&manager, "bob", "docs").await;
+        let first = publisher.run_tick().await.expect("first");
+        assert_eq!(publisher.diff_cache_len(), 2);
+
+        // Verify both users present in the first global.
+        let first_global: GlobalUsersIndex =
+            store.get_ipld(&first.global_cid).await.expect("first global");
+        assert!(first_global.users.contains_key("alice"));
+        assert!(first_global.users.contains_key("bob"));
+
+        // Delete bob's bucket — bob disappears from BucketManager.
+        manager
+            .delete_bucket_for_user("bob", "docs")
+            .await
+            .expect("delete");
+        let second = publisher.run_tick().await.expect("second");
+        assert_eq!(
+            publisher.diff_cache_len(),
+            1,
+            "diff cache must shrink when a user disappears"
+        );
+        assert_eq!(second.changed_users, 0, "no per-user CBOR rebuilt");
+        assert!(
+            second.global_rebuilt,
+            "pure-deletion tick MUST rebuild global"
+        );
+        assert_eq!(
+            second.sequence, 2,
+            "deletion-only tick advances sequence (chain cron must observe new state)"
+        );
+        assert_ne!(
+            second.global_cid, first.global_cid,
+            "global CID must change when membership changes"
+        );
+
+        let second_global: GlobalUsersIndex =
+            store.get_ipld(&second.global_cid).await.expect("second global");
+        assert!(second_global.users.contains_key("alice"));
+        assert!(
+            !second_global.users.contains_key("bob"),
+            "deleted user MUST disappear from published global"
+        );
+        // Idempotency: alice's content didn't change, so her per-
+        // user `bucketsIndexCid` MUST be byte-identical across the
+        // two globals. If this drifts, something in the diff-cache
+        // logic is silently re-pinning unchanged users.
+        assert_eq!(
+            first_global.users["alice"], second_global.users["alice"],
+            "unchanged user's bucketsIndex CID must be stable across deletion ticks"
+        );
+    }
+
+    #[tokio::test]
+    async fn test_run_tick_after_restart_rebuilds_with_advanced_sequence() {
+        // Crash-recovery scenario: master commits state, restarts.
+        // The new publisher's in-memory diff cache is empty, so
+        // every user looks "changed" on the first tick and the
+        // sequence advances by 1.
The per-user `bucketsIndexCid`s
+        // are deterministic CIDs over the same content, so the
+        // pin operations are idempotent — but the global CBOR
+        // embeds a fresh `sequence` + `updated_at_unix`, so its
+        // CID changes. Documented expected behavior.
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("state.txt");
+
+        let first_global_cid;
+        {
+            let (publisher, _store, manager) = fixture_publisher(path.clone());
+            create_user_bucket(&manager, "alice", "photos").await;
+            create_user_bucket(&manager, "bob", "docs").await;
+            let first = publisher.run_tick().await.expect("first tick");
+            assert_eq!(first.sequence, 1);
+            first_global_cid = first.global_cid;
+        } // publisher drops, simulating master restart
+
+        // Re-open against the same state file AND a *fresh*
+        // BucketManager. We re-create the same buckets so the
+        // post-restart manager mirrors what `load_registry` would
+        // produce in production (same owner_ids + bucket_names).
+        let (publisher, _store, manager) = fixture_publisher(path);
+        create_user_bucket(&manager, "alice", "photos").await;
+        create_user_bucket(&manager, "bob", "docs").await;
+
+        // State persisted before restart is loaded.
+        assert_eq!(publisher.latest().sequence, 1);
+        assert_eq!(publisher.latest().global_cid, Some(first_global_cid));
+
+        // First post-restart tick: cache is empty → every user
+        // gets a re-pin. Sequence advances exactly once.
+        let second = publisher.run_tick().await.expect("post-restart tick");
+        assert_eq!(second.changed_users, 2, "empty cache → all users re-pinned");
+        assert_eq!(second.total_users, 2);
+        assert_eq!(
+            second.sequence, 2,
+            "sequence advances by exactly 1 across restart"
+        );
+        assert!(second.global_rebuilt);
+    }
+
+    #[tokio::test]
+    async fn test_run_tick_legacy_to_blinded_replaces_entry() {
+        // Phase 3.2.1(d) backward-compat scenario: write under old
+        // client (no lookup_h), then again under new client (with
+        // lookup_h).
The published CBOR must contain a single
+        // blinded entry for the bucket — NOT both legacy and blinded.
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("state.txt");
+        let (publisher, store, manager) = fixture_publisher(path);
+
+        create_user_bucket(&manager, "alice", "photos").await;
+        let first = publisher.run_tick().await.expect("first");
+
+        // Simulate the upgrade: populate lookup_h via the public
+        // helper (this is what the PUT handler calls in production
+        // when a Phase-1.2-aware client uploads).
+        let h = [0x77u8; 16];
+        let changed = manager
+            .populate_lookup_h_if_missing("alice", "photos", h)
+            .expect("populate ok");
+        assert!(changed, "must transition None → Some");
+
+        let second = publisher.run_tick().await.expect("second");
+        assert_eq!(second.changed_users, 1);
+        assert_ne!(second.global_cid, first.global_cid);
+
+        // Fetch and decode the per-user CBOR via the global. There
+        // should be exactly ONE entry — keyed under the blinded
+        // hex of `h`, not under "photos".
+        let global_cbor: GlobalUsersIndex =
+            store.get_ipld(&second.global_cid).await.expect("global");
+        // Alice is the only user in this fixture, so first key == alice.
+        let alice_user_key = global_cbor
+            .users
+            .keys()
+            .next()
+            .expect("alice should be present");
+        let alice_buckets_cid: Cid = global_cbor.users[alice_user_key]
+            .parse()
+            .expect("parse cid");
+        let user_cbor: UserBucketsIndex = store
+            .get_ipld(&alice_buckets_cid)
+            .await
+            .expect("user buckets");
+        assert_eq!(
+            user_cbor.buckets.len(),
+            1,
+            "exactly one bucket — legacy must NOT coexist with blinded"
+        );
+        assert!(
+            user_cbor.buckets.contains_key(&hex::encode(h)),
+            "blinded key present"
+        );
+        assert!(
+            !user_cbor.buckets.contains_key("photos"),
+            "plaintext name must NOT appear after migration"
+        );
+        let entry = user_cbor.buckets.get(&hex::encode(h)).unwrap();
+        assert!(!entry.legacy);
+    }
+
+    // NOTE: there is intentionally no `test_run_tick_unpins_previous_global` test.
+    // `MemoryBlockStore::unpin` is a no-op (memory.rs:108-111) and `is_pinned`
+    // resolves to `has_block`, so the in-memory backend can't observe a
+    // pin/unpin distinction. The unpin call itself is exercised — code path
+    // executes — but observability requires a real `IpfsPinning` or `Cluster`
+    // backend (covered in Phase 3.6 staging-mirror verification step 8).
+    // Adding a counting `PinStore` wrapper here would be ~80 LOC of scaffolding
+    // for one assertion; not worth it.
+
+    #[tokio::test]
+    async fn test_run_tick_no_users_first_publish_emits_empty_global() {
+        // Edge case: master starts up with zero buckets. First tick
+        // still publishes (so the SDK can fetch and find an empty
+        // user map without falling back to chain).
+        let dir = TempDir::new().unwrap();
+        let path = dir.path().join("state.txt");
+        let (publisher, _store, _manager) = fixture_publisher(path);
+
+        let outcome = publisher.run_tick().await.expect("tick");
+        assert_eq!(outcome.total_users, 0);
+        assert_eq!(outcome.changed_users, 0);
+        assert!(outcome.global_rebuilt, "first publish must run even on empty");
+        assert_eq!(outcome.sequence, 1);
+    }
+}
diff --git a/crates/fula-client/Cargo.toml b/crates/fula-client/Cargo.toml
index 42cdde4..017b095 100644
--- a/crates/fula-client/Cargo.toml
+++ b/crates/fula-client/Cargo.toml
@@ -48,6 +48,14 @@ reqwest = { workspace = true }
 uuid = { workspace = true }
 fs2 = "0.4"
 dirs = "5"
+# Embedded persistent KV for the BlockCache (Phase 2.2 of master-independent reads).
+# Native-only — wasm builds skip the cache (no persistent storage there anyway).
+redb = { workspace = true }
+# `cid` is used by block_cache.rs / gateway_fetch.rs for content-addressed keys.
+cid = { workspace = true }
+# CID verification on gateway-fetched bytes (Phase 2.3 of master-independent reads).
+sha2 = { workspace = true }
+# Mutex for per-gateway state in gateway_fetch (Phase 2.3).
+parking_lot = { workspace = true }
 
 [target.'cfg(target_arch = "wasm32")'.dependencies]
 # WASM: disable default features (no tokio), enable wasm feature
diff --git a/crates/fula-client/src/block_cache.rs b/crates/fula-client/src/block_cache.rs
new file mode 100644
index 0000000..52e84ca
--- /dev/null
+++ b/crates/fula-client/src/block_cache.rs
@@ -0,0 +1,628 @@
+//! Persistent LRU block cache (Phase 2.2 of master-independent reads).
+//!
+//! Stores **encrypted** IPFS blocks fetched from the gateway race so that,
+//! during a master outage, re-reading the same file is served entirely
+//! from local disk instead of re-fetching from public gateways.
+//!
+//! Backed by a single redb file (ACID, no separate DB process). Two
+//! tables:
+//! - `blocks`: CID multihash bytes → encrypted block bytes
+//! - `meta`: CID multihash bytes → last-access unix-millis (for LRU)
+//!
+//! # Concurrency model
+//!
+//! - **One SDK instance per cache path.** redb requires exclusive access
+//!   to its file. Constructing two `BlockCache`s pointing at the same
+//!   path returns [`BlockCacheError::AlreadyOpen`].
+//! - **Concurrent get/put are safe** within a single instance via
+//!   redb's ACID transactions.
+//! - **Eviction is serialized** by an internal async mutex so concurrent
+//!   `put`s that all cross the budget don't all run eviction at once.
+//!
+//! # Eviction policy
+//!
+//! When `put` would push the cache over `max_bytes`, evict to a
+//! **80 %-of-budget low watermark** (rather than exactly the budget).
+//! That amortizes the eviction cost — without it, every put just-over
+//! the threshold would pay mutex + write-txn overhead to evict a single
+//! tiny entry.
+//!
+//! # Security
+//!
+//! The cache stores **encrypted** block bytes content-addressed by their
+//! IPFS CID. It does **not** verify CID-on-insert — CID verification is
+//! the caller's responsibility (Phase 2.3 enforces it before calling
+//! `put`.
The cache makes no security promises about content secrecy
+//! beyond what file-system permissions provide.
+//!
+//! # Backward compatibility
+//!
+//! Phase 2.2 is purely additive new infrastructure. No existing data is
+//! touched; there is no migration. The cache is opt-in via SDK config
+//! (Phase 2.4 wires it in). A first-time-ever open creates an empty
+//! redb file at the configured path.
+
+#![cfg(not(target_arch = "wasm32"))]
+
+use bytes::Bytes;
+use cid::Cid;
+use redb::{Database, ReadableTable, ReadableTableMetadata, TableDefinition};
+use std::path::{Path, PathBuf};
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::Arc;
+use std::time::{SystemTime, UNIX_EPOCH};
+use tokio::sync::Mutex;
+
+const BLOCKS: TableDefinition<&[u8], &[u8]> = TableDefinition::new("blocks");
+const META: TableDefinition<&[u8], u64> = TableDefinition::new("meta");
+
+/// Eviction low-watermark: when triggered, free space until usage is at
+/// or below this fraction of `max_bytes`. 80 % is the industry-standard
+/// "evict-once-amortize-many-puts" point.
+const EVICT_LOW_WATERMARK_NUMERATOR: u64 = 80;
+const EVICT_LOW_WATERMARK_DENOMINATOR: u64 = 100;
+
+/// Errors specific to the block cache. Surfaced separately from
+/// `ClientError` so tests can match without coupling to the global
+/// error enum.
+#[derive(Debug, thiserror::Error)]
+pub enum BlockCacheError {
+    /// Another process (or a previously-leaked instance in the same
+    /// process) holds the redb file lock.
+    ///
+    /// Only one `BlockCache` may be open per path at a time.
+    #[error("block cache file is already open by another instance: {path}")]
+    AlreadyOpen { path: PathBuf },
+
+    /// The cache file exists but is not a valid redb database.
+    ///
+    /// The caller decides whether to delete and recreate it. We do not
+    /// auto-delete — losing hundreds of MB of cache silently is a
+    /// foot-gun.
+    #[error("block cache file is corrupt: {0}")]
+    Corrupt(String),
+
+    /// A `put` was attempted with a block whose size exceeds the cache
+    /// budget. The cache cannot accept it; the caller should fetch
+    /// directly without caching.
+    #[error("block size {size} bytes exceeds cache budget {budget} bytes")]
+    BlockTooLarge { size: u64, budget: u64 },
+
+    /// Any other redb-layer failure, carried as its display string.
+    #[error("redb error: {0}")]
+    Redb(String),
+
+    #[error("io error: {0}")]
+    Io(#[from] std::io::Error),
+}
+
+// FIX(review): the `<redb::…Error>` source-type parameters on the five
+// `From` impls below were lost to angle-bracket stripping in the patch
+// text ("impl From for BlockCacheError" is not valid Rust). Restored
+// from each `fn from` argument type.
+impl From<redb::DatabaseError> for BlockCacheError {
+    fn from(e: redb::DatabaseError) -> Self {
+        // Single classification policy applied wherever DatabaseError
+        // surfaces (both inside `open()`'s map_err and via `?` in
+        // future callers). Lock-style errors → AlreadyOpen; corruption
+        // markers → Corrupt; everything else → generic Redb.
+        let s = e.to_string();
+        let lower = s.to_lowercase();
+        // "lock" is a substring of "locked", so one check covers both.
+        if lower.contains("in use") || lower.contains("lock") {
+            // Path is unknown at this conversion site — caller will see
+            // the message but lose the path. `open()` constructs
+            // AlreadyOpen directly with the path; this is the fallback
+            // for any other call site that uses `?`.
+            BlockCacheError::AlreadyOpen { path: PathBuf::new() }
+        } else if lower.contains("corrupt") || lower.contains("checksum") {
+            BlockCacheError::Corrupt(s)
+        } else {
+            BlockCacheError::Redb(s)
+        }
+    }
+}
+
+impl From<redb::TransactionError> for BlockCacheError {
+    fn from(e: redb::TransactionError) -> Self {
+        BlockCacheError::Redb(e.to_string())
+    }
+}
+impl From<redb::TableError> for BlockCacheError {
+    fn from(e: redb::TableError) -> Self {
+        BlockCacheError::Redb(e.to_string())
+    }
+}
+impl From<redb::StorageError> for BlockCacheError {
+    fn from(e: redb::StorageError) -> Self {
+        BlockCacheError::Redb(e.to_string())
+    }
+}
+impl From<redb::CommitError> for BlockCacheError {
+    fn from(e: redb::CommitError) -> Self {
+        BlockCacheError::Redb(e.to_string())
+    }
+}
+
+/// LRU block cache backed by a single redb file.
+///
+/// Cheap-clone via `Arc`: clones share the same database, so a `put`
+/// observed by one clone is immediately visible to all others.
+#[derive(Clone)]
+pub struct BlockCache {
+    inner: Arc<BlockCacheInner>,
+}
+
+struct BlockCacheInner {
+    db: Database,
+    max_bytes: u64,
+    /// Live byte counter, kept in sync with the BLOCKS table on every
+    /// `put` / eviction. Re-synced from the table on `open()` to recover
+    /// from any prior abort that left the counter desynced.
+    current_bytes: AtomicU64,
+    /// Serializes eviction passes so concurrent over-budget puts don't
+    /// each run their own eviction.
+    evict_lock: Mutex<()>,
+}
+
+impl BlockCache {
+    /// Open or create the block cache at `path` with a budget of
+    /// `max_bytes` total stored block-bytes.
+    ///
+    /// On open, scans the BLOCKS table to compute the current byte
+    /// count (recovers from any earlier abort that left the in-memory
+    /// counter desynced).
+    pub fn open(path: impl AsRef<Path>, max_bytes: u64) -> Result<Self, BlockCacheError> {
+        let path = path.as_ref();
+        if let Some(parent) = path.parent() {
+            if !parent.as_os_str().is_empty() {
+                std::fs::create_dir_all(parent)?;
+            }
+        }
+
+        let db = Database::create(path).map_err(|e| {
+            // redb returns a specific variant for "another process holds
+            // the lock" — but the variant name has shifted across redb
+            // versions. Do a string check as a portability hedge and
+            // map to AlreadyOpen so callers don't have to read redb
+            // source to interpret it.
+            let s = e.to_string().to_lowercase();
+            if s.contains("in use") || s.contains("locked") || s.contains("lock") {
+                BlockCacheError::AlreadyOpen { path: path.to_path_buf() }
+            } else {
+                BlockCacheError::from(e)
+            }
+        })?;
+
+        // Ensure tables exist (idempotent — opening a non-existent
+        // table inside a write txn creates it).
+        let init_txn = db.begin_write()?;
+        {
+            let _ = init_txn.open_table(BLOCKS)?;
+            let _ = init_txn.open_table(META)?;
+        }
+        init_txn.commit()?;
+
+        // Re-sync the byte counter by scanning. One-time cost at startup;
+        // eliminates the class of bugs where a prior abort desynced the
+        // atomic counter.
+        let mut total: u64 = 0;
+        {
+            let read = db.begin_read()?;
+            let table = read.open_table(BLOCKS)?;
+            let iter = table.iter()?;
+            for entry in iter {
+                let (_, val) = entry?;
+                total += val.value().len() as u64;
+            }
+        }
+
+        Ok(BlockCache {
+            inner: Arc::new(BlockCacheInner {
+                db,
+                max_bytes,
+                current_bytes: AtomicU64::new(total),
+                evict_lock: Mutex::new(()),
+            }),
+        })
+    }
+
+    /// Configured budget in bytes.
+    pub fn max_bytes(&self) -> u64 {
+        self.inner.max_bytes
+    }
+
+    /// Approximate current byte usage. Eventually consistent under
+    /// concurrent writes (the next read after all writes settle is
+    /// exact).
+    pub fn current_bytes(&self) -> u64 {
+        self.inner.current_bytes.load(Ordering::Acquire)
+    }
+
+    /// Number of cached blocks. O(1) approximation via the underlying
+    /// table length.
+    pub fn entry_count(&self) -> Result<u64, BlockCacheError> {
+        let read = self.inner.db.begin_read()?;
+        let table = read.open_table(BLOCKS)?;
+        Ok(table.len()?)
+    }
+
+    /// Look up a block by its CID. Returns `None` if not cached.
+    /// Updates the last-access timestamp on hit (for LRU ordering).
+    ///
+    /// PERF: this currently uses a write txn to update last-access on
+    /// hit, which serializes against other writers. Phase 2.4 will
+    /// expose this in the hot read path; if profiling shows contention,
+    /// switch to deferred or probabilistic access-time updates (e.g.,
+    /// buffer in-memory and flush periodically, or update on 1-in-N
+    /// reads). LRU is approximate by definition.
+    pub fn get(&self, cid: &Cid) -> Result<Option<Bytes>, BlockCacheError> {
+        let key = cid.to_bytes();
+        // Single write txn so the get-then-update-meta is atomic; under
+        // concurrent get/put the timestamp ordering stays consistent.
+ let txn = self.inner.db.begin_write()?; + let result = { + let blocks = txn.open_table(BLOCKS)?; + let val = blocks.get(key.as_slice())?; + val.map(|v| Bytes::copy_from_slice(v.value())) + }; + if result.is_some() { + let mut meta = txn.open_table(META)?; + meta.insert(key.as_slice(), now_ms())?; + } + txn.commit()?; + Ok(result) + } + + /// Insert (or overwrite) a block. Triggers LRU eviction down to the + /// 80 %-of-budget low watermark if this insert would cross + /// `max_bytes`. + /// + /// Idempotent under repeat-inserts of the same CID with identical + /// bytes — `current_bytes` accounting tracks the net delta. + pub async fn put(&self, cid: &Cid, data: &[u8]) -> Result<(), BlockCacheError> { + let new_size = data.len() as u64; + if new_size > self.inner.max_bytes { + // A single block larger than the entire budget can't be + // cached. Surface as a typed variant so Phase 2.4 can + // dispatch on it ("skip caching, fetch directly"). + return Err(BlockCacheError::BlockTooLarge { + size: new_size, + budget: self.inner.max_bytes, + }); + } + + // Eviction: if this insert would push us over budget, evict + // (under the lock) until we're at the low watermark. Note the + // budget check uses the *current* size, not the post-insert + // size — over-tightening to "fit exactly" leads to churn. + let cur = self.inner.current_bytes.load(Ordering::Acquire); + if cur + new_size > self.inner.max_bytes { + let _guard = self.inner.evict_lock.lock().await; + // Re-check under the lock — another concurrent put may have + // already evicted enough. 
+            let cur = self.inner.current_bytes.load(Ordering::Acquire);
+            if cur + new_size > self.inner.max_bytes {
+                let target = (self.inner.max_bytes * EVICT_LOW_WATERMARK_NUMERATOR
+                    / EVICT_LOW_WATERMARK_DENOMINATOR)
+                    .saturating_sub(new_size);
+                self.evict_to(target)?;
+            }
+        }
+
+        let key = cid.to_bytes();
+        let now = now_ms();
+        let txn = self.inner.db.begin_write()?;
+        let prior_size: u64 = {
+            let mut blocks = txn.open_table(BLOCKS)?;
+            let prior = blocks
+                .get(key.as_slice())?
+                .map(|v| v.value().len() as u64)
+                .unwrap_or(0);
+            blocks.insert(key.as_slice(), data)?;
+            prior
+        };
+        {
+            let mut meta = txn.open_table(META)?;
+            meta.insert(key.as_slice(), now)?;
+        }
+        txn.commit()?;
+
+        // Adjust the byte counter by net delta. Idempotent for
+        // identical re-inserts (delta = 0).
+        if new_size > prior_size {
+            self.inner
+                .current_bytes
+                .fetch_add(new_size - prior_size, Ordering::AcqRel);
+        } else if prior_size > new_size {
+            self.inner
+                .current_bytes
+                .fetch_sub(prior_size - new_size, Ordering::AcqRel);
+        }
+        Ok(())
+    }
+
+    /// Evict LRU entries until `current_bytes <= target_bytes`. Caller
+    /// must hold `evict_lock`. Atomic via a single redb write txn.
+    fn evict_to(&self, target_bytes: u64) -> Result<(), BlockCacheError> {
+        // Snapshot meta entries sorted by last-access ascending. At
+        // 256 MiB / 1 KiB blocks this is ~256 k entries — a few hundred
+        // microseconds. Acceptable.
+        let txn = self.inner.db.begin_write()?;
+        let mut entries: Vec<(Vec<u8>, u64)> = {
+            let meta = txn.open_table(META)?;
+            meta.iter()?
+                .filter_map(Result::ok)
+                .map(|(k, v)| (k.value().to_vec(), v.value()))
+                .collect()
+        };
+        entries.sort_by_key(|(_, ts)| *ts);
+
+        let mut bytes_freed: u64 = 0;
+        let mut evicted_keys: Vec<Vec<u8>> = Vec::new();
+        let cur = self.inner.current_bytes.load(Ordering::Acquire);
+        let need = cur.saturating_sub(target_bytes);
+
+        {
+            let mut blocks = txn.open_table(BLOCKS)?;
+            let mut meta = txn.open_table(META)?;
+            for (key, _ts) in entries {
+                if bytes_freed >= need {
+                    break;
+                }
+                let block_size = blocks
+                    .get(key.as_slice())?
+                    .map(|v| v.value().len() as u64)
+                    .unwrap_or(0);
+                blocks.remove(key.as_slice())?;
+                meta.remove(key.as_slice())?;
+                bytes_freed = bytes_freed.saturating_add(block_size);
+                evicted_keys.push(key);
+            }
+        }
+        txn.commit()?;
+
+        self.inner
+            .current_bytes
+            .fetch_sub(bytes_freed, Ordering::AcqRel);
+        tracing::debug!(
+            evicted = evicted_keys.len(),
+            bytes_freed = bytes_freed,
+            target = target_bytes,
+            "block_cache: LRU eviction complete"
+        );
+        Ok(())
+    }
+}
+
+fn now_ms() -> u64 {
+    SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .map(|d| d.as_millis() as u64)
+        .unwrap_or(0)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use cid::multihash::Multihash;
+    use std::time::Duration;
+    use tempfile::TempDir;
+
+    /// Build a deterministic CID from a small u64 seed for test fixtures.
+ fn test_cid(seed: u64) -> Cid { + let mut bytes = [0u8; 32]; + bytes[..8].copy_from_slice(&seed.to_le_bytes()); + let mh = Multihash::<64>::wrap(0x1e /* blake3 */, &bytes).unwrap(); + Cid::new_v1(0x55 /* raw */, mh) + } + + fn open_cache(dir: &TempDir, max: u64) -> BlockCache { + BlockCache::open(dir.path().join("cache.redb"), max).expect("open") + } + + #[tokio::test] + async fn test_put_get_roundtrip() { + let dir = TempDir::new().unwrap(); + let cache = open_cache(&dir, 1024 * 1024); + + let cid = test_cid(1); + let data = b"hello world"; + cache.put(&cid, data).await.expect("put"); + + let got = cache.get(&cid).expect("get").expect("hit"); + assert_eq!(got.as_ref(), data); + } + + #[tokio::test] + async fn test_get_missing_returns_none() { + let dir = TempDir::new().unwrap(); + let cache = open_cache(&dir, 1024 * 1024); + + let cid = test_cid(42); + assert!(cache.get(&cid).expect("get").is_none()); + } + + #[tokio::test] + async fn test_persistence_across_open_close() { + // Backward-compat-critical: an existing on-disk cache must + // survive an SDK restart and serve cached blocks. + let dir = TempDir::new().unwrap(); + let cid = test_cid(7); + let data = b"persistent block bytes"; + + { + let cache = open_cache(&dir, 1024 * 1024); + cache.put(&cid, data).await.expect("put"); + // drop happens at end of scope + } + { + let cache = open_cache(&dir, 1024 * 1024); + let got = cache.get(&cid).expect("get").expect("survived restart"); + assert_eq!(got.as_ref(), data); + // current_bytes is correctly re-synced from the DB on open. + assert_eq!(cache.current_bytes(), data.len() as u64); + } + } + + #[tokio::test] + async fn test_idempotent_put_does_not_grow() { + // Re-inserting the same CID with identical bytes must not double-count. 
+ let dir = TempDir::new().unwrap(); + let cache = open_cache(&dir, 1024 * 1024); + let cid = test_cid(5); + let data = vec![0u8; 4096]; + + cache.put(&cid, &data).await.expect("put 1"); + let after_first = cache.current_bytes(); + cache.put(&cid, &data).await.expect("put 2"); + let after_second = cache.current_bytes(); + + assert_eq!(after_first, data.len() as u64); + assert_eq!(after_second, after_first, "re-insert must not grow current_bytes"); + assert_eq!(cache.entry_count().expect("count"), 1); + } + + #[tokio::test] + async fn test_eviction_on_overflow_keeps_size_under_budget() { + // Insert N blocks of size B each, with budget = (N/2) * B. + // After all inserts settle, current_bytes <= max_bytes * 100/100. + // (We aim for the 80 % low watermark on each eviction.) + let dir = TempDir::new().unwrap(); + let block_size = 16 * 1024; // 16 KiB + let n_blocks = 20; + let budget = (n_blocks as u64 / 2) * block_size; // ~10 blocks fit + + let cache = open_cache(&dir, budget); + + for i in 0..n_blocks { + let cid = test_cid(i); + let data = vec![i as u8; block_size as usize]; + cache.put(&cid, &data).await.expect("put"); + } + + let cur = cache.current_bytes(); + assert_eq!(cache.max_bytes(), budget, "max_bytes accessor returns the configured budget"); + assert!( + cur <= budget, + "current_bytes {} must be <= max_bytes {}", + cur, + budget + ); + // We had eviction (otherwise current_bytes would equal n_blocks * block_size). + assert!( + cur < (n_blocks as u64) * block_size, + "expected at least one eviction; current={}, total-without-evict={}", + cur, + (n_blocks as u64) * block_size + ); + } + + #[tokio::test] + async fn test_lru_oldest_evicted_first() { + // Insert 3 blocks; access #0 to refresh it; insert a 4th to + // trigger eviction. The evicted block must be #1 (oldest + // last-access), NOT #0 (just accessed). + let dir = TempDir::new().unwrap(); + let block_size = 1024; + // Budget exactly 3 blocks — the 4th insert must evict. 
+ let cache = open_cache(&dir, 3 * block_size); + + let data = vec![0u8; block_size as usize]; + cache.put(&test_cid(0), &data).await.expect("put 0"); + // Sleep 5ms so timestamps are reliably ordered. + tokio::time::sleep(Duration::from_millis(5)).await; + cache.put(&test_cid(1), &data).await.expect("put 1"); + tokio::time::sleep(Duration::from_millis(5)).await; + cache.put(&test_cid(2), &data).await.expect("put 2"); + tokio::time::sleep(Duration::from_millis(5)).await; + + // Refresh #0 → it becomes the most-recently-accessed. + let _ = cache.get(&test_cid(0)).expect("get 0").expect("hit 0"); + tokio::time::sleep(Duration::from_millis(5)).await; + + // Insert #3 → must evict (low-watermark = 80% of 3 = 2.4 blocks + // of budget; eviction frees enough to fit the new 1-block). + cache.put(&test_cid(3), &data).await.expect("put 3"); + + // #1 (oldest) should be gone; #0 (refreshed) should still be + // present. + assert!( + cache.get(&test_cid(0)).expect("get").is_some(), + "refreshed #0 must survive eviction" + ); + assert!( + cache.get(&test_cid(1)).expect("get").is_none(), + "oldest #1 must be evicted" + ); + } + + #[tokio::test] + async fn test_concurrent_puts_no_corruption_under_eviction() { + // The hard concurrency case: K concurrent puts, each within + // budget alone, but K-puts collectively over-budget. Verify + // post-condition: current_bytes <= max_bytes. + let dir = TempDir::new().unwrap(); + let block_size = 4 * 1024; // 4 KiB + let n_concurrent = 16; + // Budget = half of total → at least half must be evicted. 
+ let budget = (n_concurrent as u64 / 2) * block_size; + + let cache = open_cache(&dir, budget); + + let mut handles = Vec::new(); + for i in 0..n_concurrent { + let cache = cache.clone(); + let data = vec![i as u8; block_size as usize]; + let cid = test_cid(i); + handles.push(tokio::spawn(async move { + cache.put(&cid, &data).await + })); + } + for h in handles { + h.await.expect("task panicked").expect("put failed"); + } + + // The mutex + watermark policy guarantees we never permanently + // exceed budget — even though briefly between checks we might + // see a transient overshoot. + let cur = cache.current_bytes(); + assert!( + cur <= budget, + "post-concurrency current_bytes {} > budget {}", + cur, + budget + ); + } + + #[tokio::test] + async fn test_block_too_large_returns_typed_error() { + // A block larger than the entire cache budget must surface as + // BlockTooLarge — not as a generic Redb(...) string error — + // so Phase 2.4 can dispatch on it cleanly ("skip caching, + // fetch directly"). + let dir = TempDir::new().unwrap(); + let cache = open_cache(&dir, 1024); // 1 KiB budget + + let cid = test_cid(99); + let big_block = vec![0u8; 4096]; // 4 KiB > 1 KiB budget + + match cache.put(&cid, &big_block).await { + Err(BlockCacheError::BlockTooLarge { size, budget }) => { + assert_eq!(size, 4096); + assert_eq!(budget, 1024); + } + other => panic!("expected BlockTooLarge, got {:?}", other), + } + } + + #[tokio::test] + async fn test_idempotent_open_after_clean_shutdown() { + // Simulates: SDK opens cache, writes, drops cleanly, re-opens. + // This is the common case for short-lived CLIs. + let dir = TempDir::new().unwrap(); + for round in 0..3 { + let cache = open_cache(&dir, 1024 * 1024); + let cid = test_cid(round as u64 * 100); + let data = vec![round as u8; 256]; + cache.put(&cid, &data).await.expect("put"); + assert!(cache.get(&cid).expect("get").is_some()); + // current_bytes should equal data sizes accumulated across rounds. 
+            assert!(cache.current_bytes() >= 256);
+        }
+    }
+}
diff --git a/crates/fula-client/src/client.rs b/crates/fula-client/src/client.rs
index 7ccbf51..af49186 100644
--- a/crates/fula-client/src/client.rs
+++ b/crates/fula-client/src/client.rs
@@ -2,11 +2,13 @@
 use crate::{
     Config, ClientError, Result,
+    health_gate::{HealthGate, GateDecision},
     types::*,
 };
 use bytes::Bytes;
 use reqwest::{Client, Response, header};
 use std::collections::HashMap;
+use std::sync::Arc;
 use tracing::{debug, instrument};
 
 /// Fula storage client
@@ -14,6 +16,12 @@ use tracing::{debug, instrument};
 pub struct FulaClient {
     config: Config,
     http: Client,
+    /// Phase 2.1 of master-independent reads. `Some` when
+    /// `Config::health_gate_enabled = true`; shared across all clones via
+    /// `Arc` so a failure observed in one task immediately silences the
+    /// rest. `None` when the feature is off — request path then runs
+    /// exactly as before (backward-compat).
+    health_gate: Option<Arc<HealthGate>>,
 }
 
 impl FulaClient {
@@ -34,7 +42,13 @@ impl FulaClient {
 
         let http = builder.build().map_err(ClientError::Http)?;
 
-        Ok(Self { config, http })
+        let health_gate = if config.health_gate_enabled {
+            Some(Arc::new(HealthGate::new(config.health_gate_ttl)))
+        } else {
+            None
+        };
+
+        Ok(Self { config, http, health_gate })
     }
 
     /// Create with default configuration
@@ -460,8 +474,24 @@ impl FulaClient {
         headers: Option<HashMap<String, String>>,
         body: Option<Bytes>,
     ) -> Result<Response> {
+        // Phase 2.1: consult the health gate before sending. When Down
+        // within TTL, short-circuit with MasterUnreachable so the caller
+        // doesn't pay the per-read timeout. When the TTL has elapsed the
+        // gate auto-allows a probe through.
+ if let Some(gate) = &self.health_gate { + if let GateDecision::ShortCircuit { down_for_secs } = gate.decide() { + debug!( + method = %method, + path = %path, + "health gate Down → short-circuiting (down_for_secs={})", + down_for_secs + ); + return Err(ClientError::MasterUnreachable { down_for_secs }); + } + } + let url = format!("{}{}", self.config.endpoint, path); - + let mut req = match method { "GET" => self.http.get(&url), "PUT" => self.http.put(&url), @@ -494,10 +524,36 @@ impl FulaClient { } debug!("Sending {} request to {}", method, url); - let response = req.send().await?; + let response = match req.send().await { + Ok(r) => r, + Err(e) => { + // Connection-level error (refused, RST, DNS, timeout). Treat + // as a master-down signal for the gate's purposes. Returning + // the original error preserves caller diagnostics. + if let Some(gate) = &self.health_gate { + gate.record_failure(); + } + return Err(ClientError::Http(e)); + } + }; // Check for errors let status = response.status(); + + // Phase 2.1: classify the response status for the health gate. + // 5xx → master-side failure → record_failure + // 4xx → request-level (auth, not-found, precondition, etc.); NOT + // a master-down signal — the server responded, the request + // was just bad. Don't touch the gate. + // 2xx/3xx → success → record_success (also clears any prior Down) + if let Some(gate) = &self.health_gate { + if status.is_server_error() { + gate.record_failure(); + } else if status.is_success() { + gate.record_success(); + } + } + if !status.is_success() { // 412 Precondition Failed surfaces as ConcurrentModification so // callers using If-Match / If-None-Match can retry distinctly. 
diff --git a/crates/fula-client/src/config.rs b/crates/fula-client/src/config.rs index d400f06..b9187cd 100644 --- a/crates/fula-client/src/config.rs +++ b/crates/fula-client/src/config.rs @@ -43,6 +43,17 @@ pub struct Config { /// mid-stream per-chunk AEAD + size check in the engine itself, so the /// ceiling is an allocation guard, not a security boundary. pub buffered_download_max_bytes: u64, + + /// Phase 2.1 of master-independent reads: enable the master health + /// gate. Off by default (backward-compat). When on, the SDK observes + /// request outcomes and short-circuits with `MasterUnreachable` after + /// two consecutive failures, instead of paying the per-read timeout. + pub health_gate_enabled: bool, + + /// TTL of the `Down` state when `health_gate_enabled = true`. After + /// this duration elapses, the next request is allowed through as a + /// probe (without resetting state — only an observed success resets). + pub health_gate_ttl: Duration, } impl Default for Config { @@ -58,6 +69,8 @@ impl Default for Config { multipart_chunk_size: 256 * 1024, // 256 KB (must be < 1MB for IPFS) per_chunk_download_timeout: Duration::from_secs(300), // 5 min buffered_download_max_bytes: 256 * 1024 * 1024, // 256 MB + health_gate_enabled: false, // backward-compat: off by default + health_gate_ttl: Duration::from_secs(30), } } } diff --git a/crates/fula-client/src/encryption.rs b/crates/fula-client/src/encryption.rs index 7269d01..dc79cfb 100644 --- a/crates/fula-client/src/encryption.rs +++ b/crates/fula-client/src/encryption.rs @@ -2902,6 +2902,26 @@ impl EncryptedClient { Ok(()) } + /// Phase 1.2 of master-independent reads: compute the blinded bucket + /// lookup key as hex for the `x-amz-meta-fula-bucket-lookup-h` header. + /// + /// `bucket_lookup_h = BLAKE3(MetadataKey || bucket_name)[..16]`, where + /// `MetadataKey = derive_path_key("fula-metadata-v1")`. Hex-encoded + /// (32 chars). The 16-byte truncation matches master's `hashed_user_id` + /// convention. 
Master never sees `MetadataKey`. + /// + /// Attached on every manifest root commit (sharded v7, monolithic v4, + /// and the v1→v7 migration path) so master's put_object handler can + /// populate `BucketMetadata.bucket_lookup_h` regardless of which forest + /// format the SDK is using. Idempotent on master's side. + fn compute_bucket_lookup_h_hex(&self, bucket: &str) -> String { + let metadata_key = self.encryption.key_manager.derive_path_key("fula-metadata-v1"); + let mut input = metadata_key.as_bytes().to_vec(); + input.extend_from_slice(bucket.as_bytes()); + let hash = blake3::hash(&input); + hex::encode(&hash.as_bytes()[..16]) + } + /// Save the private forest index for a bucket (monolithic v4 format with AAD+sequence) pub async fn save_forest(&self, bucket: &str, forest: &PrivateForest) -> Result<()> { let forest_dek = self.encryption.key_manager.derive_path_key(&format!("forest:{}", bucket)); @@ -2924,8 +2944,11 @@ impl EncryptedClient { let data = encrypted.to_bytes() .map_err(ClientError::Encryption)?; + // Phase 1.2: monolithic v4 forest is also a manifest-root commit. + // Same header semantics as save_sharded_hamt_forest's Phase 2 PUT. let metadata = ObjectMetadata::new() - .with_content_type("application/octet-stream"); + .with_content_type("application/octet-stream") + .with_metadata("fula-bucket-lookup-h", &self.compute_bucket_lookup_h_hex(bucket)); let put_result = self.inner.put_object_with_metadata_conditional( bucket, @@ -3237,8 +3260,11 @@ impl EncryptedClient { let data = encrypted_manifest.to_bytes() .map_err(ClientError::Encryption)?; + // Phase 1.2: sharded HAMT v7 manifest root commit. See + // compute_bucket_lookup_h_hex for header semantics. 
let metadata = ObjectMetadata::new() - .with_content_type("application/octet-stream"); + .with_content_type("application/octet-stream") + .with_metadata("fula-bucket-lookup-h", &self.compute_bucket_lookup_h_hex(bucket)); let put_result = self.inner.put_object_with_metadata_conditional( bucket, @@ -3956,11 +3982,20 @@ impl EncryptedClient { // our HEAD (or GET) and this PUT loses the race — we defer and retry // next session. Crucial because the in-process `migration_lock.write()` // is NOT held during load-time-triggered migration. + // + // Phase 1.2: v1→v7 migration is a manifest-root commit. Attach the + // bucket-lookup-h header so master can populate `bucket_lookup_h` + // here too — otherwise users who migrate to v7 via this path (rather + // than save_sharded_hamt_forest) would never get their lookup_h set. let put_result = match self.inner.put_object_with_metadata_conditional( bucket, &index_key, Bytes::from(manifest_data), - Some(ObjectMetadata::new().with_content_type("application/octet-stream")), + Some( + ObjectMetadata::new() + .with_content_type("application/octet-stream") + .with_metadata("fula-bucket-lookup-h", &self.compute_bucket_lookup_h_hex(bucket)), + ), Some(&v1_etag), None, ).await { diff --git a/crates/fula-client/src/error.rs b/crates/fula-client/src/error.rs index f042329..39920a8 100644 --- a/crates/fula-client/src/error.rs +++ b/crates/fula-client/src/error.rs @@ -98,6 +98,14 @@ pub enum ClientError { /// re-enter the load-time migration path. #[error("Migration lock held for bucket {bucket} (expires at {expires_at} ms)")] MigrationLockHeld { bucket: String, expires_at: i64 }, + + /// Phase 2.1 of master-independent reads: the SDK's health gate + /// observed master is unreachable and short-circuited the request. + /// Phase 2.4 will catch this variant and trigger the gateway-race + /// fallback. Standalone (Phase 2.1 only), this turns "wait 3s for + /// timeout" into "fast-fail with a clear signal." 
+ #[error("Master unreachable (health gate; down for ~{down_for_secs}s)")] + MasterUnreachable { down_for_secs: u64 }, } impl ClientError { diff --git a/crates/fula-client/src/gateway_fetch.rs b/crates/fula-client/src/gateway_fetch.rs new file mode 100644 index 0000000..5c20ff4 --- /dev/null +++ b/crates/fula-client/src/gateway_fetch.rs @@ -0,0 +1,1306 @@ +//! Multi-gateway race + CID verification (Phase 2.3 of master-independent reads). +//! +//! When master is unreachable (per the [`crate::health_gate`]), Phase 2.4 +//! routes reads through this module: race fetches against several public +//! IPFS gateways with dynamic priority, verify each response's bytes +//! against the requested CID's multihash, return the first verified hit. +//! +//! # Default gateway list (post-Step-0) +//! +//! Six gateways in quality-priority order. **`ipfs.cloud.fx.land/gateway/` +//! is intentionally NOT in the default list** — Step 0 verification on +//! 2026-05-01 found it returns HTTP 500 with `{"error":"Error fetching +//! content from IPFS"}` for dag-cbor (codec 0x71) CIDs while serving +//! raw codec correctly. Forest manifest pages and bucket Prolly Tree +//! roots — the metadata SDK cold-start needs — are dag-cbor. Including +//! it in the race would burn a slot returning 500s for half of all +//! requests. Re-add it via `Config::gateway_fallback_urls` once the +//! gateway-side codec bug is fixed (one-line config change, no code +//! change). +//! +//! # CID verification +//! +//! Every fetched body is re-hashed against the multihash declared in +//! the requested CID. We support standard IPFS multihash codes: +//! +//! - `0x1e` (BLAKE3) — used by encrypted SDK chunk uploads via +//! `block/put?mhtype=blake3` (`fula-blockstore::ipfs::put_block_raw`) +//! - `0x12` (SHA2-256) — used by IPFS UnixFS chunked uploads +//! (`block/put` default; `add?cid-version=1`) +//! +//! Any other multihash code → [`VerifyError::UnsupportedHashCode`]. We +//! 
deliberately do NOT support `cid_utils::create_cid`'s quirky +//! `sha2_256(blake3(data))` scheme because that path is only used by +//! the in-memory `MemoryBlockStore` (test backend) and never produces +//! CIDs that round-trip to public IPFS gateways. +//! +//! # Backward compatibility +//! +//! Phase 2.3 is purely additive new infrastructure. No callers exist +//! yet — Phase 2.4 wires this into [`crate::encryption`]'s GET path. +//! Existing reads against a healthy master continue to behave exactly +//! as before. + +#![cfg(not(target_arch = "wasm32"))] + +use bytes::Bytes; +use cid::Cid; +use parking_lot::Mutex; +use std::time::{Duration, Instant}; + +/// IPFS multihash code for BLAKE3-256. +const MULTIHASH_BLAKE3: u64 = 0x1e; +/// IPFS multihash code for SHA2-256. +const MULTIHASH_SHA2_256: u64 = 0x12; + +/// Default decay time-constant. After `TAU`, a penalty of 1.0 decays +/// to ~0.37; after `3 * TAU` (~3 minutes for default 60s) it's at ~5%. +/// Configurable per `effective_priority` call so tests can use a much +/// shorter TAU without sleeping for minutes. +pub(crate) const DEFAULT_DECAY_TAU: Duration = Duration::from_secs(60); + +/// Cooldown after a CID-verification failure. A gateway returning +/// content that fails CID verification is "returned wrong bytes" — +/// a strong-signal event. Penalty alone (decay TAU=60s) returns to +/// ~5% in 3 min, which is too fast to trust again. Cooldown enforces +/// a hard 5-min skip period before the gateway can re-enter the race. +pub(crate) const VERIFY_FAILURE_COOLDOWN: Duration = Duration::from_secs(300); + +/// Default per-gateway request timeout. The default reqwest timeout +/// (30s) is too generous for K=3 racing; if Cloudflare hangs, we want +/// the race to give up on it within 10s and let dweb.link's faster +/// response win. +pub(crate) const DEFAULT_FETCH_TIMEOUT: Duration = Duration::from_secs(10); + +/// Default ordered list of public gateway URL templates. 
Lower index =
+/// higher base priority. Phase 2.3 races the top K (default 3) of these
+/// in parallel; Phase 2.4 will use this directly.
+///
+/// Each template uses a literal `{cid}` placeholder that gets substituted
+/// with the requested CID's `to_string()` form. All six gateways speak
+/// standard `/ipfs/` URL conventions.
+pub fn default_gateway_urls() -> Vec<String> {
+    vec![
+        "https://cloudflare-ipfs.com/ipfs/{cid}".to_string(),
+        "https://dweb.link/ipfs/{cid}".to_string(),
+        "https://ipfs.io/ipfs/{cid}".to_string(),
+        "https://trustless-gateway.link/ipfs/{cid}".to_string(),
+        "https://4everland.io/ipfs/{cid}".to_string(),
+        "https://gateway.pinata.cloud/ipfs/{cid}".to_string(),
+    ]
+}
+
+/// Errors specific to gateway-fetched body verification.
+#[derive(Debug, thiserror::Error, PartialEq, Eq)]
+pub enum VerifyError {
+    /// The CID's multihash code is one we don't know how to verify.
+    /// Returned for codes other than BLAKE3 (0x1e) and SHA2-256 (0x12).
+    #[error("unsupported multihash code: 0x{code:x}")]
+    UnsupportedHashCode { code: u64 },
+
+    /// The fetched bytes hash to a different digest than the CID's
+    /// multihash. Possible causes: gateway returned wrong content,
+    /// in-flight tampering, or the gateway has the wrong block under
+    /// this CID (shouldn't happen since CIDs are content-addressed).
+    #[error("digest mismatch (CID hash code 0x{code:x})")]
+    DigestMismatch { code: u64 },
+}
+
+/// Re-hash `data` using the algorithm declared in `cid`'s multihash and
+/// compare against the CID's digest. Returns `Ok(())` if the bytes
+/// content-address to the CID, otherwise [`VerifyError`].
+///
+/// This is the security boundary: a successful return means the bytes
+/// are exactly what the CID claims they are. Callers MUST refuse to
+/// hand the bytes to downstream consumers if this fails.
+pub fn verify_cid_against_bytes(cid: &Cid, data: &[u8]) -> Result<(), VerifyError> {
+    let mh = cid.hash();
+    let code = mh.code();
+    let expected_digest = mh.digest();
+    match code {
+        MULTIHASH_BLAKE3 => {
+            let actual = blake3::hash(data);
+            if actual.as_bytes().as_slice() == expected_digest {
+                Ok(())
+            } else {
+                Err(VerifyError::DigestMismatch { code })
+            }
+        }
+        MULTIHASH_SHA2_256 => {
+            use sha2::{Digest, Sha256};
+            let mut hasher = Sha256::new();
+            hasher.update(data);
+            let actual = hasher.finalize();
+            if actual.as_slice() == expected_digest {
+                Ok(())
+            } else {
+                Err(VerifyError::DigestMismatch { code })
+            }
+        }
+        other => Err(VerifyError::UnsupportedHashCode { code: other }),
+    }
+}
+
+// ============================================================
+// Gateway pool data structures (Checkpoint A skeleton).
+// Behavior (penalty math, decay, fetch, race) lands in subsequent
+// checkpoints.
+// ============================================================
+
+/// Per-gateway runtime state. Penalty + cooldown + last-observed
+/// timestamp drive the dynamic priority calculation in Checkpoint B.
+#[derive(Debug)]
+pub(crate) struct GatewayState {
+    /// 0.0 = healthy, 1.0 = fully sidelined. Decays toward 0 over time.
+    pub(crate) penalty: f32,
+    /// Bumped on each failure; reset on each success. Used by the
+    /// circuit-breaker open rule (Checkpoint C).
+    pub(crate) consecutive_failures: u32,
+    /// When the last priority observation / state mutation happened.
+    /// Used to compute decay lazily on the next read.
+    pub(crate) last_observed_at: Instant,
+    /// Hard skip until this instant if Some — set on HTTP 429 with a
+    /// `Retry-After` header. Bypasses penalty math entirely while in
+    /// effect.
+    pub(crate) cooldown_until: Option<Instant>,
+}
+
+impl GatewayState {
+    fn fresh() -> Self {
+        Self {
+            penalty: 0.0,
+            consecutive_failures: 0,
+            last_observed_at: Instant::now(),
+            cooldown_until: None,
+        }
+    }
+}
+
+/// One entry in the gateway pool.
Cheap-clone via `Arc` (Checkpoint C
+/// will wrap `GatewayPool` in `Arc` so all SDK clones share state).
+#[derive(Debug)]
+pub(crate) struct Gateway {
+    /// URL template containing `{cid}` placeholder.
+    pub(crate) url_template: String,
+    /// Index in the configured list (0 = highest base priority).
+    pub(crate) base_priority: u8,
+    pub(crate) state: Mutex<GatewayState>,
+}
+
+impl Gateway {
+    fn new(url_template: String, base_priority: u8) -> Self {
+        Self {
+            url_template,
+            base_priority,
+            state: Mutex::new(GatewayState::fresh()),
+        }
+    }
+
+    /// Substitute the `{cid}` placeholder with the requested CID's
+    /// canonical string form.
+    pub(crate) fn url_for(&self, cid: &Cid) -> String {
+        self.url_template
+            .replace("{cid}", &cid.to_string())
+    }
+
+    /// Record a successful fetch + verify. Halves the penalty and
+    /// resets the consecutive-failure counter.
+    pub(crate) fn record_success(&self) {
+        let mut s = self.state.lock();
+        s.penalty *= 0.5;
+        s.consecutive_failures = 0;
+        s.last_observed_at = Instant::now();
+    }
+
+    /// Record a transient failure (5xx, timeout, connection error).
+    /// Bumps penalty by 0.3 (capped at 1.0) and increments the
+    /// consecutive-failure counter.
+    pub(crate) fn record_transient_failure(&self) {
+        let mut s = self.state.lock();
+        s.penalty = (s.penalty + 0.3).min(1.0);
+        s.consecutive_failures = s.consecutive_failures.saturating_add(1);
+        s.last_observed_at = Instant::now();
+    }
+
+    /// Record a CID-verification failure. Strong-signal event — the
+    /// gateway returned bytes that don't content-address to the CID.
+    /// Sets penalty to 1.0 AND a 5-minute cooldown. Cooldown is the
+    /// primary defense (skips this gateway from the race entirely);
+    /// penalty=1.0 is the diagnostic signal that recovers via decay
+    /// after cooldown lifts.
+ pub(crate) fn record_verify_failure(&self) { + let mut s = self.state.lock(); + s.penalty = 1.0; + s.consecutive_failures = s.consecutive_failures.saturating_add(1); + s.last_observed_at = Instant::now(); + s.cooldown_until = Some(Instant::now() + VERIFY_FAILURE_COOLDOWN); + } + + /// Record an HTTP 429 with `Retry-After` header. Sets cooldown + /// without changing penalty — rate limits are a load-shedding + /// signal, not a quality issue. + pub(crate) fn record_rate_limit(&self, retry_after: Duration) { + let mut s = self.state.lock(); + s.cooldown_until = Some(Instant::now() + retry_after); + } + + /// Compute the effective priority at `now` using the given decay + /// time-constant. Lower = higher priority (matches sort order). + /// + /// **Pure** — does NOT mutate state. Decay is computed lazily + /// from `(penalty, last_observed_at)`. Only events + /// (`record_success` / `record_transient_failure` / etc.) update + /// `last_observed_at`. This makes decay a function of + /// time-since-last-event, which is the property we want. + pub(crate) fn effective_priority(&self, now: Instant, tau: Duration) -> f32 { + let (penalty, last_obs) = { + let s = self.state.lock(); + (s.penalty, s.last_observed_at) + }; + let elapsed = now.saturating_duration_since(last_obs).as_secs_f32(); + let tau_secs = tau.as_secs_f32().max(0.001); + let decayed = penalty * (-elapsed / tau_secs).exp(); + self.base_priority as f32 + decayed * 3.0 + } + + /// True iff the gateway is in an active cooldown window. Phase 2.3 + /// Checkpoint C will use this to filter cooldowned gateways out of + /// the race candidate set entirely (rather than letting them + /// participate as f32::INFINITY losers). + pub(crate) fn is_in_cooldown(&self, now: Instant) -> bool { + let s = self.state.lock(); + s.cooldown_until.map_or(false, |until| now < until) + } +} + +/// Outcome of a single-gateway fetch+verify attempt. 
+#[derive(Debug, thiserror::Error)] +pub enum FetchError { + /// 5xx, request timeout, connection-level error. Caller bumps + /// the gateway's penalty. + #[error("transient gateway failure: {0}")] + Transient(String), + + /// HTTP 429 with `Retry-After` parsed. Caller sets cooldown_until + /// = now + retry_after; no penalty change. + #[error("rate limited (retry after {retry_after_secs}s)")] + RateLimited { retry_after_secs: u64 }, + + /// HTTP 404 / 410. Request-level outcome — gateway responded + /// correctly, content just isn't there. No penalty change. + #[error("content not found at gateway")] + NotFound, + + /// Gateway returned bytes that don't hash to the requested CID's + /// multihash. Caller calls `record_verify_failure` (penalty=1.0 + /// + 5-min cooldown). + #[error("CID verification failed: {0}")] + VerifyFailed(#[from] VerifyError), +} + +/// Fetch a single CID from one specific gateway with timeout + CID +/// verification. Used by the race in Checkpoint C; testable directly. +/// +/// On HTTP 200: reads body, verifies via `verify_cid_against_bytes`. +/// On HTTP 429: parses `Retry-After` (decimal-seconds form; falls +/// back to a 60-second default if the header is missing/unparseable). +/// On HTTP 4xx (other than 429) and 5xx: surfaces as the appropriate +/// `FetchError` variant. +/// +/// Note: this function does NOT call any of the gateway's `record_*` +/// methods. The race orchestrator (Checkpoint C) does that based on +/// the returned `FetchError` variant. Keeping the side-effects in +/// the orchestrator makes per-gateway behavior easier to test. 
+pub(crate) async fn fetch_one(
+    gateway: &Gateway,
+    cid: &Cid,
+    http: &reqwest::Client,
+    timeout: Duration,
+) -> Result<Bytes, FetchError> {
+    let url = gateway.url_for(cid);
+    let resp = http
+        .get(&url)
+        .timeout(timeout)
+        .send()
+        .await
+        .map_err(|e| FetchError::Transient(format!("send: {}", e)))?;
+
+    let status = resp.status();
+
+    if status.is_success() {
+        let body = resp
+            .bytes()
+            .await
+            .map_err(|e| FetchError::Transient(format!("body read: {}", e)))?;
+        verify_cid_against_bytes(cid, &body)?;
+        Ok(body)
+    } else if status.as_u16() == 429 {
+        // Retry-After: HTTP/1.1 spec allows either delta-seconds or
+        // an HTTP-date. Most public gateways emit delta-seconds. Fall
+        // back to a 60-second default for missing/unparseable headers
+        // so we don't loop hot against a rate-limiter.
+        let retry_after_secs = resp
+            .headers()
+            .get("retry-after")
+            .and_then(|v| v.to_str().ok())
+            .and_then(|s| s.trim().parse::<u64>().ok())
+            .unwrap_or(60);
+        Err(FetchError::RateLimited { retry_after_secs })
+    } else if status.as_u16() == 404 || status.as_u16() == 410 {
+        Err(FetchError::NotFound)
+    } else {
+        Err(FetchError::Transient(format!("HTTP {}", status.as_u16())))
+    }
+}
+
+/// A pool of gateways racing the same CID fetch. Constructed once per
+/// `FulaClient` and shared across all clones via `Arc` (Checkpoint C).
+#[derive(Debug)]
+pub struct GatewayPool {
+    pub(crate) gateways: Vec<Gateway>,
+    /// How many gateways to race in parallel for a single CID. Default
+    /// 3 — catches the median Cloudflare-occasional-blip without paying
+    /// the latency cost of a single sequential request, while not
+    /// wasting bandwidth on K-1 cancelled losers.
+    pub(crate) race_concurrency: usize,
+}
+
+impl GatewayPool {
+    /// Construct a pool with the default 6-gateway list and race
+    /// concurrency K=3. Use this in production unless an operator has
+    /// overridden via [`Config::gateway_fallback_urls`](crate::Config).
+    pub fn default_pool() -> Self {
+        let gateways = default_gateway_urls()
+            .into_iter()
+            .enumerate()
+            .map(|(i, url)| Gateway::new(url, i as u8))
+            .collect();
+        Self {
+            gateways,
+            race_concurrency: 3,
+        }
+    }
+
+    /// Construct a pool from explicit URL templates. Use this for tests
+    /// (against `wiremock`) or operator overrides.
+    pub fn with_gateways(urls: Vec<String>, race_concurrency: usize) -> Self {
+        let gateways = urls
+            .into_iter()
+            .enumerate()
+            .map(|(i, url)| Gateway::new(url, i as u8))
+            .collect();
+        Self {
+            gateways,
+            race_concurrency,
+        }
+    }
+
+    /// Number of gateways in the pool.
+    pub fn len(&self) -> usize {
+        self.gateways.len()
+    }
+
+    /// True if no gateways are configured (effectively disables
+    /// gateway-race fallback).
+    pub fn is_empty(&self) -> bool {
+        self.gateways.is_empty()
+    }
+
+    /// Select gateways eligible to race RIGHT NOW. Filters out
+    /// cooldowned gateways entirely (rather than letting them
+    /// participate as f32::INFINITY losers — a sentinel-value hack),
+    /// sorts the remaining by effective priority ascending (lower =
+    /// faster path), and takes the top `race_concurrency`.
+    ///
+    /// PERF: this calls `effective_priority` twice per gateway during
+    /// sort comparisons (O(n log n) calls total). For the default 6
+    /// gateways, that's ~30 evaluations — acceptable. If the pool
+    /// grows past ~20 gateways, swap to `sort_by_cached_key` with a
+    /// `Reverse(NotNan)` wrapper or pre-compute `(priority, gateway)`
+    /// tuples and sort those.
+    ///
+    /// Returns an empty `Vec` if every gateway is in cooldown.
+    /// Callers (`fetch_verified`) interpret empty as
+    /// [`GatewayPoolError::AllUnavailable`] — a "try later" signal,
+    /// distinct from [`GatewayPoolError::AllFailed`] (racers ran but
+    /// all returned errors).
+    pub(crate) fn select_for_race(&self, now: Instant) -> Vec<&Gateway> {
+        let mut alive: Vec<&Gateway> = self
+            .gateways
+            .iter()
+            .filter(|g| !g.is_in_cooldown(now))
+            .collect();
+        alive.sort_by(|a, b| {
+            a.effective_priority(now, DEFAULT_DECAY_TAU)
+                .total_cmp(&b.effective_priority(now, DEFAULT_DECAY_TAU))
+        });
+        alive.truncate(self.race_concurrency);
+        alive
+    }
+
+    /// Fetch a CID by racing the top-K eligible gateways in parallel.
+    /// Returns the first verified body. Cancels in-flight losers via
+    /// `Drop` of the spawned futures (reqwest cancels the underlying
+    /// HTTP request on `Response::drop`, releasing the socket).
+    ///
+    /// Per-racer outcomes update the racer's penalty/cooldown state
+    /// via the orchestrator (here) — `fetch_one` itself is pure.
+    ///
+    /// PERF: each settled future synchronously locks the per-gateway
+    /// `parking_lot::Mutex` to update penalty/cooldown. Two
+    /// simultaneous `fetch_verified` calls hitting the same gateway
+    /// will briefly contend on that lock. Negligible for v1; revisit
+    /// if profiling shows lock contention under heavy parallel-race
+    /// load.
+    pub async fn fetch_verified(
+        &self,
+        cid: &Cid,
+        http: &reqwest::Client,
+    ) -> Result<Bytes, GatewayPoolError> {
+        use futures::stream::FuturesUnordered;
+        use futures::StreamExt;
+
+        let now = Instant::now();
+        let candidates = self.select_for_race(now);
+        if candidates.is_empty() {
+            return Err(GatewayPoolError::AllUnavailable);
+        }
+
+        // Spawn one future per candidate. Each future returns a tuple
+        // (gateway_index_in_pool, fetch_result) so the post-race state
+        // mutation can apply to the right gateway.
+        let mut in_flight: FuturesUnordered<_> = candidates
+            .iter()
+            .enumerate()
+            .map(|(i, g)| {
+                let g_ref = *g;
+                async move {
+                    let r = fetch_one(g_ref, cid, http, DEFAULT_FETCH_TIMEOUT).await;
+                    (i, g_ref, r)
+                }
+            })
+            .collect();
+
+        let mut errors: Vec<String> = Vec::new();
+
+        while let Some((_idx, g, result)) = in_flight.next().await {
+            match result {
+                Ok(body) => {
+                    g.record_success();
+                    // Drop in_flight to cancel remaining racers.
+                    drop(in_flight);
+                    return Ok(body);
+                }
+                Err(FetchError::Transient(msg)) => {
+                    g.record_transient_failure();
+                    errors.push(format!("transient: {}", msg));
+                }
+                Err(FetchError::RateLimited { retry_after_secs }) => {
+                    g.record_rate_limit(Duration::from_secs(retry_after_secs));
+                    errors.push(format!("rate-limited (retry {}s)", retry_after_secs));
+                }
+                Err(FetchError::NotFound) => {
+                    // Request-level outcome — no penalty change. But
+                    // a 404 from this gateway means the content isn't
+                    // there; collect for diagnostic, race continues.
+                    errors.push("not-found".to_string());
+                }
+                Err(FetchError::VerifyFailed(ve)) => {
+                    g.record_verify_failure();
+                    errors.push(format!("verify-failed: {}", ve));
+                }
+            }
+        }
+
+        Err(GatewayPoolError::AllFailed { errors })
+    }
+}
+
+/// Outcome of a multi-gateway race. Distinct from `FetchError`
+/// because the race aggregates per-gateway results.
+#[derive(Debug, thiserror::Error)]
+pub enum GatewayPoolError {
+    /// Every gateway in the pool is currently in a cooldown window
+    /// (recent rate-limit or verify-failure). This is a "try again
+    /// later" signal — short-term unavailable, not a failure of the
+    /// content itself.
+    #[error("all gateways are in cooldown; retry later")]
+    AllUnavailable,
+
+    /// All eligible gateways were raced and all returned errors
+    /// (transient, rate-limited, not-found, or verify-failed).
+    /// `errors` lists the per-racer outcomes for diagnostic logging.
+ #[error("all gateway racers failed: {errors:?}")] + AllFailed { errors: Vec }, +} + +#[cfg(test)] +mod tests { + use super::*; + use cid::multihash::Multihash; + use sha2::{Digest, Sha256}; + + /// Build a raw-codec CID with a BLAKE3 multihash over `data`. This + /// matches what `block/put?mhtype=blake3` produces (the encrypted + /// SDK chunk-upload path). + fn cid_blake3(data: &[u8]) -> Cid { + let h = blake3::hash(data); + let mh = Multihash::<64>::wrap(MULTIHASH_BLAKE3, h.as_bytes()).unwrap(); + Cid::new_v1(0x55 /* raw */, mh) + } + + /// Build a dag-cbor CID with a SHA2-256 multihash over `data`. This + /// matches what `block/put` (default) and `add?cid-version=1` + /// produce. + fn cid_sha2(data: &[u8]) -> Cid { + let mut hasher = Sha256::new(); + hasher.update(data); + let digest = hasher.finalize(); + let mh = Multihash::<64>::wrap(MULTIHASH_SHA2_256, digest.as_slice()).unwrap(); + Cid::new_v1(0x71 /* dag-cbor */, mh) + } + + // ============================================================ + // default_gateway_urls + // ============================================================ + + #[test] + fn test_default_gateway_urls_list_is_six_entries() { + let urls = default_gateway_urls(); + assert_eq!(urls.len(), 6); + } + + #[test] + fn test_default_gateway_urls_does_not_include_fula_gateway() { + // Step-0 finding (2026-05-01): ipfs.cloud.fx.land/gateway/ + // returns 500 on dag-cbor codec. Default list MUST NOT include + // it until that codec bug is fixed. + let urls = default_gateway_urls(); + for url in &urls { + assert!( + !url.contains("ipfs.cloud.fx.land"), + "fula gateway must not be in default list (dag-cbor codec bug); found: {}", + url + ); + } + } + + #[test] + fn test_default_gateway_urls_quality_order() { + // Cloudflare is slot 0 (lowest latency, generous rate limits). + // Pinata is the last fallback. Verify the published quality + // order so a reorder is a deliberate change. 
+ let urls = default_gateway_urls(); + assert!(urls[0].contains("cloudflare-ipfs.com")); + assert!(urls[1].contains("dweb.link")); + assert!(urls[2].contains("ipfs.io")); + assert!(urls[3].contains("trustless-gateway.link")); + assert!(urls[4].contains("4everland.io")); + assert!(urls[5].contains("gateway.pinata.cloud")); + } + + #[test] + fn test_default_gateway_urls_have_cid_placeholder() { + for url in default_gateway_urls() { + assert!( + url.contains("{cid}"), + "url must have {{cid}} placeholder: {}", + url + ); + } + } + + // ============================================================ + // verify_cid_against_bytes + // ============================================================ + + #[test] + fn test_verify_blake3_match_passes() { + // The encrypted SDK's chunk-upload path produces blake3+raw + // CIDs. Verification of correct bytes against such a CID must + // pass. + let data = b"hello blake3 world"; + let cid = cid_blake3(data); + verify_cid_against_bytes(&cid, data).expect("blake3 verify on matching bytes"); + } + + #[test] + fn test_verify_blake3_mismatch_rejects() { + // Tampered bytes must be rejected with DigestMismatch. + let original = b"original content"; + let cid = cid_blake3(original); + + let tampered = b"tampered content"; + match verify_cid_against_bytes(&cid, tampered) { + Err(VerifyError::DigestMismatch { code }) => { + assert_eq!(code, MULTIHASH_BLAKE3); + } + other => panic!("expected DigestMismatch, got {:?}", other), + } + } + + #[test] + fn test_verify_sha2_match_passes() { + // IPFS UnixFS / standard `block/put` paths produce sha2-256 + // multihash CIDs. Verification of correct bytes must pass. 
+ let data = b"hello sha2 world"; + let cid = cid_sha2(data); + verify_cid_against_bytes(&cid, data).expect("sha2 verify on matching bytes"); + } + + #[test] + fn test_verify_sha2_mismatch_rejects() { + let original = b"sha2 original"; + let cid = cid_sha2(original); + let tampered = b"sha2 tampered"; + match verify_cid_against_bytes(&cid, tampered) { + Err(VerifyError::DigestMismatch { code }) => { + assert_eq!(code, MULTIHASH_SHA2_256); + } + other => panic!("expected DigestMismatch, got {:?}", other), + } + } + + #[test] + fn test_verify_unsupported_hash_code_rejects() { + // SHA3-256 (code 0x16) is NOT in our supported set. Even if + // the bytes "match" via sha2/blake3, we must refuse rather + // than fake a verification we can't actually perform. + let data = b"sha3 test"; + // Build a CID with an arbitrary code we don't support. + let mh = Multihash::<64>::wrap(0x16 /* sha3-256 */, &[0u8; 32]).unwrap(); + let cid = Cid::new_v1(0x55, mh); + match verify_cid_against_bytes(&cid, data) { + Err(VerifyError::UnsupportedHashCode { code }) => { + assert_eq!(code, 0x16); + } + other => panic!("expected UnsupportedHashCode, got {:?}", other), + } + } + + #[test] + fn test_verify_empty_data_against_empty_blake3() { + // Edge case: empty body. blake3("") has a well-defined digest; + // verification must work on length-0 inputs without panic. 
+ let cid = cid_blake3(b""); + verify_cid_against_bytes(&cid, b"").expect("empty bytes verify"); + } + + // ============================================================ + // GatewayPool skeleton (Checkpoint A — structure only) + // ============================================================ + + #[test] + fn test_default_pool_has_six_gateways() { + let pool = GatewayPool::default_pool(); + assert_eq!(pool.len(), 6); + assert_eq!(pool.race_concurrency, 3); + } + + #[test] + fn test_pool_with_gateways_sets_concurrency() { + let pool = GatewayPool::with_gateways( + vec!["https://test1.example/ipfs/{cid}".to_string()], + 2, + ); + assert_eq!(pool.len(), 1); + assert_eq!(pool.race_concurrency, 2); + } + + #[test] + fn test_pool_assigns_base_priority_by_index() { + let pool = GatewayPool::default_pool(); + for (i, g) in pool.gateways.iter().enumerate() { + assert_eq!(g.base_priority as usize, i); + } + } + + #[test] + fn test_gateway_url_for_substitutes_cid() { + let g = Gateway::new("https://example.test/ipfs/{cid}".to_string(), 0); + let cid = cid_blake3(b"x"); + let url = g.url_for(&cid); + assert!(url.contains(&cid.to_string())); + assert!(!url.contains("{cid}")); + } + + #[test] + fn test_gateway_state_starts_healthy() { + let g = Gateway::new("https://x/{cid}".to_string(), 0); + let s = g.state.lock(); + assert_eq!(s.penalty, 0.0); + assert_eq!(s.consecutive_failures, 0); + assert!(s.cooldown_until.is_none()); + } + + // ============================================================ + // Checkpoint B: per-gateway penalty math + cooldown + // ============================================================ + + #[test] + fn test_record_success_halves_penalty_and_resets_counter() { + let g = Gateway::new("https://x/{cid}".to_string(), 0); + // Pre-condition: simulate an existing penalty + { + let mut s = g.state.lock(); + s.penalty = 0.6; + s.consecutive_failures = 3; + } + g.record_success(); + let s = g.state.lock(); + assert!((s.penalty - 0.3).abs() < f32::EPSILON, 
"penalty must be halved"); + assert_eq!(s.consecutive_failures, 0); + } + + #[test] + fn test_record_transient_failure_caps_at_one() { + let g = Gateway::new("https://x/{cid}".to_string(), 0); + // Five consecutive failures must cap penalty at exactly 1.0 + // (3 * 0.3 = 0.9, then +0.3 → 1.0; further +0.3 stays at 1.0). + for _ in 0..5 { + g.record_transient_failure(); + } + let s = g.state.lock(); + assert!(s.penalty <= 1.0 + f32::EPSILON, "penalty must cap at 1.0"); + assert!(s.penalty > 0.99, "penalty must reach 1.0 after 5 failures"); + assert_eq!(s.consecutive_failures, 5); + } + + #[test] + fn test_record_verify_failure_pegs_penalty_and_sets_cooldown() { + let g = Gateway::new("https://x/{cid}".to_string(), 0); + let before = Instant::now(); + g.record_verify_failure(); + let s = g.state.lock(); + assert_eq!(s.penalty, 1.0); + let cd = s.cooldown_until.expect("cooldown must be set"); + // Cooldown should be ~5 minutes from now. + let target = before + VERIFY_FAILURE_COOLDOWN; + // Allow slack for the time elapsed during the test. + assert!(cd >= target - Duration::from_secs(1)); + assert!(cd <= target + Duration::from_secs(2)); + } + + #[test] + fn test_record_rate_limit_sets_cooldown_only() { + // Rate-limit cooldown must NOT change penalty (load-shedding, + // not a quality issue). + let g = Gateway::new("https://x/{cid}".to_string(), 0); + let pre_penalty = g.state.lock().penalty; + g.record_rate_limit(Duration::from_secs(30)); + let s = g.state.lock(); + assert_eq!(s.penalty, pre_penalty, "rate limit must not change penalty"); + assert!(s.cooldown_until.is_some()); + } + + #[test] + fn test_is_in_cooldown_transitions_through_expiry() { + let g = Gateway::new("https://x/{cid}".to_string(), 0); + // Fresh: not in cooldown. 
+ assert!(!g.is_in_cooldown(Instant::now())); + + g.record_rate_limit(Duration::from_millis(50)); + assert!(g.is_in_cooldown(Instant::now()), "must be in cooldown immediately after rate-limit"); + + std::thread::sleep(Duration::from_millis(80)); + assert!( + !g.is_in_cooldown(Instant::now()), + "cooldown must auto-expire after the retry-after duration" + ); + } + + #[test] + fn test_effective_priority_reflects_decayed_penalty() { + // With a fast TAU (100ms), penalty=1.0 should decay to ~0.37 + // after 1 TAU and ~0.05 after 3 TAUs. + let g = Gateway::new("https://x/{cid}".to_string(), 2); + // Force penalty to 1.0 directly so we have a known starting + // point (record_transient_failure also bumps last_observed_at, + // which we want to set to "just now" anyway). + g.record_transient_failure(); + g.record_transient_failure(); + g.record_transient_failure(); + g.record_transient_failure(); + let now = Instant::now(); + let tau = Duration::from_millis(100); + + // Immediately after, penalty ~1.0, so effective_priority + // ≈ base_priority + 1.0 * 3.0 = 5.0. + let pri_now = g.effective_priority(now, tau); + assert!( + pri_now > 4.5 && pri_now < 5.5, + "expected ~5.0 (base 2 + penalty*3), got {}", + pri_now + ); + + // After 1 TAU: decay factor exp(-1) ≈ 0.368. Penalty ≈ 0.368, + // effective ≈ 2.0 + 1.1 = ~3.1. + let one_tau_later = now + tau; + let pri_after = g.effective_priority(one_tau_later, tau); + assert!( + pri_after > 2.8 && pri_after < 3.3, + "expected ~3.1 (base 2 + 0.37*3), got {}", + pri_after + ); + + // After 5 TAUs: decay factor exp(-5) ≈ 0.0067. Penalty essentially + // gone, effective ≈ base_priority = 2.0. + let five_tau_later = now + tau * 5; + let pri_far_later = g.effective_priority(five_tau_later, tau); + assert!( + pri_far_later >= 2.0 && pri_far_later < 2.1, + "expected ~2.0 after 5 TAUs of decay, got {}", + pri_far_later + ); + } + + #[test] + fn test_effective_priority_does_not_mutate_state() { + // Decay-on-read is pure. 
Calling effective_priority multiple + // times must NOT advance last_observed_at — that's what makes + // decay a function of time-since-last-event, not + // time-since-last-read. + let g = Gateway::new("https://x/{cid}".to_string(), 0); + g.record_transient_failure(); + let pre_obs = g.state.lock().last_observed_at; + let pre_pen = g.state.lock().penalty; + + let _ = g.effective_priority(Instant::now() + Duration::from_secs(1), Duration::from_secs(60)); + let _ = g.effective_priority(Instant::now() + Duration::from_secs(2), Duration::from_secs(60)); + + let post_obs = g.state.lock().last_observed_at; + let post_pen = g.state.lock().penalty; + + assert_eq!(pre_obs, post_obs, "last_observed_at must not change on read"); + assert_eq!(pre_pen, post_pen, "penalty must not change on read"); + } + + // ============================================================ + // Checkpoint B: fetch_one against wiremock + // ============================================================ + + #[tokio::test] + async fn test_fetch_one_success_with_matching_bytes() { + use wiremock::matchers::{method, path_regex}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + let data = b"hello fetch_one"; + let cid = cid_blake3(data); + + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path_regex(r"/ipfs/.+")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(data.as_ref())) + .mount(&server) + .await; + + let gw = Gateway::new(format!("{}/ipfs/{{cid}}", server.uri()), 0); + let http = reqwest::Client::new(); + let body = fetch_one(&gw, &cid, &http, Duration::from_secs(5)) + .await + .expect("fetch_one ok"); + assert_eq!(body.as_ref(), data); + } + + #[tokio::test] + async fn test_fetch_one_tampered_bytes_rejected() { + // The gateway returns bytes that DON'T hash to the requested + // CID. fetch_one must reject with VerifyFailed — the security + // boundary that defends against malicious or buggy gateways. 
+ use wiremock::matchers::{method, path_regex}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + let cid_data = b"original content"; + let cid = cid_blake3(cid_data); + let tampered = b"tampered content"; + + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path_regex(r"/ipfs/.+")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tampered.as_ref())) + .mount(&server) + .await; + + let gw = Gateway::new(format!("{}/ipfs/{{cid}}", server.uri()), 0); + let http = reqwest::Client::new(); + match fetch_one(&gw, &cid, &http, Duration::from_secs(5)).await { + Err(FetchError::VerifyFailed(VerifyError::DigestMismatch { .. })) => { /* ok */ } + other => panic!("expected VerifyFailed/DigestMismatch, got {:?}", other), + } + } + + #[tokio::test] + async fn test_fetch_one_404_returns_not_found() { + use wiremock::matchers::{method, path_regex}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + let cid = cid_blake3(b"some content"); + + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path_regex(r"/ipfs/.+")) + .respond_with(ResponseTemplate::new(404)) + .mount(&server) + .await; + + let gw = Gateway::new(format!("{}/ipfs/{{cid}}", server.uri()), 0); + let http = reqwest::Client::new(); + match fetch_one(&gw, &cid, &http, Duration::from_secs(5)).await { + Err(FetchError::NotFound) => { /* ok */ } + other => panic!("expected NotFound, got {:?}", other), + } + } + + #[tokio::test] + async fn test_fetch_one_503_returns_transient() { + use wiremock::matchers::{method, path_regex}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + let cid = cid_blake3(b"some content"); + + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path_regex(r"/ipfs/.+")) + .respond_with(ResponseTemplate::new(503)) + .mount(&server) + .await; + + let gw = Gateway::new(format!("{}/ipfs/{{cid}}", server.uri()), 0); + let http = reqwest::Client::new(); + match fetch_one(&gw, &cid, &http, 
Duration::from_secs(5)).await { + Err(FetchError::Transient(_)) => { /* ok */ } + other => panic!("expected Transient, got {:?}", other), + } + } + + #[tokio::test] + async fn test_fetch_one_429_with_retry_after_returns_rate_limited() { + use wiremock::matchers::{method, path_regex}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + let cid = cid_blake3(b"some content"); + + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path_regex(r"/ipfs/.+")) + .respond_with(ResponseTemplate::new(429).insert_header("Retry-After", "42")) + .mount(&server) + .await; + + let gw = Gateway::new(format!("{}/ipfs/{{cid}}", server.uri()), 0); + let http = reqwest::Client::new(); + match fetch_one(&gw, &cid, &http, Duration::from_secs(5)).await { + Err(FetchError::RateLimited { retry_after_secs }) => { + assert_eq!(retry_after_secs, 42, "must parse Retry-After header"); + } + other => panic!("expected RateLimited, got {:?}", other), + } + } + + // ============================================================ + // Checkpoint C: race orchestration + circuit breaker + // ============================================================ + + #[test] + fn test_select_for_race_filters_cooldowned() { + // 3 gateways. Put gateway 1 in cooldown. select_for_race + // returns gateways 0 and 2 only. + let pool = GatewayPool::with_gateways( + vec![ + "https://g0/{cid}".to_string(), + "https://g1/{cid}".to_string(), + "https://g2/{cid}".to_string(), + ], + 3, + ); + pool.gateways[1].record_rate_limit(Duration::from_secs(60)); + + let alive = pool.select_for_race(Instant::now()); + assert_eq!(alive.len(), 2); + assert!(alive.iter().any(|g| g.url_template.contains("g0"))); + assert!(alive.iter().any(|g| g.url_template.contains("g2"))); + assert!(!alive.iter().any(|g| g.url_template.contains("g1"))); + } + + #[test] + fn test_select_for_race_takes_top_k_by_priority() { + // 6 gateways with K=2. 
select_for_race returns the 2 with + // lowest effective priority (= highest quality), which for + // a fresh pool is just gateways 0 and 1 (base_priority 0, 1). + let pool = GatewayPool::with_gateways( + vec![ + "https://g0/{cid}".to_string(), + "https://g1/{cid}".to_string(), + "https://g2/{cid}".to_string(), + "https://g3/{cid}".to_string(), + "https://g4/{cid}".to_string(), + "https://g5/{cid}".to_string(), + ], + 2, + ); + let racers = pool.select_for_race(Instant::now()); + assert_eq!(racers.len(), 2); + assert_eq!(racers[0].base_priority, 0); + assert_eq!(racers[1].base_priority, 1); + } + + #[test] + fn test_select_for_race_penalty_demotes_gateway() { + // Pile penalty on the top-priority gateway. After enough + // failures, its effective priority should fall below the + // next ones, and select_for_race should pick the others + // first. + let pool = GatewayPool::with_gateways( + vec![ + "https://g0/{cid}".to_string(), + "https://g1/{cid}".to_string(), + "https://g2/{cid}".to_string(), + ], + 2, + ); + // 4 transient failures on g0 → penalty caps near 1.0, + // effective priority ≈ 0 + 1.0*3 = 3.0. + // g1 base = 1, g2 base = 2. + for _ in 0..4 { + pool.gateways[0].record_transient_failure(); + } + let racers = pool.select_for_race(Instant::now()); + // The first two slots should be g1 and g2 (priorities 1 and 2), + // ahead of the penalized g0 (effective ~3.0). 
+ assert_eq!(racers.len(), 2); + assert!( + racers[0].url_template.contains("g1") || racers[0].url_template.contains("g2"), + "penalized g0 must not be top of race; got {}", + racers[0].url_template + ); + } + + #[test] + fn test_select_for_race_empty_when_all_cooled_down() { + let pool = GatewayPool::with_gateways( + vec!["https://g0/{cid}".to_string(), "https://g1/{cid}".to_string()], + 3, + ); + for g in &pool.gateways { + g.record_rate_limit(Duration::from_secs(60)); + } + let alive = pool.select_for_race(Instant::now()); + assert!(alive.is_empty()); + } + + #[tokio::test] + async fn test_fetch_verified_first_gateway_wins() { + use wiremock::matchers::{method, path_regex}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + let data = b"race-winner-content"; + let cid = cid_blake3(data); + + // Two mock gateways: one fast 200, one slow 200. + let server_fast = MockServer::start().await; + Mock::given(method("GET")) + .and(path_regex(r"/ipfs/.+")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(data.as_ref())) + .mount(&server_fast) + .await; + + let server_slow = MockServer::start().await; + Mock::given(method("GET")) + .and(path_regex(r"/ipfs/.+")) + .respond_with( + ResponseTemplate::new(200) + .set_body_bytes(data.as_ref()) + .set_delay(Duration::from_secs(2)), + ) + .mount(&server_slow) + .await; + + let pool = GatewayPool::with_gateways( + vec![ + format!("{}/ipfs/{{cid}}", server_fast.uri()), + format!("{}/ipfs/{{cid}}", server_slow.uri()), + ], + 2, + ); + let http = reqwest::Client::new(); + + let body = pool.fetch_verified(&cid, &http).await.expect("race ok"); + assert_eq!(body.as_ref(), data); + } + + #[tokio::test] + async fn test_fetch_verified_falls_through_failed_gateway_to_succeeding_one() { + use wiremock::matchers::{method, path_regex}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + let data = b"second-gateway-saves-the-day"; + let cid = cid_blake3(data); + + // First gateway always returns 503 (fast); second returns 
200 + // with matching bytes (slightly delayed) — this 150ms delay + // is for race-DETERMINISM in the test, not realism. Without + // it, both responses are instant and the FuturesUnordered + // arrival order is timing-dependent: when 200 wins the wire, + // the 503 future is cancelled by `drop(in_flight)` before its + // `record_transient_failure` can run, and the test's + // assert-on-503-penalty-bump becomes flaky. + let server_503 = MockServer::start().await; + Mock::given(method("GET")) + .and(path_regex(r"/ipfs/.+")) + .respond_with(ResponseTemplate::new(503)) + .mount(&server_503) + .await; + + let server_ok = MockServer::start().await; + Mock::given(method("GET")) + .and(path_regex(r"/ipfs/.+")) + .respond_with( + ResponseTemplate::new(200) + .set_body_bytes(data.as_ref()) + .set_delay(Duration::from_millis(150)), + ) + .mount(&server_ok) + .await; + + let pool = GatewayPool::with_gateways( + vec![ + format!("{}/ipfs/{{cid}}", server_503.uri()), + format!("{}/ipfs/{{cid}}", server_ok.uri()), + ], + 2, + ); + let http = reqwest::Client::new(); + + let body = pool.fetch_verified(&cid, &http).await.expect("fallback ok"); + assert_eq!(body.as_ref(), data); + + // The 503 gateway should have its penalty bumped. 
+ let s = pool.gateways[0].state.lock(); + assert!(s.penalty > 0.0, "503 gateway must be penalized"); + assert_eq!(s.consecutive_failures, 1); + } + + #[tokio::test] + async fn test_fetch_verified_all_failed_returns_aggregate_error() { + use wiremock::matchers::{method, path_regex}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + let cid = cid_blake3(b"unreachable"); + + let server_a = MockServer::start().await; + Mock::given(method("GET")) + .and(path_regex(r"/ipfs/.+")) + .respond_with(ResponseTemplate::new(503)) + .mount(&server_a) + .await; + let server_b = MockServer::start().await; + Mock::given(method("GET")) + .and(path_regex(r"/ipfs/.+")) + .respond_with(ResponseTemplate::new(404)) + .mount(&server_b) + .await; + + let pool = GatewayPool::with_gateways( + vec![ + format!("{}/ipfs/{{cid}}", server_a.uri()), + format!("{}/ipfs/{{cid}}", server_b.uri()), + ], + 2, + ); + let http = reqwest::Client::new(); + + match pool.fetch_verified(&cid, &http).await { + Err(GatewayPoolError::AllFailed { errors }) => { + assert_eq!(errors.len(), 2, "must aggregate per-gateway errors"); + } + other => panic!("expected AllFailed, got {:?}", other), + } + } + + #[tokio::test] + async fn test_fetch_verified_all_unavailable_when_pool_in_cooldown() { + let pool = GatewayPool::with_gateways( + vec!["http://test.invalid/ipfs/{cid}".to_string()], + 1, + ); + pool.gateways[0].record_rate_limit(Duration::from_secs(60)); + + let cid = cid_blake3(b"x"); + let http = reqwest::Client::new(); + match pool.fetch_verified(&cid, &http).await { + Err(GatewayPoolError::AllUnavailable) => { /* ok */ } + other => panic!("expected AllUnavailable, got {:?}", other), + } + } + + #[tokio::test] + async fn test_fetch_verified_tampered_response_records_verify_failure() { + // Race a gateway that returns tampered bytes alone — the race + // must fail (no verified body), AND the gateway's state must + // record a verify failure (penalty=1.0, cooldown set). 
+ use wiremock::matchers::{method, path_regex}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + let cid_data = b"original"; + let cid = cid_blake3(cid_data); + let tampered = b"NOT THE SAME"; + + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path_regex(r"/ipfs/.+")) + .respond_with(ResponseTemplate::new(200).set_body_bytes(tampered.as_ref())) + .mount(&server) + .await; + + let pool = GatewayPool::with_gateways( + vec![format!("{}/ipfs/{{cid}}", server.uri())], + 1, + ); + let http = reqwest::Client::new(); + let result = pool.fetch_verified(&cid, &http).await; + assert!(matches!(result, Err(GatewayPoolError::AllFailed { .. }))); + + // Critical security assertion: the gateway is now in cooldown, + // so a future race won't include it for ~5 min. + let s = pool.gateways[0].state.lock(); + assert_eq!(s.penalty, 1.0, "verify failure pegs penalty at 1.0"); + assert!(s.cooldown_until.is_some(), "verify failure sets cooldown"); + let cooldown_remaining = s + .cooldown_until + .unwrap() + .saturating_duration_since(Instant::now()); + assert!( + cooldown_remaining > Duration::from_secs(290) + && cooldown_remaining <= VERIFY_FAILURE_COOLDOWN + Duration::from_secs(1), + "cooldown should be ~5 min; got {:?}", + cooldown_remaining + ); + } + + #[tokio::test] + async fn test_fetch_one_429_without_retry_after_uses_default() { + // Some gateways return 429 without a Retry-After header. + // We must not treat that as a parseable 0-second retry — + // the default 60s ensures we don't loop hot. 
+ use wiremock::matchers::{method, path_regex}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + let cid = cid_blake3(b"some content"); + + let server = MockServer::start().await; + Mock::given(method("GET")) + .and(path_regex(r"/ipfs/.+")) + .respond_with(ResponseTemplate::new(429)) + .mount(&server) + .await; + + let gw = Gateway::new(format!("{}/ipfs/{{cid}}", server.uri()), 0); + let http = reqwest::Client::new(); + match fetch_one(&gw, &cid, &http, Duration::from_secs(5)).await { + Err(FetchError::RateLimited { retry_after_secs }) => { + assert_eq!(retry_after_secs, 60, "missing header → 60s default"); + } + other => panic!("expected RateLimited, got {:?}", other), + } + } +} diff --git a/crates/fula-client/src/health_gate.rs b/crates/fula-client/src/health_gate.rs new file mode 100644 index 0000000..a5e2024 --- /dev/null +++ b/crates/fula-client/src/health_gate.rs @@ -0,0 +1,240 @@ +//! Master health gate (Phase 2.1 of master-independent reads). +//! +//! Lock-free, lazy-probed state machine that tracks whether the master S3 +//! endpoint is reachable. The SDK consults the gate inside its HTTP request +//! path: when the gate is `Up`, requests proceed normally; when `Down`, +//! requests short-circuit with `Error::MasterUnreachable` for the configured +//! TTL, avoiding the per-read timeout tax that would otherwise degrade the +//! fast path under any flaky network. +//! +//! ## Design +//! +//! - **Lazy probing.** No eager init probe (which would waste a roundtrip on +//! every SDK construction when master is up — the common case). Failures +//! are observed inside normal traffic; once the gate trips, periodic +//! "probe" attempts are allowed through after the TTL expires. +//! +//! - **2-consecutive-failure threshold.** A single 5xx on a single bucket is +//! not a master-down signal — it's a request-level issue. The gate only +//! trips after **two** consecutive failures across any requests. This +//! 
prevents one transient error from sidelining the whole client. +//! +//! - **Lock-free atomic state.** `state_ms` is an `AtomicU64` representing +//! either `0` (Up) or the unix-millis when the gate flipped Down. +//! `consecutive_failures` is an `AtomicU32`. No `Mutex` / `RwLock` +//! contention even when 50 in-flight requests all fail simultaneously. +//! +//! - **Phase 2.1 ships the gate; Phase 2.4 wires it into a fallback to the +//! gateway race.** Standalone, the gate just turns "3-second-timeout per +//! read" into "fast-fail with `MasterUnreachable`" when Down. + +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +/// Threshold for flipping from `Up` to `Down`. One transient 5xx on a single +/// bucket isn't the same as "master is unreachable" — only two consecutive +/// signals trip the gate. +const CONSECUTIVE_FAILURE_THRESHOLD: u32 = 2; + +/// State machine for master reachability. +/// +/// `state_ms == 0` → currently `Up`. +/// `state_ms != 0` → flipped `Down` at this unix-millis timestamp. +/// +/// Cheap to clone via `Arc`; shared across all `FulaClient` clones so a +/// failure observed in one task immediately silences the rest. +pub struct HealthGate { + state_ms: AtomicU64, + consecutive_failures: AtomicU32, + ttl: Duration, +} + +impl HealthGate { + /// Create a new gate with the given TTL. Starts in the `Up` state. + pub fn new(ttl: Duration) -> Self { + Self { + state_ms: AtomicU64::new(0), + consecutive_failures: AtomicU32::new(0), + ttl, + } + } + + /// Decide whether a request to master should be sent or short-circuited. + /// + /// Returns: + /// - `GateDecision::Allow` — gate is `Up`, OR `Down` but `now > since + ttl` + /// (the TTL elapsed; this request is the next "probe"). + /// - `GateDecision::ShortCircuit { down_for_secs }` — gate is `Down` and + /// within the TTL; caller should fail fast with `MasterUnreachable`. 
+ pub fn decide(&self) -> GateDecision { + let down_at = self.state_ms.load(Ordering::Acquire); + if down_at == 0 { + return GateDecision::Allow; + } + let now = now_ms(); + let elapsed = now.saturating_sub(down_at); + if elapsed >= self.ttl.as_millis() as u64 { + // TTL elapsed — let this request through as a probe. Don't + // reset the gate yet; reset only on observed success. + GateDecision::Allow + } else { + GateDecision::ShortCircuit { + down_for_secs: elapsed / 1000, + } + } + } + + /// Record a successful master interaction. Resets the failure counter + /// and clears the `Down` timestamp (gate returns to `Up`). + pub fn record_success(&self) { + self.consecutive_failures.store(0, Ordering::Release); + self.state_ms.store(0, Ordering::Release); + } + + /// Record a master-side failure (connection refused / RST / 5xx / + /// request timeout). Increments the consecutive-failure counter; once + /// the threshold is reached, flips the gate to `Down(now)`. + /// + /// 4xx responses are NOT failures for gate purposes — they're + /// request-level issues, not master-down signals. + pub fn record_failure(&self) { + let prior = self.consecutive_failures.fetch_add(1, Ordering::AcqRel); + if prior + 1 >= CONSECUTIVE_FAILURE_THRESHOLD { + // Threshold crossed (or exceeded). Flip to `Down` if not already. + // Only update timestamp on the first transition this window so + // that repeated failures don't keep extending the TTL. + let _ = self.state_ms.compare_exchange( + 0, + now_ms(), + Ordering::AcqRel, + Ordering::Acquire, + ); + } + } +} + +/// Decision returned by `HealthGate::decide`. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum GateDecision { + /// Caller should send the request to master normally. + Allow, + /// Caller should fail fast with `Error::MasterUnreachable`. + ShortCircuit { down_for_secs: u64 }, +} + +/// Current unix-time in milliseconds. 
Wall-clock based (so SystemTime +/// adjustments can shift the gate's perceived "since" — acceptable here +/// since we only compare durations, and a clock jump is at worst a slight +/// TTL anomaly). +fn now_ms() -> u64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_millis() as u64) + .unwrap_or(0) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_state_is_up() { + // A freshly-constructed gate must be `Up`. Lazy probing means we + // never assume master is down without observation. + let gate = HealthGate::new(Duration::from_secs(30)); + assert_eq!(gate.decide(), GateDecision::Allow); + } + + #[test] + fn test_one_failure_does_not_trip() { + // S1 from advisor: a single failure must NOT sideline the gate. + // One-off 5xx on a bucket-level operation is not "master is down." + let gate = HealthGate::new(Duration::from_secs(30)); + gate.record_failure(); + assert_eq!( + gate.decide(), + GateDecision::Allow, + "one failure must not flip the gate" + ); + } + + #[test] + fn test_two_consecutive_failures_trip_to_down() { + // CONSECUTIVE_FAILURE_THRESHOLD = 2. Two consecutive failures + // (across any requests) flip the gate. + let gate = HealthGate::new(Duration::from_secs(30)); + gate.record_failure(); + gate.record_failure(); + match gate.decide() { + GateDecision::ShortCircuit { down_for_secs: _ } => { /* ok */ } + other => panic!("expected ShortCircuit, got {:?}", other), + } + } + + #[test] + fn test_success_resets_consecutive_counter() { + // A success between failures must reset the counter so a second + // failure (after the success) doesn't pile on with the first. + let gate = HealthGate::new(Duration::from_secs(30)); + gate.record_failure(); + gate.record_success(); + gate.record_failure(); + // Only ONE failure since the last success — must not be down. 
+ assert_eq!(gate.decide(), GateDecision::Allow); + } + + #[test] + fn test_success_clears_down_state() { + // When the gate is Down and a probe (after TTL or first attempt + // that gets through) succeeds, the gate must return to Up. + let gate = HealthGate::new(Duration::from_secs(30)); + gate.record_failure(); + gate.record_failure(); + assert!(matches!(gate.decide(), GateDecision::ShortCircuit { .. })); + gate.record_success(); + assert_eq!(gate.decide(), GateDecision::Allow); + } + + #[test] + fn test_down_state_expires_after_ttl() { + // After TTL elapses, the gate allows the next request through + // as a probe (without resetting state — only success resets). + // Use a very short TTL to keep the test fast. + let gate = HealthGate::new(Duration::from_millis(50)); + gate.record_failure(); + gate.record_failure(); + assert!(matches!(gate.decide(), GateDecision::ShortCircuit { .. })); + + std::thread::sleep(Duration::from_millis(80)); + + assert_eq!( + gate.decide(), + GateDecision::Allow, + "after TTL, next decide() must allow a probe" + ); + // State is still Down until a probe succeeds (verify by observing + // that consecutive_failures hasn't auto-reset). + let down_at = gate.state_ms.load(Ordering::Acquire); + assert!(down_at > 0, "state remains Down until success observed"); + } + + #[test] + fn test_concurrent_failures_idempotent() { + // Two threads recording failures concurrently must not produce + // unexpected state. Even with N concurrent failures, the gate is + // either Up (if total < threshold) or Down (if >= threshold). + use std::sync::Arc; + let gate = Arc::new(HealthGate::new(Duration::from_secs(30))); + + let mut handles = Vec::new(); + for _ in 0..8 { + let g = gate.clone(); + handles.push(std::thread::spawn(move || g.record_failure())); + } + for h in handles { + h.join().unwrap(); + } + // 8 failures > threshold(2), so gate must be Down. + assert!(matches!(gate.decide(), GateDecision::ShortCircuit { .. 
})); + } +} diff --git a/crates/fula-client/src/lib.rs b/crates/fula-client/src/lib.rs index cf3e047..3a233e1 100644 --- a/crates/fula-client/src/lib.rs +++ b/crates/fula-client/src/lib.rs @@ -37,10 +37,15 @@ //! } //! ``` +#[cfg(not(target_arch = "wasm32"))] +mod block_cache; mod client; mod config; mod encryption; mod error; +#[cfg(not(target_arch = "wasm32"))] +mod gateway_fetch; +mod health_gate; mod multipart; mod types; #[cfg(not(target_arch = "wasm32"))] diff --git a/crates/fula-core/src/bucket.rs b/crates/fula-core/src/bucket.rs index 17868c5..c3633b7 100644 --- a/crates/fula-core/src/bucket.rs +++ b/crates/fula-core/src/bucket.rs @@ -1002,6 +1002,44 @@ impl BucketManager { .collect() } + /// Populate `BucketMetadata.bucket_lookup_h` for a user-scoped bucket + /// **only if currently `None`**. Idempotent — never overwrites an + /// existing value. Sets the dirty flag on success; the caller is + /// responsible for triggering registry persistence (typically via the + /// existing `persist_registry_with_token` call in the put_object handler). + /// + /// This is called by master's PUT handler when the SDK includes the + /// `x-amz-meta-fula-bucket-lookup-h` control header on a Phase 2 + /// manifest root PUT (the moment of forest commit). + /// + /// Returns `Ok(true)` if the field was newly populated, `Ok(false)` if it + /// was already set. `Err(BucketNotFound)` if the bucket doesn't exist. + pub fn populate_lookup_h_if_missing( + &self, + user_id: &str, + bucket_name: &str, + lookup_h: [u8; 16], + ) -> Result { + let internal_key = Self::scoped_bucket_key(user_id, bucket_name); + + // Mutate within a sync block; DashMap shard guard never crosses an + // await. Persistence is intentionally NOT triggered here — the put + // handler already calls `persist_registry_with_token` post-flush, + // which picks up the new value via the dirty flag. 
+ match self.buckets.get_mut(&internal_key) { + Some(mut entry) => { + if entry.bucket_lookup_h.is_some() { + Ok(false) + } else { + entry.bucket_lookup_h = Some(lookup_h); + self.dirty.store(true, std::sync::atomic::Ordering::Relaxed); + Ok(true) + } + } + None => Err(CoreError::BucketNotFound(bucket_name.to_string())), + } + } + /// Find a bucket by display name that contains a specific object key /// /// Uses the secondary name index for O(1) lookup of matching buckets @@ -1517,4 +1555,327 @@ mod tests { N ); } + + // ============================================================ + // Phase 1.2 (master-independent reads) — populate_lookup_h_if_missing + // ============================================================ + + #[tokio::test] + async fn test_populate_lookup_h_if_missing_happy_path() { + // First-ever populate on a freshly-created bucket: sets the field, + // returns Ok(true), and marks the manager dirty so the next + // persist_registry call serializes the new value. + let store = Arc::new(MemoryBlockStore::new()); + let manager = BucketManager::new(store); + let user_id = "userA"; + let bucket_name = "photos"; + let owner = Owner::new(user_id); + + manager + .create_bucket_for_user(user_id, bucket_name.to_string(), owner) + .await + .expect("create_bucket_for_user"); + + // Pre-condition: bucket exists, lookup_h is None. + let pre = manager + .get_bucket_metadata_for_user(user_id, bucket_name) + .expect("metadata exists"); + assert_eq!(pre.bucket_lookup_h, None); + + // create_bucket_for_user calls persist_registry which clears dirty; + // populate should re-set it. 
+ manager + .dirty + .store(false, std::sync::atomic::Ordering::Relaxed); + + let h: [u8; 16] = [ + 0xab, 0xcd, 0xef, 0x12, 0x34, 0x56, 0x78, 0x9a, + 0xbc, 0xde, 0xf0, 0x11, 0x22, 0x33, 0x44, 0x55, + ]; + let changed = manager + .populate_lookup_h_if_missing(user_id, bucket_name, h) + .expect("populate ok"); + assert!(changed, "first call must report changed=true"); + + // Post-condition: field is now Some(h), dirty flag is set. + let post = manager + .get_bucket_metadata_for_user(user_id, bucket_name) + .expect("metadata exists"); + assert_eq!(post.bucket_lookup_h, Some(h)); + assert!( + manager.dirty.load(std::sync::atomic::Ordering::Relaxed), + "dirty flag must be set after a real write" + ); + } + + #[tokio::test] + async fn test_populate_lookup_h_if_missing_idempotent() { + // Second call with a DIFFERENT value must NOT overwrite. Returns + // Ok(false) and preserves the original. Dirty flag isn't set on + // the no-op (so we don't churn the registry on every PUT for an + // already-migrated bucket). + let store = Arc::new(MemoryBlockStore::new()); + let manager = BucketManager::new(store); + let user_id = "userB"; + let bucket_name = "documents"; + let owner = Owner::new(user_id); + + manager + .create_bucket_for_user(user_id, bucket_name.to_string(), owner) + .await + .expect("create_bucket_for_user"); + + let original_h: [u8; 16] = [1u8; 16]; + let other_h: [u8; 16] = [2u8; 16]; + + // First populate → sets the field. + let changed = manager + .populate_lookup_h_if_missing(user_id, bucket_name, original_h) + .expect("populate ok"); + assert!(changed); + + // Reset dirty so we can detect whether the no-op call sets it again. + manager + .dirty + .store(false, std::sync::atomic::Ordering::Relaxed); + + // Second populate with a different value → idempotent skip. 
+ let changed = manager + .populate_lookup_h_if_missing(user_id, bucket_name, other_h) + .expect("populate idempotent"); + assert!(!changed, "second call must report changed=false"); + + // Original value preserved; dirty flag NOT set by the no-op. + let post = manager + .get_bucket_metadata_for_user(user_id, bucket_name) + .expect("metadata exists"); + assert_eq!( + post.bucket_lookup_h, + Some(original_h), + "idempotent: original value must NOT be overwritten" + ); + assert!( + !manager.dirty.load(std::sync::atomic::Ordering::Relaxed), + "no-op call must not set dirty flag" + ); + } + + #[tokio::test] + async fn test_populate_lookup_h_bucket_not_found() { + // Calling on a bucket that doesn't exist returns BucketNotFound. + // Master's handler treats this as a non-fatal warn, but the API + // contract here is: explicit error, not silent success. + let store = Arc::new(MemoryBlockStore::new()); + let manager = BucketManager::new(store); + + let h: [u8; 16] = [0u8; 16]; + let result = manager.populate_lookup_h_if_missing("ghost-user", "ghost-bucket", h); + + assert!(matches!(result, Err(CoreError::BucketNotFound(ref n)) if n == "ghost-bucket")); + } + + #[tokio::test] + async fn test_legacy_bucket_lazy_migrates_when_new_client_sends_header() { + // SCENARIO 2 from the rollout matrix: + // - Bucket was created BEFORE Phase 1.2 ships (old data; old client + // SDK; no `bucket_lookup_h` field in the persisted CBOR — i.e. + // deserialized as None via #[serde(default)]). + // - User upgrades their fula-client SDK and writes again. + // - New SDK sends `x-amz-meta-fula-bucket-lookup-h` on the Phase 2 + // manifest root PUT. Master's handler calls populate. + // - Bucket's `bucket_lookup_h` lazy-migrates from None → Some(_) + // and persists. Subsequent reads (incl. Phase 3.2 chain + // publication) see the blinded key. 
+ // + // This test simulates that journey end-to-end through the master's + // BucketManager: persist + reload to mimic server restart between + // the old and new client uploads. + let tmp = std::env::temp_dir().join(format!( + "fula-phase12-legacy-{}.cid", + std::process::id() + )); + let store = Arc::new(MemoryBlockStore::new()); + let user_id = "userL"; // L = Legacy + let bucket_name = "fula-metadata"; + let owner = Owner::new(user_id); + + // (1) Old client created the bucket pre-Phase-1.2. + { + let manager = BucketManager::with_persistence(store.clone(), &tmp); + manager + .create_bucket_for_user(user_id, bucket_name.to_string(), owner) + .await + .expect("legacy create"); + // Old code never set bucket_lookup_h. Verify. + let pre = manager + .get_bucket_metadata_for_user(user_id, bucket_name) + .expect("metadata"); + assert_eq!( + pre.bucket_lookup_h, None, + "legacy bucket must start with no lookup_h" + ); + manager.persist_registry().await.expect("legacy persist"); + } + + // (2) Server restarts. New code loads the legacy CBOR. + let new_manager = BucketManager::with_persistence(store, &tmp); + let count = new_manager.load_registry().await.expect("reload"); + assert_eq!(count, 1); + let after_reload = new_manager + .get_bucket_metadata_for_user(user_id, bucket_name) + .expect("metadata after reload"); + assert_eq!( + after_reload.bucket_lookup_h, None, + "legacy CBOR must round-trip with lookup_h=None" + ); + + // (3) New client uploads. Master receives the header → populates. 
+ let h: [u8; 16] = [ + 0x7c, 0x68, 0xbe, 0x81, 0x43, 0xaf, 0x5b, 0xa2, + 0x12, 0xa3, 0x6f, 0x81, 0x23, 0x20, 0x37, 0xf5, + ]; + let changed = new_manager + .populate_lookup_h_if_missing(user_id, bucket_name, h) + .expect("lazy populate"); + assert!(changed, "first populate on a legacy bucket must change"); + + let after_populate = new_manager + .get_bucket_metadata_for_user(user_id, bucket_name) + .expect("metadata after populate"); + assert_eq!(after_populate.bucket_lookup_h, Some(h)); + + // (4) Persist the migration → durable. + new_manager + .persist_registry() + .await + .expect("post-migration persist"); + + // (5) The next time the same client (or any other) writes, populate + // is a no-op (idempotent — never overwrites). + let other_h: [u8; 16] = [9u8; 16]; + let changed = new_manager + .populate_lookup_h_if_missing(user_id, bucket_name, other_h) + .expect("idempotent"); + assert!(!changed); + let still = new_manager + .get_bucket_metadata_for_user(user_id, bucket_name) + .expect("metadata still present"); + assert_eq!(still.bucket_lookup_h, Some(h), "must NOT overwrite migrated value"); + + // Cleanup + let _ = std::fs::remove_file(&tmp); + let _ = std::fs::remove_file(tmp.with_extension("cid.bak")); + } + + #[tokio::test] + async fn test_old_client_without_header_leaves_bucket_intact() { + // SCENARIO 1 from the rollout matrix: + // - Existing user with old fula-client SDK, post-server-update. + // - Old SDK does NOT send `x-amz-meta-fula-bucket-lookup-h`. + // - Master's handler: header absent → populate never called. + // - Bucket continues to function normally; `bucket_lookup_h` + // stays None. + // - Phase 3.2 publisher will emit this bucket with `legacy=true` + // + plaintext bucket name; SDK cold-start falls back to plain + // bucket-name lookup. 
+ // + // This test verifies the BucketManager side: a bucket without a + // populate call still works for all read/list/persist operations, + // and stays in the legacy (None) state across persist + reload. + let tmp = std::env::temp_dir().join(format!( + "fula-phase12-oldclient-{}.cid", + std::process::id() + )); + let store = Arc::new(MemoryBlockStore::new()); + let user_id = "userO"; // O = Old client + let bucket_name = "videos"; + let owner = Owner::new(user_id); + + let manager = BucketManager::with_persistence(store.clone(), &tmp); + manager + .create_bucket_for_user(user_id, bucket_name.to_string(), owner) + .await + .expect("create"); + + // Simulate many writes from an old client — no populate call ever + // runs. The bucket continues to function; lookup_h stays None. + for _ in 0..3 { + // (in production, each iteration would be a put_object handler + // call without the header — here we just persist to mimic the + // post-flush registry update that handler normally triggers.) + manager.persist_registry().await.expect("persist"); + } + + let pre_reload = manager + .get_bucket_metadata_for_user(user_id, bucket_name) + .expect("metadata"); + assert_eq!(pre_reload.bucket_lookup_h, None); + + // Reload simulates server restart with old-client data still in flight. + let reloaded = BucketManager::with_persistence(store, &tmp); + reloaded.load_registry().await.expect("reload"); + let post_reload = reloaded + .get_bucket_metadata_for_user(user_id, bucket_name) + .expect("metadata after reload"); + assert_eq!( + post_reload.bucket_lookup_h, None, + "old-client buckets stay in legacy state until upgraded" + ); + assert_eq!(post_reload.name, bucket_name); + + // Cleanup + let _ = std::fs::remove_file(&tmp); + let _ = std::fs::remove_file(tmp.with_extension("cid.bak")); + } + + #[tokio::test] + async fn test_populate_lookup_h_persists_through_registry_roundtrip() { + // The lookup_h must survive: populate → persist_registry → reload + // from IPFS → still Some(h). 
This is the end-to-end backward-compat + // safety: a Phase-1.2-populated bucket round-trips through master + // restart correctly. + let tmp = std::env::temp_dir().join(format!( + "fula-phase12-{}.cid", + std::process::id() + )); + let store = Arc::new(MemoryBlockStore::new()); + let manager = BucketManager::with_persistence(store.clone(), &tmp); + let user_id = "userC"; + let bucket_name = "videos"; + let owner = Owner::new(user_id); + + manager + .create_bucket_for_user(user_id, bucket_name.to_string(), owner) + .await + .expect("create"); + + let h: [u8; 16] = [ + 0x9d, 0xfb, 0x19, 0x47, 0xe5, 0x31, 0x5e, 0x62, + 0xc1, 0x1f, 0x2c, 0xe4, 0x77, 0xc2, 0x80, 0x97, + ]; + manager + .populate_lookup_h_if_missing(user_id, bucket_name, h) + .expect("populate"); + + // Persist (CID file written via with_persistence). + manager.persist_registry().await.expect("persist"); + + // Reload into a fresh manager. + let reloaded = BucketManager::with_persistence(store, &tmp); + let count = reloaded.load_registry().await.expect("reload"); + assert_eq!(count, 1); + + let restored = reloaded + .get_bucket_metadata_for_user(user_id, bucket_name) + .expect("metadata after reload"); + assert_eq!( + restored.bucket_lookup_h, + Some(h), + "lookup_h must survive registry persist + reload" + ); + + // Cleanup + let _ = std::fs::remove_file(&tmp); + let _ = std::fs::remove_file(tmp.with_extension("cid.bak")); + } } diff --git a/crates/fula-core/src/metadata.rs b/crates/fula-core/src/metadata.rs index 1592733..d748d60 100644 --- a/crates/fula-core/src/metadata.rs +++ b/crates/fula-core/src/metadata.rs @@ -228,12 +228,23 @@ pub struct BucketMetadata { /// Object count (cached) pub object_count: u64, - + /// Total size in bytes (cached) pub total_size: u64, - + /// Last modified timestamp pub last_modified: DateTime, + + /// Blinded lookup key for the per-user bucketsIndex CBOR published in + /// Phase 3 chain snapshots. 
Computed client-side as + /// `BLAKE3(MetadataKey || bucket_name)` truncated to 16 bytes (matches + /// `hashed_user_id`'s 128-bit convention). `None` for buckets created + /// before this field was added; populated lazily on the next forest + /// flush via `BucketManager::populate_lookup_h_if_missing`. + /// `#[serde(default)]` makes existing `fula-bucket-registry` CBOR blocks + /// deserialize fine without migration. + #[serde(default)] + pub bucket_lookup_h: Option<[u8; 16]>, } impl BucketMetadata { @@ -253,6 +264,7 @@ impl BucketMetadata { object_count: 0, total_size: 0, last_modified: now, + bucket_lookup_h: None, } } @@ -390,4 +402,123 @@ mod tests { assert_eq!(bucket.name, "my-bucket"); assert!(!bucket.versioning_enabled); } + + // ============================================================ + // Phase 1.2 (master-independent reads) — bucket_lookup_h tests + // ============================================================ + + #[test] + fn test_bucket_lookup_h_default_is_none() { + // Newly-created BucketMetadata must have bucket_lookup_h = None. + // The field is populated lazily on the next forest flush via the SDK header. + let cid = fula_blockstore::cid_utils::create_cid( + b"root", + fula_blockstore::cid_utils::CidCodec::DagCbor, + ); + let bucket = BucketMetadata::new("b".to_string(), "owner".to_string(), cid); + assert_eq!(bucket.bucket_lookup_h, None); + } + + #[test] + fn test_bucket_lookup_h_dagcbor_roundtrip() { + // BucketMetadata with Some(...) and None must both round-trip cleanly + // through dag-cbor (the production registry format). 
+ let cid = fula_blockstore::cid_utils::create_cid( + b"root", + fula_blockstore::cid_utils::CidCodec::DagCbor, + ); + + // None case + let none_bucket = BucketMetadata::new("b1".into(), "owner".into(), cid); + let bytes = serde_ipld_dagcbor::to_vec(&none_bucket).expect("serialize None"); + let restored: BucketMetadata = + serde_ipld_dagcbor::from_slice(&bytes).expect("deserialize None"); + assert_eq!(restored.bucket_lookup_h, None); + assert_eq!(restored.name, "b1"); + + // Some case + let mut some_bucket = BucketMetadata::new("b2".into(), "owner".into(), cid); + let h: [u8; 16] = [ + 0xd2, 0xe4, 0xc4, 0x3d, 0xa6, 0x60, 0xe0, 0xb8, + 0x5e, 0x7b, 0x08, 0xb6, 0x98, 0x91, 0x26, 0xb3, + ]; + some_bucket.bucket_lookup_h = Some(h); + let bytes = serde_ipld_dagcbor::to_vec(&some_bucket).expect("serialize Some"); + let restored: BucketMetadata = + serde_ipld_dagcbor::from_slice(&bytes).expect("deserialize Some"); + assert_eq!(restored.bucket_lookup_h, Some(h)); + assert_eq!(restored.name, "b2"); + } + + #[test] + fn test_bucket_lookup_h_legacy_cbor_deserializes_to_none() { + // BACKWARD-COMPAT GOLD STANDARD (Phase 1.2 hard-constraint #1): + // existing fula-bucket-registry blocks pinned to IPFS BEFORE this + // field was added must deserialize cleanly into the new struct, + // with bucket_lookup_h = None. Production data must not break. + // + // We simulate this by defining a struct with the same shape as + // BucketMetadata but WITHOUT the new field, serializing it via + // dag-cbor, then deserializing as the new BucketMetadata. The + // #[serde(default)] on the new field is what makes this work. 
+ #[derive(Serialize, Deserialize)] + struct LegacyBucketMetadata { + name: String, + created_at: DateTime, + owner_id: String, + #[serde(with = "cid_serde")] + root_cid: Cid, + #[serde(default)] + versioning_enabled: bool, + #[serde(default)] + default_storage_class: StorageClass, + #[serde(default)] + tags: HashMap, + cors_config: Option, + #[serde(default)] + lifecycle_rules: Vec, + object_count: u64, + total_size: u64, + last_modified: DateTime, + // NOTE: deliberately no `bucket_lookup_h` field — this is the + // pre-Phase-1.2 shape. + } + + let cid = fula_blockstore::cid_utils::create_cid( + b"root", + fula_blockstore::cid_utils::CidCodec::DagCbor, + ); + let now = Utc::now(); + let legacy = LegacyBucketMetadata { + name: "videos".to_string(), + created_at: now, + owner_id: "9797dfb1947e5315e62c11f2ce477c28".to_string(), + root_cid: cid, + versioning_enabled: false, + default_storage_class: StorageClass::default(), + tags: HashMap::new(), + cors_config: None, + lifecycle_rules: Vec::new(), + object_count: 2984, + total_size: 764_932_382, + last_modified: now, + }; + + let legacy_bytes = + serde_ipld_dagcbor::to_vec(&legacy).expect("serialize legacy bucket"); + + // Deserialize the legacy bytes as the NEW BucketMetadata struct. + // This is exactly what happens at runtime when master loads a + // pre-Phase-1.2 fula-bucket-registry block from IPFS. + let modern: BucketMetadata = + serde_ipld_dagcbor::from_slice(&legacy_bytes).expect("legacy → modern"); + + assert_eq!(modern.name, "videos"); + assert_eq!(modern.owner_id, "9797dfb1947e5315e62c11f2ce477c28"); + assert_eq!(modern.object_count, 2984); + assert_eq!(modern.total_size, 764_932_382); + // The critical assertion — Phase 1.2's serde(default) preserves + // the no-migration property for existing CBOR registries. 
+ assert_eq!(modern.bucket_lookup_h, None); + } } diff --git a/crates/fula-flutter/src/api/error.rs b/crates/fula-flutter/src/api/error.rs index 6e0e2f8..d082dae 100644 --- a/crates/fula-flutter/src/api/error.rs +++ b/crates/fula-flutter/src/api/error.rs @@ -123,6 +123,14 @@ impl From for FulaError { ClientError::MigrationLockHeld { bucket, expires_at } => FulaError::InvalidResponse( format!("migration lock held for bucket {} (expires at {} ms)", bucket, expires_at), ), + // Phase 2.1 of master-independent reads: surface as a Network + // error to existing Flutter callers — the closest existing + // category, since the master is effectively unreachable. + // Phase 2.4 catches this variant earlier and falls back to the + // gateway race before reaching this conversion. + ClientError::MasterUnreachable { down_for_secs } => FulaError::Network( + format!("master unreachable (health gate; down for ~{}s)", down_for_secs), + ), } } } From 8d14a2024850a9bf37db78e8278e8a7202390e03 Mon Sep 17 00:00:00 2001 From: ehsan shariati Date: Sat, 2 May 2026 11:42:01 -0400 Subject: [PATCH 2/6] cold start --- Cargo.lock | 1 + crates/fula-cli/Cargo.toml | 3 + crates/fula-cli/src/handlers/internal.rs | 527 ++++++++++++++++++ crates/fula-cli/src/handlers/mod.rs | 1 + .../src/handlers/users_index_publisher.rs | 419 +++++++++++++- crates/fula-cli/src/routes.rs | 22 +- crates/fula-cli/src/server.rs | 16 +- crates/fula-cli/src/state.rs | 122 ++++ 8 files changed, 1106 insertions(+), 5 deletions(-) create mode 100644 crates/fula-cli/src/handlers/internal.rs diff --git a/Cargo.lock b/Cargo.lock index 2cabddd..4c0dfda 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1787,6 +1787,7 @@ dependencies = [ "url", "urlencoding", "uuid", + "wiremock", ] [[package]] diff --git a/crates/fula-cli/Cargo.toml b/crates/fula-cli/Cargo.toml index f8b11f7..7973b4d 100644 --- a/crates/fula-cli/Cargo.toml +++ b/crates/fula-cli/Cargo.toml @@ -82,3 +82,6 @@ tempfile = { workspace = true } rstest = { workspace = true } 
tokio-test = "0.4" reqwest = { workspace = true } +# Used by users_index_publisher A3 tests to mock kubo's +# /api/v0/name/publish HTTP endpoint without spinning up a real IPFS daemon. +wiremock = { workspace = true } diff --git a/crates/fula-cli/src/handlers/internal.rs b/crates/fula-cli/src/handlers/internal.rs new file mode 100644 index 0000000..aeeb3c6 --- /dev/null +++ b/crates/fula-cli/src/handlers/internal.rs @@ -0,0 +1,527 @@ +//! Phase 3.2 A3 internal endpoints. +//! +//! Two endpoints, both bearer-token-protected: +//! +//! - `GET /_internal/users-index-state` — returns the latest published +//! `(global_cid, sequence, updated_at_unix)` so the 12h chain cron in +//! `mainnet-reward-server` can fetch and submit on-chain. +//! - `POST /_internal/publish-now` — fires a publisher tick on +//! demand. Useful for deploy verification. +//! +//! ## Auth +//! +//! Bearer token from `users_index_publisher.config.internal_token`. +//! When `internal_token = None`: every request returns **503** +//! ("internal endpoints disabled"). Fail-closed: an operator who +//! forgets to set the token doesn't accidentally expose an unauthed +//! state-readout endpoint. +//! +//! When `internal_token = Some(t)`: +//! - missing/wrong bearer → **401** +//! - correct bearer → **200** +//! +//! ## Wiring +//! +//! Routes are added to a dedicated branch in `routes.rs` so they bypass +//! the user-JWT auth middleware and use a small bearer-token check +//! instead. Endpoints return 503 when the publisher itself is `None` +//! (publisher feature disabled at startup) — this is the regression +//! check for "publisher disabled = byte-identical legacy behavior". 
+
+use crate::handlers::users_index_publisher::UsersIndexPublisher;
+use crate::AppState;
+use axum::{
+    extract::State,
+    http::{HeaderMap, StatusCode},
+    response::{IntoResponse, Response},
+    Json,
+};
+use fula_blockstore::FlexibleBlockStore;
+use serde::{Deserialize, Serialize};
+use std::sync::Arc;
+
+/// `GET /_internal/users-index-state` response body. Designed for the
+/// chain cron — single deserialize, no fancy error envelopes.
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
+pub struct UsersIndexStateResponse {
+    /// Latest published global users-index CID, or `null` if no
+    /// publish has succeeded yet (fresh master).
+    pub cid: Option<String>,
+    /// Monotonic sequence embedded in the most recent global CBOR.
+    pub sequence: u64,
+    /// Wall-clock timestamp of the last successful publish.
+    pub updated_at_unix: u64,
+    /// IPNS key name (e.g., `fula-users-index`). Operators verify
+    /// against their kubo `key list` output. Logged-only — clients
+    /// resolve via the IPNS NAME (libp2p key hash), not this label.
+    pub ipns_key_name: String,
+}
+
+/// `POST /_internal/publish-now` response body.
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
+pub struct PublishNowResponse {
+    pub global_cid: String,
+    pub sequence: u64,
+    pub changed_users: usize,
+    pub total_users: usize,
+    pub global_rebuilt: bool,
+}
+
+/// Bearer-token check. Returns:
+/// - `Ok(())` when the publisher is configured AND the bearer matches.
+/// - `Err(503)` when the publisher OR `internal_token` is unset
+///   (fail-closed; documented in module doc).
+/// - `Err(401)` when the bearer is missing/wrong but auth IS configured.
+fn authenticate( + publisher: Option<&Arc>>, + headers: &HeaderMap, +) -> Result<(), Response> { + let publisher = match publisher { + Some(p) => p, + None => { + return Err(( + StatusCode::SERVICE_UNAVAILABLE, + "users-index publisher disabled", + ) + .into_response()); + } + }; + let configured = match publisher.config().internal_token.as_deref() { + Some(t) if !t.is_empty() => t, + _ => { + return Err(( + StatusCode::SERVICE_UNAVAILABLE, + "internal endpoints disabled (no internal_token configured)", + ) + .into_response()); + } + }; + let presented = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .and_then(|s| s.strip_prefix("Bearer ")) + .unwrap_or(""); + // Constant-time compare to defend against timing oracles. + if !constant_time_eq(presented.as_bytes(), configured.as_bytes()) { + return Err((StatusCode::UNAUTHORIZED, "invalid or missing bearer token").into_response()); + } + Ok(()) +} + +fn constant_time_eq(a: &[u8], b: &[u8]) -> bool { + if a.len() != b.len() { + return false; + } + let mut diff: u8 = 0; + for (x, y) in a.iter().zip(b.iter()) { + diff |= x ^ y; + } + diff == 0 +} + +/// `GET /_internal/users-index-state` +pub async fn users_index_state( + State(state): State>, + headers: HeaderMap, +) -> Response { + if let Err(resp) = authenticate(state.users_index_publisher.as_ref(), &headers) { + return resp; + } + let publisher = state + .users_index_publisher + .as_ref() + .expect("authenticate already proved Some"); + let latest = publisher.latest(); + let body = UsersIndexStateResponse { + cid: latest.global_cid.map(|c| c.to_string()), + sequence: latest.sequence, + updated_at_unix: latest.updated_at_unix, + ipns_key_name: publisher.config().ipns_key_name.clone(), + }; + (StatusCode::OK, Json(body)).into_response() +} + +/// `POST /_internal/publish-now` +pub async fn publish_now( + State(state): State>, + headers: HeaderMap, +) -> Response { + if let Err(resp) = authenticate(state.users_index_publisher.as_ref(), &headers) { + 
return resp;
+    }
+    let publisher = state
+        .users_index_publisher
+        .as_ref()
+        .expect("authenticate already proved Some");
+    match publisher.run_tick().await {
+        Ok(outcome) => {
+            let body = PublishNowResponse {
+                global_cid: outcome.global_cid.to_string(),
+                sequence: outcome.sequence,
+                changed_users: outcome.changed_users,
+                total_users: outcome.total_users,
+                global_rebuilt: outcome.global_rebuilt,
+            };
+            (StatusCode::OK, Json(body)).into_response()
+        }
+        Err(e) => {
+            tracing::error!(error = %e, "users-index publish-now failed");
+            (
+                StatusCode::INTERNAL_SERVER_ERROR,
+                format!("publish failed: {}", e),
+            )
+                .into_response()
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::handlers::users_index_publisher::PublisherConfig;
+    use axum::body::to_bytes;
+    use axum::http::{Method, Request};
+    use axum::Router;
+    use fula_blockstore::MemoryBlockStore;
+    use fula_core::BucketManager;
+    use std::path::PathBuf;
+    use std::time::Duration;
+    use tempfile::TempDir;
+    use tower::ServiceExt;
+
+    /// Build a router exposing only the two internal endpoints — no
+    /// JWT auth middleware in the way. Mirrors what `routes.rs` will
+    /// wire, minus everything irrelevant to these endpoints.
+    fn build_internal_router(state: Arc<AppState>) -> Router {
+        Router::new()
+            .route(
+                "/_internal/users-index-state",
+                axum::routing::get(users_index_state),
+            )
+            .route(
+                "/_internal/publish-now",
+                axum::routing::post(publish_now),
+            )
+            .with_state(state)
+    }
+
+    /// Build an `AppState` with a publisher backed by `MemoryBlockStore`,
+    /// optionally wrapping it in an `Arc` to satisfy the FlexibleBlockStore
+    /// type that AppState expects.
+    async fn fixture_state(internal_token: Option<String>, with_publisher: bool) -> Arc<AppState> {
+        // `keep()` returns the PathBuf and disables the TempDir's
+        // auto-delete-on-drop. Files persist for the test process; the
+        // OS cleans them up on next reboot if anything is left.
+ let dir = TempDir::new().unwrap().keep(); + let state_path: PathBuf = dir.join("state.txt"); + + let inner = FlexibleBlockStore::Memory(MemoryBlockStore::new()); + let block_store = Arc::new(inner); + let bucket_manager = Arc::new(BucketManager::new(Arc::clone(&block_store))); + + let users_index_publisher = if with_publisher { + let config = PublisherConfig { + flush_interval: Duration::from_secs(300), + first_publish_max_pins_per_sec: 100, + ipns_lifetime: Duration::from_secs(36 * 3600), + ipns_ttl: Duration::from_secs(15 * 60), + ipns_key_name: "fula-users-index".to_string(), + state_file_path: state_path, + ipfs_api_url: "http://localhost:5001".to_string(), + internal_token, + }; + // No IPNS publisher — the internal endpoints don't depend on it. + let p = UsersIndexPublisher::open_without_ipns( + config, + Arc::clone(&bucket_manager), + Arc::clone(&block_store), + ) + .expect("open"); + Some(Arc::new(p)) + } else { + None + }; + + let config = crate::config::GatewayConfig::default(); + Arc::new(AppState { + config, + block_store, + bucket_manager, + multipart_manager: Arc::new(crate::multipart::MultipartManager::new(60)), + lock_store: crate::handlers::locks::LockStore::new(), + users_index_publisher, + }) + } + + #[tokio::test] + async fn test_state_endpoint_503_when_publisher_disabled() { + // Publisher = None. Operators who deploy without flipping the + // env flag MUST get a 503, not a 500 or unauthed leak. + let state = fixture_state(None, false).await; + let app = build_internal_router(state); + let resp = app + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/_internal/users-index-state") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::SERVICE_UNAVAILABLE); + } + + #[tokio::test] + async fn test_state_endpoint_503_when_no_token() { + // Publisher is on but `internal_token = None`. Fail-closed. 
+ let state = fixture_state(None, true).await; + let app = build_internal_router(state); + let resp = app + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/_internal/users-index-state") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::SERVICE_UNAVAILABLE); + } + + #[tokio::test] + async fn test_state_endpoint_401_on_wrong_token() { + let state = fixture_state(Some("supersecret".to_string()), true).await; + let app = build_internal_router(state); + let resp = app + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/_internal/users-index-state") + .header("authorization", "Bearer wrongtoken") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::UNAUTHORIZED); + } + + #[tokio::test] + async fn test_state_endpoint_401_on_missing_bearer_prefix() { + let state = fixture_state(Some("supersecret".to_string()), true).await; + let app = build_internal_router(state); + let resp = app + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/_internal/users-index-state") + // No "Bearer " prefix. + .header("authorization", "supersecret") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::UNAUTHORIZED); + } + + #[tokio::test] + async fn test_state_endpoint_200_with_correct_token_returns_default_state() { + // Fresh publisher, never ticked → cid is null, sequence is 0. + // Verifies the JSON shape AND the "fresh" semantics. 
+ let state = fixture_state(Some("supersecret".to_string()), true).await; + let app = build_internal_router(state); + let resp = app + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/_internal/users-index-state") + .header("authorization", "Bearer supersecret") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let body: UsersIndexStateResponse = serde_json::from_slice(&bytes).unwrap(); + assert_eq!(body.cid, None); + assert_eq!(body.sequence, 0); + assert_eq!(body.updated_at_unix, 0); + assert_eq!(body.ipns_key_name, "fula-users-index"); + } + + #[tokio::test] + async fn test_publish_now_runs_tick_and_returns_outcome() { + // After publish-now succeeds, a follow-up GET reads the + // newly-committed state. Round-trip verification. + let token = "supersecret".to_string(); + let state = fixture_state(Some(token.clone()), true).await; + let app = build_internal_router(Arc::clone(&state)); + + // Trigger publish-now. + let resp = app + .clone() + .oneshot( + Request::builder() + .method(Method::POST) + .uri("/_internal/publish-now") + .header("authorization", format!("Bearer {}", token)) + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let body: PublishNowResponse = serde_json::from_slice(&bytes).unwrap(); + assert_eq!(body.sequence, 1); + assert!(body.global_rebuilt); + + // GET the state — must reflect the just-published values. 
+ let resp = app + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/_internal/users-index-state") + .header("authorization", format!("Bearer {}", token)) + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::OK); + let bytes = to_bytes(resp.into_body(), usize::MAX).await.unwrap(); + let state_body: UsersIndexStateResponse = serde_json::from_slice(&bytes).unwrap(); + assert_eq!(state_body.cid, Some(body.global_cid)); + assert_eq!(state_body.sequence, 1); + } + + #[tokio::test] + async fn test_publish_now_503_when_publisher_disabled() { + // Same fail-closed contract as the GET endpoint. + let state = fixture_state(None, false).await; + let app = build_internal_router(state); + let resp = app + .oneshot( + Request::builder() + .method(Method::POST) + .uri("/_internal/publish-now") + .header("authorization", "Bearer anything") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(resp.status(), StatusCode::SERVICE_UNAVAILABLE); + } + + #[test] + fn test_constant_time_eq_correct() { + assert!(constant_time_eq(b"hello", b"hello")); + assert!(!constant_time_eq(b"hello", b"hellP")); + assert!(!constant_time_eq(b"hello", b"hell")); + assert!(!constant_time_eq(b"", b"x")); + assert!(constant_time_eq(b"", b"")); + } + + /// Drive the **real** router from `routes::create_router` to verify + /// `/_internal/*` actually bypasses the user-JWT `auth_middleware`. + /// If the router merge accidentally inherited the parent's auth + /// layer, this test fails (auth_middleware would respond with a + /// 403 "Authentication required" S3 error before reaching our + /// handler). The 503/SERVICE_UNAVAILABLE we expect comes from + /// `authenticate()` in this module — proof the request reached us. + #[tokio::test] + async fn test_internal_route_bypasses_user_jwt_auth() { + // auth_enabled=true: this is what production uses. 
A request + // to a normal S3 route without a JWT would 403. The internal + // route must reach our handler instead. + let dir = TempDir::new().unwrap().keep(); + let state_path: PathBuf = dir.join("state.txt"); + let inner = FlexibleBlockStore::Memory(MemoryBlockStore::new()); + let block_store = Arc::new(inner); + let bucket_manager = Arc::new(BucketManager::new(Arc::clone(&block_store))); + + let mut config = crate::config::GatewayConfig::default(); + config.auth_enabled = true; + config.jwt_secret = Some("test-secret".to_string()); + + let state = Arc::new(AppState { + config, + block_store, + bucket_manager, + multipart_manager: Arc::new(crate::multipart::MultipartManager::new(60)), + lock_store: crate::handlers::locks::LockStore::new(), + // Publisher disabled — we expect 503, not 401 (no token) + // and not 403 (S3 auth would trigger if middleware leaked). + users_index_publisher: None, + }); + + let _ = state_path; // silence unused; only here to mirror prod path layout + + let app = crate::routes::create_router(Arc::clone(&state)); + let resp = app + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/_internal/users-index-state") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!( + resp.status(), + StatusCode::SERVICE_UNAVAILABLE, + "internal route MUST bypass user-JWT auth — got status {}", + resp.status() + ); + } + + /// Backward-compat regression: when the publisher is disabled + /// (default for un-flagged deploys), the existing S3 routes must + /// still respond. Specifically, the `auth_enabled=false` dev-mode + /// path serves `/healthz` without any token. If publisher wiring + /// somehow broke healthz, an old fula-client deployed against + /// the new master would lose its container health check. 
+ #[tokio::test] + async fn test_publisher_disabled_does_not_break_existing_routes() { + let inner = FlexibleBlockStore::Memory(MemoryBlockStore::new()); + let block_store = Arc::new(inner); + let bucket_manager = Arc::new(BucketManager::new(Arc::clone(&block_store))); + + let mut config = crate::config::GatewayConfig::default(); + config.auth_enabled = false; // dev mode, no JWT required + config.jwt_secret = Some("test-secret".to_string()); + + let state = Arc::new(AppState { + config, + block_store, + bucket_manager, + multipart_manager: Arc::new(crate::multipart::MultipartManager::new(60)), + lock_store: crate::handlers::locks::LockStore::new(), + users_index_publisher: None, + }); + + let app = crate::routes::create_router(Arc::clone(&state)); + let resp = app + .oneshot( + Request::builder() + .method(Method::GET) + .uri("/healthz") + .body(axum::body::Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!( + resp.status(), + StatusCode::OK, + "existing /healthz must still respond when publisher is disabled" + ); + } +} diff --git a/crates/fula-cli/src/handlers/mod.rs b/crates/fula-cli/src/handlers/mod.rs index 91ea72d..216822e 100644 --- a/crates/fula-cli/src/handlers/mod.rs +++ b/crates/fula-cli/src/handlers/mod.rs @@ -3,6 +3,7 @@ pub mod admin; pub mod batch; pub mod bucket; +pub mod internal; pub mod locks; pub mod multipart; pub mod object; diff --git a/crates/fula-cli/src/handlers/users_index_publisher.rs b/crates/fula-cli/src/handlers/users_index_publisher.rs index 448cd56..1fa72d6 100644 --- a/crates/fula-cli/src/handlers/users_index_publisher.rs +++ b/crates/fula-cli/src/handlers/users_index_publisher.rs @@ -39,6 +39,7 @@ use std::collections::{BTreeMap, HashMap}; use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use tracing::{info, warn}; /// State that persists across master restarts. Single source of truth /// for "what did we last successfully publish?". 
Written **after** a @@ -437,6 +438,98 @@ impl From<&PersistedState> for LatestPublished { } } +// ============================================================ +// IPNS publisher (kubo HTTP API client) +// ============================================================ + +/// Kubo `/api/v0/name/publish` response body. We only care about +/// `Name` (= the IPNS NAME, libp2p key hash) for logging — clients +/// resolve via the configured IPNS NAME, not via this response. +#[derive(Clone, Debug, Deserialize, PartialEq, Eq)] +pub struct IpnsPublishResponse { + #[serde(rename = "Name")] + pub name: String, + #[serde(rename = "Value")] + pub value: String, +} + +/// Thin client over kubo's `/api/v0/name/publish`. Plain HTTP POST, +/// no auth (kubo's API is localhost-only by default). Failures are +/// surfaced via `Result` and the caller decides what to do — for +/// the publisher tick, an IPNS failure logs at `warn!` and lets the +/// commit proceed (chain backup at 12h still works). +#[derive(Clone)] +pub struct IpnsPublisher { + client: reqwest::Client, + api_url: String, +} + +impl IpnsPublisher { + /// Construct a publisher targeting `api_url` (e.g., + /// `http://localhost:5001`). The client uses kubo's default + /// timeout; the caller is responsible for outer timeouts if + /// needed (advisor noted: don't add inner backoff/timeout). + pub fn new(api_url: String) -> Self { + Self { + client: reqwest::Client::new(), + api_url, + } + } + + /// Construct from an existing `reqwest::Client` (test hook — + /// lets wiremock-based tests inject a client with custom timeouts + /// if needed; production uses [`new`]). + #[doc(hidden)] + pub fn with_client(client: reqwest::Client, api_url: String) -> Self { + Self { client, api_url } + } + + /// Publish `cid` under IPNS `key_name` with the given lifetime + /// + DHT-cache TTL. + /// + /// Kubo's API: `POST /api/v0/name/publish?arg=&key=&lifetime=&ttl=`. + /// Lifetime/ttl are Go duration strings (`36h`, `15m`, …). 
+ /// Returns the `(Name, Value)` from the response — `Name` is the + /// IPNS NAME (libp2p public-key hash). `Value` is the path the + /// IPNS record now resolves to (the input CID, prefixed with + /// `/ipfs/`). + pub async fn publish( + &self, + cid: &Cid, + key_name: &str, + lifetime: Duration, + ttl: Duration, + ) -> AnyResult { + let url = format!( + "{}/api/v0/name/publish?arg={}&key={}&lifetime={}&ttl={}", + self.api_url.trim_end_matches('/'), + cid, + urlencoding::encode(key_name), + format_go_duration(lifetime), + format_go_duration(ttl), + ); + let resp = self.client.post(&url).send().await?; + let status = resp.status(); + if !status.is_success() { + let body = resp.text().await.unwrap_or_default(); + anyhow::bail!( + "kubo /api/v0/name/publish failed: status={}, body={}", + status, + body + ); + } + let body: IpnsPublishResponse = resp.json().await?; + Ok(body) + } +} + +/// Format a `Duration` as a Go-style duration string accepted by +/// kubo (`s` is universally accepted; we don't need +/// pretty-formatting). E.g., `36h` → `129600s`. Kubo accepts both. +fn format_go_duration(d: Duration) -> String { + format!("{}s", d.as_secs()) +} + // ============================================================ // Publisher skeleton // ============================================================ @@ -447,6 +540,10 @@ pub struct UsersIndexPublisher { config: PublisherConfig, bucket_manager: Arc>, block_store: Arc, + /// Optional IPNS publisher. `None` disables IPNS — useful for + /// tests that exercise just the pin/persist path, and for + /// operators who want the chain-backup path only. + ipns_publisher: Option, /// Per-user diff cache — owner_id → (content_hash, bucketsIndexCid). /// `Mutex` (not `RwLock`) because the tick is the only writer and /// the lock window is tiny (a HashMap insert). @@ -488,10 +585,38 @@ impl UsersIndexPublisher { /// Construct from config + handles to the bucket manager and /// block store. 
Loads existing state-file on-disk; fresh master /// starts with `PersistedState::default()`. + /// + /// IPNS is enabled by default (constructed from `config.ipfs_api_url`). + /// Tests may disable it via [`open_without_ipns`] to exercise the + /// pin/persist path independently. pub fn open( config: PublisherConfig, bucket_manager: Arc>, block_store: Arc, + ) -> Result { + let ipns_publisher = Some(IpnsPublisher::new(config.ipfs_api_url.clone())); + Self::open_with_ipns(config, bucket_manager, block_store, ipns_publisher) + } + + /// Construct without IPNS. Tick still pins + persists; the chain + /// path (12h cron in `mainnet-reward-server`) still works. Useful + /// for operators who don't want the kubo IPNS hop, and for the + /// pin/persist-only unit tests. + pub fn open_without_ipns( + config: PublisherConfig, + bucket_manager: Arc>, + block_store: Arc, + ) -> Result { + Self::open_with_ipns(config, bucket_manager, block_store, None) + } + + /// Internal constructor — also used by tests to inject a + /// wiremock-backed IPNS client. + pub fn open_with_ipns( + config: PublisherConfig, + bucket_manager: Arc>, + block_store: Arc, + ipns_publisher: Option, ) -> Result { let persisted = PersistedState::load(&config.state_file_path)?; let latest = LatestPublished::from(&persisted); @@ -499,6 +624,7 @@ impl UsersIndexPublisher { config, bucket_manager, block_store, + ipns_publisher, diff_cache: Mutex::new(HashMap::new()), latest: RwLock::new(latest), tick_lock: tokio::sync::Mutex::new(()), @@ -511,6 +637,13 @@ impl UsersIndexPublisher { self.latest.read().clone() } + /// Read-only access to the publisher config. Used by the internal + /// HTTP endpoints to surface `internal_token` (auth check) and + /// `ipns_key_name` (response field). + pub fn config(&self) -> &PublisherConfig { + &self.config + } + /// Read the on-disk persisted state directly (bypasses the /// in-memory `latest` cache). 
Used by tests and by the startup /// chain-cross-check (see plan 3.2.b advisor note). @@ -705,9 +838,50 @@ impl UsersIndexPublisher { } } - // 8. Persist new state. (A3 will insert IPNS publish between - // pin and persist; commit_state stays last so a crash mid- - // IPNS leaves us in a recoverable place.) + // 8. IPNS publish (best-effort). Order is documented as + // "pin → IPNS → persist" (plan 3.2.b + advisor): an IPNS + // publish failure does NOT abort the commit because the + // chain-backup cron at 12h still works. If the publish + // succeeds but persist fails, the next tick republishes + // the same CID under sequence+1 — IPNS is idempotent on + // `(cid, sequence)`. If we flipped the order to + // persist-then-IPNS, a crash mid-IPNS would leave an + // advanced on-disk sequence pointing at a CID never + // published. Don't flip. + if let Some(ipns) = &self.ipns_publisher { + match ipns + .publish( + &global_cid, + &self.config.ipns_key_name, + self.config.ipns_lifetime, + self.config.ipns_ttl, + ) + .await + { + Ok(resp) => { + info!( + cid = %global_cid, + sequence = next_sequence, + ipns_name = %resp.name, + ipns_value = %resp.value, + "users-index publisher: IPNS publish succeeded" + ); + } + Err(e) => { + warn!( + cid = %global_cid, + sequence = next_sequence, + error = %e, + "users-index publisher: IPNS publish failed (best-effort; chain backup at 12h still works; next tick will retry)" + ); + } + } + } + + // 9. Persist new state. commit_state is last so a crash mid- + // IPNS leaves us in a recoverable place — the next tick + // will retry IPNS with the same content (and on-chain + // sequence enforcement keeps things monotonic regardless). let next_state = PersistedState { global_cid: Some(global_cid), sequence: next_sequence, @@ -723,6 +897,68 @@ impl UsersIndexPublisher { global_rebuilt: true, }) } + + /// Test-only accessor: read the IPNS publisher's API URL. 
+ #[cfg(test)] + fn ipns_api_url_for_test(&self) -> Option { + self.ipns_publisher.as_ref().map(|p| p.api_url.clone()) + } +} + +/// Spawn a background task that calls `publisher.run_tick()` on +/// `flush_interval`. Mirrors `handlers::locks::start_sweeper`: +/// holds an `Arc` to the publisher, lives for the process lifetime. +/// +/// `MissedTickBehavior::Delay` ensures that if a single tick takes +/// unusually long (e.g., master kubo blocked), the next tick fires +/// after a fresh `flush_interval` rather than firing back-to-back to +/// "catch up" — bursts can swamp the pinning service. The first tick +/// is gated by an immediate `interval.tick().await` at the top of +/// the loop, which fires after one interval has elapsed; if you want +/// the first tick at startup, log + call run_tick once before the +/// loop. We do NOT do that here: the operator's sequence-of-events +/// at master startup is `BucketManager.load_registry → spawn this +/// task → first tick fires after flush_interval` so the registry +/// has time to load and persist before the publisher reads from it. +pub fn start_publisher_loop( + publisher: Arc>, +) { + let interval_dur = publisher.config.flush_interval; + tokio::spawn(async move { + let mut interval = tokio::time::interval(interval_dur); + interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay); + // Skip the first tick (which fires immediately) — see fn doc. 
+ interval.tick().await; + loop { + interval.tick().await; + match publisher.run_tick().await { + Ok(outcome) => { + if outcome.global_rebuilt { + info!( + sequence = outcome.sequence, + changed_users = outcome.changed_users, + total_users = outcome.total_users, + cid = %outcome.global_cid, + "users-index publisher: tick committed new global" + ); + } else { + tracing::debug!( + sequence = outcome.sequence, + total_users = outcome.total_users, + "users-index publisher: tick was no-op" + ); + } + } + Err(e) => { + warn!(error = %e, "users-index publisher: tick failed; will retry on next interval"); + } + } + } + }); + info!( + interval_secs = interval_dur.as_secs(), + "users-index publisher loop started" + ); } #[cfg(test)] @@ -1447,4 +1683,181 @@ mod tests { assert!(outcome.global_rebuilt, "first publish must run even on empty"); assert_eq!(outcome.sequence, 1); } + + // ============================================================ + // Phase 3.2 A3 — IPNS publisher tests (wiremock) + // ============================================================ + + use wiremock::matchers::{method, path}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + /// Construct a publisher that targets `mock_url` for IPNS calls + /// (instead of a real kubo). The mock has full control over + /// success/failure responses. + fn fixture_publisher_with_ipns( + state_path: PathBuf, + ipns_api_url: String, + ) -> ( + UsersIndexPublisher, + Arc, + Arc>, + ) { + let store = Arc::new(MemoryBlockStore::new()); + let manager = Arc::new(BucketManager::new(Arc::clone(&store))); + let mut config = fixture_config(state_path); + // Speed up: short lifetime/ttl in tests (kubo accepts them + // but our format function is tested below). 
+ config.ipns_lifetime = Duration::from_secs(60); + config.ipns_ttl = Duration::from_secs(15); + let ipns = IpnsPublisher::new(ipns_api_url); + let publisher = UsersIndexPublisher::open_with_ipns( + config, + Arc::clone(&manager), + Arc::clone(&store), + Some(ipns), + ) + .expect("open"); + (publisher, store, manager) + } + + #[test] + fn test_format_go_duration() { + assert_eq!(format_go_duration(Duration::from_secs(36 * 3600)), "129600s"); + assert_eq!(format_go_duration(Duration::from_secs(15 * 60)), "900s"); + assert_eq!(format_go_duration(Duration::from_secs(0)), "0s"); + } + + #[tokio::test] + async fn test_ipns_publisher_success() { + let mock = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/api/v0/name/publish")) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "Name": "k51qzi5uqu5dh-mock", + "Value": "/ipfs/QmFakeCidValue", + }))) + .mount(&mock) + .await; + + let publisher = IpnsPublisher::new(mock.uri()); + let cid = fixture_cid(0xab); + let resp = publisher + .publish( + &cid, + "fula-users-index", + Duration::from_secs(36 * 3600), + Duration::from_secs(15 * 60), + ) + .await + .expect("publish"); + assert_eq!(resp.name, "k51qzi5uqu5dh-mock"); + assert_eq!(resp.value, "/ipfs/QmFakeCidValue"); + } + + #[tokio::test] + async fn test_ipns_publisher_propagates_5xx_error() { + let mock = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/api/v0/name/publish")) + .respond_with(ResponseTemplate::new(500).set_body_string("internal error")) + .mount(&mock) + .await; + + let publisher = IpnsPublisher::new(mock.uri()); + let cid = fixture_cid(0xab); + let result = publisher + .publish( + &cid, + "fula-users-index", + Duration::from_secs(60), + Duration::from_secs(15), + ) + .await; + assert!(result.is_err(), "5xx must surface as error"); + let err = format!("{}", result.unwrap_err()); + assert!(err.contains("status=500"), "error message exposes status"); + } + + #[tokio::test] + async fn 
test_run_tick_calls_ipns_with_correct_cid_and_sequence() { + // Verifies the integration point: run_tick fires kubo's + // /api/v0/name/publish with the freshly-built global CID. + let mock = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/api/v0/name/publish")) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "Name": "k51qzi5uqu5dh-mock", + "Value": "/ipfs/QmIgnored", + }))) + .expect(1) // exactly one IPNS publish per tick + .mount(&mock) + .await; + + let dir = TempDir::new().unwrap(); + let path = dir.path().join("state.txt"); + let (publisher, _store, manager) = + fixture_publisher_with_ipns(path, mock.uri()); + + create_user_bucket(&manager, "alice", "photos").await; + let outcome = publisher.run_tick().await.expect("tick"); + assert_eq!(outcome.sequence, 1); + // wiremock's expect(1) verifies on drop that exactly one + // request hit the IPNS endpoint. + } + + #[tokio::test] + async fn test_run_tick_succeeds_when_ipns_5xx() { + // Operating-state matrix: kubo IPNS endpoint returns 500. + // The tick MUST still return Ok, the persisted state MUST + // still advance, and the global CID MUST still be pinned. + // Otherwise a flaky kubo blocks the entire publisher, + // which blocks subsequent writes on master. + let mock = MockServer::start().await; + Mock::given(method("POST")) + .and(path("/api/v0/name/publish")) + .respond_with(ResponseTemplate::new(500).set_body_string("kubo down")) + .mount(&mock) + .await; + + let dir = TempDir::new().unwrap(); + let path = dir.path().join("state.txt"); + let (publisher, store, manager) = + fixture_publisher_with_ipns(path.clone(), mock.uri()); + + create_user_bucket(&manager, "alice", "photos").await; + let outcome = publisher.run_tick().await.expect("tick still Ok on IPNS 5xx"); + assert_eq!(outcome.sequence, 1); + assert!(outcome.global_rebuilt); + + // Pin happened → block exists in store. 
+ assert!(store.is_pinned(&outcome.global_cid).await.unwrap()); + + // Persist happened → state file reflects new sequence. + let persisted = PersistedState::load(&path).expect("load"); + assert_eq!(persisted.sequence, 1); + assert_eq!(persisted.global_cid, Some(outcome.global_cid)); + } + + #[tokio::test] + async fn test_run_tick_no_ipns_configured_still_pins_and_persists() { + // open_without_ipns: tick still pins + persists; chain backup + // path is the only publish channel. Useful regression check + // for operators who deploy without IPNS. + let dir = TempDir::new().unwrap(); + let path = dir.path().join("state.txt"); + let store = Arc::new(MemoryBlockStore::new()); + let manager = Arc::new(BucketManager::new(Arc::clone(&store))); + let publisher = UsersIndexPublisher::open_without_ipns( + fixture_config(path.clone()), + Arc::clone(&manager), + Arc::clone(&store), + ) + .expect("open"); + + create_user_bucket(&manager, "alice", "photos").await; + let outcome = publisher.run_tick().await.expect("tick"); + assert_eq!(outcome.sequence, 1); + assert!(outcome.global_rebuilt); + assert!(publisher.ipns_api_url_for_test().is_none()); + } } diff --git a/crates/fula-cli/src/routes.rs b/crates/fula-cli/src/routes.rs index e9f0e24..6d9e703 100644 --- a/crates/fula-cli/src/routes.rs +++ b/crates/fula-cli/src/routes.rs @@ -25,6 +25,25 @@ pub fn create_router(state: Arc) -> Router { // Public routes that must bypass auth (e.g., container health checks) let public = Router::new().route("/healthz", get(handlers::healthz)); + // Phase 3.2 internal endpoints. Bearer-token-protected at the + // handler level (see handlers::internal::authenticate). They + // bypass the user-JWT auth middleware so the chain cron in + // mainnet-reward-server can consume them with a shared secret, + // not a JWT. When the publisher is disabled OR the token is + // unset, both endpoints fail-closed with 503. 
+ let internal = Router::new() + .route( + "/_internal/users-index-state", + get(handlers::internal::users_index_state), + ) + .route( + "/_internal/publish-now", + post(handlers::internal::publish_now), + ) + .layer(axum_middleware::from_fn(middleware::request_id_middleware)) + .layer(axum_middleware::from_fn(middleware::logging_middleware)) + .with_state(state.clone()); + // Admin routes (protected by admin middleware) let admin = Router::new() .route("/admin/users/{user_id}/buckets", get(handlers::list_user_buckets)) @@ -82,10 +101,11 @@ pub fn create_router(state: Arc) -> Router { )) .with_state(state.clone()); - // Combine public, admin, and private, then apply shared layers + // Combine public, admin, internal, and private, then apply shared layers Router::new() .merge(public) .merge(admin) + .merge(internal) .merge(private) .layer(cors) .layer(TraceLayer::new_for_http()) diff --git a/crates/fula-cli/src/server.rs b/crates/fula-cli/src/server.rs index e9c05de..bd33944 100644 --- a/crates/fula-cli/src/server.rs +++ b/crates/fula-cli/src/server.rs @@ -1,6 +1,6 @@ //! Server startup and lifecycle -use crate::handlers::locks; +use crate::handlers::{locks, users_index_publisher}; use crate::{AppState, GatewayConfig, routes}; use std::net::SocketAddr; use std::sync::Arc; @@ -16,6 +16,13 @@ pub async fn run_server(config: GatewayConfig) -> anyhow::Result<()> { // lives for the lifetime of the process. locks::start_sweeper(state.lock_store.clone()); + // Phase 3.2 — spawn the users-index publisher loop iff the env + // flag enabled the publisher at AppState construction time. When + // disabled, this is a no-op and nothing about S3 routing changes. 
+ if let Some(publisher) = state.users_index_publisher.clone() { + users_index_publisher::start_publisher_loop(publisher); + } + // Create router let app = routes::create_router(state); @@ -41,6 +48,13 @@ pub async fn run_server_with_shutdown( locks::start_sweeper(state.lock_store.clone()); + // Phase 3.2 — spawn the users-index publisher loop iff the env + // flag enabled the publisher at AppState construction time. When + // disabled, this is a no-op and nothing about S3 routing changes. + if let Some(publisher) = state.users_index_publisher.clone() { + users_index_publisher::start_publisher_loop(publisher); + } + let app = routes::create_router(state); let addr = config.bind_addr(); diff --git a/crates/fula-cli/src/state.rs b/crates/fula-cli/src/state.rs index 243c797..817a3d5 100644 --- a/crates/fula-cli/src/state.rs +++ b/crates/fula-cli/src/state.rs @@ -34,6 +34,14 @@ pub struct AppState { /// In-memory advisory lock store used to serialize v1 -> v7 forest /// migrations across devices. TTL-bounded; process-local only. pub lock_store: crate::handlers::locks::LockStore, + /// Phase 3.2 master-side users-index publisher. `None` when the + /// `FULA_USERS_INDEX_PUBLISHER_ENABLED` env flag is unset (default). + /// When `None`, the `/_internal/users-index-state` endpoint + /// returns 503; existing S3 handlers behave byte-identically to + /// pre-Phase-3 deploys. + pub users_index_publisher: Option< + Arc>, + >, } impl AppState { @@ -118,12 +126,23 @@ impl AppState { // after AppState is wrapped in an Arc. let lock_store = crate::handlers::locks::LockStore::new(); + // Phase 3.2 users-index publisher — env-flag-gated so day-one + // deploys behave byte-identically to pre-Phase-3 builds. + // Operators flip `FULA_USERS_INDEX_PUBLISHER_ENABLED=1` after + // canary verification. 
+ let users_index_publisher = build_users_index_publisher( + &config, + Arc::clone(&bucket_manager), + Arc::clone(&block_store), + ); + Ok(Self { config, block_store, bucket_manager, multipart_manager, lock_store, + users_index_publisher, }) } @@ -207,6 +226,109 @@ impl UserSession { } } +/// Phase 3.2 users-index publisher constructor — env-flag-gated. +/// +/// Returns `None` when `FULA_USERS_INDEX_PUBLISHER_ENABLED` is unset +/// or "0"/"false". When enabled: +/// +/// | Env var | Default | +/// |---------------------------------------------|------------------------------------------------------------| +/// | `FULA_USERS_INDEX_STATE_PATH` | `/var/lib/fula-gateway/users_index_state.txt` | +/// | `FULA_USERS_INDEX_FLUSH_INTERVAL_SECS` | 300 | +/// | `FULA_USERS_INDEX_INTERNAL_TOKEN` | (none → endpoints fail-closed with 503) | +/// | `FULA_USERS_INDEX_IPNS_KEY_NAME` | `fula-users-index` | +/// | `FULA_USERS_INDEX_IPNS_LIFETIME_SECS` | 129600 (36h) | +/// | `FULA_USERS_INDEX_IPNS_TTL_SECS` | 900 (15m) | +/// | `FULA_USERS_INDEX_IPNS_DISABLED` | unset → IPNS enabled | +/// | `FULA_USERS_INDEX_FIRST_PUBLISH_PINS_PER_S` | 100 | +fn build_users_index_publisher( + config: &GatewayConfig, + bucket_manager: Arc>, + block_store: Arc, +) -> Option>> { + use crate::handlers::users_index_publisher::{ + IpnsPublisher, PublisherConfig, UsersIndexPublisher, + }; + use std::time::Duration; + + let enabled = std::env::var("FULA_USERS_INDEX_PUBLISHER_ENABLED") + .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) + .unwrap_or(false); + if !enabled { + info!("users-index publisher: disabled (FULA_USERS_INDEX_PUBLISHER_ENABLED unset)"); + return None; + } + + let state_file_path = std::env::var("FULA_USERS_INDEX_STATE_PATH") + .unwrap_or_else(|_| "/var/lib/fula-gateway/users_index_state.txt".to_string()) + .into(); + let flush_interval = Duration::from_secs( + std::env::var("FULA_USERS_INDEX_FLUSH_INTERVAL_SECS") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(300), + ); + let 
ipns_lifetime = Duration::from_secs( + std::env::var("FULA_USERS_INDEX_IPNS_LIFETIME_SECS") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(36 * 3600), + ); + let ipns_ttl = Duration::from_secs( + std::env::var("FULA_USERS_INDEX_IPNS_TTL_SECS") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(15 * 60), + ); + let ipns_key_name = std::env::var("FULA_USERS_INDEX_IPNS_KEY_NAME") + .unwrap_or_else(|_| "fula-users-index".to_string()); + let internal_token = std::env::var("FULA_USERS_INDEX_INTERNAL_TOKEN").ok().filter(|s| !s.is_empty()); + let first_publish_max_pins_per_sec = std::env::var("FULA_USERS_INDEX_FIRST_PUBLISH_PINS_PER_S") + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(100); + + let pub_config = PublisherConfig { + flush_interval, + first_publish_max_pins_per_sec, + ipns_lifetime, + ipns_ttl, + ipns_key_name: ipns_key_name.clone(), + state_file_path, + ipfs_api_url: config.ipfs_url.clone(), + internal_token: internal_token.clone(), + }; + + let ipns_disabled = std::env::var("FULA_USERS_INDEX_IPNS_DISABLED") + .map(|v| v == "1" || v.eq_ignore_ascii_case("true")) + .unwrap_or(false); + let ipns = if ipns_disabled { + warn!("users-index publisher: IPNS disabled (FULA_USERS_INDEX_IPNS_DISABLED=1) — chain backup is the only publish channel"); + None + } else { + Some(IpnsPublisher::new(config.ipfs_url.clone())) + }; + + match UsersIndexPublisher::open_with_ipns(pub_config, bucket_manager, block_store, ipns) { + Ok(p) => { + info!( + flush_interval_secs = flush_interval.as_secs(), + ipns_key_name = %ipns_key_name, + internal_token_set = internal_token.is_some(), + "users-index publisher: enabled" + ); + Some(Arc::new(p)) + } + Err(e) => { + warn!( + error = %e, + "users-index publisher: failed to open state file; publisher disabled for this run" + ); + None + } + } +} + /// Admin session information #[derive(Clone, Debug)] pub struct AdminSession { From 5e0e2828bb447bca55a4be2b5b6bec9f51e8c5d2 Mon Sep 17 00:00:00 2001 From: ehsan shariati Date: Mon, 4 
May 2026 11:44:21 -0400 Subject: [PATCH 3/6] Tolerate per-user pin failures and add wire helpers Make users-index publishing tolerant of individual per-user pin failures: collect per-user pin results without aborting the tick, surface a failed_users count in TickOutcome/PublishNowResponse, emit per-user and tick-level warnings, and add comprehensive tests (including a FaultyBlockStore) for partial/failing/all-fail/retry scenarios. Extract and unit-test HTTP-layer helpers for Phase 1.2: control-header filtering and parse_bucket_lookup_h_header (with explicit error enum) so the lookup_h header is handled cleanly and not persisted as user metadata. Enhance fula-client block cache: add KEY_TO_CID mapping for offline-fallback, resolver hot-start METADATA rows and accessors, debug impls, and store/load helpers for users_index state. Update fula-client Cargo.toml to add serde_ipld_dagcbor and sha3 (tests). Misc: wire failed_users through publish_now response and small cleanup/refactors to object header handling. 
--- Cargo.lock | 3 + crates/fula-cli/src/handlers/internal.rs | 10 + crates/fula-cli/src/handlers/object.rs | 274 ++- .../src/handlers/users_index_publisher.rs | 562 +++++- crates/fula-client/Cargo.toml | 10 + crates/fula-client/src/block_cache.rs | 451 ++++- crates/fula-client/src/client.rs | 1049 +++++++++- crates/fula-client/src/config.rs | 183 +- crates/fula-client/src/encryption.rs | 881 +++++++- crates/fula-client/src/error.rs | 86 + crates/fula-client/src/gateway_fetch.rs | 32 +- crates/fula-client/src/health_gate.rs | 235 ++- crates/fula-client/src/lib.rs | 21 + crates/fula-client/src/registry_resolver.rs | 1785 +++++++++++++++++ crates/fula-client/src/types.rs | 76 + crates/fula-flutter/Cargo.toml | 5 + crates/fula-flutter/src/api/client.rs | 170 +- crates/fula-flutter/src/api/error.rs | 74 + crates/fula-flutter/src/api/types.rs | 72 + crates/fula-flutter/src/frb_generated.rs | 20 + crates/fula-js/src/lib.rs | 204 +- 21 files changed, 6103 insertions(+), 100 deletions(-) create mode 100644 crates/fula-client/src/registry_resolver.rs diff --git a/Cargo.lock b/Cargo.lock index 4c0dfda..a3e6cfd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1813,8 +1813,10 @@ dependencies = [ "redb", "reqwest", "serde", + "serde_ipld_dagcbor", "serde_json", "sha2", + "sha3", "tempfile", "thiserror 2.0.17", "tokio", @@ -1921,6 +1923,7 @@ dependencies = [ "parking_lot", "serde", "serde_json", + "tempfile", "thiserror 2.0.17", "tokio", "wasm-bindgen-futures", diff --git a/crates/fula-cli/src/handlers/internal.rs b/crates/fula-cli/src/handlers/internal.rs index aeeb3c6..2fe7ef8 100644 --- a/crates/fula-cli/src/handlers/internal.rs +++ b/crates/fula-cli/src/handlers/internal.rs @@ -63,6 +63,15 @@ pub struct PublishNowResponse { pub global_cid: String, pub sequence: u64, pub changed_users: usize, + /// Number of users whose per-user CBOR pin failed this tick. 
+ /// Surfaces the per-user-error-tolerance count from + /// `TickOutcome.failed_users` so an operator clicking + /// "publish now" in the admin UI sees per-user pin failures + /// without tailing logs. A non-zero value means the published + /// global may exclude one or more users (or carry their prior + /// CIDs). The per-user `warn!` lines inside `run_tick` identify + /// WHICH users failed; this field is the count for surfacing. + pub failed_users: usize, pub total_users: usize, pub global_rebuilt: bool, } @@ -159,6 +168,7 @@ pub async fn publish_now( global_cid: outcome.global_cid.to_string(), sequence: outcome.sequence, changed_users: outcome.changed_users, + failed_users: outcome.failed_users, total_users: outcome.total_users, global_rebuilt: outcome.global_rebuilt, }; diff --git a/crates/fula-cli/src/handlers/object.rs b/crates/fula-cli/src/handlers/object.rs index e33594f..6863a07 100644 --- a/crates/fula-cli/src/handlers/object.rs +++ b/crates/fula-cli/src/handlers/object.rs @@ -130,14 +130,13 @@ pub async fn put_object( metadata = metadata.with_content_type(ct); } - // Extract user metadata (x-amz-meta-*). - // Internal Fula control headers (consumed by the handler, not stored as - // object metadata) are filtered out — they would otherwise pollute every - // object's persisted metadata. - const FULA_CONTROL_HEADERS: &[&str] = &["fula-bucket-lookup-h"]; + // Extract user metadata (x-amz-meta-*). Internal Fula control + // headers (consumed by the handler, not stored as object metadata) + // are filtered out via `is_fula_control_header` — they would + // otherwise pollute every object's persisted metadata. 
for (name, value) in headers.iter() { if let Some(key) = name.as_str().strip_prefix("x-amz-meta-") { - if FULA_CONTROL_HEADERS.contains(&key) { + if is_fula_control_header(key) { continue; } if let Ok(v) = value.to_str() { @@ -180,10 +179,8 @@ pub async fn put_object( .get("x-amz-meta-fula-bucket-lookup-h") .and_then(|v| v.to_str().ok()) { - match hex::decode(hex_str) { - Ok(bytes) if bytes.len() == 16 => { - let mut lookup_h = [0u8; 16]; - lookup_h.copy_from_slice(&bytes); + match parse_bucket_lookup_h_header(hex_str) { + Ok(lookup_h) => { match state.bucket_manager.populate_lookup_h_if_missing( &session.hashed_user_id, &bucket_name, @@ -205,12 +202,12 @@ pub async fn put_object( ), } } - Ok(other) => tracing::warn!( - actual_len = other.len(), + Err(BucketLookupHError::WrongLength { actual }) => tracing::warn!( + actual_len = actual, "x-amz-meta-fula-bucket-lookup-h: expected 16-byte hex (32 chars), got {} bytes", - other.len() + actual ), - Err(e) => tracing::warn!( + Err(BucketLookupHError::InvalidHex(e)) => tracing::warn!( error = %e, "Failed to hex-decode x-amz-meta-fula-bucket-lookup-h" ), @@ -768,6 +765,255 @@ fn parse_etag_list(s: &str) -> impl Iterator + '_ { }) } +// ============================================================ +// Phase 1.2 wire-path helpers (master-side) +// ============================================================ +// +// These are extracted out of the put_object handler so the +// header-parsing + control-header-filter logic can be unit-tested +// without spinning up the full HTTP server stack. Audit follow-up +// item #5: cover the wire path beyond the BucketManager-direct +// integration test in users_index_publisher.rs. + +/// Internal Fula control headers (consumed by handler logic, NOT +/// persisted as object metadata). The list is `pub(crate)` so it +/// can be referenced from sibling modules; tests below assert it +/// stays in lockstep with the handler's filtering. 
+pub(crate) const FULA_CONTROL_HEADERS: &[&str] = &["fula-bucket-lookup-h"]; + +/// Returns `true` if the given x-amz-meta key (already stripped of +/// the `x-amz-meta-` prefix) is a Fula control header — meaning it +/// should NOT end up in `ObjectMetadata.user_metadata` even though +/// it's a perfectly valid `x-amz-meta-*` name. +pub(crate) fn is_fula_control_header(stripped_key: &str) -> bool { + FULA_CONTROL_HEADERS.contains(&stripped_key) +} + +/// Parse error for the `x-amz-meta-fula-bucket-lookup-h` header +/// value. Three failure modes today; expanding this enum is +/// backward-compatible (the handler matches exhaustively). +#[derive(Debug)] +pub(crate) enum BucketLookupHError { + /// hex::decode failed — non-hex characters in the value. + InvalidHex(hex::FromHexError), + /// Decoded byte length wasn't 16 (the only legal width per + /// Phase 1.2 spec — `userKey`-equivalent 128-bit blinded key). + WrongLength { actual: usize }, +} + +impl From for BucketLookupHError { + fn from(e: hex::FromHexError) -> Self { + BucketLookupHError::InvalidHex(e) + } +} + +/// Parse `x-amz-meta-fula-bucket-lookup-h` header value into a +/// 16-byte fixed array. Pure: no I/O, no allocations beyond the +/// transient hex::decode buffer. Used by `put_object` to convert +/// the wire-format string into the format +/// `BucketManager::populate_lookup_h_if_missing` expects. +pub(crate) fn parse_bucket_lookup_h_header( + hex_str: &str, +) -> Result<[u8; 16], BucketLookupHError> { + let bytes = hex::decode(hex_str)?; + if bytes.len() != 16 { + return Err(BucketLookupHError::WrongLength { actual: bytes.len() }); + } + let mut out = [0u8; 16]; + out.copy_from_slice(&bytes); + Ok(out) +} + +#[cfg(test)] +mod phase_1_2_wire_tests { + //! Phase 1.2 wire-path tests. Covers what the existing + //! `users_index_publisher::test_run_tick_legacy_to_blinded_replaces_entry` + //! test does NOT cover: the HTTP-layer header extraction + + //! 
parsing logic that sits between an SDK request and a + //! `populate_lookup_h_if_missing` call. + + use super::*; + use axum::http::{HeaderMap, HeaderName, HeaderValue}; + + #[test] + fn control_header_filter_includes_lookup_h() { + // Audit gold: the lookup_h header IS recognized as a control + // header. If someone removes it from FULA_CONTROL_HEADERS the + // header would leak into user_metadata storage on every PUT. + assert!(is_fula_control_header("fula-bucket-lookup-h")); + } + + #[test] + fn control_header_filter_excludes_arbitrary_user_metadata() { + // Defensive: an app's own metadata keys must NOT be filtered. + assert!(!is_fula_control_header("content-language")); + assert!(!is_fula_control_header("x-fula-encrypted")); + assert!(!is_fula_control_header("")); + } + + #[test] + fn parse_lookup_h_accepts_valid_32_char_hex() { + // Mirrors what `compute_bucket_lookup_h_hex` produces in the + // SDK: 32 lowercase hex chars = 16 bytes. + let valid = "deadbeefcafebabefeedfacef00dbabe"; + let parsed = parse_bucket_lookup_h_header(valid).expect("valid 32-char hex"); + assert_eq!(parsed.len(), 16); + assert_eq!(parsed[0], 0xde); + assert_eq!(parsed[15], 0xbe); + } + + #[test] + fn parse_lookup_h_accepts_uppercase_hex() { + // hex::decode is case-insensitive; we don't normalize. + let valid = "DEADBEEFCAFEBABEFEEDFACEF00DBABE"; + let parsed = parse_bucket_lookup_h_header(valid).expect("uppercase ok"); + assert_eq!(parsed[0], 0xde); + } + + #[test] + fn parse_lookup_h_rejects_too_short() { + // 30 hex chars = 15 bytes — one short. + let too_short = "deadbeefcafebabefeedfacef00dba"; + match parse_bucket_lookup_h_header(too_short) { + Err(BucketLookupHError::WrongLength { actual: 15 }) => {} + other => panic!("expected WrongLength{{actual:15}}, got {:?}", other), + } + } + + #[test] + fn parse_lookup_h_rejects_too_long() { + // 34 hex chars = 17 bytes — one byte over. 
+ let too_long = "deadbeefcafebabefeedfacef00dbabe11"; + match parse_bucket_lookup_h_header(too_long) { + Err(BucketLookupHError::WrongLength { actual: 17 }) => {} + other => panic!("expected WrongLength{{actual:17}}, got {:?}", other), + } + } + + #[test] + fn parse_lookup_h_rejects_non_hex_chars() { + // 'z' is not a valid hex char; even at correct length this + // fails with InvalidHex. + let bad_chars = "zzadbeefcafebabefeedfacef00dbabe"; + match parse_bucket_lookup_h_header(bad_chars) { + Err(BucketLookupHError::InvalidHex(_)) => {} + other => panic!("expected InvalidHex, got {:?}", other), + } + } + + #[test] + fn parse_lookup_h_rejects_empty_string() { + // An empty header value reaches us as "" — must not parse + // to a zero-byte array. + match parse_bucket_lookup_h_header("") { + Err(BucketLookupHError::WrongLength { actual: 0 }) => {} + other => panic!("expected WrongLength{{actual:0}}, got {:?}", other), + } + } + + #[test] + fn parse_lookup_h_rejects_odd_length_hex() { + // 31 chars — odd-length is invalid per hex spec; hex::decode + // returns OddLength, which we surface as InvalidHex. + let odd = "deadbeefcafebabefeedfacef00dbab"; + match parse_bucket_lookup_h_header(odd) { + Err(BucketLookupHError::InvalidHex(_)) => {} + other => panic!("expected InvalidHex (odd length), got {:?}", other), + } + } + + /// End-to-end-ish wire-path simulation: from a real `HeaderMap` + /// (as the put_object handler would receive), extract: + /// - the user_metadata that should be persisted (lookup_h MUST + /// NOT appear there) + /// - the parsed lookup_h bytes (MUST equal what the SDK sent) + /// + /// This is the critical regression guard for "old client uploads + /// without header → no populate" vs "new client uploads with + /// header → populate fires with correct bytes". The integration + /// with `BucketManager` and the publisher is already covered by + /// `users_index_publisher::test_run_tick_legacy_to_blinded_replaces_entry`. 
+ #[test] + fn old_client_no_header_means_no_populate() { + let mut headers = HeaderMap::new(); + // Old client sends content-type and a user metadata key; no + // lookup_h header. + headers.insert( + HeaderName::from_static("content-type"), + HeaderValue::from_static("image/jpeg"), + ); + headers.insert( + HeaderName::from_static("x-amz-meta-myapp-tag"), + HeaderValue::from_static("vacation"), + ); + + // Wire-path step 1: lookup_h header absent → handler skips populate. + let lookup_h_present = headers.get("x-amz-meta-fula-bucket-lookup-h").is_some(); + assert!(!lookup_h_present, "no header on old-client PUT"); + + // Wire-path step 2: user_metadata extraction filters control + // headers (none to filter here, but the loop must include the + // app's own tag). + let mut user_meta: Vec<(String, String)> = Vec::new(); + for (name, value) in headers.iter() { + if let Some(key) = name.as_str().strip_prefix("x-amz-meta-") { + if is_fula_control_header(key) { + continue; + } + if let Ok(v) = value.to_str() { + user_meta.push((key.to_string(), v.to_string())); + } + } + } + assert_eq!(user_meta, vec![("myapp-tag".to_string(), "vacation".to_string())]); + } + + #[test] + fn new_client_header_parses_and_does_not_leak_into_user_metadata() { + let mut headers = HeaderMap::new(); + headers.insert( + HeaderName::from_static("content-type"), + HeaderValue::from_static("image/jpeg"), + ); + headers.insert( + HeaderName::from_static("x-amz-meta-fula-bucket-lookup-h"), + HeaderValue::from_static("aabbccddeeff00112233445566778899"), + ); + headers.insert( + HeaderName::from_static("x-amz-meta-myapp-tag"), + HeaderValue::from_static("vacation"), + ); + + // Wire-path step 1: lookup_h header parses to expected bytes. 
+ let hex_str = headers + .get("x-amz-meta-fula-bucket-lookup-h") + .and_then(|v| v.to_str().ok()) + .expect("present"); + let parsed = parse_bucket_lookup_h_header(hex_str).expect("valid hex"); + assert_eq!(parsed, [0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff, 0x00, 0x11, + 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99]); + + // Wire-path step 2: user_metadata extraction MUST drop the + // lookup_h header and keep the app's own tag. + let mut user_meta: Vec<(String, String)> = Vec::new(); + for (name, value) in headers.iter() { + if let Some(key) = name.as_str().strip_prefix("x-amz-meta-") { + if is_fula_control_header(key) { + continue; + } + if let Ok(v) = value.to_str() { + user_meta.push((key.to_string(), v.to_string())); + } + } + } + assert_eq!( + user_meta, + vec![("myapp-tag".to_string(), "vacation".to_string())], + "lookup_h header must NOT leak into user_metadata" + ); + } +} + #[cfg(test)] mod conditional_tests { use super::{match_if_match, match_if_none_match}; diff --git a/crates/fula-cli/src/handlers/users_index_publisher.rs b/crates/fula-cli/src/handlers/users_index_publisher.rs index 1fa72d6..5f1c699 100644 --- a/crates/fula-cli/src/handlers/users_index_publisher.rs +++ b/crates/fula-cli/src/handlers/users_index_publisher.rs @@ -27,7 +27,13 @@ //! one `tokio::spawn` from `server::run_server` after `AppState` is //! wrapped in `Arc`. The task lives for the process lifetime. -#![allow(dead_code)] // A3 will consume `internal_token` +// `dead_code` is permitted for module-level helpers that are exercised +// only in tests (e.g. `ipns_api_url_for_test`) or that are reserved for +// the planned Phase 3.3 SDK-side caller (e.g. structured config getters). +// Production paths (`run_tick`, `start_publisher_loop`, internal HTTP +// handlers) DO consume every field; this allow simply silences the +// warning chatter on the test-only accessors. 
+#![allow(dead_code)] use anyhow::Result as AnyResult; use cid::Cid; @@ -567,6 +573,22 @@ pub struct TickOutcome { /// re-pinned this tick. Always equal to `total_users` on the /// first tick (cache is empty). pub changed_users: usize, + /// Number of users whose per-user CBOR pin attempt failed this + /// tick. Per-user failures are tolerated: the tick continues with + /// the users that succeeded, the global is rebuilt with whatever + /// state the diff-cache currently holds (which means failed users + /// retain their PRIOR `bucketsIndexCid` if they had one, and are + /// absent from the published global if they had no prior pin). + /// Failed users are retried on the next tick because their + /// `content_hash` still mismatches the cache row. + /// + /// Operators monitor this field: a sustained non-zero value + /// across many ticks indicates a user whose data triggers a + /// pinning-service edge case and warrants investigation. The + /// publisher loop also emits a `warn!` line per failed user + /// inside `run_tick` (with the user_id and full error chain) so + /// the failing user is identifiable from logs alone. + pub failed_users: usize, /// Total number of users in `BucketManager.buckets` at this tick. pub total_users: usize, /// CID of the global users-index CBOR pinned this tick. @@ -734,18 +756,27 @@ impl UsersIndexPublisher { // Buffer-unordered keeps at most `max_concurrent` pin ops in // flight at any time (advisor's first-publish throttle). + // + // Per-user error tolerance: each task returns + // `(owner_id, AnyResult<(hash, cid)>)` so the outer loop can + // identify WHICH user failed and log it. Without this, an + // anyhow `?` in the inner closure would drop the owner_id + // and the loop level would only see an opaque error. 
let block_store = Arc::clone(&self.block_store); - let pin_results: Vec> = { + let pin_results: Vec<(String, AnyResult<([u8; 32], Cid)>)> = { use futures::stream::{self, StreamExt}; stream::iter(to_rebuild.into_iter().map(|(owner_id, buckets)| { let bs = Arc::clone(&block_store); async move { - let hash = compute_user_content_hash(&buckets); - let cbor = build_user_buckets_index(&buckets, now); - let cid = bs.put_ipld(&cbor).await?; - bs.pin(&cid, Some("fula-users-index-per-user")) - .await?; - Ok::<_, anyhow::Error>((owner_id, hash, cid)) + let inner: AnyResult<([u8; 32], Cid)> = async { + let hash = compute_user_content_hash(&buckets); + let cbor = build_user_buckets_index(&buckets, now); + let cid = bs.put_ipld(&cbor).await?; + bs.pin(&cid, Some("fula-users-index-per-user")).await?; + Ok((hash, cid)) + } + .await; + (owner_id, inner) } })) .buffer_unordered(max_concurrent) @@ -753,17 +784,43 @@ impl UsersIndexPublisher { .await }; + // Per-user error tolerance: a single user's pin failure must + // NOT abort the tick. Today's behavior (abort on first error) + // means at scale a single corrupted user blocks every user's + // cold-start visibility. With tolerance: + // - succeeded users update their diff_cache row + // - failed users keep their PRIOR diff_cache row (or have + // none if never succeeded) + // - global is rebuilt from the cache as it stands + // - failed users retry on the next tick because their + // `content_hash` still mismatches the (un-updated) cache row + // + // The `warn!` per failure carries owner_id + full anyhow chain + // so an operator can identify the failing user and root cause + // without combing through thread-of-execution traces. 
let mut changed_users = 0usize; - for r in pin_results { - let (owner_id, hash, cid) = r?; - self.diff_cache.lock().insert( - owner_id, - PerUserDiffEntry { - content_hash: hash, - buckets_index_cid: cid, - }, - ); - changed_users += 1; + let mut failed_users = 0usize; + for (owner_id, r) in pin_results { + match r { + Ok((hash, cid)) => { + self.diff_cache.lock().insert( + owner_id, + PerUserDiffEntry { + content_hash: hash, + buckets_index_cid: cid, + }, + ); + changed_users += 1; + } + Err(e) => { + failed_users += 1; + warn!( + user = %owner_id, + error = %e, + "users-index publisher: per-user pin failed; user will retry on next tick" + ); + } + } } // Prune diff-cache rows for users who disappeared from @@ -796,6 +853,14 @@ impl UsersIndexPublisher { if changed_users == 0 && users_pruned == 0 && prior.global_cid.is_some() { return Ok(TickOutcome { changed_users: 0, + // `failed_users` IS surfaced even on the no-op path — + // operators need to see "we tried to advance state for + // these N users this tick but couldn't" even when the + // global itself is unchanged. Without this, repeated + // failures on the same user would be invisible at the + // tick-outcome layer (only via the per-user warn! line + // inside run_tick). + failed_users, total_users, global_cid: prior.global_cid.expect("checked is_some"), sequence: prior.sequence, @@ -891,6 +956,7 @@ impl UsersIndexPublisher { Ok(TickOutcome { changed_users, + failed_users, total_users, global_cid, sequence: next_sequence, @@ -933,10 +999,28 @@ pub fn start_publisher_loop( interval.tick().await; match publisher.run_tick().await { Ok(outcome) => { + // Tick-level failure surfacing: when ≥ 1 user's + // pin failed but the tick otherwise progressed, + // emit a warn so the failure is visible at the + // loop layer (the per-user warn! inside run_tick + // identifies WHICH user; this one summarizes the + // shape so a log scraper / alerting rule can + // count `failed_users` per tick). 
+ if outcome.failed_users > 0 { + warn!( + sequence = outcome.sequence, + changed_users = outcome.changed_users, + failed_users = outcome.failed_users, + total_users = outcome.total_users, + global_rebuilt = outcome.global_rebuilt, + "users-index publisher: tick had per-user pin failures; failed users will retry next tick" + ); + } if outcome.global_rebuilt { info!( sequence = outcome.sequence, changed_users = outcome.changed_users, + failed_users = outcome.failed_users, total_users = outcome.total_users, cid = %outcome.global_cid, "users-index publisher: tick committed new global" @@ -1404,11 +1488,14 @@ mod tests { // *behavior* (sequence advance, pin/unpin, diff-cache state), // not exact CID values. - async fn create_user_bucket( - manager: &BucketManager, + async fn create_user_bucket( + manager: &BucketManager, user_id: &str, bucket_name: &str, - ) { + ) + where + S: fula_blockstore::BlockStore + fula_blockstore::PinStore + 'static, + { manager .create_bucket_for_user( user_id, @@ -1860,4 +1947,437 @@ mod tests { assert!(outcome.global_rebuilt); assert!(publisher.ipns_api_url_for_test().is_none()); } + + // ============================================================ + // Per-user error tolerance (Phase 3.2 production hardening) + // ============================================================ + // + // Before this hardening: a single user's pin failure aborted the + // ENTIRE tick (the `for r in pin_results { let (...) = r?; }` + // pattern at the per-user collection step). At scale this means + // one corrupted user blocks every user's cold-start visibility. + // + // After: per-user failures are tolerated. The tick continues with + // succeeded users, the global is rebuilt from whatever the + // diff_cache currently holds (failed users keep their PRIOR cache + // row if any), and failed users naturally retry on the next tick + // because their `content_hash` still mismatches the unchanged + // cache row. 
+ // + // The four scenarios below come from the advisor's required + // matrix: + // 1. Partial failure → succeeded users in global, failed users + // not in global, sequence advances. + // 2. All-unchanged + 1 new-but-failing → no rebuild needed, + // sequence does NOT advance (early-return path), and the + // "stale-but-consistent" property holds: prior global keeps + // serving prior CIDs. + // 3. All-fail-first-tick → empty global, sequence = 1, + // failed_users = N (deliberate empty-global semantic, same + // as zero-users-on-first-tick). + // 4. Failed user retries successfully on next tick → eventually + // appears in global. + + /// Test-only fault-injecting block store. Wraps `MemoryBlockStore` + /// and fails `put_ipld` whenever the serialized CBOR bytes contain + /// the configured marker substring. Tests set up a fault by + /// naming a bucket with the marker; the per-user CBOR for that + /// user contains the bucket name (Phase 1.2 legacy mode keys + /// entries by plaintext name when `bucket_lookup_h = None`), so + /// `put_ipld(&UserBucketsIndex)` for that user fails with the + /// marker present. + /// + /// **Why content-driven, not order-driven.** Production failures + /// are content-driven (a specific user's data triggers a + /// pinning-service edge case). Substring matching captures that + /// failure shape and stays robust to any future refactor of + /// `buffer_unordered` ordering inside `run_tick`. + /// + /// The marker is also (incidentally) present in `BucketRegistry` + /// CBORs that `BucketManager::persist_registry` writes, but that + /// failure is caught by `create_bucket_for_user` (line 909-911 + /// in bucket.rs) and only logged at warn level — the in-memory + /// `BucketManager.buckets` is updated regardless, which is what + /// the publisher reads. 
+    #[derive(Clone)]
+    struct FaultyBlockStore {
+        inner: Arc<MemoryBlockStore>,
+        fail_marker: Arc<Mutex<Option<Vec<u8>>>>,
+    }
+
+    impl FaultyBlockStore {
+        fn new(inner: Arc<MemoryBlockStore>) -> Self {
+            Self {
+                inner,
+                fail_marker: Arc::new(Mutex::new(None)),
+            }
+        }
+
+        /// Configure the marker. `Some(s)` causes `put_ipld` to fail
+        /// when serialized bytes contain `s`. `None` clears injection.
+        fn set_fail_marker(&self, marker: Option<&str>) {
+            *self.fail_marker.lock() = marker.map(|s| s.as_bytes().to_vec());
+        }
+
+        /// Test helper: clone the inner store handle to inspect what
+        /// got pinned (since FaultyBlockStore.pin delegates).
+        fn inner(&self) -> Arc<MemoryBlockStore> {
+            Arc::clone(&self.inner)
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl fula_blockstore::BlockStore for FaultyBlockStore {
+        async fn put_block(&self, data: &[u8]) -> fula_blockstore::Result<Cid> {
+            self.inner.put_block(data).await
+        }
+        async fn get_block(&self, cid: &Cid) -> fula_blockstore::Result<Vec<u8>> {
+            self.inner.get_block(cid).await
+        }
+        async fn has_block(&self, cid: &Cid) -> fula_blockstore::Result<bool> {
+            self.inner.has_block(cid).await
+        }
+        async fn delete_block(&self, cid: &Cid) -> fula_blockstore::Result<()> {
+            self.inner.delete_block(cid).await
+        }
+        async fn block_size(&self, cid: &Cid) -> fula_blockstore::Result<u64> {
+            self.inner.block_size(cid).await
+        }
+        async fn put_ipld(
+            &self,
+            data: &T,
+        ) -> fula_blockstore::Result<Cid> {
+            // Delegate to the inner store first so the bytes are
+            // available for marker inspection via `get_block`. This
+            // avoids depending on serde_ipld_dagcbor directly (which
+            // isn't a fula-cli direct dep). The "block stored but
+            // not pinned" outcome models real production failures
+            // where a block reaches kubo but the cluster pin call
+            // fails — which is exactly the failure-mode this
+            // tolerance work guards against.
+            let cid = self.inner.put_ipld(data).await?;
+            // Snapshot the marker out of the parking_lot mutex guard
+            // before any `.await`. parking_lot's `MutexGuard` is not
+            // `Send`, so holding it across an await point makes the
+            // future non-Send and tokio refuses to spawn it.
+            let marker_snapshot: Option<Vec<u8>> = self.fail_marker.lock().clone();
+            if let Some(marker) = marker_snapshot {
+                if !marker.is_empty() {
+                    let bytes = self.inner.get_block(&cid).await?;
+                    if bytes.windows(marker.len()).any(|w| w == marker.as_slice()) {
+                        return Err(fula_blockstore::BlockStoreError::PinFailed(
+                            "test-injected fault: marker substring present in stored block".into(),
+                        ));
+                    }
+                }
+            }
+            Ok(cid)
+        }
+        async fn get_ipld(
+            &self,
+            cid: &Cid,
+        ) -> fula_blockstore::Result {
+            self.inner.get_ipld(cid).await
+        }
+    }
+
+    #[async_trait::async_trait]
+    impl fula_blockstore::PinStore for FaultyBlockStore {
+        async fn pin(&self, cid: &Cid, name: Option<&str>) -> fula_blockstore::Result<()> {
+            self.inner.pin(cid, name).await
+        }
+        async fn pin_with_token(
+            &self,
+            cid: &Cid,
+            name: Option<&str>,
+            token: &str,
+        ) -> fula_blockstore::Result<()> {
+            self.inner.pin_with_token(cid, name, token).await
+        }
+        async fn unpin(&self, cid: &Cid) -> fula_blockstore::Result<()> {
+            self.inner.unpin(cid).await
+        }
+        async fn is_pinned(&self, cid: &Cid) -> fula_blockstore::Result<bool> {
+            self.inner.is_pinned(cid).await
+        }
+        async fn list_pins(&self) -> fula_blockstore::Result<Vec<Cid>> {
+            self.inner.list_pins().await
+        }
+        async fn pin_status(&self, cid: &Cid) -> fula_blockstore::Result {
+            self.inner.pin_status(cid).await
+        }
+    }
+
+    /// Marker substring used by the per-user-error-tolerance tests.
+ /// Picked to be: + /// - lowercase letters + hyphens only → passes + /// `validate_bucket_name` so it can be a real bucket name + /// - long enough (19 chars) that a false-positive substring + /// match in random CBOR bytes is implausible + const FAULT_MARKER: &str = "fault-inject-bucket"; + + fn fixture_publisher_with_faulty_store( + path: PathBuf, + ) -> ( + UsersIndexPublisher, + Arc, + Arc>, + ) { + let inner = Arc::new(MemoryBlockStore::new()); + let faulty = Arc::new(FaultyBlockStore::new(Arc::clone(&inner))); + let manager = Arc::new(BucketManager::new(Arc::clone(&faulty))); + let publisher = UsersIndexPublisher::open_without_ipns( + fixture_config(path), + Arc::clone(&manager), + Arc::clone(&faulty), + ) + .expect("open"); + (publisher, faulty, manager) + } + + #[tokio::test] + async fn test_run_tick_partial_failure_publishes_succeeded_users() { + // Scenario 1: alice has a normal bucket, bob has a bucket + // whose name contains FAULT_MARKER. Bob's per-user CBOR + // pin fails. Alice's succeeds. The tick continues, advances + // sequence, and the published global contains alice but + // NOT bob. + let dir = TempDir::new().unwrap(); + let path = dir.path().join("state.txt"); + let (publisher, store, manager) = fixture_publisher_with_faulty_store(path); + + store.set_fail_marker(Some(FAULT_MARKER)); + + create_user_bucket(&manager, "alice", "photos").await; + // Bob's bucket name contains the marker. The per-user CBOR + // for bob is keyed by plaintext bucket name (Phase 1.2 legacy + // mode), so the marker substring lands in the CBOR bytes. 
+ create_user_bucket(&manager, "bob", FAULT_MARKER).await; + + let outcome = publisher + .run_tick() + .await + .expect("tick MUST return Ok despite per-user pin failure"); + + assert_eq!( + outcome.changed_users, 1, + "exactly one user's CBOR was newly pinned (alice)" + ); + assert_eq!( + outcome.failed_users, 1, + "exactly one user's pin failed (bob)" + ); + assert_eq!(outcome.total_users, 2); + assert!( + outcome.global_rebuilt, + "global must be rebuilt to reflect alice's commit" + ); + assert_eq!(outcome.sequence, 1); + + // Decode the global CBOR: alice present, bob absent. + let inner = store.inner(); + let global: GlobalUsersIndex = + inner.get_ipld(&outcome.global_cid).await.expect("global"); + assert!( + global.users.contains_key("alice"), + "alice's userKey must be in published global" + ); + assert!( + !global.users.contains_key("bob"), + "bob's userKey must NOT be in published global (his pin failed)" + ); + } + + #[tokio::test] + async fn test_run_tick_failed_user_keeps_prior_cid_in_global() { + // Scenario 2 (advisor-mandated rigor): tick 1 — alice + bob + // both succeed. Tick 2 — alice gets a new bucket (will succeed), + // bob gets a marker bucket (will fail). The "stale-but- + // consistent" property: bob's entry in tick 2's published + // global must equal bob's PRIOR CID (from tick 1), NOT his + // new failed-pin CID. + // + // This guards against a future refactor that might + // accidentally republish bob with a stale-or-empty entry. If + // that happens, cold-start would point at content that isn't + // pinned, breaking bob's reads. + let dir = TempDir::new().unwrap(); + let path = dir.path().join("state.txt"); + let (publisher, store, manager) = fixture_publisher_with_faulty_store(path); + + // Tick 1: both users succeed. 
+ create_user_bucket(&manager, "alice", "photos").await; + create_user_bucket(&manager, "bob", "docs").await; + let first = publisher.run_tick().await.expect("first tick"); + assert_eq!(first.changed_users, 2); + assert_eq!(first.failed_users, 0); + + // Capture bob's PRIOR per-user bucketsIndex CID. + let inner = store.inner(); + let first_global: GlobalUsersIndex = + inner.get_ipld(&first.global_cid).await.expect("first global"); + // CIDs in the global are stored as strings (not Cid), so + // clone for comparison after the next get_ipld call. + let bob_prior_cid = first_global.users["bob"].clone(); + let alice_prior_cid = first_global.users["alice"].clone(); + + // Defensive sanity: bob's prior CID's bytes are present in + // the inner store. If a future refactor made `bob_prior_cid` + // a default/empty Cid, the equality assertion below would + // pass for the wrong reason. This catches that. + let bob_prior_cid_parsed: Cid = bob_prior_cid.parse().expect("parse prior cid"); + assert!( + inner.get_block(&bob_prior_cid_parsed).await.is_ok(), + "bob's prior bucketsIndex CID must reference real bytes (sanity)" + ); + + // Now turn on fault injection. + store.set_fail_marker(Some(FAULT_MARKER)); + + // Alice gets a new (clean) bucket → her CBOR rebuilds + pins OK. + create_user_bucket(&manager, "alice", "videos").await; + // Bob gets a marker bucket → his per-user CBOR pin fails. + create_user_bucket(&manager, "bob", FAULT_MARKER).await; + + let second = publisher.run_tick().await.expect("second tick"); + assert_eq!( + second.changed_users, 1, + "alice's CBOR rebuild succeeded; bob's failed" + ); + assert_eq!( + second.failed_users, 1, + "bob's pin failed" + ); + assert!( + second.global_rebuilt, + "alice's change forces global rebuild" + ); + assert_eq!(second.sequence, 2, "sequence advances on real change"); + assert_ne!( + second.global_cid, first.global_cid, + "global CID must change because alice changed" + ); + + // Decode tick 2's global. 
bob's entry MUST be his PRIOR cid; + // alice's entry MUST be her new cid. + let second_global: GlobalUsersIndex = + inner.get_ipld(&second.global_cid).await.expect("second global"); + assert_eq!( + second_global.users["bob"], bob_prior_cid, + "stale-but-consistent: bob's failed pin must NOT erase his prior CID; \ + cold-start serves bob's prior bucketsIndex (still pinned + accessible)" + ); + assert_ne!( + second_global.users["alice"], alice_prior_cid, + "alice's CID changed because her content changed and her pin succeeded" + ); + } + + #[tokio::test] + async fn test_run_tick_all_users_fail_first_tick_publishes_empty_global() { + // Scenario 3: every user's pin fails on the first tick. + // No prior state to preserve → publisher proceeds to publish + // an EMPTY global (same code path as "zero users on first + // tick", which the existing + // `test_run_tick_no_users_first_publish_emits_empty_global` + // test already pins down). + // + // Operators see this as a nonzero `failed_users` in TickOutcome + // + per-user `warn!` lines. The empty-global publish itself + // is not a regression: the next tick when users start + // succeeding republishes with non-empty global, sequence + // advances. The chain anchor cron eventually submits the + // first non-empty CID. No data corruption, no stuck state. + let dir = TempDir::new().unwrap(); + let path = dir.path().join("state.txt"); + let (publisher, store, manager) = fixture_publisher_with_faulty_store(path); + + store.set_fail_marker(Some(FAULT_MARKER)); + + // Both users have marker buckets → both pins fail. + create_user_bucket(&manager, "alice", FAULT_MARKER).await; + // Different bucket name to ensure two distinct users (BucketManager + // accepts duplicate names per-user but we want two USERS). 
+ let bob_bucket_name = format!("{}-2", FAULT_MARKER); + create_user_bucket(&manager, "bob", &bob_bucket_name).await; + + let outcome = publisher + .run_tick() + .await + .expect("tick MUST return Ok even when every per-user pin fails"); + + assert_eq!( + outcome.changed_users, 0, + "no per-user CBOR was successfully pinned" + ); + assert_eq!(outcome.failed_users, 2); + assert_eq!(outcome.total_users, 2); + assert!( + outcome.global_rebuilt, + "first publish must run even when every user failed (same as zero-users path)" + ); + assert_eq!(outcome.sequence, 1); + + let inner = store.inner(); + let global: GlobalUsersIndex = + inner.get_ipld(&outcome.global_cid).await.expect("global"); + assert_eq!( + global.users.len(), + 0, + "global has zero users — every user's CBOR pin failed" + ); + } + + #[tokio::test] + async fn test_run_tick_failed_user_retries_on_next_tick() { + // Scenario 4: bob fails on tick 1. Marker is cleared between + // ticks. On tick 2, bob's content_hash STILL mismatches his + // (unupdated) diff_cache row, so he's in `to_rebuild`. His + // pin succeeds this time; he appears in tick 2's global. + let dir = TempDir::new().unwrap(); + let path = dir.path().join("state.txt"); + let (publisher, store, manager) = fixture_publisher_with_faulty_store(path); + + // Set up: alice clean, bob with marker. + create_user_bucket(&manager, "alice", "photos").await; + create_user_bucket(&manager, "bob", FAULT_MARKER).await; + + // Tick 1: marker active → bob fails. + store.set_fail_marker(Some(FAULT_MARKER)); + let first = publisher.run_tick().await.expect("first tick"); + assert_eq!(first.changed_users, 1); + assert_eq!(first.failed_users, 1); + + let inner = store.inner(); + let first_global: GlobalUsersIndex = + inner.get_ipld(&first.global_cid).await.expect("first global"); + assert!( + !first_global.users.contains_key("bob"), + "bob absent from tick 1's global (failed pin)" + ); + + // Tick 2: clear the marker. 
bob's content_hash still doesn't + // match the (empty) cache row, so he's re-attempted. Pin + // succeeds this time → bob is in the global. + store.set_fail_marker(None); + let second = publisher.run_tick().await.expect("second tick"); + assert_eq!( + second.changed_users, 1, + "bob's retry succeeded; alice was unchanged" + ); + assert_eq!(second.failed_users, 0); + assert!(second.global_rebuilt); + assert_eq!(second.sequence, 2); + + let second_global: GlobalUsersIndex = + inner.get_ipld(&second.global_cid).await.expect("second global"); + assert!( + second_global.users.contains_key("bob"), + "bob present in tick 2's global (retry succeeded)" + ); + assert!( + second_global.users.contains_key("alice"), + "alice still present (unchanged across the two ticks)" + ); + } } diff --git a/crates/fula-client/Cargo.toml b/crates/fula-client/Cargo.toml index 017b095..5543805 100644 --- a/crates/fula-client/Cargo.toml +++ b/crates/fula-client/Cargo.toml @@ -56,6 +56,12 @@ cid = { workspace = true } sha2 = { workspace = true } # Mutex for per-gateway state in gateway_fetch (Phase 2.3). parking_lot = { workspace = true } +# Phase 3.3 cold-start hybrid resolver — parses the master-published +# global users-index dag-cbor payload directly (the resolver doesn't +# go through the full BlockStore trait, so it needs the codec by hand). +# `serde_json` for the chain-side eth_call request body is already +# pulled in via the cross-platform top-level deps. +serde_ipld_dagcbor = { workspace = true } [target.'cfg(target_arch = "wasm32")'.dependencies] # WASM: disable default features (no tokio), enable wasm feature @@ -72,6 +78,10 @@ tokio = { workspace = true } tokio-test = "0.4" wiremock = { workspace = true } tempfile = { workspace = true } +# Phase 3.3 — verifies the hardcoded `latest()` ABI selector matches +# `Keccak256("latest()")[..4]`. Tests-only; the prod build hardcodes +# the 4-byte constant to avoid a runtime crypto dep on the hot path. 
+sha3 = "0.10" [features] default = [] diff --git a/crates/fula-client/src/block_cache.rs b/crates/fula-client/src/block_cache.rs index 52e84ca..b2d6595 100644 --- a/crates/fula-client/src/block_cache.rs +++ b/crates/fula-client/src/block_cache.rs @@ -56,6 +56,40 @@ use tokio::sync::Mutex; const BLOCKS: TableDefinition<&[u8], &[u8]> = TableDefinition::new("blocks"); const META: TableDefinition<&[u8], u64> = TableDefinition::new("meta"); +/// Phase 2.4 lookup table: maps `(bucket, key)` (hashed with a +/// domain separator) → CID bytes. Used by the offline-fallback path +/// to translate an S3-key request into the IPFS CID it can fetch via +/// the gateway race. Populated as a side-effect of master-up reads +/// in `FulaClient::get_object_with_offline_fallback`. +/// +/// Key format: `BLAKE3("fula:block-cache:key-to-cid:v1" || bucket || 0x00 || key)[..32]` +/// — fixed 32 bytes, collision-resistant, fast B-tree lookup. Value: +/// raw CID bytes (the same encoding used as the BLOCKS table key, so +/// a `KEY_TO_CID` lookup directly gives the bytes needed to query +/// BLOCKS or to construct a `Cid` for the gateway race). +const KEY_TO_CID: TableDefinition<&[u8], &[u8]> = TableDefinition::new("key_to_cid"); + +/// Phase 3.3.5 small-key-value metadata table. Stores resolver +/// hot-start state across SDK restarts: +/// - `users_index/cid` → CID bytes (cid.to_bytes()) +/// - `users_index/sequence` → u64 BE +/// - `users_index/observed_at_unix` → u64 BE +/// +/// Three rows, ~80 bytes total. The cached `(cid, sequence)` seeds +/// the resolver's replay-defense floor on construction; a fresh +/// `observed_at` lets the resolver short-circuit IPNS+chain when +/// the entry is within `ResolverConfig::soft_ttl`. +/// +/// Schema versioning: deliberately omitted in 3.3.5 (advisor cut). +/// When a v2 schema lands, add a `metadata.schema_id` constant + +/// drop-on-mismatch logic together with the real migration story. 
+const METADATA: TableDefinition<&[u8], &[u8]> = TableDefinition::new("metadata"); + +/// Metadata row keys (string literals stored as `&[u8]`). +const META_USERS_INDEX_CID: &[u8] = b"users_index/cid"; +const META_USERS_INDEX_SEQUENCE: &[u8] = b"users_index/sequence"; +const META_USERS_INDEX_OBSERVED_AT: &[u8] = b"users_index/observed_at_unix"; + /// Eviction low-watermark: when triggered, free space until usage is at /// or below this fraction of `max_bytes`. 80 % is the industry-standard /// "evict-once-amortize-many-puts" point. @@ -142,7 +176,7 @@ impl From for BlockCacheError { /// /// Cheap-clone via `Arc`: clones share the same database, so a `put` /// observed by one clone is immediately visible to all others. -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct BlockCache { inner: Arc, } @@ -159,6 +193,19 @@ struct BlockCacheInner { evict_lock: Mutex<()>, } +// `redb::Database` doesn't implement `Debug`, so we hand-roll a +// minimal `Debug` for `BlockCacheInner` that prints just the +// observable knobs. Required because `UsersIndexResolver` derives +// `Debug` and now holds an `Option>`. +impl std::fmt::Debug for BlockCacheInner { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("BlockCacheInner") + .field("max_bytes", &self.max_bytes) + .field("current_bytes", &self.current_bytes.load(Ordering::Acquire)) + .finish_non_exhaustive() + } +} + impl BlockCache { /// Open or create the block cache at `path` with a budget of /// `max_bytes` total stored block-bytes. @@ -194,6 +241,14 @@ impl BlockCache { { let _ = init_txn.open_table(BLOCKS)?; let _ = init_txn.open_table(META)?; + // Phase 2.4 — additive table. An older redb file written + // before Phase 2.4 will not have it; opening it here + // creates it lazily without touching BLOCKS / META data. + let _ = init_txn.open_table(KEY_TO_CID)?; + // Phase 3.3.5 — resolver hot-start metadata. 
Same + // additive-on-open pattern; older Phase 2.x cache files + // gain it transparently on next open. + let _ = init_txn.open_table(METADATA)?; } init_txn.commit()?; @@ -221,7 +276,18 @@ impl BlockCache { }) } + // The three accessors below — `max_bytes`, `current_bytes`, + // `entry_count` — are public monitoring API for SDK consumers + // (apps that want to surface "cache 240 / 256 MiB used" UI, or + // for ops dashboards). The fula-client crate itself doesn't call + // them internally, hence the `#[allow(dead_code)]` to silence + // the workspace-default warning. Phase 19 (`HealthCallback` / + // `ReadFreshness`) will likely expose these via a typed status + // struct rather than direct field access; keep the accessors + // public until then so app integrators have a stable surface. + /// Configured budget in bytes. + #[allow(dead_code)] pub fn max_bytes(&self) -> u64 { self.inner.max_bytes } @@ -229,12 +295,14 @@ impl BlockCache { /// Approximate current byte usage. Eventually consistent under /// concurrent writes (the next read after all writes settle is /// exact). + #[allow(dead_code)] pub fn current_bytes(&self) -> u64 { self.inner.current_bytes.load(Ordering::Acquire) } /// Number of cached blocks. O(1) approximation via the underlying /// table length. + #[allow(dead_code)] pub fn entry_count(&self) -> Result { let read = self.inner.db.begin_read()?; let table = read.open_table(BLOCKS)?; @@ -336,6 +404,139 @@ impl BlockCache { Ok(()) } + /// Phase 2.4 — record an `(bucket, key) → cid` mapping observed + /// during a successful master-up read. Lets the offline-fallback + /// path translate a future S3-key request into the IPFS CID it + /// can fetch via the gateway race. + /// + /// Idempotent on repeated calls with the same arguments. The + /// underlying redb table grows unbounded today (one entry per + /// distinct `(bucket, key)` tuple ever observed). 
At expected + /// scale (a few thousand objects per device) this is fine; if + /// growth becomes an issue, eviction can be added at the same + /// point as block-cache LRU eviction in a future iteration. + /// Note that the mapping is small (~40 bytes per entry vs. + /// kilobytes for typical block payloads), so the BLOCKS table's + /// LRU pressure dominates space concerns by orders of magnitude. + pub fn record_key_cid( + &self, + bucket: &str, + key: &str, + cid: &Cid, + ) -> Result<(), BlockCacheError> { + let lookup_key = derive_key_cid_lookup(bucket, key); + let cid_bytes = cid.to_bytes(); + let txn = self.inner.db.begin_write()?; + { + let mut table = txn.open_table(KEY_TO_CID)?; + table.insert(lookup_key.as_slice(), cid_bytes.as_slice())?; + } + txn.commit()?; + Ok(()) + } + + /// Phase 2.4 — look up a previously-observed CID for `(bucket, key)`. + /// Returns `None` if the SDK has not seen this object during a + /// master-up read yet (the cold-start case, which the wrapper + /// surfaces as `MasterUnreachable` so Phase 3.3 can take over). + pub fn lookup_cid(&self, bucket: &str, key: &str) -> Result, BlockCacheError> { + let lookup_key = derive_key_cid_lookup(bucket, key); + let read = self.inner.db.begin_read()?; + let table = read.open_table(KEY_TO_CID)?; + match table.get(lookup_key.as_slice())? { + Some(v) => { + let bytes = v.value(); + // Round-trip through Cid to validate; corrupt entries + // are rare (would mean redb bit-flip) but failing + // closed is safer than serving a malformed CID to the + // gateway race. + Cid::try_from(bytes) + .map(Some) + .map_err(|e| BlockCacheError::Corrupt(format!("invalid CID in KEY_TO_CID: {}", e))) + } + None => Ok(None), + } + } + + /// Phase 3.3.5 — persist the resolver's last successful resolve + /// so a future SDK process can skip the IPNS+chain dance when + /// it's still fresh AND seed the replay-defense floor across + /// restarts. 
+ /// + /// Single redb write transaction (atomic across the three rows). + /// Crate-private: apps must not plant resolver state directly. + pub(crate) fn store_users_index_state( + &self, + cid: &Cid, + sequence: u64, + observed_at_unix: u64, + ) -> Result<(), BlockCacheError> { + let cid_bytes = cid.to_bytes(); + let txn = self.inner.db.begin_write()?; + { + let mut table = txn.open_table(METADATA)?; + table.insert(META_USERS_INDEX_CID, cid_bytes.as_slice())?; + table.insert(META_USERS_INDEX_SEQUENCE, sequence.to_be_bytes().as_slice())?; + table.insert( + META_USERS_INDEX_OBSERVED_AT, + observed_at_unix.to_be_bytes().as_slice(), + )?; + } + txn.commit()?; + Ok(()) + } + + /// Phase 3.3.5 — load the resolver hot-start state. Returns + /// `None` if any of the three rows is missing or malformed + /// (treats partial writes as if the cache were empty — the + /// resolver then falls through to a full IPNS+chain resolve). + pub(crate) fn load_users_index_state( + &self, + ) -> Result, BlockCacheError> { + let read = self.inner.db.begin_read()?; + let table = read.open_table(METADATA)?; + + let cid_bytes = match table.get(META_USERS_INDEX_CID)? { + Some(v) => v.value().to_vec(), + None => return Ok(None), + }; + let cid = match Cid::try_from(cid_bytes.as_slice()) { + Ok(c) => c, + // Malformed → treat as no state (defensive). Don't + // surface as Corrupt — that would block all hot-start + // reads on a single bad row instead of degrading to a + // fresh resolve. + Err(e) => { + tracing::warn!(error = %e, "users-index metadata: invalid CID; treating as empty"); + return Ok(None); + } + }; + + let seq_bytes = match table.get(META_USERS_INDEX_SEQUENCE)? { + Some(v) => v.value().to_vec(), + None => return Ok(None), + }; + let observed_bytes = match table.get(META_USERS_INDEX_OBSERVED_AT)? 
{ + Some(v) => v.value().to_vec(), + None => return Ok(None), + }; + if seq_bytes.len() != 8 || observed_bytes.len() != 8 { + tracing::warn!("users-index metadata: malformed length; treating as empty"); + return Ok(None); + } + + let mut seq = [0u8; 8]; + seq.copy_from_slice(&seq_bytes); + let mut obs = [0u8; 8]; + obs.copy_from_slice(&observed_bytes); + + Ok(Some(( + cid, + u64::from_be_bytes(seq), + u64::from_be_bytes(obs), + ))) + } + /// Evict LRU entries until `current_bytes <= target_bytes`. Caller /// must hold `evict_lock`. Atomic via a single redb write txn. fn evict_to(&self, target_bytes: u64) -> Result<(), BlockCacheError> { @@ -396,6 +597,26 @@ fn now_ms() -> u64 { .unwrap_or(0) } +/// Phase 2.4 — derive the redb-key for the KEY_TO_CID table. +/// +/// `BLAKE3("fula:block-cache:key-to-cid:v1" || bucket || 0x00 || key)[..32]`. +/// Domain separator pins the namespace; the embedded `0x00` between +/// bucket and key forecloses any ambiguity from S3 keys that contain +/// `/` (a single concatenation without separator could collide +/// `bucket=foo, key=bar` with `bucket=foo/bar, key=`). 32-byte output +/// is fixed-length for fast B-tree lookups. 
+fn derive_key_cid_lookup(bucket: &str, key: &str) -> [u8; 32] { + let mut hasher = blake3::Hasher::new(); + hasher.update(b"fula:block-cache:key-to-cid:v1"); + hasher.update(bucket.as_bytes()); + hasher.update(&[0u8]); + hasher.update(key.as_bytes()); + let h = hasher.finalize(); + let mut out = [0u8; 32]; + out.copy_from_slice(h.as_bytes()); + out +} + #[cfg(test)] mod tests { use super::*; @@ -625,4 +846,232 @@ mod tests { assert!(cache.current_bytes() >= 256); } } + + // ============================================================ + // Phase 2.4 — KEY_TO_CID lookup table tests + // ============================================================ + + #[tokio::test] + async fn test_record_and_lookup_key_cid_roundtrip() { + let dir = TempDir::new().unwrap(); + let cache = open_cache(&dir, 1024 * 1024); + + let cid = test_cid(123); + cache + .record_key_cid("photos", "vacation/dsc_001.jpg", &cid) + .expect("record"); + + let got = cache + .lookup_cid("photos", "vacation/dsc_001.jpg") + .expect("lookup") + .expect("present"); + assert_eq!(got, cid, "round-trip yields the exact CID"); + } + + #[tokio::test] + async fn test_lookup_missing_key_returns_none() { + let dir = TempDir::new().unwrap(); + let cache = open_cache(&dir, 1024 * 1024); + let got = cache.lookup_cid("photos", "never-seen.jpg").expect("lookup"); + assert!(got.is_none(), "cold-start must return None, not error"); + } + + #[tokio::test] + async fn test_record_idempotent_on_repeat() { + // Re-recording the same (bucket, key, cid) triple must not error, + // and the lookup must continue returning the same CID. This is + // load-bearing: the offline-fallback wrapper records on every + // master-up read, so the same object will be re-recorded on each + // refetch. 
+ let dir = TempDir::new().unwrap(); + let cache = open_cache(&dir, 1024 * 1024); + + let cid = test_cid(7); + for _ in 0..5 { + cache.record_key_cid("docs", "tax/2024.pdf", &cid).expect("record"); + } + let got = cache.lookup_cid("docs", "tax/2024.pdf").expect("lookup").expect("hit"); + assert_eq!(got, cid); + } + + #[tokio::test] + async fn test_record_overwrites_when_cid_changes() { + // After an object is updated on master, the etag (= CID) + // changes. The next master-up read records the NEW CID under + // the same `(bucket, key)` — and the old CID entry is replaced. + // Otherwise offline reads would serve a stale block forever. + let dir = TempDir::new().unwrap(); + let cache = open_cache(&dir, 1024 * 1024); + + let cid_v1 = test_cid(1); + let cid_v2 = test_cid(2); + + cache.record_key_cid("photos", "live.jpg", &cid_v1).expect("v1"); + cache.record_key_cid("photos", "live.jpg", &cid_v2).expect("v2"); + + let got = cache.lookup_cid("photos", "live.jpg").expect("lookup").expect("hit"); + assert_eq!(got, cid_v2, "must reflect the latest recorded CID"); + assert_ne!(got, cid_v1); + } + + #[tokio::test] + async fn test_distinct_buckets_dont_collide() { + // Same key in different buckets must map to distinct CIDs. The + // BLAKE3 domain-separated lookup-key derivation guarantees this; + // a regression here would mean two users seeing each other's data + // via the offline path. 
+ let dir = TempDir::new().unwrap(); + let cache = open_cache(&dir, 1024 * 1024); + + let cid_a = test_cid(10); + let cid_b = test_cid(11); + + cache.record_key_cid("alice-bucket", "shared.txt", &cid_a).expect("a"); + cache.record_key_cid("bob-bucket", "shared.txt", &cid_b).expect("b"); + + let got_a = cache + .lookup_cid("alice-bucket", "shared.txt") + .expect("lookup") + .expect("hit"); + let got_b = cache + .lookup_cid("bob-bucket", "shared.txt") + .expect("lookup") + .expect("hit"); + assert_eq!(got_a, cid_a); + assert_eq!(got_b, cid_b); + assert_ne!(got_a, got_b, "isolation between buckets is mandatory"); + } + + #[tokio::test] + async fn test_key_to_cid_survives_restart() { + // Same persistence contract as the BLOCKS table: lookups must + // survive SDK process restart. Without this, every SDK launch + // would degrade to "cold start until the cache repopulates", + // which defeats the warm-device offline guarantee. + let dir = TempDir::new().unwrap(); + let cid = test_cid(99); + + { + let cache = open_cache(&dir, 1024 * 1024); + cache + .record_key_cid("persist-bucket", "important.bin", &cid) + .expect("record"); + } + { + let cache = open_cache(&dir, 1024 * 1024); + let got = cache + .lookup_cid("persist-bucket", "important.bin") + .expect("lookup") + .expect("hit after restart"); + assert_eq!(got, cid); + } + } + + #[test] + fn test_derive_key_cid_lookup_is_deterministic() { + // Same inputs → same hash. Required for repeated record/lookup + // to land in the same redb key. + let h1 = derive_key_cid_lookup("foo", "bar"); + let h2 = derive_key_cid_lookup("foo", "bar"); + assert_eq!(h1, h2); + } + + #[test] + fn test_derive_key_cid_lookup_separator_prevents_concat_collision() { + // The 0x00 byte between bucket and key is load-bearing. + // Without it, ("foo/bar", "") and ("foo", "/bar") would collide. + // With it, they hash differently because the null byte is + // disambiguating. 
+ let h1 = derive_key_cid_lookup("foo/bar", ""); + let h2 = derive_key_cid_lookup("foo", "/bar"); + assert_ne!(h1, h2, "domain separator must prevent concat-collision"); + } + + #[test] + fn test_derive_key_cid_lookup_outputs_32_bytes() { + let h = derive_key_cid_lookup("any-bucket", "any-key"); + assert_eq!(h.len(), 32, "BLAKE3 output is exactly 32 bytes"); + } + + // ============================================================ + // Phase 3.3.5 — METADATA table tests + // ============================================================ + + #[tokio::test] + async fn test_load_users_index_state_returns_none_on_fresh_cache() { + let dir = TempDir::new().unwrap(); + let cache = open_cache(&dir, 1024 * 1024); + let got = cache.load_users_index_state().expect("load"); + assert!( + got.is_none(), + "fresh cache must have no resolver state — full IPNS+chain resolve required on first run" + ); + } + + #[tokio::test] + async fn test_store_and_load_users_index_state_roundtrip() { + let dir = TempDir::new().unwrap(); + let cache = open_cache(&dir, 1024 * 1024); + let cid = test_cid(0xab); + + cache + .store_users_index_state(&cid, 42, 1_700_000_000) + .expect("store"); + + let (got_cid, got_seq, got_observed) = cache + .load_users_index_state() + .expect("load") + .expect("present"); + assert_eq!(got_cid, cid); + assert_eq!(got_seq, 42); + assert_eq!(got_observed, 1_700_000_000); + } + + #[tokio::test] + async fn test_users_index_state_survives_restart() { + // Replay-defense critical: the `(cid, sequence)` floor MUST + // persist across SDK restarts so a malicious gateway can't + // serve a stale-but-valid payload to a fresh process. 
+ let dir = TempDir::new().unwrap(); + let cid = test_cid(0xee); + + { + let cache = open_cache(&dir, 1024 * 1024); + cache + .store_users_index_state(&cid, 99, 1_700_000_999) + .expect("store"); + } + { + let cache = open_cache(&dir, 1024 * 1024); + let (got_cid, got_seq, got_obs) = cache + .load_users_index_state() + .expect("load") + .expect("survived"); + assert_eq!(got_cid, cid); + assert_eq!(got_seq, 99); + assert_eq!(got_obs, 1_700_000_999); + } + } + + #[tokio::test] + async fn test_store_users_index_state_overwrites() { + // Each successful resolver run writes the latest `(cid, seq, ts)`. + // A subsequent write must overwrite the prior row, not stack. + let dir = TempDir::new().unwrap(); + let cache = open_cache(&dir, 1024 * 1024); + + let cid_v1 = test_cid(1); + cache.store_users_index_state(&cid_v1, 5, 100).expect("v1"); + + let cid_v2 = test_cid(2); + cache.store_users_index_state(&cid_v2, 10, 200).expect("v2"); + + let (got_cid, got_seq, got_obs) = cache + .load_users_index_state() + .expect("load") + .expect("hit"); + assert_eq!(got_cid, cid_v2); + assert_eq!(got_seq, 10); + assert_eq!(got_obs, 200); + } } diff --git a/crates/fula-client/src/client.rs b/crates/fula-client/src/client.rs index af49186..576b69c 100644 --- a/crates/fula-client/src/client.rs +++ b/crates/fula-client/src/client.rs @@ -10,6 +10,17 @@ use reqwest::{Client, Response, header}; use std::collections::HashMap; use std::sync::Arc; use tracing::{debug, instrument}; +// `warn` is only used by the native-only offline-fallback wrapper — +// gate the import so wasm builds don't emit `unused_imports`. +#[cfg(not(target_arch = "wasm32"))] +use tracing::warn; + +#[cfg(not(target_arch = "wasm32"))] +use crate::{ + block_cache::BlockCache, + gateway_fetch::GatewayPool, + registry_resolver::{ResolverConfig, UsersIndexResolver}, +}; /// Fula storage client #[derive(Clone)] @@ -22,6 +33,31 @@ pub struct FulaClient { /// rest. 
`None` when the feature is off — request path then runs /// exactly as before (backward-compat). health_gate: Option>, + + /// Phase 2.2 / 2.4. `Some` when `Config::block_cache_enabled = true` + /// AND the configured path opens successfully. Native-only — wasm + /// builds compile without this field. Used by the offline-fallback + /// wrapper to record `(bucket, key) → cid` and to short-circuit + /// repeated reads of the same content via the BLOCKS table. + #[cfg(not(target_arch = "wasm32"))] + block_cache: Option>, + + /// Phase 2.3 / 2.4. `Some` when `Config::gateway_fallback_enabled + /// = true` AND `block_cache_enabled = true` (the cache is a + /// prerequisite — without it the fallback has no CID to fetch). + /// Native-only. + #[cfg(not(target_arch = "wasm32"))] + gateway_pool: Option>, + + /// Phase 3.3. `Some` when `Config::users_index_resolver_enabled + /// = true` AND all four resolver fields (chain_rpc_url, + /// anchor_address, ipns_name, user_key) are populated. The + /// EncryptedClient cold-start path uses this to discover the + /// per-user `bucketsIndexCid` when KEY_TO_CID misses. + /// Native-only — cold-start is a no-op on wasm until a + /// browser-friendly resolver lands. + #[cfg(not(target_arch = "wasm32"))] + users_index_resolver: Option>, } impl FulaClient { @@ -43,12 +79,147 @@ impl FulaClient { let http = builder.build().map_err(ClientError::Http)?; let health_gate = if config.health_gate_enabled { - Some(Arc::new(HealthGate::new(config.health_gate_ttl))) + // Phase 19 — wire the optional health callback into the + // gate. With_callback fires `Online` / `OfflineFallbackActive` + // on Up↔Down transitions; without one the gate behaves + // identically to pre-Phase-19 builds (silent). 
+ let gate = match config.health_callback.as_ref() { + Some(cb) => HealthGate::with_callback(config.health_gate_ttl, Arc::clone(cb)), + None => HealthGate::new(config.health_gate_ttl), + }; + Some(Arc::new(gate)) + } else { + None + }; + + // Phase 2.2 / 2.4 — block cache + gateway pool. Native-only. + // Construction failures degrade gracefully to "no cache / + // no fallback" rather than failing SDK init outright; the + // operator's other workflows (master-up reads) keep working. + #[cfg(not(target_arch = "wasm32"))] + let block_cache = if config.block_cache_enabled { + match build_block_cache(&config) { + Ok(cache) => Some(Arc::new(cache)), + Err(e) => { + warn!( + error = %e, + "block_cache: failed to open; offline fallback disabled for this session" + ); + None + } + } + } else { + None + }; + + // GatewayPool requires block_cache as a hard prereq: without + // a cached `(bucket, key) → cid` mapping the fallback path has + // no CID to fetch. If the cache failed to open we silently + // disable gateway fallback too. + #[cfg(not(target_arch = "wasm32"))] + let gateway_pool = if config.gateway_fallback_enabled && block_cache.is_some() { + let pool = if config.gateway_fallback_urls.is_empty() { + GatewayPool::default_pool() + } else { + GatewayPool::with_gateways( + config.gateway_fallback_urls.clone(), + config.gateway_race_concurrency.max(1), + ) + }; + Some(Arc::new(pool)) + } else { + None + }; + + // Phase 3.3 — cold-start hybrid resolver. Configured iff + // ALL four required fields are populated (no separate + // `enabled` bool — field presence is the single source of + // truth, per the audit-driven simplification documented on + // Config). Fails closed: any missing field → resolver stays + // None and cold-start surfaces UsersIndexResolutionFailed + // at the call site rather than imploding SDK init. 
+ #[cfg(not(target_arch = "wasm32"))] + let users_index_resolver = if !config.users_index_chain_rpc_url.is_empty() + && !config.users_index_anchor_address.is_empty() + && !config.users_index_ipns_name.is_empty() + && config.users_index_user_key.is_some() + { + let mut resolver_cfg = ResolverConfig::new( + config.users_index_chain_rpc_url.clone(), + config.users_index_anchor_address.clone(), + config.users_index_ipns_name.clone(), + ); + // Phase 3.3 gateway overrides — empty Vec = use defaults. + // Operators (and tests) can pin custom gateways here. + if !config.users_index_ipns_gateway_urls.is_empty() { + resolver_cfg.ipns_gateways = config.users_index_ipns_gateway_urls.clone(); + } + if !config.users_index_ipfs_gateway_urls.is_empty() { + resolver_cfg.ipfs_gateways = config.users_index_ipfs_gateway_urls.clone(); + } + // Phase 3.3.5 — wire the BlockCache into the resolver + // when both are configured. The cache enables hot-start + // (replay-defense floor seeded across restarts; full + // network round-trip skipped within `soft_ttl`). When + // BlockCache is disabled, the resolver still works — + // just without the on-disk persistence layer. + let resolver_result = match block_cache.as_ref() { + Some(cache) => UsersIndexResolver::new_with_cache(resolver_cfg, Arc::clone(cache)), + None => UsersIndexResolver::new(resolver_cfg), + }; + match resolver_result { + Ok(r) => Some(Arc::new(r)), + Err(e) => { + warn!( + error = %e, + "users_index_resolver: construction failed; cold-start unavailable for this session" + ); + None + } + } } else { None }; - Ok(Self { config, http, health_gate }) + Ok(Self { + config, + http, + health_gate, + #[cfg(not(target_arch = "wasm32"))] + block_cache, + #[cfg(not(target_arch = "wasm32"))] + gateway_pool, + #[cfg(not(target_arch = "wasm32"))] + users_index_resolver, + }) + } + + /// Phase 3.3 — accessor for the cold-start hybrid resolver. 
/// Returns `Some` only when all four resolver config fields
+    /// (`chain_rpc_url`, `anchor_address`, `ipns_name`, `user_key`)
+    /// are populated — there is no separate enabled flag — AND
+    /// construction succeeded. Native-only.
+    #[cfg(not(target_arch = "wasm32"))]
+    pub fn users_index_resolver(&self) -> Option<&Arc<UsersIndexResolver>> {
+ #[cfg(not(target_arch = "wasm32"))] + pub(crate) fn fire_health_event(&self, event: crate::health_gate::MasterHealthEvent) { + if let Some(cb) = self.config.health_callback.as_ref() { + let cb = Arc::clone(cb); + let event_clone = event.clone(); + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(move || { + cb(event_clone); + })); + if result.is_err() { + tracing::warn!( + event = ?event, + "health_callback panicked (cold-start path); SDK proceeding" + ); + } + } + } + /// Access the pooled HTTP client for internal modules (e.g. multipart /// helpers) that need to issue raw requests. Exposing this keeps /// connection pooling and configured timeouts intact instead of minting @@ -344,19 +545,19 @@ impl FulaClient { ) -> Result { let path = format!("/{}/{}", bucket, key); let response = self.request("GET", &path, None, None, None).await?; - + let headers = response.headers(); let etag = headers .get("ETag") .and_then(|v| v.to_str().ok()) .map(|s| s.trim_matches('"').to_string()) .unwrap_or_default(); - + let content_type = headers .get("Content-Type") .and_then(|v| v.to_str().ok()) .map(|s| s.to_string()); - + let content_length = headers .get("Content-Length") .and_then(|v| v.to_str().ok()) @@ -384,6 +585,252 @@ impl FulaClient { }) } + /// Phase 2.4 — `get_object` with offline fallback to public IPFS + /// gateways when master is unreachable. + /// + /// Behavior matrix: + /// + /// | State | Behavior | source | freshness | + /// |----------------------------------------|-------------------------------------------------------------|------------------------------|-------------------------| + /// | flags off | Identical to `get_object_with_metadata` (backward-compat). | `Master` | `Live` | + /// | flags on, master up, master responds | Serve master bytes; populate KEY_TO_CID + BLOCKS. | `Master` | `Live` | + /// | flags on, master down, KEY_TO_CID hit | Race the gateway pool for the cached CID; verify; populate. 
| `Gateway(url)` or `LocalCache` | `Cached { observed_at }` | + /// | flags on, master down, KEY_TO_CID miss | Return `MasterUnreachable` (cold-start; Phase 3.3 territory).| n/a | n/a | + /// | wasm32 target | Always delegates to `get_object_with_metadata` (no cache / | `Master` | `Live` | + /// | | gateway race plumbing on web). | | | + /// + /// Etag rewrite: when the bytes come from the gateway race the + /// returned `OfflineGetResult.inner.etag` is set to `cid.to_string()` + /// so downstream callers (e.g., `load_forest_internal`) see the + /// same ETag-as-CID convention master uses on the fast path. + /// + /// **Known offline-path difference (Phase 2.4 v1):** when bytes + /// come from the gateway race or BLOCKS cache, the returned + /// `inner.metadata` is **empty** and `content_type` is `None`. + /// Master-up responses still surface `x-amz-meta-*` headers in + /// `metadata`. Encrypted-SDK callers never read user-metadata, so + /// this is invisible to them; app-level callers that depend on + /// user-metadata should treat the offline path as metadata-stripped. + /// + /// **Phase 19 — return type changed to `OfflineGetResult`.** The + /// extra fields `source: ReadSource` and `freshness: ReadFreshness` + /// let apps surface "you're offline; reading from cache" UI without + /// observing internal state. Existing callers extract `.inner.data` + /// / `.inner.etag` to access the bytes (one-line change). + /// + /// **Breaking change vs. Phase 2.4:** the previous `Result` + /// signature is gone. Audit (2026-05-02) established no external + /// SDK consumers — Phase 2.4 GET-path wiring (task #15) is still + /// pending — so today's blast radius is zero. Document this in + /// the next release note so the Phase 2.4 wiring lands with the + /// new signature and doesn't accidentally inherit a backward-compat + /// expectation. 
Internal callers (S3BlobBackend, encrypted + /// cold-start) are already updated; their bytes are accessed via + /// `result.inner.data`. + #[cfg(not(target_arch = "wasm32"))] + #[instrument(skip(self))] + pub async fn get_object_with_offline_fallback( + &self, + bucket: &str, + key: &str, + ) -> Result { + // Fast path — if neither flag is on, this is byte-identical + // to the existing call. The new method costs nothing in + // existing deployments. + if self.block_cache.is_none() && self.gateway_pool.is_none() { + let inner = self.get_object_with_metadata(bucket, key).await?; + return Ok(OfflineGetResult { + inner, + source: ReadSource::Master, + freshness: ReadFreshness::Live, + }); + } + + let cache = self.block_cache.clone(); + + // Master attempt. If health gate already says Down, request() + // short-circuits before touching the network. Otherwise we + // hit master normally. + match self.get_object_with_metadata(bucket, key).await { + Ok(result) => { + // Master-up success path: record the CID side-effect. + // Skip if etag is empty (defensive: every master + // response should have one, but a future endpoint + // change shouldn't break the wrapper). + if let Some(cache) = &cache { + if !result.etag.is_empty() { + if let Ok(cid) = result.etag.parse::() { + // Both writes are best-effort: a redb error + // logs and proceeds (the master read already + // succeeded, so the user gets their bytes). + if let Err(e) = cache.record_key_cid(bucket, key, &cid) { + debug!( + error = %e, + "block_cache: record_key_cid failed (best-effort; master fetch already succeeded)" + ); + } + // Cache the bytes themselves so a subsequent + // master-down read can serve them without + // any network round-trip at all. + if let Err(e) = cache.put(&cid, &result.data).await { + // BlockTooLarge is expected for huge + // objects (>cache budget); not a bug. 
+ debug!( + error = %e, + "block_cache: put failed (best-effort)" + ); + } + } + } + } + Ok(OfflineGetResult { + inner: result, + source: ReadSource::Master, + freshness: ReadFreshness::Live, + }) + } + Err(e) if is_master_unreachable_error(&e) => { + // Master-down: try the offline path. Requires the + // cache + pool to be set AND a prior master-up read + // for this `(bucket, key)` to have populated KEY_TO_CID. + self.try_offline_fallback(bucket, key, e).await + } + // Non-master-down errors (4xx, auth failures, etc.) + // propagate without any fallback attempt — they're not + // about availability. + Err(e) => Err(e), + } + } + + /// Wasm version: no offline fallback infrastructure exists on + /// browsers (block_cache + gateway_fetch are gated out). Delegate + /// to the regular method so call sites can use one name across + /// targets without additional `cfg` gates of their own. + #[cfg(target_arch = "wasm32")] + pub async fn get_object_with_offline_fallback( + &self, + bucket: &str, + key: &str, + ) -> Result { + let inner = self.get_object_with_metadata(bucket, key).await?; + Ok(OfflineGetResult { + inner, + source: ReadSource::Master, + freshness: ReadFreshness::Live, + }) + } + + /// Phase 2.4 fallback step. Looks up the cached CID for the + /// requested `(bucket, key)`; if absent, returns the original + /// `MasterUnreachable` error (cold-start case — Phase 3.3 catches + /// it). If present, races the gateway pool for that CID; on + /// verification success, populates BLOCKS and returns a synthesized + /// `OfflineGetResult` with `source = LocalCache` (BLOCKS hit) or + /// `source = Gateway(url_template)` (gateway race), and + /// `freshness = Cached { observed_at }`. On any gateway-side + /// failure, propagates the original master-down error so the + /// caller sees a stable error type regardless of which channel + /// ultimately failed. 
+ #[cfg(not(target_arch = "wasm32"))] + async fn try_offline_fallback( + &self, + bucket: &str, + key: &str, + master_error: ClientError, + ) -> Result { + let (cache, pool) = match (&self.block_cache, &self.gateway_pool) { + (Some(c), Some(p)) => (c.clone(), p.clone()), + _ => return Err(master_error), + }; + + // Step 1 — translate (bucket, key) → CID via the warm-cache + // table populated during prior master-up reads. Cold-start + // misses return MasterUnreachable so the app can show + // "offline mode unavailable for this object yet". + let cid = match cache.lookup_cid(bucket, key) { + Ok(Some(cid)) => cid, + Ok(None) => { + debug!( + bucket = %bucket, key = %key, + "offline fallback: no cached CID for this object (cold-start; needs Phase 3.3)" + ); + return Err(master_error); + } + Err(e) => { + warn!(error = %e, "offline fallback: lookup_cid failed"); + return Err(master_error); + } + }; + + // Step 2 — BLOCKS hit short-circuits the network entirely. + // Cheap: a single redb read. + if let Ok(Some(bytes)) = cache.get(&cid) { + debug!(cid = %cid, "offline fallback: BLOCKS hit"); + let observed_at = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis() as u64) + .unwrap_or(0); + return Ok(OfflineGetResult { + inner: GetObjectResult { + content_length: bytes.len() as u64, + data: bytes, + etag: cid.to_string(), + content_type: None, + last_modified: None, + metadata: HashMap::new(), + }, + source: ReadSource::LocalCache, + freshness: ReadFreshness::Cached { observed_at }, + }); + } + + // Step 3 — race the gateway pool. fetch_verified handles the + // CID verification (verify_cid_against_bytes) internally; + // bytes returned here are guaranteed to content-address to + // the requested CID. The accompanying URL template records + // which gateway won the race for transparency surfacing. 
+ match pool.fetch_verified_with_source(&cid, &self.http).await { + Ok((bytes, gateway_url)) => { + debug!(cid = %cid, gateway = %gateway_url, "offline fallback: gateway race succeeded"); + // Populate BLOCKS so the next read of this object + // serves entirely locally. BlockTooLarge is the only + // expected failure (huge objects); fall through and + // still return the bytes to the caller. + if let Err(e) = cache.put(&cid, &bytes).await { + debug!(error = %e, "offline fallback: BLOCKS put failed (best-effort)"); + } + let observed_at = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis() as u64) + .unwrap_or(0); + Ok(OfflineGetResult { + inner: GetObjectResult { + content_length: bytes.len() as u64, + data: bytes, + etag: cid.to_string(), + content_type: None, + last_modified: None, + metadata: HashMap::new(), + }, + source: ReadSource::Gateway(gateway_url), + freshness: ReadFreshness::Cached { observed_at }, + }) + } + Err(e) => { + warn!( + cid = %cid, + error = %e, + "offline fallback: gateway race failed" + ); + // Propagate the original master error rather than the + // gateway error — callers expect a single failure type + // for "the object is unreachable" and the gateway-race + // error is a secondary signal. + Err(master_error) + } + } + } + /// Check if an object exists #[instrument(skip(self))] pub async fn object_exists(&self, bucket: &str, key: &str) -> Result { @@ -672,6 +1119,89 @@ impl FulaClient { } } +/// Phase 2.4 — classify which error variants represent "master is +/// unreachable" for the purpose of triggering the gateway-race +/// fallback. Tightly scoped to: +/// - explicit `MasterUnreachable` from the health gate short-circuit, +/// - connection-level `Http` errors (DNS, RST, refused, timeout — +/// reqwest::Error wraps these), +/// - 5xx server errors (master is up but failing). +/// +/// 4xx (auth, not-found, precondition-failed, etc.) 
do NOT count: the +/// server responded correctly, the request was just refused. Falling +/// back to gateway race in those cases would mask real bugs. +/// +/// Native-only because the only caller (`try_offline_fallback`) is +/// gated to `cfg(not(target_arch = "wasm32"))`. Defining it here +/// without gates would yield a dead-code warning on wasm builds. +#[cfg(not(target_arch = "wasm32"))] +fn is_master_unreachable_error(e: &ClientError) -> bool { + match e { + ClientError::MasterUnreachable { .. } => true, + // reqwest::Error: cover the common transport failures. We + // can't easily distinguish "DNS down" from "connection RST" + // without inspecting the inner — for our purposes both are + // "master is unreachable". + ClientError::Http(re) => { + // `is_connect()` exists on native reqwest but not on the + // wasm32 build — guard it. On wasm the offline path is a + // no-op anyway (gated out at the call site), so the + // narrower native-only classification suffices. + // + // We DELIBERATELY do NOT include `is_request()` (audit + // follow-up): that variant covers body-build errors, + // redirect-loops, URL parsing — half are app bugs (bad + // bucket name, malformed header) that the fallback would + // mask. Limit to connect/timeout/5xx — the trio that + // genuinely means "master is unreachable right now". + #[cfg(not(target_arch = "wasm32"))] + let is_connect = re.is_connect(); + #[cfg(target_arch = "wasm32")] + let is_connect = false; + + is_connect + || re.is_timeout() + || matches!(re.status(), Some(s) if s.is_server_error()) + } + ClientError::S3Error { code, .. } => { + // 5xx surfaces as S3Error with a status-derived code. + code.starts_with("HTTP5") || code == "InternalError" || code == "ServiceUnavailable" + || code == "SlowDown" + } + _ => false, + } +} + +// ==================== Phase 2.4 helpers ==================== + +/// Resolve the on-disk path for the block cache. 
Honors
+/// `Config::block_cache_path` if set; otherwise falls back to the
+/// platform's local data directory under `fula/cache/blocks.redb`.
+/// Native-only; the function is not compiled into the wasm target
+/// because BlockCache itself isn't.
+#[cfg(not(target_arch = "wasm32"))]
+fn resolve_block_cache_path(config: &Config) -> std::path::PathBuf {
+    if let Some(p) = &config.block_cache_path {
+        return p.clone();
+    }
+    // dirs::data_local_dir() returns the platform-conventional data
+    // root: ~/.local/share on Linux, ~/Library/Application Support on
+    // macOS, %LOCALAPPDATA% on Windows. Falls back to the relative
+    // ./fula/cache/blocks.redb if dirs cannot resolve a home directory
+    // (extremely rare; common in CI containers without HOME set).
+    let base = dirs::data_local_dir().unwrap_or_else(|| std::path::PathBuf::from("."));
+    base.join("fula").join("cache").join("blocks.redb")
+}
+
+/// Open the BlockCache for `config`. Returns the typed
+/// BlockCacheError on any failure so the caller can decide whether
+/// to disable the offline path or surface it.
+#[cfg(not(target_arch = "wasm32"))]
+fn build_block_cache(config: &Config) -> std::result::Result<BlockCache, BlockCacheError> {
+    let path = resolve_block_cache_path(config);
+    BlockCache::open(path, config.block_cache_max_bytes)
+}
+
 // ==================== Response Parsers ====================
 
 fn parse_list_buckets_response(xml: &str) -> Result {
@@ -822,4 +1352,513 @@ mod tests {
         assert_eq!(result.buckets.len(), 1);
         assert_eq!(result.buckets[0].name, "bucket1");
     }
+
+    // ============================================================
+    // Phase 2.4 — offline-fallback wrapper helper-fn tests
+    // ============================================================
+    //
+    // The integration tests for the full wrapper (master + gateway
+    // wiremock combo) live in `phase_2_4_offline_tests` below. These
+    // smaller unit tests cover the classification helper that decides
+    // when to attempt the offline path, without spinning up a server.
+ + #[test] + fn test_master_unreachable_classifier_explicit_variant() { + let e = ClientError::MasterUnreachable { down_for_secs: 5 }; + assert!(is_master_unreachable_error(&e)); + } + + #[test] + fn test_master_unreachable_classifier_5xx_s3_codes() { + // 5xx surfaces as S3Error with a code derived from the body + // or the status line. All these forms must be classified as + // master-unreachable so the offline path triggers. + for code in &["HTTP500", "HTTP502", "HTTP503", "InternalError", "ServiceUnavailable", "SlowDown"] { + let e = ClientError::S3Error { + code: (*code).into(), + message: "x".into(), + request_id: None, + }; + assert!(is_master_unreachable_error(&e), "code={} should classify as master-unreachable", code); + } + } + + #[test] + fn test_master_unreachable_classifier_excludes_4xx() { + // 4xx must NOT trigger fallback — server responded, request + // was simply refused. Falling back here would mask real auth + // / not-found issues. + for code in &["NoSuchKey", "NoSuchBucket", "AccessDenied", "PreconditionFailed", "HTTP404", "HTTP403"] { + let e = ClientError::S3Error { + code: (*code).into(), + message: "x".into(), + request_id: None, + }; + assert!(!is_master_unreachable_error(&e), "code={} must NOT classify as master-unreachable", code); + } + } + + #[test] + fn test_master_unreachable_classifier_excludes_other_variants() { + // Encryption / config / NotFound / etc. are not master-down. + let e = ClientError::Config("bad".into()); + assert!(!is_master_unreachable_error(&e)); + let e = ClientError::BucketNotFound("b".into()); + assert!(!is_master_unreachable_error(&e)); + let e = ClientError::ConcurrentModification("etag mismatch".into()); + assert!(!is_master_unreachable_error(&e)); + } + + /// Audit follow-up: request-build errors (URL parsing, malformed + /// headers, body-encoding) must NOT classify as master-unreachable. 
+ /// Including `re.is_request()` would mask "I gave the SDK a bad + /// bucket name" by silently falling back to the gateway race. + /// Construct a request-build error by passing an invalid URL. + #[tokio::test] + async fn test_master_unreachable_classifier_excludes_request_build_errors() { + let http = reqwest::Client::new(); + // Building a request to a malformed URL fails at request-build + // time, before any network I/O. reqwest classifies this as + // is_builder() / is_request() — NOT is_connect() / is_timeout(). + let result = http.get("ht!tp://bad").build(); + let req_err = match result { + Err(e) => e, + Ok(_) => { + // If reqwest happened to accept this URL, try sending + // it; the send will fail with a different request error. + http.get("ht!tp://bad").send().await.unwrap_err() + } + }; + let wrapped = ClientError::Http(req_err); + assert!( + !is_master_unreachable_error(&wrapped), + "request-build / URL-parse errors must NOT classify as master-unreachable" + ); + } + + // Resolve-block-cache-path is native-only (uses dirs crate). + #[cfg(not(target_arch = "wasm32"))] + #[test] + fn test_resolve_block_cache_path_uses_explicit_when_set() { + let mut config = Config::default(); + config.block_cache_path = Some(std::path::PathBuf::from("/tmp/explicit/blocks.redb")); + let p = resolve_block_cache_path(&config); + assert_eq!(p, std::path::PathBuf::from("/tmp/explicit/blocks.redb")); + } + + #[cfg(not(target_arch = "wasm32"))] + #[test] + fn test_resolve_block_cache_path_uses_platform_default_when_unset() { + let config = Config::default(); + let p = resolve_block_cache_path(&config); + // The exact path depends on the host OS, but it must end in + // the documented "fula/cache/blocks.redb" suffix. 
+ let s = p.to_string_lossy().replace('\\', "/"); + assert!( + s.ends_with("fula/cache/blocks.redb"), + "expected platform default to end with 'fula/cache/blocks.redb', got: {}", + s + ); + } +} + +// ============================================================ +// Phase 2.4 — offline-fallback integration tests (wiremock) +// ============================================================ +// +// These tests spin up: +// 1. A wiremock master at 127.0.0.1: +// 2. A wiremock gateway at 127.0.0.1:/ipfs/{cid} +// +// They exercise the wrapper end-to-end: +// - flags off → no cache, no fallback, byte-identical to old behavior +// - master up → cache populated (KEY_TO_CID + BLOCKS) +// - master down + cache hit → gateway race serves bytes +// - master down + cache miss → MasterUnreachable surfaces +// - master 5xx → fallback triggers +// - master 4xx → fallback does NOT trigger (auth/not-found preserved) +// +// Native-only: wiremock + block_cache aren't compiled into wasm builds. + +#[cfg(not(target_arch = "wasm32"))] +#[cfg(test)] +mod phase_2_4_offline_tests { + use super::*; + use crate::block_cache::BlockCache; + use cid::Cid; + use cid::multihash::Multihash; + use sha2::Digest; + use std::sync::Arc; + use std::time::Duration; + use tempfile::TempDir; + use wiremock::matchers::{method, path}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + /// Compute the CID master would set as ETag for the given + /// payload. Master uses CIDv1 + raw codec + sha2-256 multihash + /// for direct S3-PUT objects (per `object.rs:103-105` and + /// `cid_utils::create_cid`). For tests we mirror that exactly so + /// `verify_cid_against_bytes` will pass. + fn cid_for_bytes(data: &[u8]) -> Cid { + let digest = sha2::Sha256::digest(data); + let mh = Multihash::<64>::wrap(0x12 /* sha2-256 */, &digest).unwrap(); + Cid::new_v1(0x55 /* raw */, mh) + } + + /// Helper: build a FulaClient pointed at `master_url` with + /// `gateway_url` in its fallback list. Cache lives in `cache_path`. 
+ fn build_client( + master_url: &str, + cache_path: &std::path::Path, + gateway_url_template: &str, + ) -> FulaClient { + let mut config = Config::new(master_url); + config.timeout = Duration::from_secs(2); + config.block_cache_enabled = true; + config.block_cache_path = Some(cache_path.to_path_buf()); + config.block_cache_max_bytes = 1024 * 1024; + config.gateway_fallback_enabled = true; + config.gateway_fallback_urls = vec![gateway_url_template.to_string()]; + config.gateway_race_concurrency = 1; + // Health gate off — these tests construct the master-down + // signal via 5xx responses or a stopped wiremock; gate + // semantics are exercised separately in health_gate.rs. + config.health_gate_enabled = false; + FulaClient::new(config).expect("client") + } + + #[tokio::test] + async fn test_flags_off_byte_identical_to_get_object_with_metadata() { + // Backward-compat: if neither flag is set, the wrapper must + // delegate to get_object_with_metadata with no observable + // difference (no extra cache writes, no extra network calls). + let master = MockServer::start().await; + let body = b"some bytes"; + let cid = cid_for_bytes(body); + Mock::given(method("GET")) + .and(path("/bucket/key.txt")) + .respond_with( + ResponseTemplate::new(200) + .insert_header("ETag", format!("\"{}\"", cid)) + .set_body_bytes(body.as_slice()), + ) + .expect(1) + .mount(&master) + .await; + + let mut config = Config::new(master.uri()); + // Both flags OFF — backward-compat scenario. + config.block_cache_enabled = false; + config.gateway_fallback_enabled = false; + let client = FulaClient::new(config).expect("client"); + + let r = client + .get_object_with_offline_fallback("bucket", "key.txt") + .await + .expect("get"); + // Phase 19: result is OfflineGetResult; bytes/etag on .inner. 
+ assert_eq!(r.inner.data.as_ref(), body); + assert_eq!(r.inner.etag, cid.to_string()); + assert_eq!(r.source, ReadSource::Master); + assert_eq!(r.freshness, ReadFreshness::Live); + } + + #[tokio::test] + async fn test_master_up_populates_key_to_cid_and_blocks() { + let master = MockServer::start().await; + let body = b"payload bytes for cache"; + let cid = cid_for_bytes(body); + Mock::given(method("GET")) + .and(path("/bucket/file.bin")) + .respond_with( + ResponseTemplate::new(200) + .insert_header("ETag", format!("\"{}\"", cid)) + .set_body_bytes(body.as_slice()), + ) + .mount(&master) + .await; + + let dir = TempDir::new().unwrap(); + let cache_path = dir.path().join("cache.redb"); + let client = build_client( + &master.uri(), + &cache_path, + "http://unused.invalid/ipfs/{cid}", + ); + + let r = client + .get_object_with_offline_fallback("bucket", "file.bin") + .await + .expect("get"); + assert_eq!(r.inner.data.as_ref(), body); + assert_eq!(r.source, ReadSource::Master); + assert_eq!(r.freshness, ReadFreshness::Live); + + // Drop the client (and its BlockCache Arc) so we can re-open + // the on-disk file for inspection. redb holds an exclusive + // file lock; AlreadyOpen otherwise. + drop(client); + + // Cache must have been populated as a side-effect. + let cache = BlockCache::open(&cache_path, 1024 * 1024).expect("re-open cache"); + let looked_up = cache.lookup_cid("bucket", "file.bin").expect("lookup").expect("hit"); + assert_eq!(looked_up, cid, "KEY_TO_CID must record the master's etag"); + let bytes = cache.get(&cid).expect("get").expect("BLOCKS hit"); + assert_eq!(bytes.as_ref(), body, "BLOCKS table must hold the payload"); + } + + #[tokio::test] + async fn test_master_down_with_cached_cid_falls_back_to_gateway() { + // Phase: warm-up against master, then simulate master-down + // and verify the gateway race fills in. 
+ let master = MockServer::start().await; + let gateway = MockServer::start().await; + let body = b"served by gateway after master goes dark"; + let cid = cid_for_bytes(body); + + // Master serves the file ONCE, populating the cache. + Mock::given(method("GET")) + .and(path("/bucket/file.txt")) + .respond_with( + ResponseTemplate::new(200) + .insert_header("ETag", format!("\"{}\"", cid)) + .set_body_bytes(body.as_slice()), + ) + .up_to_n_times(1) + .mount(&master) + .await; + // Subsequent master requests fail with 503. + Mock::given(method("GET")) + .and(path("/bucket/file.txt")) + .respond_with(ResponseTemplate::new(503)) + .mount(&master) + .await; + + // Gateway always serves the same bytes. + let gateway_path = format!("/ipfs/{}", cid); + Mock::given(method("GET")) + .and(path(gateway_path.clone())) + .respond_with(ResponseTemplate::new(200).set_body_bytes(body.as_slice())) + .mount(&gateway) + .await; + + let dir = TempDir::new().unwrap(); + let cache_path = dir.path().join("cache.redb"); + let gateway_template = format!("{}/ipfs/{{cid}}", gateway.uri()); + let client = build_client(&master.uri(), &cache_path, &gateway_template); + + // Read 1: master up — populates cache. + let r1 = client + .get_object_with_offline_fallback("bucket", "file.txt") + .await + .expect("master read"); + assert_eq!(r1.inner.data.as_ref(), body); + assert_eq!(r1.source, ReadSource::Master); + + // Drop the in-process BLOCKS entry to force the gateway race + // (otherwise step 2 would short-circuit on a BLOCKS hit and + // we wouldn't be testing the fallback path). + // We do this by opening a fresh client without the populated + // cache — but actually keeping the same on-disk cache is what + // we want; just clear BLOCKS while keeping KEY_TO_CID. + // Simpler: we test against a SECOND client that re-uses the + // same cache file; since BLOCKS is populated by step 1, we'd + // expect a BLOCKS hit on read 2. 
So we'll first open a client + // with a different cache path (no warm-up), then manually + // call record_key_cid → that simulates "warm KEY_TO_CID, cold + // BLOCKS" which is the realistic scenario after a long enough + // outage. + let dir2 = TempDir::new().unwrap(); + let cache_path2 = dir2.path().join("cache2.redb"); + let cache2 = BlockCache::open(&cache_path2, 1024 * 1024).expect("open"); + cache2.record_key_cid("bucket", "file.txt", &cid).expect("seed mapping"); + drop(cache2); + + let client2 = build_client(&master.uri(), &cache_path2, &gateway_template); + let r2 = client2 + .get_object_with_offline_fallback("bucket", "file.txt") + .await + .expect("offline path read"); + assert_eq!(r2.inner.data.as_ref(), body, "gateway must have served the bytes"); + assert_eq!(r2.inner.etag, cid.to_string(), "synthesized etag = cid"); + // Phase 19: gateway-served bytes get a Gateway(url) source + + // Cached freshness. The URL template should match the + // configured gateway template (NOT the per-CID-substituted URL). + match &r2.source { + ReadSource::Gateway(url) => { + assert_eq!(url, &gateway_template, "source URL = configured gateway template"); + } + other => panic!("expected ReadSource::Gateway, got {:?}", other), + } + match r2.freshness { + ReadFreshness::Cached { .. } => { /* ok */ } + other => panic!("expected ReadFreshness::Cached, got {:?}", other), + } + } + + #[tokio::test] + async fn test_master_down_no_cached_cid_returns_master_unreachable() { + // Cold-start case: SDK has never read this object before, so + // KEY_TO_CID has no entry. Wrapper must surface the original + // master-down error rather than swallow it — Phase 3.3 will + // pick it up later. 
+ let master = MockServer::start().await; + Mock::given(method("GET")) + .and(path("/bucket/never-read.txt")) + .respond_with(ResponseTemplate::new(503)) + .mount(&master) + .await; + + let dir = TempDir::new().unwrap(); + let cache_path = dir.path().join("cache.redb"); + let client = build_client( + &master.uri(), + &cache_path, + "http://unused.invalid/ipfs/{cid}", + ); + + let result = client + .get_object_with_offline_fallback("bucket", "never-read.txt") + .await; + assert!(result.is_err(), "no cached CID → must propagate master-down"); + let err = result.unwrap_err(); + // Either the explicit MasterUnreachable variant (if health + // gate were involved) or an S3Error with HTTP503 code is + // acceptable here. The point is: NOT Ok, and NOT silently + // swallowed. + assert!( + is_master_unreachable_error(&err), + "error must classify as master-unreachable: {:?}", + err + ); + } + + #[tokio::test] + async fn test_master_4xx_does_not_trigger_fallback() { + // 4xx (auth, not-found) surfaces as S3Error and MUST propagate + // unchanged. The fallback path would mask real bugs (e.g., + // a typo in the bucket name yielding NoSuchBucket). 
+        let master = MockServer::start().await;
+        Mock::given(method("GET"))
+            .and(path("/bucket/missing.txt"))
+            .respond_with(
+                ResponseTemplate::new(404)
+                    .set_body_string(r#"<Error><Code>NoSuchKey</Code><Message>not here</Message></Error>"#),
+            )
+            .mount(&master)
+            .await;
+
+        let dir = TempDir::new().unwrap();
+        let cache_path = dir.path().join("cache.redb");
+        let client = build_client(
+            &master.uri(),
+            &cache_path,
+            "http://unused.invalid/ipfs/{cid}",
+        );
+
+        let err = client
+            .get_object_with_offline_fallback("bucket", "missing.txt")
+            .await
+            .expect_err("404 propagates");
+        assert!(err.is_not_found(), "expected NotFound, got: {:?}", err);
+        assert!(
+            !is_master_unreachable_error(&err),
+            "4xx must NOT classify as master-unreachable"
+        );
+    }
+
+    #[tokio::test]
+    async fn test_master_down_gateway_failure_propagates_original_error() {
+        // If the offline path tries to fetch via the gateway race AND
+        // the race exhausts (all gateways down), the wrapper must
+        // surface the ORIGINAL master-down error so callers see a
+        // single failure type. The gateway-side error is already
+        // logged at warn level (operators can debug from logs).
+        let master = MockServer::start().await;
+        Mock::given(method("GET"))
+            .and(path("/bucket/x.txt"))
+            .respond_with(ResponseTemplate::new(503))
+            .mount(&master)
+            .await;
+
+        // Gateway always 500s — race will exhaust.
+ let gateway = MockServer::start().await; + Mock::given(method("GET")) + .respond_with(ResponseTemplate::new(500)) + .mount(&gateway) + .await; + + let dir = TempDir::new().unwrap(); + let cache_path = dir.path().join("cache.redb"); + let body = b"would have been served"; + let cid = cid_for_bytes(body); + let cache = BlockCache::open(&cache_path, 1024 * 1024).expect("open"); + cache.record_key_cid("bucket", "x.txt", &cid).expect("seed"); + drop(cache); + + let gateway_template = format!("{}/ipfs/{{cid}}", gateway.uri()); + let client = build_client(&master.uri(), &cache_path, &gateway_template); + + let err = client + .get_object_with_offline_fallback("bucket", "x.txt") + .await + .expect_err("both channels failed"); + assert!( + is_master_unreachable_error(&err), + "must surface master-unreachable, not a gateway-specific error" + ); + } + + // ============================================================ + // Phase 19 — transparency surfaces on the offline path + // ============================================================ + + #[tokio::test] + async fn test_phase19_blocks_hit_carries_local_cache_source() { + // Advisor-mandated test #3: when BLOCKS already holds the + // bytes (e.g., from a prior master-up read), the offline path + // serves them from local cache and the result carries + // `ReadSource::LocalCache` + `ReadFreshness::Cached`. No + // network round-trip happens at all. + let master = MockServer::start().await; + // Master is unreachable (every request 503s). + Mock::given(method("GET")) + .and(path("/bucket/cached.txt")) + .respond_with(ResponseTemplate::new(503)) + .mount(&master) + .await; + + let body = b"already cached locally"; + let cid = cid_for_bytes(body); + + // Pre-populate BOTH KEY_TO_CID and BLOCKS so the offline + // fallback's BLOCKS hit short-circuits before any gateway + // race attempt. 
+ let dir = TempDir::new().unwrap(); + let cache_path = dir.path().join("cache.redb"); + let cache = BlockCache::open(&cache_path, 1024 * 1024).expect("open cache"); + cache.record_key_cid("bucket", "cached.txt", &cid).expect("seed key→cid"); + cache.put(&cid, body).await.expect("seed BLOCKS"); + drop(cache); + + // Use a gateway URL that would FAIL if the gateway race were + // even attempted — proves the BLOCKS hit short-circuited. + let gateway_template = "http://gateway-must-not-be-called.invalid/ipfs/{cid}"; + let client = build_client(&master.uri(), &cache_path, gateway_template); + + let r = client + .get_object_with_offline_fallback("bucket", "cached.txt") + .await + .expect("BLOCKS hit serves bytes"); + + assert_eq!(r.inner.data.as_ref(), body); + assert_eq!(r.inner.etag, cid.to_string(), "synthesized etag = cid"); + assert_eq!(r.source, ReadSource::LocalCache, "BLOCKS hit → LocalCache"); + match r.freshness { + ReadFreshness::Cached { observed_at } => { + assert!(observed_at > 0, "Cached.observed_at must be set"); + } + other => panic!("expected ReadFreshness::Cached, got {:?}", other), + } + } } diff --git a/crates/fula-client/src/config.rs b/crates/fula-client/src/config.rs index b9187cd..cbbb5b1 100644 --- a/crates/fula-client/src/config.rs +++ b/crates/fula-client/src/config.rs @@ -1,9 +1,17 @@ //! Client configuration +use std::path::PathBuf; use std::time::Duration; +use crate::health_gate::HealthCallback; + /// Client configuration -#[derive(Clone, Debug)] +/// +/// Note: `Config` is `Clone` but the `health_callback` shares the +/// underlying `Arc` across clones — there's exactly one +/// callback closure per logical SDK construction, fired by every +/// `FulaClient` clone derived from this config. 
+#[derive(Clone)] pub struct Config { /// Gateway endpoint URL pub endpoint: String, @@ -54,6 +62,150 @@ pub struct Config { /// this duration elapses, the next request is allowed through as a /// probe (without resetting state — only an observed success resets). pub health_gate_ttl: Duration, + + /// Phase 2.2 of master-independent reads: enable the on-disk LRU + /// block cache. Off by default. Native-only — `wasm32` ignores + /// this flag (the redb-backed cache cannot open in browsers). + /// When enabled, master-up reads observe and persist the + /// `(bucket, key) → cid` mapping the offline path needs. + pub block_cache_enabled: bool, + + /// Filesystem path for the block-cache redb database. `None` means + /// "use the platform default" (resolved at SDK init via the + /// `dirs` crate's `data_local_dir()`). Operators can override + /// this for tests or non-standard deployments. Native-only. + pub block_cache_path: Option, + + /// Maximum on-disk bytes for the block cache. Defaults to 256 MiB + /// per plan §2.2. The cache evicts to 80 % of this watermark when + /// `put` would push it past `max_bytes`. Native-only. + pub block_cache_max_bytes: u64, + + /// Phase 2.4 of master-independent reads: enable falling back to + /// public IPFS gateways when master is unreachable AND the SDK has + /// already cached the requested object's CID via Phase 2.2's + /// `(bucket, key) → cid` table. Off by default; flip on AFTER + /// Phase 2.2 has had time to populate the cache during master-up + /// reads. Native-only — `wasm32` returns `MasterUnreachable` + /// instead of falling back (no gateway-race plumbing in the + /// browser target). + pub gateway_fallback_enabled: bool, + + /// Custom gateway URL templates. Each must contain the literal + /// `{cid}` token, which the SDK substitutes per fetch. Empty = + /// use the SDK-shipped default list of six gateways + /// (`gateway_fetch::default_gateway_urls()`). Native-only. 
+ pub gateway_fallback_urls: Vec, + + /// Number of gateways the SDK races in parallel for any single + /// CID. Default 3 per plan §2.3 (cancels in-flight losers via + /// `Drop` of the spawned futures). Capped at the gateway-pool + /// length. Native-only. + pub gateway_race_concurrency: usize, + + // ============================================================ + // Phase 3.3 — cold-start hybrid resolver + // ============================================================ + // + // The resolver is "configured" iff ALL of: + // - `users_index_chain_rpc_url` is non-empty + // - `users_index_anchor_address` is non-empty + // - `users_index_ipns_name` is non-empty + // - `users_index_user_key` is `Some` + // + // are populated. Field presence is the single source of truth — + // there is no separate `enabled` bool. To disable cold-start an + // operator clears any one of the four fields; the SDK degrades + // to "warm-cache only" automatically. This eliminates the + // surprise of "I flipped the master switch but it's still off + // because I forgot field N" — an audit-driven simplification. + + /// JSON-RPC URL for the chain anchor (Base or SKALE). One of + /// the four required fields for the cold-start resolver. + pub users_index_chain_rpc_url: String, + + /// `FulaUsersIndexAnchor.sol` proxy address (20 bytes hex, + /// optionally `0x`-prefixed). Required when the resolver is + /// enabled. + pub users_index_anchor_address: String, + + /// IPNS NAME (libp2p public-key hash, e.g. `k51qzi5...`) under + /// which the master publishes the users-index. Required when + /// the resolver is enabled. + pub users_index_ipns_name: String, + + /// 32-hex-char `userKey` (= `BLAKE3("fula:user_id:" || sha256(lower(email)))[..16]`). + /// Computed once at sign-in via `registry_resolver::derive_user_key_from_email` + /// and passed in here; the SDK does not store the raw email. Required when + /// the resolver is enabled. 
+ pub users_index_user_key: Option, + + /// IPNS-aware gateway URL templates the resolver races against + /// (each must contain `{name}`). Empty Vec = use the SDK-shipped + /// defaults (Cloudflare, dweb.link, ipfs.io, 4everland, Pinata — + /// `trustless-gateway.link` is excluded since it serves only + /// `/ipfs/`). Operators can override e.g. for staging tests + /// against wiremock or to add a private IPNS-aware gateway. + pub users_index_ipns_gateway_urls: Vec, + + /// `/ipfs/{cid}` gateway URL templates the resolver uses for + /// fetching the chain-anchored CID's bytes AND the cold-start + /// path uses for fetching the per-user `bucketsIndex` and forest + /// manifest CBORs. Empty Vec = use the SDK-shipped six-gateway + /// default. Independent of `gateway_fallback_urls` (which serves + /// the warm-device offline path) so cold-start works without + /// Phase 2.2/2.4 enabled. + pub users_index_ipfs_gateway_urls: Vec, + + /// Phase 19 — optional health-status callback. When set, the SDK + /// invokes this closure on every Up↔Down transition of the + /// master health gate (`MasterHealthEvent::Online` / + /// `OfflineFallbackActive`) plus on cold-start failure + /// (`SeverelyDegraded`). Apps wire this to surface offline UI + /// affordances. Default `None` = silent (gate works, just no + /// transparency callback). Native-only — `Arc` doesn't + /// cross FRB / wasm-bindgen cleanly, so wasm/Flutter surface + /// these via the typed error variants instead. + pub health_callback: Option, +} + +// `Config` derives `Clone` but not `Debug` because `HealthCallback` +// is `Arc` which has no `Debug`. Hand-roll a `Debug` impl +// that omits the callback (printing "Some()" or "None"), +// preserving the Phase 1.x behavior where Config could be logged. 
+impl std::fmt::Debug for Config {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Config")
+            .field("endpoint", &self.endpoint)
+            .field("access_token", &self.access_token.as_deref().map(|_| "<redacted>"))
+            .field("timeout", &self.timeout)
+            .field("encryption_enabled", &self.encryption_enabled)
+            .field("user_agent", &self.user_agent)
+            .field("max_retries", &self.max_retries)
+            .field("multipart_threshold", &self.multipart_threshold)
+            .field("multipart_chunk_size", &self.multipart_chunk_size)
+            .field("per_chunk_download_timeout", &self.per_chunk_download_timeout)
+            .field("buffered_download_max_bytes", &self.buffered_download_max_bytes)
+            .field("health_gate_enabled", &self.health_gate_enabled)
+            .field("health_gate_ttl", &self.health_gate_ttl)
+            .field("block_cache_enabled", &self.block_cache_enabled)
+            .field("block_cache_path", &self.block_cache_path)
+            .field("block_cache_max_bytes", &self.block_cache_max_bytes)
+            .field("gateway_fallback_enabled", &self.gateway_fallback_enabled)
+            .field("gateway_fallback_urls", &self.gateway_fallback_urls)
+            .field("gateway_race_concurrency", &self.gateway_race_concurrency)
+            .field("users_index_chain_rpc_url", &self.users_index_chain_rpc_url)
+            .field("users_index_anchor_address", &self.users_index_anchor_address)
+            .field("users_index_ipns_name", &self.users_index_ipns_name)
+            .field("users_index_user_key", &self.users_index_user_key)
+            .field("users_index_ipns_gateway_urls", &self.users_index_ipns_gateway_urls)
+            .field("users_index_ipfs_gateway_urls", &self.users_index_ipfs_gateway_urls)
+            .field(
+                "health_callback",
+                &self.health_callback.as_ref().map(|_| "<callback>"),
+            )
+            .finish()
+    }
+}
 
 impl Default for Config {
@@ -71,6 +223,26 @@ impl Default for Config {
         buffered_download_max_bytes: 256 * 1024 * 1024, // 256 MB
         health_gate_enabled: false, // backward-compat: off by default
         health_gate_ttl: Duration::from_secs(30),
+ // SDK consumers must opt in explicitly; existing apps see + // byte-identical behavior to pre-Phase-2 builds. + block_cache_enabled: false, + block_cache_path: None, + block_cache_max_bytes: 256 * 1024 * 1024, // 256 MiB + gateway_fallback_enabled: false, + gateway_fallback_urls: Vec::new(), + gateway_race_concurrency: 3, + // Phase 3.3 — resolver disabled by default (every required + // field is empty/None; field-presence is the single + // source of truth — see config-block doc above). + users_index_chain_rpc_url: String::new(), + users_index_anchor_address: String::new(), + users_index_ipns_name: String::new(), + users_index_user_key: None, + users_index_ipns_gateway_urls: Vec::new(), + users_index_ipfs_gateway_urls: Vec::new(), + // Phase 19 — no callback by default (silent gate). + health_callback: None, } } } @@ -102,6 +274,15 @@ impl Config { self } + /// Phase 19 — set the health-status callback. The closure is shared + /// across `Config` clones via `Arc`; constructing once and + /// cloning the config gives every derived `FulaClient` the same + /// callback wiring. + pub fn with_health_callback(mut self, callback: HealthCallback) -> Self { + self.health_callback = Some(callback); + self + } + /// Build the base URL for API requests pub fn base_url(&self) -> &str { &self.endpoint diff --git a/crates/fula-client/src/encryption.rs b/crates/fula-client/src/encryption.rs index dc79cfb..2968ddc 100644 --- a/crates/fula-client/src/encryption.rs +++ b/crates/fula-client/src/encryption.rs @@ -342,12 +342,28 @@ impl BlobBackend for S3BlobBackend { /// 429/500/502/503/504, S3 `SlowDown`/`InternalError`/`ServiceUnavailable`) /// with a fixed 300 ms + 0-100 ms jitter delay, up to 4 attempts total. /// Non-transient errors (auth failure, NotFound, etc.) short-circuit. 
+ /// + /// Phase 2.4: when the SDK has Phase 2.2/2.3 enabled, this dispatches + /// through `get_object_with_offline_fallback` so a master-down read + /// can transparently fall through to the public-gateway race using + /// the cached `(bucket, key) → cid` mapping. When the flags are off + /// behavior is byte-identical to pre-Phase-2.4 (single inner call, + /// same retry policy). async fn get(&self, path: &str) -> fula_crypto::Result> { let mut attempt: u32 = 0; loop { attempt += 1; - match self.inner.get_object(&self.bucket, path).await { - Ok(bytes) => return Ok(bytes.to_vec()), + match self + .inner + .get_object_with_offline_fallback(&self.bucket, path) + .await + { + // Phase 19: get_object_with_offline_fallback now returns + // OfflineGetResult; the bytes live on `.inner.data`. The + // `source` / `freshness` fields are dropped here — the + // crypto blob backend has no plumbing to surface them + // and isn't a transparency consumer. + Ok(result) => return Ok(result.inner.data.to_vec()), Err(e) if attempt < BLOB_BACKEND_MAX_ATTEMPTS && crate::multipart::is_transient(&e) => @@ -409,12 +425,16 @@ impl BlobBackend for S3BlobBackend { #[async_trait::async_trait(?Send)] impl BlobBackend for S3BlobBackend { async fn get(&self, path: &str) -> fula_crypto::Result> { - let bytes = self + // wasm32 has no offline fallback infrastructure (block_cache + + // gateway_fetch are gated out). The wrapper is a thin delegate + // here so the call site stays identical across targets. + let result = self .inner - .get_object(&self.bucket, path) + .get_object_with_offline_fallback(&self.bucket, path) .await .map_err(client_err_to_crypto)?; - Ok(bytes.to_vec()) + // Phase 19: result is an OfflineGetResult; bytes are on .inner.data. 
+ Ok(result.inner.data.to_vec()) } async fn put(&self, path: &str, bytes: Vec) -> fula_crypto::Result<()> { @@ -2230,8 +2250,23 @@ impl EncryptedClient { let forest_dek = self.encryption.key_manager.derive_path_key(&format!("forest:{}", bucket)); let index_key = derive_index_key(&forest_dek, bucket); - // Try to load from storage - match self.inner.get_object_with_metadata(bucket, &index_key).await { + // Try to load from storage. Phase 2.4: route through the + // offline-fallback wrapper so a master-down read can transparently + // fall through to the gateway race using the cached + // `(bucket, index_key) → cid` mapping. Phase 3.3 layers cold-start + // escalation on top: when the offline-fallback returns + // `MasterUnreachable` (master down AND KEY_TO_CID miss for a + // fresh device that's never read this manifest before) AND the + // resolver is configured, escalate to the IPNS+chain hybrid + // resolver to fetch the manifest CID and its bytes via the + // public network. Wrapper synthesizes `etag = cid.to_string()` + // on the gateway-fetched / cold-start paths so the existing + // forest-format detector + sequence-replay guard handle the + // result identically (master also uses cid.to_string() as ETag). + match self + .fetch_manifest_with_cold_start_escalation(bucket, &index_key) + .await + { Ok(result) => { let observed_etag = if result.etag.is_empty() { None } else { Some(result.etag.clone()) }; // Capture cache generation before dispatch so we can detect cross-format @@ -2922,6 +2957,276 @@ impl EncryptedClient { hex::encode(&hash.as_bytes()[..16]) } + /// Phase 3.3 escalation seam — fetch a manifest via the + /// offline-fallback wrapper, escalating to cold-start on + /// `MasterUnreachable` when the resolver is configured. 
+ /// + /// Behavior: + /// + /// | State | Result | + /// |------------------------------------------------------------------------|---------------------------------------------------| + /// | Master up | normal path through `get_object_with_offline_fallback` | + /// | Master down + KEY_TO_CID hit (warm device) | gateway race serves bytes (Phase 2.4) | + /// | Master down + KEY_TO_CID miss + resolver enabled (cold device) | escalates to `cold_start_resolve_manifest`; populates KEY_TO_CID for next warm-cache read | + /// | Master down + KEY_TO_CID miss + resolver NOT enabled | propagates `MasterUnreachable` | + /// + /// On the cold-start path the synthesized result carries + /// `etag = manifest_cid.to_string()` so the existing forest- + /// format detector + sequence-replay guard handle the bytes + /// identically to a master-served fetch (master also uses + /// `cid.to_string()` as the ETag — see `fula-cli/src/handlers/object.rs:103-105`). + /// + /// Native-only: the cold-start resolver is gated to + /// `cfg(not(target_arch = "wasm32"))`. On wasm this method + /// degrades to the underlying `get_object_with_offline_fallback` + /// (which itself degrades to `get_object_with_metadata`). + #[cfg(not(target_arch = "wasm32"))] + async fn fetch_manifest_with_cold_start_escalation( + &self, + bucket: &str, + index_key: &str, + ) -> Result { + match self + .inner + .get_object_with_offline_fallback(bucket, index_key) + .await + { + // Happy path: master up OR warm-cache hit. Phase 19 wraps + // the result in OfflineGetResult; this internal cold-start + // path doesn't surface source/freshness to callers, so + // unwrap the inner GetObjectResult and propagate. + Ok(r) => Ok(r.inner), + + // Master-down + cache miss → try cold-start if resolver is + // configured. Identifying which "MasterUnreachable" case + // this is doesn't matter — both are "we don't know the + // CID locally, fetch from the public network". 
+ Err(e) if matches!(e, ClientError::MasterUnreachable { .. }) => { + // Resolver-enabled? If not, propagate the original. + if self.inner.users_index_resolver().is_none() { + return Err(e); + } + // Run the cold-start chain. + let (manifest_cid, manifest_bytes) = + self.cold_start_resolve_manifest(bucket).await?; + + // Best-effort: populate KEY_TO_CID so the next read + // of this manifest (which IS predictable — the + // index_key is deterministic from forest_dek) lands + // in the warm-device fast path. Failure is fine; we + // already have the bytes for THIS read. + if let Some(cache) = self.inner.block_cache() { + if let Err(e) = cache.record_key_cid(bucket, index_key, &manifest_cid) { + tracing::debug!( + error = %e, + "cold-start: KEY_TO_CID populate failed (best-effort)" + ); + } + // Also seed the BLOCKS cache with the manifest + // bytes — saves the gateway race on the next read + // of this same manifest. + if let Err(e) = cache.put(&manifest_cid, &manifest_bytes).await { + tracing::debug!( + error = %e, + "cold-start: BLOCKS put failed (best-effort)" + ); + } + } + + Ok(GetObjectResult { + content_length: manifest_bytes.len() as u64, + data: manifest_bytes, + etag: manifest_cid.to_string(), + content_type: None, + last_modified: None, + metadata: std::collections::HashMap::new(), + }) + } + + // Any other error (Http, S3 4xx, encryption, etc.) — + // not a master-down condition. Propagate unchanged. + Err(e) => Err(e), + } + } + + /// Wasm fallback: cold-start is native-only, so on wasm we just + /// delegate to the existing wrapper. The native and wasm signatures + /// are kept identical so call sites don't need cfg gates of their + /// own. + #[cfg(target_arch = "wasm32")] + async fn fetch_manifest_with_cold_start_escalation( + &self, + bucket: &str, + index_key: &str, + ) -> Result { + // Phase 19: extract `.inner` since get_object_with_offline_fallback + // now returns OfflineGetResult on every target. 
+ self.inner + .get_object_with_offline_fallback(bucket, index_key) + .await + .map(|r| r.inner) + } + + /// Phase 3.3 — cold-start resolution of a bucket's forest manifest + /// via the hybrid IPNS+chain resolver. + /// + /// Invoked from the offline-fallback path (see + /// `load_forest_internal`) when the local `KEY_TO_CID` cache + /// has no entry for the manifest's storage key AND the resolver + /// is configured. Walks the published chain: + /// + /// 1. Resolver returns the global `users` map (IPNS or chain). + /// 2. Look up the configured `userKey` → per-user + /// `bucketsIndexCid`. + /// 3. Fetch the bucketsIndex CBOR via gateway race + verify. + /// 4. Compute `bucketLookupH = BLAKE3(MetadataKey || bucket)`; + /// fall back to the legacy plaintext-name entry if the + /// blinded key is absent (Phase 1.2 transition path). + /// 5. Fetch the manifest's CBOR-pinned-bytes via gateway race + /// + verify. + /// + /// Returns `(manifest_cid, manifest_bytes)` so the caller writes + /// the bytes into the existing forest-format-detect / decrypt + /// pipeline without a second network round-trip — saves 5–30 s + /// on the first cold-start read. Caller is also responsible for + /// writing `(bucket, index_key) → manifest_cid` into KEY_TO_CID + /// so subsequent warm-device reads short-circuit. + /// + /// **Bounded semantics.** Phase 3.3 makes the *manifest* CID + /// reachable on a fresh device + master-down. It does **not** + /// fix chunk-level fetches in true cold-start (the chunk's + /// CID isn't derivable from its storage key without a master + /// ping). The user can read manifests, list directories, and + /// re-fetch any object whose chunks the warm-cache previously + /// observed; never-read-before objects still require master to + /// come back briefly. Phase 19+ may close that gap (e.g., by + /// embedding chunk CIDs in the forest manifest). 
+ #[cfg(not(target_arch = "wasm32"))] + pub async fn cold_start_resolve_manifest( + &self, + bucket: &str, + ) -> Result<(cid::Cid, bytes::Bytes)> { + // 1. Resolver must be configured + user_key set. Both are + // deferred to construction time, so absence here means + // the operator has the resolver enabled but missed one of + // the four required Config fields. + let resolver = self + .inner + .users_index_resolver() + .ok_or_else(|| ClientError::UsersIndexResolutionFailed { + reason: "cold-start resolver not configured (Config requires all four fields: \ + users_index_chain_rpc_url, users_index_anchor_address, \ + users_index_ipns_name, users_index_user_key)".into(), + })? + .clone(); + let user_key = self + .inner + .config() + .users_index_user_key + .clone() + .ok_or_else(|| ClientError::UsersIndexResolutionFailed { + reason: "users_index_user_key is not set; compute it via derive_user_key_from_email at sign-in".into(), + })?; + + // 2. Resolve the global users-index. Internal replay defense + // in the resolver bumps the seen-sequence floor. + // + // Phase 19: when both IPNS and chain paths fail, the + // resolver returns `UsersIndexResolutionFailed`. Fire + // `SeverelyDegraded` (master + cold-start network both + // unreachable) before propagating so apps can disable + // "open new bucket" / "first-read" UI affordances. This + // is the ONLY emission point for `SeverelyDegraded` — + // the health gate alone can't authoritatively detect + // "both down" without trying. + let resolved = match resolver.resolve().await { + Ok(r) => r, + Err(e) => { + if matches!(e, ClientError::UsersIndexResolutionFailed { .. }) { + self.inner.fire_health_event( + crate::health_gate::MasterHealthEvent::SeverelyDegraded { + reason: format!("cold-start resolver exhausted: {}", e), + }, + ); + } + return Err(e); + } + }; + + // 3. Look up our user_key in the global map. 
+ let buckets_index_cid_str = resolved + .payload + .users + .get(&user_key) + .cloned() + .ok_or_else(|| ClientError::UsersIndexResolutionFailed { + reason: format!( + "userKey {} not present in published global users-index (size={}); user has not written yet", + user_key, + resolved.payload.users.len(), + ), + })?; + let buckets_index_cid = buckets_index_cid_str.parse::().map_err(|e| { + ClientError::UsersIndexResolutionFailed { + reason: format!("invalid bucketsIndex CID '{}': {}", buckets_index_cid_str, e), + } + })?; + + // 4. Fetch + verify + parse bucketsIndex CBOR. + let gateways = resolver.ipfs_gateways(); + let bi_bytes = crate::registry_resolver::fetch_cid_via_gateways( + &buckets_index_cid, + &gateways, + resolver.http_client(), + resolver.per_request_timeout(), + ) + .await?; + let buckets_index = crate::registry_resolver::decode_user_buckets_index(&bi_bytes)?; + + // 5. Resolve the requested bucket. Try the blinded key + // first (Phase 1.2 migrated state); fall back to the + // plaintext bucket name for legacy entries (the user + // hasn't yet uploaded with a Phase-1.2-aware client + // since the field landed). The legacy fallback only + // accepts entries explicitly marked `legacy = true`, + // closing the loophole where a malicious gateway could + // plant a stronger-looking plaintext-name entry next to + // a real blinded one. 
+ let blinded = self.compute_bucket_lookup_h_hex(bucket); + let entry = if let Some(e) = buckets_index.buckets.get(&blinded) { + e.clone() + } else if let Some(e) = buckets_index.buckets.get(bucket) { + if !e.legacy { + return Err(ClientError::UsersIndexResolutionFailed { + reason: format!( + "bucket {:?} present at plaintext key but legacy=false; refusing as ambiguous", + bucket + ), + }); + } + e.clone() + } else { + return Err(ClientError::BucketNotFound(bucket.to_string())); + }; + + let manifest_cid = entry.manifest.parse::().map_err(|e| { + ClientError::UsersIndexResolutionFailed { + reason: format!("invalid manifest CID '{}' for bucket {}: {}", entry.manifest, bucket, e), + } + })?; + + // 6. Fetch + verify manifest bytes. + let manifest_bytes = crate::registry_resolver::fetch_cid_via_gateways( + &manifest_cid, + &gateways, + resolver.http_client(), + resolver.per_request_timeout(), + ) + .await?; + + Ok((manifest_cid, manifest_bytes)) + } + /// Save the private forest index for a bucket (monolithic v4 format with AAD+sequence) pub async fn save_forest(&self, bucket: &str, forest: &PrivateForest) -> Result<()> { let forest_dek = self.encryption.key_manager.derive_path_key(&format!("forest:{}", bucket)); @@ -7976,4 +8281,566 @@ mod tests { "writer must not be touched when the size guard rejects the manifest" ); } + + // ============================================================ + // Phase 3.3 — cold-start integration tests + // ============================================================ + + #[cfg(not(target_arch = "wasm32"))] + mod cold_start_phase_3_3 { + use super::*; + use crate::registry_resolver::{ + derive_user_key_from_email, BucketEntry, GlobalUsersIndex, UserBucketsIndex, + }; + use sha2::{Digest, Sha256}; + use std::collections::BTreeMap; + use wiremock::matchers::{method, path}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + /// Compute a CIDv1 (codec=dag-cbor 0x71, multihash=sha2-256 + /// 0x12) from arbitrary bytes — the format 
master uses for + /// dag-cbor IPLD objects (production master's + /// `serde_ipld_dagcbor::to_vec → kubo /api/v0/dag/put` + /// produces this shape). + fn cid_for_dag_cbor_bytes(data: &[u8]) -> cid::Cid { + let digest = Sha256::digest(data); + let mh = cid::multihash::Multihash::<64>::wrap(0x12, &digest).unwrap(); + cid::Cid::new_v1(0x71, mh) + } + + /// Cold-start happy path against fully-mocked IPNS + IPFS + /// gateways. Asserts: + /// - resolver fetches global users-index via IPNS gateway + /// - cold-start looks up our `userKey` → bucketsIndexCid + /// - bucketsIndex CBOR is fetched + verified + /// - blinded `bucketLookupH` lookup succeeds + /// - manifest bytes are returned + /// - returned `Cid` matches what the gateway served + /// - returned `Bytes` are byte-identical to the staged + /// manifest payload + /// - resolver advanced its highest-seen-sequence floor + #[tokio::test] + async fn cold_start_resolve_manifest_happy_path_via_ipns() { + let ipns = MockServer::start().await; + let ipfs = MockServer::start().await; + let chain_rpc = MockServer::start().await; + + let email = "alice@example.com"; + let user_key = derive_user_key_from_email(email); + + let bucket = "photos"; + let manifest_payload = + b"placeholder forest-manifest bytes for the cold-start test".to_vec(); + let manifest_cid = cid_for_dag_cbor_bytes(&manifest_payload); + + let secret = fula_crypto::SecretKey::generate(); + let enc_cfg = EncryptionConfig::from_secret_key(secret); + let metadata_key = enc_cfg.key_manager.derive_path_key("fula-metadata-v1"); + let mut h_input = metadata_key.as_bytes().to_vec(); + h_input.extend_from_slice(bucket.as_bytes()); + let blinded_hex = hex::encode(&blake3::hash(&h_input).as_bytes()[..16]); + + let mut buckets = BTreeMap::new(); + buckets.insert( + blinded_hex, + BucketEntry { + manifest: manifest_cid.to_string(), + legacy: false, + }, + ); + let user_buckets = UserBucketsIndex { + v: 2, + buckets, + updated_at_unix: 1_700_000_000, + }; + let 
user_buckets_cbor = serde_ipld_dagcbor::to_vec(&user_buckets).expect("ubi"); + let buckets_index_cid = cid_for_dag_cbor_bytes(&user_buckets_cbor); + + let mut users_map = BTreeMap::new(); + users_map.insert(user_key.clone(), buckets_index_cid.to_string()); + let global = GlobalUsersIndex { + v: 1, + sequence: 42, + updated_at_unix: 1_700_000_001, + users: users_map, + }; + let global_cbor = serde_ipld_dagcbor::to_vec(&global).expect("global"); + + let ipns_name = "k51qzi5uqu5dh-cold-start-test".to_string(); + Mock::given(method("GET")) + .and(path(format!("/ipns/{}", ipns_name))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(global_cbor)) + .mount(&ipns) + .await; + Mock::given(method("GET")) + .and(path(format!("/ipfs/{}", buckets_index_cid))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(user_buckets_cbor)) + .mount(&ipfs) + .await; + Mock::given(method("GET")) + .and(path(format!("/ipfs/{}", manifest_cid))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(manifest_payload.clone())) + .mount(&ipfs) + .await; + Mock::given(method("POST")) + .respond_with(ResponseTemplate::new(503)) + .mount(&chain_rpc) + .await; + + let mut client_cfg = Config::new("http://master.unreachable.invalid"); + client_cfg.timeout = std::time::Duration::from_secs(2); + client_cfg.users_index_chain_rpc_url = chain_rpc.uri(); + client_cfg.users_index_anchor_address = + "0x0000000000000000000000000000000000000001".into(); + client_cfg.users_index_ipns_name = ipns_name; + client_cfg.users_index_user_key = Some(user_key); + client_cfg.users_index_ipns_gateway_urls = + vec![format!("{}/ipns/{{name}}", ipns.uri())]; + client_cfg.users_index_ipfs_gateway_urls = + vec![format!("{}/ipfs/{{cid}}", ipfs.uri())]; + client_cfg.block_cache_enabled = false; + + let client = EncryptedClient::new(client_cfg, enc_cfg).expect("client"); + let (got_cid, got_bytes) = client + .cold_start_resolve_manifest(bucket) + .await + .expect("cold-start resolves"); + + assert_eq!(got_cid, 
manifest_cid, "returned CID matches manifest"); + assert_eq!( + got_bytes.as_ref(), + manifest_payload.as_slice(), + "returned bytes match staged manifest" + ); + + let resolver = client + .inner + .users_index_resolver() + .expect("resolver configured") + .clone(); + assert_eq!( + resolver.highest_seen_sequence(), + 42, + "resolver bumped sequence floor on success" + ); + } + + /// Typed error when the configured `userKey` isn't present + /// in the resolved global users-index. + #[tokio::test] + async fn cold_start_user_absent_in_global_returns_typed_error() { + let ipns = MockServer::start().await; + let chain_rpc = MockServer::start().await; + + let our_user_key = derive_user_key_from_email("alice@example.com"); + let other_user_key = derive_user_key_from_email("bob@example.com"); + + let mut users_map = BTreeMap::new(); + users_map.insert(other_user_key, "bafyabcdef".to_string()); + let global = GlobalUsersIndex { + v: 1, + sequence: 5, + updated_at_unix: 1_700_000_000, + users: users_map, + }; + let global_cbor = serde_ipld_dagcbor::to_vec(&global).expect("global"); + + let ipns_name = "k51qzi5uqu5dh-no-alice".to_string(); + Mock::given(method("GET")) + .and(path(format!("/ipns/{}", ipns_name))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(global_cbor)) + .mount(&ipns) + .await; + Mock::given(method("POST")) + .respond_with(ResponseTemplate::new(503)) + .mount(&chain_rpc) + .await; + + let secret = fula_crypto::SecretKey::generate(); + let enc_cfg = EncryptionConfig::from_secret_key(secret); + + let mut client_cfg = Config::new("http://master.unreachable.invalid"); + client_cfg.timeout = std::time::Duration::from_secs(2); + client_cfg.users_index_chain_rpc_url = chain_rpc.uri(); + client_cfg.users_index_anchor_address = + "0x0000000000000000000000000000000000000001".into(); + client_cfg.users_index_ipns_name = ipns_name; + client_cfg.users_index_user_key = Some(our_user_key.clone()); + client_cfg.users_index_ipns_gateway_urls = + 
vec![format!("{}/ipns/{{name}}", ipns.uri())]; + + let client = EncryptedClient::new(client_cfg, enc_cfg).expect("client"); + let err = client + .cold_start_resolve_manifest("photos") + .await + .expect_err("user absent"); + match err { + ClientError::UsersIndexResolutionFailed { reason } => { + assert!( + reason.contains(&our_user_key), + "expected reason to reference missing userKey, got: {}", + reason + ); + } + other => panic!("expected UsersIndexResolutionFailed, got: {:?}", other), + } + } + + /// Phase 1.2 lazy-migration: legacy plaintext-keyed entry + /// with `legacy = true` is the fallback when the blinded + /// entry is absent. SDK accepts it. + #[tokio::test] + async fn cold_start_legacy_plaintext_fallback() { + let ipns = MockServer::start().await; + let ipfs = MockServer::start().await; + let chain_rpc = MockServer::start().await; + + let user_key = derive_user_key_from_email("legacy@example.com"); + let bucket = "old-photos"; + let manifest_payload = b"legacy manifest".to_vec(); + let manifest_cid = cid_for_dag_cbor_bytes(&manifest_payload); + + let mut buckets = BTreeMap::new(); + buckets.insert( + bucket.to_string(), + BucketEntry { + manifest: manifest_cid.to_string(), + legacy: true, + }, + ); + let user_buckets = UserBucketsIndex { + v: 2, + buckets, + updated_at_unix: 0, + }; + let user_buckets_cbor = serde_ipld_dagcbor::to_vec(&user_buckets).expect("ubi"); + let buckets_index_cid = cid_for_dag_cbor_bytes(&user_buckets_cbor); + + let mut users_map = BTreeMap::new(); + users_map.insert(user_key.clone(), buckets_index_cid.to_string()); + let global = GlobalUsersIndex { + v: 1, + sequence: 1, + updated_at_unix: 0, + users: users_map, + }; + let global_cbor = serde_ipld_dagcbor::to_vec(&global).expect("global"); + + let ipns_name = "k51qzi5uqu5dh-legacy".to_string(); + Mock::given(method("GET")) + .and(path(format!("/ipns/{}", ipns_name))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(global_cbor)) + .mount(&ipns) + .await; + 
Mock::given(method("GET")) + .and(path(format!("/ipfs/{}", buckets_index_cid))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(user_buckets_cbor)) + .mount(&ipfs) + .await; + Mock::given(method("GET")) + .and(path(format!("/ipfs/{}", manifest_cid))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(manifest_payload.clone())) + .mount(&ipfs) + .await; + Mock::given(method("POST")) + .respond_with(ResponseTemplate::new(503)) + .mount(&chain_rpc) + .await; + + let secret = fula_crypto::SecretKey::generate(); + let enc_cfg = EncryptionConfig::from_secret_key(secret); + + let mut client_cfg = Config::new("http://master.unreachable.invalid"); + client_cfg.timeout = std::time::Duration::from_secs(2); + client_cfg.users_index_chain_rpc_url = chain_rpc.uri(); + client_cfg.users_index_anchor_address = + "0x0000000000000000000000000000000000000001".into(); + client_cfg.users_index_ipns_name = ipns_name; + client_cfg.users_index_user_key = Some(user_key); + client_cfg.users_index_ipns_gateway_urls = + vec![format!("{}/ipns/{{name}}", ipns.uri())]; + client_cfg.users_index_ipfs_gateway_urls = + vec![format!("{}/ipfs/{{cid}}", ipfs.uri())]; + + let client = EncryptedClient::new(client_cfg, enc_cfg).expect("client"); + let (got_cid, got_bytes) = client + .cold_start_resolve_manifest(bucket) + .await + .expect("legacy fallback resolves"); + assert_eq!(got_cid, manifest_cid); + assert_eq!(got_bytes.as_ref(), manifest_payload.as_slice()); + } + + /// Defense: a plaintext-keyed entry without `legacy = true` + /// is rejected. Closes the loophole where a malicious + /// gateway plants a stronger-looking plaintext-named entry + /// next to the real blinded one to trick the SDK. 
+ #[tokio::test] + async fn cold_start_rejects_plaintext_entry_without_legacy_flag() { + let ipns = MockServer::start().await; + let ipfs = MockServer::start().await; + let chain_rpc = MockServer::start().await; + + let user_key = derive_user_key_from_email("strict@example.com"); + let bucket = "test"; + + let bogus_cid = cid_for_dag_cbor_bytes(b"forged manifest"); + let mut buckets = BTreeMap::new(); + buckets.insert( + bucket.to_string(), + BucketEntry { + manifest: bogus_cid.to_string(), + legacy: false, + }, + ); + let user_buckets = UserBucketsIndex { + v: 2, + buckets, + updated_at_unix: 0, + }; + let user_buckets_cbor = serde_ipld_dagcbor::to_vec(&user_buckets).expect("ubi"); + let buckets_index_cid = cid_for_dag_cbor_bytes(&user_buckets_cbor); + + let mut users_map = BTreeMap::new(); + users_map.insert(user_key.clone(), buckets_index_cid.to_string()); + let global = GlobalUsersIndex { + v: 1, + sequence: 1, + updated_at_unix: 0, + users: users_map, + }; + let global_cbor = serde_ipld_dagcbor::to_vec(&global).expect("global"); + + let ipns_name = "k51qzi5uqu5dh-strict".to_string(); + Mock::given(method("GET")) + .and(path(format!("/ipns/{}", ipns_name))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(global_cbor)) + .mount(&ipns) + .await; + Mock::given(method("GET")) + .and(path(format!("/ipfs/{}", buckets_index_cid))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(user_buckets_cbor)) + .mount(&ipfs) + .await; + Mock::given(method("POST")) + .respond_with(ResponseTemplate::new(503)) + .mount(&chain_rpc) + .await; + + let secret = fula_crypto::SecretKey::generate(); + let enc_cfg = EncryptionConfig::from_secret_key(secret); + + let mut client_cfg = Config::new("http://master.unreachable.invalid"); + client_cfg.timeout = std::time::Duration::from_secs(2); + client_cfg.users_index_chain_rpc_url = chain_rpc.uri(); + client_cfg.users_index_anchor_address = + "0x0000000000000000000000000000000000000001".into(); + 
client_cfg.users_index_ipns_name = ipns_name; + client_cfg.users_index_user_key = Some(user_key); + client_cfg.users_index_ipns_gateway_urls = + vec![format!("{}/ipns/{{name}}", ipns.uri())]; + client_cfg.users_index_ipfs_gateway_urls = + vec![format!("{}/ipfs/{{cid}}", ipfs.uri())]; + + let client = EncryptedClient::new(client_cfg, enc_cfg).expect("client"); + let err = client + .cold_start_resolve_manifest(bucket) + .await + .expect_err("must reject"); + match err { + ClientError::UsersIndexResolutionFailed { reason } => { + assert!( + reason.contains("legacy=false"), + "expected legacy-flag rejection, got: {}", + reason + ); + } + other => panic!("expected UsersIndexResolutionFailed, got: {:?}", other), + } + } + + /// `BucketNotFound` (not a new variant) when bucket is + /// absent from the user's bucketsIndex. Reuses the + /// established error type per advisor's narrowing. + #[tokio::test] + async fn cold_start_returns_bucket_not_found_when_bucket_absent() { + let ipns = MockServer::start().await; + let ipfs = MockServer::start().await; + let chain_rpc = MockServer::start().await; + + let user_key = derive_user_key_from_email("user@example.com"); + let manifest_cid = cid_for_dag_cbor_bytes(b"some manifest"); + let mut buckets = BTreeMap::new(); + buckets.insert( + "videos".to_string(), + BucketEntry { + manifest: manifest_cid.to_string(), + legacy: true, + }, + ); + let user_buckets = UserBucketsIndex { + v: 2, + buckets, + updated_at_unix: 0, + }; + let user_buckets_cbor = serde_ipld_dagcbor::to_vec(&user_buckets).expect("ubi"); + let buckets_index_cid = cid_for_dag_cbor_bytes(&user_buckets_cbor); + + let mut users_map = BTreeMap::new(); + users_map.insert(user_key.clone(), buckets_index_cid.to_string()); + let global = GlobalUsersIndex { + v: 1, + sequence: 1, + updated_at_unix: 0, + users: users_map, + }; + let global_cbor = serde_ipld_dagcbor::to_vec(&global).expect("global"); + + let ipns_name = "k51qzi5uqu5dh-only-videos".to_string(); + 
Mock::given(method("GET")) + .and(path(format!("/ipns/{}", ipns_name))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(global_cbor)) + .mount(&ipns) + .await; + Mock::given(method("GET")) + .and(path(format!("/ipfs/{}", buckets_index_cid))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(user_buckets_cbor)) + .mount(&ipfs) + .await; + Mock::given(method("POST")) + .respond_with(ResponseTemplate::new(503)) + .mount(&chain_rpc) + .await; + + let secret = fula_crypto::SecretKey::generate(); + let enc_cfg = EncryptionConfig::from_secret_key(secret); + + let mut client_cfg = Config::new("http://master.unreachable.invalid"); + client_cfg.timeout = std::time::Duration::from_secs(2); + client_cfg.users_index_chain_rpc_url = chain_rpc.uri(); + client_cfg.users_index_anchor_address = + "0x0000000000000000000000000000000000000001".into(); + client_cfg.users_index_ipns_name = ipns_name; + client_cfg.users_index_user_key = Some(user_key); + client_cfg.users_index_ipns_gateway_urls = + vec![format!("{}/ipns/{{name}}", ipns.uri())]; + client_cfg.users_index_ipfs_gateway_urls = + vec![format!("{}/ipfs/{{cid}}", ipfs.uri())]; + + let client = EncryptedClient::new(client_cfg, enc_cfg).expect("client"); + let err = client + .cold_start_resolve_manifest("photos") + .await + .expect_err("bucket missing"); + match err { + ClientError::BucketNotFound(name) => assert_eq!(name, "photos"), + other => panic!("expected BucketNotFound, got: {:?}", other), + } + } + + /// Fail-closed when the resolver isn't configured. + /// `UsersIndexResolutionFailed` distinguishes "operator + /// misconfig" from "everything is down". 
+ #[tokio::test] + async fn cold_start_without_resolver_returns_resolution_failed() { + let secret = fula_crypto::SecretKey::generate(); + let enc_cfg = EncryptionConfig::from_secret_key(secret); + + let mut client_cfg = Config::new("http://master.unreachable.invalid"); + client_cfg.timeout = std::time::Duration::from_secs(2); + // No resolver fields populated → resolver stays None + // (field-presence model). Same effect as the old `= + // false` flag. + let client = EncryptedClient::new(client_cfg, enc_cfg).expect("client"); + let err = client + .cold_start_resolve_manifest("any") + .await + .expect_err("not configured"); + assert!( + matches!(err, ClientError::UsersIndexResolutionFailed { .. }), + "expected UsersIndexResolutionFailed, got: {:?}", + err + ); + } + + /// Phase 19 — when both IPNS and chain channels fail, the + /// resolver returns `UsersIndexResolutionFailed`. The + /// cold-start path MUST fire `MasterHealthEvent::SeverelyDegraded` + /// through the configured callback so apps can disable + /// "first-read" UI affordances. + #[tokio::test] + async fn cold_start_fires_severely_degraded_when_both_channels_fail() { + use crate::health_gate::{HealthCallback, MasterHealthEvent}; + + // IPNS: 503 on every request → resolver IPNS path fails. + let ipns = MockServer::start().await; + Mock::given(method("GET")) + .respond_with(ResponseTemplate::new(503)) + .mount(&ipns) + .await; + // Chain RPC: 503 on every request → resolver chain path + // fails too. Both channels exhausted → resolver surfaces + // UsersIndexResolutionFailed → cold_start fires SeverelyDegraded. + let chain_rpc = MockServer::start().await; + Mock::given(method("POST")) + .respond_with(ResponseTemplate::new(503)) + .mount(&chain_rpc) + .await; + + // Capturing callback. 
+        let captured: std::sync::Arc<std::sync::Mutex<Vec<MasterHealthEvent>>> =
+            std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
+        let captured_for_cb = std::sync::Arc::clone(&captured);
+        let cb: HealthCallback = std::sync::Arc::new(move |ev| {
+            captured_for_cb.lock().unwrap().push(ev);
+        });
+
+        let secret = fula_crypto::SecretKey::generate();
+        let enc_cfg = EncryptionConfig::from_secret_key(secret);
+
+        let mut client_cfg = Config::new("http://master.unreachable.invalid");
+        client_cfg.timeout = std::time::Duration::from_secs(2);
+        client_cfg.users_index_chain_rpc_url = chain_rpc.uri();
+        client_cfg.users_index_anchor_address =
+            "0x0000000000000000000000000000000000000001".into();
+        client_cfg.users_index_ipns_name = "k51qzi5uqu5dh-test".to_string();
+        client_cfg.users_index_user_key =
+            Some(derive_user_key_from_email("alice@example.com"));
+        client_cfg.users_index_ipns_gateway_urls =
+            vec![format!("{}/ipns/{{name}}", ipns.uri())];
+        client_cfg.health_callback = Some(cb);
+
+        let client = EncryptedClient::new(client_cfg, enc_cfg).expect("client");
+        let err = client
+            .cold_start_resolve_manifest("any-bucket")
+            .await
+            .expect_err("both channels exhausted");
+
+        // Error must be UsersIndexResolutionFailed (the resolver's
+        // signal that both paths failed).
+        assert!(
+            matches!(err, ClientError::UsersIndexResolutionFailed { .. }),
+            "expected UsersIndexResolutionFailed, got: {:?}",
+            err
+        );
+
+        // And the callback must have observed exactly one
+        // SeverelyDegraded event.
+        let events = captured.lock().unwrap().clone();
+        assert_eq!(
+            events.len(),
+            1,
+            "expected exactly one SeverelyDegraded event, got: {:?}",
+            events
+        );
+        assert!(
+            matches!(
+                events[0],
+                MasterHealthEvent::SeverelyDegraded { ..
} + ), + "expected SeverelyDegraded, got: {:?}", + events[0] + ); + } + } } diff --git a/crates/fula-client/src/error.rs b/crates/fula-client/src/error.rs index 39920a8..8825245 100644 --- a/crates/fula-client/src/error.rs +++ b/crates/fula-client/src/error.rs @@ -106,6 +106,85 @@ pub enum ClientError { /// timeout" into "fast-fail with a clear signal." #[error("Master unreachable (health gate; down for ~{down_for_secs}s)")] MasterUnreachable { down_for_secs: u64 }, + + /// Phase 2.2 of master-independent reads: a single block exceeds the + /// configured `block_cache_max_bytes` budget and cannot be cached. + /// + /// **Native-only signal in practice.** `BlockCache` itself is + /// compiled out on `wasm32`; this variant is defined unconditionally + /// so the enum shape stays stable across native and web builds, and + /// so consumers (fula-flutter, app integrators) can write a single + /// exhaustive match arm without `#[cfg]` gates of their own. + /// Triggering it on wasm would require a manual construction — + /// the SDK never raises it there. + /// + /// Apps should surface this to the user with guidance to raise the + /// `block_cache_max_bytes` config or skip the cache for this object. + #[error("Block exceeds cache budget: size={size}, budget={budget}")] + BlockTooLarge { size: u64, budget: u64 }, + + /// Phase 2.2 of master-independent reads: catch-all for the + /// persistent block cache's I/O / storage / commit errors. + /// + /// Stringified at the SDK boundary so app code doesn't need to depend + /// on `redb` or its concrete error type. Native-only in practice + /// (same reasoning as `BlockTooLarge` above); kept unconditional for + /// enum-shape stability. 
+ #[error("Block cache error: {0}")] + BlockCache(String), + + /// Phase 3.3 of master-independent reads: cold-start hybrid + /// resolver could not resolve the master-published global + /// users-index CID through any channel (IPNS exhausted AND + /// chain failed / was unreachable / had no entry / sequence- + /// regressed). Fresh-device cold-start is unrecoverable until + /// at least one channel returns; the app should surface + /// "offline mode unavailable for this device yet". + /// + /// Defined unconditionally so the enum shape stays stable + /// across native and wasm. The native resolver lives in + /// `registry_resolver.rs`; the wasm cold-start path always + /// raises this variant until a browser-friendly resolver lands. + #[error("users-index resolution failed: {reason}")] + UsersIndexResolutionFailed { reason: String }, + + /// Phase 3.3 replay defense: the resolver observed a payload + /// whose embedded `sequence` is strictly less than what the SDK + /// has previously seen and persisted. A compromised gateway, + /// RPC node, or operator could try to serve a stale (but + /// otherwise valid-looking) payload to roll back the user's + /// view; this variant is the SDK's refusal to honor that. + /// + /// Apps should NOT retry — every retry from the same source + /// would fail identically. Surface as "your master appears to + /// be serving stale state; contact support" or equivalent. + /// `channel` is a free-form label identifying which path + /// observed the regression (e.g. `"chain.latest()"`, + /// `"Ipns"`, `"Chain"`). Named `channel` rather than `source` + /// because thiserror gives the latter special meaning + /// (it expects an `std::error::Error` impl). 
+ #[error("sequence regression in {channel}: observed={observed}, highest seen={highest_seen}")] + SequenceRegression { + observed: u64, + highest_seen: u64, + channel: String, + }, +} + +#[cfg(not(target_arch = "wasm32"))] +impl From for ClientError { + fn from(err: crate::block_cache::BlockCacheError) -> Self { + use crate::block_cache::BlockCacheError; + match err { + BlockCacheError::BlockTooLarge { size, budget } => { + ClientError::BlockTooLarge { size, budget } + } + // Catch-all: stringify the rest so app code doesn't have to + // pattern-match on redb internals. Adds zero deps to the + // public SDK surface. + other => ClientError::BlockCache(other.to_string()), + } + } } impl ClientError { @@ -143,6 +222,13 @@ impl ClientError { || matches!(self, Self::S3Error { code, .. } if code == "PreconditionFailed" || code == "HTTP412" || code == "412") } + + /// Check if this is a block-cache error (budget exceeded or storage + /// failure). Useful for app integrators that want to retry without + /// the cache (e.g., directly via the gateway-race path). + pub fn is_cache_error(&self) -> bool { + matches!(self, Self::BlockTooLarge { .. } | Self::BlockCache(_)) + } } fn extract_xml_element(xml: &str, element: &str) -> Option { diff --git a/crates/fula-client/src/gateway_fetch.rs b/crates/fula-client/src/gateway_fetch.rs index 5c20ff4..52aeec9 100644 --- a/crates/fula-client/src/gateway_fetch.rs +++ b/crates/fula-client/src/gateway_fetch.rs @@ -395,13 +395,21 @@ impl GatewayPool { } } + // `len` and `is_empty` are public monitoring API for app + // integrators that want to surface "configured N gateways" or + // detect a misconfigured empty pool before issuing requests. + // The crate itself doesn't call them internally — silence the + // workspace warning while keeping the surface stable for apps. + /// Number of gateways in the pool. 
+    #[allow(dead_code)]
     pub fn len(&self) -> usize {
         self.gateways.len()
     }
 
     /// True if no gateways are configured (effectively disables
     /// gateway-race fallback).
+    #[allow(dead_code)]
     pub fn is_empty(&self) -> bool {
         self.gateways.is_empty()
     }
@@ -457,6 +465,27 @@ impl GatewayPool {
         cid: &Cid,
         http: &reqwest::Client,
     ) -> Result<Bytes, GatewayPoolError> {
+        self.fetch_verified_with_source(cid, http)
+            .await
+            .map(|(b, _url)| b)
+    }
+
+    /// Phase 19 — like `fetch_verified` but also returns which gateway
+    /// URL template won the race. Used by the offline-fallback path
+    /// (Phase 2.4) to populate `OfflineGetResult.source =
+    /// ReadSource::Gateway(url)` for transparency surfacing. The URL
+    /// is the configured template (e.g. `https://ipfs.io/ipfs/{cid}`),
+    /// NOT the per-CID-substituted URL — apps display "served by
+    /// ipfs.io" without the per-fetch CID noise.
+    ///
+    /// Crate-private: this is an internal seam consumed only by
+    /// `try_offline_fallback`. Apps should call `fetch_verified` (which
+    /// is `pub` and forwards to this) when they need the bytes alone.
+    pub(crate) async fn fetch_verified_with_source(
+        &self,
+        cid: &Cid,
+        http: &reqwest::Client,
+    ) -> Result<(Bytes, String), GatewayPoolError> {
         use futures::stream::FuturesUnordered;
         use futures::StreamExt;
@@ -487,9 +516,10 @@ impl GatewayPool {
             match result {
                 Ok(body) => {
                     g.record_success();
+                    let url = g.url_template.clone();
                     // Drop in_flight to cancel remaining racers.
                     drop(in_flight);
-                    return Ok(body);
+                    return Ok((body, url));
                 }
                 Err(FetchError::Transient(msg)) => {
                     g.record_transient_failure();
diff --git a/crates/fula-client/src/health_gate.rs b/crates/fula-client/src/health_gate.rs
index a5e2024..0942743 100644
--- a/crates/fula-client/src/health_gate.rs
+++ b/crates/fula-client/src/health_gate.rs
@@ -29,8 +29,45 @@
 //! read" into "fast-fail with `MasterUnreachable`" when Down.
 use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
+use std::sync::Arc;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 
+/// Phase 19 transparency surface — events the SDK emits when its
+/// view of master-server reachability changes. Apps wire a
+/// [`HealthCallback`] via [`Config::health_callback`] and surface
+/// the transitions to users (e.g., "you're offline; reading from
+/// IPFS gateway"). The default behavior with no callback set is
+/// byte-identical to pre-Phase-19 builds — the gate still works,
+/// just silently.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub enum MasterHealthEvent {
+    /// Master S3 is reachable; reads use the fast path.
+    Online,
+
+    /// Master S3 is unreachable; SDK is falling back to IPFS
+    /// gateways (Phase 2.4) or cold-start resolver (Phase 3.3).
+    /// `reason` is human-readable for logging — not for end-user
+    /// display (use a localized string from your UI layer).
+    OfflineFallbackActive { reason: String },
+
+    /// Both master S3 AND the chain RPC are unreachable. Cold-
+    /// start reads will fail; warm reads (via cached `(bucket,
+    /// key) → cid`) still work via gateways. Apps should disable
+    /// "open new bucket" / "first-read" UI affordances when this
+    /// fires. **Emitted only from the cold-start failure path**
+    /// (the resolver), NOT from periodic health-gate observation —
+    /// the SDK can't authoritatively detect "both down" without
+    /// trying.
+    SeverelyDegraded { reason: String },
+}
+
+/// A callback the SDK invokes on every `MasterHealthEvent`
+/// transition. `Arc<dyn Fn(MasterHealthEvent) + Send + Sync>` so the closure can be
+/// shared across all clones of `FulaClient` and called from any
+/// task. Transitions are deduplicated — a single Down→Up flip fires
+/// exactly one `Online` event, not one per request.
+pub type HealthCallback = Arc<dyn Fn(MasterHealthEvent) + Send + Sync>;
+
 /// Threshold for flipping from `Up` to `Down`. One transient 5xx on a single
 /// bucket isn't the same as "master is unreachable" — only two consecutive
 /// signals trip the gate.
@@ -47,15 +84,35 @@ pub struct HealthGate {
     state_ms: AtomicU64,
     consecutive_failures: AtomicU32,
     ttl: Duration,
+    /// Phase 19 — optional transparency callback. `Some` when
+    /// `Config::health_callback` was set on `FulaClient::new`.
+    /// Fires `Online` / `OfflineFallbackActive` on Up↔Down state
+    /// transitions, with deduplication so back-to-back events
+    /// don't double-fire.
+    callback: Option<HealthCallback>,
 }
 
 impl HealthGate {
     /// Create a new gate with the given TTL. Starts in the `Up` state.
+    /// No callback registered.
     pub fn new(ttl: Duration) -> Self {
         Self {
             state_ms: AtomicU64::new(0),
             consecutive_failures: AtomicU32::new(0),
             ttl,
+            callback: None,
         }
     }
+
+    /// Phase 19 — construct a gate with a transparency callback.
+    /// The callback fires once on each Up↔Down transition; consecutive
+    /// failures within an already-Down state do NOT re-fire.
+    pub fn with_callback(ttl: Duration, callback: HealthCallback) -> Self {
+        Self {
+            state_ms: AtomicU64::new(0),
+            consecutive_failures: AtomicU32::new(0),
+            ttl,
+            callback: Some(callback),
+        }
+    }
@@ -86,9 +143,15 @@
     /// Record a successful master interaction. Resets the failure counter
     /// and clears the `Down` timestamp (gate returns to `Up`).
+    ///
+    /// Phase 19: fires `MasterHealthEvent::Online` exactly when the gate
+    /// flips from Down→Up. A success while already Up is a no-op.
     pub fn record_success(&self) {
         self.consecutive_failures.store(0, Ordering::Release);
-        self.state_ms.store(0, Ordering::Release);
+        let was_down = self.state_ms.swap(0, Ordering::AcqRel) != 0;
+        if was_down {
+            self.fire_event(MasterHealthEvent::Online);
+        }
     }
 
     /// Record a master-side failure (connection refused / RST / 5xx /
@@ -97,18 +160,51 @@
     ///
     /// 4xx responses are NOT failures for gate purposes — they're
     /// request-level issues, not master-down signals.
+    ///
+    /// Phase 19: fires `MasterHealthEvent::OfflineFallbackActive` exactly
Subsequent failures while already + /// Down do NOT re-fire (the `compare_exchange` filters duplicates). pub fn record_failure(&self) { let prior = self.consecutive_failures.fetch_add(1, Ordering::AcqRel); if prior + 1 >= CONSECUTIVE_FAILURE_THRESHOLD { // Threshold crossed (or exceeded). Flip to `Down` if not already. // Only update timestamp on the first transition this window so // that repeated failures don't keep extending the TTL. - let _ = self.state_ms.compare_exchange( - 0, - now_ms(), - Ordering::AcqRel, - Ordering::Acquire, - ); + let now = now_ms(); + let prev = self + .state_ms + .compare_exchange(0, now, Ordering::AcqRel, Ordering::Acquire); + // `Ok(_)` means we successfully transitioned Up→Down — fire + // the event once. `Err(_)` means already Down (timestamp + // non-zero), no transition. + if prev.is_ok() { + self.fire_event(MasterHealthEvent::OfflineFallbackActive { + reason: format!( + "{} consecutive master failures observed", + prior + 1 + ), + }); + } + } + } + + /// Phase 19 helper — invoke the registered callback if present. + /// Swallows panics inside the callback so a buggy app handler + /// can't crash the SDK request path. + fn fire_event(&self, event: MasterHealthEvent) { + if let Some(cb) = self.callback.as_ref() { + let cb = Arc::clone(cb); + // Clone the event for the closure; original is dropped after. + let event_clone = event.clone(); + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(move || { + cb(event_clone); + })); + if result.is_err() { + tracing::warn!( + event = ?event, + "health_callback panicked; SDK proceeding (callback panics are swallowed by design)" + ); + } } } } @@ -237,4 +333,129 @@ mod tests { // 8 failures > threshold(2), so gate must be Down. assert!(matches!(gate.decide(), GateDecision::ShortCircuit { .. 
}));
     }
+
+    // ============================================================
+    // Phase 19 — transparency callback wiring
+    // ============================================================
+
+    /// Helper: build a callback that pushes events into a Mutex.
+    /// Returns the callback Arc + a clone of the same Vec for assertions.
+    fn capturing_callback() -> (
+        HealthCallback,
+        std::sync::Arc<std::sync::Mutex<Vec<MasterHealthEvent>>>,
+    ) {
+        let captured: std::sync::Arc<std::sync::Mutex<Vec<MasterHealthEvent>>> =
+            std::sync::Arc::new(std::sync::Mutex::new(Vec::new()));
+        let captured_for_cb = std::sync::Arc::clone(&captured);
+        let cb: HealthCallback = std::sync::Arc::new(move |ev| {
+            captured_for_cb.lock().unwrap().push(ev);
+        });
+        (cb, captured)
+    }
+
+    #[test]
+    fn test_phase19_two_failures_fire_offline_event_single_failure_silent() {
+        // Advisor-mandated test #1: a single failure must NOT fire the
+        // callback (the gate stays Up). The second failure that crosses
+        // the threshold fires `OfflineFallbackActive` exactly once.
+        let (cb, captured) = capturing_callback();
+        let gate = HealthGate::with_callback(Duration::from_secs(30), cb);
+
+        gate.record_failure();
+        // After one failure: gate still Up, no callback fired.
+        assert_eq!(
+            captured.lock().unwrap().len(),
+            0,
+            "single failure must not fire callback"
+        );
+
+        gate.record_failure();
+        // After two failures: gate flipped Down, exactly one event fired.
+        let events = captured.lock().unwrap().clone();
+        assert_eq!(events.len(), 1, "expected exactly one event, got: {:?}", events);
+        match &events[0] {
+            MasterHealthEvent::OfflineFallbackActive { reason } => {
+                assert!(
+                    reason.contains("2 consecutive"),
+                    "reason should mention failure count: {}",
+                    reason
+                );
+            }
+            other => panic!("expected OfflineFallbackActive, got {:?}", other),
+        }
+
+        // Further failures while already Down must NOT re-fire the event
+        // (compare_exchange filters the no-transition case).
+ gate.record_failure(); + gate.record_failure(); + assert_eq!( + captured.lock().unwrap().len(), + 1, + "additional failures while Down must not re-fire OfflineFallbackActive" + ); + } + + #[test] + fn test_phase19_success_after_down_fires_online() { + // Advisor-mandated test #2: when the gate is Down and a probe + // succeeds, the callback observes `Online` exactly once. + let (cb, captured) = capturing_callback(); + let gate = HealthGate::with_callback(Duration::from_secs(30), cb); + + // Trip the gate. + gate.record_failure(); + gate.record_failure(); + // One OfflineFallbackActive event so far. + assert_eq!(captured.lock().unwrap().len(), 1); + + // Success — flips Down→Up; fires Online. + gate.record_success(); + let events = captured.lock().unwrap().clone(); + assert_eq!(events.len(), 2, "expected OfflineFallbackActive + Online"); + assert!(matches!(events[1], MasterHealthEvent::Online)); + + // A second success while already Up must NOT re-fire Online. + gate.record_success(); + assert_eq!( + captured.lock().unwrap().len(), + 2, + "redundant success while Up must not re-fire Online" + ); + } + + #[test] + fn test_phase19_callback_panic_does_not_crash_caller() { + // A buggy app callback that panics must NOT crash the SDK. + // `fire_event` wraps the call in `catch_unwind` and proceeds. + let cb: HealthCallback = std::sync::Arc::new(|_ev| { + panic!("simulated app-level panic"); + }); + let gate = HealthGate::with_callback(Duration::from_secs(30), cb); + + // These calls would propagate the panic if catch_unwind weren't + // wrapping the callback. The test passes by NOT panicking. + gate.record_failure(); + gate.record_failure(); + gate.record_success(); + + // And the gate state itself remains correct: a success after a + // Down state returns to Up. 
+ assert_eq!(gate.decide(), GateDecision::Allow); + } + + #[test] + fn test_phase19_no_callback_means_silent() { + // A gate constructed via `new` (no callback) must work + // identically to pre-Phase-19 builds: state machine works, + // no events are produced anywhere. + let gate = HealthGate::new(Duration::from_secs(30)); + gate.record_failure(); + gate.record_failure(); + gate.record_success(); + // No assertion on event capture — there's no captured Vec. + // The fact that we constructed the gate with `new` (no + // callback wiring) and reached this line proves the silent + // path works. Verify final state is sane. + assert_eq!(gate.decide(), GateDecision::Allow); + } } diff --git a/crates/fula-client/src/lib.rs b/crates/fula-client/src/lib.rs index 3a233e1..53e8240 100644 --- a/crates/fula-client/src/lib.rs +++ b/crates/fula-client/src/lib.rs @@ -47,6 +47,8 @@ mod error; mod gateway_fetch; mod health_gate; mod multipart; +#[cfg(not(target_arch = "wasm32"))] +mod registry_resolver; mod types; #[cfg(not(target_arch = "wasm32"))] mod orphan_queue; @@ -77,6 +79,25 @@ pub use error::{ClientError, Result}; pub use multipart::{MultipartUpload, UploadProgress, ProgressCallback, upload_large_file, MultipartAbortGuard}; pub use types::*; +/// Phase 19 — transparency surfaces. `HealthCallback` is the closure +/// type apps wire via `Config::with_health_callback` to observe master +/// reachability transitions. `MasterHealthEvent` is the variant the +/// callback receives. Re-exported here so app-level code can construct +/// callbacks without depending on internal module paths. +pub use health_gate::{HealthCallback, MasterHealthEvent}; + +/// Phase 3.3 — cold-start hybrid resolver public API. Native-only; +/// the resolver itself is gated to `cfg(not(target_arch = "wasm32"))`. +/// The free helper `derive_user_key_from_email` is also re-exported +/// so JS / Flutter bindings can compute the user_key without holding +/// a client. 
+#[cfg(not(target_arch = "wasm32"))] +pub use registry_resolver::{ + decode_user_buckets_index, default_ipfs_gateway_urls, default_ipns_gateway_urls, + derive_user_key_from_email, fetch_cid_via_gateways, BucketEntry, GlobalUsersIndex, + ResolutionSource, ResolvedUsersIndex, ResolverConfig, UserBucketsIndex, UsersIndexResolver, +}; + /// Process-wide count of WAL append failures (F11). /// /// The WAL is the crash-recovery log for in-memory forest upserts. When diff --git a/crates/fula-client/src/registry_resolver.rs b/crates/fula-client/src/registry_resolver.rs new file mode 100644 index 0000000..f602eb1 --- /dev/null +++ b/crates/fula-client/src/registry_resolver.rs @@ -0,0 +1,1785 @@ +//! Phase 3.3 — hybrid IPNS-primary + chain-fallback resolver for +//! the master-published global users-index CID. +//! +//! Cold-start flow (per plan §3.3 step 5): +//! +//! 1. **IPNS path (primary).** Race a small fan-out of IPNS-aware +//! public gateways for `/ipns/`. Each gateway +//! resolves the IPNS NAME server-side and returns the underlying +//! dag-cbor bytes. We parse those bytes as +//! [`GlobalUsersIndex`], read the in-payload `sequence`, and +//! accept the first response whose sequence is ≥ the SDK's +//! process-wide `highest_seen_sequence` (replay defense). +//! Budget: 10 s, sequential; no per-gateway dynamic-priority +//! state (the cold-start path is one-shot — the warm-device +//! pool's state machine isn't a fit). +//! +//! 2. **Chain path (fallback).** If the IPNS path fails or times +//! out, fire one `eth_call` against the configured RPC URL for +//! `FulaUsersIndexAnchor.latest()`. The 96-byte ABI response is +//! `(bytes32 cid_digest, uint64 sequence, uint64 timestamp)`. +//! Reconstruct a CIDv1 (codec=dag-cbor 0x71, multihash=sha2-256 +//! 0x12 + the digest bytes), then iterate the same gateway list +//! fetching `/ipfs/` until one body content-addresses to +//! that CID via [`verify_cid_against_bytes`]. Parse the body as +//! 
[`GlobalUsersIndex`]; verify the in-payload `sequence` +//! matches the on-chain `sequence` and is ≥ `highest_seen_sequence`. +//! +//! 3. **Single sequence stream.** There is one monotonic `sequence` +//! field, embedded inside the CBOR payload itself. Both IPNS and +//! chain paths read it from the bytes — never from IPNS DHT +//! metadata or the chain-call return — so a compromised gateway +//! (or RPC node, or operator) can publish a fresh-but-malicious +//! *higher* sequence (closing that requires user wallets and is +//! out of scope), but **cannot regress** to a stale one. +//! +//! ## Native-only +//! +//! The resolver is gated to `cfg(not(target_arch = "wasm32"))` for +//! the same reason as `block_cache.rs` and `gateway_fetch.rs`: it +//! depends on `tokio::time::timeout`, on the `parking_lot::Mutex` +//! used internally by gateway-side code, and on +//! `verify_cid_against_bytes` (which itself is native-only because +//! it lives in `gateway_fetch.rs`). Cold-start on browser/wasm +//! surfaces [`ClientError::UsersIndexResolutionFailed`] until a +//! browser-friendly resolver lands as a follow-up. + +#![cfg(not(target_arch = "wasm32"))] + +use crate::error::ClientError; +use crate::gateway_fetch::verify_cid_against_bytes; +use bytes::Bytes; +use cid::multihash::Multihash; +use cid::Cid; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; +use std::time::Duration; + +// ============================================================ +// Public types +// ============================================================ + +/// Master's published global users-index CBOR payload. Mirrors the +/// `GlobalUsersIndex` struct in `fula-cli`'s +/// `handlers::users_index_publisher`. The two definitions must stay +/// in lockstep — see plan §3.2.a for the producer side. 
+#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
+pub struct GlobalUsersIndex {
+    pub v: u32,
+    pub sequence: u64,
+    pub updated_at_unix: u64,
+    /// `userKey_hex` (32 hex chars) → bucketsIndexCid (string).
+    /// The SDK looks up its own `userKey` here on cold-start.
+    pub users: BTreeMap<String, String>,
+}
+
+/// Master's per-user `bucketsIndex` CBOR — one per user per snapshot
+/// when their state changed. Mirrors the `UserBucketsIndex` struct
+/// in `fula-cli`'s `handlers::users_index_publisher` (the producer
+/// side; see plan §3.2.a). The two definitions must stay in lockstep.
+///
+/// Map keys are either:
+/// - 32-hex BLAKE3-derived `bucketLookupH` (Phase 1.2 blinded form)
+/// - plaintext bucket name (Phase 1.2 lazy-migration legacy form)
+///
+/// `legacy=true` distinguishes the latter so the cold-start dispatch
+/// can fall back from `index[blinded_hex]` to `index[bucket_name]`
+/// for users who haven't yet uploaded with a Phase-1.2-aware client.
+#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
+pub struct UserBucketsIndex {
+    pub v: u32,
+    pub buckets: BTreeMap<String, BucketEntry>,
+    pub updated_at_unix: u64,
+}
+
+#[derive(Clone, Debug, PartialEq, Eq, Deserialize, Serialize)]
+pub struct BucketEntry {
+    /// CIDv1 string of the user's per-bucket forest manifest.
+    pub manifest: String,
+    /// `true` ⇔ map key is the plaintext `bucket_name` (legacy
+    /// fallback). The cold-start lookup tries blinded first; on
+    /// miss it tries the plaintext name and accepts only entries
+    /// where `legacy = true`.
+    pub legacy: bool,
+}
+
+/// Result of a successful [`UsersIndexResolver::resolve`].
+#[derive(Clone, Debug)]
+pub struct ResolvedUsersIndex {
+    /// Which channel actually served the payload. Surfaced to apps
+    /// (and to Phase 19's `ReadFreshness`) so users can be told
+    /// "served from chain backup; expected staleness ≤ 12h".
+    pub source: ResolutionSource,
+    /// CID of the parsed payload.
For the chain path this is the + /// reconstructed-and-verified CID. For the IPNS path it is + /// `Cid::new_v1(0x71, sha2-256(bytes))` — synthesized from the + /// returned bytes (the IPNS path has no externally-asserted CID + /// to verify against; the gateway does the IPNS-record + /// resolution upstream). + pub cid: Cid, + /// Decoded payload. Apps walk `payload.users` to find their own + /// `userKey` → bucketsIndexCid. + pub payload: GlobalUsersIndex, + /// Raw CBOR bytes — kept so callers can persist them (Phase + /// 3.3.5 hot-start cache) without re-fetching. + pub bytes: Bytes, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum ResolutionSource { + Ipns, + Chain, + /// Phase 3.3.5 — served from the on-disk hot-start cache (the + /// resolver short-circuited IPNS + chain because the cached + /// `(cid, sequence)` was within `soft_ttl`). Apps/Phase 19's + /// `ReadFreshness` can surface this as "served from a recent + /// snapshot — last refreshed N seconds ago". + HotStartCache, +} + +/// Resolver configuration. Construct via [`UsersIndexResolver::new`]. +#[derive(Clone, Debug)] +pub struct ResolverConfig { + /// IPNS-aware gateway URL templates (each must contain `{name}`). + /// Empty = use the SDK-shipped default subset + /// ([`default_ipns_gateway_urls`]). + pub ipns_gateways: Vec, + /// `/ipfs/{cid}` gateway URL templates for the chain path's + /// CID-fetch step. Empty = use the SDK-shipped default six. + pub ipfs_gateways: Vec, + /// JSON-RPC URL for the chain anchor. Required. + pub chain_rpc_url: String, + /// `FulaUsersIndexAnchor.sol` proxy address (20 bytes hex, + /// optionally `0x`-prefixed). Required. + pub anchor_address: String, + /// IPNS NAME (libp2p public-key hash, e.g. `k51qzi5...`). + /// Required. + pub ipns_name: String, + /// Hard ceiling on the IPNS race; fall through to chain after. + /// Default 10 s per plan §3.3 step 5a. 
+ pub ipns_race_timeout: Duration, + /// Per-gateway timeout for individual fetches (both IPNS and the + /// chain path's CID-fetch step). + pub per_request_timeout: Duration, + + /// Phase 3.3.5 — soft TTL for the on-disk hot-start cache. + /// When the resolver was successfully run within this window + /// (per the cached `observed_at_unix`), `resolve()` returns the + /// cached state directly without touching IPNS or chain. + /// Beyond this, the resolver opportunistically re-runs. + /// Default: 5 minutes per plan §3.3.5 — matches the expected + /// IPNS publish cadence. + pub soft_ttl: Duration, +} + +impl ResolverConfig { + /// Default config for a given chain RPC URL, IPNS NAME, and + /// anchor address. All other fields take audit-recommended + /// defaults. + pub fn new( + chain_rpc_url: impl Into, + anchor_address: impl Into, + ipns_name: impl Into, + ) -> Self { + Self { + ipns_gateways: Vec::new(), + ipfs_gateways: Vec::new(), + chain_rpc_url: chain_rpc_url.into(), + anchor_address: anchor_address.into(), + ipns_name: ipns_name.into(), + ipns_race_timeout: Duration::from_secs(10), + per_request_timeout: Duration::from_secs(8), + soft_ttl: Duration::from_secs(300), // 5 min, matches IPNS publish cadence + } + } +} + +/// Derive the SDK-side `userKey` from a user's email address. +/// +/// Replicates the master-side identity derivation chain in +/// `fula-cli/src/state.rs::hash_user_id`: +/// +/// 1. `userId = sha256(lower(email))` — 32 bytes +/// 2. `userIdHex = hex::encode(userId)` — 64 ASCII hex chars +/// 3. `userKey = BLAKE3("fula:user_id:" || userIdHex)[..16]` — 16 bytes +/// 4. Return `hex::encode(userKey)` — 32 ASCII hex chars +/// +/// The 32-hex output matches `BucketMetadata.owner_id` on master +/// (see `fula-cli/src/state.rs:15-22`). The SDK passes this string +/// in `Config::users_index_user_key` so the cold-start path can +/// look itself up in the published `GlobalUsersIndex.users` map. 
+///
+/// This is a **free function**, not a method, so JS / Flutter
+/// bindings can compute the user_key without holding a client.
+/// Domain separator + double hashing + lowercase normalization MUST
+/// stay in lockstep with the master's `hash_user_id`; the
+/// `derive_user_key_matches_master_state_rs_algorithm` test below
+/// reproduces the master algorithm step-by-step and asserts equality.
+pub fn derive_user_key_from_email(email: &str) -> String {
+    use sha2::{Digest, Sha256};
+    let user_id_digest = Sha256::digest(email.to_lowercase().as_bytes());
+    let user_id_hex = hex::encode(user_id_digest);
+    let mut hasher = blake3::Hasher::new();
+    hasher.update(b"fula:user_id:");
+    hasher.update(user_id_hex.as_bytes());
+    hex::encode(&hasher.finalize().as_bytes()[..16])
+}
+
+/// Default IPNS-aware gateway list. Excludes
+/// `trustless-gateway.link` (only serves `/ipfs/`, not `/ipns/`).
+pub fn default_ipns_gateway_urls() -> Vec<String> {
+    vec![
+        "https://cloudflare-ipfs.com/ipns/{name}".into(),
+        "https://dweb.link/ipns/{name}".into(),
+        "https://ipfs.io/ipns/{name}".into(),
+        "https://4everland.io/ipns/{name}".into(),
+        "https://gateway.pinata.cloud/ipns/{name}".into(),
+    ]
+}
+
+/// Fetch a CID's bytes via simple sequential iteration over the
+/// configured IPFS-gateway list, verifying content-addressing on
+/// each successful response. Returns the first body whose
+/// `verify_cid_against_bytes` passes; surfaces
+/// `UsersIndexResolutionFailed` if all gateways exhaust.
+///
+/// Intentionally simpler than `GatewayPool::fetch_verified` (Phase
+/// 2.3's dynamic-priority race orchestrator). Cold-start is one-shot
+/// — the per-gateway state machine pays no benefit here, and keeping
+/// the resolver self-contained means cold-start doesn't require
+/// Phase 2.2/2.4 to be enabled.
+pub async fn fetch_cid_via_gateways(
+    cid: &Cid,
+    gateways: &[String],
+    http: &reqwest::Client,
+    per_request_timeout: Duration,
+) -> Result<Bytes, ClientError> {
+    if gateways.is_empty() {
+        return Err(ClientError::UsersIndexResolutionFailed {
+            reason: format!("no IPFS gateways configured to fetch {}", cid),
+        });
+    }
+    let cid_str = cid.to_string();
+    let mut last_err: Option<String> = None;
+    for tmpl in gateways {
+        let url = tmpl.replace("{cid}", &cid_str);
+        let resp = match tokio::time::timeout(per_request_timeout, http.get(&url).send()).await {
+            Ok(Ok(r)) => r,
+            Ok(Err(e)) => {
+                last_err = Some(format!("{} transport: {}", url, e));
+                continue;
+            }
+            Err(_) => {
+                last_err = Some(format!("{} timeout", url));
+                continue;
+            }
+        };
+        if !resp.status().is_success() {
+            last_err = Some(format!("{} HTTP {}", url, resp.status()));
+            continue;
+        }
+        let bytes = match resp.bytes().await {
+            Ok(b) => b,
+            Err(e) => {
+                last_err = Some(format!("{} body: {}", url, e));
+                continue;
+            }
+        };
+        if let Err(e) = verify_cid_against_bytes(cid, &bytes) {
+            last_err = Some(format!("{} verify: {}", url, e));
+            continue;
+        }
+        return Ok(bytes);
+    }
+    Err(ClientError::UsersIndexResolutionFailed {
+        reason: format!(
+            "CID {} unreachable across {} gateways: {}",
+            cid,
+            gateways.len(),
+            last_err.unwrap_or_else(|| "no gateways tried".into())
+        ),
+    })
+}
+
+/// Decode dag-cbor bytes as a per-user `UserBucketsIndex`. Wraps the
+/// dagcbor crate's error so callers see a single ClientError shape.
+pub fn decode_user_buckets_index(bytes: &[u8]) -> Result<UserBucketsIndex, ClientError> {
+    serde_ipld_dagcbor::from_slice(bytes).map_err(|e| {
+        ClientError::UsersIndexResolutionFailed {
+            reason: format!("UserBucketsIndex CBOR decode: {}", e),
+        }
+    })
+}
+
+/// Default `/ipfs/{cid}` gateway list — same six as the warm-device
+/// pool ships in `gateway_fetch::default_gateway_urls`. Re-declared
+/// here so the resolver's chain path doesn't need to depend on the
+/// pool's state machine.
+pub fn default_ipfs_gateway_urls() -> Vec { + vec![ + "https://cloudflare-ipfs.com/ipfs/{cid}".into(), + "https://dweb.link/ipfs/{cid}".into(), + "https://ipfs.io/ipfs/{cid}".into(), + "https://trustless-gateway.link/ipfs/{cid}".into(), + "https://4everland.io/ipfs/{cid}".into(), + "https://gateway.pinata.cloud/ipfs/{cid}".into(), + ] +} + +// ============================================================ +// Resolver +// ============================================================ + +#[derive(Debug)] +pub struct UsersIndexResolver { + config: ResolverConfig, + http: reqwest::Client, + /// Process-wide replay defense — only ever increases. SDK callers + /// can seed it from a persisted hot-start cache (Phase 3.3.5) at + /// construction time via [`UsersIndexResolver::new_with_cache`]; + /// every successful `resolve` then bumps it. + highest_seen_sequence: AtomicU64, + /// Pre-validated 20-byte anchor address. Cached so each `resolve` + /// doesn't re-parse the hex. + anchor_address_bytes: [u8; 20], + /// Phase 3.3.5 — optional hot-start persistence layer. When set, + /// `resolve()` reads cached `(cid, sequence, observed_at_unix)` + /// from the cache's METADATA table on the first call AND writes + /// the freshly-resolved state on every successful resolve. This + /// makes the replay-defense floor survive SDK restarts AND lets + /// the resolver short-circuit IPNS+chain when within `soft_ttl`. + cache: Option>, +} + +impl UsersIndexResolver { + /// Build a resolver. Validates `anchor_address` is 20 bytes hex + /// up-front so misconfiguration fails at construction time, not + /// on the first cold-start. 
+ pub fn new(config: ResolverConfig) -> Result { + if config.chain_rpc_url.is_empty() { + return Err(ClientError::Config( + "registry resolver: chain_rpc_url is empty".into(), + )); + } + if config.ipns_name.is_empty() { + return Err(ClientError::Config( + "registry resolver: ipns_name is empty".into(), + )); + } + let anchor_address_bytes = parse_anchor_address(&config.anchor_address)?; + Ok(Self { + config, + http: reqwest::Client::new(), + highest_seen_sequence: AtomicU64::new(0), + anchor_address_bytes, + cache: None, + }) + } + + /// Phase 3.3.5 — construct a resolver wired to a persistent + /// hot-start cache. On construction the resolver: + /// 1. Reads `(cid, sequence, observed_at_unix)` from the + /// cache's METADATA table. + /// 2. Seeds the replay-defense floor from the cached + /// sequence — a malicious gateway cannot regress to a + /// stale payload across SDK restarts. + /// + /// On every successful `resolve` the resolver: + /// 1. Writes the new `(cid, sequence, now)` to METADATA. + /// 2. Inserts the bytes into BLOCKS (so a future hot-start + /// can serve the payload entirely from disk). + /// + /// The cache load/store paths are **best-effort**: failures + /// log at `warn!` and don't propagate, so a corrupted or + /// unwriteable cache never blocks SDK functionality. (The + /// resolver still works, just without hot-start.) + pub fn new_with_cache( + config: ResolverConfig, + cache: Arc, + ) -> Result { + let mut resolver = Self::new(config)?; + // Seed the floor from cached state, if any. Best-effort — + // a corrupt or empty cache gives us the default floor (0). 
+ match cache.load_users_index_state() { + Ok(Some((_cid, sequence, _observed))) => { + resolver.bump_seen_sequence(sequence); + tracing::debug!( + seeded_sequence = sequence, + "registry_resolver: hot-start floor seeded from cache" + ); + } + Ok(None) => { + tracing::debug!("registry_resolver: no hot-start state cached (fresh)"); + } + Err(e) => { + tracing::warn!( + error = %e, + "registry_resolver: hot-start cache load failed; floor stays at 0 (best-effort)" + ); + } + } + resolver.cache = Some(cache); + Ok(resolver) + } + + /// Test/integration hook — production callers update via + /// `resolve()`'s side-effect of calling `bump_seen_sequence`. + /// Marked `pub(crate)` so tests can seed the floor without a + /// stable public API. + #[cfg(test)] + pub(crate) fn set_highest_seen_sequence(&self, seq: u64) { + self.bump_seen_sequence(seq); + } + + /// Read the current replay-defense floor. + pub fn highest_seen_sequence(&self) -> u64 { + self.highest_seen_sequence.load(Ordering::Acquire) + } + + /// Read-only access to the resolver's HTTP client. The cold-start + /// path on `EncryptedClient` reuses this client for the + /// bucketsIndex + manifest fetches so connection pooling stays + /// intact across all of the cold-start request burst. + pub fn http_client(&self) -> &reqwest::Client { + &self.http + } + + /// Read-only access to the resolver's per-request timeout — + /// reused by the cold-start path's gateway fetches for the + /// bucketsIndex CBOR and the forest manifest, so a single config + /// knob governs all of cold-start. + pub fn per_request_timeout(&self) -> Duration { + self.config.per_request_timeout + } + + /// Read-only access to the IPFS gateway list. Cold-start uses + /// this same list (rather than the warm-device pool's) so it + /// stays self-contained and works without Phase 2.2/2.4 enabled. 
+ pub fn ipfs_gateways(&self) -> Vec { + if self.config.ipfs_gateways.is_empty() { + default_ipfs_gateway_urls() + } else { + self.config.ipfs_gateways.clone() + } + } + + /// Atomic monotonic-max — only ever increases. Lock-free CAS loop. + fn bump_seen_sequence(&self, seq: u64) { + let mut current = self.highest_seen_sequence.load(Ordering::Acquire); + while seq > current { + match self.highest_seen_sequence.compare_exchange_weak( + current, + seq, + Ordering::AcqRel, + Ordering::Acquire, + ) { + Ok(_) => break, + Err(observed) => current = observed, + } + } + } + + /// Hybrid resolve. + /// + /// Order of operations: + /// 0. **Hot-start short-circuit (Phase 3.3.5).** If a cache is + /// configured AND has a `(cid, sequence, observed_at)` row + /// AND `now - observed_at < soft_ttl`, return the cached + /// state directly. Bytes come from BLOCKS if cached, + /// otherwise via gateway race for the cached cid. Sequence + /// is re-checked against the in-memory floor for defense. + /// 1. Try IPNS for `ipns_race_timeout`. + /// 2. Fall through to chain on timeout / all-gateway failure + /// / replay-rejection. + /// 3. On success (any path), write `(cid, sequence, now)` to + /// METADATA and the bytes to BLOCKS — best-effort, so a + /// cache write failure never aborts the resolve. + pub async fn resolve(&self) -> Result { + // Step 0 — hot-start short-circuit. + if let Some(resolved) = self.try_hot_start().await { + return Ok(resolved); + } + + // Steps 1-2 — IPNS-then-chain. + let resolved = self.resolve_via_network().await?; + + // Step 3 — write-back. Best-effort, synchronous-from-async + // so the next call observes the freshly-written cache without + // racing a spawned background task. Cold-start is a once- + // per-session event; the few hundred microseconds for the + // redb txns are negligible vs. the IPNS+chain budget we just + // paid. 
+ self.persist_to_cache(&resolved).await; + + Ok(resolved) + } + + /// Phase 3.3.5 — try to serve from the persistent cache without + /// touching the network. Returns `Some(ResolvedUsersIndex)` when + /// a fresh-enough cached state exists AND the bytes are + /// available (BLOCKS hit OR a fast gateway-race fetch for the + /// cached cid succeeds). Returns `None` to indicate "fall + /// through to full IPNS+chain resolve." + /// + /// A `None` return is silent — the network path takes over. + async fn try_hot_start(&self) -> Option { + let cache = self.cache.as_ref()?; + let (cached_cid, cached_seq, observed_at) = match cache.load_users_index_state() { + Ok(Some(triple)) => triple, + Ok(None) => return None, + Err(e) => { + tracing::warn!(error = %e, "hot-start: cache load failed"); + return None; + } + }; + + // TTL check. Use wall-clock (matches what the writer used). + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_secs()) + .unwrap_or(0); + if now.saturating_sub(observed_at) >= self.config.soft_ttl.as_secs() { + tracing::debug!( + age_secs = now.saturating_sub(observed_at), + ttl_secs = self.config.soft_ttl.as_secs(), + "hot-start: cache entry beyond TTL; re-resolving" + ); + return None; + } + + // Replay-defense check on the cached sequence itself — + // defends against a corrupt/tampered METADATA row. + let seen = self.highest_seen_sequence(); + if cached_seq < seen { + tracing::warn!( + cached = cached_seq, + seen, + "hot-start: cached sequence < in-memory floor; ignoring (corrupt or rolled-back cache)" + ); + return None; + } + + // Fetch bytes — BLOCKS first, then gateway race for the + // cached cid as a network fallback. 
+ let bytes = match cache.get(&cached_cid) { + Ok(Some(b)) => { + tracing::debug!(cid = %cached_cid, "hot-start: BLOCKS hit"); + b + } + Ok(None) => { + // BLOCKS miss: cached metadata says "we know the + // CID" but we don't have the bytes (LRU evicted, or + // the prior resolve failed mid-write). Fetch via + // gateway race for the cached cid; cheaper than the + // full IPNS dance because we skip the DHT lookup. + let gateways = self.ipfs_gateways(); + match fetch_cid_via_gateways( + &cached_cid, + &gateways, + &self.http, + self.config.per_request_timeout, + ) + .await + { + Ok(b) => { + tracing::debug!(cid = %cached_cid, "hot-start: BLOCKS miss → gateway race"); + // Repopulate BLOCKS for the next read. + if let Err(e) = cache.put(&cached_cid, &b).await { + tracing::debug!(error = %e, "hot-start: BLOCKS put failed (best-effort)"); + } + b + } + Err(e) => { + tracing::debug!( + error = %e, + "hot-start: BLOCKS miss AND gateway fetch failed; falling through" + ); + return None; + } + } + } + Err(e) => { + tracing::warn!(error = %e, "hot-start: BLOCKS lookup failed"); + return None; + } + }; + + // Decode + cross-check sequence. The bytes content-address + // to `cached_cid` (BLOCKS hit) or were verified by the + // gateway-fetch (CID match guaranteed by + // `verify_cid_against_bytes`). Decode failure here is + // silent — fall through to network path so a fresh resolve + // can heal a poisoned cache. + let payload = match decode_users_index_cbor(&bytes) { + Ok(p) => p, + Err(e) => { + tracing::warn!(error = %e, "hot-start: cached CBOR parse failed; re-resolving"); + return None; + } + }; + if payload.sequence != cached_seq { + tracing::warn!( + payload_seq = payload.sequence, + metadata_seq = cached_seq, + "hot-start: payload sequence != metadata sequence; cache inconsistent, re-resolving" + ); + return None; + } + + // All checks passed. Bump the in-memory floor to match + // (no-op if already >= cached_seq) and return. 
+ self.bump_seen_sequence(payload.sequence); + Some(ResolvedUsersIndex { + source: ResolutionSource::HotStartCache, + cid: cached_cid, + payload, + bytes, + }) + } + + /// Network resolve path (IPNS-then-chain). Extracted from the + /// old `resolve()` body so the hot-start short-circuit can fall + /// through to it cleanly. + async fn resolve_via_network(&self) -> Result { + let ipns_outcome = tokio::time::timeout( + self.config.ipns_race_timeout, + self.try_ipns(), + ) + .await; + + match ipns_outcome { + Ok(Ok(resolved)) => { + self.bump_seen_sequence(resolved.payload.sequence); + return Ok(resolved); + } + Ok(Err(e)) => { + tracing::debug!( + error = %e, + "registry_resolver: IPNS path exhausted; falling back to chain" + ); + } + Err(_) => { + tracing::debug!( + timeout_secs = self.config.ipns_race_timeout.as_secs(), + "registry_resolver: IPNS timed out; falling back to chain" + ); + } + } + + match self.try_chain().await { + Ok(resolved) => { + self.bump_seen_sequence(resolved.payload.sequence); + Ok(resolved) + } + Err(e) => Err(ClientError::UsersIndexResolutionFailed { + reason: format!("IPNS exhausted; chain: {}", e), + }), + } + } + + /// Phase 3.3.5 — best-effort write of the just-resolved state to + /// the METADATA table + BLOCKS. Failures log and proceed; the + /// caller already has the resolved value, so cache hiccups never + /// block SDK functionality. + /// + /// Synchronous-from-async (no `tokio::spawn`) so the next + /// `resolve()` call observes the freshly-written cache without + /// racing a background task — important because tests using + /// `Mock::expect(N)` would otherwise be flaky on slow CI hosts. + /// Cost is hundreds of microseconds for the two redb txns; + /// negligible vs. the network budget the caller just spent. 
+ async fn persist_to_cache(&self, resolved: &ResolvedUsersIndex) { + let Some(cache) = self.cache.as_ref() else { + return; + }; + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_secs()) + .unwrap_or(0); + if let Err(e) = cache.store_users_index_state(&resolved.cid, resolved.payload.sequence, now) + { + tracing::warn!( + error = %e, + "registry_resolver: hot-start metadata write failed (best-effort)" + ); + } + if let Err(e) = cache.put(&resolved.cid, &resolved.bytes).await { + // BlockTooLarge is the expected failure for huge global + // CBORs (>cache budget); log at debug, not warn. + tracing::debug!( + error = %e, + "registry_resolver: hot-start BLOCKS put failed (best-effort)" + ); + } + } + + /// IPNS leg — sequential per-gateway fan-out. The first gateway + /// whose body parses + sequence-passes wins. We don't run them + /// in parallel because: + /// - cold-start is rare (once per fresh-device sign-in), + /// - five HEAD-of-line requests waste the user's bandwidth, + /// - the outer 10-s budget bounds the worst case anyway. + async fn try_ipns(&self) -> Result { + let gateways: Vec = if self.config.ipns_gateways.is_empty() { + default_ipns_gateway_urls() + } else { + self.config.ipns_gateways.clone() + }; + + let mut last_err: Option = None; + for tmpl in &gateways { + let url = tmpl.replace("{name}", &self.config.ipns_name); + match self.fetch_with_timeout(&url).await { + Ok(bytes) => match self.parse_and_validate(bytes, ResolutionSource::Ipns) { + Ok(resolved) => return Ok(resolved), + Err(e) => { + // Replay-rejected or parse-failed bodies are + // not a fatal error; another gateway might + // serve a fresher record. 
+ tracing::debug!( + url = %url, error = %e, + "registry_resolver: IPNS body rejected; trying next gateway" + ); + last_err = Some(e.to_string()); + } + }, + Err(e) => { + last_err = Some(e.to_string()); + tracing::debug!(url = %url, error = %e, "registry_resolver: IPNS fetch failed"); + } + } + } + Err(ClientError::UsersIndexResolutionFailed { + reason: format!( + "IPNS exhausted across {} gateways: {}", + gateways.len(), + last_err.unwrap_or_else(|| "no gateways tried".into()) + ), + }) + } + + /// Chain leg — single eth_call to `latest()`, then iterate IPFS + /// gateways for the resulting CID. + async fn try_chain(&self) -> Result { + // Step 1 — eth_call. + let (cid_digest, on_chain_seq) = self.eth_call_latest().await?; + if on_chain_seq < self.highest_seen_sequence() { + return Err(ClientError::SequenceRegression { + observed: on_chain_seq, + highest_seen: self.highest_seen_sequence(), + channel: "chain.latest()".into(), + }); + } + + // Step 2 — reconstruct CID. dag-cbor codec (0x71) + + // sha2-256 multihash (0x12) + the on-chain digest. + let mh = Multihash::<64>::wrap(MULTIHASH_SHA2_256, &cid_digest).map_err(|e| { + ClientError::UsersIndexResolutionFailed { + reason: format!("invalid chain CID digest: {}", e), + } + })?; + let cid = Cid::new_v1(CODEC_DAG_CBOR, mh); + + // Step 3 — iterate IPFS gateways until one body + // content-addresses to `cid`. + let gateways: Vec = if self.config.ipfs_gateways.is_empty() { + default_ipfs_gateway_urls() + } else { + self.config.ipfs_gateways.clone() + }; + let mut last_err: Option = None; + for tmpl in &gateways { + let url = tmpl.replace("{cid}", &cid.to_string()); + let bytes = match self.fetch_with_timeout(&url).await { + Ok(b) => b, + Err(e) => { + last_err = Some(e.to_string()); + continue; + } + }; + if let Err(e) = verify_cid_against_bytes(&cid, &bytes) { + last_err = Some(format!("verify failed at {}: {}", url, e)); + continue; + } + // Step 4 — parse + cross-validate sequence. 
+ let payload = decode_users_index_cbor(&bytes).map_err(|e| { + ClientError::UsersIndexResolutionFailed { + reason: format!("chain-fetched payload parse: {}", e), + } + })?; + if payload.sequence != on_chain_seq { + return Err(ClientError::UsersIndexResolutionFailed { + reason: format!( + "in-CBOR sequence {} != on-chain sequence {} (anomaly: tamper or RPC inconsistency)", + payload.sequence, on_chain_seq + ), + }); + } + return Ok(ResolvedUsersIndex { + source: ResolutionSource::Chain, + cid, + payload, + bytes, + }); + } + Err(ClientError::UsersIndexResolutionFailed { + reason: format!( + "chain CID {} unreachable across {} gateways: {}", + cid, + gateways.len(), + last_err.unwrap_or_else(|| "no gateways tried".into()) + ), + }) + } + + /// Issue the `latest()` eth_call and parse the 96-byte response. + /// Self-contained: assembles the JSON-RPC envelope manually, no + /// dependency on a full ethers-rs client. + async fn eth_call_latest(&self) -> Result<([u8; 32], u64), ClientError> { + let calldata = format!("0x{}", hex::encode(SELECTOR_LATEST)); + let to_addr = format!("0x{}", hex::encode(self.anchor_address_bytes)); + let body = serde_json::json!({ + "jsonrpc": "2.0", + "method": "eth_call", + "params": [{ "to": to_addr, "data": calldata }, "latest"], + "id": 1, + }); + + let resp = tokio::time::timeout( + self.config.per_request_timeout, + self.http + .post(&self.config.chain_rpc_url) + .json(&body) + .send(), + ) + .await + .map_err(|_| ClientError::UsersIndexResolutionFailed { + reason: format!( + "chain RPC timeout after {}s", + self.config.per_request_timeout.as_secs() + ), + })? 
+ .map_err(|e| ClientError::UsersIndexResolutionFailed { + reason: format!("chain RPC transport: {}", e), + })?; + + if !resp.status().is_success() { + return Err(ClientError::UsersIndexResolutionFailed { + reason: format!("chain RPC HTTP {}", resp.status()), + }); + } + let json: serde_json::Value = + resp.json().await.map_err(|e| ClientError::UsersIndexResolutionFailed { + reason: format!("chain RPC response parse: {}", e), + })?; + if let Some(err) = json.get("error") { + return Err(ClientError::UsersIndexResolutionFailed { + reason: format!("chain RPC error: {}", err), + }); + } + let result_hex = json + .get("result") + .and_then(|v| v.as_str()) + .ok_or_else(|| ClientError::UsersIndexResolutionFailed { + reason: "chain RPC: missing result".into(), + })?; + let result_hex = result_hex.strip_prefix("0x").unwrap_or(result_hex); + let raw = + hex::decode(result_hex).map_err(|e| ClientError::UsersIndexResolutionFailed { + reason: format!("chain RPC: hex decode result: {}", e), + })?; + parse_latest_response(&raw) + } + + /// Single-gateway HTTP GET with `per_request_timeout`. Returns + /// raw body on 2xx, error otherwise. Doesn't touch the gateway- + /// pool's dynamic-priority state machine — this is one-shot + /// cold-start, not the ongoing warm-device hot path. + async fn fetch_with_timeout(&self, url: &str) -> Result { + let resp = tokio::time::timeout( + self.config.per_request_timeout, + self.http.get(url).send(), + ) + .await + .map_err(|_| ClientError::UsersIndexResolutionFailed { + reason: format!("HTTP timeout: {}", url), + })? 
+ .map_err(|e| ClientError::UsersIndexResolutionFailed { + reason: format!("HTTP transport ({}): {}", url, e), + })?; + if !resp.status().is_success() { + return Err(ClientError::UsersIndexResolutionFailed { + reason: format!("HTTP {} from {}", resp.status(), url), + }); + } + resp.bytes() + .await + .map_err(|e| ClientError::UsersIndexResolutionFailed { + reason: format!("HTTP body read ({}): {}", url, e), + }) + } + + /// Parse + validate IPNS-fetched bytes. Synthesizes the CID + /// from the bytes (no external CID to verify against on the + /// IPNS path; the gateway did the IPNS-record resolution + /// upstream — the security boundary here is the in-CBOR + /// `sequence` field, not the bytes-to-CID hash). + fn parse_and_validate( + &self, + bytes: Bytes, + source: ResolutionSource, + ) -> Result { + let payload = decode_users_index_cbor(&bytes).map_err(|e| { + ClientError::UsersIndexResolutionFailed { + reason: format!("CBOR decode: {}", e), + } + })?; + let seen = self.highest_seen_sequence(); + if payload.sequence < seen { + return Err(ClientError::SequenceRegression { + observed: payload.sequence, + highest_seen: seen, + channel: format!("{:?}", source), + }); + } + let cid = synthesize_cid_from_bytes(&bytes); + Ok(ResolvedUsersIndex { + source, + cid, + payload, + bytes, + }) + } +} + +// ============================================================ +// Helpers +// ============================================================ + +/// Multihash code for sha2-256 (0x12). +const MULTIHASH_SHA2_256: u64 = 0x12; +/// IPLD codec for dag-cbor (0x71). +const CODEC_DAG_CBOR: u64 = 0x71; + +/// `keccak256("latest()")[..4]`. Hardcoded so the production build +/// has zero crypto dependency for this constant. +/// +/// MUST stay in sync with `tests::abi_selector_latest_matches_keccak256` +/// — that test is the **source of truth**, this constant is just the +/// cache. 
Do not delete the test "because it's redundant"; without it, +/// a typo here goes unnoticed until the SDK silently calls the wrong +/// 4-byte selector on the deployed `FulaUsersIndexAnchor`. +const SELECTOR_LATEST: [u8; 4] = [0x52, 0xbf, 0xe7, 0x89]; + +/// Parse a 0x-prefixed-or-not 40-char hex address into 20 bytes. +fn parse_anchor_address(s: &str) -> Result<[u8; 20], ClientError> { + let s = s.strip_prefix("0x").unwrap_or(s); + let bytes = hex::decode(s).map_err(|e| { + ClientError::Config(format!("registry resolver: invalid anchor_address hex: {}", e)) + })?; + if bytes.len() != 20 { + return Err(ClientError::Config(format!( + "registry resolver: anchor_address must be 20 bytes, got {}", + bytes.len() + ))); + } + let mut out = [0u8; 20]; + out.copy_from_slice(&bytes); + Ok(out) +} + +/// Parse the 96-byte ABI-encoded return of `latest()`. +/// Layout (Solidity packs `uint64` right-aligned within a 32-byte slot): +/// bytes[0..32] = cid_digest (full 32 bytes) +/// bytes[32..64] = sequence (u64 BE in last 8 bytes) +/// bytes[64..96] = updatedAt (u64 BE in last 8 bytes) — **dropped** +/// +/// We deliberately drop `updatedAt` here: nothing in the SDK's +/// security model depends on it (sequence is the security boundary, +/// and `block.timestamp` is miner-influenceable on EVM chains anyway). +/// Returning a richer tuple would invite callers to make decisions on +/// it; keeping the parser narrow forces the right shape. +fn parse_latest_response(raw: &[u8]) -> Result<([u8; 32], u64), ClientError> { + if raw.len() < 96 { + return Err(ClientError::UsersIndexResolutionFailed { + reason: format!( + "chain `latest()` returned {} bytes (expected ≥ 96)", + raw.len() + ), + }); + } + let mut cid_digest = [0u8; 32]; + cid_digest.copy_from_slice(&raw[0..32]); + // u64 lives in the last 8 bytes of the 32-byte slot. 
+ let mut seq_be = [0u8; 8]; + seq_be.copy_from_slice(&raw[32 + 24..32 + 32]); + let sequence = u64::from_be_bytes(seq_be); + Ok((cid_digest, sequence)) +} + +/// Decode dag-cbor bytes as a `GlobalUsersIndex`. Wraps the +/// ipld-dagcbor crate's error in our own typed error. +fn decode_users_index_cbor(bytes: &[u8]) -> Result { + serde_ipld_dagcbor::from_slice(bytes).map_err(|e| e.to_string()) +} + +/// Synthesize a CIDv1 (dag-cbor + sha2-256) from a body. Used for +/// the IPNS path's reported `ResolvedUsersIndex.cid` so callers can +/// use it as a cache key. NOT a security claim — IPNS bytes are +/// trusted via the in-payload `sequence`, not via this hash. +fn synthesize_cid_from_bytes(bytes: &[u8]) -> Cid { + use sha2::{Digest, Sha256}; + let mut hasher = Sha256::new(); + hasher.update(bytes); + let digest = hasher.finalize(); + // wrap() can only fail if digest is wrong size; sha2-256 is + // always exactly 32 bytes so unwrap is safe. + let mh = Multihash::<64>::wrap(MULTIHASH_SHA2_256, &digest).expect("32-byte sha2 digest"); + Cid::new_v1(CODEC_DAG_CBOR, mh) +} + +// ============================================================ +// Tests +// ============================================================ + +#[cfg(test)] +mod tests { + use super::*; + use sha3::{Digest, Keccak256}; + use wiremock::matchers::{method, path}; + use wiremock::{Mock, MockServer, ResponseTemplate}; + + /// The hardcoded `SELECTOR_LATEST` MUST equal the canonical + /// `keccak256("latest()")[..4]`. If a future refactor renames the + /// solidity function and someone forgets to update the constant, + /// this test catches it before the SDK silently calls the wrong + /// selector against the deployed contract. 
+ #[test] + fn abi_selector_latest_matches_keccak256() { + let mut hasher = Keccak256::new(); + hasher.update(b"latest()"); + let full = hasher.finalize(); + let expected: [u8; 4] = [full[0], full[1], full[2], full[3]]; + assert_eq!( + SELECTOR_LATEST, expected, + "SELECTOR_LATEST drifted from keccak256(\"latest()\")[..4]: \ + expected 0x{:02x}{:02x}{:02x}{:02x}, got 0x{:02x}{:02x}{:02x}{:02x}", + expected[0], expected[1], expected[2], expected[3], + SELECTOR_LATEST[0], SELECTOR_LATEST[1], SELECTOR_LATEST[2], SELECTOR_LATEST[3] + ); + } + + /// Build a syntactically-valid CBOR-encoded payload for tests. + fn make_payload_cbor(sequence: u64) -> (Bytes, GlobalUsersIndex) { + let payload = GlobalUsersIndex { + v: 1, + sequence, + updated_at_unix: 1_700_000_000, + users: BTreeMap::new(), + }; + let bytes = serde_ipld_dagcbor::to_vec(&payload).expect("encode"); + (Bytes::from(bytes), payload) + } + + fn fixture_address() -> String { + // 20-byte zero address with 0x prefix. parse_anchor_address + // accepts both forms. + "0x0000000000000000000000000000000000000001".to_string() + } + + fn fixture_ipns_name() -> String { + // Real-shape libp2p public key hash (b58btc-encoded ed25519); + // resolver doesn't validate the key format, just substitutes. 
+ "k51qzi5uqu5dh-test".to_string() + } + + #[test] + fn parse_anchor_address_accepts_with_or_without_0x() { + let with_prefix = parse_anchor_address("0x0000000000000000000000000000000000000001") + .expect("with 0x"); + let without = parse_anchor_address("0000000000000000000000000000000000000001") + .expect("without 0x"); + assert_eq!(with_prefix, without); + assert_eq!(with_prefix[19], 1); + for &b in &with_prefix[..19] { + assert_eq!(b, 0); + } + } + + #[test] + fn parse_anchor_address_rejects_wrong_length() { + assert!(parse_anchor_address("0xdeadbeef").is_err()); + assert!(parse_anchor_address("0x").is_err()); + assert!(parse_anchor_address("not-hex").is_err()); + } + + #[test] + fn parse_latest_response_extracts_correct_fields() { + // Build a 96-byte response: digest = 0xff*32, sequence = 42, ts = 100. + let mut raw = vec![0u8; 96]; + for i in 0..32 { + raw[i] = 0xff; + } + raw[32 + 24..32 + 32].copy_from_slice(&42u64.to_be_bytes()); + raw[64 + 24..64 + 32].copy_from_slice(&100u64.to_be_bytes()); + + let (digest, seq) = parse_latest_response(&raw).expect("parse"); + assert_eq!(digest, [0xff; 32]); + assert_eq!(seq, 42); + } + + #[test] + fn parse_latest_response_rejects_short_input() { + let short = vec![0u8; 95]; + assert!(parse_latest_response(&short).is_err()); + } + + #[test] + fn synthesize_cid_is_deterministic_and_dagcbor_sha256() { + let bytes = b"some payload bytes"; + let c1 = synthesize_cid_from_bytes(bytes); + let c2 = synthesize_cid_from_bytes(bytes); + assert_eq!(c1, c2, "synthesis is deterministic"); + assert_eq!(c1.codec(), CODEC_DAG_CBOR); + assert_eq!(c1.hash().code(), MULTIHASH_SHA2_256); + assert_eq!(c1.hash().digest().len(), 32); + } + + #[test] + fn resolver_new_rejects_empty_rpc_url() { + let mut cfg = ResolverConfig::new("", fixture_address(), fixture_ipns_name()); + let err = UsersIndexResolver::new(cfg.clone()).unwrap_err(); + assert!(matches!(err, ClientError::Config(_))); + cfg.chain_rpc_url = "https://rpc.example".into(); + 
cfg.ipns_name = "".into(); + let err = UsersIndexResolver::new(cfg).unwrap_err(); + assert!(matches!(err, ClientError::Config(_))); + } + + #[test] + fn resolver_new_rejects_bad_anchor_address() { + let cfg = ResolverConfig::new( + "https://rpc.example", + "0xdeadbeef", // too short + fixture_ipns_name(), + ); + let err = UsersIndexResolver::new(cfg).unwrap_err(); + assert!(matches!(err, ClientError::Config(_))); + } + + #[tokio::test] + async fn resolve_via_ipns_succeeds_when_first_gateway_serves_valid_payload() { + let (cbor, _) = make_payload_cbor(7); + let mock = MockServer::start().await; + let url_path = format!("/ipns/{}", fixture_ipns_name()); + Mock::given(method("GET")) + .and(path(url_path)) + .respond_with(ResponseTemplate::new(200).set_body_bytes(cbor.as_ref())) + .mount(&mock) + .await; + + let mut cfg = ResolverConfig::new( + "https://chain.example/rpc", // never called on success + fixture_address(), + fixture_ipns_name(), + ); + cfg.ipns_gateways = vec![format!("{}/ipns/{{name}}", mock.uri())]; + cfg.ipns_race_timeout = Duration::from_secs(5); + cfg.per_request_timeout = Duration::from_secs(2); + + let resolver = UsersIndexResolver::new(cfg).expect("new"); + let r = resolver.resolve().await.expect("resolve"); + assert_eq!(r.source, ResolutionSource::Ipns); + assert_eq!(r.payload.sequence, 7); + assert_eq!(resolver.highest_seen_sequence(), 7); + } + + #[tokio::test] + async fn resolve_falls_through_to_chain_when_ipns_rejected_for_sequence_regression() { + // Setup: IPNS returns seq=3, but the resolver's floor is + // already at 5 (apps seeded it from a hot-start cache). The + // IPNS payload is replay-rejected. Chain returns seq=10, + // which is accepted. Resolver returns the chain payload. 
+ let (ipns_cbor, _) = make_payload_cbor(3); + let (chain_cbor, _) = make_payload_cbor(10); + + let ipns = MockServer::start().await; + let chain_rpc = MockServer::start().await; + let chain_gw = MockServer::start().await; + + // IPNS gateway → seq=3 body (will be rejected as regression). + Mock::given(method("GET")) + .and(path(format!("/ipns/{}", fixture_ipns_name()))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(ipns_cbor.as_ref())) + .mount(&ipns) + .await; + + // Compute the chain CID from the chain_cbor bytes so we can + // mock the gateway response correctly. The eth_call returns + // the digest; the gateway serves bytes that hash to it. + let chain_cid = synthesize_cid_from_bytes(&chain_cbor); + let chain_digest = chain_cid.hash().digest(); + + // Chain RPC mock — return the digest + seq=10 + ts=anything. + let mut raw = vec![0u8; 96]; + raw[0..32].copy_from_slice(chain_digest); + raw[32 + 24..32 + 32].copy_from_slice(&10u64.to_be_bytes()); + raw[64 + 24..64 + 32].copy_from_slice(&12345u64.to_be_bytes()); + let result_hex = format!("0x{}", hex::encode(&raw)); + Mock::given(method("POST")) + .respond_with( + ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "jsonrpc": "2.0", + "id": 1, + "result": result_hex, + })), + ) + .mount(&chain_rpc) + .await; + + // IPFS gateway for the chain CID → return chain_cbor bytes. 
+ let cid_str = chain_cid.to_string(); + Mock::given(method("GET")) + .and(path(format!("/ipfs/{}", cid_str))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(chain_cbor.as_ref())) + .mount(&chain_gw) + .await; + + let mut cfg = ResolverConfig::new( + chain_rpc.uri(), + fixture_address(), + fixture_ipns_name(), + ); + cfg.ipns_gateways = vec![format!("{}/ipns/{{name}}", ipns.uri())]; + cfg.ipfs_gateways = vec![format!("{}/ipfs/{{cid}}", chain_gw.uri())]; + cfg.ipns_race_timeout = Duration::from_secs(2); + cfg.per_request_timeout = Duration::from_secs(2); + + let resolver = UsersIndexResolver::new(cfg).expect("new"); + // Seed the floor to 5 so the IPNS seq=3 is rejected. + resolver.set_highest_seen_sequence(5); + + let r = resolver.resolve().await.expect("resolve"); + assert_eq!(r.source, ResolutionSource::Chain); + assert_eq!(r.payload.sequence, 10); + assert_eq!(resolver.highest_seen_sequence(), 10); + } + + #[tokio::test] + async fn resolve_returns_error_when_both_paths_fail() { + // IPNS gateway returns 503; chain RPC returns malformed JSON. + // Resolver surfaces UsersIndexResolutionFailed. + let ipns = MockServer::start().await; + let chain_rpc = MockServer::start().await; + + Mock::given(method("GET")) + .respond_with(ResponseTemplate::new(503)) + .mount(&ipns) + .await; + Mock::given(method("POST")) + .respond_with(ResponseTemplate::new(500).set_body_string("not json")) + .mount(&chain_rpc) + .await; + + let mut cfg = ResolverConfig::new( + chain_rpc.uri(), + fixture_address(), + fixture_ipns_name(), + ); + cfg.ipns_gateways = vec![format!("{}/ipns/{{name}}", ipns.uri())]; + cfg.ipns_race_timeout = Duration::from_secs(2); + cfg.per_request_timeout = Duration::from_secs(2); + + let resolver = UsersIndexResolver::new(cfg).expect("new"); + let err = resolver.resolve().await.expect_err("both fail"); + assert!( + matches!(err, ClientError::UsersIndexResolutionFailed { .. 
}), + "expected UsersIndexResolutionFailed, got {:?}", + err + ); + } + + #[tokio::test] + async fn resolve_chain_path_rejects_cid_digest_mismatch() { + // The chain returns digest D, but the gateway serves bytes + // whose sha2-256 != D. verify_cid_against_bytes fails and + // the resolver should NOT accept the payload — surfaces an + // UsersIndexResolutionFailed mentioning verify failure. + let (cbor_legit, _) = make_payload_cbor(10); + let cbor_tampered = Bytes::from_static(b"this is not the real CBOR payload"); + + let ipns = MockServer::start().await; + let chain_rpc = MockServer::start().await; + let chain_gw = MockServer::start().await; + + // IPNS gateway serves nothing useful → resolver must use chain. + Mock::given(method("GET")) + .respond_with(ResponseTemplate::new(404)) + .mount(&ipns) + .await; + + // Chain RPC says "real CID is X with seq=10". + let real_cid = synthesize_cid_from_bytes(&cbor_legit); + let real_digest = real_cid.hash().digest(); + let mut raw = vec![0u8; 96]; + raw[0..32].copy_from_slice(real_digest); + raw[32 + 24..32 + 32].copy_from_slice(&10u64.to_be_bytes()); + Mock::given(method("POST")) + .respond_with( + ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "jsonrpc": "2.0", + "id": 1, + "result": format!("0x{}", hex::encode(&raw)), + })), + ) + .mount(&chain_rpc) + .await; + + // Gateway serves DIFFERENT bytes — verify_cid_against_bytes + // must reject. 
+ Mock::given(method("GET")) + .and(path(format!("/ipfs/{}", real_cid))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(cbor_tampered.as_ref())) + .mount(&chain_gw) + .await; + + let mut cfg = ResolverConfig::new( + chain_rpc.uri(), + fixture_address(), + fixture_ipns_name(), + ); + cfg.ipns_gateways = vec![format!("{}/ipns/{{name}}", ipns.uri())]; + cfg.ipfs_gateways = vec![format!("{}/ipfs/{{cid}}", chain_gw.uri())]; + cfg.ipns_race_timeout = Duration::from_secs(2); + cfg.per_request_timeout = Duration::from_secs(2); + + let resolver = UsersIndexResolver::new(cfg).expect("new"); + let err = resolver.resolve().await.expect_err("verify fails"); + assert!(matches!(err, ClientError::UsersIndexResolutionFailed { .. })); + } + + #[tokio::test] + async fn resolve_chain_path_rejects_in_cbor_seq_mismatch() { + // Chain says seq=10 but the bytes-fetched payload has seq=11. + // Defensive: resolver must surface this as a tamper / RPC- + // inconsistency anomaly, NOT silently use either side. + let (cbor_seq_11, _) = make_payload_cbor(11); + + let ipns = MockServer::start().await; + let chain_rpc = MockServer::start().await; + let chain_gw = MockServer::start().await; + + Mock::given(method("GET")) + .respond_with(ResponseTemplate::new(404)) + .mount(&ipns) + .await; + + // Chain says seq=10, digest of cbor_seq_11. 
+ let cid = synthesize_cid_from_bytes(&cbor_seq_11); + let mut raw = vec![0u8; 96]; + raw[0..32].copy_from_slice(cid.hash().digest()); + raw[32 + 24..32 + 32].copy_from_slice(&10u64.to_be_bytes()); + Mock::given(method("POST")) + .respond_with( + ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "jsonrpc": "2.0", + "id": 1, + "result": format!("0x{}", hex::encode(&raw)), + })), + ) + .mount(&chain_rpc) + .await; + + Mock::given(method("GET")) + .and(path(format!("/ipfs/{}", cid))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(cbor_seq_11.as_ref())) + .mount(&chain_gw) + .await; + + let mut cfg = ResolverConfig::new( + chain_rpc.uri(), + fixture_address(), + fixture_ipns_name(), + ); + cfg.ipns_gateways = vec![format!("{}/ipns/{{name}}", ipns.uri())]; + cfg.ipfs_gateways = vec![format!("{}/ipfs/{{cid}}", chain_gw.uri())]; + cfg.ipns_race_timeout = Duration::from_secs(2); + cfg.per_request_timeout = Duration::from_secs(2); + + let resolver = UsersIndexResolver::new(cfg).expect("new"); + let err = resolver.resolve().await.expect_err("seq mismatch"); + let msg = format!("{}", err); + assert!( + msg.contains("sequence") + || msg.contains("anomaly") + || matches!(err, ClientError::UsersIndexResolutionFailed { .. }), + "expected sequence-mismatch error, got: {}", + msg + ); + } + + #[tokio::test] + async fn replay_defense_rejects_chain_regression() { + // Floor is 100; chain returns seq=50. Resolver MUST reject + // even though the bytes verify and parse correctly. This + // is the chain-side replay-defense path. 
+ let (cbor, _) = make_payload_cbor(50); + let cid = synthesize_cid_from_bytes(&cbor); + + let ipns = MockServer::start().await; + let chain_rpc = MockServer::start().await; + let chain_gw = MockServer::start().await; + + Mock::given(method("GET")) + .respond_with(ResponseTemplate::new(404)) + .mount(&ipns) + .await; + + let mut raw = vec![0u8; 96]; + raw[0..32].copy_from_slice(cid.hash().digest()); + raw[32 + 24..32 + 32].copy_from_slice(&50u64.to_be_bytes()); + Mock::given(method("POST")) + .respond_with( + ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "jsonrpc": "2.0", + "id": 1, + "result": format!("0x{}", hex::encode(&raw)), + })), + ) + .mount(&chain_rpc) + .await; + + Mock::given(method("GET")) + .and(path(format!("/ipfs/{}", cid))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(cbor.as_ref())) + .mount(&chain_gw) + .await; + + let mut cfg = ResolverConfig::new( + chain_rpc.uri(), + fixture_address(), + fixture_ipns_name(), + ); + cfg.ipns_gateways = vec![format!("{}/ipns/{{name}}", ipns.uri())]; + cfg.ipfs_gateways = vec![format!("{}/ipfs/{{cid}}", chain_gw.uri())]; + cfg.ipns_race_timeout = Duration::from_secs(2); + cfg.per_request_timeout = Duration::from_secs(2); + + let resolver = UsersIndexResolver::new(cfg).expect("new"); + resolver.set_highest_seen_sequence(100); + let err = resolver.resolve().await.expect_err("regression rejected"); + // Either UsersIndexResolutionFailed (wrapper) or + // SequenceRegression directly is acceptable; both signal + // "do not accept" to the caller. + match err { + ClientError::SequenceRegression { observed, highest_seen, channel } => { + assert_eq!(observed, 50); + assert_eq!(highest_seen, 100); + assert!(!channel.is_empty(), "channel label should be set"); + } + ClientError::UsersIndexResolutionFailed { .. 
} => { /* also fine */ } + other => panic!("unexpected error: {:?}", other), + } + } + + /// `derive_user_key_from_email` MUST produce a 32-hex-char output + /// matching what `fula-cli/src/state.rs::hash_user_id` would + /// produce against the same `userId` (= sha256-hex of + /// lower(email)). Reproduces the master algorithm step-by-step + /// here so the two stay in lockstep — without this test, a + /// future master-side refactor could silently desync the SDK + /// from the published global users-index keys. + #[test] + fn derive_user_key_matches_master_state_rs_algorithm() { + use sha2::{Digest, Sha256}; + + // Reference inputs. + let email = "User@Example.COM"; + let email_lower = "user@example.com"; + + // SDK derives directly from email. + let sdk_key = derive_user_key_from_email(email); + + // Reproduce master's chain: lower(email) → sha256 → hex → blake3 → first 16 bytes hex. + let user_id_digest = Sha256::digest(email_lower.as_bytes()); + let user_id_hex = hex::encode(user_id_digest); + // master state.rs: hash_user_id(user_id_str) = + // blake3::Hasher::new() + // .update(b"fula:user_id:") + // .update(user_id_str.as_bytes()) + // .finalize()[..16] hex + let mut hasher = blake3::Hasher::new(); + hasher.update(b"fula:user_id:"); + hasher.update(user_id_hex.as_bytes()); + let master_key = hex::encode(&hasher.finalize().as_bytes()[..16]); + + assert_eq!( + sdk_key, master_key, + "SDK derive_user_key_from_email diverged from master state.rs::hash_user_id; \ + email={}, sdk={}, master={}", + email, sdk_key, master_key + ); + assert_eq!(sdk_key.len(), 32, "userKey must be 32 hex chars (16 bytes)"); + } + + #[test] + fn derive_user_key_normalizes_email_case() { + // Email is case-insensitive (per RFC 5321 local-part is, in practice, + // a courtesy and master normalizes too). Same email different case + // MUST yield the same userKey, otherwise users would lose access + // when their app capitalizes differently than master. 
+ let a = derive_user_key_from_email("alice@example.com"); + let b = derive_user_key_from_email("ALICE@EXAMPLE.COM"); + let c = derive_user_key_from_email("Alice@Example.com"); + assert_eq!(a, b); + assert_eq!(a, c); + } + + #[test] + fn derive_user_key_distinguishes_different_users() { + let a = derive_user_key_from_email("alice@example.com"); + let b = derive_user_key_from_email("bob@example.com"); + assert_ne!(a, b); + } + + // ============================================================ + // Phase 3.3.5 — hot-start cache reuse tests (advisor-mandated 4) + // ============================================================ + // + // Each test constructs both a network-mock universe (wiremock) + // and a real on-disk BlockCache (TempDir + redb). The cache + // survives across resolver constructions (simulating SDK + // restart) so we can verify replay-defense persistence and the + // soft-TTL short-circuit behavior. + + use crate::block_cache::BlockCache; + use std::path::PathBuf; + use tempfile::TempDir; + + fn make_payload_with_seq(sequence: u64) -> (Bytes, GlobalUsersIndex) { + let payload = GlobalUsersIndex { + v: 1, + sequence, + updated_at_unix: 1_700_000_000, + users: BTreeMap::new(), + }; + let bytes = serde_ipld_dagcbor::to_vec(&payload).expect("encode"); + (Bytes::from(bytes), payload) + } + + fn fixture_resolver_config_with_ipns(ipns_url: &str) -> ResolverConfig { + let mut cfg = ResolverConfig::new( + "http://chain-rpc.unused/", // never called on hot-start path + fixture_address(), + fixture_ipns_name(), + ); + cfg.ipns_gateways = vec![format!("{}/ipns/{{name}}", ipns_url)]; + cfg.ipns_race_timeout = Duration::from_secs(2); + cfg.per_request_timeout = Duration::from_secs(2); + cfg.soft_ttl = Duration::from_secs(60); + cfg + } + + /// Test 1 — replay-defense floor survives SDK restart. + /// Round-trip through the cache: resolve seq=42 → drop resolver + /// → reopen against same cache → highest_seen_sequence == 42. 
+ #[tokio::test] + async fn hot_start_seeds_floor_across_restart() { + let dir = TempDir::new().unwrap(); + let cache_path: PathBuf = dir.path().join("cache.redb"); + + // Open cache, manually plant a (cid, seq) row — simulates + // a prior successful resolve. (Avoids the full wiremock + // setup since this test is about restart semantics, not + // resolve mechanics.) + { + let cache = BlockCache::open(&cache_path, 1024 * 1024).expect("open"); + let cid = synthesize_cid_from_bytes(b"some payload"); + cache + .store_users_index_state(&cid, 42, 1_700_000_000) + .expect("store"); + } // cache dropped → file lock released + + // Re-open cache + construct resolver via new_with_cache. + let cache = Arc::new(BlockCache::open(&cache_path, 1024 * 1024).expect("re-open")); + let cfg = ResolverConfig::new( + "http://rpc.unused/", + fixture_address(), + fixture_ipns_name(), + ); + let resolver = UsersIndexResolver::new_with_cache(cfg, cache).expect("new_with_cache"); + + assert_eq!( + resolver.highest_seen_sequence(), + 42, + "replay-defense floor MUST survive restart and seed from persisted state" + ); + } + + /// Test 2 — replay regression after restart is rejected. + /// Restart with floor=99; IPNS returns seq=50; resolver MUST + /// reject (not silently serve the stale payload). + #[tokio::test] + async fn hot_start_rejects_regression_after_restart() { + let dir = TempDir::new().unwrap(); + let cache_path: PathBuf = dir.path().join("cache.redb"); + + // Plant a high floor (seq=99). + { + let cache = BlockCache::open(&cache_path, 1024 * 1024).expect("open"); + let placeholder = synthesize_cid_from_bytes(b"placeholder"); + cache + .store_users_index_state(&placeholder, 99, 0) + .expect("plant"); + } + + // wiremock IPNS serves seq=50 (regression). 
+ let ipns = MockServer::start().await; + let (regress_bytes, _) = make_payload_with_seq(50); + Mock::given(method("GET")) + .and(path(format!("/ipns/{}", fixture_ipns_name()))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(regress_bytes.as_ref())) + .mount(&ipns) + .await; + + let cache = Arc::new(BlockCache::open(&cache_path, 1024 * 1024).expect("re-open")); + // observed_at = 0 → way past TTL → hot-start short-circuit + // should NOT fire; resolver falls through to network. + let mut cfg = fixture_resolver_config_with_ipns(&ipns.uri()); + cfg.soft_ttl = Duration::from_secs(60); // bigger than 0-vs-now gap doesn't matter; observed_at=0 + let resolver = UsersIndexResolver::new_with_cache(cfg, cache).expect("new_with_cache"); + + assert_eq!(resolver.highest_seen_sequence(), 99, "floor seeded"); + + // resolve() → IPNS returns seq=50; replay-defense rejects. + // Falls through to chain (also fails since RPC URL is + // unused). Final error: UsersIndexResolutionFailed wrapping + // the IPNS exhaustion (the resolver internally rejected the + // regression and treated it as "IPNS failed"). + let err = resolver.resolve().await.expect_err("must reject"); + // The regression is observed inside try_ipns and surfaces + // as UsersIndexResolutionFailed — the chain leg also can't + // help (RPC URL unused), so the wrapper combines them. + assert!( + matches!(err, ClientError::UsersIndexResolutionFailed { .. }), + "expected resolution failure, got: {:?}", + err + ); + + // Floor unchanged — 99 still holds. + assert_eq!( + resolver.highest_seen_sequence(), + 99, + "regression payload must NOT advance the floor" + ); + } + + /// Test 3 — hot-start within TTL serves cached state without + /// touching the network. Uses `Mock::expect(1)` on the IPNS + /// mock: the second resolve() call MUST hit the cache. If the + /// short-circuit is broken, IPNS would be called twice, and + /// wiremock would panic in its `Drop` impl on test exit. 
+ #[tokio::test] + async fn hot_start_within_ttl_skips_network() { + let dir = TempDir::new().unwrap(); + let cache_path: PathBuf = dir.path().join("cache.redb"); + let cache = Arc::new(BlockCache::open(&cache_path, 1024 * 1024).expect("open")); + + let ipns = MockServer::start().await; + let (cbor, _) = make_payload_with_seq(7); + Mock::given(method("GET")) + .and(path(format!("/ipns/{}", fixture_ipns_name()))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(cbor.as_ref())) + .expect(1) // IPNS hit at most ONCE; second resolve MUST be cached + .mount(&ipns) + .await; + + let cfg = fixture_resolver_config_with_ipns(&ipns.uri()); + let resolver = + UsersIndexResolver::new_with_cache(cfg, Arc::clone(&cache)).expect("new_with_cache"); + + // First resolve: hits IPNS, populates cache. `persist_to_cache` + // is synchronous-from-async (no spawned background task), so + // when `resolve` returns the METADATA + BLOCKS rows are + // already on disk. The second resolve will see them. + let r1 = resolver.resolve().await.expect("first resolve"); + assert_eq!(r1.source, ResolutionSource::Ipns); + assert_eq!(r1.payload.sequence, 7); + + // Second resolve: should be served from cache. wiremock + // panics on Drop if IPNS was called more than `expect(1)`. + let r2 = resolver.resolve().await.expect("second resolve"); + assert_eq!( + r2.source, + ResolutionSource::HotStartCache, + "second resolve must be served from hot-start cache (not the network)" + ); + assert_eq!(r2.payload.sequence, 7); + } + + /// Test 4 — hot-start beyond TTL re-resolves. Configure a + /// 1-second `soft_ttl`; resolve once; sleep 2 seconds; resolve + /// again. The second resolve MUST re-hit IPNS (so the mock is + /// expected to fire twice). 
+ #[tokio::test] + async fn hot_start_beyond_ttl_re_resolves() { + let dir = TempDir::new().unwrap(); + let cache_path: PathBuf = dir.path().join("cache.redb"); + let cache = Arc::new(BlockCache::open(&cache_path, 1024 * 1024).expect("open")); + + let ipns = MockServer::start().await; + let (cbor, _) = make_payload_with_seq(11); + Mock::given(method("GET")) + .and(path(format!("/ipns/{}", fixture_ipns_name()))) + .respond_with(ResponseTemplate::new(200).set_body_bytes(cbor.as_ref())) + .expect(2) // both resolves must hit IPNS — TTL elapsed between them + .mount(&ipns) + .await; + + let mut cfg = fixture_resolver_config_with_ipns(&ipns.uri()); + cfg.soft_ttl = Duration::from_secs(1); // tight TTL for the test + let resolver = + UsersIndexResolver::new_with_cache(cfg, Arc::clone(&cache)).expect("new_with_cache"); + + let r1 = resolver.resolve().await.expect("first resolve"); + assert_eq!(r1.source, ResolutionSource::Ipns); + + // Wait past the TTL. + tokio::time::sleep(Duration::from_millis(1500)).await; + + let r2 = resolver.resolve().await.expect("second resolve"); + assert_eq!( + r2.source, + ResolutionSource::Ipns, + "after TTL elapse, resolver must re-fetch from IPNS rather than serve stale cache" + ); + } + + // ============================================================ + // Pre-existing tests below (Phase 3.3 sub-step A) + // ============================================================ + + #[test] + fn highest_seen_sequence_is_monotonic() { + let cfg = ResolverConfig::new( + "https://rpc.example", + fixture_address(), + fixture_ipns_name(), + ); + let resolver = UsersIndexResolver::new(cfg).expect("new"); + assert_eq!(resolver.highest_seen_sequence(), 0); + resolver.bump_seen_sequence(5); + assert_eq!(resolver.highest_seen_sequence(), 5); + // Lower value MUST NOT lower the floor. + resolver.bump_seen_sequence(3); + assert_eq!(resolver.highest_seen_sequence(), 5); + // Equal value is also a no-op. 
+ resolver.bump_seen_sequence(5); + assert_eq!(resolver.highest_seen_sequence(), 5); + // Higher value advances. + resolver.bump_seen_sequence(7); + assert_eq!(resolver.highest_seen_sequence(), 7); + } +} diff --git a/crates/fula-client/src/types.rs b/crates/fula-client/src/types.rs index 58319b0..063aadb 100644 --- a/crates/fula-client/src/types.rs +++ b/crates/fula-client/src/types.rs @@ -115,6 +115,82 @@ pub struct GetObjectResult { pub metadata: std::collections::HashMap, } +/// Phase 19 — origin of a successfully-served byte payload. +/// +/// Apps that surface offline indicators inspect this field to decide +/// what to show: `Master` is the fast path, `LocalCache` is a redb +/// BLOCKS hit (no network), and `Gateway(url)` records which IPFS +/// gateway the gateway-race elected. Defaulting to `Master` keeps +/// pre-Phase-19 callers byte-identical (they ignore the field). +#[derive(Clone, Debug, PartialEq, Eq)] +pub enum ReadSource { + /// Master S3 served the request directly. + Master, + /// On-disk redb BLOCKS table served the bytes — no network round-trip. + LocalCache, + /// Public IPFS gateway served the bytes (master-down fallback path). + /// The string is the URL template (e.g. `https://ipfs.io/ipfs/{cid}`) + /// used at fetch time, useful for diagnostics or "served by Cloudflare" + /// surfacing in operator dashboards. + Gateway(String), +} + +/// Phase 19 — freshness signal for a successfully-served byte payload. +/// +/// `Live` is the master-served fast path. `Cached { observed_at }` is +/// returned when bytes came from local redb (BLOCKS hit) — apps may +/// choose to surface "viewing a saved copy" UI based on age. The +/// `StaleByDesign` / `StaleByOutage` variants are reserved for Phase +/// 3.3 cold-start where the SDK can attribute snapshot age to the +/// publisher cadence vs. an actual master outage; today the master-down +/// fallback path emits `Cached`. 
+#[derive(Clone, Debug, PartialEq, Eq)] +pub enum ReadFreshness { + /// Master-served bytes (fresh). + Live, + /// Served from on-disk redb cache; `observed_at` is the unix-millis + /// when the entry was first written. Apps display age relative to + /// this if they care to show staleness. + Cached { observed_at: u64 }, + /// Cold-start cross-device read; snapshot age within the configured + /// publisher cadence (≤ `USERS_INDEX_FLUSH_INTERVAL`). Apps may + /// surface "synced N min ago". + /// + /// **Phase 3.3 scaffolding — not emitted by Phase 19.** Wired in + /// when the cold-start resolver lands (task #18); resolver computes + /// `snapshot_age_secs = now - resolved.payload.updated_at_unix` + /// and selects this vs. `StaleByOutage` based on whether age is + /// inside the publisher cadence. + StaleByDesign { snapshot_age_secs: u64 }, + /// Cold-start cross-device read; snapshot age exceeds the + /// publisher cadence — likely indicates an actual master outage. + /// + /// **Phase 3.3 scaffolding — not emitted by Phase 19.** See + /// `StaleByDesign` doc above. + StaleByOutage { snapshot_age_secs: u64 }, +} + +/// Phase 19 — wrapper around `GetObjectResult` carrying transparency +/// fields (`source`, `freshness`). +/// +/// **Why a wrapper instead of fields on `GetObjectResult`:** the +/// existing struct is part of the SDK's public API consumed by callers +/// that pattern-match it exhaustively. Adding fields breaks them. A +/// new wrapper type lets callers opt in to the transparency surface +/// while existing consumers (including encrypted-SDK internals that +/// read `.data` / `.etag`) keep using `GetObjectResult` unchanged. +#[derive(Clone, Debug)] +pub struct OfflineGetResult { + /// The underlying `GetObjectResult` — `data`, `etag`, etc., are on + /// `inner`. Callers that don't care about transparency just read + /// `result.inner.data`. + pub inner: GetObjectResult, + /// Where the bytes ultimately came from. See `ReadSource` for variants. 
+ pub source: ReadSource, + /// How fresh the bytes are. See `ReadFreshness` for variants. + pub freshness: ReadFreshness, +} + /// Head object result #[derive(Clone, Debug)] pub struct HeadObjectResult { diff --git a/crates/fula-flutter/Cargo.toml b/crates/fula-flutter/Cargo.toml index fdaf48e..55b4a72 100644 --- a/crates/fula-flutter/Cargo.toml +++ b/crates/fula-flutter/Cargo.toml @@ -57,6 +57,11 @@ wasm-bindgen-test = "0.3" # time APIs aren't re-introduced into the migration path. web-time = "1" +# Native-only dev-deps (Phase 2.x config plumbing tests construct +# tempdirs to verify block_cache_path round-trips through the bridge). +[target.'cfg(not(target_arch = "wasm32"))'.dev-dependencies] +tempfile = { workspace = true } + [features] default = [] diff --git a/crates/fula-flutter/src/api/client.rs b/crates/fula-flutter/src/api/client.rs index 8154543..84786b0 100644 --- a/crates/fula-flutter/src/api/client.rs +++ b/crates/fula-flutter/src/api/client.rs @@ -15,24 +15,55 @@ use async_lock::RwLock; use crate::api::types::*; +/// Build the underlying `fula_client::Config` from the Dart-facing +/// `FulaConfig`, plumbing every Phase 1.2 / 2.x field through. Used by +/// `create_client`, `create_encrypted_client`, and +/// `create_encrypted_client_with_pinning` to keep the three constructors +/// in lockstep — adding a new field to FulaConfig only requires a +/// change here. +fn build_inner_config(config: &FulaConfig) -> fula_client::Config { + let mut inner = fula_client::Config::new(&config.endpoint) + .with_timeout(Duration::from_secs(config.timeout_seconds)); + + // Existing F8/F10 fields. + inner.per_chunk_download_timeout = + Duration::from_secs(config.per_chunk_download_timeout_seconds); + inner.buffered_download_max_bytes = config.buffered_download_max_bytes; + + // Phase 2.1 — health gate. + inner.health_gate_enabled = config.health_gate_enabled; + inner.health_gate_ttl = Duration::from_secs(config.health_gate_ttl_seconds); + + // Phase 2.2 — block cache. 
The path-string conversion treats + // empty string as `None` so the SDK's `dirs`-based platform + // default kicks in. + inner.block_cache_enabled = config.block_cache_enabled; + inner.block_cache_path = if config.block_cache_path.is_empty() { + None + } else { + Some(std::path::PathBuf::from(&config.block_cache_path)) + }; + inner.block_cache_max_bytes = config.block_cache_max_bytes; + + // Phase 2.3 / 2.4 — gateway race + offline fallback. + inner.gateway_fallback_enabled = config.gateway_fallback_enabled; + inner.gateway_fallback_urls = config.gateway_fallback_urls.clone(); + inner.gateway_race_concurrency = config.gateway_race_concurrency as usize; + + if let Some(token) = &config.access_token { + inner = inner.with_token(token.clone()); + } + + inner +} + // ============================================================================ // Client Creation // ============================================================================ /// Create a new Fula client with the given configuration pub fn create_client(config: FulaConfig) -> anyhow::Result { - let mut inner_config = fula_client::Config::new(&config.endpoint) - .with_timeout(Duration::from_secs(config.timeout_seconds)); - inner_config.per_chunk_download_timeout = - Duration::from_secs(config.per_chunk_download_timeout_seconds); - inner_config.buffered_download_max_bytes = config.buffered_download_max_bytes; - - let inner_config = if let Some(token) = config.access_token { - inner_config.with_token(token) - } else { - inner_config - }; - + let inner_config = build_inner_config(&config); let client = fula_client::FulaClient::new(inner_config)?; Ok(FulaClientHandle { @@ -45,17 +76,7 @@ pub fn create_encrypted_client( config: FulaConfig, encryption: EncryptionConfig, ) -> anyhow::Result { - let mut inner_config = fula_client::Config::new(&config.endpoint) - .with_timeout(Duration::from_secs(config.timeout_seconds)); - inner_config.per_chunk_download_timeout = - 
Duration::from_secs(config.per_chunk_download_timeout_seconds); - inner_config.buffered_download_max_bytes = config.buffered_download_max_bytes; - - let inner_config = if let Some(token) = config.access_token { - inner_config.with_token(token) - } else { - inner_config - }; + let inner_config = build_inner_config(&config); // Create encryption config let enc_config = if let Some(secret_key) = encryption.secret_key { @@ -101,17 +122,7 @@ pub fn create_encrypted_client_with_pinning( encryption: EncryptionConfig, pinning: PinningConfig, ) -> anyhow::Result { - let mut inner_config = fula_client::Config::new(&config.endpoint) - .with_timeout(Duration::from_secs(config.timeout_seconds)); - inner_config.per_chunk_download_timeout = - Duration::from_secs(config.per_chunk_download_timeout_seconds); - inner_config.buffered_download_max_bytes = config.buffered_download_max_bytes; - - let inner_config = if let Some(token) = config.access_token { - inner_config.with_token(token) - } else { - inner_config - }; + let inner_config = build_inner_config(&config); // Create encryption config let enc_config = if let Some(secret_key) = encryption.secret_key { @@ -304,6 +315,7 @@ mod tests { max_retries: 3, per_chunk_download_timeout_seconds: 120, buffered_download_max_bytes: 64 * 1024 * 1024, + ..FulaConfig::default() }; let handle = create_client(cfg).expect("create_client should succeed"); let inner_cfg = handle.inner.config(); @@ -333,4 +345,94 @@ mod tests { 256 * 1024 * 1024, ); } + + /// Phase 2.x — verify all new fields plumb from FulaConfig + /// (Dart-facing) through `build_inner_config` into the underlying + /// `fula_client::Config`. Without this test, a future refactor of + /// `build_inner_config` could silently drop a field and Dart apps + /// would observe Phase 2.x as inert (config flag set, runtime + /// flag still false). 
+ #[test] + fn fula_config_plumbs_phase_2_x_health_gate_fields() { + let cfg = FulaConfig { + health_gate_enabled: true, + health_gate_ttl_seconds: 45, + ..FulaConfig::default() + }; + let handle = create_client(cfg).expect("create_client"); + let inner = handle.inner.config(); + assert!(inner.health_gate_enabled, "health_gate_enabled must plumb"); + assert_eq!(inner.health_gate_ttl, Duration::from_secs(45)); + } + + #[test] + fn fula_config_plumbs_phase_2_x_block_cache_fields() { + // Use a path that won't actually open (we only assert the + // config plumbs; the cache opens lazily on first use). + let temp = tempfile::tempdir().expect("tempdir"); + let cache_path = temp.path().join("cache.redb"); + + let cfg = FulaConfig { + block_cache_enabled: true, + block_cache_path: cache_path.to_string_lossy().into_owned(), + block_cache_max_bytes: 64 * 1024 * 1024, + ..FulaConfig::default() + }; + let handle = create_client(cfg).expect("create_client"); + let inner = handle.inner.config(); + assert!(inner.block_cache_enabled); + assert_eq!(inner.block_cache_path, Some(cache_path)); + assert_eq!(inner.block_cache_max_bytes, 64 * 1024 * 1024); + } + + #[test] + fn fula_config_empty_block_cache_path_means_use_platform_default() { + // The Dart-facing field is `String` (FFI doesn't carry + // `Option`); empty string is the documented "use default" form. + // The bridge must translate to `None` so the SDK's `dirs`-based + // default kicks in. 
+ let cfg = FulaConfig { + block_cache_enabled: true, + block_cache_path: String::new(), + ..FulaConfig::default() + }; + let handle = create_client(cfg).expect("create_client"); + let inner = handle.inner.config(); + assert_eq!(inner.block_cache_path, None, + "empty block_cache_path string must translate to None so the SDK uses the platform default"); + } + + #[test] + fn fula_config_plumbs_phase_2_x_gateway_fields() { + let cfg = FulaConfig { + gateway_fallback_enabled: true, + gateway_fallback_urls: vec![ + "https://custom1.example/ipfs/{cid}".into(), + "https://custom2.example/ipfs/{cid}".into(), + ], + gateway_race_concurrency: 5, + ..FulaConfig::default() + }; + let handle = create_client(cfg).expect("create_client"); + let inner = handle.inner.config(); + assert!(inner.gateway_fallback_enabled); + assert_eq!(inner.gateway_fallback_urls.len(), 2); + assert_eq!(inner.gateway_fallback_urls[0], "https://custom1.example/ipfs/{cid}"); + assert_eq!(inner.gateway_race_concurrency, 5); + } + + #[test] + fn fula_config_default_phase_2_x_fields_are_off() { + // Backward-compat invariant: default-constructed Dart config + // produces a default-constructed Rust config. Apps that don't + // touch the new fields see byte-identical pre-Phase-2.x behavior. 
+ let cfg = FulaConfig::default(); + let handle = create_client(cfg).expect("create_client"); + let inner = handle.inner.config(); + assert!(!inner.health_gate_enabled); + assert!(!inner.block_cache_enabled); + assert!(!inner.gateway_fallback_enabled); + assert_eq!(inner.gateway_fallback_urls.len(), 0); + assert_eq!(inner.gateway_race_concurrency, 3); + } } diff --git a/crates/fula-flutter/src/api/error.rs b/crates/fula-flutter/src/api/error.rs index d082dae..bf1ae2e 100644 --- a/crates/fula-flutter/src/api/error.rs +++ b/crates/fula-flutter/src/api/error.rs @@ -69,6 +69,42 @@ pub enum FulaError { #[error("Forest error: {0}")] ForestError(String), + /// Phase 2.2 of master-independent reads: a single block exceeds + /// the configured `block_cache_max_bytes` budget. Surface to the + /// user with guidance to raise the cache size or skip the cache. + /// Native-only signal in practice (BlockCache is compiled out on + /// wasm32) but defined unconditionally so the Dart binding always + /// has the same enum shape across Android, iOS, Ubuntu, Windows, + /// and web (flutter-js + wasm). + #[error("Cache budget exceeded: size={size}, budget={budget}")] + CacheBudgetExceeded { size: u64, budget: u64 }, + + /// Phase 2.2 of master-independent reads: catch-all for the + /// persistent block cache's I/O / storage / commit errors. + /// Stringified at the FFI boundary; Dart code doesn't depend on + /// any Rust storage-engine specifics. Native-only in practice. + #[error("Cache error: {0}")] + CacheError(String), + + /// Phase 3.3 — cold-start hybrid resolver could not resolve the + /// master-published global users-index CID via IPNS or chain. + /// Surface to Dart apps as "offline mode unavailable for this + /// device until master is reachable again" — distinct from + /// `Network` (which is a transient master-side glitch). 
+ #[error("Users-index resolution failed: {0}")] + UsersIndexResolutionFailed(String), + + /// Phase 3.3 — replay defense: a payload's embedded sequence + /// regressed below what the SDK has seen before. Dart apps + /// should NOT silently retry; surface as a clear "stale-state" + /// signal (possibly with a retry-after-N-minutes hint). + #[error("Sequence regression in {channel}: observed={observed}, highest seen={highest_seen}")] + SequenceRegression { + observed: u64, + highest_seen: u64, + channel: String, + }, + /// Internal error #[error("Internal error: {0}")] Internal(String), @@ -131,6 +167,22 @@ impl From for FulaError { ClientError::MasterUnreachable { down_for_secs } => FulaError::Network( format!("master unreachable (health gate; down for ~{}s)", down_for_secs), ), + // Phase 2.2 — block cache surface. Map to first-class + // FulaError variants so Dart code can pattern-match without + // string parsing. Identical shape on every target (native + + // wasm) so flutter-js / web builds compile against the same + // enum. + ClientError::BlockTooLarge { size, budget } => { + FulaError::CacheBudgetExceeded { size, budget } + } + ClientError::BlockCache(msg) => FulaError::CacheError(msg), + // Phase 3.3 cold-start hybrid resolver. + ClientError::UsersIndexResolutionFailed { reason } => { + FulaError::UsersIndexResolutionFailed(reason) + } + ClientError::SequenceRegression { observed, highest_seen, channel } => { + FulaError::SequenceRegression { observed, highest_seen, channel } + } } } } @@ -194,7 +246,29 @@ impl FulaError { FulaError::ShareError(_) => "SHARE_ERROR", FulaError::RotationError(_) => "ROTATION_ERROR", FulaError::ForestError(_) => "FOREST_ERROR", + FulaError::CacheBudgetExceeded { .. } => "CACHE_BUDGET_EXCEEDED", + FulaError::CacheError(_) => "CACHE_ERROR", + FulaError::UsersIndexResolutionFailed(_) => "USERS_INDEX_RESOLUTION_FAILED", + FulaError::SequenceRegression { .. 
} => "SEQUENCE_REGRESSION", FulaError::Internal(_) => "INTERNAL", } } + + /// Phase 2.2 helper: detect block-cache-related errors so app code + /// can offer a "retry without cache" or "raise budget" prompt + /// without string-parsing the underlying message. + pub fn is_cache_error(&self) -> bool { + matches!(self, FulaError::CacheBudgetExceeded { .. } | FulaError::CacheError(_)) + } + + /// Phase 3.3 helper: detect cold-start resolution errors. Apps + /// should surface this as "offline mode unavailable" instead of + /// a generic "download failed" — the file is fine; we just can't + /// learn its CID without master. + pub fn is_users_index_error(&self) -> bool { + matches!( + self, + FulaError::UsersIndexResolutionFailed(_) | FulaError::SequenceRegression { .. } + ) + } } diff --git a/crates/fula-flutter/src/api/types.rs b/crates/fula-flutter/src/api/types.rs index 5567b30..01c7810 100644 --- a/crates/fula-flutter/src/api/types.rs +++ b/crates/fula-flutter/src/api/types.rs @@ -45,6 +45,67 @@ pub struct FulaConfig { /// buffered path returns an error instead of allocating the buffer. /// Default: 256 MiB. pub buffered_download_max_bytes: u64, + + // ============================================================ + // Phase 2.1 — master-down detection (health gate) + // ============================================================ + /// Enable the SDK's master health gate. Off by default + /// (backward-compat). When on, the SDK observes request outcomes + /// and short-circuits with `Network`/`MasterUnreachable` error + /// after two consecutive failures, instead of paying the per-read + /// timeout. Works on every platform fula-flutter ships against. + pub health_gate_enabled: bool, + + /// TTL of the `Down` state when `health_gate_enabled = true`. + /// After this duration elapses, the next request is allowed + /// through as a probe. Default: 30 seconds. 
+ pub health_gate_ttl_seconds: u64, + + // ============================================================ + // Phase 2.2 — persistent block cache + // ============================================================ + /// Enable the on-disk LRU block cache. + /// + /// **Native-only.** The cache is `redb`-backed and not available + /// in browser-targeted builds. Setting `true` on a wasm32 target + /// is silently inert — the underlying SDK skips construction and + /// the offline path stays unavailable in the browser. On + /// Android/iOS/Ubuntu/Windows the field activates Phase 2.2. + pub block_cache_enabled: bool, + + /// Filesystem path for the block-cache redb database. Empty + /// string = use the platform default (`dirs::data_local_dir()/ + /// fula/cache/blocks.redb`). Native-only; ignored on wasm32. + pub block_cache_path: String, + + /// Maximum on-disk bytes for the block cache. Default: 256 MiB. + /// The cache evicts to 80 % of this watermark when a `put` would + /// push it past `max_bytes`. Native-only; ignored on wasm32. + pub block_cache_max_bytes: u64, + + // ============================================================ + // Phase 2.3 / 2.4 — IPFS gateway race + offline GET fallback + // ============================================================ + /// Enable falling back to public IPFS gateways when master is + /// unreachable AND the SDK has previously cached the requested + /// object's CID via Phase 2.2's KEY_TO_CID table. + /// + /// Requires `block_cache_enabled = true` (the cache holds the + /// `(bucket, key) → cid` map the gateway race needs). Native-only; + /// ignored on wasm32. + pub gateway_fallback_enabled: bool, + + /// Custom gateway URL templates. Each must contain the literal + /// `{cid}` token, which the SDK substitutes per fetch. Empty Vec + /// means "use the SDK-shipped default list of six gateways" + /// (Cloudflare, dweb.link, ipfs.io, trustless-gateway.link, + /// 4everland.io, gateway.pinata.cloud). Native-only. 
+ pub gateway_fallback_urls: Vec, + + /// Number of gateways the SDK races in parallel for any single + /// CID. Default: 3. Capped at the gateway-pool length. + /// Native-only. + pub gateway_race_concurrency: u32, } impl Default for FulaConfig { @@ -56,6 +117,17 @@ impl Default for FulaConfig { max_retries: 3, per_chunk_download_timeout_seconds: 300, buffered_download_max_bytes: 256 * 1024 * 1024, + // Phase 2.x — all flags off by default (backward-compat). + // Apps must opt in explicitly; existing Dart code sees + // byte-identical behavior to pre-Phase-2.x builds. + health_gate_enabled: false, + health_gate_ttl_seconds: 30, + block_cache_enabled: false, + block_cache_path: String::new(), + block_cache_max_bytes: 256 * 1024 * 1024, + gateway_fallback_enabled: false, + gateway_fallback_urls: Vec::new(), + gateway_race_concurrency: 3, } } } diff --git a/crates/fula-flutter/src/frb_generated.rs b/crates/fula-flutter/src/frb_generated.rs index 1102ca3..b1b7398 100644 --- a/crates/fula-flutter/src/frb_generated.rs +++ b/crates/fula-flutter/src/frb_generated.rs @@ -4194,6 +4194,14 @@ impl SseDecode for crate::api::types::FulaConfig { let mut var_maxRetries = ::sse_decode(deserializer); let mut var_perChunkDownloadTimeoutSeconds = ::sse_decode(deserializer); let mut var_bufferedDownloadMaxBytes = ::sse_decode(deserializer); + // MANUAL PATCH (Phase 2.x cross-platform audit): the new + // health_gate / block_cache / gateway_fallback fields are NOT + // yet on the wire from Dart (frb_codegen has not been re-run). + // Defaulting them via struct-update keeps the Rust struct + // initializable while the legacy 6-field wire format is still + // what Dart sends. Re-running `flutter_rust_bridge_codegen + // generate` regenerates this file and the new fields become + // settable from Dart. 
return crate::api::types::FulaConfig { endpoint: var_endpoint, access_token: var_accessToken, @@ -4201,6 +4209,7 @@ impl SseDecode for crate::api::types::FulaConfig { max_retries: var_maxRetries, per_chunk_download_timeout_seconds: var_perChunkDownloadTimeoutSeconds, buffered_download_max_bytes: var_bufferedDownloadMaxBytes, + ..crate::api::types::FulaConfig::default() }; } } @@ -6336,6 +6345,11 @@ mod io { impl CstDecode for wire_cst_fula_config { // Codec=Cst (C-struct based), see doc to use other codecs fn cst_decode(self) -> crate::api::types::FulaConfig { + // MANUAL PATCH (Phase 2.x cross-platform audit): see + // matching note on `SseDecode for FulaConfig`. The wire + // C-struct still has only the legacy 6 fields; new fields + // default until `flutter_rust_bridge_codegen generate` + // regenerates this file. crate::api::types::FulaConfig { endpoint: self.endpoint.cst_decode(), access_token: self.access_token.cst_decode(), @@ -6345,6 +6359,7 @@ mod io { .per_chunk_download_timeout_seconds .cst_decode(), buffered_download_max_bytes: self.buffered_download_max_bytes.cst_decode(), + ..crate::api::types::FulaConfig::default() } } } @@ -8638,6 +8653,10 @@ mod web { "Expected 6 elements, got {}", self_.length() ); + // MANUAL PATCH (Phase 2.x cross-platform audit): wasm/JS + // CstDecode path. The 6-element JsValue array carries the + // legacy fields; new Phase 2.x fields default until FRB + // regen. 
crate::api::types::FulaConfig { endpoint: self_.get(0).cst_decode(), access_token: self_.get(1).cst_decode(), @@ -8645,6 +8664,7 @@ mod web { max_retries: self_.get(3).cst_decode(), per_chunk_download_timeout_seconds: self_.get(4).cst_decode(), buffered_download_max_bytes: self_.get(5).cst_decode(), + ..crate::api::types::FulaConfig::default() } } } diff --git a/crates/fula-js/src/lib.rs b/crates/fula-js/src/lib.rs index 567827e..ae3def2 100644 --- a/crates/fula-js/src/lib.rs +++ b/crates/fula-js/src/lib.rs @@ -54,9 +54,80 @@ pub struct JsFulaConfig { /// Request timeout in seconds (default: 30) #[serde(default = "default_timeout")] pub timeout_seconds: u64, + + // ============================================================ + // Phase 2.1 — master-down detection (functional on wasm/web) + // ============================================================ + /// Enable the SDK's master health gate. Off by default + /// (backward-compat). When on, two consecutive failed master + /// requests trip the gate and short-circuit subsequent reads + /// with a `MASTER_UNREACHABLE` error. **Functional on wasm/web.** + #[serde(default)] + pub health_gate_enabled: bool, + + /// TTL of the `Down` state when `healthGateEnabled = true`. + /// After this duration elapses, the next request is allowed + /// through as a probe. Default: 30 seconds. + #[serde(default = "default_health_gate_ttl")] + pub health_gate_ttl_seconds: u64, + + // ============================================================ + // Phase 2.2 / 2.3 / 2.4 — block cache + gateway race + // ============================================================ + // + // These fields are NATIVE-ONLY at runtime. The underlying + // `fula_client::Config` carries them across all builds, but on + // the wasm32 target the SDK gates out the `redb`-backed cache + // and the parking_lot-based gateway pool, so setting these + // flags has no effect in browsers. 
+ // + // We expose them anyway for **API symmetry** with `fula-flutter`: + // a TypeScript app sharing config types between mobile and web + // builds can construct one config object and have it accepted + // by both. On web the offline path silently no-ops; on native + // (Tauri / Electron-with-Rust / Node-via-N-API integrations) the + // path activates as documented for fula-flutter. + + /// Enable the on-disk LRU block cache. **Native-only at runtime.** + /// On wasm/web this flag is silently inert. + #[serde(default)] + pub block_cache_enabled: bool, + + /// Filesystem path for the block-cache redb database. Empty + /// string = use platform default. **Native-only at runtime.** + #[serde(default)] + pub block_cache_path: String, + + /// Maximum on-disk bytes for the block cache. Default: 256 MiB. + /// **Native-only at runtime.** + #[serde(default = "default_block_cache_max_bytes")] + pub block_cache_max_bytes: u64, + + /// Enable falling back to public IPFS gateways when master is + /// unreachable. **Native-only at runtime.** Requires + /// `blockCacheEnabled = true` to populate the `(bucket,key) → cid` + /// lookup table the offline race needs. + #[serde(default)] + pub gateway_fallback_enabled: bool, + + /// Custom gateway URL templates. Each must contain the literal + /// `{cid}` token. Empty Vec = use the SDK-shipped default list + /// of six gateways (Cloudflare, dweb.link, ipfs.io, + /// trustless-gateway.link, 4everland.io, gateway.pinata.cloud). + /// **Native-only at runtime.** + #[serde(default)] + pub gateway_fallback_urls: Vec, + + /// Number of gateways the SDK races in parallel. Default: 3. 
+    /// **Native-only at runtime.**
+    #[serde(default = "default_gateway_race_concurrency")]
+    pub gateway_race_concurrency: u32,
 }
 
 fn default_timeout() -> u64 {
     30
 }
+fn default_health_gate_ttl() -> u64 { 30 }
+fn default_block_cache_max_bytes() -> u64 { 256 * 1024 * 1024 }
+fn default_gateway_race_concurrency() -> u32 { 3 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
@@ -173,12 +244,7 @@ pub async fn create_encrypted_client(
         .map_err(|e| JsError::new(&format!("Invalid encryption config: {}", e)))?;
 
     // Build client config
-    let mut client_config = fula_client::Config::new(&config.endpoint)
-        .with_timeout(std::time::Duration::from_secs(config.timeout_seconds));
-
-    if let Some(token) = config.access_token {
-        client_config = client_config.with_token(token);
-    }
+    let client_config = build_inner_config(config);
 
     // Build encryption config
     let enc_config = if let Some(secret_key) = encryption.secret_key {
@@ -204,13 +270,123 @@
     };
 
     let client = fula_client::EncryptedClient::new(client_config, enc_config)
-        .map_err(|e| JsError::new(&format!("Failed to create client: {}", e)))?;
+        .map_err(|e| client_error_to_js_error("create_client_failed", e))?;
 
     Ok(EncryptedClient {
         inner: Arc::new(Mutex::new(client)),
     })
 }
 
+// ============================================================================
+// Phase 2.x helpers
+// ============================================================================
+
+/// Translate a JS-flavoured `JsFulaConfig` into the underlying
+/// `fula_client::Config`, plumbing every Phase 1.2 / 2.x field
+/// through. Used by every JS client constructor — adding a new field
+/// means changing this function only.
+///
+/// Note on wasm32: the block_cache + gateway_fallback fields are
+/// silently ignored at runtime (the underlying SDK gates out the
+/// redb-backed cache and parking_lot-based pool).
They're still +/// plumbed through so that a single shared config struct works +/// across native + web targets. +fn build_inner_config(config: JsFulaConfig) -> fula_client::Config { + let mut inner = fula_client::Config::new(&config.endpoint) + .with_timeout(std::time::Duration::from_secs(config.timeout_seconds)); + + if let Some(token) = config.access_token { + inner = inner.with_token(token); + } + + // Phase 2.1 — health gate (functional on wasm). + inner.health_gate_enabled = config.health_gate_enabled; + inner.health_gate_ttl = + std::time::Duration::from_secs(config.health_gate_ttl_seconds); + + // Phase 2.2 — block cache (native-only at runtime; plumbed for symmetry). + inner.block_cache_enabled = config.block_cache_enabled; + inner.block_cache_path = if config.block_cache_path.is_empty() { + None + } else { + Some(std::path::PathBuf::from(config.block_cache_path)) + }; + inner.block_cache_max_bytes = config.block_cache_max_bytes; + + // Phase 2.3 / 2.4 — gateway race (native-only at runtime). + inner.gateway_fallback_enabled = config.gateway_fallback_enabled; + inner.gateway_fallback_urls = config.gateway_fallback_urls; + inner.gateway_race_concurrency = config.gateway_race_concurrency as usize; + + inner +} + +/// Convert a `fula_client::ClientError` into a `JsError` whose +/// message is a JSON object carrying a stable error `code` plus +/// any structured fields. JS callers can `JSON.parse(err.message)` +/// to dispatch on the code and surface it to UI logic — e.g., +/// "show offline indicator" on `MASTER_UNREACHABLE` rather than +/// just a generic "download failed". +/// +/// The set of codes is stable across native and wasm so apps can +/// share an error-handling layer. +fn client_error_to_js_error(operation: &str, e: fula_client::ClientError) -> JsError { + use fula_client::ClientError; + + // Compose stable code + human-readable message. 
+ let (code, structured) = match &e { + ClientError::MasterUnreachable { down_for_secs } => ( + "MASTER_UNREACHABLE", + serde_json::json!({ "downForSecs": down_for_secs }), + ), + ClientError::BlockTooLarge { size, budget } => ( + "BLOCK_TOO_LARGE", + serde_json::json!({ "size": size, "budget": budget }), + ), + ClientError::BlockCache(_) => ("BLOCK_CACHE_ERROR", serde_json::json!(null)), + ClientError::UsersIndexResolutionFailed { reason } => ( + "USERS_INDEX_RESOLUTION_FAILED", + serde_json::json!({ "reason": reason }), + ), + ClientError::SequenceRegression { observed, highest_seen, channel } => ( + "SEQUENCE_REGRESSION", + serde_json::json!({ + "observed": observed, + "highestSeen": highest_seen, + "channel": channel, + }), + ), + ClientError::NotFound { bucket, key } => ( + "NOT_FOUND", + serde_json::json!({ "bucket": bucket, "key": key }), + ), + ClientError::BucketNotFound(name) => ( + "BUCKET_NOT_FOUND", + serde_json::json!({ "name": name }), + ), + ClientError::AccessDenied(_) => ("ACCESS_DENIED", serde_json::json!(null)), + ClientError::ConcurrentModification(_) + | ClientError::ConcurrentModificationExhausted { .. 
} => { + ("CONCURRENT_MODIFICATION", serde_json::json!(null)) + } + ClientError::MigrationLockHeld { bucket, expires_at } => ( + "MIGRATION_LOCK_HELD", + serde_json::json!({ "bucket": bucket, "expiresAt": expires_at }), + ), + ClientError::Encryption(_) => ("ENCRYPTION", serde_json::json!(null)), + ClientError::Http(_) => ("HTTP", serde_json::json!(null)), + _ => ("INTERNAL", serde_json::json!(null)), + }; + + let payload = serde_json::json!({ + "code": code, + "operation": operation, + "message": e.to_string(), + "data": structured, + }); + JsError::new(&payload.to_string()) +} + // ============================================================================ // Encrypted Operations // ============================================================================ @@ -270,6 +446,14 @@ pub async fn put_encrypted_with_type( /// @param bucket - Bucket name /// @param key - Original object key (path) /// @returns Decrypted data as Uint8Array +/// +/// Errors surface as `JsError` whose `message` is a JSON-encoded +/// `{ code, operation, message, data }` object — `code` is one of +/// the stable codes documented on `client_error_to_js_error`. Apps +/// should `JSON.parse(err.message)` to dispatch on `code` (e.g., +/// `"MASTER_UNREACHABLE"` is the Phase 2.1 signal that the SDK's +/// health gate has tripped — surface an offline UI rather than a +/// generic "download failed"). #[wasm_bindgen(js_name = getDecrypted)] pub async fn get_decrypted( client: &EncryptedClient, @@ -279,11 +463,13 @@ pub async fn get_decrypted( let guard = client.inner.lock().await; let data = guard.get_object_decrypted(bucket, key) .await - .map_err(|e| JsError::new(&format!("Download failed: {}", e)))?; + .map_err(|e| client_error_to_js_error("get_decrypted", e))?; Ok(data.to_vec()) } /// Download and decrypt data by storage key +/// +/// Same structured-error contract as `getDecrypted`. 
#[wasm_bindgen(js_name = getDecryptedByStorageKey)] pub async fn get_decrypted_by_storage_key( client: &EncryptedClient, @@ -293,7 +479,7 @@ pub async fn get_decrypted_by_storage_key( let guard = client.inner.lock().await; let data = guard.get_object_decrypted_by_storage_key(bucket, storage_key) .await - .map_err(|e| JsError::new(&format!("Download failed: {}", e)))?; + .map_err(|e| client_error_to_js_error("get_decrypted_by_storage_key", e))?; Ok(data.to_vec()) } From be9ee9f667f70b23e7cfa4fbad1d2929a4bc93cc Mon Sep 17 00:00:00 2001 From: ehsan shariati Date: Mon, 4 May 2026 13:12:21 -0400 Subject: [PATCH 4/6] closed gaps in flutter and wasm for offline download --- crates/fula-client/Cargo.toml | 10 +- crates/fula-client/src/lib.rs | 23 +- crates/fula-client/src/registry_resolver.rs | 18 +- crates/fula-client/src/user_key.rs | 48 +++ crates/fula-flutter/src/api/client.rs | 197 +++++++++- crates/fula-flutter/src/api/types.rs | 272 ++++++++++++++ crates/fula-js/src/lib.rs | 375 +++++++++++++++++++- 7 files changed, 907 insertions(+), 36 deletions(-) create mode 100644 crates/fula-client/src/user_key.rs diff --git a/crates/fula-client/Cargo.toml b/crates/fula-client/Cargo.toml index 5543805..945579b 100644 --- a/crates/fula-client/Cargo.toml +++ b/crates/fula-client/Cargo.toml @@ -35,6 +35,14 @@ url = "2.5" base64 = { workspace = true } hex = { workspace = true } blake3 = { workspace = true } +# Phase 3.3 — `derive_user_key_from_email` lives in `src/user_key.rs` +# and is exposed on every target (wasm + native) so the wasm-bindgen +# binding can compute the user_key without round-tripping through +# `fula-crypto::derive_key_argon2id`. sha2 is a pure-Rust dep that +# builds cleanly on wasm32; was previously gated to native-only when +# the helper still lived inside the native-gated `registry_resolver` +# module, but with the helper extracted we need cross-target. 
+sha2 = { workspace = true } mime_guess = "2.0" tokio = { version = "1.42", default-features = false, features = ["sync"] } dashmap = { workspace = true } @@ -52,8 +60,6 @@ dirs = "5" # Native-only — wasm builds skip the cache (no persistent storage there anyway). redb = { workspace = true } cid = { workspace = true } -# CID verification on gateway-fetched bytes (Phase 2.3 of master-independent reads). -sha2 = { workspace = true } # Mutex for per-gateway state in gateway_fetch (Phase 2.3). parking_lot = { workspace = true } # Phase 3.3 cold-start hybrid resolver — parses the master-published diff --git a/crates/fula-client/src/lib.rs b/crates/fula-client/src/lib.rs index 53e8240..aae6cb9 100644 --- a/crates/fula-client/src/lib.rs +++ b/crates/fula-client/src/lib.rs @@ -50,6 +50,11 @@ mod multipart; #[cfg(not(target_arch = "wasm32"))] mod registry_resolver; mod types; +/// Phase 3.3 helper module — wasm-friendly userKey derivation +/// extracted from `registry_resolver.rs` so the wasm-bindgen +/// binding can expose it. Source-of-truth lives here; the +/// resolver re-exports it on native. +mod user_key; #[cfg(not(target_arch = "wasm32"))] mod orphan_queue; #[cfg(not(target_arch = "wasm32"))] @@ -86,16 +91,24 @@ pub use types::*; /// callbacks without depending on internal module paths. pub use health_gate::{HealthCallback, MasterHealthEvent}; +/// Phase 3.3 — `derive_user_key_from_email` available on EVERY +/// target (wasm + native). Apps compute the userKey at sign-in +/// time from the OAuth-provided email and stash it in +/// `Config::users_index_user_key`. The same function is also +/// re-exported via `registry_resolver` on native for backward +/// compatibility with code that imports it from there. +pub use user_key::derive_user_key_from_email; + /// Phase 3.3 — cold-start hybrid resolver public API. Native-only; /// the resolver itself is gated to `cfg(not(target_arch = "wasm32"))`. 
-/// The free helper `derive_user_key_from_email` is also re-exported -/// so JS / Flutter bindings can compute the user_key without holding -/// a client. +/// `derive_user_key_from_email` is re-exported above (cross-target); +/// callers using the `fula_client::registry_resolver::derive_user_key_from_email` +/// path also still resolve through the in-module `pub use`. #[cfg(not(target_arch = "wasm32"))] pub use registry_resolver::{ decode_user_buckets_index, default_ipfs_gateway_urls, default_ipns_gateway_urls, - derive_user_key_from_email, fetch_cid_via_gateways, BucketEntry, GlobalUsersIndex, - ResolutionSource, ResolvedUsersIndex, ResolverConfig, UserBucketsIndex, UsersIndexResolver, + fetch_cid_via_gateways, BucketEntry, GlobalUsersIndex, ResolutionSource, + ResolvedUsersIndex, ResolverConfig, UserBucketsIndex, UsersIndexResolver, }; /// Process-wide count of WAL append failures (F11). diff --git a/crates/fula-client/src/registry_resolver.rs b/crates/fula-client/src/registry_resolver.rs index f602eb1..4f28e5a 100644 --- a/crates/fula-client/src/registry_resolver.rs +++ b/crates/fula-client/src/registry_resolver.rs @@ -217,15 +217,15 @@ impl ResolverConfig { /// stay in lockstep with the master's `hash_user_id`; the /// `derive_user_key_matches_master_state_rs_algorithm` test below /// reproduces the master algorithm step-by-step and asserts equality. -pub fn derive_user_key_from_email(email: &str) -> String { - use sha2::{Digest, Sha256}; - let user_id_digest = Sha256::digest(email.to_lowercase().as_bytes()); - let user_id_hex = hex::encode(user_id_digest); - let mut hasher = blake3::Hasher::new(); - hasher.update(b"fula:user_id:"); - hasher.update(user_id_hex.as_bytes()); - hex::encode(&hasher.finalize().as_bytes()[..16]) -} +/// +/// Source-of-truth lives in `crate::user_key` (extracted there so the +/// wasm-bindgen binding can expose it — the `registry_resolver` +/// module itself is gated to native targets). 
This re-export keeps +/// the historical `fula_client::registry_resolver::derive_user_key_from_email` +/// import path working for native callers AND lets the test module +/// in this file (line 1485+) call the function via `use super::*;`. +#[allow(unused_imports)] +pub use crate::user_key::derive_user_key_from_email; /// Default IPNS-aware gateway list. Excludes /// `trustless-gateway.link` (only serves `/ipfs/`, not `/ipns/`). diff --git a/crates/fula-client/src/user_key.rs b/crates/fula-client/src/user_key.rs new file mode 100644 index 0000000..4b50265 --- /dev/null +++ b/crates/fula-client/src/user_key.rs @@ -0,0 +1,48 @@ +//! Phase 3.3 — userKey derivation, available on every target. +//! +//! `derive_user_key_from_email` was originally inlined in +//! `registry_resolver.rs`, but that module is gated to native via +//! `#![cfg(not(target_arch = "wasm32"))]` because it depends on +//! `reqwest`, `parking_lot`, and other crates that don't compile on +//! wasm. The userKey computation itself is pure: just `sha2` + +//! `blake3` + `hex` — all of which build cleanly on wasm32 (these +//! are already transitive deps of the wasm SDK build). +//! +//! Extracting the helper here lets the FRB and wasm-bindgen +//! bindings expose `derive_user_key_from_email` without having to +//! re-implement the algorithm. Master and SDK both produce the +//! same `userKey` for the same email, regardless of which target +//! the SDK was built for. +//! +//! **Algorithm (must stay in lockstep with master's `state.rs::hash_user_id`):** +//! +//! ```text +//! email_lower = email.to_lowercase() +//! user_id_digest = sha256(email_lower.as_bytes()) +//! user_id_hex = hex(user_id_digest) +//! domain_separated = "fula:user_id:" || user_id_hex +//! user_key = hex( blake3(domain_separated)[..16] ) +//! ``` +//! +//! Drift here vs. master = silent cold-start failure (master +//! publishes under userKey A, SDK looks up userKey B). The +//! 
`derive_user_key_matches_master_state_rs_algorithm` test in +//! `registry_resolver.rs` reproduces master's algorithm step-by-step +//! and asserts equality. + +use sha2::{Digest, Sha256}; + +/// Compute the canonical fula `userKey` for cold-start config from a +/// plaintext email. Returns 32 hex chars (16-byte BLAKE3 truncated digest). +/// +/// Apps call this at sign-in time (the OAuth flow has plaintext email) +/// and pass the returned string into `Config::users_index_user_key`. +/// The SDK never persists or transmits the raw email. +pub fn derive_user_key_from_email(email: &str) -> String { + let user_id_digest = Sha256::digest(email.to_lowercase().as_bytes()); + let user_id_hex = hex::encode(user_id_digest); + let mut hasher = blake3::Hasher::new(); + hasher.update(b"fula:user_id:"); + hasher.update(user_id_hex.as_bytes()); + hex::encode(&hasher.finalize().as_bytes()[..16]) +} diff --git a/crates/fula-flutter/src/api/client.rs b/crates/fula-flutter/src/api/client.rs index 84786b0..a0da53c 100644 --- a/crates/fula-flutter/src/api/client.rs +++ b/crates/fula-flutter/src/api/client.rs @@ -16,12 +16,19 @@ use async_lock::RwLock; use crate::api::types::*; /// Build the underlying `fula_client::Config` from the Dart-facing -/// `FulaConfig`, plumbing every Phase 1.2 / 2.x field through. Used by -/// `create_client`, `create_encrypted_client`, and -/// `create_encrypted_client_with_pinning` to keep the three constructors -/// in lockstep — adding a new field to FulaConfig only requires a -/// change here. -fn build_inner_config(config: &FulaConfig) -> fula_client::Config { +/// `FulaConfig`, plumbing every Phase 1.2 / 2.x / 3.3 / 19 field +/// through. Used by `create_client`, `create_encrypted_client`, and +/// `create_encrypted_client_with_pinning` to keep the three +/// constructors in lockstep — adding a new field to FulaConfig only +/// requires a change here. 
+/// +/// `dispatcher` is the per-handle dispatcher that the FRB layer +/// always wires into `Config::health_callback` so apps can subscribe +/// to `MasterHealthEvent` events via `subscribe_master_health_events`. +fn build_inner_config( + config: &FulaConfig, + dispatcher: &Arc, +) -> fula_client::Config { let mut inner = fula_client::Config::new(&config.endpoint) .with_timeout(Duration::from_secs(config.timeout_seconds)); @@ -50,6 +57,45 @@ fn build_inner_config(config: &FulaConfig) -> fula_client::Config { inner.gateway_fallback_urls = config.gateway_fallback_urls.clone(); inner.gateway_race_concurrency = config.gateway_race_concurrency as usize; + // Phase 3.3 — cold-start hybrid resolver. The resolver activates + // iff all four required strings (rpc_url, anchor_address, + // ipns_name, user_key) are non-empty AND the user_key is `Some`. + // Empty strings collapse to "disabled" — same default behavior as + // pre-Phase-3.3 builds. + inner.users_index_chain_rpc_url = config.users_index_chain_rpc_url.clone(); + inner.users_index_anchor_address = config.users_index_anchor_address.clone(); + inner.users_index_ipns_name = config.users_index_ipns_name.clone(); + inner.users_index_user_key = if config.users_index_user_key.is_empty() { + None + } else { + Some(config.users_index_user_key.clone()) + }; + inner.users_index_ipns_gateway_urls = + config.users_index_ipns_gateway_urls.clone(); + inner.users_index_ipfs_gateway_urls = + config.users_index_ipfs_gateway_urls.clone(); + + // Phase 19 — always wire a forwarding callback into the gate so + // Dart-side subscribers can observe health transitions. The + // dispatcher is per-handle, so events from this client never + // leak to a different client's subscribers. Native-only — wasm + // doesn't include the health-callback Arc in fula_client::Config + // because `Arc` doesn't cross wasm-bindgen cleanly; the + // wasm path surfaces via typed errors. 
+ #[cfg(not(target_arch = "wasm32"))] + { + let dispatcher = Arc::clone(dispatcher); + let cb: fula_client::HealthCallback = Arc::new(move |ev| { + dispatcher.dispatch(ev); + }); + inner.health_callback = Some(cb); + } + // Suppress unused-variable warning on wasm where we don't read + // `dispatcher` at config-build time (subscribers still register; + // they just never receive events because no callback fires). + #[cfg(target_arch = "wasm32")] + let _ = dispatcher; + if let Some(token) = &config.access_token { inner = inner.with_token(token.clone()); } @@ -63,11 +109,13 @@ fn build_inner_config(config: &FulaConfig) -> fula_client::Config { /// Create a new Fula client with the given configuration pub fn create_client(config: FulaConfig) -> anyhow::Result { - let inner_config = build_inner_config(&config); + let dispatcher = Arc::new(HealthEventDispatcher::new()); + let inner_config = build_inner_config(&config, &dispatcher); let client = fula_client::FulaClient::new(inner_config)?; Ok(FulaClientHandle { inner: Arc::new(client), + health_dispatcher: dispatcher, }) } @@ -76,7 +124,8 @@ pub fn create_encrypted_client( config: FulaConfig, encryption: EncryptionConfig, ) -> anyhow::Result { - let inner_config = build_inner_config(&config); + let dispatcher = Arc::new(HealthEventDispatcher::new()); + let inner_config = build_inner_config(&config, &dispatcher); // Create encryption config let enc_config = if let Some(secret_key) = encryption.secret_key { @@ -113,6 +162,7 @@ pub fn create_encrypted_client( Ok(EncryptedClientHandle { inner: Arc::new(RwLock::new(client)), + health_dispatcher: dispatcher, }) } @@ -122,7 +172,8 @@ pub fn create_encrypted_client_with_pinning( encryption: EncryptionConfig, pinning: PinningConfig, ) -> anyhow::Result { - let inner_config = build_inner_config(&config); + let dispatcher = Arc::new(HealthEventDispatcher::new()); + let inner_config = build_inner_config(&config, &dispatcher); // Create encryption config let enc_config = if let 
Some(secret_key) = encryption.secret_key { @@ -168,9 +219,137 @@ pub fn create_encrypted_client_with_pinning( Ok(EncryptedClientHandle { inner: Arc::new(RwLock::new(client)), + health_dispatcher: dispatcher, }) } +// ============================================================================ +// Phase 3.3 — derive_user_key_from_email +// ============================================================================ + +/// Compute the canonical fula `userKey` for cold-start config from a +/// plaintext email. Mirrors `fula_client::derive_user_key_from_email` +/// — same domain separator, same hash chain (sha256(lower(email)) +/// → BLAKE3("fula:user_id:" || _).bytes[..16] → hex-encode). +/// +/// Apps call this once at sign-in (the OAuth flow has plaintext +/// email), then set `FulaConfig::users_index_user_key` to the +/// returned string. The SDK never sees the raw email. +/// +/// Native-only — wasm32 surfaces this via the JS-side `deriveKey` +/// helper because the cold-start resolver (Phase 3.3) itself isn't +/// wired on wasm. +#[cfg(not(target_arch = "wasm32"))] +pub fn derive_user_key_from_email(email: String) -> String { + fula_client::derive_user_key_from_email(&email) +} + +#[cfg(target_arch = "wasm32")] +pub fn derive_user_key_from_email(_email: String) -> String { + // The Rust cold-start resolver isn't wired on wasm32; expose + // the function for API symmetry but emit an empty key so the + // resolver self-disables (per build_inner_config: empty user_key + // → users_index_user_key=None → resolver inactive). + String::new() +} + +// ============================================================================ +// Phase 19 — health-event subscription +// ============================================================================ + +/// Drain every `MasterHealthEvent` observed since the last call to +/// this function. Returns events in the order they fired (oldest +/// first). After draining the buffer is empty. 
+/// +/// Apps poll this on a timer (or on UI rebuilds) and update their +/// online/offline indicator. Internal buffer is bounded at 64 +/// entries — if an app falls so far behind that the buffer +/// overflows, the oldest events are dropped first; the latest state +/// is preserved. For latest-only consumers, see +/// [`get_last_master_health_event`]. +/// +/// Events delivered: +/// - `Online` — master went Up after being Down +/// - `OfflineFallbackActive { reason }` — master went Down +/// - `SeverelyDegraded { reason }` — both master AND cold-start +/// channels (IPNS + chain) are unreachable; cold-start GETs +/// will fail +/// +/// Native-only at runtime: on wasm32 the function compiles for API +/// symmetry but never returns events because the health-callback +/// Arc isn't wired on wasm (`Arc` doesn't cross +/// wasm-bindgen cleanly). +pub fn poll_master_health_events( + client: &FulaClientHandle, +) -> Vec { + client.health_dispatcher.drain_events() +} + +/// Same as `poll_master_health_events` for an `EncryptedClientHandle`. +/// Exposed separately because Dart-side the encrypted client has +/// its own handle type and FRB doesn't auto-reflect "this method +/// works on either handle". +pub fn poll_master_health_events_encrypted( + client: &EncryptedClientHandle, +) -> Vec { + client.health_dispatcher.drain_events() +} + +/// Read the most recent `MasterHealthEvent` observed by the SDK +/// without draining the buffer. Returns `None` if no transition has +/// happened yet (master has been Up the whole session). Useful for +/// apps that build UI state from a single field on mount. +pub fn get_last_master_health_event( + client: &FulaClientHandle, +) -> Option { + client.health_dispatcher.last_event() +} + +/// Encrypted-client variant of `get_last_master_health_event`. 
+pub fn get_last_master_health_event_encrypted( + client: &EncryptedClientHandle, +) -> Option { + client.health_dispatcher.last_event() +} + +// ============================================================================ +// Phase 19 — get_object_with_offline_fallback +// ============================================================================ + +/// Phase 19 GET wrapper that returns transparency fields alongside +/// the bytes. Routes through the SDK's full Phase 2.x + 3.3 stack: +/// +/// | State | Returns | +/// |-----------------------------------|-------------------------------------------| +/// | Master up | source = Master, freshness = Live | +/// | Master down + warm cache hit | source = LocalCache or Gateway(url), | +/// | | freshness = Cached { observed_at } | +/// | Master down + cold-start | source = Gateway(url), | +/// | | freshness = Cached { observed_at } | +/// | Master down + cache miss + no | Err(UsersIndexResolutionFailed) | +/// | resolver configured | | +/// +/// Apps that don't care about transparency can read `result.inner.data`. +/// Apps that surface "you're offline" UI inspect `result.source` / +/// `result.freshness`. +/// +/// Native-only at runtime: on wasm32 the SDK currently only wraps +/// `get_object_with_metadata` (no offline fallback infrastructure on +/// browsers — block_cache + gateway_fetch are gated out). The wasm +/// path returns `OfflineGetResult` with `source = Master, freshness = +/// Live` so the API shape is identical across platforms. 
+pub async fn get_object_with_offline_fallback( + client: &FulaClientHandle, + bucket: String, + key: String, +) -> anyhow::Result { + let result = client + .inner + .get_object_with_offline_fallback(&bucket, &key) + .await?; + Ok(result.into()) +} + // ============================================================================ // Bucket Operations // ============================================================================ diff --git a/crates/fula-flutter/src/api/types.rs b/crates/fula-flutter/src/api/types.rs index 01c7810..0ba41f6 100644 --- a/crates/fula-flutter/src/api/types.rs +++ b/crates/fula-flutter/src/api/types.rs @@ -106,6 +106,52 @@ pub struct FulaConfig { /// CID. Default: 3. Capped at the gateway-pool length. /// Native-only. pub gateway_race_concurrency: u32, + + // ============================================================ + // Phase 3.3 — cold-start hybrid resolver + // ============================================================ + // + // The resolver activates iff ALL of the following four fields + // are populated: + // - users_index_chain_rpc_url (non-empty) + // - users_index_anchor_address (non-empty) + // - users_index_ipns_name (non-empty) + // - users_index_user_key (non-empty) + // + // When any one is empty the resolver stays disabled; cold-start + // GETs fall through with `UsersIndexResolutionFailed`. Default + // values are all empty strings → resolver disabled (backward + // compat with pre-Phase-3.3 builds). + + /// JSON-RPC URL for the chain anchor contract (Base or SKALE). + /// Required to enable Phase 3.3 cold-start. Empty → disabled. + pub users_index_chain_rpc_url: String, + + /// `FulaUsersIndexAnchor.sol` contract address (20 bytes hex, + /// optionally `0x`-prefixed). Required to enable Phase 3.3. + pub users_index_anchor_address: String, + + /// IPNS NAME (libp2p public-key hash, e.g. `k51qzi5...`) under + /// which the master publishes the global users-index CBOR. + /// Required to enable Phase 3.3. 
+ pub users_index_ipns_name: String, + + /// 32-hex-char `userKey` (= `BLAKE3("fula:user_id:" || sha256(lower(email)))[..16]`). + /// Compute via the free function [`derive_user_key_from_email`] + /// at sign-in time and pass in here. The SDK does not store the + /// raw email. Required to enable Phase 3.3. + pub users_index_user_key: String, + + /// IPNS-aware gateway URL templates (each must contain `{name}`). + /// Empty Vec = use SDK-shipped defaults + /// (Cloudflare/dweb.link/ipfs.io/4everland/Pinata). + /// Native-only — wasm cold-start uses the typed-error path. + pub users_index_ipns_gateway_urls: Vec, + + /// `/ipfs/{cid}` gateway URL templates (each must contain `{cid}`). + /// Empty Vec = use SDK-shipped 6-gateway default list. + /// Native-only. + pub users_index_ipfs_gateway_urls: Vec, } impl Default for FulaConfig { @@ -128,10 +174,92 @@ impl Default for FulaConfig { gateway_fallback_enabled: false, gateway_fallback_urls: Vec::new(), gateway_race_concurrency: 3, + // Phase 3.3 — resolver disabled by default. Operator + // sets the four required fields at sign-in to enable + // cold-start; cold-start surfaces UsersIndexResolutionFailed + // until they're set, mirroring the Rust core's behavior. + users_index_chain_rpc_url: String::new(), + users_index_anchor_address: String::new(), + users_index_ipns_name: String::new(), + users_index_user_key: String::new(), + users_index_ipns_gateway_urls: Vec::new(), + users_index_ipfs_gateway_urls: Vec::new(), } } } +// ============================================================ +// Phase 19 — transparency surfaces +// ============================================================ + +/// Where the bytes of a successfully-served read came from. Mirrors +/// `fula_client::ReadSource`. Apps surface "you're reading from +/// cache" / "served by a public gateway" UI based on this. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum FulaReadSource { + /// Master S3 served the request directly (fast path). 
+ Master, + /// On-disk redb BLOCKS table served the bytes — no network at all. + LocalCache, + /// Public IPFS gateway served the bytes (master-down fallback). + /// The string is the URL template (e.g. `https://ipfs.io/ipfs/{cid}`) + /// that won the gateway race — useful for diagnostics or + /// "served by ipfs.io" labeling. + Gateway(String), +} + +/// Freshness of a successfully-served read. Mirrors +/// `fula_client::ReadFreshness`. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum FulaReadFreshness { + /// Master-served bytes (fresh, by definition). + Live, + /// Served from on-disk cache. `observed_at` is the unix-millis + /// when the entry was first written to cache. + Cached { observed_at: u64 }, + /// Cold-start cross-device read; snapshot age within the + /// publisher cadence (≤ `USERS_INDEX_FLUSH_INTERVAL`). Apps + /// can surface "synced N min ago". + StaleByDesign { snapshot_age_secs: u64 }, + /// Cold-start cross-device read; snapshot age exceeds the + /// publisher cadence — likely indicates an actual master outage. + StaleByOutage { snapshot_age_secs: u64 }, +} + +/// Wrapper around `GetObjectResult` adding Phase 19 transparency +/// fields. Mirrors `fula_client::OfflineGetResult`. Apps that +/// don't care about transparency just read `.inner.data`. +#[derive(Debug, Clone)] +pub struct OfflineGetResult { + /// Underlying `GetObjectResult` — `data`, `etag`, `content_type`, + /// `content_length`, `last_modified`, `metadata` are on `inner`. + pub inner: GetObjectResult, + /// Where the bytes ultimately came from. + pub source: FulaReadSource, + /// How fresh the bytes are. + pub freshness: FulaReadFreshness, +} + +/// Master-server reachability transition events. Mirrors +/// `fula_client::MasterHealthEvent`. Subscribed via +/// `subscribe_master_health_events`; apps wire the stream to +/// online/offline UI affordances. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum MasterHealthEvent { + /// Master S3 is reachable; reads use the fast path. 
+ Online, + /// Master S3 is unreachable; SDK is falling back to IPFS + /// gateways (Phase 2.4) or cold-start resolver (Phase 3.3). + OfflineFallbackActive { reason: String }, + /// Both master S3 AND chain RPC are unreachable. Cold-start + /// reads fail; warm reads (cached metadata) still work via + /// gateways. Apps should disable "open new bucket" / "first- + /// read" UI affordances when this fires. Emitted only from + /// the cold-start failure path — the health gate alone can't + /// authoritatively detect "both down" without trying. + SeverelyDegraded { reason: String }, +} + /// Configuration for client-side encryption #[derive(Debug, Clone)] pub struct EncryptionConfig { @@ -471,6 +599,14 @@ pub struct UploadProgress { #[derive(Clone)] pub struct FulaClientHandle { pub(crate) inner: Arc, + /// Phase 19 — dispatcher for `MasterHealthEvent` stream + /// subscribers. The Config's `health_callback` (set by + /// `build_inner_config`) captures a clone of this `Arc` and + /// forwards each transition to all live subscribers AND to a + /// "last event seen" slot exposed via `get_last_master_health_event`. + /// Always present so apps can subscribe at any time without + /// re-creating the client. + pub(crate) health_dispatcher: Arc, } /// Handle to an EncryptedClient instance @@ -479,6 +615,86 @@ pub struct FulaClientHandle { #[derive(Clone)] pub struct EncryptedClientHandle { pub(crate) inner: Arc>, + /// Phase 19 — same dispatcher pattern as FulaClientHandle. + /// Encrypted-client construction also threads the callback into + /// the underlying `fula_client::Config` so warm-cache + cold- + /// start transitions both surface to subscribers. + pub(crate) health_dispatcher: Arc, +} + +/// Phase 19 — internal dispatcher that captures `MasterHealthEvent` +/// transitions for two consumption patterns: +/// +/// 1. 
**Polling drain** (`poll_master_health_events`): apps call +/// this periodically (or on UI rebuilds) and receive every +/// event observed since the last call. The internal buffer is +/// a bounded `VecDeque` capped at `MAX_BUFFERED_EVENTS`; if the +/// app falls so far behind that the buffer overflows, oldest +/// events are dropped first (apps care about the *latest* state, +/// not the entire history). +/// +/// 2. **Latest-state read** (`get_last_master_health_event`): +/// returns the most recent event without draining. Useful for +/// apps that want to display "you're offline" immediately on +/// mount based on whatever the SDK has observed so far. +/// +/// **Why polling instead of a Dart `Stream`:** wiring `StreamSink` +/// requires the FRB codegen to have seen `MasterHealthEvent` — a +/// chicken-and-egg dependency on `flutter_rust_bridge_codegen +/// generate` having run after this commit. Polling sidesteps that +/// while still giving apps every event in order. A future iteration +/// can layer a `Stream` on top once codegen has +/// registered the type, without breaking this polling API. +/// +/// Wrapping the buffer in `parking_lot::Mutex` (sync, no `await`) +/// is required because the dispatcher is invoked from the SDK's hot +/// path inside `health_gate::fire_event`, which doesn't tolerate +/// async locks. +pub struct HealthEventDispatcher { + /// Pending events not yet drained by `poll_master_health_events`. + /// Bounded by [`MAX_BUFFERED_EVENTS`]. + buffer: parking_lot::Mutex>, + /// Most recent event observed, regardless of whether it was + /// drained. Read by `get_last_master_health_event`. + last_event: parking_lot::Mutex>, +} + +/// Maximum number of pending events held by [`HealthEventDispatcher`] +/// before older ones get dropped. 64 is plenty for typical apps — +/// a healthy session sees a handful of transitions per hour at most. 
+const MAX_BUFFERED_EVENTS: usize = 64; + +impl HealthEventDispatcher { + pub(crate) fn new() -> Self { + Self { + buffer: parking_lot::Mutex::new(std::collections::VecDeque::new()), + last_event: parking_lot::Mutex::new(None), + } + } + + /// Called from the `health_callback` set on the underlying + /// `fula_client::Config`. Captures the event for both polling + /// drain and latest-state read. + pub(crate) fn dispatch(&self, event: fula_client::MasterHealthEvent) { + let app_event: MasterHealthEvent = event.into(); + *self.last_event.lock() = Some(app_event.clone()); + let mut buf = self.buffer.lock(); + if buf.len() >= MAX_BUFFERED_EVENTS { + // Drop oldest to make room for newest. Apps care about + // the latest state more than ancient history. + buf.pop_front(); + } + buf.push_back(app_event); + } + + pub(crate) fn drain_events(&self) -> Vec { + let mut buf = self.buffer.lock(); + buf.drain(..).collect() + } + + pub(crate) fn last_event(&self) -> Option { + self.last_event.lock().clone() + } } /// Handle to an accepted share @@ -604,6 +820,62 @@ impl From for GetObjectResult { } } +// Phase 19 transparency conversions. These bridge the Rust-core +// `fula_client::*` types to FRB-friendly Dart-side equivalents. +// They're plain unit/struct/string-payload variants so FRB v2's +// codegen produces a sealed Dart class without any custom adapter. 
+ +impl From for FulaReadSource { + fn from(s: fula_client::ReadSource) -> Self { + match s { + fula_client::ReadSource::Master => FulaReadSource::Master, + fula_client::ReadSource::LocalCache => FulaReadSource::LocalCache, + fula_client::ReadSource::Gateway(url) => FulaReadSource::Gateway(url), + } + } +} + +impl From for FulaReadFreshness { + fn from(f: fula_client::ReadFreshness) -> Self { + match f { + fula_client::ReadFreshness::Live => FulaReadFreshness::Live, + fula_client::ReadFreshness::Cached { observed_at } => { + FulaReadFreshness::Cached { observed_at } + } + fula_client::ReadFreshness::StaleByDesign { snapshot_age_secs } => { + FulaReadFreshness::StaleByDesign { snapshot_age_secs } + } + fula_client::ReadFreshness::StaleByOutage { snapshot_age_secs } => { + FulaReadFreshness::StaleByOutage { snapshot_age_secs } + } + } + } +} + +impl From for OfflineGetResult { + fn from(r: fula_client::OfflineGetResult) -> Self { + Self { + inner: r.inner.into(), + source: r.source.into(), + freshness: r.freshness.into(), + } + } +} + +impl From for MasterHealthEvent { + fn from(e: fula_client::MasterHealthEvent) -> Self { + match e { + fula_client::MasterHealthEvent::Online => MasterHealthEvent::Online, + fula_client::MasterHealthEvent::OfflineFallbackActive { reason } => { + MasterHealthEvent::OfflineFallbackActive { reason } + } + fula_client::MasterHealthEvent::SeverelyDegraded { reason } => { + MasterHealthEvent::SeverelyDegraded { reason } + } + } + } +} + impl From for DecryptedObjectInfo { fn from(r: fula_client::DecryptedObjectInfo) -> Self { Self { diff --git a/crates/fula-js/src/lib.rs b/crates/fula-js/src/lib.rs index ae3def2..ed3b3fb 100644 --- a/crates/fula-js/src/lib.rs +++ b/crates/fula-js/src/lib.rs @@ -122,6 +122,53 @@ pub struct JsFulaConfig { /// **Native-only at runtime.** #[serde(default = "default_gateway_race_concurrency")] pub gateway_race_concurrency: u32, + + // ============================================================ + // Phase 3.3 — 
cold-start hybrid resolver (native-only at runtime) + // ============================================================ + // + // The cold-start resolver itself is gated to native targets in + // `fula-client` (the JSON-RPC eth_call + IPNS gateway race rely + // on `reqwest` + `parking_lot` paths that aren't compiled on + // wasm32). These fields are accepted on wasm for **API symmetry** + // — a TS app sharing a config object across mobile + web can + // pass them through unconditionally; the wasm build silently + // disables cold-start. Apps that need offline reads on the web + // still get Phase 2.1 (health gate + typed `MASTER_UNREACHABLE` + // error); cold-start cross-device support is mobile-only today. + + /// JSON-RPC URL for the chain anchor (Base or SKALE). Empty = + /// disabled. **Native-only at runtime.** + #[serde(default)] + pub users_index_chain_rpc_url: String, + + /// `FulaUsersIndexAnchor.sol` proxy address (20 bytes hex, + /// optionally `0x`-prefixed). Empty = disabled. **Native-only + /// at runtime.** + #[serde(default)] + pub users_index_anchor_address: String, + + /// IPNS NAME (libp2p public-key hash, e.g. `k51qzi5...`). + /// Empty = disabled. **Native-only at runtime.** + #[serde(default)] + pub users_index_ipns_name: String, + + /// 32-hex-char `userKey` derived from the user's email via + /// [`derive_user_key_from_email`]. Empty = disabled. + /// **Native-only at runtime.** + #[serde(default)] + pub users_index_user_key: String, + + /// IPNS-aware gateway URL templates (each must contain `{name}`). + /// Empty Vec = use SDK-shipped defaults. **Native-only at runtime.** + #[serde(default)] + pub users_index_ipns_gateway_urls: Vec, + + /// `/ipfs/{cid}` gateway URL templates (each must contain `{cid}`). + /// Empty Vec = use SDK-shipped 6-gateway default. 
**Native-only + /// at runtime.** + #[serde(default)] + pub users_index_ipfs_gateway_urls: Vec, } fn default_timeout() -> u64 { 30 } @@ -208,14 +255,191 @@ pub struct JsSharePermissions { pub expires_at: Option, } +// ============================================================================ +// Phase 19 — transparency types +// ============================================================================ +// +// All three are `serde`-tagged enums / structs so JS sees an idiomatic +// shape: +// ReadSource: { kind: "Master" } +// { kind: "LocalCache" } +// { kind: "Gateway", url: "https://ipfs.io/ipfs/{cid}" } +// ReadFreshness: { kind: "Live" } +// { kind: "Cached", observedAt: 1234567890 } +// { kind: "StaleByDesign", snapshotAgeSecs: 60 } +// { kind: "StaleByOutage", snapshotAgeSecs: 7200 } +// MasterHealthEvent: same `kind` discriminant +// Apps `switch` on `result.source.kind` to drive UI. + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase", tag = "kind")] +pub enum JsReadSource { + Master, + LocalCache, + Gateway { url: String }, +} + +impl From for JsReadSource { + fn from(s: fula_client::ReadSource) -> Self { + match s { + fula_client::ReadSource::Master => JsReadSource::Master, + fula_client::ReadSource::LocalCache => JsReadSource::LocalCache, + fula_client::ReadSource::Gateway(url) => JsReadSource::Gateway { url }, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase", tag = "kind")] +pub enum JsReadFreshness { + Live, + #[serde(rename_all = "camelCase")] + Cached { observed_at: u64 }, + #[serde(rename_all = "camelCase")] + StaleByDesign { snapshot_age_secs: u64 }, + #[serde(rename_all = "camelCase")] + StaleByOutage { snapshot_age_secs: u64 }, +} + +impl From for JsReadFreshness { + fn from(f: fula_client::ReadFreshness) -> Self { + match f { + fula_client::ReadFreshness::Live => JsReadFreshness::Live, + fula_client::ReadFreshness::Cached { observed_at } => { + 
JsReadFreshness::Cached { observed_at } + } + fula_client::ReadFreshness::StaleByDesign { snapshot_age_secs } => { + JsReadFreshness::StaleByDesign { snapshot_age_secs } + } + fula_client::ReadFreshness::StaleByOutage { snapshot_age_secs } => { + JsReadFreshness::StaleByOutage { snapshot_age_secs } + } + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct JsOfflineGetResult { + /// Object data (bytes). + pub data: Vec, + /// ETag (CID string when bytes came from gateway race / cache; + /// master-issued ETag when bytes came from master). + pub etag: String, + /// Content type if known (always `None` on offline-fallback paths + /// today; master-served reads carry the response Content-Type). + pub content_type: Option, + /// Object size in bytes. + pub size: u64, + /// Last-modified timestamp (Unix epoch seconds) if master served + /// the bytes; 0 on offline-fallback paths. + pub last_modified: i64, + /// Where the bytes ultimately came from. + pub source: JsReadSource, + /// How fresh the bytes are. 
+ pub freshness: JsReadFreshness, +} + +impl From for JsOfflineGetResult { + fn from(r: fula_client::OfflineGetResult) -> Self { + let inner = r.inner; + Self { + data: inner.data.to_vec(), + etag: inner.etag, + content_type: inner.content_type, + size: inner.content_length, + last_modified: inner.last_modified.map(|d| d.timestamp()).unwrap_or(0), + source: r.source.into(), + freshness: r.freshness.into(), + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "camelCase", tag = "kind")] +pub enum JsMasterHealthEvent { + Online, + OfflineFallbackActive { reason: String }, + SeverelyDegraded { reason: String }, +} + +impl From for JsMasterHealthEvent { + fn from(e: fula_client::MasterHealthEvent) -> Self { + match e { + fula_client::MasterHealthEvent::Online => JsMasterHealthEvent::Online, + fula_client::MasterHealthEvent::OfflineFallbackActive { reason } => { + JsMasterHealthEvent::OfflineFallbackActive { reason } + } + fula_client::MasterHealthEvent::SeverelyDegraded { reason } => { + JsMasterHealthEvent::SeverelyDegraded { reason } + } + } + } +} + // ============================================================================ // Client Handles (opaque types exposed to JS) // ============================================================================ +/// Phase 19 — wasm dispatcher capturing `MasterHealthEvent` +/// transitions for polling consumers. The Rust callback set on the +/// inner `Config::health_callback` pushes events here; JS apps drain +/// via `pollMasterHealthEvents` or read latest via +/// `getLastMasterHealthEvent`. +/// +/// Buffer is bounded at 64 entries — apps that fall further behind +/// drop the oldest events (latest state is what UI cares about). 
+struct WasmHealthEventDispatcher { + buffer: std::sync::Mutex>, + last_event: std::sync::Mutex>, +} + +const WASM_MAX_BUFFERED_EVENTS: usize = 64; + +impl WasmHealthEventDispatcher { + fn new() -> Self { + Self { + buffer: std::sync::Mutex::new(std::collections::VecDeque::new()), + last_event: std::sync::Mutex::new(None), + } + } + + /// Called from the `health_callback` set on the inner Config. + /// Captures the event for both polling drain + latest-state read. + fn dispatch(&self, event: fula_client::MasterHealthEvent) { + let app_event: JsMasterHealthEvent = event.into(); + if let Ok(mut last) = self.last_event.lock() { + *last = Some(app_event.clone()); + } + if let Ok(mut buf) = self.buffer.lock() { + if buf.len() >= WASM_MAX_BUFFERED_EVENTS { + buf.pop_front(); + } + buf.push_back(app_event); + } + } + + fn drain_events(&self) -> Vec { + self.buffer + .lock() + .map(|mut buf| buf.drain(..).collect()) + .unwrap_or_default() + } + + fn last_event(&self) -> Option { + self.last_event.lock().ok().and_then(|guard| guard.clone()) + } +} + /// Handle to an encrypted Fula client #[wasm_bindgen] pub struct EncryptedClient { inner: Arc>, + /// Phase 19 — per-client health-event dispatcher. Always present + /// so apps can poll regardless of whether they wired + /// `healthGateEnabled = true`. When the gate is off, the buffer + /// stays empty (no events fire); polling returns `[]`. + health_dispatcher: Arc, } /// Handle to an accepted share for accessing shared files @@ -243,8 +467,11 @@ pub async fn create_encrypted_client( let encryption: JsEncryptionConfig = serde_wasm_bindgen::from_value(encryption) .map_err(|e| JsError::new(&format!("Invalid encryption config: {}", e)))?; - // Build client config - let client_config = build_inner_config(config); + // Phase 19 dispatcher — created per client so events from one + // EncryptedClient never leak to another's poll buffer. 
+ let dispatcher = Arc::new(WasmHealthEventDispatcher::new()); + // Build client config (callback wired to dispatcher). + let client_config = build_inner_config(config, &dispatcher); // Build encryption config let enc_config = if let Some(secret_key) = encryption.secret_key { @@ -274,6 +501,7 @@ pub async fn create_encrypted_client( Ok(EncryptedClient { inner: Arc::new(Mutex::new(client)), + health_dispatcher: dispatcher, }) } @@ -282,16 +510,23 @@ pub async fn create_encrypted_client( // ============================================================================ /// Translate a Dart-flavoured `JsFulaConfig` into the underlying -/// `fula_client::Config`, plumbing every Phase 1.2 / 2.x field -/// through. Used by every JS client constructor — adding a new field -/// means changing this function only. +/// `fula_client::Config`, plumbing every Phase 1.2 / 2.x / 3.3 / 19 +/// field through. Used by every JS client constructor — adding a new +/// field means changing this function only. /// -/// Note on wasm32: the block_cache + gateway_fallback fields are -/// silently ignored at runtime (the underlying SDK gates out the -/// redb-backed cache and parking_lot-based pool). They're still -/// plumbed through so that a single shared config struct works -/// across native + web targets. -fn build_inner_config(config: JsFulaConfig) -> fula_client::Config { +/// `dispatcher` is the per-client Phase 19 dispatcher; the callback +/// wired into `Config::health_callback` forwards each transition to +/// it so JS apps can poll via `pollMasterHealthEvents`. +/// +/// Note on wasm32: the block_cache + gateway_fallback + cold-start +/// resolver fields are silently inert at runtime (the underlying SDK +/// gates out the redb-backed cache, parking_lot-based pool, and +/// reqwest-based resolver). They're still plumbed through so a single +/// shared config struct works across native + web targets. 
+fn build_inner_config( + config: JsFulaConfig, + dispatcher: &Arc, +) -> fula_client::Config { let mut inner = fula_client::Config::new(&config.endpoint) .with_timeout(std::time::Duration::from_secs(config.timeout_seconds)); @@ -304,6 +539,17 @@ fn build_inner_config(config: JsFulaConfig) -> fula_client::Config { inner.health_gate_ttl = std::time::Duration::from_secs(config.health_gate_ttl_seconds); + // Phase 19 — wire forwarding callback into the gate. The callback + // is `Arc` which lives entirely in Rust; it never crosses + // the wasm-bindgen boundary (the wasm boundary is between Rust + // and JS — the Arc stays inside Rust). HealthGate fires + // it from `record_success` / `record_failure` regardless of target. + let dispatcher_for_cb = Arc::clone(dispatcher); + let cb: fula_client::HealthCallback = Arc::new(move |ev| { + dispatcher_for_cb.dispatch(ev); + }); + inner.health_callback = Some(cb); + // Phase 2.2 — block cache (native-only at runtime; plumbed for symmetry). inner.block_cache_enabled = config.block_cache_enabled; inner.block_cache_path = if config.block_cache_path.is_empty() { @@ -318,6 +564,21 @@ fn build_inner_config(config: JsFulaConfig) -> fula_client::Config { inner.gateway_fallback_urls = config.gateway_fallback_urls; inner.gateway_race_concurrency = config.gateway_race_concurrency as usize; + // Phase 3.3 — cold-start hybrid resolver (native-only at runtime; + // plumbed for symmetry). Empty strings → resolver disabled (the + // four required fields are all string-empty in JsFulaConfig's + // Default impl-equivalent via `#[serde(default)]`). 
+ inner.users_index_chain_rpc_url = config.users_index_chain_rpc_url; + inner.users_index_anchor_address = config.users_index_anchor_address; + inner.users_index_ipns_name = config.users_index_ipns_name; + inner.users_index_user_key = if config.users_index_user_key.is_empty() { + None + } else { + Some(config.users_index_user_key) + }; + inner.users_index_ipns_gateway_urls = config.users_index_ipns_gateway_urls; + inner.users_index_ipfs_gateway_urls = config.users_index_ipfs_gateway_urls; + inner } @@ -793,6 +1054,98 @@ pub async fn is_flat_namespace(client: &EncryptedClient) -> bool { guard.is_flat_namespace() } +// ============================================================================ +// Phase 3.3 — userKey derivation +// ============================================================================ + +/// Compute the canonical fula `userKey` for cold-start config from a +/// plaintext email. Mirrors `fula_client::derive_user_key_from_email` +/// — same domain separator + double-hash chain (sha256(lower(email)) +/// → BLAKE3("fula:user_id:" || _).bytes[..16] → hex). +/// +/// Apps call this once at sign-in (the OAuth flow has plaintext +/// email), then set `users_index_user_key` on the config object +/// passed to `createEncryptedClient`. The SDK never persists or +/// transmits the raw email. +/// +/// On wasm32 the cold-start RESOLVER itself isn't wired (it depends +/// on reqwest + parking_lot which aren't compiled for browsers), so +/// this helper is exposed for API symmetry — apps can compute the +/// userKey on web for sharing across native + web identity flows. 
+#[wasm_bindgen(js_name = deriveUserKeyFromEmail)] +pub fn derive_user_key_from_email(email: String) -> String { + fula_client::derive_user_key_from_email(&email) +} + +// ============================================================================ +// Phase 19 — get_object_with_offline_fallback + transparency polling +// ============================================================================ + +/// Phase 19 GET wrapper that returns transparency fields alongside +/// the bytes. Mirrors `fula-flutter`'s `getObjectWithOfflineFallback`. +/// On wasm32 the offline fallback infrastructure is gated out (no +/// block cache, no gateway race), so this delegates to the +/// master-only `get_object_with_metadata` path; the returned shape +/// always carries `source = Master, freshness = Live`. Exposed for +/// API symmetry with the Flutter binding. +/// +/// @param client - EncryptedClient (the underlying wraps a FulaClient too) +/// @param bucket - Bucket name +/// @param key - Object key +/// @returns - JSON object matching `JsOfflineGetResult` +/// (`data: number[]`, `etag: string`, `source: {kind: ...}`, +/// `freshness: {kind: ...}`, ...) +#[wasm_bindgen(js_name = getObjectWithOfflineFallback)] +pub async fn get_object_with_offline_fallback( + client: &EncryptedClient, + bucket: String, + key: String, +) -> Result { + let guard = client.inner.lock().await; + // The `EncryptedClient` doesn't expose `get_object_with_offline_fallback` + // directly; it's on the underlying `FulaClient`. Reach in via + // `inner()`. + let result = guard + .inner() + .get_object_with_offline_fallback(&bucket, &key) + .await + .map_err(|e| client_error_to_js_error("get_offline_fallback_failed", e))?; + let js_result: JsOfflineGetResult = result.into(); + serde_wasm_bindgen::to_value(&js_result) + .map_err(|e| JsError::new(&format!("serialize OfflineGetResult: {}", e))) +} + +/// Drain every `MasterHealthEvent` observed since the last call to +/// this function. 
Returns events in the order they fired (oldest +/// first); after draining the buffer is empty. +/// +/// JS apps poll this on a timer (or on UI rebuilds) and update an +/// online/offline indicator. Internal buffer bounded at 64 entries — +/// if an app falls behind, oldest events drop first, latest state is +/// preserved. For latest-only consumers, see `getLastMasterHealthEvent`. +/// +/// Returned shape: `Array<{kind: 'Online'} | {kind: 'OfflineFallbackActive', reason: string} | {kind: 'SeverelyDegraded', reason: string}>`. +#[wasm_bindgen(js_name = pollMasterHealthEvents)] +pub fn poll_master_health_events(client: &EncryptedClient) -> Result { + let events = client.health_dispatcher.drain_events(); + serde_wasm_bindgen::to_value(&events) + .map_err(|e| JsError::new(&format!("serialize health events: {}", e))) +} + +/// Read the most recent `MasterHealthEvent` observed by the SDK +/// without draining the buffer. Returns `null` if no transition has +/// happened yet (master has been Up the whole session). Useful for +/// apps that build UI state from a single field on mount. +/// +/// Returned shape: same as a single element from `pollMasterHealthEvents`, +/// or `null`. 
+#[wasm_bindgen(js_name = getLastMasterHealthEvent)] +pub fn get_last_master_health_event(client: &EncryptedClient) -> Result { + let last = client.health_dispatcher.last_event(); + serde_wasm_bindgen::to_value(&last) + .map_err(|e| JsError::new(&format!("serialize last health event: {}", e))) +} + /// Get SDK version #[wasm_bindgen(js_name = getVersion)] pub fn get_version() -> String { From 6bfca4e0d5417f59dfe7d2d168a2e7a891803d60 Mon Sep 17 00:00:00 2001 From: ehsan shariati Date: Mon, 4 May 2026 13:43:29 -0400 Subject: [PATCH 5/6] updated version + doc fix + CI tests fix --- Cargo.lock | 16 +- Cargo.toml | 2 +- crates/fula-client/src/health_gate.rs | 21 +- crates/fula-crypto/src/time.rs | 43 +++++ .../tests/flutter_bridge_tests.rs | 7 + docs/flutter-integration.md | 172 ++++++++++++++++- docs/wasm-compatibility.md | 34 ++++ docs/website/api.html | 136 ++++++++++++- docs/website/benchmark.html | 2 +- docs/website/index.html | 2 +- docs/website/platforms.html | 2 +- docs/website/sdk.html | 180 +++++++++++++++++- docs/website/security.html | 4 +- docs/website/x402.html | 2 +- packages/fula_client/CHANGELOG.md | 45 +++++ packages/fula_client/ios/fula_client.podspec | 2 +- packages/fula_client/pubspec.yaml | 2 +- 17 files changed, 640 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a3e6cfd..d4d9e49 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1676,7 +1676,7 @@ dependencies = [ [[package]] name = "fula-api" -version = "0.3.7" +version = "0.4.0" dependencies = [ "anyhow", "axum", @@ -1704,7 +1704,7 @@ dependencies = [ [[package]] name = "fula-blockstore" -version = "0.3.7" +version = "0.4.0" dependencies = [ "anyhow", "async-trait", @@ -1742,7 +1742,7 @@ dependencies = [ [[package]] name = "fula-cli" -version = "0.3.7" +version = "0.4.0" dependencies = [ "anyhow", "async-trait", @@ -1792,7 +1792,7 @@ dependencies = [ [[package]] name = "fula-client" -version = "0.3.7" +version = "0.4.0" dependencies = [ "anyhow", "async-trait", @@ -1831,7 
+1831,7 @@ dependencies = [ [[package]] name = "fula-core" -version = "0.3.7" +version = "0.4.0" dependencies = [ "anyhow", "async-trait", @@ -1866,7 +1866,7 @@ dependencies = [ [[package]] name = "fula-crypto" -version = "0.3.7" +version = "0.4.0" dependencies = [ "aes-gcm", "anyhow", @@ -1910,7 +1910,7 @@ dependencies = [ [[package]] name = "fula-flutter" -version = "0.3.7" +version = "0.4.0" dependencies = [ "anyhow", "async-lock", @@ -1933,7 +1933,7 @@ dependencies = [ [[package]] name = "fula-js" -version = "0.3.7" +version = "0.4.0" dependencies = [ "base64 0.22.1", "bytes", diff --git a/Cargo.toml b/Cargo.toml index e835bbc..bfad95e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -77,7 +77,7 @@ name = "encrypted_upload_test" path = "examples/encrypted_upload_test.rs" [workspace.package] -version = "0.3.7" +version = "0.4.0" edition = "2021" license = "MIT OR Apache-2.0" repository = "https://github.com/functionland/fula-api" diff --git a/crates/fula-client/src/health_gate.rs b/crates/fula-client/src/health_gate.rs index 0942743..7e9081b 100644 --- a/crates/fula-client/src/health_gate.rs +++ b/crates/fula-client/src/health_gate.rs @@ -30,7 +30,7 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; use std::sync::Arc; -use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use std::time::Duration; /// Phase 19 transparency surface — events the SDK emits when its /// view of master-server reachability changes. Apps wire a @@ -218,15 +218,18 @@ pub enum GateDecision { ShortCircuit { down_for_secs: u64 }, } -/// Current unix-time in milliseconds. Wall-clock based (so SystemTime -/// adjustments can shift the gate's perceived "since" — acceptable here -/// since we only compare durations, and a clock jump is at worst a slight -/// TTL anomaly). +/// Current unix-time in milliseconds. 
Wall-clock based (so a system- +/// clock adjustment can shift the gate's perceived "since" — +/// acceptable here since we only compare durations, and a clock jump +/// is at worst a slight TTL anomaly). +/// +/// Routed through `fula_crypto::time::now_millis` so the wasm32 build +/// uses `js_sys::Date::now()` instead of `SystemTime::now()` (the +/// latter panics on wasm32 with "time not implemented on this +/// platform" — the wasm clippy `disallowed-methods` config catches +/// this at lint time). fn now_ms() -> u64 { - SystemTime::now() - .duration_since(UNIX_EPOCH) - .map(|d| d.as_millis() as u64) - .unwrap_or(0) + fula_crypto::time::now_millis() } #[cfg(test)] diff --git a/crates/fula-crypto/src/time.rs b/crates/fula-crypto/src/time.rs index f44d032..a39af7f 100644 --- a/crates/fula-crypto/src/time.rs +++ b/crates/fula-crypto/src/time.rs @@ -21,6 +21,27 @@ pub fn now_timestamp() -> i64 { .as_secs() as i64 } +/// Get current Unix timestamp in milliseconds (WASM-compatible) +/// +/// Returns the current time as milliseconds since the Unix epoch. +/// Companion to `now_timestamp` for callers that need millisecond +/// resolution (e.g., the master health gate's TTL bookkeeping where +/// sub-second precision matters across rapid Up↔Down transitions). +/// Works in both native Rust and WASM environments. 
+#[cfg(target_arch = "wasm32")] +pub fn now_millis() -> u64 { + js_sys::Date::now() as u64 +} + +/// Get current Unix timestamp in milliseconds (native) +#[cfg(not(target_arch = "wasm32"))] +pub fn now_millis() -> u64 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis() as u64) + .unwrap_or(0) +} + #[cfg(test)] mod tests { use super::*; @@ -33,4 +54,26 @@ mod tests { // Should be before Jan 1, 2100 (timestamp: 4102444800) assert!(ts < 4102444800, "Timestamp should be before 2100"); } + + #[test] + fn test_now_millis_reasonable() { + let ms = now_millis(); + // Should be after Jan 1, 2020 in ms (1577836800000) + assert!(ms > 1_577_836_800_000, "ms timestamp should be after 2020"); + // Should be before Jan 1, 2100 in ms + assert!(ms < 4_102_444_800_000, "ms timestamp should be before 2100"); + } + + #[test] + fn test_now_millis_matches_seconds_within_tolerance() { + // Sanity: the millis helper agrees with the seconds helper to + // the second. Catches an accidental scaling bug. + let ms = now_millis(); + let s = now_timestamp() as u64; + let derived_s = ms / 1000; + assert!( + derived_s.abs_diff(s) <= 1, + "now_millis()/1000 ({derived_s}) and now_timestamp() ({s}) must agree to within 1s", + ); + } } diff --git a/crates/fula-flutter/tests/flutter_bridge_tests.rs b/crates/fula-flutter/tests/flutter_bridge_tests.rs index eaca096..6ef503b 100644 --- a/crates/fula-flutter/tests/flutter_bridge_tests.rs +++ b/crates/fula-flutter/tests/flutter_bridge_tests.rs @@ -25,6 +25,12 @@ fn test_fula_config_default() { #[test] fn test_fula_config_with_values() { + // Construct via `..Default::default()` so adding new fields to + // `FulaConfig` (e.g., Phase 2.x / 3.3 / 19) doesn't require + // updating this test. The pre-Phase-2.x fields below are the + // ones this test specifically exercises; everything else inherits + // from `Default::default()` which is the documented backward- + // compat shape (all new flags off / empty). 
let config = FulaConfig { endpoint: "https://api.example.com".to_string(), access_token: Some("test-token".to_string()), @@ -32,6 +38,7 @@ fn test_fula_config_with_values() { max_retries: 5, per_chunk_download_timeout_seconds: 120, buffered_download_max_bytes: 64 * 1024 * 1024, + ..Default::default() }; assert_eq!(config.endpoint, "https://api.example.com"); assert_eq!(config.access_token, Some("test-token".to_string())); diff --git a/docs/flutter-integration.md b/docs/flutter-integration.md index fd6942e..9d664a1 100644 --- a/docs/flutter-integration.md +++ b/docs/flutter-integration.md @@ -104,15 +104,46 @@ application/wasm ### Configuration Types #### FulaConfig + ```dart class FulaConfig { - final String endpoint; // Gateway URL (e.g., "http://localhost:9000") - final String? accessToken; // JWT authentication token - final int timeoutSeconds; // Request timeout (default: 30) - final int maxRetries; // Retry attempts (default: 3) + // Connection + final String endpoint; // Gateway URL (e.g., "http://localhost:9000") + final String? 
accessToken; // JWT authentication token + final int timeoutSeconds; // Request timeout (default: 30) + final int maxRetries; // Retry attempts (default: 3) + final int perChunkDownloadTimeoutSeconds; // F10: per-chunk timeout (default: 300) + final int bufferedDownloadMaxBytes; // F8: buffered download cap (default: 256 MiB) + + // Phase 2.1 — master-down detection (functional on every target) + final bool healthGateEnabled; // default: false + final int healthGateTtlSeconds; // default: 30 + + // Phase 2.2 — persistent block cache (native-only at runtime; flags + // accepted on web for config symmetry, silently inert in browsers) + final bool blockCacheEnabled; // default: false + final String blockCachePath; // default: "" → platform default + final int blockCacheMaxBytes; // default: 256 MiB + + // Phase 2.3 / 2.4 — IPFS gateway race + warm-device offline GET + final bool gatewayFallbackEnabled; // default: false (requires blockCacheEnabled) + final List gatewayFallbackUrls; // default: [] → ships 6 public gateways + final int gatewayRaceConcurrency; // default: 3 + + // Phase 3.3 — cold-start hybrid resolver (native-only at runtime). + // The resolver activates iff ALL four required fields are populated; + // empty values disable cold-start (the warm-device path still works). + final String usersIndexChainRpcUrl; // operator-supplied (Base/SKALE) + final String usersIndexAnchorAddress; // operator-supplied + final String usersIndexIpnsName; // operator-supplied (k51qzi5...) + final String usersIndexUserKey; // app-derived via deriveUserKeyFromEmail + final List usersIndexIpnsGatewayUrls; // default: [] → SDK defaults + final List usersIndexIpfsGatewayUrls; // default: [] → SDK defaults } ``` +All flags default OFF — apps that don't opt in see byte-identical behavior to pre-Phase-2.x builds. 
+ #### EncryptionConfig ```dart class EncryptionConfig { @@ -372,6 +403,111 @@ final restoredClient = await createEncryptedClient( ); ``` +## Offline Reads (Phase 2 + 3) + +When the master gateway is unreachable, the SDK can transparently fall back to public IPFS gateways AND, on a fresh device install, cold-start by resolving a globally-published users-index from IPNS or the chain anchor — no client wallet, no fresh master required. + +### Two-tier offline read + +| Scenario | Path | +|---|---| +| **Warm device** (signed in before, has block cache) | Phase 2.x — gateway race using cached `(bucket, key) → cid` | +| **Fresh install** (no cache) | Phase 3.3 — cold-start resolver fetches global users-index via IPNS or chain, then walks per-user manifest | +| **Master up** | Direct master read (fast path, byte-identical to today) | + +### Step 1 — Enable warm-device offline reads + +```dart +final config = FulaConfig( + endpoint: 'https://your-fula-gateway.com:9000', + accessToken: jwt, + // Phase 2.1 — detect master-down without per-read timeout tax + healthGateEnabled: true, + // Phase 2.2 — persistent block cache (gateway hits land here) + blockCacheEnabled: true, + // Phase 2.4 — fall back to public gateways when master is down + gatewayFallbackEnabled: true, +); +``` + +### Step 2 — (Optional) Enable cold-start for fresh installs + +In addition to the Phase 2.x flags, pass the four operator-supplied resolver fields and the per-user `userKey` derived from the user's email: + +```dart +import 'package:fula_client/fula_client.dart'; + +// Compute userKey once at sign-in. Email is hashed locally; +// the SDK never sees the plaintext on the wire. 
+final userKey = deriveUserKeyFromEmail(userEmail); + +final config = FulaConfig( + endpoint: 'https://your-fula-gateway.com:9000', + accessToken: jwt, + healthGateEnabled: true, + blockCacheEnabled: true, + gatewayFallbackEnabled: true, + // Phase 3.3 — cold-start hybrid resolver + usersIndexChainRpcUrl: 'https://mainnet.base.org', // or SKALE + usersIndexAnchorAddress: '0x...FulaUsersIndexAnchor...', + usersIndexIpnsName: 'k51qzi5uqu5dh...', // operator's published IPNS NAME + usersIndexUserKey: userKey, +); +``` + +### Step 3 — Read with transparency fields + +```dart +final result = await getObjectWithOfflineFallback(client, 'my-bucket', 'photos/cat.jpg'); +final bytes = result.inner.data; + +// Surface "you're offline" UI +switch (result.source) { + case FulaReadSource.master: + // fast path — master served the bytes directly + break; + case FulaReadSource.localCache: + // BLOCKS hit — no network round-trip at all + showToast('Reading from cache (offline)'); + break; + case FulaReadSource.gateway: + // gateway race served the bytes; result.source.url has the gateway URL + showToast('Reading via public IPFS (master is down)'); + break; +} +``` + +### Step 4 — Subscribe to master health transitions + +Two patterns are exposed; pick whichever fits your app: + +```dart +// Pattern A: drain events on a timer / on UI rebuild +final events = pollMasterHealthEvents(client); +for (final event in events) { + switch (event) { + case MasterHealthEvent.online: + setState(() => isOffline = false); + break; + case MasterHealthEvent.offlineFallbackActive: + setState(() => isOffline = true); + break; + case MasterHealthEvent.severelyDegraded: + // both master AND chain unreachable — disable "create new bucket" UI + setState(() => canStartFresh = false); + break; + } +} + +// Pattern B: read latest event on mount (no buffer drain) +final last = getLastMasterHealthEvent(client); +if (last is MasterHealthEvent.offlineFallbackActive) { + // app started while master is down +} +``` 
+ +The `EncryptedClient` has corresponding `pollMasterHealthEventsEncrypted` and `getLastMasterHealthEventEncrypted` variants. + ## Error Handling All operations can throw `FulaError` with specific error types: @@ -388,11 +524,39 @@ try { print('Access denied: ${e.message}'); break; case FulaError.network: + // includes Phase 2.1 MasterUnreachable print('Network error: ${e.message}'); break; case FulaError.encryption: print('Encryption error: ${e.message}'); break; + // Phase 2.x cache errors + case FulaError.cacheBudgetExceeded: + // Phase 2.2: block too large for the cache budget; not fatal — + // the read still succeeded, just not cached. + print('Cache budget exceeded for ${e.size} bytes (budget: ${e.budget})'); + break; + case FulaError.cacheError: + // Phase 2.2: redb open / read / write failure; offline path + // disabled for this session. + print('Block cache unavailable: ${e.message}'); + break; + // Phase 3.3 cold-start errors + case FulaError.usersIndexResolutionFailed: + // Both IPNS and chain channels failed — cold-start unavailable. + // Surface to user as "can't reach storage; please try again later". + print('Cold-start resolver exhausted: ${e.reason}'); + break; + case FulaError.sequenceRegression: + // Replay-defense rejection — the resolver observed a sequence + // older than what it has previously seen. Either a stale gateway + // response or (rarely) a tampered payload. SDK retries the + // alternate channel automatically; this surface is for logging. + print( + 'Sequence regression on ${e.channel}: ' + 'observed=${e.observed}, highestSeen=${e.highestSeen}', + ); + break; default: print('Error: $e'); } diff --git a/docs/wasm-compatibility.md b/docs/wasm-compatibility.md index 72bc2b0..1bbc65b 100644 --- a/docs/wasm-compatibility.md +++ b/docs/wasm-compatibility.md @@ -248,6 +248,40 @@ The API surface is identical between native and WASM builds. However: 2. **Multithreading:** WASM is single-threaded. Async operations use the event loop. 3. 
**Timing:** High-resolution timers may be limited in WASM for security reasons. +## Master-Independent Reads (v0.4.0) — what works on wasm + +v0.4.0 adds the offline-read story (Phase 2.1 / 2.2 / 2.3 / 2.4 / 3.3 / 19). The full surface is exposed in **both** `fula-flutter` and `fula-js` bindings for API symmetry, but several layers are **inert at runtime on wasm32** because their dependencies (redb, `parking_lot`-based gateway pool, reqwest-with-tls, `std::time::SystemTime`) don't compile cleanly in browsers. + +### Functional on wasm32 + +| Surface | Notes | +|---|---| +| **Health gate (Phase 2.1)** | `health_gate_enabled` + `health_gate_ttl` work. Internal `now_ms()` routes through `fula_crypto::time::now_millis()` which uses `js_sys::Date::now()` on wasm32 (the `clippy::disallowed_methods` config in `.github/clippy-wasm/clippy.toml` bans `std::time::SystemTime::now` to catch regressions at lint time). Two consecutive request failures still trip the gate; reads short-circuit with `MasterUnreachable` instead of paying the timeout tax. | +| **Transparency polling (Phase 19)** | `pollMasterHealthEvents()` and `getLastMasterHealthEvent()` work. The dispatcher captures every transition fired by the in-Rust health gate. On wasm32 the gate fires for the same conditions as native (failed master requests), so apps see the same event stream. `MasterHealthEvent::SeverelyDegraded` is only emitted by the cold-start resolver, which is native-only — on wasm you'll see `Online` / `OfflineFallbackActive` only. | +| **`derive_user_key_from_email`** | Pure `sha256` + `blake3` + `hex` — no native-only deps. Apps can compute the userKey on web for cross-platform identity flows (e.g., compute on a desktop Tauri app, replicate on a web companion using the same email + algorithm). | +| **`get_object_with_offline_fallback`** | Compiles and runs on wasm32. 
With block_cache + gateway_fallback inert (see below), the wasm path always returns `source: Master, freshness: Live` — i.e., it's effectively `get_object_with_metadata` wrapped in the `OfflineGetResult` shape. Apps can consume the result identically across native + web. | + +### Inert on wasm32 (fields accepted, runtime no-op) + +| Surface | Why inert | Effect | +|---|---|---| +| **Block cache (Phase 2.2)** | `redb` is a native-only embedded KV (mmap + file locks). The whole `crates/fula-client/src/block_cache.rs` file is gated `#![cfg(not(target_arch = "wasm32"))]`. | Setting `block_cache_enabled = true` on wasm is silently ignored. The SDK never persists block bytes and never observes the `(bucket, key) → cid` map needed by the gateway-race fallback. | +| **Gateway race (Phase 2.3 / 2.4)** | Depends on the block cache (for the `(bucket, key) → cid` lookup table) AND on `parking_lot`'s native-only mutex behavior in the gateway-state ring. Whole `gateway_fetch.rs` is gated. | Setting `gateway_fallback_enabled = true` on wasm is silently ignored. Master-down reads on web surface as `MasterUnreachable` errors instead of falling through to a public gateway. | +| **Cold-start hybrid resolver (Phase 3.3)** | `registry_resolver.rs` is gated `#![cfg(not(target_arch = "wasm32"))]` because it depends on `reqwest` with native-tls, `parking_lot`, and `serde_ipld_dagcbor` paths that the wasm build chain doesn't currently support. | Setting `users_index_*` fields on wasm is silently ignored. Cold-start GETs (master-down + cache miss) surface `UsersIndexResolutionFailed`. | + +### Why expose inert flags at all + +The fields are accepted on every target so a TypeScript app sharing a config struct between mobile (where everything works) and web (where some flags are silently inert) can construct one config object without per-platform branches. On web, the offline path simply degrades to "no offline path" — typed errors come back instead of fallback paths firing. 
Apps that want web-side offline reads today should rely on browser caching (HTTP-level service workers) until the wasm-side gateway race lands in a future release. + +### Adding new wasm-incompatible API to fula-client + +When adding a new SDK surface that depends on `std::time::SystemTime::now`, `std::time::Instant::now`, file I/O, or any other native-only call: + +1. **Either** gate the function with `#[cfg(not(target_arch = "wasm32"))]` so it's excluded from wasm builds entirely. +2. **Or** route the call through `fula_crypto::time::now_timestamp()` / `now_millis()` (or `web_time::Instant::now()` for monotonic timing). + +The CI's `test-wasm` job loads `.github/clippy-wasm/clippy.toml` via `CLIPPY_CONF_DIR` and runs `cargo clippy --target wasm32-unknown-unknown -D clippy::disallowed-methods`. This catches `SystemTime::now` / `Instant::now` regressions before merge. Native clippy ignores the config. + ## Performance Considerations 1. **Crypto operations:** libcrux-ml-kem is optimized but may be ~10-20% slower than native C in WASM. diff --git a/docs/website/api.html b/docs/website/api.html index c0d1dff..d436689 100644 --- a/docs/website/api.html +++ b/docs/website/api.html @@ -43,7 +43,7 @@ +
@@ -1214,6 +1224,130 @@

Permissions builder helpers

+
+
+
+

Master-Independent Reads (v0.4.0)

+

+ v0.4.0 adds a coordinated server + SDK story so non-blox clients can read their own files even when the master gateway is offline. + The SDK-side surfaces (gateway race, block cache, cold-start resolver, transparency types) live in SDK Examples; + this section documents the HTTP-level additions on the master server itself. +

+

What's new on the master

+
    +
  • Optional PUT header x-amz-meta-fula-bucket-lookup-h that the SDK attaches on Phase 2 manifest-root commits (Phase 1.2).
  • +
  • Background users-index publisher that pins per-user + global CBORs to ipfs-cluster, publishes to IPNS every 5 min, and exposes the latest state via an internal endpoint (Phase 3.2).
  • +
  • Bearer-protected internal admin endpoints at /_internal/* (master) and /admin/users-index-anchor/trigger (mainnet-rewards-server) so operators can force a publish / chain-submit on demand instead of waiting up to 12 hours.
  • +
+

Every new server-side path is gated by an env flag that defaults to OFF; old fula-clients see byte-identical behavior to pre-v0.4.0 builds.

+
+
+
+ +
+
+
+

Phase 1.2 — x-amz-meta-fula-bucket-lookup-h

+

+ Optional user-metadata header that the encrypted SDK attaches on the Phase 2 manifest-root PUT. + Carries a 16-byte client-derived blinded bucket lookup key (BLAKE3 of MetadataKey || bucket_name), + so the published global users-index CBOR can key its bucket entries without leaking plaintext bucket names to anyone who fetches it. +

+

Format

+
x-amz-meta-fula-bucket-lookup-h: <32 hex chars>
+

32 lowercase hex chars (16 bytes). Master-side handler at fula-cli/src/handlers/object.rs calls BucketManager::populate_lookup_h_if_missing after the flush.

+

Behavior

+
    +
  • Idempotent — once populated for a bucket, the field is never overwritten on subsequent PUTs.
  • +
  • Non-fatal — a malformed or missing header is logged at warn! level; the PUT response is unchanged.
  • +
  • Env-gated — master ignores the header unless FULA_BUCKET_LOOKUP_H_ENABLED=1.
  • +
  • Backward-compat — old clients (no header) work unchanged. Buckets created without the header are emitted in the published CBOR with legacy: true and a plaintext-name key, so cold-start can still find them.
  • +
+
+
+
+ +
+
+
+

Phase 3.2 — GET /_internal/users-index-state

+

Returns the master's current published users-index state. Consumed by the chain-anchor cron in mainnet-rewards-server; rarely useful to apps directly.

+

Auth

+

Authorization: Bearer <FULA_USERS_INDEX_INTERNAL_TOKEN> — generated by the operator setup script and shared between master + cron + (optionally) pinning-webui.

+

Response codes

+
    +
  • 200 — success; body documented below.
  • +
  • 401 — bearer missing or wrong (constant-time compared).
  • +
  • 503 — fail-closed: publisher disabled (FULA_USERS_INDEX_PUBLISHER_ENABLED unset) OR token unset.
  • +
+

Response body (JSON)

+
{
+  "cid": "bafyrei...",          // CID of the latest pinned global users-index CBOR (or null on pre-first-tick)
+  "sequence": 17,                // monotonic sequence inside the CBOR payload
+  "updated_at_unix": 1714780000, // wall-clock timestamp of last commit
+  "ipns_key_name": "fula-users-index"
+}
+
+
+
+ +
+
+
+

Operator — POST /_internal/publish-now

+

Triggers an immediate publisher tick instead of waiting up to FULA_USERS_INDEX_FLUSH_INTERVAL_SECS (default 5 min). Useful during deploy verification.

+

Auth

+

Same bearer token as /_internal/users-index-state. Same 401 / 503 fail-closed semantics.

+

Response (200) body

+
{
+  "global_cid": "bafyrei...",
+  "sequence": 18,
+  "changed_users": 1,    // users whose per-user CBOR was newly pinned
+  "failed_users": 0,     // per-user pins that failed (tick continues; failed users retry next tick)
+  "total_users": 6,
+  "global_rebuilt": true
+}
+

Operator UI

+

+ The pinning-webui admin section (/admin/fula) ships a "Publish now" button that proxies to this endpoint. + Reuses the operator's session cookie for the inbound auth and the bearer token for the outbound call. +

+
+
+
+ +
+
+
+

Operator — POST /admin/users-index-anchor/trigger

+

+ On the mainnet-rewards-server (not the master gateway). Triggers an immediate users-index chain-anchor submission instead of waiting up to 12 h for the periodic cron. +

+

Auth

+

Reuses FULA_USERS_INDEX_INTERNAL_TOKEN via Authorization: Bearer .... Constant-time compare.

+

Response codes

+
    +
  • 200 — tick committed; per-network results in body.
  • +
  • 401 — wrong/missing bearer.
  • +
  • 409 — another tick is already in flight (cron OR a prior HTTP trigger). Retry after a moment.
  • +
  • 503 — fail-closed: anchor service disabled (FULA_USERS_INDEX_ANCHOR_ENABLED unset) OR token unset.
  • +
+

Response (200) body

+
{
+  "committed": true,
+  "masterCid": "bafyrei...",
+  "masterSequence": "18",
+  "networks": [
+    { "network": "base",  "status": "fulfilled", "submitted": true },
+    { "network": "skale", "status": "fulfilled", "submitted": false }
+  ]
+}
+

Concurrency

+

An in-flight flag inside runTick prevents two simultaneous ticks from racing the on-chain latest() reads + publish() calls (which would cause one tx to revert with NonMonotonicSequence). HTTP triggers that contend with the periodic cron get a clean 409.

+
+
+
+

Fula API Documentation • Built with ❤️ for decentralized storage

diff --git a/docs/website/benchmark.html b/docs/website/benchmark.html index b751c62..18739a8 100644 --- a/docs/website/benchmark.html +++ b/docs/website/benchmark.html @@ -42,7 +42,7 @@ +
+
+
+

📡 Offline Reads (v0.4.0)

+

+ When the master gateway is unreachable, the SDK can transparently fall back to public IPFS gateways (warm-device path) and, on a fresh device install, cold-start by resolving a globally-published users-index from IPNS or the chain anchor — no client wallet, no fresh master required. +

+

+ Every flag defaults OFF for backward compatibility. Apps that don't opt in see byte-identical behavior to pre-v0.4.0 builds. +

+ +

Three-tier read path

+
    +
  • Master up — direct master read (fast path, byte-identical to today)
  • +
  • Master down + warm cache — gateway race using the cached (bucket, key) → cid map (Phase 2.x)
  • +
  • Master down + fresh install — cold-start hybrid resolver fetches the global users-index via IPNS first, then chain anchor as backup; walks the per-user manifest from there (Phase 3.3)
  • +
+ +

Rust — enable warm-device offline reads

+
use fula_client::{Config, FulaClient};
+use std::time::Duration;
+
+let mut config = Config::new("https://your-fula-gateway.com:9000")
+    .with_token(jwt);
+
+// Phase 2.1 — detect master-down without per-read timeout tax
+config.health_gate_enabled = true;
+config.health_gate_ttl = Duration::from_secs(30);
+
+// Phase 2.2 — persistent block cache (gateway hits land here)
+config.block_cache_enabled = true;
+// config.block_cache_path = Some(...);  // None → platform default
+config.block_cache_max_bytes = 256 * 1024 * 1024;
+
+// Phase 2.4 — fall back to public gateways when master is down
+config.gateway_fallback_enabled = true;
+// config.gateway_fallback_urls = vec![...];  // [] → SDK ships 6 default gateways
+config.gateway_race_concurrency = 3;
+
+let client = FulaClient::new(config)?;
+
+ +

Rust — also enable cold-start (fresh device install)

+
use fula_client::derive_user_key_from_email;
+
+// Compute userKey ONCE at sign-in. Email is hashed locally; the SDK
+// never persists or transmits it.
+let user_key = derive_user_key_from_email(&user_email);
+
+// In addition to the warm-device flags above:
+config.users_index_chain_rpc_url = "https://mainnet.base.org".into();
+config.users_index_anchor_address =
+    "0x...FulaUsersIndexAnchor...".into();
+config.users_index_ipns_name = "k51qzi5uqu5dh...".into();
+config.users_index_user_key = Some(user_key);
+// config.users_index_ipns_gateway_urls = vec![...];  // [] → SDK defaults
+// config.users_index_ipfs_gateway_urls = vec![...];  // [] → SDK defaults
+
+ +

Rust — read with transparency fields

+
use fula_client::{ReadSource, ReadFreshness};
+
+let result = client
+    .get_object_with_offline_fallback("my-bucket", "photos/cat.jpg")
+    .await?;
+
+let bytes = &result.inner.data;
+match result.source {
+    ReadSource::Master           => log::debug!("served by master"),
+    ReadSource::LocalCache       => log::info!("served from local cache (offline)"),
+    ReadSource::Gateway(url)     => log::info!("served via {}", url),
+}
+match result.freshness {
+    ReadFreshness::Live                                 => {}
+    ReadFreshness::Cached { observed_at }               => {
+        log::info!("cached entry from unix-millis {}", observed_at);
+    }
+    ReadFreshness::StaleByDesign { snapshot_age_secs }  => { /* Phase 3.3 */ }
+    ReadFreshness::StaleByOutage { snapshot_age_secs }  => { /* Phase 3.3 */ }
+}
+
+ +

Flutter / Dart — same surface via FRB bindings

+
// 1. Compute userKey at sign-in
+final userKey = deriveUserKeyFromEmail(userEmail);
+
+// 2. Construct config with all relevant flags
+final config = FulaConfig(
+  endpoint: 'https://your-fula-gateway.com:9000',
+  accessToken: jwt,
+  healthGateEnabled: true,
+  blockCacheEnabled: true,
+  gatewayFallbackEnabled: true,
+  // cold-start (Phase 3.3) — native-only at runtime
+  usersIndexChainRpcUrl: 'https://mainnet.base.org',
+  usersIndexAnchorAddress: '0x...',
+  usersIndexIpnsName: 'k51qzi5uqu5dh...',
+  usersIndexUserKey: userKey,
+);
+final client = await createClient(config);
+
+// 3. Read with transparency
+final result = await getObjectWithOfflineFallback(client, 'my-bucket', 'photos/cat.jpg');
+final bytes = result.inner.data;
+print('source: ${result.source}, freshness: ${result.freshness}');
+
+ +

JavaScript / TypeScript — wasm-bindgen surface

+
import {
+  createEncryptedClient,
+  getObjectWithOfflineFallback,
+  deriveUserKeyFromEmail,
+  pollMasterHealthEvents,
+  getLastMasterHealthEvent,
+} from '@functionland/fula-client';
+
+const userKey = deriveUserKeyFromEmail(userEmail);
+
+const client = await createEncryptedClient(
+  {
+    endpoint: 'https://your-fula-gateway.com:9000',
+    accessToken: jwt,
+    healthGateEnabled: true,                 // functional on web
+    blockCacheEnabled: true,                 // accepted but inert on web
+    gatewayFallbackEnabled: true,            // accepted but inert on web
+    usersIndexChainRpcUrl: '...',            // accepted but inert on web
+    usersIndexAnchorAddress: '0x...',
+    usersIndexIpnsName: 'k51qzi5...',
+    usersIndexUserKey: userKey,
+  },
+  encryptionConfig,
+);
+
+const result = await getObjectWithOfflineFallback(client, 'my-bucket', 'photos/cat.jpg');
+console.log(result.source);     // {kind: 'Master'} | {kind: 'LocalCache'} | {kind: 'Gateway', url: ...}
+console.log(result.freshness);  // {kind: 'Live'} | {kind: 'Cached', observedAt: ...}
+
+ +

Health-event subscription (Phase 19)

+

Two patterns are exposed in every binding (Rust closure / Dart polling / JS polling); pick whichever fits your app.

+ +
// JS / TS — drain on a timer or UI rebuild
+const events = pollMasterHealthEvents(client);  // Array<MasterHealthEvent>
+for (const event of events) {
+  switch (event.kind) {
+    case 'Online':                  setOffline(false); break;
+    case 'OfflineFallbackActive':   setOffline(true); break;
+    case 'SeverelyDegraded':        disableNewBucketUI(); break;
+  }
+}
+
+// Or read latest on mount
+const last = getLastMasterHealthEvent(client);
+if (last && last.kind === 'OfflineFallbackActive') {
+  // app started while master is down
+}
+
+ +

What works where

+<table>
+  <tr><th>Surface</th><th>Native (Rust / Flutter)</th><th>Web (wasm / JS)</th></tr>
+  <tr><td>Health gate (Phase 2.1)</td><td>✅ functional</td><td>✅ functional</td></tr>
+  <tr><td>Block cache (Phase 2.2)</td><td>✅ functional</td><td>⚪ flag accepted, inert</td></tr>
+  <tr><td>Gateway race (Phase 2.3 / 2.4)</td><td>✅ functional</td><td>⚪ flag accepted, inert</td></tr>
+  <tr><td>Cold-start resolver (Phase 3.3)</td><td>✅ functional</td><td>⚪ fields accepted, inert</td></tr>
+  <tr><td>Transparency types &amp; polling (Phase 19)</td><td>✅ functional</td><td>✅ functional (returns Master/Live on the inert paths)</td></tr>
+  <tr><td>derive_user_key_from_email helper</td><td>✅</td><td>✅ (computed locally; useful for cross-platform identity flows)</td></tr>
+</table>
+

+ See WASM Compatibility for the full gating story. +

+
+
+
+
diff --git a/docs/website/security.html b/docs/website/security.html index 39d69b2..5ec0b98 100644 --- a/docs/website/security.html +++ b/docs/website/security.html @@ -42,7 +42,7 @@
diff --git a/docs/website/x402.html b/docs/website/x402.html index cfb1578..a1a78e7 100644 --- a/docs/website/x402.html +++ b/docs/website/x402.html @@ -232,7 +232,7 @@