diff --git a/src/auto_select.rs b/src/auto_select.rs
new file mode 100644
index 0000000..274ea8d
--- /dev/null
+++ b/src/auto_select.rs
@@ -0,0 +1,229 @@
+//! Spectral probe for automatic operator selection.
+//!
+//! Computes two cheap diagnostic numbers on the left-Markov matrix
+//! M^(L) Cleora builds from its hypergraph clique expansion:
+//!
+//! * `sigma_1` — top singular value (power iteration on M^T M).
+//! * `cos_v1_d` — cosine between the absolute top right singular
+//!   vector and the node-degree vector.
+//!
+//! The ratio `cos_v1_d / sigma_1` is a useful discriminator for
+//! choosing between propagation strategies on real recsys graphs: in
+//! a 9-dataset full-pipeline MRR study, a threshold of ~0.10 cleanly
+//! separated graphs where deflation-style anti-collapse methods won
+//! from graphs where plain left-Markov propagation was sufficient.
+//!
+//! This module contributes only the probe primitive; users compose
+//! it with their preferred embedding method.
+
+use std::hash::Hasher;
+
+use twox_hash::XxHash64;
+
+use crate::sparse_matrix::SparseMatrix;
+
+/// Default number of power-iteration rounds. 30 converges tightly
+/// enough for the top singular vector on graphs up to ~1M nodes.
+pub const DEFAULT_PROBE_ITERS: usize = 30;
+
+/// Diagnostic output of `spectral_probe`.
+#[derive(Clone, Debug)]
+pub struct ProbeResult {
+    pub sigma_1: f64,
+    pub cos_v1_d: f64,
+    /// Convenience: `cos_v1_d / sigma_1`. Returns 0 if sigma_1 is 0.
+    pub ratio: f64,
+    pub probe_iters: usize,
+}
+
+/// Normalize `v` to unit L2 norm in place. No-op on (near-)zero
+/// vectors to avoid division blow-up.
+#[inline]
+fn normalize(v: &mut [f64]) {
+    let norm: f64 = v.iter().map(|&x| x * x).sum::<f64>().sqrt();
+    if norm > 1e-30 {
+        let inv = 1.0 / norm;
+        for x in v.iter_mut() {
+            *x *= inv;
+        }
+    }
+}
+
+/// y = M^(L) x, using the CSR-style `slices`/`edges` layout.
+#[inline]
+fn spmv_left(sm: &SparseMatrix, x: &[f64], y: &mut [f64]) {
+    y.iter_mut().for_each(|yi| *yi = 0.0);
+    for (row, &(s, e)) in sm.slices.iter().enumerate() {
+        let mut acc = 0.0_f64;
+        for edge in &sm.edges[s..e] {
+            acc += (edge.left_markov_value as f64) * x[edge.other_entity_ix as usize];
+        }
+        y[row] = acc;
+    }
+}
+
+/// y = M^(L)^T x — scatter form of the transpose product.
+#[inline]
+fn spmv_left_t(sm: &SparseMatrix, x: &[f64], y: &mut [f64]) {
+    y.iter_mut().for_each(|yi| *yi = 0.0);
+    for (row, &(s, e)) in sm.slices.iter().enumerate() {
+        let xi = x[row];
+        for edge in &sm.edges[s..e] {
+            y[edge.other_entity_ix as usize] += (edge.left_markov_value as f64) * xi;
+        }
+    }
+}
+
+/// Per-row neighbour count (edge count per slice) as f64.
+fn row_degrees(sm: &SparseMatrix) -> Vec<f64> {
+    sm.slices.iter().map(|(s, e)| (e - s) as f64).collect()
+}
+
+/// Run the spectral probe. One-shot top-1 power iteration over
+/// `M^(L)`, returns σ₁, cos(|v₁|, d), and their ratio.
+///
+/// Deterministic under a given `seed`. Complexity is
+/// O(`probe_iters` × nnz) — 2 SpMVs per round.
+pub fn spectral_probe(sm: &SparseMatrix, seed: u64, probe_iters: usize) -> ProbeResult {
+    let n = sm.slices.len();
+    if n == 0 {
+        return ProbeResult {
+            sigma_1: 0.0,
+            cos_v1_d: 0.0,
+            ratio: 0.0,
+            probe_iters: 0,
+        };
+    }
+
+    // Hash-based deterministic init in (-1, 1).
+    let mut v: Vec<f64> = (0..n)
+        .map(|i| {
+            let mut h = XxHash64::with_seed(seed);
+            h.write_u64(i as u64);
+            let u = (h.finish() as f64) / (u64::MAX as f64);
+            u * 2.0 - 1.0
+        })
+        .collect();
+    normalize(&mut v);
+
+    let mut y = vec![0.0_f64; n];
+    let mut w = vec![0.0_f64; n];
+
+    // Power iteration on M^T M: v ← normalize(M^T (M v)).
+    for _ in 0..probe_iters {
+        spmv_left(sm, &v, &mut y);
+        spmv_left_t(sm, &y, &mut w);
+        std::mem::swap(&mut v, &mut w);
+        normalize(&mut v);
+    }
+
+    // σ₁ = ‖M v₁‖ for the (unit) converged right singular vector.
+    spmv_left(sm, &v, &mut y);
+    let sigma_1: f64 = y.iter().map(|&x| x * x).sum::<f64>().sqrt();
+
+    let d = row_degrees(sm);
+    let abs_v: Vec<f64> = v.iter().map(|x| x.abs()).collect();
+    let norm_v: f64 = abs_v.iter().map(|&x| x * x).sum::<f64>().sqrt();
+    let norm_d: f64 = d.iter().map(|&x| x * x).sum::<f64>().sqrt();
+    let cos_v1_d = if norm_v > 1e-30 && norm_d > 1e-30 {
+        abs_v
+            .iter()
+            .zip(d.iter())
+            .map(|(a, dv)| a * dv)
+            .sum::<f64>()
+            / (norm_v * norm_d)
+    } else {
+        0.0
+    };
+
+    let ratio = if sigma_1 > 1e-30 {
+        cos_v1_d / sigma_1
+    } else {
+        0.0
+    };
+
+    ProbeResult {
+        sigma_1,
+        cos_v1_d,
+        ratio,
+        probe_iters,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::sparse_matrix::{Edge, Entity, SparseMatrixDescriptor};
+
+    /// Build a SparseMatrix from an adjacency list of (neighbour, weight).
+    fn make_sm(n: usize, adj: Vec<Vec<(u32, f32)>>) -> SparseMatrix {
+        let descriptor = SparseMatrixDescriptor {
+            col_a_id: 0,
+            col_a_name: "a".into(),
+            col_b_id: 0,
+            col_b_name: "a".into(),
+        };
+        let mut edges: Vec<Edge> = Vec::new();
+        let mut slices: Vec<(usize, usize)> = Vec::with_capacity(n);
+        for neighbours in &adj {
+            let start = edges.len();
+            for &(other, w) in neighbours {
+                edges.push(Edge {
+                    other_entity_ix: other,
+                    left_markov_value: w,
+                    symmetric_markov_value: w,
+                });
+            }
+            slices.push((start, edges.len()));
+        }
+        SparseMatrix {
+            descriptor,
+            entity_ids: (0..n).map(|i| format!("n{}", i)).collect(),
+            entities: vec![Entity { row_sum: 1.0 }; n],
+            edges,
+            slices,
+            column_ids: vec![0; n],
+        }
+    }
+
+    #[test]
+    fn probe_empty_graph_returns_zero() {
+        let sm = make_sm(0, vec![]);
+        let r = spectral_probe(&sm, 0, 10);
+        assert_eq!(r.sigma_1, 0.0);
+        assert_eq!(r.cos_v1_d, 0.0);
+        assert_eq!(r.ratio, 0.0);
+    }
+
+    #[test]
+    fn probe_is_deterministic() {
+        let n = 20usize;
+        let w = 1.0 / ((n - 1) as f32);
+        let mut adj: Vec<Vec<(u32, f32)>> = Vec::new();
+        for i in 0..n {
+            adj.push(
+                (0..n as u32)
+                    .filter(|&j| j != i as u32)
+                    .map(|j| (j, w))
+                    .collect(),
+            );
+        }
+        let sm = make_sm(n, adj);
+        let a = spectral_probe(&sm, 42, 30);
+        let b = spectral_probe(&sm, 42, 30);
+        assert!((a.sigma_1 - b.sigma_1).abs() < 1e-10);
+        assert!((a.cos_v1_d - b.cos_v1_d).abs() < 1e-10);
+    }
+
+    #[test]
+    fn probe_on_clique_finds_perron() {
+        let n = 20usize;
+        let w = 1.0 / ((n - 1) as f32);
+        let mut adj: Vec<Vec<(u32, f32)>> = Vec::new();
+        for i in 0..n {
+            adj.push(
+                (0..n as u32)
+                    .filter(|&j| j != i as u32)
+                    .map(|j| (j, w))
+                    .collect(),
+            );
+        }
+        let sm = make_sm(n, adj);
+        let r = spectral_probe(&sm, 42, 40);
+        // Row-stochastic clique has σ₁ = 1 (Perron) and v₁ = 1 → cos = 1.
+        assert!((r.sigma_1 - 1.0).abs() < 1e-3);
+        assert!(r.cos_v1_d > 0.99);
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 134b351..0b13ba8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -18,6 +18,7 @@ use crate::entity::hash_entity;
 use crate::pipeline::{build_graph_from_files, build_graph_from_iterator};
 use crate::sparse_matrix::{create_sparse_matrix_descriptor, SparseMatrix, SparseMatrixDescriptor};
 
+pub mod auto_select;
 pub mod configuration;
 pub mod embedding;
 pub mod entity;
@@ -473,6 +474,44 @@ impl SparseMatrix {
         *self = sm;
         Ok(())
     }
+
+    /// Spectral probe of the left-Markov matrix `M^(L)`.
+    ///
+    /// Returns a dict with three diagnostic numbers computed via a single
+    /// top-1 power iteration on `M^(L)^T M^(L)`:
+    ///
+    /// * ``sigma_1`` — the top singular value of `M^(L)`.
+    /// * ``cos_v1_d`` — cosine similarity between the absolute top
+    ///   right singular vector of `M^(L)` and the node-degree vector
+    ///   (count of distinct neighbours per node in the clique-expanded
+    ///   graph).
+    /// * ``ratio`` — `cos_v1_d / sigma_1`.
+    ///
+    /// Cost: ``O(probe_iters × nnz)``, where `probe_iters` defaults to
+    /// 30 (tight enough to separate the top singular direction on
+    /// graphs up to ~1M nodes). Deterministic under a given ``seed``.
+    ///
+    /// Intended use: a cheap pre-flight signal for selecting between
+    /// propagation strategies. In a 9-dataset full-pipeline MRR study
+    /// (PANEL3_RESULTS.md §15 in the downstream benchmark harness) the
+    /// ratio `cos_v1_d / sigma_1 ≈ 0.10` cleanly separated graphs where
+    /// anti-collapse (deflation) wins from graphs where plain left-
+    /// Markov propagation is sufficient.
+    #[pyo3(signature = (seed = 42, probe_iters = auto_select::DEFAULT_PROBE_ITERS))]
+    fn spectral_probe<'py>(
+        &self,
+        py: Python<'py>,
+        seed: u64,
+        probe_iters: usize,
+    ) -> PyResult<&'py pyo3::types::PyDict> {
+        let r = py.allow_threads(|| auto_select::spectral_probe(self, seed, probe_iters));
+        let dict = pyo3::types::PyDict::new(py);
+        dict.set_item("sigma_1", r.sigma_1)?;
+        dict.set_item("cos_v1_d", r.cos_v1_d)?;
+        dict.set_item("ratio", r.ratio)?;
+        dict.set_item("probe_iters", r.probe_iters as u64)?;
+        Ok(dict)
+    }
 }
 
 fn init_value(col: usize, hsh: u64, fixed_random_value: i64) -> f32 {