Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
229 changes: 229 additions & 0 deletions src/auto_select.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
//! Spectral probe for automatic operator selection.
//!
//! Computes two cheap diagnostic numbers on the left-Markov matrix
//! M^(L) Cleora builds from its hypergraph clique expansion:
//!
//! * `sigma_1` — top singular value (power iteration on M^T M).
//! * `cos_v1_d` — cosine between the element-wise absolute value of the
//! top right singular vector and the node-degree vector.
//!
//! The ratio `cos_v1_d / sigma_1` is a useful discriminator for
//! choosing between propagation strategies on real recsys graphs: in
//! a 9-dataset full-pipeline MRR study, a threshold of ~0.10 cleanly
//! separated graphs where deflation-style anti-collapse methods won
//! from graphs where plain left-Markov propagation was sufficient.
//!
//! This module contributes only the probe primitive; users compose
//! it with their preferred embedding method.

use std::hash::Hasher;

use twox_hash::XxHash64;

use crate::sparse_matrix::SparseMatrix;

/// Default number of power-iteration rounds. 30 converges tightly
/// enough for the top singular vector on graphs up to ~1M nodes.
///
/// Also used as the default `probe_iters` of the Python-facing
/// `spectral_probe` binding.
pub const DEFAULT_PROBE_ITERS: usize = 30;

/// Diagnostic output of `spectral_probe`.
///
/// All fields are plain numbers, so two results can be compared with `==`
/// (for example to check that a given seed reproduces the same probe).
#[derive(Clone, Debug, PartialEq)]
pub struct ProbeResult {
    /// Estimated top singular value: the Euclidean norm of `M v` for the
    /// final, normalized power-iteration vector `v`.
    pub sigma_1: f64,
    /// Cosine between the element-wise absolute value of the final
    /// power-iteration vector and the per-row edge-count vector.
    /// 0 when either vector has (near-)zero norm.
    pub cos_v1_d: f64,
    /// Convenience: `cos_v1_d / sigma_1`. Set to 0 when `sigma_1` is not
    /// above `1e-30`, so the value is always finite for finite inputs.
    pub ratio: f64,
    /// Number of power-iteration rounds requested by the caller; 0 when the
    /// matrix was empty and no iteration was run.
    pub probe_iters: usize,
}

/// Rescales `v` in place to unit Euclidean length.
///
/// A vector whose norm is not above `1e-30` (the empty slice, the all-zero
/// vector, or a NaN norm) is left untouched instead of being divided by a
/// vanishing value.
#[inline]
fn normalize(v: &mut [f64]) {
    let length = v.iter().map(|&c| c * c).sum::<f64>().sqrt();
    if length > 1e-30 {
        // Compute the reciprocal once and multiply, rather than dividing
        // every component.
        let scale = 1.0 / length;
        v.iter_mut().for_each(|c| *c *= scale);
    }
}

/// Sparse matrix-vector product `y = M x` using the left-Markov weights.
///
/// Row `r` of `M` is the edge range `sm.slices[r]`; each edge contributes
/// `left_markov_value * x[other_entity_ix]`. `y` is cleared first, so any
/// entries beyond the last row end up as zero.
///
/// Panics if `y` is shorter than the number of rows or if an edge's
/// `other_entity_ix` is out of range for `x`.
#[inline]
fn spmv_left(sm: &SparseMatrix, x: &[f64], y: &mut [f64]) {
    y.fill(0.0);
    for (row, &(start, end)) in sm.slices.iter().enumerate() {
        // Accumulate in f64 even though the stored weights are narrower.
        y[row] = sm.edges[start..end].iter().fold(0.0_f64, |acc, edge| {
            acc + (edge.left_markov_value as f64) * x[edge.other_entity_ix as usize]
        });
    }
}

/// Transposed sparse product `y = Mᵀ x` using the left-Markov weights.
///
/// Walks the same row-major edge storage as the forward product, but
/// scatters: every edge in row `r` adds `left_markov_value * x[r]` into
/// `y[other_entity_ix]`. `y` is zeroed before accumulation.
///
/// Panics if `x` is shorter than the number of rows or if an edge's
/// `other_entity_ix` is out of range for `y`.
#[inline]
fn spmv_left_t(sm: &SparseMatrix, x: &[f64], y: &mut [f64]) {
    y.fill(0.0);
    for (row, &(start, end)) in sm.slices.iter().enumerate() {
        let row_value = x[row];
        sm.edges[start..end].iter().for_each(|edge| {
            let col = edge.other_entity_ix as usize;
            y[col] += (edge.left_markov_value as f64) * row_value;
        });
    }
}

/// Per-row degree vector: the number of stored edges in each row of `sm`,
/// i.e. the length of each `(start, end)` range in `sm.slices`, as `f64`.
fn row_degrees(sm: &SparseMatrix) -> Vec<f64> {
    let mut degrees = Vec::with_capacity(sm.slices.len());
    degrees.extend(
        sm.slices
            .iter()
            .map(|&(start, end)| (end - start) as f64),
    );
    degrees
}

/// Run the spectral probe: a one-shot top-1 power iteration over `M^(L)`
/// that yields σ₁, cos(|v₁|, d), and their ratio.
///
/// * `sm` — matrix whose left-Markov edge weights define `M^(L)`.
/// * `seed` — seeds the hash that generates the start vector; the same
///   seed on the same matrix reproduces the same result.
/// * `probe_iters` — number of `Mᵀ M` rounds to apply.
///
/// An empty matrix (no rows) short-circuits to an all-zero result with
/// `probe_iters` reported as 0. Otherwise the requested `probe_iters` is
/// echoed back in the result.
///
/// Complexity is O(`probe_iters` × nnz) — 2 SpMVs per round, plus one
/// final SpMV to read off σ₁.
pub fn spectral_probe(sm: &SparseMatrix, seed: u64, probe_iters: usize) -> ProbeResult {
    /// Euclidean norm of a slice.
    fn l2_norm(xs: &[f64]) -> f64 {
        xs.iter().map(|&x| x * x).sum::<f64>().sqrt()
    }

    let node_count = sm.slices.len();
    if node_count == 0 {
        return ProbeResult {
            sigma_1: 0.0,
            cos_v1_d: 0.0,
            ratio: 0.0,
            probe_iters: 0,
        };
    }

    // Deterministic start vector: hash each index under `seed`, map the
    // 64-bit digest onto [-1, 1], then scale to unit length.
    let mut current: Vec<f64> = Vec::with_capacity(node_count);
    for ix in 0..node_count {
        let mut hasher = XxHash64::with_seed(seed);
        hasher.write_u64(ix as u64);
        let unit = (hasher.finish() as f64) / (u64::MAX as f64);
        current.push(unit * 2.0 - 1.0);
    }
    normalize(&mut current);

    let mut forward = vec![0.0_f64; node_count];
    let mut scratch = vec![0.0_f64; node_count];

    // Each round applies Mᵀ M to the current vector and renormalizes.
    for _ in 0..probe_iters {
        spmv_left(sm, &current, &mut forward);
        spmv_left_t(sm, &forward, &mut scratch);
        // Swap buffers instead of copying; `scratch` is fully rewritten
        // by the next transposed product.
        std::mem::swap(&mut current, &mut scratch);
        normalize(&mut current);
    }

    // With `current` normalized, the norm of M·current is the σ₁ estimate.
    spmv_left(sm, &current, &mut forward);
    let sigma_1 = l2_norm(&forward);

    let degrees = row_degrees(sm);
    let magnitudes: Vec<f64> = current.iter().map(|x| x.abs()).collect();
    let norm_v = l2_norm(&magnitudes);
    let norm_d = l2_norm(&degrees);

    // Guard both norms so a zero vector gives 0 rather than NaN.
    let cos_v1_d = if norm_v > 1e-30 && norm_d > 1e-30 {
        let dot: f64 = magnitudes
            .iter()
            .zip(degrees.iter())
            .map(|(a, dv)| a * dv)
            .sum();
        dot / (norm_v * norm_d)
    } else {
        0.0
    };

    let ratio = if sigma_1 > 1e-30 {
        cos_v1_d / sigma_1
    } else {
        0.0
    };

    ProbeResult {
        sigma_1,
        cos_v1_d,
        ratio,
        probe_iters,
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::sparse_matrix::{Edge, Entity, SparseMatrixDescriptor};

    /// Builds an `n`-node test matrix from per-row `(column, weight)` lists.
    /// The same weight is stored as both the left and symmetric Markov value.
    fn make_sm(n: usize, adj: Vec<Vec<(u32, f32)>>) -> SparseMatrix {
        let mut edges: Vec<Edge> = Vec::new();
        let mut slices: Vec<(usize, usize)> = Vec::with_capacity(n);
        for neighbours in &adj {
            let start = edges.len();
            edges.extend(neighbours.iter().map(|&(other, w)| Edge {
                other_entity_ix: other,
                left_markov_value: w,
                symmetric_markov_value: w,
            }));
            slices.push((start, edges.len()));
        }

        SparseMatrix {
            descriptor: SparseMatrixDescriptor {
                col_a_id: 0,
                col_a_name: "a".into(),
                col_b_id: 0,
                col_b_name: "a".into(),
            },
            entity_ids: (0..n).map(|i| format!("n{}", i)).collect(),
            entities: vec![Entity { row_sum: 1.0 }; n],
            edges,
            slices,
            column_ids: vec![0; n],
        }
    }

    /// Row-stochastic clique on `n` nodes: every node links to every other
    /// node with weight `1 / (n - 1)` and has no self-loop.
    fn uniform_clique(n: usize) -> SparseMatrix {
        let w = 1.0 / ((n - 1) as f32);
        let adj: Vec<Vec<(u32, f32)>> = (0..n)
            .map(|i| {
                (0..n as u32)
                    .filter(|&j| j != i as u32)
                    .map(|j| (j, w))
                    .collect()
            })
            .collect();
        make_sm(n, adj)
    }

    #[test]
    fn probe_empty_graph_returns_zero() {
        let sm = make_sm(0, vec![]);
        let r = spectral_probe(&sm, 0, 10);
        assert_eq!(r.sigma_1, 0.0);
        assert_eq!(r.cos_v1_d, 0.0);
        assert_eq!(r.ratio, 0.0);
    }

    #[test]
    fn probe_is_deterministic() {
        let sm = uniform_clique(20);
        let first = spectral_probe(&sm, 42, 30);
        let second = spectral_probe(&sm, 42, 30);
        assert!((first.sigma_1 - second.sigma_1).abs() < 1e-10);
        assert!((first.cos_v1_d - second.cos_v1_d).abs() < 1e-10);
    }

    #[test]
    fn probe_on_clique_finds_perron() {
        let sm = uniform_clique(20);
        let r = spectral_probe(&sm, 42, 40);
        // Row-stochastic clique has σ₁ = 1 (Perron) and v₁ = 1 → cos = 1.
        assert!((r.sigma_1 - 1.0).abs() < 1e-3);
        assert!(r.cos_v1_d > 0.99);
    }
}
39 changes: 39 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use crate::entity::hash_entity;
use crate::pipeline::{build_graph_from_files, build_graph_from_iterator};
use crate::sparse_matrix::{create_sparse_matrix_descriptor, SparseMatrix, SparseMatrixDescriptor};

pub mod auto_select;
pub mod configuration;
pub mod embedding;
pub mod entity;
Expand Down Expand Up @@ -473,6 +474,44 @@ impl SparseMatrix {
*self = sm;
Ok(())
}

/// Spectral probe of the left-Markov matrix `M^(L)`.
///
/// Returns a dict with four entries, computed via a single top-1 power
/// iteration on `M^(L)^T M^(L)`:
///
/// * ``sigma_1`` — the top singular value of `M^(L)`.
/// * ``cos_v1_d`` — cosine similarity between the element-wise absolute
///   value of the top right singular vector of `M^(L)` and the
///   node-degree vector (count of distinct neighbours per node in the
///   clique-expanded graph).
/// * ``ratio`` — `cos_v1_d / sigma_1`.
/// * ``probe_iters`` — the iteration count reported by the probe.
///
/// Cost: ``O(probe_iters × nnz)``, where `probe_iters` defaults to
/// 30 (tight enough to separate the top singular direction on
/// graphs up to ~1M nodes). Deterministic under a given ``seed``.
///
/// Intended use: a cheap pre-flight signal for selecting between
/// propagation strategies. In a 9-dataset full-pipeline MRR study
/// (PANEL3_RESULTS.md §15 in the downstream benchmark harness) the
/// ratio `cos_v1_d / sigma_1 ≈ 0.10` cleanly separated graphs where
/// anti-collapse (deflation) wins from graphs where plain left-
/// Markov propagation is sufficient.
#[pyo3(signature = (seed = 42, probe_iters = auto_select::DEFAULT_PROBE_ITERS))]
fn spectral_probe<'py>(
    &self,
    py: Python<'py>,
    seed: u64,
    probe_iters: usize,
) -> PyResult<&'py pyo3::types::PyDict> {
    // The probe is pure Rust, so run it inside `allow_threads` and only
    // touch Python objects once it has finished.
    let auto_select::ProbeResult {
        sigma_1,
        cos_v1_d,
        ratio,
        probe_iters: rounds,
    } = py.allow_threads(|| auto_select::spectral_probe(self, seed, probe_iters));

    let entries = pyo3::types::PyDict::new(py);
    entries.set_item("sigma_1", sigma_1)?;
    entries.set_item("cos_v1_d", cos_v1_d)?;
    entries.set_item("ratio", ratio)?;
    entries.set_item("probe_iters", rounds as u64)?;
    Ok(entries)
}
}

fn init_value(col: usize, hsh: u64, fixed_random_value: i64) -> f32 {
Expand Down
Loading