From ae2f41172f1bffd20c3868a577c5a27eecca04e1 Mon Sep 17 00:00:00 2001
From: George Stagg <george.stagg@posit.co>
Date: Mon, 29 Jun 2026 14:23:29 +0100
Subject: [PATCH 01/15] Initial caching layer implementation

---
 ggsql-cli/CLAUDE.md             |   2 +
 ggsql-cli/src/main.rs           | 127 ++++----
 ggsql-jupyter/CLAUDE.md         |   2 +-
 ggsql-jupyter/src/executor.rs   |  43 +--
 src/CLAUDE.md                   |   3 +
 src/execute/cte.rs              |  11 +-
 src/execute/mod.rs              |  45 ++-
 src/naming.rs                   |  42 +++
 src/reader/adbc.rs              |  66 +----
 src/reader/cache.rs             | 504 ++++++++++++++++++++++++++++++++
 src/reader/cache_equivalence.rs | 334 +++++++++++++++++++++
 src/reader/connection.rs        | 249 +++++++++-------
 src/reader/duckdb.rs            |  36 ++-
 src/reader/mod.rs               | 211 +++++++++++++
 src/reader/sqlite.rs            |  44 ++-
 15 files changed, 1399 insertions(+), 320 deletions(-)
 create mode 100644 src/reader/cache.rs
 create mode 100644 src/reader/cache_equivalence.rs
diff --git a/ggsql-cli/CLAUDE.md b/ggsql-cli/CLAUDE.md
index c6ed59703..17dba8134 100644
--- a/ggsql-cli/CLAUDE.md
+++ b/ggsql-cli/CLAUDE.md
@@ -32,6 +32,8 @@ The binary name is `ggsql` (not `ggsql-cli`) — that's what release artifacts a
 
 Only public `ggsql::*` API is used (`reader`, `writer`, `validate`, `parser`, `VERSION`) — this crate has no awareness of internal modules.
 
+`exec`/`run` build their reader via the library factory `ggsql::reader::connection::reader_from_uri`. They accept an in-memory caching layer (off by default) selected either by the composite connection scheme `<primary>+<cache>://…` (e.g. `odbc+duckdb://…`) or the `--cache <duckdb|sqlite>` flag; the two cannot be combined.
+
 ## Build & install
 
 ```sh
diff --git a/ggsql-cli/src/main.rs b/ggsql-cli/src/main.rs
index d1b0cb0de..0a004fbce 100644
--- a/ggsql-cli/src/main.rs
+++ b/ggsql-cli/src/main.rs
@@ -34,10 +34,14 @@ pub enum Commands {
         /// The ggsql query to execute
         query: String,
 
-        /// Data source connection string (duckdb://, sqlite://, odbc://)
+        /// Data source connection string (duckdb://, sqlite://, odbc://).
         #[arg(long, default_value = "duckdb://memory")]
         reader: String,
 
+        /// In-memory cache backend wrapping the reader (duckdb, sqlite). Off by default.
+        #[arg(long)]
+        cache: Option<String>,
+
         /// Output format (vegalite)
         #[arg(long, default_value = "vegalite")]
         writer: String,
@@ -56,10 +60,14 @@ pub enum Commands {
         /// Path to .sql file containing ggsql query
         file: PathBuf,
 
-        /// Data source connection string (duckdb://, sqlite://, odbc://)
+        /// Data source connection string (duckdb://, sqlite://, odbc://).
         #[arg(long, default_value = "duckdb://memory")]
         reader: String,
 
+        /// In-memory cache backend wrapping the reader (duckdb, sqlite). Off by default.
+        #[arg(long)]
+        cache: Option<String>,
+
         /// Output format (vegalite)
         #[arg(long, default_value = "vegalite")]
         writer: String,
@@ -147,6 +155,7 @@ fn main() -> anyhow::Result<()> {
         Commands::Exec {
             query,
             reader,
+            cache,
             writer,
             output,
             verbose,
@@ -154,12 +163,13 @@ fn main() -> anyhow::Result<()> {
             if verbose {
                 eprintln!("Executing query: {}", query);
             }
-            cmd_exec(query, reader, writer, output, verbose);
+            cmd_exec(query, reader, cache, writer, output, verbose);
         }
 
         Commands::Run {
             file,
             reader,
+            cache,
             writer,
             output,
             verbose,
@@ -167,7 +177,7 @@ fn main() -> anyhow::Result<()> {
             if verbose {
                 eprintln!("Running query from file: {}", file.display());
             }
-            cmd_run(file, reader, writer, output, verbose);
+            cmd_run(file, reader, cache, writer, output, verbose);
         }
 
         Commands::Parse { query, format } => {
@@ -194,9 +204,16 @@ fn main() -> anyhow::Result<()> {
     Ok(())
 }
 
-fn cmd_run(file: PathBuf, reader: String, writer: String, output: Option<PathBuf>, verbose: bool) {
+fn cmd_run(
+    file: PathBuf,
+    reader: String,
+    cache: Option<String>,
+    writer: String,
+    output: Option<PathBuf>,
+    verbose: bool,
+) {
     match std::fs::read_to_string(&file) {
-        Ok(query) => cmd_exec(query, reader, writer, output, verbose),
+        Ok(query) => cmd_exec(query, reader, cache, writer, output, verbose),
         Err(e) => {
             eprintln!("Failed to read file {}: {}", file.display(), e);
             std::process::exit(1);
@@ -204,76 +221,64 @@ fn cmd_run(file: PathBuf, reader: String, writer: String, output: Option<PathBuf
     }
 }
 
-fn cmd_exec(query: String, reader: String, writer: String, output: Option<PathBuf>, verbose: bool) {
+fn cmd_exec(
+    query: String,
+    reader: String,
+    cache: Option<String>,
+    writer: String,
+    output: Option<PathBuf>,
+    verbose: bool,
+) {
+    use ggsql::reader::connection;
+
     if verbose {
         eprintln!("Reader: {}", reader);
+        if let Some(ref cache) = cache {
+            eprintln!("Cache: {}", cache);
+        }
         eprintln!("Writer: {}", writer);
         if let Some(ref output_file) = output {
             eprintln!("Output: {}", output_file.display());
         }
     }
 
-    if reader.starts_with("duckdb://") {
-        #[cfg(feature = "duckdb")]
-        {
-            let r = match ggsql::reader::DuckDBReader::from_connection_string(&reader) {
-                Ok(r) => r,
-                Err(e) => {
-                    eprintln!("Failed to create reader: {}", e);
-                    std::process::exit(1);
-                }
-            };
-            exec_with_reader(&query, &r, &writer, output, verbose);
-        }
-        #[cfg(not(feature = "duckdb"))]
-        {
-            eprintln!("DuckDB reader not compiled in. Rebuild with --features duckdb");
-            std::process::exit(1);
-        }
-    } else if reader.starts_with("sqlite://") {
-        #[cfg(feature = "sqlite")]
-        {
-            let r = match ggsql::reader::SqliteReader::from_connection_string(&reader) {
-                Ok(r) => r,
-                Err(e) => {
-                    eprintln!("Failed to create reader: {}", e);
-                    std::process::exit(1);
+    // Build the reader. A composite `<primary>+<cache>://` URI is handled by
+    // `reader_from_uri`; the `--cache` flag is an explicit alternative and may
+    // not be combined with a composite URI.
+    let built = match cache {
+        Some(cache_scheme) => {
+            if connection::split_cache_uri(&reader).is_some() {
+                eprintln!(
+                    "Cannot combine --cache with a composite '<primary>+<cache>://' connection string"
+                );
+                std::process::exit(1);
+            }
+            // `--cache <scheme>` is sugar for the composite `<primary>+<scheme>://` URI.
+            match reader.split_once("://") {
+                Some((scheme, rest)) => {
+                    connection::reader_from_uri(&format!("{scheme}+{cache_scheme}://{rest}"))
                 }
-            };
-            exec_with_reader(&query, &r, &writer, output, verbose);
-        }
-        #[cfg(not(feature = "sqlite"))]
-        {
-            eprintln!("SQLite reader not compiled in. Rebuild with --features sqlite");
-            std::process::exit(1);
-        }
-    } else if reader.starts_with("odbc://") {
-        #[cfg(feature = "odbc")]
-        {
-            let r = match ggsql::reader::OdbcReader::from_connection_string(&reader) {
-                Ok(r) => r,
-                Err(e) => {
-                    eprintln!("Failed to create reader: {}", e);
+                None => {
+                    eprintln!("Invalid --reader connection string: {reader}");
                     std::process::exit(1);
                 }
-            };
-            exec_with_reader(&query, &r, &writer, output, verbose);
+            }
         }
-        #[cfg(not(feature = "odbc"))]
-        {
-            eprintln!("ODBC reader not compiled in. Rebuild with --features odbc");
+        None => connection::reader_from_uri(&reader),
+    };
+
+    let reader = match built {
+        Ok(r) => r,
+        Err(e) => {
+            eprintln!("Failed to create reader: {}", e);
             std::process::exit(1);
         }
-    } else if reader.starts_with("postgres://") || reader.starts_with("postgresql://") {
-        eprintln!("PostgreSQL reader is not yet implemented");
-        std::process::exit(1);
-    } else {
-        eprintln!("Unsupported connection string: {}", reader);
-        std::process::exit(1);
-    }
+    };
+
+    exec_with_reader(&query, reader.as_ref(), &writer, output, verbose);
 }
 
-fn exec_with_reader<R: Reader>(
+fn exec_with_reader<R: Reader + ?Sized>(
     query: &str,
     reader: &R,
     writer: &str,
@@ -430,7 +435,7 @@ fn cmd_validate(query: String, _reader: Option<String>) {
 }
 
 // Prints a CSV-like output to stdout with aligned columns
-fn print_table_fallback<R: Reader>(query: &str, reader: &R, max_rows: usize) {
+fn print_table_fallback<R: Reader + ?Sized>(query: &str, reader: &R, max_rows: usize) {
     let source_tree = match parser::SourceTree::new(query) {
         Ok(st) => st,
         Err(e) => {
diff --git a/ggsql-jupyter/CLAUDE.md b/ggsql-jupyter/CLAUDE.md
index adcb5ecd3..18868e3a9 100644
--- a/ggsql-jupyter/CLAUDE.md
+++ b/ggsql-jupyter/CLAUDE.md
@@ -32,7 +32,7 @@ ggsql-jupyter/
 
 1. `ggsql-jupyter --install` writes a kernelspec into the active Python environment (Jupyter, conda, uv, virtualenv — auto-detected).
 2. `ggsql-jupyter <connection-file>` is the entry point Jupyter invokes; it reads the connection JSON, opens the five ZMQ sockets (shell, control, iopub, stdin, heartbeat), and runs `kernel.rs`'s message loop.
-3. Each `execute_request` is dispatched through `executor.rs` → `ggsql::reader::DuckDBReader::execute(...)`. The kernel keeps a single persistent in-memory DuckDB session so cells share state.
+3. Each `execute_request` is dispatched through `executor.rs` → `ggsql::reader::DuckDBReader::execute(...)`. The kernel keeps a single persistent in-memory DuckDB session so cells share state. Readers are built via the library factory `ggsql::reader::connection::reader_from_uri`, so a composite `<primary>+<cache>://` connection string (e.g. via `-- @connect:`) wraps the reader in an in-memory caching layer — the persistent kernel session means repeated cells reuse cached remote reads.
 4. The result is wrapped by `display.rs` into a Jupyter `display_data` message — Vega-Lite specs go through vega-embed in an HTML payload (works in classic Jupyter, JupyterLab, and Positron); pure SQL goes out as an HTML table.
 
 ## Positron-specific bits
diff --git a/ggsql-jupyter/src/executor.rs b/ggsql-jupyter/src/executor.rs
index 6f928e7ab..fb9d51bfb 100644
--- a/ggsql-jupyter/src/executor.rs
+++ b/ggsql-jupyter/src/executor.rs
@@ -7,8 +7,8 @@
 use anyhow::Result;
 use ggsql::{
     reader::{
-        connection::{extract_odbc_value, parse_connection_string},
-        DuckDBReader, Reader,
+        connection::{extract_odbc_value, reader_from_uri},
+        Reader,
     },
     validate::validate,
     writer::{VegaLiteWriter, Writer},
@@ -28,41 +28,6 @@ pub enum ExecutionResult {
     ConnectionChanged { uri: String, display_name: String },
 }
 
-/// Create a reader from a connection URI string.
-///
-/// Supported schemes:
-/// - `duckdb://memory` or `duckdb://<path>` (always available)
-/// - `sqlite://<path>` (requires `sqlite` feature)
-/// - `odbc://...` (requires `odbc` feature)
-pub fn create_reader(uri: &str) -> Result<Box<dyn Reader + Send>> {
-    use ggsql::reader::connection::ConnectionInfo;
-
-    let info = parse_connection_string(uri)?;
-    match info {
-        ConnectionInfo::DuckDBMemory => {
-            let reader = DuckDBReader::from_connection_string("duckdb://memory")?;
-            Ok(Box::new(reader))
-        }
-        ConnectionInfo::DuckDBFile(path) => {
-            let reader = DuckDBReader::from_connection_string(&format!("duckdb://{}", path))?;
-            Ok(Box::new(reader))
-        }
-        #[cfg(feature = "odbc")]
-        ConnectionInfo::ODBC(conn_str) => {
-            let reader =
-                ggsql::reader::OdbcReader::from_connection_string(&format!("odbc://{}", conn_str))?;
-            Ok(Box::new(reader))
-        }
-        #[cfg(feature = "sqlite")]
-        ConnectionInfo::SQLite(path) => {
-            let reader =
-                ggsql::reader::SqliteReader::from_connection_string(&format!("sqlite://{}", path))?;
-            Ok(Box::new(reader))
-        }
-        _ => anyhow::bail!("Unsupported reader type for connection string: {}", uri),
-    }
-}
-
 /// Generate a human-readable display name for a connection URI.
 pub fn display_name_for_uri(uri: &str) -> String {
     if uri == "duckdb://memory" {
@@ -156,7 +121,7 @@ impl QueryExecutor {
     /// Create a new query executor with a given connection URI
     pub fn new_with_uri(uri: &str) -> Result<Self> {
         tracing::info!("Initializing query executor with reader: {}", uri);
-        let reader = create_reader(uri)?;
+        let reader = reader_from_uri(uri)?;
         let writer = VegaLiteWriter::new();
 
         Ok(Self {
@@ -184,7 +149,7 @@ impl QueryExecutor {
 
     /// Swap the reader to a new connection, returning the old URI
     pub fn swap_reader(&mut self, uri: &str) -> Result<String> {
-        let new_reader = create_reader(uri)?;
+        let new_reader = reader_from_uri(uri)?;
         self.reader = new_reader;
         let old_uri = std::mem::replace(&mut self.reader_uri, uri.to_string());
         Ok(old_uri)
diff --git a/src/CLAUDE.md b/src/CLAUDE.md
index b9b68dc91..8047c38a7 100644
--- a/src/CLAUDE.md
+++ b/src/CLAUDE.md
@@ -46,11 +46,14 @@ Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding
 | `duckdb.rs` | DuckDB (in-memory or file) | `duckdb` (default) |
 | `sqlite.rs` | SQLite | `sqlite` (default) |
 | `odbc.rs` | ODBC | `odbc` (default) |
+| `cache.rs` | `CachingReader` — wraps any primary `Reader` with an in-memory cache | `duckdb` or `sqlite` |
 | `connection.rs` | Connection-string parsing for all of the above | — |
 | `data.rs`, `spec.rs` | `Spec` type returned by `execute()`, plus DataFrame conversion | — |
 
 `SqlDialect` trait in `mod.rs` lets each driver supply its own type names, information-schema queries, and spatial helper methods (`sql_st_transform`, `sql_geometry_to_wkb`, `sql_geometry_bbox`, `sql_ensure_geometry`, `sql_select_replace`, `sql_spatial_setup`).
 
+**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`. It routes each `execute_sql`: cache-resident names and `ggsql:` builtins go to the cache, base reads go to the primary (and are memoized). The `Reader::materialize_table` seam (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to run the read and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's extra materialization of explicit per-layer sources. `dialect()` returns the **cache** dialect because every executor-generated query targets a cache-resident table. Selected via the composite `<primary>+<cache>://` connection scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default.
+
 ### `execute/`
 
 The pipeline that takes a parsed `Plot` plus a `Reader` and produces a fully-resolved `Spec` (typed data per layer, scales resolved, casts applied). Submodules:
diff --git a/src/execute/cte.rs b/src/execute/cte.rs
index 9b544988d..e843fa074 100644
--- a/src/execute/cte.rs
+++ b/src/execute/cte.rs
@@ -146,16 +146,11 @@ pub fn materialize_ctes(ctes: &[CteDefinition], reader: &dyn Reader) -> Result<H
 
         let temp_table_name = naming::cte_table(&cte.name);
 
-        let statements = reader.dialect().create_or_replace_temp_table_sql(
-            &temp_table_name,
-            &cte.column_aliases,
-            &transformed_body,
-        );
-        for stmt in &statements {
-            reader.execute_sql(stmt).map_err(|e| {
+        reader
+            .materialize_table(&temp_table_name, &cte.column_aliases, &transformed_body)
+            .map_err(|e| {
                 GgsqlError::ReaderError(format!("Failed to materialize CTE '{}': {}", cte.name, e))
             })?;
-        }
 
         materialized.insert(cte.name.clone());
     }
diff --git a/src/execute/mod.rs b/src/execute/mod.rs
index 59be3b807..cf205c505 100644
--- a/src/execute/mod.rs
+++ b/src/execute/mod.rs
@@ -1156,16 +1156,11 @@ pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result<Prep
         }
 
         if let Some(query) = query {
-            // Materialize global result as a temp table directly on the backend
-            // (no roundtrip through Rust).
-            let statements = reader.dialect().create_or_replace_temp_table_sql(
-                &naming::global_table(),
-                &[],
-                &query,
-            );
-            for stmt in &statements {
-                execute_query(stmt)?;
-            }
+            // Materialize the global result as a temp table. For a plain reader
+            // this emits `CREATE TEMP TABLE … AS …` on the backend (no roundtrip
+            // through Rust); a caching reader runs the query and registers the
+            // result into its cache.
+            reader.materialize_table(&naming::global_table(), &[], &query)?;
 
             // NOTE: Don't read into data_map yet - defer until after casting is determined
             // The temp table exists and can be used for schema fetching
@@ -1186,12 +1181,40 @@ pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result<Prep
     // Build source queries for each layer to fetch initial type info
     // Every layer now has its own source query (either explicit source or global table)
     // For annotation layers, this is where array recycling and parameter→mapping conversion happens
-    let layer_source_queries: Vec<String> = specs[0]
+    let mut layer_source_queries: Vec<String> = specs[0]
         .layers
         .iter_mut()
         .map(|l| layer::layer_source_query(l, &materialized_ctes, has_global_table, dialect))
         .collect::<Result<Vec<_>>>()?;
 
+    // When the reader stages sources into a cache (a caching layer is active),
+    // materialize explicit external layer sources into the cache and rewrite the
+    // layer to read the cached table.
+    if reader.caches_sources() {
+        let mut materialized_sources: HashMap<String, String> = HashMap::new();
+        for (idx, layer) in specs[0].layers.iter().enumerate() {
+            let is_external = match &layer.source {
+                Some(crate::DataSource::Identifier(name)) => !materialized_ctes.contains(name),
+                Some(crate::DataSource::FilePath(_)) => true,
+                _ => false,
+            };
+            if !is_external {
+                continue;
+            }
+            let source_query = layer_source_queries[idx].clone();
+            let table = match materialized_sources.get(&source_query) {
+                Some(t) => t.clone(),
+                None => {
+                    let t = naming::layer_source_table(materialized_sources.len());
+                    reader.materialize_table(&t, &[], &source_query)?;
+                    materialized_sources.insert(source_query, t.clone());
+                    t
+                }
+            };
+            layer_source_queries[idx] = format!("SELECT * FROM {}", naming::quote_ident(&table));
+        }
+    }
+
     // Get types for each layer from source queries (Phase 1: types only, no min/max yet)
     let mut layer_type_info: Vec<Vec<schema::TypeInfo>> = Vec::new();
     for source_query in &layer_source_queries {
diff --git a/src/naming.rs b/src/naming.rs
index 6259f5bd3..1d1357cea 100644
--- a/src/naming.rs
+++ b/src/naming.rs
@@ -137,6 +137,48 @@ pub fn cte_table(cte_name: &str) -> String {
     )
 }
 
+/// Generate temp table name for a memoized primary query result in a cache.
+///
+/// Format: `__ggsql_cache_<uuid>_<id>__`
+///
+/// # Example
+/// ```
+/// use ggsql::naming;
+/// let table = naming::cache_result_table(0);
+/// assert!(table.starts_with("__ggsql_cache_"));
+/// assert!(table.ends_with("_0__"));
+/// ```
+pub fn cache_result_table(id: u64) -> String {
+    format!(
+        "{}cache_{}_{}{}",
+        GGSQL_PREFIX,
+        session_id(),
+        id,
+        GGSQL_SUFFIX
+    )
+}
+
+/// Generate temp table name for a materialized explicit layer source.
+///
+/// Format: `__ggsql_layer_src_<index>_<uuid>__`
+///
+/// # Example
+/// ```
+/// use ggsql::naming;
+/// let table = naming::layer_source_table(2);
+/// assert!(table.starts_with("__ggsql_layer_src_2_"));
+/// assert!(table.ends_with("__"));
+/// ```
+pub fn layer_source_table(index: usize) -> String {
+    format!(
+        "{}layer_src_{}_{}{}",
+        GGSQL_PREFIX,
+        index,
+        session_id(),
+        GGSQL_SUFFIX
+    )
+}
+
 /// Generate table name for a builtin dataset.
 ///
 /// Used when rewriting `ggsql:penguins` to the internal table name.
diff --git a/src/reader/adbc.rs b/src/reader/adbc.rs
index 8de00b01d..203d31e26 100644
--- a/src/reader/adbc.rs
+++ b/src/reader/adbc.rs
@@ -687,71 +687,7 @@ mod equivalence_tests {
             .expect("SqliteReader at the same path")
     }
 
-    /// Compare two DataFrames by schema (field names + types) and by
-    /// per-column Arrow array contents. We don't use a blanket
-    /// `assert_eq!(df, df)` because `DataFrame` doesn't implement `PartialEq`;
-    /// going through schema + per-column equality is also more diagnostic
-    /// when one of them diverges.
-    fn assert_dataframes_equal(
-        adbc_df: &crate::DataFrame,
-        sqlite_df: &crate::DataFrame,
-        ctx: &str,
-    ) {
-        let adbc_schema = adbc_df.schema();
-        let sqlite_schema = sqlite_df.schema();
-        assert_eq!(
-            adbc_schema.fields().len(),
-            sqlite_schema.fields().len(),
-            "{}: column count mismatch (adbc={}, sqlite={})",
-            ctx,
-            adbc_schema.fields().len(),
-            sqlite_schema.fields().len(),
-        );
-        for (i, (a, s)) in adbc_schema
-            .fields()
-            .iter()
-            .zip(sqlite_schema.fields().iter())
-            .enumerate()
-        {
-            assert_eq!(
-                a.name(),
-                s.name(),
-                "{}: column {} name mismatch (adbc='{}', sqlite='{}')",
-                ctx,
-                i,
-                a.name(),
-                s.name(),
-            );
-            assert_eq!(
-                a.data_type(),
-                s.data_type(),
-                "{}: column '{}' type mismatch (adbc={:?}, sqlite={:?})",
-                ctx,
-                a.name(),
-                a.data_type(),
-                s.data_type(),
-            );
-        }
-        assert_eq!(
-            adbc_df.height(),
-            sqlite_df.height(),
-            "{}: row count mismatch (adbc={}, sqlite={})",
-            ctx,
-            adbc_df.height(),
-            sqlite_df.height(),
-        );
-        for field in adbc_schema.fields() {
-            let a = adbc_df.column(field.name()).unwrap();
-            let s = sqlite_df.column(field.name()).unwrap();
-            assert_eq!(
-                a.as_ref(),
-                s.as_ref(),
-                "{}: column '{}' data mismatch",
-                ctx,
-                field.name(),
-            );
-        }
-    }
+    use crate::reader::test_support::assert_dataframes_equal;
 
     #[test]
     #[ignore = "requires `dbc install sqlite`; see module docs"]
diff --git a/src/reader/cache.rs b/src/reader/cache.rs
new file mode 100644
index 000000000..015022282
--- /dev/null
+++ b/src/reader/cache.rs
@@ -0,0 +1,504 @@
+//! Caching reader: any primary [`Reader`] + an in-memory writable cache.
+//!
+//! [`CachingReader`] wraps a `primary` reader and a `cache` backend (an
+//! in-memory [`CacheBackend`]). It sits between the primary reader and the
+//! rest of ggsql:
+//!
+//! - [`Reader::register`] writes to the cache and records the name.
+//! - [`Reader::materialize_table`] runs the body and `register`s the resulting
+//!   DataFrame into the cache, so the primary is only ever read from — base
+//!   reads happen via passthrough SQL, derived tables live in the cache.
+//! - [`Reader::execute_sql`] routes each statement: queries referencing a
+//!   cache-resident name (or a `ggsql:` builtin) run on the cache; base reads
+//!   run on the primary and are memoized into the cache.
+//! - [`Reader::dialect`] returns the **cache** dialect: every dialect-specific
+//!   query the executor builds targets a cache-resident table.
+
+use crate::reader::{execute_with_reader, ColumnInfo, Reader, Spec, SqlDialect, TableInfo};
+use crate::{naming, DataFrame, Result};
+use std::cell::{Cell, RefCell};
+use std::collections::{HashMap, HashSet};
+
+pub struct CachingReader {
+    primary: Box<dyn Reader + Send>,
+    cache: Box<dyn Reader + Send>,
+    /// Names that currently live in the cache (registered tables / memo results).
+    cached_names: RefCell<HashSet<String>>,
+    /// Memo of base primary reads: SQL text -> cache table name holding the result.
+    result_cache: RefCell<HashMap<String, String>>,
+    /// Monotonic counter for unique memo table names.
+    next_id: Cell<u64>,
+}
+
+impl CachingReader {
+    /// Construct a `CachingReader` from a primary reader and an in-memory cache
+    /// backend. The cache is owned by the `CachingReader` and dropped with it.
+    pub fn new(primary: Box<dyn Reader + Send>, cache: Box<dyn Reader + Send>) -> Self {
+        Self {
+            primary,
+            cache,
+            cached_names: RefCell::new(HashSet::new()),
+            result_cache: RefCell::new(HashMap::new()),
+            next_id: Cell::new(0),
+        }
+    }
+
+    /// Whether `sql` is one of the cache dialect's spatial setup statements.
+    ///
+    /// ggsql-generated spatial setup (`INSTALL`/`LOAD spatial`, …) must run on
+    /// the cache, where the spatial SQL executes — not on the primary.
+    fn is_cache_spatial_setup(&self, sql: &str) -> bool {
+        self.cache
+            .dialect()
+            .sql_spatial_setup()
+            .iter()
+            .any(|stmt| stmt == sql)
+    }
+}
+
+impl Reader for CachingReader {
+    fn execute_sql(&self, sql: &str) -> Result<DataFrame> {
+        // Cache-resident names and `ggsql:` builtins are served by the cache.
+        if references_cached_name(sql, &self.cached_names.borrow()) || sql.contains("ggsql:") {
+            return self.cache.execute_sql(sql);
+        }
+
+        // ggsql-generated spatial setup targets the cache (where spatial SQL runs).
+        if self.is_cache_spatial_setup(sql) {
+            return self.cache.execute_sql(sql);
+        }
+
+        // Base read against the primary, with result memoization.
+        if let Some(table) = self.result_cache.borrow().get(sql) {
+            return self
+                .cache
+                .execute_sql(&format!("SELECT * FROM {}", naming::quote_ident(table)));
+        }
+
+        let df = self.primary.execute_sql(sql)?;
+
+        if super::returns_rows(sql) {
+            let id = self.next_id.get();
+            self.next_id.set(id + 1);
+            let table = naming::cache_result_table(id);
+            self.cache.register(&table, df.clone(), true)?;
+            self.cached_names.borrow_mut().insert(table.clone());
+            self.result_cache
+                .borrow_mut()
+                .insert(sql.to_string(), table);
+        }
+
+        Ok(df)
+    }
+
+    fn register(&self, name: &str, df: DataFrame, replace: bool) -> Result<()> {
+        self.cache.register(name, df, replace)?;
+        self.cached_names.borrow_mut().insert(name.to_string());
+        Ok(())
+    }
+
+    fn unregister(&self, name: &str) -> Result<()> {
+        self.cache.unregister(name)?;
+        self.cached_names.borrow_mut().remove(name);
+        Ok(())
+    }
+
+    fn execute(&self, query: &str) -> Result<Spec> {
+        execute_with_reader(self, query)
+    }
+
+    fn dialect(&self) -> &dyn SqlDialect {
+        // All executor-generated SQL targets cache-resident tables.
+        self.cache.dialect()
+    }
+
+    fn materialize_table(
+        &self,
+        name: &str,
+        column_aliases: &[String],
+        body_sql: &str,
+    ) -> Result<()> {
+        let body = super::wrap_with_column_aliases(body_sql, column_aliases);
+        let df = self.execute_sql(&body)?;
+        self.register(name, df, true)
+    }
+
+    fn caches_sources(&self) -> bool {
+        true
+    }
+
+    // Schema introspection describes the real data source, so delegate to the
+    // primary; the cache only holds synthetic `__ggsql_*` tables.
+    fn list_catalogs(&self) -> Result<Vec<String>> {
+        self.primary.list_catalogs()
+    }
+
+    fn list_schemas(&self, catalog: &str) -> Result<Vec<String>> {
+        self.primary.list_schemas(catalog)
+    }
+
+    fn list_tables(&self, catalog: &str, schema: &str) -> Result<Vec<TableInfo>> {
+        self.primary.list_tables(catalog, schema)
+    }
+
+    fn list_columns(&self, catalog: &str, schema: &str, table: &str) -> Result<Vec<ColumnInfo>> {
+        self.primary.list_columns(catalog, schema, table)
+    }
+}
+
+/// Check whether `sql` references any name in `cached_names` as a SQL
+/// identifier (not part of a longer identifier, not inside a single-quoted
+/// string literal).
+///
+/// Matches a bare reference (`FROM orders`), a double-quoted identifier
+/// (`FROM "orders"`), or a qualified reference (`FROM catalog.schema.orders`).
+/// Does not match substrings of longer identifiers (`orders_detail`) or
+/// string-literal contents (`'orders of magnitude'`). This is a lightweight
+/// scanner, not a full SQL parser: a cached name appearing only inside a
+/// comment is misrouted to the cache and fails with a clear error rather than
+/// silently hitting the primary.
+fn references_cached_name(sql: &str, cached_names: &HashSet<String>) -> bool {
+    cached_names
+        .iter()
+        .any(|name| sql_references_identifier(sql, name))
+}
+
+fn sql_references_identifier(sql: &str, name: &str) -> bool {
+    let bytes = sql.as_bytes();
+    let name_bytes = name.as_bytes();
+    let n = name_bytes.len();
+    if n == 0 {
+        return false;
+    }
+    let mut i = 0;
+    while i + n <= bytes.len() {
+        if &bytes[i..i + n] == name_bytes {
+            let before_ok = i == 0 || !is_identifier_byte(bytes[i - 1]);
+            let after_ok = i + n == bytes.len() || !is_identifier_byte(bytes[i + n]);
+            if before_ok && after_ok && !is_inside_string_literal(bytes, i) {
+                return true;
+            }
+        }
+        i += 1;
+    }
+    false
+}
+
+fn is_identifier_byte(b: u8) -> bool {
+    b.is_ascii_alphanumeric() || b == b'_'
+}
+
+/// Walk from start to `pos` tracking whether we're inside a single-quoted
+/// string literal. SQL-standard doubled-single-quote (`''`) is an escape that
+/// keeps us inside the literal.
+fn is_inside_string_literal(bytes: &[u8], pos: usize) -> bool {
+    let mut inside = false;
+    let mut i = 0;
+    while i < pos && i < bytes.len() {
+        if bytes[i] == b'\'' {
+            if inside && i + 1 < bytes.len() && bytes[i + 1] == b'\'' {
+                i += 2;
+                continue;
+            }
+            inside = !inside;
+        }
+        i += 1;
+    }
+    inside
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_references_cached_name_empty_set() {
+        let set = HashSet::new();
+        assert!(!references_cached_name("SELECT * FROM foo", &set));
+    }
+
+    #[test]
+    fn test_references_cached_name_match() {
+        let mut set = HashSet::new();
+        set.insert("__ggsql_global_abc123__".to_string());
+        assert!(references_cached_name(
+            "SELECT * FROM __ggsql_global_abc123__ WHERE x > 1",
+            &set
+        ));
+    }
+
+    #[test]
+    fn test_references_cached_name_rejects_longer_identifier() {
+        let mut set = HashSet::new();
+        set.insert("orders".to_string());
+        assert!(!references_cached_name(
+            "SELECT * FROM orders_detail WHERE x > 1",
+            &set
+        ));
+    }
+
+    #[test]
+    fn test_references_cached_name_rejects_prefix_of_longer_identifier() {
+        let mut set = HashSet::new();
+        set.insert("col".to_string());
+        assert!(!references_cached_name("SELECT col_id FROM users", &set));
+    }
+
+    #[test]
+    fn test_references_cached_name_rejects_inside_string_literal() {
+        let mut set = HashSet::new();
+        set.insert("orders".to_string());
+        assert!(!references_cached_name(
+            "SELECT 'orders of magnitude' AS label",
+            &set
+        ));
+    }
+
+    #[test]
+    fn test_references_cached_name_matches_quoted_identifier() {
+        let mut set = HashSet::new();
+        set.insert("orders".to_string());
+        assert!(references_cached_name(r#"SELECT * FROM "orders""#, &set));
+    }
+
+    #[test]
+    fn test_references_cached_name_matches_qualified_reference() {
+        let mut set = HashSet::new();
+        set.insert("orders".to_string());
+        assert!(references_cached_name(
+            "SELECT * FROM catalog.schema.orders WHERE x > 1",
+            &set
+        ));
+    }
+
+    #[test]
+    fn test_references_cached_name_handles_escaped_quotes_in_literal() {
+        let mut set = HashSet::new();
+        set.insert("orders".to_string());
+        assert!(references_cached_name(
+            "SELECT 'it''s fine' FROM orders",
+            &set
+        ));
+    }
+}
+
+#[cfg(all(test, feature = "duckdb"))]
+mod behavior_tests {
+    use super::*;
+    use crate::array_util::as_i64;
+    use crate::df;
+    use crate::reader::test_support::{ReadOnlyReader, SpyReader};
+    use crate::reader::{CacheBackend, DuckDBReader};
+
+    #[test]
+    fn test_register_writes_to_cache_and_query_routes_there() {
+        let (primary, log) = SpyReader::wrap(Box::new(DuckDBReader::new_in_memory().unwrap()));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache);
+
+        reader
+            .register("t", df! { "x" => vec![1_i64, 2, 3] }.unwrap(), true)
+            .unwrap();
+        let out = reader.execute_sql("SELECT COUNT(*) AS n FROM t").unwrap();
+
+        assert_eq!(as_i64(out.column("n").unwrap()).unwrap().value(0), 3);
+        // The query routed to the cache; the primary was never touched.
+        assert!(log.lock().unwrap().is_empty());
+    }
+
+    #[test]
+    fn test_base_read_routes_to_primary_and_memoizes() {
+        let inner = DuckDBReader::new_in_memory().unwrap();
+        inner
+            .register("base", df! { "y" => vec![1_i64, 2, 3] }.unwrap(), true)
+            .unwrap();
+        let (primary, log) = SpyReader::wrap(Box::new(inner));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache);
+
+        let q = "SELECT y FROM base ORDER BY y";
+        let d1 = reader.execute_sql(q).unwrap();
+        let d2 = reader.execute_sql(q).unwrap();
+        assert_eq!(d1.height(), 3);
+        assert_eq!(d2.height(), 3);
+
+        // The primary executed the base read exactly once; the repeat was served
+        // from the cache memo.
+        let hits = log
+            .lock()
+            .unwrap()
+            .iter()
+            .filter(|s| s.as_str() == q)
+            .count();
+        assert_eq!(hits, 1);
+    }
+
+    #[test]
+    fn test_full_execute_keeps_computation_off_primary() {
+        let inner = DuckDBReader::new_in_memory().unwrap();
+        inner
+            .register(
+                "sales",
+                df! { "x" => vec![1_i64, 2, 3, 4], "y" => vec![10_i64, 20, 30, 40] }.unwrap(),
+                true,
+            )
+            .unwrap();
+        let (primary, log) = SpyReader::wrap(Box::new(inner));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache);
+
+        reader
+            .execute("SELECT x, y FROM sales VISUALISE x, y DRAW point")
+            .unwrap();
+
+        let log = log.lock().unwrap();
+        // The primary is only ever read from: no temp-table DDL, no derived
+        // `__ggsql_*` tables ever reach it.
+        assert!(
+            log.iter().all(|s| !s.to_uppercase().contains("TEMP TABLE")),
+            "primary must not be written to: {:?}",
+            *log
+        );
+        assert!(
+            log.iter().all(|s| !s.contains("__ggsql_")),
+            "primary must not see derived tables: {:?}",
+            *log
+        );
+        // It did see the base read.
+        assert!(log.iter().any(|s| s.contains("sales")));
+    }
+
+    #[test]
+    fn test_caching_makes_read_only_primary_usable() {
+        // The read-only-primary value proposition, exercised in every DuckDB
+        // build (including duckdb-only CI): a primary that refuses all writes is
+        // unusable on its own — materializing the global temp table fails — but
+        // works once wrapped in a caching layer, because every write goes to the
+        // cache and the primary is only read.
+        let query = "SELECT v FROM t VISUALISE v AS x DRAW histogram";
+
+        // Bare read-only primary: materialization must fail.
+        let bare_primary = DuckDBReader::new_in_memory().unwrap();
+        bare_primary
+            .register(
+                "t",
+                df! { "v" => vec![1.0_f64, 2.0, 3.0, 4.0] }.unwrap(),
+                true,
+            )
+            .unwrap();
+        let bare = ReadOnlyReader::new(Box::new(bare_primary));
+        assert!(
+            bare.execute(query).is_err(),
+            "a read-only primary with no cache must fail to materialize"
+        );
+
+        // Same read-only primary behind a cache: must succeed.
+        let primary = DuckDBReader::new_in_memory().unwrap();
+        primary
+            .register(
+                "t",
+                df! { "v" => vec![1.0_f64, 2.0, 3.0, 4.0] }.unwrap(),
+                true,
+            )
+            .unwrap();
+        let cached = CachingReader::new(
+            Box::new(ReadOnlyReader::new(Box::new(primary))),
+            Box::new(DuckDBReader::new_in_memory().unwrap()),
+        );
+        assert!(
+            cached.execute(query).is_ok(),
+            "caching should make a read-only primary usable"
+        );
+    }
+
+    #[test]
+    fn test_no_cache_path_materializes_on_the_reader() {
+        // A plain reader (no CachingReader) must keep today's behavior:
+        // derived tables are materialized on the reader itself.
+        let inner = DuckDBReader::new_in_memory().unwrap();
+        inner
+            .register(
+                "sales",
+                df! { "x" => vec![1_i64, 2, 3], "y" => vec![10_i64, 20, 30] }.unwrap(),
+                true,
+            )
+            .unwrap();
+        let (reader, log) = SpyReader::wrap(Box::new(inner));
+
+        reader
+            .execute("SELECT x, y FROM sales VISUALISE x, y DRAW point")
+            .unwrap();
+
+        assert!(
+            log.lock()
+                .unwrap()
+                .iter()
+                .any(|s| s.to_uppercase().contains("TEMP TABLE")),
+            "default path must materialize on the reader"
+        );
+    }
+
+    #[cfg(feature = "spatial")]
+    #[test]
+    fn test_spatial_setup_routes_to_cache() {
+        let (primary, log) = SpyReader::wrap(Box::new(DuckDBReader::new_in_memory().unwrap()));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache);
+
+        // ggsql-generated spatial setup must run on the cache (where the spatial
+        // SQL executes), never on the primary.
+        let setup = reader.dialect().sql_spatial_setup();
+        assert!(!setup.is_empty(), "DuckDB cache should emit spatial setup");
+        for stmt in &setup {
+            // Ignore execution result (may require the extension); we only assert routing.
+            let _ = reader.execute_sql(stmt);
+        }
+        let log = log.lock().unwrap();
+        for stmt in &setup {
+            assert!(
+                !log.contains(stmt),
+                "spatial setup must route to the cache, not the primary: {}",
+                stmt
+            );
+        }
+    }
+
+    #[cfg(feature = "sqlite")]
+    #[test]
+    fn test_dialect_returns_cache_dialect() {
+        use crate::reader::SqliteReader;
+        // SQLite primary, DuckDB cache: dialect() must return DuckDB's (native
+        // GREATEST), not SQLite's (CASE fallback).
+        let primary = Box::new(SqliteReader::new().unwrap());
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache);
+        assert_eq!(reader.dialect().sql_greatest(&["a", "b"]), "GREATEST(a, b)");
+    }
+
+    #[cfg(feature = "sqlite")]
+    #[test]
+    fn test_explicit_layer_source_with_stat_heterogeneous() {
+        use crate::reader::SqliteReader;
+        // SQLite primary holds the table; DuckDB is the cache. A layer draws a
+        // histogram from the primary table, which generates DuckDB-dialect stat
+        // SQL. This only works because the layer source is materialized into the
+        // cache; otherwise DuckDB SQL would run against the SQLite primary.
+        let primary = SqliteReader::new().unwrap();
+        primary
+            .register(
+                "tbl",
+                df! { "val" => vec![1.0_f64, 2.0, 2.0, 3.0, 3.0, 3.0, 9.0] }.unwrap(),
+                true,
+            )
+            .unwrap();
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(Box::new(primary), cache);
+
+        let spec = reader.execute("VISUALISE x DRAW histogram MAPPING val AS x FROM tbl");
+        assert!(
+            spec.is_ok(),
+            "explicit-source histogram failed: {:?}",
+            spec.err()
+        );
+    }
+}
diff --git a/src/reader/cache_equivalence.rs b/src/reader/cache_equivalence.rs
new file mode 100644
index 000000000..6b8f808d2
--- /dev/null
+++ b/src/reader/cache_equivalence.rs
@@ -0,0 +1,334 @@
+//! Equivalence and read-only-safety tests for the caching layer.
+//!
+//! These exercise the whole mechanism (materialization, routing, the result
+//! cache, builtin routing, layer-source staging, dialect selection).
+
+use super::CachingReader;
+use crate::reader::test_support::{ReadOnlyReader, SpyReader};
+use crate::reader::{CacheBackend, DuckDBReader, Reader, SqliteReader};
+use crate::DataFrame;
+
+/// One corpus entry. `builtin_only` queries read only `ggsql:` datasets (which
+/// route to the cache), so they can be compared exactly to plain DuckDB.
+struct Case {
+    query: &'static str,
+    builtin_only: bool,
+}
+
+/// A stat-heavy corpus — these generate the most cache-dialect SQL
+/// (`sql_percentile`, `sql_greatest`/`sql_least`, `sql_generate_series`,
+/// casts), where caching is most likely to diverge.
+const CORPUS: &[Case] = &[
+    // boxplot: quantiles / IQR
+    Case {
+        query: "VISUALISE species AS x, bill_len AS y FROM ggsql:penguins DRAW boxplot",
+        builtin_only: true,
+    },
+    // histogram: binning + casts (global SELECT over a builtin)
+    Case {
+        query: "SELECT Temp FROM ggsql:airquality VISUALISE Temp AS x DRAW histogram",
+        builtin_only: true,
+    },
+    // density: percentile + generate_series + stddev
+    Case {
+        query: "VISUALISE bill_len AS x, species AS colour FROM ggsql:penguins DRAW density",
+        builtin_only: true,
+    },
+    // grouped aggregation + facet + discrete scale
+    Case {
+        query: "SELECT species, bill_len, island FROM ggsql:penguins \
+                VISUALISE species AS x, bill_len AS y \
+                DRAW bar SETTING aggregate => 'mean' FACET island",
+        builtin_only: true,
+    },
+    // WITH CTE + multi-layer + FILTER (CTE materialization + global + per-layer routing)
+    Case {
+        query: "WITH hot AS (SELECT Date, Temp FROM ggsql:airquality WHERE Temp > 70) \
+                SELECT Date, Temp FROM hot \
+                VISUALISE Date AS x, Temp AS y \
+                DRAW line DRAW point FILTER Temp > 80 SCALE x VIA date",
+        builtin_only: true,
+    },
+    // explicit per-layer source from a seeded table → forces layer-source staging
+    // (`caches_sources`) and the read-from-primary + stat-on-cache path.
+    Case {
+        query: "VISUALISE species AS x, bill_len AS y \
+                DRAW boxplot MAPPING species AS x, bill_len AS y FROM cache_eq_tbl",
+        builtin_only: false,
+    },
+];
+
+/// Seed the table referenced by the non-builtin corpus entry.
+fn seed(reader: &dyn Reader) {
+    let df = crate::df! {
+        "species" => vec!["A", "A", "B", "B", "B", "C"],
+        "bill_len" => vec![1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0],
+    }
+    .unwrap();
+    reader.register("cache_eq_tbl", df, true).unwrap();
+}
+
+/// Stringify each row using the given (canonical, sorted) column order, so two
+/// DataFrames can be compared as row multisets — ignoring both physical row order
+/// (a query without `ORDER BY` may return rows in a different order on each
+/// materialization path) and column order (aesthetic columns are emitted in
+/// HashMap order; they bind to encoding channels by name, not position).
+fn row_multiset(df: &DataFrame, names: &[String]) -> Vec<String> {
+    (0..df.height())
+        .map(|i| {
+            names
+                .iter()
+                .map(|n| crate::array_util::value_to_string(df.column(n).unwrap(), i))
+                .collect::<Vec<_>>()
+                .join("\u{1f}")
+        })
+        .collect()
+}
+
+/// Assert two layer DataFrames have the same columns (by name + type) and the
+/// same set of rows — insensitive to row and column ordering.
+fn assert_data_equivalent(a: &DataFrame, b: &DataFrame, ctx: &str) {
+    let mut na: Vec<String> = a
+        .schema()
+        .fields()
+        .iter()
+        .map(|f| f.name().to_string())
+        .collect();
+    let mut nb: Vec<String> = b
+        .schema()
+        .fields()
+        .iter()
+        .map(|f| f.name().to_string())
+        .collect();
+    na.sort();
+    nb.sort();
+    assert_eq!(na, nb, "{ctx}: column-name set");
+    for name in &na {
+        assert_eq!(
+            a.column(name).unwrap().data_type(),
+            b.column(name).unwrap().data_type(),
+            "{ctx}: column '{name}' type",
+        );
+    }
+    assert_eq!(a.height(), b.height(), "{ctx}: row count");
+    let (mut ra, mut rb) = (row_multiset(a, &na), row_multiset(b, &na));
+    ra.sort();
+    rb.sort();
+    assert_eq!(ra, rb, "{ctx}: row multiset mismatch");
+}
+
+/// Assert that a query produces equivalent output through `plain` and `cached`.
+/// A query a backend can't run must fail the same way with or without the cache.
+fn assert_equivalent(plain: &dyn Reader, cached: &dyn Reader, query: &str) {
+    let a = plain.execute(query);
+    let b = cached.execute(query);
+    assert_eq!(
+        a.is_ok(),
+        b.is_ok(),
+        "ok-mismatch for `{query}`: plain={:?} cached={:?}",
+        a.as_ref().err(),
+        b.as_ref().err(),
+    );
+    let (Ok(sa), Ok(sb)) = (a, b) else { return };
+
+    assert_eq!(
+        sa.layer_count(),
+        sb.layer_count(),
+        "layer count for `{query}`"
+    );
+    for i in 0..sa.layer_count() {
+        match (sa.layer_data(i), sb.layer_data(i)) {
+            (Some(da), Some(db)) => assert_data_equivalent(da, db, &format!("`{query}` layer {i}")),
+            (None, None) => {}
+            _ => panic!("layer {i} data-presence mismatch for `{query}`"),
+        }
+    }
+}
+
+/// The builtin corpus through `{ ReadOnlyReader(SQLite), DuckDB }` matches plain
+/// DuckDB exactly. The DuckDB cache does all reading and computing for `ggsql:`
+/// sources; the SQLite primary stays idle and unwritten.
+#[test]
+fn mode1_builtin_equivalence_matches_plain_duckdb() {
+    if !cfg!(feature = "builtin-data") {
+        return; // builtin corpus needs the embedded datasets
+    }
+    for case in CORPUS.iter().filter(|c| c.builtin_only) {
+        let plain = DuckDBReader::new_in_memory().unwrap();
+        let primary = ReadOnlyReader::new(Box::new(SqliteReader::new().unwrap()));
+        let cache = DuckDBReader::new_in_memory().unwrap();
+        let cached = CachingReader::new(Box::new(primary), Box::new(cache));
+        assert_equivalent(&plain, &cached, case.query);
+    }
+}
+
+/// Read-only safety: the full corpus succeeds through the caching reader,
+/// proving a read-only/remote primary is sufficient.
+#[test]
+fn mode1_read_only_primary_is_sufficient() {
+    for case in CORPUS {
+        if case.builtin_only && !cfg!(feature = "builtin-data") {
+            continue;
+        }
+        let sqlite = SqliteReader::new().unwrap();
+        seed(&sqlite);
+        let primary = ReadOnlyReader::new(Box::new(sqlite));
+        let cache = DuckDBReader::new_in_memory().unwrap();
+        let cached = CachingReader::new(Box::new(primary), Box::new(cache));
+        let r = cached.execute(case.query);
+        assert!(
+            r.is_ok(),
+            "read-only primary should suffice with caching for `{}`: {:?}",
+            case.query,
+            r.err(),
+        );
+    }
+}
+
+/// Cross-call memoization: a second identical execute does not re-read the
+/// primary, because the base read is served from the cache memo.
+#[test]
+fn cross_call_memoization_avoids_second_primary_read() {
+    let sqlite = SqliteReader::new().unwrap();
+    seed(&sqlite);
+    let (primary, log) = SpyReader::wrap(Box::new(sqlite));
+    let cache = DuckDBReader::new_in_memory().unwrap();
+    let cached = CachingReader::new(primary, Box::new(cache));
+
+    let query = "SELECT bill_len FROM cache_eq_tbl VISUALISE bill_len AS x DRAW histogram";
+    cached.execute(query).unwrap();
+    let after_first = log.lock().unwrap().len();
+    cached.execute(query).unwrap();
+    let after_second = log.lock().unwrap().len();
+
+    assert!(after_first >= 1, "first execute should read the primary");
+    assert_eq!(
+        after_first,
+        after_second,
+        "second execute must not re-hit the primary; log: {:?}",
+        *log.lock().unwrap(),
+    );
+}
+
+/// The library factory builds working caching readers from composite URIs.
+#[test]
+fn factory_builds_caching_readers() {
+    use crate::reader::connection::reader_from_uri;
+    for uri in ["duckdb+sqlite://memory", "sqlite+duckdb://memory"] {
+        let r = reader_from_uri(uri).unwrap_or_else(|e| panic!("build `{uri}`: {e}"));
+        if cfg!(feature = "builtin-data") {
+            let spec =
+                r.execute("VISUALISE species AS x, bill_len AS y FROM ggsql:penguins DRAW boxplot");
+            assert!(
+                spec.is_ok(),
+                "factory reader failed for `{uri}`: {:?}",
+                spec.err()
+            );
+        }
+    }
+}
+
+/// A real external ADBC SQLite primary + DuckDB cache, compared against a bare ADBC reader.
+/// `#[ignore]` — requires `dbc install sqlite`.
+#[cfg(feature = "adbc")]
+mod adbc_mode {
+    use super::*;
+    use crate::reader::sqlite::SqliteDialect;
+    use crate::reader::test_support::assert_dataframes_equal;
+    use crate::reader::{AdbcReader, Spec, SqlDialect};
+    use crate::{DataFrame, Result};
+    use adbc_core::options::{AdbcVersion, OptionDatabase, OptionValue};
+    use adbc_core::LOAD_FLAG_DEFAULT;
+    use adbc_driver_manager::ManagedDriver;
+    use std::sync::atomic::{AtomicUsize, Ordering};
+    use std::sync::Arc;
+    use tempfile::NamedTempFile;
+
+    fn make_adbc_reader(db_path: &str) -> AdbcReader<ManagedDriver> {
+        let driver = ManagedDriver::load_from_name(
+            "sqlite",
+            None,
+            AdbcVersion::V110,
+            LOAD_FLAG_DEFAULT,
+            None,
+        )
+        .expect("`dbc install sqlite` first; see adbc.rs::equivalence_tests docs");
+        let dialect: Box<dyn SqlDialect + Send> = Box::new(SqliteDialect);
+        AdbcReader::new_with_database_opts(
+            driver,
+            dialect,
+            std::iter::once((
+                OptionDatabase::Uri,
+                OptionValue::String(format!("file:{}", db_path)),
+            )),
+        )
+        .expect("construct AdbcReader<sqlite>")
+    }
+
+    fn seed_adbc(path: &str) {
+        let bare = make_adbc_reader(path);
+        let df = crate::df! {
+            "x" => vec![1_i64, 2, 3, 4, 5],
+            "y" => vec![10_i64, 20, 30, 40, 50],
+        }
+        .unwrap();
+        bare.register("t", df, false).unwrap();
+    }
+
+    /// Counts `execute_sql` calls reaching the ADBC primary.
+    struct CountingAdbcReader {
+        inner: AdbcReader<ManagedDriver>,
+        calls: Arc<AtomicUsize>,
+    }
+
+    impl Reader for CountingAdbcReader {
+        fn execute_sql(&self, sql: &str) -> Result<DataFrame> {
+            self.calls.fetch_add(1, Ordering::SeqCst);
+            self.inner.execute_sql(sql)
+        }
+        fn register(&self, name: &str, df: DataFrame, replace: bool) -> Result<()> {
+            self.inner.register(name, df, replace)
+        }
+        fn unregister(&self, name: &str) -> Result<()> {
+            self.inner.unregister(name)
+        }
+        fn execute(&self, query: &str) -> Result<Spec> {
+            crate::reader::execute_with_reader(self, query)
+        }
+        fn dialect(&self) -> &dyn SqlDialect {
+            self.inner.dialect()
+        }
+    }
+
+    #[test]
+    #[ignore = "requires `dbc install sqlite`"]
+    fn mode2_adbc_primary_duckdb_cache_equiv_and_memo() {
+        let db = NamedTempFile::new().unwrap();
+        let path = db.path().to_str().unwrap();
+        seed_adbc(path);
+
+        let sql = "SELECT x, y, x*y AS xy FROM t WHERE y > 15 ORDER BY x";
+        let golden = make_adbc_reader(path).execute_sql(sql).unwrap();
+
+        let calls = Arc::new(AtomicUsize::new(0));
+        let primary = CountingAdbcReader {
+            inner: make_adbc_reader(path),
+            calls: calls.clone(),
+        };
+        let cache = DuckDBReader::new_in_memory().unwrap();
+        let cached = CachingReader::new(Box::new(primary), Box::new(cache));
+
+        let miss = cached.execute_sql(sql).unwrap();
+        assert_dataframes_equal(&golden, &miss, "adbc cache miss");
+        let after_miss = calls.load(Ordering::SeqCst);
+        assert!(after_miss >= 1, "miss should reach the ADBC primary");
+
+        let hit = cached.execute_sql(sql).unwrap();
+        assert_dataframes_equal(&golden, &hit, "adbc cache hit");
+        let after_hit = calls.load(Ordering::SeqCst);
+        assert_eq!(
+            after_miss, after_hit,
+            "cache hit must not round-trip to the ADBC primary"
+        );
+    }
+}
diff --git a/src/reader/connection.rs b/src/reader/connection.rs
index 8f72c7c57..eebeffdf1 100644
--- a/src/reader/connection.rs
+++ b/src/reader/connection.rs
@@ -1,78 +1,125 @@
-//! Connection string parsing for data sources
+//! Connection string handling for data sources.
 //!
-//! Parses URI-style connection strings to determine database type and connection parameters.
+//! Maps URI-style connection strings (`duckdb://…`, `sqlite://…`, `odbc://…`) and
+//! the composite caching form (`<primary>+<cache>://…`) to readers.
 
+use crate::reader::Reader;
 use crate::{GgsqlError, Result};
 
-/// Parsed connection information
-#[derive(Debug, Clone, PartialEq)]
-pub enum ConnectionInfo {
-    /// DuckDB in-memory database
-    DuckDBMemory,
-    /// DuckDB file-based database
-    DuckDBFile(String),
-    /// PostgreSQL connection
-    #[allow(dead_code)]
-    PostgreSQL(String),
-    /// SQLite file-based database
-    #[allow(dead_code)]
-    SQLite(String),
-    /// Generic ODBC connection (raw connection string after `odbc://` prefix)
-    #[allow(dead_code)]
-    ODBC(String),
-}
-
-/// Parse a connection string into connection information
+/// Split a composite cache URI `<primary>+<cache>://<rest>` into the primary
+/// connection URI and the cache backend scheme.
 ///
-/// # Supported Formats
+/// Returns `None` when there is no `+<cache>` before `://` (a plain URI).
 ///
-/// - `duckdb://memory` - DuckDB in-memory database
-/// - `duckdb://...` - DuckDB path
-/// - `postgres://...` - PostgreSQL connection string
-/// - `sqlite://...` - SQLite file path
+/// # Example
+/// ```
+/// use ggsql::reader::connection::split_cache_uri;
+/// assert_eq!(
+///     split_cache_uri("odbc+duckdb://DSN=foo"),
+///     Some(("odbc://DSN=foo".to_string(), "duckdb".to_string()))
+/// );
+/// assert_eq!(split_cache_uri("duckdb://memory"), None);
 /// ```
-pub fn parse_connection_string(uri: &str) -> Result<ConnectionInfo> {
-    if uri == "duckdb://memory" {
-        return Ok(ConnectionInfo::DuckDBMemory);
+pub fn split_cache_uri(uri: &str) -> Option<(String, String)> {
+    let (scheme, rest) = uri.split_once("://")?;
+    let (primary, cache) = scheme.split_once('+')?;
+    if primary.is_empty() || cache.is_empty() || cache.contains('+') {
+        return None;
     }
+    Some((format!("{}://{}", primary, rest), cache.to_string()))
+}
 
-    if let Some(path) = uri.strip_prefix("duckdb://") {
-        if path.is_empty() {
+/// Map a cache-backend scheme to its in-memory connection URI.
+#[cfg(any(feature = "duckdb", feature = "sqlite"))]
+fn cache_uri(scheme: &str) -> Result<&'static str> {
+    match scheme {
+        "duckdb" => Ok("duckdb://memory"),
+        "sqlite" => Ok("sqlite://memory"),
+        _ => Err(GgsqlError::ReaderError(format!(
+            "Unsupported cache backend '{}'. Supported: duckdb, sqlite",
+            scheme
+        ))),
+    }
+}
+
+/// Build a reader from a non-composite connection URI
+pub fn build_reader(uri: &str) -> Result<Box<dyn Reader + Send>> {
+    if uri.starts_with("duckdb://") {
+        #[cfg(feature = "duckdb")]
+        {
+            return Ok(Box::new(
+                crate::reader::DuckDBReader::from_connection_string(uri)?,
+            ));
+        }
+        #[cfg(not(feature = "duckdb"))]
+        {
             return Err(GgsqlError::ReaderError(
-                "DuckDB file path cannot be empty".to_string(),
+                "DuckDB reader not compiled in. Rebuild with --features duckdb".to_string(),
             ));
         }
-        return Ok(ConnectionInfo::DuckDBFile(path.to_string()));
     }
-
-    if uri.starts_with("postgres://") || uri.starts_with("postgresql://") {
-        return Ok(ConnectionInfo::PostgreSQL(uri.to_string()));
-    }
-
-    if let Some(path) = uri.strip_prefix("sqlite://") {
-        if path.is_empty() {
+    if uri.starts_with("sqlite://") {
+        #[cfg(feature = "sqlite")]
+        {
+            return Ok(Box::new(
+                crate::reader::SqliteReader::from_connection_string(uri)?,
+            ));
+        }
+        #[cfg(not(feature = "sqlite"))]
+        {
             return Err(GgsqlError::ReaderError(
-                "SQLite file path cannot be empty".to_string(),
+                "SQLite reader not compiled in. Rebuild with --features sqlite".to_string(),
             ));
         }
-        return Ok(ConnectionInfo::SQLite(path.to_string()));
     }
-
-    if let Some(conn_str) = uri.strip_prefix("odbc://") {
-        if conn_str.is_empty() {
+    if uri.starts_with("odbc://") {
+        #[cfg(feature = "odbc")]
+        {
+            return Ok(Box::new(crate::reader::OdbcReader::from_connection_string(
+                uri,
+            )?));
+        }
+        #[cfg(not(feature = "odbc"))]
+        {
             return Err(GgsqlError::ReaderError(
-                "ODBC connection string cannot be empty".to_string(),
+                "ODBC reader not compiled in. Rebuild with --features odbc".to_string(),
             ));
         }
-        return Ok(ConnectionInfo::ODBC(conn_str.to_string()));
     }
-
+    if uri.starts_with("postgres://") || uri.starts_with("postgresql://") {
+        return Err(GgsqlError::ReaderError(
+            "PostgreSQL reader is not yet implemented".to_string(),
+        ));
+    }
     Err(GgsqlError::ReaderError(format!(
-        "Unsupported connection string format: {}. Supported: duckdb://, postgres://, sqlite://, odbc://",
+        "Unsupported connection string: {}. Supported: duckdb://, sqlite://, odbc://",
         uri
     )))
 }
 
+/// Construct a reader from a connection URI, wrapping it in a [`CachingReader`]
+/// when the URI uses the composite `<primary>+<cache>://` form.
+///
+/// [`CachingReader`]: crate::reader::CachingReader
+pub fn reader_from_uri(uri: &str) -> Result<Box<dyn Reader + Send>> {
+    if let Some((primary_uri, cache_scheme)) = split_cache_uri(uri) {
+        #[cfg(any(feature = "duckdb", feature = "sqlite"))]
+        {
+            let primary = build_reader(&primary_uri)?;
+            let cache = build_reader(cache_uri(&cache_scheme)?)?;
+            return Ok(Box::new(crate::reader::CachingReader::new(primary, cache)));
+        }
+        #[cfg(not(any(feature = "duckdb", feature = "sqlite")))]
+        {
+            let _ = (&primary_uri, &cache_scheme);
+            return Err(GgsqlError::ReaderError(
+                "Caching layer requires the duckdb or sqlite feature".to_string(),
+            ));
+        }
+    }
+    build_reader(uri)
+}
+
 /// Extract a value from an ODBC connection string by key, stripping braces.
 pub fn extract_odbc_value(conn_str: &str, key: &str) -> Option<String> {
     let lower = conn_str.to_lowercase();
@@ -93,91 +140,77 @@ mod tests {
     use super::*;
 
     #[test]
-    fn test_duckdb_memory() {
-        let info = parse_connection_string("duckdb://memory").unwrap();
-        assert_eq!(info, ConnectionInfo::DuckDBMemory);
+    fn test_build_reader_unsupported_scheme() {
+        let err = build_reader("mysql://localhost/db")
+            .err()
+            .unwrap()
+            .to_string();
+        assert!(err.contains("Unsupported connection string"), "got: {err}");
     }
 
     #[test]
-    fn test_duckdb_file_relative() {
-        let info = parse_connection_string("duckdb://data.db").unwrap();
-        assert_eq!(info, ConnectionInfo::DuckDBFile("data.db".to_string()));
+    fn test_build_reader_postgres_not_implemented() {
+        let err = build_reader("postgres://user@localhost/db")
+            .err()
+            .unwrap()
+            .to_string();
+        assert!(err.contains("not yet implemented"), "got: {err}");
     }
 
+    #[cfg(feature = "duckdb")]
     #[test]
-    fn test_duckdb_file_absolute() {
-        let info = parse_connection_string("duckdb:///tmp/data.db").unwrap();
-        assert_eq!(info, ConnectionInfo::DuckDBFile("/tmp/data.db".to_string()));
+    fn test_build_reader_duckdb_memory_and_empty() {
+        assert!(build_reader("duckdb://memory").is_ok());
+        assert!(build_reader("duckdb://").is_err());
     }
 
+    #[cfg(feature = "sqlite")]
     #[test]
-    fn test_duckdb_file_nested() {
-        let info = parse_connection_string("duckdb://path/to/data.db").unwrap();
-        assert_eq!(
-            info,
-            ConnectionInfo::DuckDBFile("path/to/data.db".to_string())
-        );
+    fn test_build_reader_sqlite_memory() {
+        assert!(build_reader("sqlite://memory").is_ok());
+        assert!(build_reader("sqlite://:memory:").is_ok());
     }
 
+    #[cfg(all(feature = "duckdb", feature = "sqlite"))]
     #[test]
-    fn test_postgres() {
-        let uri = "postgres://user:pass@localhost/db";
-        let info = parse_connection_string(uri).unwrap();
-        assert_eq!(info, ConnectionInfo::PostgreSQL(uri.to_string()));
+    fn test_reader_from_uri_composite_builds() {
+        assert!(reader_from_uri("duckdb+sqlite://memory").is_ok());
+        assert!(reader_from_uri("sqlite+duckdb://memory").is_ok());
     }
 
     #[test]
-    fn test_postgresql_alias() {
-        let uri = "postgresql://user:pass@localhost/db";
-        let info = parse_connection_string(uri).unwrap();
-        assert_eq!(info, ConnectionInfo::PostgreSQL(uri.to_string()));
-    }
-
-    #[test]
-    fn test_sqlite() {
-        let info = parse_connection_string("sqlite://data.db").unwrap();
-        assert_eq!(info, ConnectionInfo::SQLite("data.db".to_string()));
-    }
-
-    #[test]
-    fn test_sqlite_absolute() {
-        let info = parse_connection_string("sqlite:///tmp/data.db").unwrap();
-        assert_eq!(info, ConnectionInfo::SQLite("/tmp/data.db".to_string()));
+    fn test_split_cache_uri_odbc_duckdb() {
+        assert_eq!(
+            split_cache_uri("odbc+duckdb://Driver=Snowflake;Server=x"),
+            Some((
+                "odbc://Driver=Snowflake;Server=x".to_string(),
+                "duckdb".to_string()
+            ))
+        );
     }
 
     #[test]
-    fn test_empty_duckdb_path() {
-        let result = parse_connection_string("duckdb://");
-        assert!(result.is_err());
+    fn test_split_cache_uri_duckdb_sqlite_memory() {
+        assert_eq!(
+            split_cache_uri("duckdb+sqlite://memory"),
+            Some(("duckdb://memory".to_string(), "sqlite".to_string()))
+        );
     }
 
     #[test]
-    fn test_odbc() {
-        let info = parse_connection_string(
-            "odbc://Driver=Snowflake;Server=myaccount.snowflakecomputing.com",
-        )
-        .unwrap();
-        assert_eq!(
-            info,
-            ConnectionInfo::ODBC(
-                "Driver=Snowflake;Server=myaccount.snowflakecomputing.com".to_string()
-            )
-        );
+    fn test_split_cache_uri_plain_is_none() {
+        assert_eq!(split_cache_uri("duckdb://memory"), None);
+        assert_eq!(split_cache_uri("odbc://DSN=x"), None);
     }
 
     #[test]
-    fn test_odbc_empty() {
-        let result = parse_connection_string("odbc://");
-        assert!(result.is_err());
+    fn test_split_cache_uri_rejects_multiple_plus() {
+        assert_eq!(split_cache_uri("a+b+c://x"), None);
     }
 
     #[test]
-    fn test_unsupported_scheme() {
-        let result = parse_connection_string("mysql://localhost/db");
-        assert!(result.is_err());
-        assert!(result
-            .unwrap_err()
-            .to_string()
-            .contains("Unsupported connection string"));
+    fn test_split_cache_uri_rejects_empty_parts() {
+        assert_eq!(split_cache_uri("+duckdb://x"), None);
+        assert_eq!(split_cache_uri("odbc+://x"), None);
     }
 }
diff --git a/src/reader/duckdb.rs b/src/reader/duckdb.rs
index bf459c155..5badc5324 100644
--- a/src/reader/duckdb.rs
+++ b/src/reader/duckdb.rs
@@ -2,7 +2,7 @@
 //!
 //! Provides a reader for DuckDB databases with Arrow DataFrame integration.
 
-use crate::reader::{connection::ConnectionInfo, Reader};
+use crate::reader::{CacheBackend, Reader};
 use crate::{naming, DataFrame, GgsqlError, Result};
 use arrow::compute::{cast, concat_batches};
 use arrow::datatypes::{DataType, Field, Schema};
@@ -243,21 +243,25 @@ impl DuckDBReader {
     /// - The database file cannot be opened
     /// - DuckDB initialization fails
     pub fn from_connection_string(uri: &str) -> Result<Self> {
-        let conn_info = super::connection::parse_connection_string(uri)?;
+        let path = uri.strip_prefix("duckdb://").ok_or_else(|| {
+            GgsqlError::ReaderError(format!(
+                "DuckDB URI must start with duckdb://, got '{}'",
+                uri
+            ))
+        })?;
 
-        let conn = match conn_info {
-            ConnectionInfo::DuckDBMemory => Connection::open_in_memory().map_err(|e| {
+        let conn = if path == "memory" {
+            Connection::open_in_memory().map_err(|e| {
                 GgsqlError::ReaderError(format!("Failed to open in-memory DuckDB: {}", e))
-            })?,
-            ConnectionInfo::DuckDBFile(path) => Connection::open(&path).map_err(|e| {
+            })?
+        } else if path.is_empty() {
+            return Err(GgsqlError::ReaderError(
+                "DuckDB file path cannot be empty".to_string(),
+            ));
+        } else {
+            Connection::open(path).map_err(|e| {
                 GgsqlError::ReaderError(format!("Failed to open DuckDB file '{}': {}", path, e))
-            })?,
-            _ => {
-                return Err(GgsqlError::ReaderError(format!(
-                    "Connection string '{}' is not supported by DuckDBReader",
-                    uri
-                )))
-            }
+            })?
         };
 
         // https://github.com/duckdb/duckdb/issues/22133
@@ -349,6 +353,12 @@ fn normalize_arrow_types(batch: RecordBatch) -> Result<RecordBatch> {
         .map_err(|e| GgsqlError::ReaderError(format!("Failed to normalize types: {}", e)))
 }
 
+impl CacheBackend for DuckDBReader {
+    fn new_in_memory() -> Result<Self> {
+        Self::from_connection_string("duckdb://memory")
+    }
+}
+
 impl Reader for DuckDBReader {
     fn execute_sql(&self, sql: &str) -> Result<DataFrame> {
         // Register builtin datasets if referenced
diff --git a/src/reader/mod.rs b/src/reader/mod.rs
index a6609d0c8..db9ad7791 100644
--- a/src/reader/mod.rs
+++ b/src/reader/mod.rs
@@ -429,6 +429,12 @@ pub mod odbc;
 #[cfg(feature = "adbc")]
 pub mod adbc;
 
+#[cfg(any(feature = "duckdb", feature = "sqlite"))]
+pub mod cache;
+
+#[cfg(all(test, feature = "duckdb", feature = "sqlite"))]
+mod cache_equivalence;
+
 pub mod connection;
 pub mod data;
 mod spec;
@@ -445,6 +451,9 @@ pub use odbc::OdbcReader;
 #[cfg(feature = "adbc")]
 pub use adbc::AdbcReader;
 
+#[cfg(any(feature = "duckdb", feature = "sqlite"))]
+pub use cache::CachingReader;
+
 // ============================================================================
 // Shared utilities
 // ============================================================================
@@ -489,6 +498,177 @@ pub(crate) fn returns_rows(sql: &str) -> bool {
     )
 }
 
+/// Shared test helpers for reader equivalence suites.
+#[cfg(test)]
+pub(crate) mod test_support {
+    use super::{
+        execute_with_reader, returns_rows, ColumnInfo, Reader, Spec, SqlDialect, TableInfo,
+    };
+    use crate::{DataFrame, GgsqlError, Result};
+    use std::sync::{Arc, Mutex};
+
+    /// A `Reader` that records every `execute_sql` it receives, delegating
+    /// everything to an inner reader.
+    pub(crate) struct SpyReader {
+        inner: Box<dyn Reader + Send>,
+        log: Arc<Mutex<Vec<String>>>,
+    }
+
+    impl SpyReader {
+        /// Wrap `inner`, returning the boxed spy and a handle to its call log.
+        pub(crate) fn wrap(
+            inner: Box<dyn Reader + Send>,
+        ) -> (Box<dyn Reader + Send>, Arc<Mutex<Vec<String>>>) {
+            let log = Arc::new(Mutex::new(Vec::new()));
+            (
+                Box::new(SpyReader {
+                    inner,
+                    log: log.clone(),
+                }),
+                log,
+            )
+        }
+    }
+
+    impl Reader for SpyReader {
+        fn execute_sql(&self, sql: &str) -> Result<DataFrame> {
+            self.log.lock().unwrap().push(sql.to_string());
+            self.inner.execute_sql(sql)
+        }
+        fn register(&self, name: &str, df: DataFrame, replace: bool) -> Result<()> {
+            self.inner.register(name, df, replace)
+        }
+        fn unregister(&self, name: &str) -> Result<()> {
+            self.inner.unregister(name)
+        }
+        fn execute(&self, query: &str) -> Result<Spec> {
+            execute_with_reader(self, query)
+        }
+        fn dialect(&self) -> &dyn SqlDialect {
+            self.inner.dialect()
+        }
+        fn list_catalogs(&self) -> Result<Vec<String>> {
+            self.inner.list_catalogs()
+        }
+        fn list_schemas(&self, c: &str) -> Result<Vec<String>> {
+            self.inner.list_schemas(c)
+        }
+        fn list_tables(&self, c: &str, s: &str) -> Result<Vec<TableInfo>> {
+            self.inner.list_tables(c, s)
+        }
+        fn list_columns(&self, c: &str, s: &str, t: &str) -> Result<Vec<ColumnInfo>> {
+            self.inner.list_columns(c, s, t)
+        }
+    }
+
+    /// A `Reader` that wraps an inner reader and **refuses every write**: any
+    /// `register`/`unregister` and any non-row-returning `execute_sql`
+    /// (CREATE/INSERT/DROP/…) returns an error.
+    pub(crate) struct ReadOnlyReader {
+        inner: Box<dyn Reader + Send>,
+    }
+
+    impl ReadOnlyReader {
+        pub(crate) fn new(inner: Box<dyn Reader + Send>) -> Self {
+            Self { inner }
+        }
+
+        fn refuse(op: &str) -> GgsqlError {
+            GgsqlError::ReaderError(format!("read-only primary: refused {op}"))
+        }
+    }
+
+    impl Reader for ReadOnlyReader {
+        fn execute_sql(&self, sql: &str) -> Result<DataFrame> {
+            if !returns_rows(sql) {
+                return Err(Self::refuse(&format!("write statement: {sql}")));
+            }
+            self.inner.execute_sql(sql)
+        }
+        fn register(&self, _name: &str, _df: DataFrame, _replace: bool) -> Result<()> {
+            Err(Self::refuse("register"))
+        }
+        fn unregister(&self, _name: &str) -> Result<()> {
+            Err(Self::refuse("unregister"))
+        }
+        fn execute(&self, query: &str) -> Result<Spec> {
+            execute_with_reader(self, query)
+        }
+        fn dialect(&self) -> &dyn SqlDialect {
+            self.inner.dialect()
+        }
+        fn list_catalogs(&self) -> Result<Vec<String>> {
+            self.inner.list_catalogs()
+        }
+        fn list_schemas(&self, c: &str) -> Result<Vec<String>> {
+            self.inner.list_schemas(c)
+        }
+        fn list_tables(&self, c: &str, s: &str) -> Result<Vec<TableInfo>> {
+            self.inner.list_tables(c, s)
+        }
+        fn list_columns(&self, c: &str, s: &str, t: &str) -> Result<Vec<ColumnInfo>> {
+            self.inner.list_columns(c, s, t)
+        }
+    }
+
+    /// Compare two DataFrames by schema (field names + types) and by
+    /// per-column Arrow array contents. We don't use a blanket
+    /// `assert_eq!(df, df)` because `DataFrame` doesn't implement `PartialEq`;
+    /// going through schema + per-column equality is also more diagnostic
+    /// when one of them diverges.
+    #[cfg(feature = "adbc")]
+    pub(crate) fn assert_dataframes_equal(a: &DataFrame, b: &DataFrame, ctx: &str) {
+        let a_schema = a.schema();
+        let b_schema = b.schema();
+        assert_eq!(
+            a_schema.fields().len(),
+            b_schema.fields().len(),
+            "{ctx}: column count mismatch (a={}, b={})",
+            a_schema.fields().len(),
+            b_schema.fields().len(),
+        );
+        for (i, (af, bf)) in a_schema
+            .fields()
+            .iter()
+            .zip(b_schema.fields().iter())
+            .enumerate()
+        {
+            assert_eq!(
+                af.name(),
+                bf.name(),
+                "{ctx}: column {i} name mismatch (a='{}', b='{}')",
+                af.name(),
+                bf.name(),
+            );
+            assert_eq!(
+                af.data_type(),
+                bf.data_type(),
+                "{ctx}: column '{}' type mismatch (a={:?}, b={:?})",
+                af.name(),
+                af.data_type(),
+                bf.data_type(),
+            );
+        }
+        assert_eq!(
+            a.height(),
+            b.height(),
+            "{ctx}: row count mismatch (a={}, b={})",
+            a.height(),
+            b.height(),
+        );
+        for field in a_schema.fields() {
+            let ac = a.column(field.name()).unwrap();
+            let bc = b.column(field.name()).unwrap();
+            assert_eq!(
+                ac.as_ref(),
+                bc.as_ref(),
+                "{ctx}: column '{}' data mismatch",
+                field.name(),
+            );
+        }
+    }
+}
+
 // ============================================================================
 // Spec - Result of reader.execute()
 // ============================================================================
@@ -642,6 +822,27 @@ pub trait Reader {
         &AnsiDialect
     }
 
+    /// Materialize the result of `body_sql` as a temporary table named `name`.
+    fn materialize_table(
+        &self,
+        name: &str,
+        column_aliases: &[String],
+        body_sql: &str,
+    ) -> Result<()> {
+        for stmt in self
+            .dialect()
+            .create_or_replace_temp_table_sql(name, column_aliases, body_sql)
+        {
+            self.execute_sql(&stmt)?;
+        }
+        Ok(())
+    }
+
+    /// Whether this reader stages external data sources into a separate cache.
+    fn caches_sources(&self) -> bool {
+        false
+    }
+
     // =========================================================================
     // Schema introspection
     // =========================================================================
@@ -721,6 +922,16 @@ pub trait Reader {
     }
 }
 
+/// A reader that can serve as an in-memory, writable caching backend.
+///
+/// Cache backends take no options: they are always a fresh in-memory, writable
+/// database scoped to the process; consumed by [`CachingReader`].
+pub trait CacheBackend: Reader {
+    fn new_in_memory() -> Result<Self>
+    where
+        Self: Sized;
+}
+
 /// A table or view in the schema.
 pub struct TableInfo {
     pub name: String,
diff --git a/src/reader/sqlite.rs b/src/reader/sqlite.rs
index 8f1e1f9f6..245c65a96 100644
--- a/src/reader/sqlite.rs
+++ b/src/reader/sqlite.rs
@@ -3,7 +3,7 @@
 //! Provides a reader for SQLite databases with Arrow DataFrame integration.
 //! Works on both native targets and wasm32-unknown-unknown (via sqlite-wasm-rs).
 
-use crate::reader::Reader;
+use crate::reader::{CacheBackend, Reader};
 use crate::{naming, DataFrame, GgsqlError, Result};
 use arrow::array::*;
 use arrow::datatypes::{DataType, TimeUnit};
@@ -161,21 +161,31 @@ impl SqliteReader {
     }
 
     /// Create a SQLite reader from a connection string
+    ///
+    /// `sqlite://memory` (or `sqlite://:memory:`) opens an in-memory database;
+    /// any other path opens that file.
     pub fn from_connection_string(uri: &str) -> Result<Self> {
-        let conn_info = super::connection::parse_connection_string(uri)?;
+        let path = uri.strip_prefix("sqlite://").ok_or_else(|| {
+            GgsqlError::ReaderError(format!(
+                "SQLite URI must start with sqlite://, got '{}'",
+                uri
+            ))
+        })?;
 
-        let conn = match conn_info {
-            super::connection::ConnectionInfo::SQLite(path) => {
-                Connection::open(&path).map_err(|e| {
-                    GgsqlError::ReaderError(format!("Failed to open SQLite file '{}': {}", path, e))
-                })?
-            }
-            _ => {
-                return Err(GgsqlError::ReaderError(format!(
-                    "Connection string '{}' is not supported by SqliteReader",
-                    uri
-                )))
-            }
+        if path.is_empty() {
+            return Err(GgsqlError::ReaderError(
+                "SQLite file path cannot be empty".to_string(),
+            ));
+        }
+
+        let conn = if path == "memory" || path == ":memory:" {
+            Connection::open_in_memory().map_err(|e| {
+                GgsqlError::ReaderError(format!("Failed to open in-memory SQLite: {}", e))
+            })?
+        } else {
+            Connection::open(path).map_err(|e| {
+                GgsqlError::ReaderError(format!("Failed to open SQLite file '{}': {}", path, e))
+            })?
         };
 
         #[cfg(feature = "spatial")]
@@ -366,6 +376,12 @@ fn to_sql_value(v: &dyn rusqlite::types::ToSql) -> Option<rusqlite::types::Value
     }
 }
 
+impl CacheBackend for SqliteReader {
+    fn new_in_memory() -> Result<Self> {
+        Self::new()
+    }
+}
+
 impl Reader for SqliteReader {
     fn execute_sql(&self, sql: &str) -> Result<DataFrame> {
         // Handle ggsql:name namespaced identifiers (builtin datasets)

From 6bf435e0ed90cd94cc9b2678672d151d3d8995d1 Mon Sep 17 00:00:00 2001
From: George Stagg <george.stagg@posit.co>
Date: Mon, 29 Jun 2026 15:02:19 +0100
Subject: [PATCH 02/15] Explicit compute/source split for caching layer

---
 src/CLAUDE.md                   |   2 +-
 src/execute/mod.rs              |   9 +-
 src/reader/cache.rs             | 240 ++++----------------------------
 src/reader/cache_equivalence.rs |  32 ++++-
 src/reader/mod.rs               |  26 +++-
 5 files changed, 88 insertions(+), 221 deletions(-)

diff --git a/src/CLAUDE.md b/src/CLAUDE.md
index 8047c38a7..dd07af6e1 100644
--- a/src/CLAUDE.md
+++ b/src/CLAUDE.md
@@ -52,7 +52,7 @@ Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding
 
 `SqlDialect` trait in `mod.rs` lets each driver supply its own type names, information-schema queries, and spatial helper methods (`sql_st_transform`, `sql_geometry_to_wkb`, `sql_geometry_bbox`, `sql_ensure_geometry`, `sql_select_replace`, `sql_spatial_setup`).
 
-**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`. It routes each `execute_sql`: cache-resident names and `ggsql:` builtins go to the cache, base reads go to the primary (and are memoized). The `Reader::materialize_table` seam (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to run the read and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's extra materialization of explicit per-layer sources. `dialect()` returns the **cache** dialect because every executor-generated query targets a cache-resident table. Selected via the composite `<primary>+<cache>://` connection scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default.
+**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache. **`execute_sql_primary` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins and references to already-cached internal tables, which go to the cache. The executor (`prepare_data_with_reader`) decides per call site, passing an `execute_query` closure to derived sites and `execute_query_primary` to the setup/side-effect loops — so the routing is explicit, not guessed. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's extra materialization of explicit per-layer sources. `dialect()` returns the **cache** dialect. Selected via the composite `<primary>+<cache>://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default.
 
 ### `execute/`
 
diff --git a/src/execute/mod.rs b/src/execute/mod.rs
index cf205c505..a041aef48 100644
--- a/src/execute/mod.rs
+++ b/src/execute/mod.rs
@@ -1098,7 +1098,12 @@ pub struct PreparedData {
 /// * `query` - The full ggsql query string
 /// * `reader` - A Reader implementation for executing SQL
 pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result<PreparedData> {
+    // Two execution surfaces: `execute_query` is the COMPUTE surface for
+    // derived/dialect-generated SQL over internal `__ggsql_*` tables;
+    // `execute_query_primary` is the SOURCE surface for base reads of the
+    // user's data and user setup/DML.
     let execute_query = |sql: &str| reader.execute_sql(sql);
+    let execute_query_primary = |sql: &str| reader.execute_sql_primary(sql);
     let dialect = reader.dialect();
 
     // Parse once and create SourceTree
@@ -1129,7 +1134,7 @@ pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result<Prep
     // Structured DML (CREATE, INSERT, UPDATE, DELETE) is handled separately as
     // side-effects in cte::transform_global_sql.
     for stmt in source_tree.find_texts(&root, "(sql_statement (other_sql_statement) @stmt)") {
-        execute_query(&stmt)?;
+        execute_query_primary(&stmt)?;
     }
 
     // Extract CTE definitions from the source tree (in declaration order)
@@ -1152,7 +1157,7 @@ pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result<Prep
         let (side_effects, query) = cte::transform_global_sql(&source_tree, &materialized_ctes);
 
         for stmt in &side_effects {
-            execute_query(stmt)?;
+            execute_query_primary(stmt)?;
         }
 
         if let Some(query) = query {
diff --git a/src/reader/cache.rs b/src/reader/cache.rs
index 015022282..fdeb47233 100644
--- a/src/reader/cache.rs
+++ b/src/reader/cache.rs
@@ -1,30 +1,27 @@
-//! Caching reader: any primary [`Reader`] + an in-memory writable cache.
+//! Caching reader: any primary [`Reader`] + an in-memory writeable cache.
 //!
-//! [`CachingReader`] wraps a `primary` reader and a `cache` backend (an
-//! in-memory [`CacheBackend`]). It sits between the primary reader and the
-//! rest of ggsql:
+//! [`CachingReader`] wraps a `primary` reader and an in-memory `cache` backend,
+//! splitting work across two surfaces:
 //!
-//! - [`Reader::register`] writes to the cache and records the name.
-//! - [`Reader::materialize_table`] runs the body and `register`s the resulting
-//!   DataFrame into the cache, so the primary is only ever read from — base
-//!   reads happen via passthrough SQL, derived tables live in the cache.
-//! - [`Reader::execute_sql`] routes each statement: queries referencing a
-//!   cache-resident name (or a `ggsql:` builtin) run on the cache; base reads
-//!   run on the primary and are memoized into the cache.
-//! - [`Reader::dialect`] returns the **cache** dialect: every dialect-specific
-//!   query the executor builds targets a cache-resident table.
+//! - **Compute** ([`Reader::execute_sql`]) runs on the cache: all derived,
+//!   dialect-generated SQL operates on the `__ggsql_*` tables.
+//! - **Source** ([`Reader::execute_sql_primary`]) reads the primary: base reads of
+//!   the user's data, plus user setup/DML.
+//! - [`Reader::materialize_table`] reads a body via the source surface and
+//!   `register`s the result into the cache.
+//! - [`Reader::dialect`] returns the cache dialect.
 
 use crate::reader::{execute_with_reader, ColumnInfo, Reader, Spec, SqlDialect, TableInfo};
 use crate::{naming, DataFrame, Result};
 use std::cell::{Cell, RefCell};
-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;
 
 pub struct CachingReader {
+    /// Primary backend — the real data source.
     primary: Box<dyn Reader + Send>,
+    /// In-memory writeable cache: derived tables, registered data, memoized reads.
     cache: Box<dyn Reader + Send>,
-    /// Names that currently live in the cache (registered tables / memo results).
-    cached_names: RefCell<HashSet<String>>,
-    /// Memo of base primary reads: SQL text -> cache table name holding the result.
+    /// Memo of base primary reads: source SQL text -> cache table holding the result.
     result_cache: RefCell<HashMap<String, String>>,
     /// Monotonic counter for unique memo table names.
     next_id: Cell<u64>,
@@ -37,38 +34,24 @@ impl CachingReader {
         Self {
             primary,
             cache,
-            cached_names: RefCell::new(HashSet::new()),
             result_cache: RefCell::new(HashMap::new()),
             next_id: Cell::new(0),
         }
     }
-
-    /// Whether `sql` is one of the cache dialect's spatial setup statements.
-    ///
-    /// ggsql-generated spatial setup (`INSTALL`/`LOAD spatial`, …) must run on
-    /// the cache, where the spatial SQL executes — not on the primary.
-    fn is_cache_spatial_setup(&self, sql: &str) -> bool {
-        self.cache
-            .dialect()
-            .sql_spatial_setup()
-            .iter()
-            .any(|stmt| stmt == sql)
-    }
 }
 
 impl Reader for CachingReader {
+    /// Compute surface: all derived/dialect-generated SQL runs on the cache.
     fn execute_sql(&self, sql: &str) -> Result<DataFrame> {
-        // Cache-resident names and `ggsql:` builtins are served by the cache.
-        if references_cached_name(sql, &self.cached_names.borrow()) || sql.contains("ggsql:") {
-            return self.cache.execute_sql(sql);
-        }
+        self.cache.execute_sql(sql)
+    }
 
-        // ggsql-generated spatial setup targets the cache (where spatial SQL runs).
-        if self.is_cache_spatial_setup(sql) {
+    /// Source surface: base reads of the user's data (plus user setup/DML).
+    fn execute_sql_primary(&self, sql: &str) -> Result<DataFrame> {
+        if sql.contains("ggsql:") || sql.contains("__ggsql_") {
             return self.cache.execute_sql(sql);
         }
 
-        // Base read against the primary, with result memoization.
         if let Some(table) = self.result_cache.borrow().get(sql) {
             return self
                 .cache
@@ -82,7 +65,6 @@ impl Reader for CachingReader {
             self.next_id.set(id + 1);
             let table = naming::cache_result_table(id);
             self.cache.register(&table, df.clone(), true)?;
-            self.cached_names.borrow_mut().insert(table.clone());
             self.result_cache
                 .borrow_mut()
                 .insert(sql.to_string(), table);
@@ -92,15 +74,11 @@ impl Reader for CachingReader {
     }
 
     fn register(&self, name: &str, df: DataFrame, replace: bool) -> Result<()> {
-        self.cache.register(name, df, replace)?;
-        self.cached_names.borrow_mut().insert(name.to_string());
-        Ok(())
+        self.cache.register(name, df, replace)
     }
 
     fn unregister(&self, name: &str) -> Result<()> {
-        self.cache.unregister(name)?;
-        self.cached_names.borrow_mut().remove(name);
-        Ok(())
+        self.cache.unregister(name)
     }
 
     fn execute(&self, query: &str) -> Result<Spec> {
@@ -118,8 +96,10 @@ impl Reader for CachingReader {
         column_aliases: &[String],
         body_sql: &str,
     ) -> Result<()> {
+        // Read the body via the source surface, then register the result
+        // into the cache.
         let body = super::wrap_with_column_aliases(body_sql, column_aliases);
-        let df = self.execute_sql(&body)?;
+        let df = self.execute_sql_primary(&body)?;
         self.register(name, df, true)
     }
 
@@ -146,142 +126,6 @@ impl Reader for CachingReader {
     }
 }
 
-/// Check whether `sql` references any name in `cached_names` as a SQL
-/// identifier (not part of a longer identifier, not inside a single-quoted
-/// string literal).
-///
-/// Matches a bare reference (`FROM orders`), a double-quoted identifier
-/// (`FROM "orders"`), or a qualified reference (`FROM catalog.schema.orders`).
-/// Does not match substrings of longer identifiers (`orders_detail`) or
-/// string-literal contents (`'orders of magnitude'`). This is a lightweight
-/// scanner, not a full SQL parser: a cached name appearing only inside a
-/// comment is misrouted to the cache and fails with a clear error rather than
-/// silently hitting the primary.
-fn references_cached_name(sql: &str, cached_names: &HashSet<String>) -> bool {
-    cached_names
-        .iter()
-        .any(|name| sql_references_identifier(sql, name))
-}
-
-fn sql_references_identifier(sql: &str, name: &str) -> bool {
-    let bytes = sql.as_bytes();
-    let name_bytes = name.as_bytes();
-    let n = name_bytes.len();
-    if n == 0 {
-        return false;
-    }
-    let mut i = 0;
-    while i + n <= bytes.len() {
-        if &bytes[i..i + n] == name_bytes {
-            let before_ok = i == 0 || !is_identifier_byte(bytes[i - 1]);
-            let after_ok = i + n == bytes.len() || !is_identifier_byte(bytes[i + n]);
-            if before_ok && after_ok && !is_inside_string_literal(bytes, i) {
-                return true;
-            }
-        }
-        i += 1;
-    }
-    false
-}
-
-fn is_identifier_byte(b: u8) -> bool {
-    b.is_ascii_alphanumeric() || b == b'_'
-}
-
-/// Walk from start to `pos` tracking whether we're inside a single-quoted
-/// string literal. SQL-standard doubled-single-quote (`''`) is an escape that
-/// keeps us inside the literal.
-fn is_inside_string_literal(bytes: &[u8], pos: usize) -> bool {
-    let mut inside = false;
-    let mut i = 0;
-    while i < pos && i < bytes.len() {
-        if bytes[i] == b'\'' {
-            if inside && i + 1 < bytes.len() && bytes[i + 1] == b'\'' {
-                i += 2;
-                continue;
-            }
-            inside = !inside;
-        }
-        i += 1;
-    }
-    inside
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_references_cached_name_empty_set() {
-        let set = HashSet::new();
-        assert!(!references_cached_name("SELECT * FROM foo", &set));
-    }
-
-    #[test]
-    fn test_references_cached_name_match() {
-        let mut set = HashSet::new();
-        set.insert("__ggsql_global_abc123__".to_string());
-        assert!(references_cached_name(
-            "SELECT * FROM __ggsql_global_abc123__ WHERE x > 1",
-            &set
-        ));
-    }
-
-    #[test]
-    fn test_references_cached_name_rejects_longer_identifier() {
-        let mut set = HashSet::new();
-        set.insert("orders".to_string());
-        assert!(!references_cached_name(
-            "SELECT * FROM orders_detail WHERE x > 1",
-            &set
-        ));
-    }
-
-    #[test]
-    fn test_references_cached_name_rejects_prefix_of_longer_identifier() {
-        let mut set = HashSet::new();
-        set.insert("col".to_string());
-        assert!(!references_cached_name("SELECT col_id FROM users", &set));
-    }
-
-    #[test]
-    fn test_references_cached_name_rejects_inside_string_literal() {
-        let mut set = HashSet::new();
-        set.insert("orders".to_string());
-        assert!(!references_cached_name(
-            "SELECT 'orders of magnitude' AS label",
-            &set
-        ));
-    }
-
-    #[test]
-    fn test_references_cached_name_matches_quoted_identifier() {
-        let mut set = HashSet::new();
-        set.insert("orders".to_string());
-        assert!(references_cached_name(r#"SELECT * FROM "orders""#, &set));
-    }
-
-    #[test]
-    fn test_references_cached_name_matches_qualified_reference() {
-        let mut set = HashSet::new();
-        set.insert("orders".to_string());
-        assert!(references_cached_name(
-            "SELECT * FROM catalog.schema.orders WHERE x > 1",
-            &set
-        ));
-    }
-
-    #[test]
-    fn test_references_cached_name_handles_escaped_quotes_in_literal() {
-        let mut set = HashSet::new();
-        set.insert("orders".to_string());
-        assert!(references_cached_name(
-            "SELECT 'it''s fine' FROM orders",
-            &set
-        ));
-    }
-}
-
 #[cfg(all(test, feature = "duckdb"))]
 mod behavior_tests {
     use super::*;
@@ -299,15 +143,16 @@ mod behavior_tests {
         reader
             .register("t", df! { "x" => vec![1_i64, 2, 3] }.unwrap(), true)
             .unwrap();
+        // register writes to the cache; the compute surface reads it back.
         let out = reader.execute_sql("SELECT COUNT(*) AS n FROM t").unwrap();
 
         assert_eq!(as_i64(out.column("n").unwrap()).unwrap().value(0), 3);
-        // The query routed to the cache; the primary was never touched.
+        // The primary was never touched.
         assert!(log.lock().unwrap().is_empty());
     }
 
     #[test]
-    fn test_base_read_routes_to_primary_and_memoizes() {
+    fn test_source_read_hits_primary_and_memoizes() {
         let inner = DuckDBReader::new_in_memory().unwrap();
         inner
             .register("base", df! { "y" => vec![1_i64, 2, 3] }.unwrap(), true)
@@ -317,8 +162,8 @@ mod behavior_tests {
         let reader = CachingReader::new(primary, cache);
 
         let q = "SELECT y FROM base ORDER BY y";
-        let d1 = reader.execute_sql(q).unwrap();
-        let d2 = reader.execute_sql(q).unwrap();
+        let d1 = reader.execute_sql_primary(q).unwrap();
+        let d2 = reader.execute_sql_primary(q).unwrap();
         assert_eq!(d1.height(), 3);
         assert_eq!(d2.height(), 3);
 
@@ -438,31 +283,6 @@ mod behavior_tests {
         );
     }
 
-    #[cfg(feature = "spatial")]
-    #[test]
-    fn test_spatial_setup_routes_to_cache() {
-        let (primary, log) = SpyReader::wrap(Box::new(DuckDBReader::new_in_memory().unwrap()));
-        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
-        let reader = CachingReader::new(primary, cache);
-
-        // ggsql-generated spatial setup must run on the cache (where the spatial
-        // SQL executes), never on the primary.
-        let setup = reader.dialect().sql_spatial_setup();
-        assert!(!setup.is_empty(), "DuckDB cache should emit spatial setup");
-        for stmt in &setup {
-            // Ignore execution result (may require the extension); we only assert routing.
-            let _ = reader.execute_sql(stmt);
-        }
-        let log = log.lock().unwrap();
-        for stmt in &setup {
-            assert!(
-                !log.contains(stmt),
-                "spatial setup must route to the cache, not the primary: {}",
-                stmt
-            );
-        }
-    }
-
     #[cfg(feature = "sqlite")]
     #[test]
     fn test_dialect_returns_cache_dialect() {
diff --git a/src/reader/cache_equivalence.rs b/src/reader/cache_equivalence.rs
index 6b8f808d2..c490919b0 100644
--- a/src/reader/cache_equivalence.rs
+++ b/src/reader/cache_equivalence.rs
@@ -228,6 +228,33 @@ fn factory_builds_caching_readers() {
     }
 }
 
+/// Map projections run entirely on the cache.
+#[cfg(all(feature = "spatial", feature = "builtin-data"))]
+#[test]
+fn map_projection_runs_on_cache_not_primary() {
+    let (primary, log) = SpyReader::wrap(Box::new(DuckDBReader::new_in_memory().unwrap()));
+    let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+    let reader = CachingReader::new(primary, cache);
+
+    let spec = reader.execute("VISUALISE FROM ggsql:world DRAW spatial PROJECT TO orthographic");
+    assert!(
+        spec.is_ok(),
+        "map projection via cache failed: {:?}",
+        spec.err()
+    );
+
+    // No dialect-specific spatial SQL, temp-table DDL, or `__ggsql_*` reference
+    // ever reached the primary.
+    let log = log.lock().unwrap();
+    for stmt in log.iter() {
+        let upper = stmt.to_uppercase();
+        assert!(
+            !upper.contains("ST_") && !upper.contains("TEMP TABLE") && !stmt.contains("__ggsql_"),
+            "derived spatial SQL leaked to the primary: {stmt}"
+        );
+    }
+}
+
 /// A real external ADBC SQLite primary + DuckDB cache, compared against a bare ADBC reader.
 /// `#[ignore]` — requires `dbc install sqlite`.
 #[cfg(feature = "adbc")]
@@ -318,12 +345,13 @@ mod adbc_mode {
         let cache = DuckDBReader::new_in_memory().unwrap();
         let cached = CachingReader::new(Box::new(primary), Box::new(cache));
 
-        let miss = cached.execute_sql(sql).unwrap();
+        // Base reads go through the source surface; the cache memoizes them.
+        let miss = cached.execute_sql_primary(sql).unwrap();
         assert_dataframes_equal(&golden, &miss, "adbc cache miss");
         let after_miss = calls.load(Ordering::SeqCst);
         assert!(after_miss >= 1, "miss should reach the ADBC primary");
 
-        let hit = cached.execute_sql(sql).unwrap();
+        let hit = cached.execute_sql_primary(sql).unwrap();
         assert_dataframes_equal(&golden, &hit, "adbc cache hit");
         let after_hit = calls.load(Ordering::SeqCst);
         assert_eq!(
diff --git a/src/reader/mod.rs b/src/reader/mod.rs
index db9ad7791..2eef03bea 100644
--- a/src/reader/mod.rs
+++ b/src/reader/mod.rs
@@ -724,15 +724,27 @@ pub struct Metadata {
 /// let result = reader.execute_sql("SELECT * FROM sales WHERE amount > 100")?;
 /// ```
 pub trait Reader {
-    /// Execute a SQL query and return the result as a DataFrame
+    /// Execute a SQL query and return the result as a DataFrame.
     ///
-    /// # Arguments
+    /// This is the **compute surface**: ggsql runs all dialect-generated/derived
+    /// SQL here. A plain reader runs it on its own connection; a
+    /// caching reader runs it on the in-memory cache, where all derived
+    /// `__ggsql_*` tables live.
     ///
-    /// * `sql` - The SQL query to execute
+    /// # Errors
     ///
-    /// # Returns
+    /// Returns `GgsqlError::ReaderError` if:
+    /// - The SQL is invalid
+    /// - The connection fails
+    /// - The table or columns don't exist
+    fn execute_sql(&self, sql: &str) -> Result<DataFrame>;
+
+    /// Execute SQL against the *source* surface — base reads of the user's data
+    /// plus user setup/DML.
     ///
-    /// A Polars DataFrame containing the query results
+    /// Defaults to the compute surface ([`Reader::execute_sql`]), so a plain
+    /// reader runs everything on one connection. A caching reader overrides this
+    /// to read the primary (with result memoization).
     ///
     /// # Errors
     ///
@@ -740,7 +752,9 @@ pub trait Reader {
     /// - The SQL is invalid
     /// - The connection fails
     /// - The table or columns don't exist
-    fn execute_sql(&self, sql: &str) -> Result<DataFrame>;
+    fn execute_sql_primary(&self, sql: &str) -> Result<DataFrame> {
+        self.execute_sql(sql)
+    }
 
     /// Register a DataFrame as a queryable table (takes ownership)
     ///

From 5a435999713ec23b39dab51b443cf0d939475151 Mon Sep 17 00:00:00 2001
From: George Stagg <george.stagg@posit.co>
Date: Mon, 29 Jun 2026 15:03:53 +0100
Subject: [PATCH 03/15] Update changelog

---
 CHANGELOG.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b9eb1a45c..327a651f7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 ## [Unreleased]
 
+### Added
+
+- New caching layer that wraps any `Reader` with an in-memory, writeable cache
+  backend (currently duckdb or sqlite), making write-constrained databases
+  usable and avoiding repeated remote reads during interactive iteration.
+
 ## 0.4.1 - 2026-06-22
 
 ### Changed

From 589f179d32169061682726f675ab0e390135998c Mon Sep 17 00:00:00 2001
From: George Stagg <george.stagg@posit.co>
Date: Mon, 29 Jun 2026 16:16:44 +0100
Subject: [PATCH 04/15] Apply changes from code review

---
 ggsql-cli/src/main.rs         |   2 +-
 ggsql-jupyter/src/executor.rs |   4 +-
 src/CLAUDE.md                 |   2 +-
 src/execute/mod.rs            |   8 +-
 src/reader/cache.rs           | 165 +++++++++++++++++++++++++++++++++-
 5 files changed, 172 insertions(+), 9 deletions(-)

diff --git a/ggsql-cli/src/main.rs b/ggsql-cli/src/main.rs
index 0a004fbce..413dae6b4 100644
--- a/ggsql-cli/src/main.rs
+++ b/ggsql-cli/src/main.rs
@@ -446,7 +446,7 @@ fn print_table_fallback<R: Reader + ?Sized>(query: &str, reader: &R, max_rows: u
 
     let sql_part = source_tree.extract_sql().unwrap_or_default();
 
-    let data = reader.execute_sql(&sql_part);
+    let data = reader.execute_sql_primary(&sql_part);
     if let Err(e) = data {
         eprintln!("Failed to execute SQL query: {}", e);
         std::process::exit(1)
diff --git a/ggsql-jupyter/src/executor.rs b/ggsql-jupyter/src/executor.rs
index fb9d51bfb..668d5f3dd 100644
--- a/ggsql-jupyter/src/executor.rs
+++ b/ggsql-jupyter/src/executor.rs
@@ -177,8 +177,8 @@ impl QueryExecutor {
 
         // 2. Check if there's a visualization
         if !validated.has_visual() {
-            // Pure SQL query - execute directly and return DataFrame
-            let df = self.reader.execute_sql(code)?;
+            // Pure SQL query - execute directly and return DataFrame.
+            let df = self.reader.execute_sql_primary(code)?;
             tracing::info!(
                 "Pure SQL executed: {} rows, {} cols",
                 df.height(),
diff --git a/src/CLAUDE.md b/src/CLAUDE.md
index dd07af6e1..d0a12fba0 100644
--- a/src/CLAUDE.md
+++ b/src/CLAUDE.md
@@ -52,7 +52,7 @@ Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding
 
 `SqlDialect` trait in `mod.rs` lets each driver supply its own type names, information-schema queries, and spatial helper methods (`sql_st_transform`, `sql_geometry_to_wkb`, `sql_geometry_bbox`, `sql_ensure_geometry`, `sql_select_replace`, `sql_spatial_setup`).
 
-**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache. **`execute_sql_primary` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins and references to already-cached internal tables, which go to the cache. The executor (`prepare_data_with_reader`) decides per call site, passing an `execute_query` closure to derived sites and `execute_query_primary` to the setup/side-effect loops — so the routing is explicit, not guessed. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's extra materialization of explicit per-layer sources. `dialect()` returns the **cache** dialect. Selected via the composite `<primary>+<cache>://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default.
+**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache. **`execute_sql_primary` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins and reads that reference a cache-resident internal table, which go to the cache. Cache routing is by **membership** in the set of tables actually registered into the cache (the `resident` set: internal `__ggsql_*` tables plus user-registered tables). A non-row-returning source statement (DML/DDL on the primary) clears the read memo, so later identical reads don't return stale data. Pure/non-visual SQL (CLI table fallback, Jupyter) also goes through `execute_sql_primary` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `<primary>+<cache>://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default.
 
 ### `execute/`
 
diff --git a/src/execute/mod.rs b/src/execute/mod.rs
index a041aef48..acec6e512 100644
--- a/src/execute/mod.rs
+++ b/src/execute/mod.rs
@@ -1211,7 +1211,13 @@ pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result<Prep
                 Some(t) => t.clone(),
                 None => {
                     let t = naming::layer_source_table(materialized_sources.len());
-                    reader.materialize_table(&t, &[], &source_query)?;
+                    if matches!(layer.source, Some(crate::DataSource::FilePath(_))) {
+                        // The cache backend reads local files
+                        let df = reader.execute_sql(&source_query)?;
+                        reader.register(&t, df, true)?;
+                    } else {
+                        reader.materialize_table(&t, &[], &source_query)?;
+                    }
                     materialized_sources.insert(source_query, t.clone());
                     t
                 }
diff --git a/src/reader/cache.rs b/src/reader/cache.rs
index fdeb47233..e57393acf 100644
--- a/src/reader/cache.rs
+++ b/src/reader/cache.rs
@@ -14,7 +14,7 @@
 use crate::reader::{execute_with_reader, ColumnInfo, Reader, Spec, SqlDialect, TableInfo};
 use crate::{naming, DataFrame, Result};
 use std::cell::{Cell, RefCell};
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 
 pub struct CachingReader {
     /// Primary backend — the real data source.
@@ -25,6 +25,9 @@ pub struct CachingReader {
     result_cache: RefCell<HashMap<String, String>>,
     /// Monotonic counter for unique memo table names.
     next_id: Cell<u64>,
+    /// Names registered into the cache. A source read that references one
+    /// is routed to the cache rather than the primary.
+    resident: RefCell<HashSet<String>>,
 }
 
 impl CachingReader {
@@ -36,6 +39,7 @@ impl CachingReader {
             cache,
             result_cache: RefCell::new(HashMap::new()),
             next_id: Cell::new(0),
+            resident: RefCell::new(HashSet::new()),
         }
     }
 }
@@ -48,7 +52,12 @@ impl Reader for CachingReader {
 
     /// Source surface: base reads of the user's data (plus user setup/DML).
     fn execute_sql_primary(&self, sql: &str) -> Result<DataFrame> {
-        if sql.contains("ggsql:") || sql.contains("__ggsql_") {
+        // Route to the cache when the read targets cache-resident objects
+        let references_cache_table = {
+            let resident = self.resident.borrow();
+            resident.iter().any(|t| sql.contains(t.as_str()))
+        };
+        if sql.contains("ggsql:") || references_cache_table {
             return self.cache.execute_sql(sql);
         }
 
@@ -68,17 +77,25 @@ impl Reader for CachingReader {
             self.result_cache
                 .borrow_mut()
                 .insert(sql.to_string(), table);
+        } else {
+            // A source-side write or DDL on the primary can change the data
+            // behind a memoized read; drop the memo.
+            self.result_cache.borrow_mut().clear();
         }
 
         Ok(df)
     }
 
     fn register(&self, name: &str, df: DataFrame, replace: bool) -> Result<()> {
-        self.cache.register(name, df, replace)
+        self.cache.register(name, df, replace)?;
+        self.resident.borrow_mut().insert(name.to_string());
+        Ok(())
     }
 
     fn unregister(&self, name: &str) -> Result<()> {
-        self.cache.unregister(name)
+        self.cache.unregister(name)?;
+        self.resident.borrow_mut().remove(name);
+        Ok(())
     }
 
     fn execute(&self, query: &str) -> Result<Spec> {
@@ -321,4 +338,144 @@ mod behavior_tests {
             spec.err()
         );
     }
+
+    #[test]
+    fn test_aliased_cte_reading_primary_routes_to_primary() {
+        // A column-aliased CTE whose body reads a primary-only table must run on
+        // the primary. The `__ggsql_aliased__` column-alias wrapper must not
+        // misroute the read to the (empty) cache.
+        let base = DuckDBReader::new_in_memory().unwrap();
+        base.register("base", df! { "v" => vec![1_i64, 2, 3] }.unwrap(), true)
+            .unwrap();
+        let primary = Box::new(ReadOnlyReader::new(Box::new(base)));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache);
+
+        let spec = reader.execute(
+            "WITH t(a) AS (SELECT v FROM base) SELECT a FROM t VISUALISE a AS x DRAW point",
+        );
+        assert!(
+            spec.is_ok(),
+            "aliased CTE over a primary table should succeed: {:?}",
+            spec.err()
+        );
+    }
+
+    #[test]
+    fn test_aliased_cte_referencing_prior_cte_routes_to_cache() {
+        // A column-aliased CTE that references a *prior* CTE reads a table that
+        // lives in the cache, so its body must route to the cache, while the
+        // first CTE still reads the primary.
+        let base = DuckDBReader::new_in_memory().unwrap();
+        base.register("base", df! { "v" => vec![1_i64, 2, 3] }.unwrap(), true)
+            .unwrap();
+        let primary = Box::new(ReadOnlyReader::new(Box::new(base)));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache);
+
+        let spec = reader.execute(
+            "WITH a(p) AS (SELECT v FROM base), b(q) AS (SELECT p FROM a) \
+             SELECT q FROM b VISUALISE q AS x DRAW point",
+        );
+        assert!(
+            spec.is_ok(),
+            "dependent aliased CTE should succeed: {:?}",
+            spec.err()
+        );
+    }
+
+    #[test]
+    fn test_source_write_invalidates_memo() {
+        // Memoize a base read, mutate the primary, then re-read: the memo must be
+        // invalidated by the non-row-returning statement so the second read is fresh.
+        let primary = DuckDBReader::new_in_memory().unwrap();
+        primary
+            .register("t", df! { "v" => vec![1_i64, 2, 3] }.unwrap(), true)
+            .unwrap();
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(Box::new(primary), cache);
+
+        let q = "SELECT v FROM t";
+        let d1 = reader.execute_sql_primary(q).unwrap();
+        assert_eq!(d1.height(), 3);
+
+        reader
+            .execute_sql_primary("INSERT INTO t VALUES (4)")
+            .unwrap();
+
+        let d2 = reader.execute_sql_primary(q).unwrap();
+        assert_eq!(
+            d2.height(),
+            4,
+            "re-read must see the inserted row, not the memo"
+        );
+    }
+
+    #[test]
+    fn test_pure_sql_reads_primary_not_cache() {
+        // The pure-SQL display path uses `execute_sql_primary`, which reads the
+        // primary; `execute_sql` would hit the empty cache and fail.
+        let primary = DuckDBReader::new_in_memory().unwrap();
+        primary
+            .register("t", df! { "v" => vec![1_i64, 2, 3] }.unwrap(), true)
+            .unwrap();
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(Box::new(primary), cache);
+
+        let df = reader.execute_sql_primary("SELECT v FROM t").unwrap();
+        assert_eq!(df.height(), 3);
+        assert!(
+            reader.execute_sql("SELECT v FROM t").is_err(),
+            "compute surface should not find the primary-only table"
+        );
+    }
+
+    #[test]
+    fn test_cache_resident_table_as_layer_source() {
+        // A table registered directly on the CachingReader lives only in the
+        // cache.
+        let primary = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache);
+        reader
+            .register(
+                "only_in_cache",
+                df! { "val" => vec![1.0_f64, 2.0, 2.0, 3.0, 3.0, 3.0, 9.0] }.unwrap(),
+                true,
+            )
+            .unwrap();
+
+        let spec = reader.execute("VISUALISE x DRAW histogram MAPPING val AS x FROM only_in_cache");
+        assert!(
+            spec.is_ok(),
+            "cache-resident layer source should succeed: {:?}",
+            spec.err()
+        );
+    }
+
+    #[cfg(feature = "sqlite")]
+    #[test]
+    fn test_file_layer_source_staged_via_cache() {
+        use crate::reader::SqliteReader;
+        // A file source must be staged on the cache surface.
+        let dir = std::env::temp_dir();
+        let path = dir.join(format!("ggsql_cache_file_test_{}.csv", std::process::id()));
+        std::fs::write(&path, "val\n1.0\n2.0\n2.0\n3.0\n3.0\n3.0\n9.0\n").unwrap();
+        let path_str = path.to_str().unwrap().to_string();
+
+        let primary = Box::new(SqliteReader::new().unwrap());
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache);
+
+        let spec = reader.execute(&format!(
+            "VISUALISE x DRAW histogram MAPPING val AS x FROM '{}'",
+            path_str
+        ));
+        let _ = std::fs::remove_file(&path);
+        assert!(
+            spec.is_ok(),
+            "file layer source via cache should succeed: {:?}",
+            spec.err()
+        );
+    }
 }

From 78c856ea4c743866c2d3b306029d0b41b430681f Mon Sep 17 00:00:00 2001
From: George Stagg <george.stagg@posit.co>
Date: Tue, 30 Jun 2026 10:13:41 +0100
Subject: [PATCH 05/15] Store cache metadata in the cache reader

---
 src/naming.rs                   |  15 ++-
 src/reader/cache.rs             | 227 +++++++++++++++++++++++++++-----
 src/reader/cache_equivalence.rs |  10 +-
 src/reader/connection.rs        |   6 +-
 4 files changed, 215 insertions(+), 43 deletions(-)

diff --git a/src/naming.rs b/src/naming.rs
index 1d1357cea..dfcf6d5e4 100644
--- a/src/naming.rs
+++ b/src/naming.rs
@@ -85,6 +85,9 @@ pub const ORDER_COLUMN: &str = concatcp!(GGSQL_PREFIX, "order", GGSQL_SUFFIX);
 /// Used with Vega-Lite filter transforms to select per-layer data.
 pub const SOURCE_COLUMN: &str = concatcp!(GGSQL_PREFIX, "source", GGSQL_SUFFIX);
 
+/// Name of the caching layer's metadata table, held in the cache backend.
+pub const CACHE_META_TABLE: &str = concatcp!(GGSQL_PREFIX, "cache_meta", GGSQL_SUFFIX);
+
 /// Alias for schema extraction queries
 pub const SCHEMA_ALIAS: &str = concatcp!(GGSQL_SUFFIX, "schema", GGSQL_SUFFIX);
 
@@ -139,21 +142,22 @@ pub fn cte_table(cte_name: &str) -> String {
 
 /// Generate temp table name for a memoized primary query result in a cache.
 ///
-/// Format: `__ggsql_cache_<uuid>_<id>__`
+/// The cache key is a hex hash of the primary URI and SQL.
+/// Format: `__ggsql_cache_<uuid>_<key>__`
 ///
 /// # Example
 /// ```
 /// use ggsql::naming;
-/// let table = naming::cache_result_table(0);
+/// let table = naming::cache_result_table("deadbeef");
 /// assert!(table.starts_with("__ggsql_cache_"));
-/// assert!(table.ends_with("_0__"));
+/// assert!(table.ends_with("_deadbeef__"));
 /// ```
-pub fn cache_result_table(id: u64) -> String {
+pub fn cache_result_table(key: &str) -> String {
     format!(
         "{}cache_{}_{}{}",
         GGSQL_PREFIX,
         session_id(),
-        id,
+        key,
         GGSQL_SUFFIX
     )
 }
@@ -536,6 +540,7 @@ mod tests {
         assert_eq!(ORDER_COLUMN, "__ggsql_order__");
         assert_eq!(SOURCE_COLUMN, "__ggsql_source__");
         assert_eq!(SCHEMA_ALIAS, "__schema__");
+        assert_eq!(CACHE_META_TABLE, "__ggsql_cache_meta__");
     }
 
     #[test]
diff --git a/src/reader/cache.rs b/src/reader/cache.rs
index e57393acf..9cb27affc 100644
--- a/src/reader/cache.rs
+++ b/src/reader/cache.rs
@@ -14,34 +14,124 @@
 use crate::reader::{execute_with_reader, ColumnInfo, Reader, Spec, SqlDialect, TableInfo};
 use crate::{naming, DataFrame, Result};
 use std::cell::{Cell, RefCell};
-use std::collections::{HashMap, HashSet};
+use std::collections::hash_map::DefaultHasher;
+use std::collections::HashSet;
+use std::hash::Hasher;
 
 pub struct CachingReader {
     /// Primary backend — the real data source.
     primary: Box<dyn Reader + Send>,
     /// In-memory writeable cache: derived tables, registered data, memoized reads.
     cache: Box<dyn Reader + Send>,
-    /// Memo of base primary reads: source SQL text -> cache table holding the result.
-    result_cache: RefCell<HashMap<String, String>>,
-    /// Monotonic counter for unique memo table names.
-    next_id: Cell<u64>,
+    /// Connection URI of the primary.
+    primary_uri: String,
+    /// Whether the metadata table has been created in the cache backend.
+    meta_ready: Cell<bool>,
     /// Names registered into the cache. A source read that references one
     /// is routed to the cache rather than the primary.
     resident: RefCell<HashSet<String>>,
 }
 
 impl CachingReader {
-    /// Construct a `CachingReader` from a primary reader and an in-memory cache
-    /// backend. The cache is owned by the `CachingReader` and dropped with it.
-    pub fn new(primary: Box<dyn Reader + Send>, cache: Box<dyn Reader + Send>) -> Self {
+    /// Construct a `CachingReader` from a primary reader, an in-memory cache
+    /// backend, and the primary's connection URI. The cache is owned by the
+    /// `CachingReader` and dropped with it.
+    pub fn new(
+        primary: Box<dyn Reader + Send>,
+        cache: Box<dyn Reader + Send>,
+        primary_uri: impl Into<String>,
+    ) -> Self {
         Self {
             primary,
             cache,
-            result_cache: RefCell::new(HashMap::new()),
-            next_id: Cell::new(0),
+            primary_uri: primary_uri.into(),
+            meta_ready: Cell::new(false),
             resident: RefCell::new(HashSet::new()),
         }
     }
+
+    /// Derive a stable cache key from the primary URI and the SQL text.
+    fn cache_key(&self, sql: &str) -> String {
+        let mut hasher = DefaultHasher::new();
+        hasher.write(self.primary_uri.as_bytes());
+        hasher.write(b"\n");
+        hasher.write(sql.as_bytes());
+        format!("{:016x}", hasher.finish())
+    }
+
+    /// Create the metadata table in the cache backend if it doesn't exist yet.
+    fn ensure_meta_table(&self) -> Result<()> {
+        if self.meta_ready.get() {
+            return Ok(());
+        }
+        let sql = format!(
+            "CREATE TABLE IF NOT EXISTS {} \
+             (cache_key VARCHAR PRIMARY KEY, sql VARCHAR NOT NULL, table_name VARCHAR NOT NULL)",
+            naming::quote_ident(naming::CACHE_META_TABLE)
+        );
+        self.cache.execute_sql(&sql)?;
+        self.meta_ready.set(true);
+        Ok(())
+    }
+
+    /// Look up the cache table holding the memoized result for `key`.
+    fn lookup_memo(&self, key: &str) -> Result<Option<String>> {
+        let sql = format!(
+            "SELECT table_name FROM {} WHERE cache_key = {}",
+            naming::quote_ident(naming::CACHE_META_TABLE),
+            naming::quote_literal(key),
+        );
+        let df = self.cache.execute_sql(&sql)?;
+        if df.height() == 0 {
+            return Ok(None);
+        }
+        let table = crate::array_util::as_str(df.column("table_name")?)?
+            .value(0)
+            .to_string();
+        Ok(Some(table))
+    }
+
+    /// Record a memoized read in the metadata table.
+    fn insert_memo(&self, key: &str, sql: &str, table: &str) -> Result<()> {
+        let stmt = format!(
+            "INSERT OR REPLACE INTO {} (cache_key, sql, table_name) VALUES ({}, {}, {})",
+            naming::quote_ident(naming::CACHE_META_TABLE),
+            naming::quote_literal(key),
+            naming::quote_literal(sql),
+            naming::quote_literal(table),
+        );
+        self.cache.execute_sql(&stmt)?;
+        Ok(())
+    }
+
+    /// The cache tables holding all currently-memoized results.
+    fn memo_tables(&self) -> Result<Vec<String>> {
+        let sql = format!(
+            "SELECT table_name FROM {}",
+            naming::quote_ident(naming::CACHE_META_TABLE)
+        );
+        let df = self.cache.execute_sql(&sql)?;
+        let n = df.height();
+        if n == 0 {
+            return Ok(Vec::new());
+        }
+        let col = crate::array_util::as_str(df.column("table_name")?)?;
+        Ok((0..n).map(|i| col.value(i).to_string()).collect())
+    }
+
+    /// Drop every memoized result and empty the metadata table.
+    fn clear_memo(&self) -> Result<()> {
+        self.ensure_meta_table()?;
+        for table in self.memo_tables()? {
+            let _ = self.cache.unregister(&table);
+        }
+        let del = format!(
+            "DELETE FROM {}",
+            naming::quote_ident(naming::CACHE_META_TABLE)
+        );
+        self.cache.execute_sql(&del)?;
+        Ok(())
+    }
 }
 
 impl Reader for CachingReader {
@@ -52,35 +142,38 @@ impl Reader for CachingReader {
 
     /// Source surface: base reads of the user's data (plus user setup/DML).
     fn execute_sql_primary(&self, sql: &str) -> Result<DataFrame> {
-        // Route to the cache when the read targets cache-resident objects
+        // Route to the cache when the read targets cache-resident objects, the
+        // metadata table, or a builtin dataset.
         let references_cache_table = {
             let resident = self.resident.borrow();
             resident.iter().any(|t| sql.contains(t.as_str()))
         };
-        if sql.contains("ggsql:") || references_cache_table {
+        if sql.contains("ggsql:")
+            || sql.contains(naming::CACHE_META_TABLE)
+            || references_cache_table
+        {
             return self.cache.execute_sql(sql);
         }
 
-        if let Some(table) = self.result_cache.borrow().get(sql) {
+        self.ensure_meta_table()?;
+        let key = self.cache_key(sql);
+
+        if let Some(table) = self.lookup_memo(&key)? {
             return self
                 .cache
-                .execute_sql(&format!("SELECT * FROM {}", naming::quote_ident(table)));
+                .execute_sql(&format!("SELECT * FROM {}", naming::quote_ident(&table)));
         }
 
         let df = self.primary.execute_sql(sql)?;
 
         if super::returns_rows(sql) {
-            let id = self.next_id.get();
-            self.next_id.set(id + 1);
-            let table = naming::cache_result_table(id);
+            let table = naming::cache_result_table(&key);
             self.cache.register(&table, df.clone(), true)?;
-            self.result_cache
-                .borrow_mut()
-                .insert(sql.to_string(), table);
+            self.insert_memo(&key, sql, &table)?;
         } else {
             // A source-side write or DDL on the primary can change the data
             // behind a memoized read; drop the memo.
-            self.result_cache.borrow_mut().clear();
+            self.clear_memo()?;
         }
 
         Ok(df)
@@ -155,7 +248,7 @@ mod behavior_tests {
     fn test_register_writes_to_cache_and_query_routes_there() {
         let (primary, log) = SpyReader::wrap(Box::new(DuckDBReader::new_in_memory().unwrap()));
         let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
-        let reader = CachingReader::new(primary, cache);
+        let reader = CachingReader::new(primary, cache, "test://primary");
 
         reader
             .register("t", df! { "x" => vec![1_i64, 2, 3] }.unwrap(), true)
@@ -176,7 +269,7 @@ mod behavior_tests {
             .unwrap();
         let (primary, log) = SpyReader::wrap(Box::new(inner));
         let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
-        let reader = CachingReader::new(primary, cache);
+        let reader = CachingReader::new(primary, cache, "test://primary");
 
         let q = "SELECT y FROM base ORDER BY y";
         let d1 = reader.execute_sql_primary(q).unwrap();
@@ -207,7 +300,7 @@ mod behavior_tests {
             .unwrap();
         let (primary, log) = SpyReader::wrap(Box::new(inner));
         let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
-        let reader = CachingReader::new(primary, cache);
+        let reader = CachingReader::new(primary, cache, "test://primary");
 
         reader
             .execute("SELECT x, y FROM sales VISUALISE x, y DRAW point")
@@ -266,6 +359,7 @@ mod behavior_tests {
         let cached = CachingReader::new(
             Box::new(ReadOnlyReader::new(Box::new(primary))),
             Box::new(DuckDBReader::new_in_memory().unwrap()),
+            "test://primary",
         );
         assert!(
             cached.execute(query).is_ok(),
@@ -308,7 +402,7 @@ mod behavior_tests {
         // GREATEST), not SQLite's (CASE fallback).
         let primary = Box::new(SqliteReader::new().unwrap());
         let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
-        let reader = CachingReader::new(primary, cache);
+        let reader = CachingReader::new(primary, cache, "test://primary");
         assert_eq!(reader.dialect().sql_greatest(&["a", "b"]), "GREATEST(a, b)");
     }
 
@@ -329,7 +423,7 @@ mod behavior_tests {
             )
             .unwrap();
         let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
-        let reader = CachingReader::new(Box::new(primary), cache);
+        let reader = CachingReader::new(Box::new(primary), cache, "test://primary");
 
         let spec = reader.execute("VISUALISE x DRAW histogram MAPPING val AS x FROM tbl");
         assert!(
@@ -349,7 +443,7 @@ mod behavior_tests {
             .unwrap();
         let primary = Box::new(ReadOnlyReader::new(Box::new(base)));
         let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
-        let reader = CachingReader::new(primary, cache);
+        let reader = CachingReader::new(primary, cache, "test://primary");
 
         let spec = reader.execute(
             "WITH t(a) AS (SELECT v FROM base) SELECT a FROM t VISUALISE a AS x DRAW point",
@@ -371,7 +465,7 @@ mod behavior_tests {
             .unwrap();
         let primary = Box::new(ReadOnlyReader::new(Box::new(base)));
         let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
-        let reader = CachingReader::new(primary, cache);
+        let reader = CachingReader::new(primary, cache, "test://primary");
 
         let spec = reader.execute(
             "WITH a(p) AS (SELECT v FROM base), b(q) AS (SELECT p FROM a) \
@@ -384,6 +478,75 @@ mod behavior_tests {
         );
     }
 
+    #[test]
+    fn test_meta_table_records_and_serves_memo() {
+        // A memoized read is recorded in the metadata table and served back from
+        // the cache on repeat, without touching the primary again.
+        let inner = DuckDBReader::new_in_memory().unwrap();
+        inner
+            .register("base", df! { "y" => vec![1_i64, 2, 3] }.unwrap(), true)
+            .unwrap();
+        let (primary, log) = SpyReader::wrap(Box::new(inner));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache, "test://primary");
+
+        let q = "SELECT y FROM base ORDER BY y";
+        reader.execute_sql_primary(q).unwrap();
+
+        // The metadata table now has exactly one row for this read.
+        let meta = reader
+            .execute_sql(&format!("SELECT sql FROM {}", naming::CACHE_META_TABLE))
+            .unwrap();
+        assert_eq!(meta.height(), 1);
+        assert_eq!(
+            crate::array_util::as_str(meta.column("sql").unwrap())
+                .unwrap()
+                .value(0),
+            q
+        );
+
+        // The repeat read is served from the cache, not the primary.
+        reader.execute_sql_primary(q).unwrap();
+        let hits = log
+            .lock()
+            .unwrap()
+            .iter()
+            .filter(|s| s.as_str() == q)
+            .count();
+        assert_eq!(hits, 1);
+    }
+
+    #[cfg(feature = "sqlite")]
+    #[test]
+    fn test_sqlite_cache_backend_memoizes() {
+        use crate::reader::SqliteReader;
+        // A SQLite cache backend must support the metadata table DDL/DML
+        // (CREATE TABLE IF NOT EXISTS, INSERT OR REPLACE) and serve memoized reads.
+        let inner = DuckDBReader::new_in_memory().unwrap();
+        inner
+            .register("base", df! { "y" => vec![1_i64, 2, 3] }.unwrap(), true)
+            .unwrap();
+        let (primary, log) = SpyReader::wrap(Box::new(inner));
+        let cache = Box::new(SqliteReader::new().unwrap());
+        let reader = CachingReader::new(primary, cache, "test://primary");
+
+        let q = "SELECT y FROM base ORDER BY y";
+        let d1 = reader.execute_sql_primary(q).unwrap();
+        let d2 = reader.execute_sql_primary(q).unwrap();
+        assert_eq!(d1.height(), 3);
+        assert_eq!(d2.height(), 3);
+        let hits = log
+            .lock()
+            .unwrap()
+            .iter()
+            .filter(|s| s.as_str() == q)
+            .count();
+        assert_eq!(
+            hits, 1,
+            "SQLite cache should serve the repeat from the memo"
+        );
+    }
+
     #[test]
     fn test_source_write_invalidates_memo() {
         // Memoize a base read, mutate the primary, then re-read: the memo must be
@@ -393,7 +556,7 @@ mod behavior_tests {
             .register("t", df! { "v" => vec![1_i64, 2, 3] }.unwrap(), true)
             .unwrap();
         let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
-        let reader = CachingReader::new(Box::new(primary), cache);
+        let reader = CachingReader::new(Box::new(primary), cache, "test://primary");
 
         let q = "SELECT v FROM t";
         let d1 = reader.execute_sql_primary(q).unwrap();
@@ -420,7 +583,7 @@ mod behavior_tests {
             .register("t", df! { "v" => vec![1_i64, 2, 3] }.unwrap(), true)
             .unwrap();
         let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
-        let reader = CachingReader::new(Box::new(primary), cache);
+        let reader = CachingReader::new(Box::new(primary), cache, "test://primary");
 
         let df = reader.execute_sql_primary("SELECT v FROM t").unwrap();
         assert_eq!(df.height(), 3);
@@ -436,7 +599,7 @@ mod behavior_tests {
         // cache.
         let primary = Box::new(DuckDBReader::new_in_memory().unwrap());
         let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
-        let reader = CachingReader::new(primary, cache);
+        let reader = CachingReader::new(primary, cache, "test://primary");
         reader
             .register(
                 "only_in_cache",
@@ -465,7 +628,7 @@ mod behavior_tests {
 
         let primary = Box::new(SqliteReader::new().unwrap());
         let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
-        let reader = CachingReader::new(primary, cache);
+        let reader = CachingReader::new(primary, cache, "test://primary");
 
         let spec = reader.execute(&format!(
             "VISUALISE x DRAW histogram MAPPING val AS x FROM '{}'",
diff --git a/src/reader/cache_equivalence.rs b/src/reader/cache_equivalence.rs
index c490919b0..819d92d65 100644
--- a/src/reader/cache_equivalence.rs
+++ b/src/reader/cache_equivalence.rs
@@ -157,7 +157,7 @@ fn mode1_builtin_equivalence_matches_plain_duckdb() {
         let plain = DuckDBReader::new_in_memory().unwrap();
         let primary = ReadOnlyReader::new(Box::new(SqliteReader::new().unwrap()));
         let cache = DuckDBReader::new_in_memory().unwrap();
-        let cached = CachingReader::new(Box::new(primary), Box::new(cache));
+        let cached = CachingReader::new(Box::new(primary), Box::new(cache), "test://primary");
         assert_equivalent(&plain, &cached, case.query);
     }
 }
@@ -174,7 +174,7 @@ fn mode1_read_only_primary_is_sufficient() {
         seed(&sqlite);
         let primary = ReadOnlyReader::new(Box::new(sqlite));
         let cache = DuckDBReader::new_in_memory().unwrap();
-        let cached = CachingReader::new(Box::new(primary), Box::new(cache));
+        let cached = CachingReader::new(Box::new(primary), Box::new(cache), "test://primary");
         let r = cached.execute(case.query);
         assert!(
             r.is_ok(),
@@ -193,7 +193,7 @@ fn cross_call_memoization_avoids_second_primary_read() {
     seed(&sqlite);
     let (primary, log) = SpyReader::wrap(Box::new(sqlite));
     let cache = DuckDBReader::new_in_memory().unwrap();
-    let cached = CachingReader::new(primary, Box::new(cache));
+    let cached = CachingReader::new(primary, Box::new(cache), "test://primary");
 
     let query = "SELECT bill_len FROM cache_eq_tbl VISUALISE bill_len AS x DRAW histogram";
     cached.execute(query).unwrap();
@@ -234,7 +234,7 @@ fn factory_builds_caching_readers() {
 fn map_projection_runs_on_cache_not_primary() {
     let (primary, log) = SpyReader::wrap(Box::new(DuckDBReader::new_in_memory().unwrap()));
     let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
-    let reader = CachingReader::new(primary, cache);
+    let reader = CachingReader::new(primary, cache, "test://primary");
 
     let spec = reader.execute("VISUALISE FROM ggsql:world DRAW spatial PROJECT TO orthographic");
     assert!(
@@ -343,7 +343,7 @@ mod adbc_mode {
             calls: calls.clone(),
         };
         let cache = DuckDBReader::new_in_memory().unwrap();
-        let cached = CachingReader::new(Box::new(primary), Box::new(cache));
+        let cached = CachingReader::new(Box::new(primary), Box::new(cache), "test://primary");
 
         // Base reads go through the source surface; the cache memoizes them.
         let miss = cached.execute_sql_primary(sql).unwrap();
diff --git a/src/reader/connection.rs b/src/reader/connection.rs
index eebeffdf1..b699e60f3 100644
--- a/src/reader/connection.rs
+++ b/src/reader/connection.rs
@@ -107,7 +107,11 @@ pub fn reader_from_uri(uri: &str) -> Result<Box<dyn Reader + Send>> {
         {
             let primary = build_reader(&primary_uri)?;
             let cache = build_reader(cache_uri(&cache_scheme)?)?;
-            return Ok(Box::new(crate::reader::CachingReader::new(primary, cache)));
+            return Ok(Box::new(crate::reader::CachingReader::new(
+                primary,
+                cache,
+                primary_uri,
+            )));
         }
         #[cfg(not(any(feature = "duckdb", feature = "sqlite")))]
         {

From bd2b1552bd92f1c18178fee974654ac265b77393 Mon Sep 17 00:00:00 2001
From: George Stagg <george.stagg@posit.co>
Date: Tue, 30 Jun 2026 11:59:59 +0100
Subject: [PATCH 06/15] Back cache memo with a meta table

---
 ggsql-cli/src/main.rs           |  2 +-
 ggsql-jupyter/src/executor.rs   |  2 +-
 src/CLAUDE.md                   |  2 +-
 src/execute/mod.rs              | 16 +++---
 src/reader/cache.rs             | 98 ++++++++++++++++++++++-----------
 src/reader/cache_equivalence.rs |  4 +-
 src/reader/data.rs              | 16 ++++++
 src/reader/mod.rs               | 19 +++----
 8 files changed, 104 insertions(+), 55 deletions(-)

diff --git a/ggsql-cli/src/main.rs b/ggsql-cli/src/main.rs
index 413dae6b4..0a004fbce 100644
--- a/ggsql-cli/src/main.rs
+++ b/ggsql-cli/src/main.rs
@@ -446,7 +446,7 @@ fn print_table_fallback<R: Reader + ?Sized>(query: &str, reader: &R, max_rows: u
 
     let sql_part = source_tree.extract_sql().unwrap_or_default();
 
-    let data = reader.execute_sql_primary(&sql_part);
+    let data = reader.execute_sql(&sql_part);
     if let Err(e) = data {
         eprintln!("Failed to execute SQL query: {}", e);
         std::process::exit(1)
diff --git a/ggsql-jupyter/src/executor.rs b/ggsql-jupyter/src/executor.rs
index 668d5f3dd..fab09a9e4 100644
--- a/ggsql-jupyter/src/executor.rs
+++ b/ggsql-jupyter/src/executor.rs
@@ -178,7 +178,7 @@ impl QueryExecutor {
         // 2. Check if there's a visualization
         if !validated.has_visual() {
             // Pure SQL query - execute directly and return DataFrame.
-            let df = self.reader.execute_sql_primary(code)?;
+            let df = self.reader.execute_sql(code)?;
             tracing::info!(
                 "Pure SQL executed: {} rows, {} cols",
                 df.height(),
diff --git a/src/CLAUDE.md b/src/CLAUDE.md
index d0a12fba0..5394c90fd 100644
--- a/src/CLAUDE.md
+++ b/src/CLAUDE.md
@@ -52,7 +52,7 @@ Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding
 
 `SqlDialect` trait in `mod.rs` lets each driver supply its own type names, information-schema queries, and spatial helper methods (`sql_st_transform`, `sql_geometry_to_wkb`, `sql_geometry_bbox`, `sql_ensure_geometry`, `sql_select_replace`, `sql_spatial_setup`).
 
-**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache. **`execute_sql_primary` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins and reads that reference a cache-resident internal table, which go to the cache. Cache routing is by **membership** in the set of tables actually registered into the cache (the `resident` set: internal `__ggsql_*` tables plus user-registered tables). A non-row-returning source statement (DML/DDL on the primary) clears the read memo, so later identical reads don't return stale data. Pure/non-visual SQL (CLI table fallback, Jupyter) also goes through `execute_sql_primary` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `<primary>+<cache>://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default.
+**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins, the `__ggsql_cache_meta__` table, and reads that reference a cache-resident internal table, which go to the cache. **`execute_sql_cached` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache; it defaults to `execute_sql` so a plain reader runs everything on one connection. Cache routing is by **exact-identifier membership** in the set of tables registered into the cache. Memoization keys on `hash(primary_uri + sql)` and is tracked in the `__ggsql_cache_meta__` table inside the cache backend. A non-row-returning source statement (DML/DDL on the primary) clears the read memo, so later identical reads don't return stale data. Pure/non-visual SQL (CLI table fallback, Jupyter) goes through `execute_sql` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `<primary>+<cache>://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default.
 
 ### `execute/`
 
diff --git a/src/execute/mod.rs b/src/execute/mod.rs
index acec6e512..2baa2032e 100644
--- a/src/execute/mod.rs
+++ b/src/execute/mod.rs
@@ -1098,12 +1098,10 @@ pub struct PreparedData {
 /// * `query` - The full ggsql query string
 /// * `reader` - A Reader implementation for executing SQL
 pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result<PreparedData> {
-    // Two execution surfaces: `execute_query` is the COMPUTE surface for
-    // derived/dialect-generated SQL over internal `__ggsql_*` tables;
-    // `execute_query_primary` is the SOURCE surface for base reads of the
-    // user's data and user setup/DML.
-    let execute_query = |sql: &str| reader.execute_sql(sql);
-    let execute_query_primary = |sql: &str| reader.execute_sql_primary(sql);
+    // `execute_query` is the COMPUTE surface for derived/dialect-generated SQL
+    // over internal `__ggsql_*` tables. Base source reads (user setup/DML, the
+    // global query) call `reader.execute_sql(...)` directly.
+    let execute_query = |sql: &str| reader.execute_sql_cached(sql);
     let dialect = reader.dialect();
 
     // Parse once and create SourceTree
@@ -1134,7 +1132,7 @@ pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result<Prep
     // Structured DML (CREATE, INSERT, UPDATE, DELETE) is handled separately as
     // side-effects in cte::transform_global_sql.
     for stmt in source_tree.find_texts(&root, "(sql_statement (other_sql_statement) @stmt)") {
-        execute_query_primary(&stmt)?;
+        reader.execute_sql(&stmt)?;
     }
 
     // Extract CTE definitions from the source tree (in declaration order)
@@ -1157,7 +1155,7 @@ pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result<Prep
         let (side_effects, query) = cte::transform_global_sql(&source_tree, &materialized_ctes);
 
         for stmt in &side_effects {
-            execute_query_primary(stmt)?;
+            reader.execute_sql(stmt)?;
         }
 
         if let Some(query) = query {
@@ -1213,7 +1211,7 @@ pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result<Prep
                     let t = naming::layer_source_table(materialized_sources.len());
                     if matches!(layer.source, Some(crate::DataSource::FilePath(_))) {
                         // The cache backend reads local files
-                        let df = reader.execute_sql(&source_query)?;
+                        let df = reader.execute_sql_cached(&source_query)?;
                         reader.register(&t, df, true)?;
                     } else {
                         reader.materialize_table(&t, &[], &source_query)?;
diff --git a/src/reader/cache.rs b/src/reader/cache.rs
index 9cb27affc..b951b4154 100644
--- a/src/reader/cache.rs
+++ b/src/reader/cache.rs
@@ -3,10 +3,10 @@
 //! [`CachingReader`] wraps a `primary` reader and an in-memory `cache` backend,
 //! splitting work across two surfaces:
 //!
-//! - **Compute** ([`Reader::execute_sql`]) runs on the cache: all derived,
+//! - **Source** ([`Reader::execute_sql`]) reads the primary: base reads of the
+//!   user's data, plus user setup/DML.
+//! - **Compute** ([`Reader::execute_sql_cached`]) runs on the cache: all derived,
 //!   dialect-generated SQL operates on the `__ggsql_*` tables.
-//! - **Source** ([`Reader::execute_sql_primary`]) reads the primary: base reads of
-//!   the user's data, plus user setup/DML.
 //! - [`Reader::materialize_table`] reads a body via the source surface and
 //!   `register`s the result into the cache.
 //! - [`Reader::dialect`] returns the cache dialect.
@@ -119,6 +119,18 @@ impl CachingReader {
         Ok((0..n).map(|i| col.value(i).to_string()).collect())
     }
 
+    /// Whether `sql` references a cache-resident table by exact name.
+    ///
+    /// Parses the query's `table_ref` targets and tests them against `resident`.
+    /// On a parse failure we conservatively return `false`.
+    fn references_resident(&self, sql: &str) -> bool {
+        let Ok(refs) = super::data::extract_table_refs(sql) else {
+            return false;
+        };
+        let resident = self.resident.borrow();
+        refs.iter().any(|t| resident.contains(t))
+    }
+
     /// Drop every memoized result and empty the metadata table.
     fn clear_memo(&self) -> Result<()> {
         self.ensure_meta_table()?;
@@ -135,22 +147,13 @@ impl CachingReader {
 }
 
 impl Reader for CachingReader {
-    /// Compute surface: all derived/dialect-generated SQL runs on the cache.
-    fn execute_sql(&self, sql: &str) -> Result<DataFrame> {
-        self.cache.execute_sql(sql)
-    }
-
     /// Source surface: base reads of the user's data (plus user setup/DML).
-    fn execute_sql_primary(&self, sql: &str) -> Result<DataFrame> {
+    fn execute_sql(&self, sql: &str) -> Result<DataFrame> {
         // Route to the cache when the read targets cache-resident objects, the
         // metadata table, or a builtin dataset.
-        let references_cache_table = {
-            let resident = self.resident.borrow();
-            resident.iter().any(|t| sql.contains(t.as_str()))
-        };
         if sql.contains("ggsql:")
             || sql.contains(naming::CACHE_META_TABLE)
-            || references_cache_table
+            || self.references_resident(sql)
         {
             return self.cache.execute_sql(sql);
         }
@@ -179,6 +182,11 @@ impl Reader for CachingReader {
         Ok(df)
     }
 
+    /// Compute surface: derived/dialect-generated SQL runs on the cache.
+    fn execute_sql_cached(&self, sql: &str) -> Result<DataFrame> {
+        self.cache.execute_sql(sql)
+    }
+
     fn register(&self, name: &str, df: DataFrame, replace: bool) -> Result<()> {
         self.cache.register(name, df, replace)?;
         self.resident.borrow_mut().insert(name.to_string());
@@ -209,7 +217,7 @@ impl Reader for CachingReader {
         // Read the body via the source surface, then register the result
         // into the cache.
         let body = super::wrap_with_column_aliases(body_sql, column_aliases);
-        let df = self.execute_sql_primary(&body)?;
+        let df = self.execute_sql(&body)?;
         self.register(name, df, true)
     }
 
@@ -254,7 +262,9 @@ mod behavior_tests {
             .register("t", df! { "x" => vec![1_i64, 2, 3] }.unwrap(), true)
             .unwrap();
         // register writes to the cache; the compute surface reads it back.
-        let out = reader.execute_sql("SELECT COUNT(*) AS n FROM t").unwrap();
+        let out = reader
+            .execute_sql_cached("SELECT COUNT(*) AS n FROM t")
+            .unwrap();
 
         assert_eq!(as_i64(out.column("n").unwrap()).unwrap().value(0), 3);
         // The primary was never touched.
@@ -272,8 +282,8 @@ mod behavior_tests {
         let reader = CachingReader::new(primary, cache, "test://primary");
 
         let q = "SELECT y FROM base ORDER BY y";
-        let d1 = reader.execute_sql_primary(q).unwrap();
-        let d2 = reader.execute_sql_primary(q).unwrap();
+        let d1 = reader.execute_sql(q).unwrap();
+        let d2 = reader.execute_sql(q).unwrap();
         assert_eq!(d1.height(), 3);
         assert_eq!(d2.height(), 3);
 
@@ -491,7 +501,7 @@ mod behavior_tests {
         let reader = CachingReader::new(primary, cache, "test://primary");
 
         let q = "SELECT y FROM base ORDER BY y";
-        reader.execute_sql_primary(q).unwrap();
+        reader.execute_sql(q).unwrap();
 
         // The metadata table now has exactly one row for this read.
         let meta = reader
@@ -506,7 +516,7 @@ mod behavior_tests {
         );
 
         // The repeat read is served from the cache, not the primary.
-        reader.execute_sql_primary(q).unwrap();
+        reader.execute_sql(q).unwrap();
         let hits = log
             .lock()
             .unwrap()
@@ -531,8 +541,8 @@ mod behavior_tests {
         let reader = CachingReader::new(primary, cache, "test://primary");
 
         let q = "SELECT y FROM base ORDER BY y";
-        let d1 = reader.execute_sql_primary(q).unwrap();
-        let d2 = reader.execute_sql_primary(q).unwrap();
+        let d1 = reader.execute_sql(q).unwrap();
+        let d2 = reader.execute_sql(q).unwrap();
         assert_eq!(d1.height(), 3);
         assert_eq!(d2.height(), 3);
         let hits = log
@@ -547,6 +557,34 @@ mod behavior_tests {
         );
     }
 
+    #[test]
+    fn test_resident_substring_not_false_matched() {
+        // A primary-only table whose name *contains* a cache-resident table name
+        // as a substring must still route to the primary. Exact-identifier
+        // matching distinguishes `orders` (resident) from `orders_archive`
+        // (primary-only).
+        let primary = DuckDBReader::new_in_memory().unwrap();
+        primary
+            .register(
+                "orders_archive",
+                df! { "v" => vec![1_i64, 2, 3] }.unwrap(),
+                true,
+            )
+            .unwrap();
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(Box::new(primary), cache, "test://primary");
+
+        // `orders` lives only in the cache.
+        reader
+            .register("orders", df! { "v" => vec![9_i64] }.unwrap(), true)
+            .unwrap();
+
+        // Reading the primary-only `orders_archive` must hit the primary (3 rows),
+        // not the resident `orders` (1 row).
+        let df = reader.execute_sql("SELECT v FROM orders_archive").unwrap();
+        assert_eq!(df.height(), 3);
+    }
+
     #[test]
     fn test_source_write_invalidates_memo() {
         // Memoize a base read, mutate the primary, then re-read: the memo must be
@@ -559,14 +597,12 @@ mod behavior_tests {
         let reader = CachingReader::new(Box::new(primary), cache, "test://primary");
 
         let q = "SELECT v FROM t";
-        let d1 = reader.execute_sql_primary(q).unwrap();
+        let d1 = reader.execute_sql(q).unwrap();
         assert_eq!(d1.height(), 3);
 
-        reader
-            .execute_sql_primary("INSERT INTO t VALUES (4)")
-            .unwrap();
+        reader.execute_sql("INSERT INTO t VALUES (4)").unwrap();
 
-        let d2 = reader.execute_sql_primary(q).unwrap();
+        let d2 = reader.execute_sql(q).unwrap();
         assert_eq!(
             d2.height(),
             4,
@@ -576,8 +612,8 @@ mod behavior_tests {
 
     #[test]
     fn test_pure_sql_reads_primary_not_cache() {
-        // The pure-SQL display path uses `execute_sql_primary`, which reads the
-        // primary; `execute_sql` would hit the empty cache and fail.
+        // The pure-SQL display path uses `execute_sql` (source), which reads the
+        // primary; `execute_sql_cached` (compute) would hit the empty cache and fail.
         let primary = DuckDBReader::new_in_memory().unwrap();
         primary
             .register("t", df! { "v" => vec![1_i64, 2, 3] }.unwrap(), true)
@@ -585,10 +621,10 @@ mod behavior_tests {
         let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
         let reader = CachingReader::new(Box::new(primary), cache, "test://primary");
 
-        let df = reader.execute_sql_primary("SELECT v FROM t").unwrap();
+        let df = reader.execute_sql("SELECT v FROM t").unwrap();
         assert_eq!(df.height(), 3);
         assert!(
-            reader.execute_sql("SELECT v FROM t").is_err(),
+            reader.execute_sql_cached("SELECT v FROM t").is_err(),
             "compute surface should not find the primary-only table"
         );
     }
diff --git a/src/reader/cache_equivalence.rs b/src/reader/cache_equivalence.rs
index 819d92d65..52960a4a7 100644
--- a/src/reader/cache_equivalence.rs
+++ b/src/reader/cache_equivalence.rs
@@ -346,12 +346,12 @@ mod adbc_mode {
         let cached = CachingReader::new(Box::new(primary), Box::new(cache), "test://primary");
 
         // Base reads go through the source surface; the cache memoizes them.
-        let miss = cached.execute_sql_primary(sql).unwrap();
+        let miss = cached.execute_sql(sql).unwrap();
         assert_dataframes_equal(&golden, &miss, "adbc cache miss");
         let after_miss = calls.load(Ordering::SeqCst);
         assert!(after_miss >= 1, "miss should reach the ADBC primary");
 
-        let hit = cached.execute_sql_primary(sql).unwrap();
+        let hit = cached.execute_sql(sql).unwrap();
         assert_dataframes_equal(&golden, &hit, "adbc cache hit");
         let after_hit = calls.load(Ordering::SeqCst);
         assert_eq!(
diff --git a/src/reader/data.rs b/src/reader/data.rs
index abf9e9604..384d944ff 100644
--- a/src/reader/data.rs
+++ b/src/reader/data.rs
@@ -146,6 +146,22 @@ pub fn extract_builtin_dataset_names(sql: &str) -> Result<Vec<String>, GgsqlErro
     Ok(datasets)
 }
 
+/// Extract the table names referenced in a SQL query's `FROM`/`JOIN` clauses.
+///
+/// Returns the `table_ref` targets (deduplicated), with surrounding quotes
+/// stripped via [`naming::unquote_ident`] so they can be compared against
+/// unquoted registered table names. Subquery sources contribute no name.
+pub fn extract_table_refs(sql: &str) -> Result<Vec<String>, GgsqlError> {
+    let token_def = r#"(table_ref table: (_) @table)"#;
+    let mut tokens = tokens_from_tree(sql, token_def, "table")?
+        .iter()
+        .map(|t| naming::unquote_ident(t))
+        .collect::<Vec<_>>();
+    tokens.sort_unstable();
+    tokens.dedup();
+    Ok(tokens)
+}
+
 /// Rewrite SQL to replace namespaced identifiers with internal table names.
 ///
 /// e.g., `SELECT * FROM ggsql:penguins` -> `SELECT * FROM __ggsql_data_penguins__`
diff --git a/src/reader/mod.rs b/src/reader/mod.rs
index 2eef03bea..34a63f306 100644
--- a/src/reader/mod.rs
+++ b/src/reader/mod.rs
@@ -726,10 +726,9 @@ pub struct Metadata {
 pub trait Reader {
     /// Execute a SQL query and return the result as a DataFrame.
     ///
-    /// This is the **compute surface**: ggsql runs all dialect-generated/derived
-    /// SQL here. A plain reader runs it on its own connection; a
-    /// caching reader runs it on the in-memory cache, where all derived
-    /// `__ggsql_*` tables live.
+    /// This is the **source surface**: base reads of the user's data plus user
+    /// setup/DML. A plain reader runs everything on its one connection; a caching
+    /// reader reads the primary here (with result memoization).
     ///
     /// # Errors
     ///
@@ -739,12 +738,12 @@ pub trait Reader {
     /// - The table or columns don't exist
     fn execute_sql(&self, sql: &str) -> Result<DataFrame>;
 
-    /// Execute SQL against the *source* surface — base reads of the user's data
-    /// plus user setup/DML.
+    /// Execute SQL against the *compute surface* — dialect-generated/derived SQL
+    /// over internal `__ggsql_*` tables.
     ///
-    /// Defaults to the compute surface ([`Reader::execute_sql`]), so a plain
-    /// reader runs everything on one connection. A caching reader overrides this
-    /// to read the primary (with result memoization).
+    /// Defaults to the source surface ([`Reader::execute_sql`]), so a plain reader
+    /// runs everything on one connection. A caching reader overrides this to run
+    /// on the in-memory cache, where all derived `__ggsql_*` tables live.
     ///
     /// # Errors
     ///
@@ -752,7 +751,7 @@ pub trait Reader {
     /// - The SQL is invalid
     /// - The connection fails
     /// - The table or columns don't exist
-    fn execute_sql_primary(&self, sql: &str) -> Result<DataFrame> {
+    fn execute_sql_cached(&self, sql: &str) -> Result<DataFrame> {
         self.execute_sql(sql)
     }
 

From 9e6958e828f03fc4e50365bf637795b712b5ea1f Mon Sep 17 00:00:00 2001
From: George Stagg <george.stagg@posit.co>
Date: Tue, 30 Jun 2026 15:03:25 +0100
Subject: [PATCH 07/15] Add TTL + LRU byte-budget eviction to the cache

---
 src/CLAUDE.md       |   2 +-
 src/reader/cache.rs | 538 ++++++++++++++++++++++++++++++++++++++++----
 src/reader/mod.rs   |   5 +
 3 files changed, 500 insertions(+), 45 deletions(-)

diff --git a/src/CLAUDE.md b/src/CLAUDE.md
index 5394c90fd..5f94c2589 100644
--- a/src/CLAUDE.md
+++ b/src/CLAUDE.md
@@ -52,7 +52,7 @@ Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding
 
 `SqlDialect` trait in `mod.rs` lets each driver supply its own type names, information-schema queries, and spatial helper methods (`sql_st_transform`, `sql_geometry_to_wkb`, `sql_geometry_bbox`, `sql_ensure_geometry`, `sql_select_replace`, `sql_spatial_setup`).
 
-**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins, the `__ggsql_cache_meta__` table, and reads that reference a cache-resident internal table, which go to the cache. **`execute_sql_cached` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache; it defaults to `execute_sql` so a plain reader runs everything on one connection. Cache routing is by **exact-identifier membership** in the set of tables registered into the cache. Memoization keys on `hash(primary_uri + sql)` and is tracked in the `__ggsql_cache_meta__` table inside the cache backend. A non-row-returning source statement (DML/DDL on the primary) clears the read memo, so later identical reads don't return stale data. Pure/non-visual SQL (CLI table fallback, Jupyter) goes through `execute_sql` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `<primary>+<cache>://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default.
+**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins, the `__ggsql_cache_meta__` table, and reads that reference a cache-resident internal table, which go to the cache. **`execute_sql_cached` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache; it defaults to `execute_sql` so a plain reader runs everything on one connection. Cache routing is by **exact-identifier membership** in the set of tables registered into the cache. Memoization keys on `hash(primary_uri + sql)` and is tracked in the `__ggsql_cache_meta__` table inside the cache backend. Each memoized read is bounded by a **TTL** (default 300s) and the whole memo by an **LRU byte budget** (default 512 MB); both are configurable via `CacheConfig` (env `GGSQL_CACHE_DISABLED`/`GGSQL_CACHE_TTL`/`GGSQL_CACHE_MAX_BYTES`). Pure/non-visual SQL (CLI table fallback, Jupyter) goes through `execute_sql` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `<primary>+<cache>://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default.
 
 ### `execute/`
 
diff --git a/src/reader/cache.rs b/src/reader/cache.rs
index b951b4154..7b7a14ce9 100644
--- a/src/reader/cache.rs
+++ b/src/reader/cache.rs
@@ -11,12 +11,123 @@
 //!   `register`s the result into the cache.
 //! - [`Reader::dialect`] returns the cache dialect.
 
+use crate::array_util::{as_i64, as_str};
 use crate::reader::{execute_with_reader, ColumnInfo, Reader, Spec, SqlDialect, TableInfo};
 use crate::{naming, DataFrame, Result};
+use arrow::array::Array;
 use std::cell::{Cell, RefCell};
 use std::collections::hash_map::DefaultHasher;
 use std::collections::HashSet;
 use std::hash::Hasher;
+use std::time::{SystemTime, UNIX_EPOCH};
+
+/// Runtime configuration for the result memo: TTL and LRU byte-budget.
+#[derive(Debug, Clone)]
+pub struct CacheConfig {
+    /// When `false`, reads always hit the primary.
+    pub enabled: bool,
+    /// Entries older than this are treated as misses and re-fetched.
+    pub ttl_secs: u64,
+    /// Cumulative byte budget across all memo entries before LRU eviction.
+    pub max_bytes: u64,
+}
+
+impl Default for CacheConfig {
+    fn default() -> Self {
+        Self {
+            enabled: true,
+            ttl_secs: 300,
+            max_bytes: 512 * 1024 * 1024,
+        }
+    }
+}
+
+/// Per-field overrides applied on top of an env-derived [`CacheConfig`].
+#[derive(Debug, Clone, Default)]
+pub struct CacheConfigOverride {
+    pub enabled: Option<bool>,
+    pub ttl_secs: Option<u64>,
+    pub max_bytes: Option<u64>,
+}
+
+impl CacheConfig {
+    /// Read configuration from the environment, falling back to defaults.
+    ///
+    /// - `GGSQL_CACHE_DISABLED` — set, non-empty and not `0` disables the cache.
+    /// - `GGSQL_CACHE_TTL` — TTL in seconds.
+    /// - `GGSQL_CACHE_MAX_BYTES` — byte budget, accepts `512mb`/`1gb`/bytes.
+    pub fn from_env() -> Self {
+        let mut cfg = Self::default();
+        if std::env::var("GGSQL_CACHE_DISABLED")
+            .ok()
+            .filter(|v| !v.is_empty() && v != "0")
+            .is_some()
+        {
+            cfg.enabled = false;
+        }
+        if let Ok(v) = std::env::var("GGSQL_CACHE_TTL") {
+            if let Ok(secs) = v.trim().parse::<u64>() {
+                cfg.ttl_secs = secs;
+            }
+        }
+        if let Ok(v) = std::env::var("GGSQL_CACHE_MAX_BYTES") {
+            if let Some(bytes) = parse_human_bytes(&v) {
+                cfg.max_bytes = bytes;
+            }
+        }
+        cfg
+    }
+
+    /// Apply per-field overrides; each `Some` value wins over `self`.
+    pub fn merge(self, over: CacheConfigOverride) -> Self {
+        Self {
+            enabled: over.enabled.unwrap_or(self.enabled),
+            ttl_secs: over.ttl_secs.unwrap_or(self.ttl_secs),
+            max_bytes: over.max_bytes.unwrap_or(self.max_bytes),
+        }
+    }
+}
+
+/// Parse a byte count, accepting an optional `kb`/`mb`/`gb` suffix.
+pub fn parse_human_bytes(s: &str) -> Option<u64> {
+    let s = s.trim();
+    let lower = s.to_ascii_lowercase();
+    let (num, mult) = if let Some(n) = lower.strip_suffix("gb") {
+        (n, 1024 * 1024 * 1024)
+    } else if let Some(n) = lower.strip_suffix("mb") {
+        (n, 1024 * 1024)
+    } else if let Some(n) = lower.strip_suffix("kb") {
+        (n, 1024)
+    } else {
+        (lower.as_str(), 1)
+    };
+    num.trim().parse::<u64>().ok().map(|n| n * mult)
+}
+
+/// Wall-clock milliseconds since the UNIX epoch. On a clock earlier than the
+/// epoch (a misconfigured system clock) we return `i64::MAX` so existing
+/// entries appear ancient and force a re-fetch.
+fn now_ms() -> i64 {
+    SystemTime::now()
+        .duration_since(UNIX_EPOCH)
+        .map(|d| d.as_millis() as i64)
+        .unwrap_or(i64::MAX)
+}
+
+/// Estimate the in-memory size of a DataFrame as the sum of its Arrow columns'
+/// memory footprints.
+fn estimate_bytes(df: &DataFrame) -> i64 {
+    df.get_columns()
+        .iter()
+        .map(|col| col.get_array_memory_size())
+        .sum::<usize>() as i64
+}
+
+/// One memo row, trimmed to the fields consulted at runtime.
+struct MemoEntry {
+    table_name: String,
+    fetched_at_epoch_ms: i64,
+}
 
 pub struct CachingReader {
     /// Primary backend — the real data source.
@@ -25,6 +136,8 @@ pub struct CachingReader {
     cache: Box<dyn Reader + Send>,
     /// Connection URI of the primary.
     primary_uri: String,
+    /// TTL + byte-budget configuration for the result memo.
+    config: CacheConfig,
     /// Whether the metadata table has been created in the cache backend.
     meta_ready: Cell<bool>,
     /// Names registered into the cache. A source read that references one
@@ -34,22 +147,39 @@ pub struct CachingReader {
 
 impl CachingReader {
     /// Construct a `CachingReader` from a primary reader, an in-memory cache
-    /// backend, and the primary's connection URI. The cache is owned by the
-    /// `CachingReader` and dropped with it.
+    /// backend, and the primary's connection URI, using environment-derived
+    /// cache configuration. The cache is owned by the `CachingReader` and
+    /// dropped with it.
     pub fn new(
         primary: Box<dyn Reader + Send>,
         cache: Box<dyn Reader + Send>,
         primary_uri: impl Into<String>,
+    ) -> Self {
+        Self::with_config(primary, cache, primary_uri, CacheConfig::from_env())
+    }
+
+    /// Construct a `CachingReader` with explicit cache configuration.
+    pub fn with_config(
+        primary: Box<dyn Reader + Send>,
+        cache: Box<dyn Reader + Send>,
+        primary_uri: impl Into<String>,
+        config: CacheConfig,
     ) -> Self {
         Self {
             primary,
             cache,
             primary_uri: primary_uri.into(),
+            config,
             meta_ready: Cell::new(false),
             resident: RefCell::new(HashSet::new()),
         }
     }
 
+    /// The active cache configuration.
+    pub fn cache_config(&self) -> &CacheConfig {
+        &self.config
+    }
+
     /// Derive a stable cache key from the primary URI and the SQL text.
     fn cache_key(&self, sql: &str) -> String {
         let mut hasher = DefaultHasher::new();
@@ -65,8 +195,10 @@ impl CachingReader {
             return Ok(());
         }
         let sql = format!(
-            "CREATE TABLE IF NOT EXISTS {} \
-             (cache_key VARCHAR PRIMARY KEY, sql VARCHAR NOT NULL, table_name VARCHAR NOT NULL)",
+            "CREATE TABLE IF NOT EXISTS {} (\
+             cache_key VARCHAR PRIMARY KEY, sql VARCHAR NOT NULL, table_name VARCHAR NOT NULL, \
+             fetched_at_epoch_ms BIGINT NOT NULL, last_accessed_epoch_ms BIGINT NOT NULL, \
+             byte_estimate BIGINT NOT NULL, row_count BIGINT NOT NULL)",
             naming::quote_ident(naming::CACHE_META_TABLE)
         );
         self.cache.execute_sql(&sql)?;
@@ -74,10 +206,10 @@ impl CachingReader {
         Ok(())
     }
 
-    /// Look up the cache table holding the memoized result for `key`.
-    fn lookup_memo(&self, key: &str) -> Result<Option<String>> {
+    /// Look up the memo entry for `key`.
+    fn lookup_memo(&self, key: &str) -> Result<Option<MemoEntry>> {
         let sql = format!(
-            "SELECT table_name FROM {} WHERE cache_key = {}",
+            "SELECT table_name, fetched_at_epoch_ms FROM {} WHERE cache_key = {}",
             naming::quote_ident(naming::CACHE_META_TABLE),
             naming::quote_literal(key),
         );
@@ -85,38 +217,95 @@ impl CachingReader {
         if df.height() == 0 {
             return Ok(None);
         }
-        let table = crate::array_util::as_str(df.column("table_name")?)?
-            .value(0)
-            .to_string();
-        Ok(Some(table))
+        let table_name = as_str(df.column("table_name")?)?.value(0).to_string();
+        let fetched_at_epoch_ms = as_i64(df.column("fetched_at_epoch_ms")?)?.value(0);
+        Ok(Some(MemoEntry {
+            table_name,
+            fetched_at_epoch_ms,
+        }))
     }
 
     /// Record a memoized read in the metadata table.
-    fn insert_memo(&self, key: &str, sql: &str, table: &str) -> Result<()> {
+    fn insert_memo(
+        &self,
+        key: &str,
+        sql: &str,
+        table: &str,
+        byte_estimate: i64,
+        row_count: i64,
+    ) -> Result<()> {
+        let now = now_ms();
         let stmt = format!(
-            "INSERT OR REPLACE INTO {} (cache_key, sql, table_name) VALUES ({}, {}, {})",
+            "INSERT OR REPLACE INTO {} \
+             (cache_key, sql, table_name, fetched_at_epoch_ms, last_accessed_epoch_ms, \
+              byte_estimate, row_count) \
+             VALUES ({}, {}, {}, {}, {}, {}, {})",
             naming::quote_ident(naming::CACHE_META_TABLE),
             naming::quote_literal(key),
             naming::quote_literal(sql),
             naming::quote_literal(table),
+            now,
+            now,
+            byte_estimate,
+            row_count,
         );
         self.cache.execute_sql(&stmt)?;
         Ok(())
     }
 
-    /// The cache tables holding all currently-memoized results.
-    fn memo_tables(&self) -> Result<Vec<String>> {
-        let sql = format!(
-            "SELECT table_name FROM {}",
+    /// Advance the last-accessed timestamp for `key` (LRU bookkeeping).
+    fn touch(&self, key: &str) -> Result<()> {
+        let stmt = format!(
+            "UPDATE {} SET last_accessed_epoch_ms = {} WHERE cache_key = {}",
+            naming::quote_ident(naming::CACHE_META_TABLE),
+            now_ms(),
+            naming::quote_literal(key),
+        );
+        self.cache.execute_sql(&stmt)?;
+        Ok(())
+    }
+
+    /// Drop a single memo entry: delete its meta row and unregister the table.
+    fn drop_entry(&self, key: &str, table: &str) -> Result<()> {
+        let del = format!(
+            "DELETE FROM {} WHERE cache_key = {}",
+            naming::quote_ident(naming::CACHE_META_TABLE),
+            naming::quote_literal(key),
+        );
+        self.cache.execute_sql(&del)?;
+        let _ = self.cache.unregister(table);
+        Ok(())
+    }
+
+    /// Evict LRU entries until the cumulative byte estimate is within `max_bytes`.
+    fn evict_over_budget(&self) -> Result<()> {
+        // Cast the SUM to BIGINT.
+        let sum_sql = format!(
+            "SELECT CAST(COALESCE(SUM(byte_estimate), 0) AS BIGINT) AS n FROM {}",
             naming::quote_ident(naming::CACHE_META_TABLE)
         );
-        let df = self.cache.execute_sql(&sql)?;
-        let n = df.height();
-        if n == 0 {
-            return Ok(Vec::new());
+        loop {
+            let df = self.cache.execute_sql(&sum_sql)?;
+            let total = if df.height() == 0 {
+                0
+            } else {
+                as_i64(df.column("n")?)?.value(0)
+            };
+            if total <= self.config.max_bytes as i64 {
+                return Ok(());
+            }
+            let pick = format!(
+                "SELECT cache_key, table_name FROM {} ORDER BY last_accessed_epoch_ms ASC LIMIT 1",
+                naming::quote_ident(naming::CACHE_META_TABLE)
+            );
+            let df = self.cache.execute_sql(&pick)?;
+            if df.height() == 0 {
+                return Ok(());
+            }
+            let key = as_str(df.column("cache_key")?)?.value(0).to_string();
+            let table = as_str(df.column("table_name")?)?.value(0).to_string();
+            self.drop_entry(&key, &table)?;
         }
-        let col = crate::array_util::as_str(df.column("table_name")?)?;
-        Ok((0..n).map(|i| col.value(i).to_string()).collect())
     }
 
     /// Whether `sql` references a cache-resident table by exact name.
@@ -131,17 +320,34 @@ impl CachingReader {
         refs.iter().any(|t| resident.contains(t))
     }
 
-    /// Drop every memoized result and empty the metadata table.
+    /// Drop every memoized result, one entry at a time.
     fn clear_memo(&self) -> Result<()> {
         self.ensure_meta_table()?;
-        for table in self.memo_tables()? {
-            let _ = self.cache.unregister(&table);
-        }
-        let del = format!(
-            "DELETE FROM {}",
+        let df = self.cache.execute_sql(&format!(
+            "SELECT cache_key, table_name FROM {}",
             naming::quote_ident(naming::CACHE_META_TABLE)
-        );
-        self.cache.execute_sql(&del)?;
+        ))?;
+        let n = df.height();
+        if n == 0 {
+            return Ok(());
+        }
+        let keys = as_str(df.column("cache_key")?)?;
+        let tables = as_str(df.column("table_name")?)?;
+        let mut failures: Vec<String> = Vec::new();
+        for i in 0..n {
+            let key = keys.value(i);
+            let table = tables.value(i);
+            if let Err(e) = self.drop_entry(key, table) {
+                failures.push(format!("{key}: {e}"));
+            }
+        }
+        if !failures.is_empty() {
+            return Err(crate::GgsqlError::ReaderError(format!(
+                "clear_cache: {} cache entries failed to drop: {}",
+                failures.len(),
+                failures.join("; ")
+            )));
+        }
         Ok(())
     }
 }
@@ -159,24 +365,44 @@ impl Reader for CachingReader {
         }
 
         self.ensure_meta_table()?;
+
+        // With caching disabled the memo is never consulted or written.
+        if !self.config.enabled {
+            return self.primary.execute_sql(sql);
+        }
+
         let key = self.cache_key(sql);
 
-        if let Some(table) = self.lookup_memo(&key)? {
-            return self
-                .cache
-                .execute_sql(&format!("SELECT * FROM {}", naming::quote_ident(&table)));
+        // Serve a fresh, still-present entry; on a stale or vanished entry drop
+        // it and fall through to a primary re-fetch.
+        if let Some(entry) = self.lookup_memo(&key)? {
+            let age_ms = (now_ms() - entry.fetched_at_epoch_ms).max(0);
+            let ttl_ms = (self.config.ttl_secs as i64).saturating_mul(1000);
+            if age_ms < ttl_ms {
+                let select = format!("SELECT * FROM {}", naming::quote_ident(&entry.table_name));
+                if let Ok(df) = self.cache.execute_sql(&select) {
+                    self.touch(&key)?;
+                    return Ok(df);
+                }
+            }
+            self.drop_entry(&key, &entry.table_name)?;
         }
 
         let df = self.primary.execute_sql(sql)?;
 
-        if super::returns_rows(sql) {
+        // Cache row-returning reads only.
+        if super::returns_rows(sql) && df.width() > 0 {
             let table = naming::cache_result_table(&key);
+            let byte_estimate = estimate_bytes(&df);
+            let row_count = df.height() as i64;
             self.cache.register(&table, df.clone(), true)?;
-            self.insert_memo(&key, sql, &table)?;
-        } else {
-            // A source-side write or DDL on the primary can change the data
-            // behind a memoized read; drop the memo.
-            self.clear_memo()?;
+            if let Err(e) = self.insert_memo(&key, sql, &table, byte_estimate, row_count) {
+                // The result table registered but the meta row didn't: drop the
+                // orphan so it can't leak, then surface the error.
+                let _ = self.cache.unregister(&table);
+                return Err(e);
+            }
+            self.evict_over_budget()?;
         }
 
         Ok(df)
@@ -225,6 +451,10 @@ impl Reader for CachingReader {
         true
     }
 
+    fn clear_cache(&self) -> Result<()> {
+        self.clear_memo()
+    }
+
     // Schema introspection describes the real data source, so delegate to the
     // primary; the cache only holds synthetic `__ggsql_*` tables.
     fn list_catalogs(&self) -> Result<Vec<String>> {
@@ -603,11 +833,231 @@ mod behavior_tests {
         reader.execute_sql("INSERT INTO t VALUES (4)").unwrap();
 
         let d2 = reader.execute_sql(q).unwrap();
+        assert_eq!(d2.height(), 3, "the memo is served despite the write");
+
+        // After an explicit clear, the re-read sees the inserted row.
+        reader.clear_cache().unwrap();
+        let d3 = reader.execute_sql(q).unwrap();
+        assert_eq!(d3.height(), 4, "clear_cache forces a fresh primary read");
+    }
+
+    #[test]
+    fn test_default_config_enabled_ttl_300() {
+        let reader = CachingReader::with_config(
+            Box::new(DuckDBReader::new_in_memory().unwrap()),
+            Box::new(DuckDBReader::new_in_memory().unwrap()),
+            "test://primary",
+            CacheConfig::default(),
+        );
+        assert!(reader.cache_config().enabled);
+        assert_eq!(reader.cache_config().ttl_secs, 300);
+    }
+
+    #[test]
+    fn test_repeat_query_hits_primary_once() {
+        let inner = DuckDBReader::new_in_memory().unwrap();
+        inner
+            .register("base", df! { "y" => vec![1_i64, 2, 3] }.unwrap(), true)
+            .unwrap();
+        let (primary, log) = SpyReader::wrap(Box::new(inner));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache, "test://primary");
+
+        let q = "SELECT y FROM base ORDER BY y";
+        reader.execute_sql(q).unwrap();
+        reader.execute_sql(q).unwrap();
+
+        let hits = log
+            .lock()
+            .unwrap()
+            .iter()
+            .filter(|s| s.as_str() == q)
+            .count();
+        assert_eq!(hits, 1, "the repeat read is served from the memo");
+    }
+
+    #[test]
+    fn test_ttl_zero_always_misses() {
+        let inner = DuckDBReader::new_in_memory().unwrap();
+        inner
+            .register("base", df! { "y" => vec![1_i64, 2, 3] }.unwrap(), true)
+            .unwrap();
+        let (primary, log) = SpyReader::wrap(Box::new(inner));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::with_config(
+            primary,
+            cache,
+            "test://primary",
+            CacheConfig {
+                enabled: true,
+                ttl_secs: 0,
+                max_bytes: 512 * 1024 * 1024,
+            },
+        );
+
+        let q = "SELECT y FROM base ORDER BY y";
+        reader.execute_sql(q).unwrap();
+        reader.execute_sql(q).unwrap();
+
+        let hits = log
+            .lock()
+            .unwrap()
+            .iter()
+            .filter(|s| s.as_str() == q)
+            .count();
+        assert_eq!(hits, 2, "ttl=0 must miss on every read");
+    }
+
+    #[test]
+    fn test_disabled_always_hits_primary() {
+        let inner = DuckDBReader::new_in_memory().unwrap();
+        inner
+            .register("base", df! { "y" => vec![1_i64, 2, 3] }.unwrap(), true)
+            .unwrap();
+        let (primary, log) = SpyReader::wrap(Box::new(inner));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::with_config(
+            primary,
+            cache,
+            "test://primary",
+            CacheConfig {
+                enabled: false,
+                ttl_secs: 300,
+                max_bytes: 512 * 1024 * 1024,
+            },
+        );
+
+        let q = "SELECT y FROM base ORDER BY y";
+        reader.execute_sql(q).unwrap();
+        reader.execute_sql(q).unwrap();
+
+        let hits = log
+            .lock()
+            .unwrap()
+            .iter()
+            .filter(|s| s.as_str() == q)
+            .count();
+        assert_eq!(hits, 2, "a disabled cache always hits the primary");
+        // The memo was never written.
+        let meta = reader
+            .execute_sql(&format!("SELECT * FROM {}", naming::CACHE_META_TABLE))
+            .unwrap();
+        assert_eq!(meta.height(), 0);
+    }
+
+    #[test]
+    fn test_lru_evicts_oldest_when_over_budget() {
+        // A 1-byte budget forces eviction after every insert, so each cached
+        // read is gone by the next time it's requested.
+        let inner = DuckDBReader::new_in_memory().unwrap();
+        inner
+            .register(
+                "base",
+                df! { "a" => vec![1_i64, 2, 3], "b" => vec![4_i64, 5, 6] }.unwrap(),
+                true,
+            )
+            .unwrap();
+        let (primary, log) = SpyReader::wrap(Box::new(inner));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::with_config(
+            primary,
+            cache,
+            "test://primary",
+            CacheConfig {
+                enabled: true,
+                ttl_secs: 300,
+                max_bytes: 1,
+            },
+        );
+
+        let q1 = "SELECT a FROM base ORDER BY a";
+        let q2 = "SELECT b FROM base ORDER BY b";
+        reader.execute_sql(q1).unwrap();
+        reader.execute_sql(q2).unwrap();
+        // q1 was evicted when q2 was inserted, so re-reading q1 hits the primary
+        // again.
+        reader.execute_sql(q1).unwrap();
+
+        let q1_hits = log
+            .lock()
+            .unwrap()
+            .iter()
+            .filter(|s| s.as_str() == q1)
+            .count();
         assert_eq!(
-            d2.height(),
-            4,
-            "re-read must see the inserted row, not the memo"
+            q1_hits, 2,
+            "the evicted entry is re-fetched from the primary"
         );
+        // The budget is enforced: at most one entry resides.
+        let meta = reader
+            .execute_sql(&format!("SELECT * FROM {}", naming::CACHE_META_TABLE))
+            .unwrap();
+        assert!(meta.height() <= 1, "over-budget entries are evicted");
+    }
+
+    #[test]
+    fn test_missing_cached_table_self_heals() {
+        // If the cached result table vanishes out from under the memo, the entry
+        // is dropped and the read falls through to the primary instead of erroring.
+        let inner = DuckDBReader::new_in_memory().unwrap();
+        inner
+            .register("base", df! { "y" => vec![1_i64, 2, 3] }.unwrap(), true)
+            .unwrap();
+        let (primary, log) = SpyReader::wrap(Box::new(inner));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache, "test://primary");
+
+        let q = "SELECT y FROM base ORDER BY y";
+        reader.execute_sql(q).unwrap();
+
+        // Drop the cached table directly, leaving a dangling meta row.
+        let table = reader
+            .execute_sql(&format!(
+                "SELECT table_name FROM {}",
+                naming::CACHE_META_TABLE
+            ))
+            .unwrap();
+        let table_name = crate::array_util::as_str(table.column("table_name").unwrap())
+            .unwrap()
+            .value(0)
+            .to_string();
+        reader
+            .cache
+            .execute_sql(&format!("DROP TABLE {}", naming::quote_ident(&table_name)))
+            .unwrap();
+
+        // The next read self-heals: it re-fetches from the primary.
+        let df = reader.execute_sql(q).unwrap();
+        assert_eq!(df.height(), 3);
+        let hits = log
+            .lock()
+            .unwrap()
+            .iter()
+            .filter(|s| s.as_str() == q)
+            .count();
+        assert_eq!(hits, 2, "the missing table forced a primary re-fetch");
+    }
+
+    #[test]
+    fn test_parse_human_bytes() {
+        assert_eq!(parse_human_bytes("1024"), Some(1024));
+        assert_eq!(parse_human_bytes("512mb"), Some(512 * 1024 * 1024));
+        assert_eq!(parse_human_bytes("1GB"), Some(1024 * 1024 * 1024));
+        assert_eq!(parse_human_bytes(" 2kb "), Some(2 * 1024));
+        assert_eq!(parse_human_bytes("nonsense"), None);
+    }
+
+    #[test]
+    fn test_config_merge_uri_wins() {
+        let base = CacheConfig::default();
+        let merged = base.merge(CacheConfigOverride {
+            enabled: Some(false),
+            ttl_secs: Some(60),
+            max_bytes: None,
+        });
+        assert!(!merged.enabled);
+        assert_eq!(merged.ttl_secs, 60);
+        assert_eq!(merged.max_bytes, 512 * 1024 * 1024);
     }
 
     #[test]
diff --git a/src/reader/mod.rs b/src/reader/mod.rs
index 34a63f306..aaa961a83 100644
--- a/src/reader/mod.rs
+++ b/src/reader/mod.rs
@@ -856,6 +856,11 @@ pub trait Reader {
         false
     }
 
+    /// Clear any cached query results held by this reader.
+    fn clear_cache(&self) -> Result<()> {
+        Ok(())
+    }
+
     // =========================================================================
     // Schema introspection
     // =========================================================================

From fc162a6dddaefb8083c6d9448657ba78c3715493 Mon Sep 17 00:00:00 2001
From: George Stagg <george.stagg@posit.co>
Date: Tue, 30 Jun 2026 15:31:35 +0100
Subject: [PATCH 08/15] Add -- @uncache and make cell meta-commands
 line-oriented

---
 ggsql-jupyter/src/executor.rs | 163 +++++++++++++++++++++++++++++-----
 ggsql-jupyter/src/kernel.rs   |  15 ++--
 2 files changed, 151 insertions(+), 27 deletions(-)

diff --git a/ggsql-jupyter/src/executor.rs b/ggsql-jupyter/src/executor.rs
index fab09a9e4..18f6e0116 100644
--- a/ggsql-jupyter/src/executor.rs
+++ b/ggsql-jupyter/src/executor.rs
@@ -2,7 +2,8 @@
 //!
 //! This module handles the execution of ggsql queries using the existing
 //! ggsql library components (parser, DuckDB reader, Vega-Lite writer).
-//! It supports dynamic reader switching via `-- @connect:` meta-commands.
+//! It supports leading `--` meta-command lines. Each occupies its own comment
+//! line, so a cell may stack them above a query that then runs as normal.
 
 use anyhow::Result;
 use ggsql::{
@@ -25,7 +26,7 @@ pub enum ExecutionResult {
         spec: String, // Vega-Lite JSON
     },
     /// Connection changed via meta-command
-    ConnectionChanged { uri: String, display_name: String },
+    ConnectionChanged { display_name: String },
 }
 
 /// Generate a human-readable display name for a connection URI.
@@ -101,13 +102,49 @@ pub fn host_for_uri(uri: &str) -> String {
 
 /// The `-- @connect:` meta-command prefix.
 const META_CONNECT_PREFIX: &str = "-- @connect:";
+/// The `-- @uncache` meta-command prefix.
+const META_UNCACHE_PREFIX: &str = "-- @uncache";
+
+/// A leading cell directive expressed as a `--` line comment.
+#[derive(Debug, PartialEq, Eq)]
+pub enum MetaCommand {
+    /// Switch the active reader to the given connection URI.
+    Connect(String),
+    /// Clear the active reader's cache.
+    Uncache,
+}
+
+/// Split `code` into its first line and the remainder.
+/// Handles `\n`, `\r\n`, and a lone `\r`.
+fn split_first_line(code: &str) -> (&str, &str) {
+    match code.find(['\n', '\r']) {
+        None => (code, ""),
+        Some(i) => {
+            let line = &code[..i];
+            let rest = &code[i..];
+            let rest = rest
+                .strip_prefix("\r\n")
+                .or_else(|| rest.strip_prefix('\n'))
+                .or_else(|| rest.strip_prefix('\r'))
+                .unwrap_or(rest);
+            (line, rest)
+        }
+    }
+}
 
-/// Parse a `-- @connect: <uri>` meta-command, returning the URI if present.
-pub fn parse_meta_command(code: &str) -> Option<String> {
-    let trimmed = code.trim();
-    trimmed
-        .strip_prefix(META_CONNECT_PREFIX)
-        .map(|rest| rest.trim().to_string())
+/// Peel a single leading meta-command from `code`, returning it together with
+/// the rest of the cell to process next.
+pub fn take_leading_meta(code: &str) -> Option<(MetaCommand, &str)> {
+    let trimmed = code.trim_start();
+    let (line, rest) = split_first_line(trimmed);
+    let line = line.trim();
+    if let Some(uri) = line.strip_prefix(META_CONNECT_PREFIX) {
+        return Some((MetaCommand::Connect(uri.trim().to_string()), rest));
+    }
+    if line == META_UNCACHE_PREFIX {
+        return Some((MetaCommand::Uncache, rest));
+    }
+    None
 }
 
 /// Query executor maintaining persistent database connection
@@ -158,18 +195,38 @@ impl QueryExecutor {
     /// Execute a ggsql query or meta-command
     ///
     /// This handles:
-    /// - `-- @connect: <uri>` meta-commands for switching readers
+    /// - `-- @` meta-commands
     /// - Pure SQL queries (no VISUALISE)
     /// - ggsql queries with VISUALISE clauses
     pub fn execute(&mut self, code: &str) -> Result<ExecutionResult> {
         tracing::debug!("Executing query: {} chars", code.len());
 
-        // Check for meta-commands first
-        if let Some(uri) = parse_meta_command(code) {
-            tracing::info!("Meta-command: switching reader to {}", uri);
-            self.swap_reader(&uri)?;
-            let display_name = display_name_for_uri(&uri);
-            return Ok(ExecutionResult::ConnectionChanged { uri, display_name });
+        // Apply any leading meta-command lines, then run whatever SQL remains.
+        let mut code = code;
+        let mut last_connect: Option<String> = None;
+        while let Some((cmd, rest)) = take_leading_meta(code) {
+            match cmd {
+                MetaCommand::Connect(uri) => {
+                    tracing::info!("Meta-command: switching reader to {}", uri);
+                    self.swap_reader(&uri)?;
+                    last_connect = Some(uri);
+                }
+                MetaCommand::Uncache => {
+                    tracing::info!("Meta-command: clearing cache");
+                    self.reader.clear_cache()?;
+                }
+            }
+            code = rest;
+        }
+
+        // A cell that was nothing but meta-commands.
+        if code.trim().is_empty() {
+            if let Some(uri) = last_connect {
+                let display_name = display_name_for_uri(&uri);
+                return Ok(ExecutionResult::ConnectionChanged { display_name });
+            }
+            // An empty DataFrame renders no cell output.
+            return Ok(ExecutionResult::DataFrame(DataFrame::empty()));
         }
 
         // 1. Validate to check if there's a visualization
@@ -238,16 +295,44 @@ mod tests {
     }
 
     #[test]
-    fn test_parse_meta_command() {
+    fn test_take_leading_meta_connect() {
+        // `-- @connect:` takes the rest of its line as the URI; the next line is
+        // the remainder.
+        assert_eq!(
+            take_leading_meta("-- @connect: duckdb://memory"),
+            Some((MetaCommand::Connect("duckdb://memory".to_string()), ""))
+        );
+        assert_eq!(
+            take_leading_meta("  -- @connect:  duckdb://my.db  \nSELECT 1"),
+            Some((
+                MetaCommand::Connect("duckdb://my.db".to_string()),
+                "SELECT 1"
+            ))
+        );
+    }
+
+    #[test]
+    fn test_take_leading_meta_uncache() {
+        assert_eq!(
+            take_leading_meta("-- @uncache"),
+            Some((MetaCommand::Uncache, ""))
+        );
         assert_eq!(
-            parse_meta_command("-- @connect: duckdb://memory"),
-            Some("duckdb://memory".to_string())
+            take_leading_meta("-- @uncache\nSELECT 1"),
+            Some((MetaCommand::Uncache, "SELECT 1"))
         );
         assert_eq!(
-            parse_meta_command("  -- @connect:  duckdb://my.db  "),
-            Some("duckdb://my.db".to_string())
+            take_leading_meta("-- @uncache  \r\nSELECT 1"),
+            Some((MetaCommand::Uncache, "SELECT 1"))
         );
-        assert_eq!(parse_meta_command("SELECT 1"), None);
+        // `-- @uncache foo` on one line is an ordinary SQL comment, not the directive.
+        assert_eq!(take_leading_meta("-- @uncache foo"), None);
+    }
+
+    #[test]
+    fn test_take_leading_meta_non_directive() {
+        assert_eq!(take_leading_meta("SELECT 1"), None);
+        assert_eq!(take_leading_meta("-- a normal comment\nSELECT 1"), None);
     }
 
     #[test]
@@ -259,6 +344,42 @@ mod tests {
         assert!(matches!(result, ExecutionResult::ConnectionChanged { .. }));
     }
 
+    #[test]
+    fn test_connect_then_runs_remaining_query() {
+        // A leading `-- @connect:` switches the reader and still runs the query
+        // below it in the same cell.
+        let mut executor = QueryExecutor::new().unwrap();
+        let result = executor
+            .execute(
+                "-- @connect: duckdb://memory\nSELECT 1 AS x, 2 AS y VISUALISE x, y DRAW point",
+            )
+            .unwrap();
+        assert_eq!(executor.reader_uri(), "duckdb://memory");
+        assert!(matches!(result, ExecutionResult::Visualization { .. }));
+    }
+
+    #[test]
+    fn test_uncache_meta_command_clears_cache() {
+        // On the default reader (no cache) `clear_cache` is a no-op; this proves
+        // the dispatch arm is wired and yields an empty DataFrame.
+        let mut executor = QueryExecutor::new().unwrap();
+        let result = executor.execute("-- @uncache").unwrap();
+        match result {
+            ExecutionResult::DataFrame(df) => assert_eq!(df.width(), 0),
+            other => panic!("expected empty DataFrame, got {other:?}"),
+        }
+    }
+
+    #[test]
+    fn test_uncache_then_runs_remaining_query() {
+        // A leading `-- @uncache` clears the cache and still runs the query below.
+        let mut executor = QueryExecutor::new().unwrap();
+        let result = executor
+            .execute("-- @uncache\nSELECT 1 AS x, 2 AS y VISUALISE x, y DRAW point")
+            .unwrap();
+        assert!(matches!(result, ExecutionResult::Visualization { .. }));
+    }
+
     #[test]
     fn test_display_name_for_uri() {
         assert_eq!(display_name_for_uri("duckdb://memory"), "DuckDB (memory)");
diff --git a/ggsql-jupyter/src/kernel.rs b/ggsql-jupyter/src/kernel.rs
index b9844c602..85e82d0ad 100644
--- a/ggsql-jupyter/src/kernel.rs
+++ b/ggsql-jupyter/src/kernel.rs
@@ -323,21 +323,24 @@ impl KernelServer {
         }
 
         // Execute the query
+        let uri_before = self.executor.reader_uri().to_string();
         let result = self.executor.execute(code);
+        let connection_changed = self.executor.reader_uri() != uri_before;
+        if connection_changed {
+            let uri = self.executor.reader_uri().to_string();
+            self.open_connection_comm(&uri).await?;
+        }
 
         match result {
             Ok(exec_result) => {
-                // If the connection changed, open a new connection comm
-                let is_connection_changed =
+                // A bare connection change renders nothing.
+                let suppress_output =
                     matches!(&exec_result, ExecutionResult::ConnectionChanged { .. });
-                if let ExecutionResult::ConnectionChanged { ref uri, .. } = &exec_result {
-                    self.open_connection_comm(uri).await?;
-                }
 
                 // Send execute_result (not display_data)
                 // Per Jupyter spec: execute_result includes execution_count
                 // Only send if there's something to display (DDL returns None)
-                if !silent && !is_connection_changed {
+                if !silent && !suppress_output {
                     if let Some(display_data) = format_display_data(exec_result, &hints) {
                         // Build message content, including output_location if present
                         let mut content = json!({

From 4f2cd554d2cb2066268ef6a53c3e6e07acc64c92 Mon Sep 17 00:00:00 2001
From: George Stagg <george.stagg@posit.co>
Date: Tue, 30 Jun 2026 15:55:30 +0100
Subject: [PATCH 09/15] Configure the cache via URI query parameters

---
 CHANGELOG.md             |  2 +
 src/CLAUDE.md            |  2 +-
 src/reader/connection.rs | 99 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 101 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 327a651f7..916238567 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,8 @@
 - New caching layer that wraps any `Reader` with an in-memory, writeable cache
   backend (currently duckdb or sqlite), making write-constrained databases
   usable and avoiding repeated remote reads during interactive iteration.
+  Memoized reads are bounded by a TTL and an LRU byte budget, configurable per
+  connection. The cache can be cleared mid-session with the `-- @uncache` meta-command.
 
 ## 0.4.1 - 2026-06-22
 
diff --git a/src/CLAUDE.md b/src/CLAUDE.md
index 5f94c2589..321a2fdcf 100644
--- a/src/CLAUDE.md
+++ b/src/CLAUDE.md
@@ -52,7 +52,7 @@ Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding
 
 `SqlDialect` trait in `mod.rs` lets each driver supply its own type names, information-schema queries, and spatial helper methods (`sql_st_transform`, `sql_geometry_to_wkb`, `sql_geometry_bbox`, `sql_ensure_geometry`, `sql_select_replace`, `sql_spatial_setup`).
 
-**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins, the `__ggsql_cache_meta__` table, and reads that reference a cache-resident internal table, which go to the cache. **`execute_sql_cached` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache; it defaults to `execute_sql` so a plain reader runs everything on one connection. Cache routing is by **exact-identifier membership** in the set of tables registered into the cache. Memoization keys on `hash(primary_uri + sql)` and is tracked in the `__ggsql_cache_meta__` table inside the cache backend. Each memoized read is bounded by a **TTL** (default 300s) and the whole memo by an **LRU byte budget** (default 512 MB); both are configurable via `CacheConfig` (env `GGSQL_CACHE_DISABLED`/`GGSQL_CACHE_TTL`/`GGSQL_CACHE_MAX_BYTES`). Pure/non-visual SQL (CLI table fallback, Jupyter) goes through `execute_sql` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `<primary>+<cache>://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default.
+**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins, the `__ggsql_cache_meta__` table, and reads that reference a cache-resident internal table, which go to the cache. **`execute_sql_cached` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache; it defaults to `execute_sql` so a plain reader runs everything on one connection. Cache routing is by **exact-identifier membership** in the set of tables registered into the cache. Memoization keys on `hash(primary_uri + sql)` and is tracked in the `__ggsql_cache_meta__` table inside the cache backend. Each memoized read is bounded by a **TTL** (default 300s) and the whole memo by an **LRU byte budget** (default 512 MB); both are configurable via `CacheConfig` (env `GGSQL_CACHE_DISABLED`/`GGSQL_CACHE_TTL`/`GGSQL_CACHE_MAX_BYTES`, or per-connection URI query parameters `?ttl=…&max_bytes=…&disabled=…`. The `__ggsql_cache_meta__` table is queryable for introspection (`SELECT * FROM __ggsql_cache_meta__`). Pure/non-visual SQL (CLI table fallback, Jupyter) goes through `execute_sql` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `<primary>+<cache>://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default.
 
 ### `execute/`
 
diff --git a/src/reader/connection.rs b/src/reader/connection.rs
index b699e60f3..7c1fba364 100644
--- a/src/reader/connection.rs
+++ b/src/reader/connection.rs
@@ -29,6 +29,45 @@ pub fn split_cache_uri(uri: &str) -> Option<(String, String)> {
     Some((format!("{}://{}", primary, rest), cache.to_string()))
 }
 
+/// Cache-config keys recognised in a connection URI's trailing `?` query string.
+#[cfg(any(feature = "duckdb", feature = "sqlite"))]
+const KNOWN_CACHE_PARAMS: &[&str] = &["ttl", "max_bytes", "disabled"];
+
+/// Pull cache-config keys out of a connection URI's trailing `?key=value&…`
+/// query string, returning the URI with those keys removed plus the overrides.
+#[cfg(any(feature = "duckdb", feature = "sqlite"))]
+fn strip_cache_params(uri: &str) -> (String, crate::reader::cache::CacheConfigOverride) {
+    use crate::reader::cache::{parse_human_bytes, CacheConfigOverride};
+
+    let mut over = CacheConfigOverride::default();
+    let Some((body, query)) = uri.split_once('?') else {
+        return (uri.to_string(), over);
+    };
+
+    let mut kept: Vec<&str> = Vec::new();
+    for segment in query.split('&') {
+        match segment.split_once('=') {
+            Some((key, value)) if KNOWN_CACHE_PARAMS.contains(&key) => match key {
+                "ttl" => over.ttl_secs = value.trim().parse::<u64>().ok(),
+                "max_bytes" => over.max_bytes = parse_human_bytes(value),
+                "disabled" => {
+                    let v = value.trim().to_ascii_lowercase();
+                    over.enabled = Some(!matches!(v.as_str(), "1" | "true" | "yes"));
+                }
+                _ => unreachable!("validated against KNOWN_CACHE_PARAMS"),
+            },
+            // Not a cache key: keep it on the URI for the primary reader.
+            _ => kept.push(segment),
+        }
+    }
+
+    if kept.is_empty() {
+        (body.to_string(), over)
+    } else {
+        (format!("{}?{}", body, kept.join("&")), over)
+    }
+}
+
 /// Map a cache-backend scheme to its in-memory connection URI.
 #[cfg(any(feature = "duckdb", feature = "sqlite"))]
 fn cache_uri(scheme: &str) -> Result<&'static str> {
@@ -105,12 +144,17 @@ pub fn reader_from_uri(uri: &str) -> Result<Box<dyn Reader + Send>> {
     if let Some((primary_uri, cache_scheme)) = split_cache_uri(uri) {
         #[cfg(any(feature = "duckdb", feature = "sqlite"))]
         {
+            use crate::reader::cache::CacheConfig;
+
+            let (primary_uri, over) = strip_cache_params(&primary_uri);
+            let config = CacheConfig::from_env().merge(over);
             let primary = build_reader(&primary_uri)?;
             let cache = build_reader(cache_uri(&cache_scheme)?)?;
-            return Ok(Box::new(crate::reader::CachingReader::new(
+            return Ok(Box::new(crate::reader::CachingReader::with_config(
                 primary,
                 cache,
                 primary_uri,
+                config,
             )));
         }
         #[cfg(not(any(feature = "duckdb", feature = "sqlite")))]
@@ -217,4 +261,57 @@ mod tests {
         assert_eq!(split_cache_uri("+duckdb://x"), None);
         assert_eq!(split_cache_uri("odbc+://x"), None);
     }
+
+    #[cfg(any(feature = "duckdb", feature = "sqlite"))]
+    #[test]
+    fn test_strip_cache_params_parses_known_keys() {
+        let (uri, over) = strip_cache_params("duckdb://memory?ttl=600");
+        assert_eq!(uri, "duckdb://memory");
+        assert_eq!(over.ttl_secs, Some(600));
+        assert_eq!(over.max_bytes, None);
+        assert_eq!(over.enabled, None);
+
+        let (uri, over) = strip_cache_params("duckdb://memory?max_bytes=256mb&disabled=true");
+        assert_eq!(uri, "duckdb://memory");
+        assert_eq!(over.max_bytes, Some(256 * 1024 * 1024));
+        assert_eq!(over.enabled, Some(false));
+    }
+
+    #[cfg(any(feature = "duckdb", feature = "sqlite"))]
+    #[test]
+    fn test_strip_cache_params_keeps_non_cache_segments() {
+        // A non-cache `?key=` tail contributes no overrides and is left in place.
+        let (uri, over) = strip_cache_params("odbc://DSN=foo?warehouse=PROD");
+        assert_eq!(uri, "odbc://DSN=foo?warehouse=PROD");
+        assert_eq!(over.ttl_secs, None);
+
+        // ODBC body with `=`/`;` and no `?` is returned verbatim.
+        let (uri, over) = strip_cache_params("odbc://Driver=Snowflake;Server=x");
+        assert_eq!(uri, "odbc://Driver=Snowflake;Server=x");
+        assert_eq!(over.enabled, None);
+
+        // Cache keys are extracted; other params (e.g. ODBC settings) are kept.
+        let (uri, over) = strip_cache_params("odbc://DSN=foo?warehouse=PROD&ttl=10&max_bytes=8mb");
+        assert_eq!(uri, "odbc://DSN=foo?warehouse=PROD");
+        assert_eq!(over.ttl_secs, Some(10));
+        assert_eq!(over.max_bytes, Some(8 * 1024 * 1024));
+
+        // When every param is a cache key, the `?` is dropped entirely.
+        let (uri, over) = strip_cache_params("duckdb://memory?ttl=10&disabled=1");
+        assert_eq!(uri, "duckdb://memory");
+        assert_eq!(over.ttl_secs, Some(10));
+        assert_eq!(over.enabled, Some(false));
+
+        // Plain URI, no query string.
+        let (uri, over) = strip_cache_params("duckdb://memory");
+        assert_eq!(uri, "duckdb://memory");
+        assert_eq!(over.ttl_secs, None);
+    }
+
+    #[cfg(all(feature = "duckdb", feature = "sqlite"))]
+    #[test]
+    fn test_reader_from_uri_applies_uri_cache_params() {
+        // A composite URI with a cache-param tail builds a CachingReader
+        assert!(reader_from_uri("duckdb+sqlite://memory?ttl=600&max_bytes=64mb").is_ok());
+    }
 }

From 55b04c22469d1a739cb0ffb2a3cfc0bd99fbd1b7 Mon Sep 17 00:00:00 2001
From: George Stagg <george.stagg@posit.co>
Date: Wed, 1 Jul 2026 09:38:06 +0100
Subject: [PATCH 10/15] Namespace cache query params

---
 src/CLAUDE.md            |  2 +-
 src/reader/connection.rs | 24 ++++++++++++++----------
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/src/CLAUDE.md b/src/CLAUDE.md
index 321a2fdcf..c674b9808 100644
--- a/src/CLAUDE.md
+++ b/src/CLAUDE.md
@@ -52,7 +52,7 @@ Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding
 
 `SqlDialect` trait in `mod.rs` lets each driver supply its own type names, information-schema queries, and spatial helper methods (`sql_st_transform`, `sql_geometry_to_wkb`, `sql_geometry_bbox`, `sql_ensure_geometry`, `sql_select_replace`, `sql_spatial_setup`).
 
-**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins, the `__ggsql_cache_meta__` table, and reads that reference a cache-resident internal table, which go to the cache. **`execute_sql_cached` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache; it defaults to `execute_sql` so a plain reader runs everything on one connection. Cache routing is by **exact-identifier membership** in the set of tables registered into the cache. Memoization keys on `hash(primary_uri + sql)` and is tracked in the `__ggsql_cache_meta__` table inside the cache backend. Each memoized read is bounded by a **TTL** (default 300s) and the whole memo by an **LRU byte budget** (default 512 MB); both are configurable via `CacheConfig` (env `GGSQL_CACHE_DISABLED`/`GGSQL_CACHE_TTL`/`GGSQL_CACHE_MAX_BYTES`, or per-connection URI query parameters `?ttl=…&max_bytes=…&disabled=…`. The `__ggsql_cache_meta__` table is queryable for introspection (`SELECT * FROM __ggsql_cache_meta__`). Pure/non-visual SQL (CLI table fallback, Jupyter) goes through `execute_sql` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `<primary>+<cache>://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default.
+**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins, the `__ggsql_cache_meta__` table, and reads that reference a cache-resident internal table, which go to the cache. **`execute_sql_cached` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache; it defaults to `execute_sql` so a plain reader runs everything on one connection. Cache routing is by **exact-identifier membership** in the set of tables registered into the cache. Memoization keys on `hash(primary_uri + sql)` and is tracked in the `__ggsql_cache_meta__` table inside the cache backend. Each memoized read is bounded by a **TTL** (default 300s) and the whole memo by an **LRU byte budget** (default 512 MB); both are configurable via `CacheConfig` (env `GGSQL_CACHE_DISABLED`/`GGSQL_CACHE_TTL`/`GGSQL_CACHE_MAX_BYTES`, or per-connection URI query parameters `?cache_ttl=…&cache_max_bytes=…&cache_disabled=…`). The `__ggsql_cache_meta__` table is queryable for introspection (`SELECT * FROM __ggsql_cache_meta__`). Pure/non-visual SQL (CLI table fallback, Jupyter) goes through `execute_sql` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `<primary>+<cache>://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default.
 
 ### `execute/`
 
diff --git a/src/reader/connection.rs b/src/reader/connection.rs
index 7c1fba364..557712c73 100644
--- a/src/reader/connection.rs
+++ b/src/reader/connection.rs
@@ -31,7 +31,7 @@ pub fn split_cache_uri(uri: &str) -> Option<(String, String)> {
 
 /// Cache-config keys recognised in a connection URI's trailing `?` query string.
 #[cfg(any(feature = "duckdb", feature = "sqlite"))]
-const KNOWN_CACHE_PARAMS: &[&str] = &["ttl", "max_bytes", "disabled"];
+const KNOWN_CACHE_PARAMS: &[&str] = &["cache_ttl", "cache_max_bytes", "cache_disabled"];
 
 /// Pull cache-config keys out of a connection URI's trailing `?key=value&…`
 /// query string, returning the URI with those keys removed plus the overrides.
@@ -48,9 +48,9 @@ fn strip_cache_params(uri: &str) -> (String, crate::reader::cache::CacheConfigOv
     for segment in query.split('&') {
         match segment.split_once('=') {
             Some((key, value)) if KNOWN_CACHE_PARAMS.contains(&key) => match key {
-                "ttl" => over.ttl_secs = value.trim().parse::<u64>().ok(),
-                "max_bytes" => over.max_bytes = parse_human_bytes(value),
-                "disabled" => {
+                "cache_ttl" => over.ttl_secs = value.trim().parse::<u64>().ok(),
+                "cache_max_bytes" => over.max_bytes = parse_human_bytes(value),
+                "cache_disabled" => {
                     let v = value.trim().to_ascii_lowercase();
                     over.enabled = Some(!matches!(v.as_str(), "1" | "true" | "yes"));
                 }
@@ -265,13 +265,14 @@ mod tests {
     #[cfg(any(feature = "duckdb", feature = "sqlite"))]
     #[test]
     fn test_strip_cache_params_parses_known_keys() {
-        let (uri, over) = strip_cache_params("duckdb://memory?ttl=600");
+        let (uri, over) = strip_cache_params("duckdb://memory?cache_ttl=600");
         assert_eq!(uri, "duckdb://memory");
         assert_eq!(over.ttl_secs, Some(600));
         assert_eq!(over.max_bytes, None);
         assert_eq!(over.enabled, None);
 
-        let (uri, over) = strip_cache_params("duckdb://memory?max_bytes=256mb&disabled=true");
+        let (uri, over) =
+            strip_cache_params("duckdb://memory?cache_max_bytes=256mb&cache_disabled=true");
         assert_eq!(uri, "duckdb://memory");
         assert_eq!(over.max_bytes, Some(256 * 1024 * 1024));
         assert_eq!(over.enabled, Some(false));
@@ -291,13 +292,14 @@ mod tests {
         assert_eq!(over.enabled, None);
 
         // Cache keys are extracted; other params (e.g. ODBC settings) are kept.
-        let (uri, over) = strip_cache_params("odbc://DSN=foo?warehouse=PROD&ttl=10&max_bytes=8mb");
-        assert_eq!(uri, "odbc://DSN=foo?warehouse=PROD");
+        let (uri, over) =
+            strip_cache_params("odbc://DSN=foo?ttl=99&cache_ttl=10&cache_max_bytes=8mb");
+        assert_eq!(uri, "odbc://DSN=foo?ttl=99");
         assert_eq!(over.ttl_secs, Some(10));
         assert_eq!(over.max_bytes, Some(8 * 1024 * 1024));
 
         // When every param is a cache key, the `?` is dropped entirely.
-        let (uri, over) = strip_cache_params("duckdb://memory?ttl=10&disabled=1");
+        let (uri, over) = strip_cache_params("duckdb://memory?cache_ttl=10&cache_disabled=1");
         assert_eq!(uri, "duckdb://memory");
         assert_eq!(over.ttl_secs, Some(10));
         assert_eq!(over.enabled, Some(false));
@@ -312,6 +314,8 @@ mod tests {
     #[test]
     fn test_reader_from_uri_applies_uri_cache_params() {
         // A composite URI with a cache-param tail builds a CachingReader
-        assert!(reader_from_uri("duckdb+sqlite://memory?ttl=600&max_bytes=64mb").is_ok());
+        assert!(
+            reader_from_uri("duckdb+sqlite://memory?cache_ttl=600&cache_max_bytes=64mb").is_ok()
+        );
     }
 }

From e8b6991fce907a93d2fbd52bba0ec2e248477caa Mon Sep 17 00:00:00 2001
From: George Stagg <george.stagg@posit.co>
Date: Wed, 1 Jul 2026 09:38:25 +0100
Subject: [PATCH 11/15] Remove now-irrelevant test

---
 src/reader/cache.rs | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/src/reader/cache.rs b/src/reader/cache.rs
index 7b7a14ce9..1d57ba0af 100644
--- a/src/reader/cache.rs
+++ b/src/reader/cache.rs
@@ -815,32 +815,6 @@ mod behavior_tests {
         assert_eq!(df.height(), 3);
     }
 
-    #[test]
-    fn test_source_write_invalidates_memo() {
-        // Memoize a base read, mutate the primary, then re-read: the memo must be
-        // invalidated by the non-row-returning statement so the second read is fresh.
-        let primary = DuckDBReader::new_in_memory().unwrap();
-        primary
-            .register("t", df! { "v" => vec![1_i64, 2, 3] }.unwrap(), true)
-            .unwrap();
-        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
-        let reader = CachingReader::new(Box::new(primary), cache, "test://primary");
-
-        let q = "SELECT v FROM t";
-        let d1 = reader.execute_sql(q).unwrap();
-        assert_eq!(d1.height(), 3);
-
-        reader.execute_sql("INSERT INTO t VALUES (4)").unwrap();
-
-        let d2 = reader.execute_sql(q).unwrap();
-        assert_eq!(d2.height(), 3, "the memo is served despite the write");
-
-        // After an explicit clear, the re-read sees the inserted row.
-        reader.clear_cache().unwrap();
-        let d3 = reader.execute_sql(q).unwrap();
-        assert_eq!(d3.height(), 4, "clear_cache forces a fresh primary read");
-    }
-
     #[test]
     fn test_default_config_enabled_ttl_300() {
         let reader = CachingReader::with_config(

From 70d387224896362905de65bdc7457535bab094cc Mon Sep 17 00:00:00 2001
From: George Stagg <george.stagg@posit.co>
Date: Wed, 1 Jul 2026 09:38:58 +0100
Subject: [PATCH 12/15] Switch ordering in drop_entry

---
 src/reader/cache.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/reader/cache.rs b/src/reader/cache.rs
index 1d57ba0af..3e4e962bc 100644
--- a/src/reader/cache.rs
+++ b/src/reader/cache.rs
@@ -265,15 +265,15 @@ impl CachingReader {
         Ok(())
     }
 
-    /// Drop a single memo entry: delete its meta row and unregister the table.
+    /// Drop a single memo entry: unregister the table, then delete its meta row.
     fn drop_entry(&self, key: &str, table: &str) -> Result<()> {
+        self.cache.unregister(table)?;
         let del = format!(
             "DELETE FROM {} WHERE cache_key = {}",
             naming::quote_ident(naming::CACHE_META_TABLE),
             naming::quote_literal(key),
         );
         self.cache.execute_sql(&del)?;
-        let _ = self.cache.unregister(table);
         Ok(())
     }
 

From e750c14bec5ff26c5ecce3ae8a08a5ef70803022 Mon Sep 17 00:00:00 2001
From: George Stagg <george.stagg@posit.co>
Date: Wed, 1 Jul 2026 11:59:32 +0100
Subject: [PATCH 13/15] Stage primary sources when executing a query with mixed
 residency tables

---
 src/execute/cte.rs  | 228 ++++++++++++++++++++++++++++++++++++++++++--
 src/execute/mod.rs  |   3 +-
 src/naming.rs       |  35 +++++++
 src/reader/cache.rs |  78 +++++++++++++++
 4 files changed, 333 insertions(+), 11 deletions(-)

diff --git a/src/execute/cte.rs b/src/execute/cte.rs
index e843fa074..5b4522282 100644
--- a/src/execute/cte.rs
+++ b/src/execute/cte.rs
@@ -130,6 +130,80 @@ pub fn transform_cte_references(sql: &str, cte_names: &HashSet<String>) -> Strin
     result
 }
 
+/// Stage the primary base tables in a body destined for the cache.
+///
+/// A body that is entirely primary (no cache-resident reference) or entirely
+/// cache-resident is returned unchanged.
+pub fn transform_source_references(sql: &str, reader: &dyn Reader) -> Result<String> {
+    if !reader.caches_sources() {
+        return Ok(sql.to_string());
+    }
+
+    // Discover the names referenced after FROM/JOIN — including quoted (`"foo"`)
+    // and schema-qualified (`schema.table`) names. Subqueries contribute no name.
+    let re = match regex::Regex::new(
+        r#"(?i)\b(?:FROM|JOIN)\s+("(?:[^"]|"")*"|[A-Za-z_][A-Za-z0-9_$]*(?:\.[A-Za-z_][A-Za-z0-9_$]*)*)"#,
+    ) {
+        Ok(re) => re,
+        Err(_) => return Ok(sql.to_string()),
+    };
+
+    let mut seen = HashSet::new();
+    let mut refs: Vec<String> = Vec::new();
+    for caps in re.captures_iter(sql) {
+        if let Some(m) = caps.get(1) {
+            let name = m.as_str().to_string();
+            if seen.insert(name.clone()) {
+                refs.push(name);
+            }
+        }
+    }
+
+    // A name resolves against the cache backend (rather than the primary) if it
+    // is an internal `__ggsql_*` table or a `ggsql:` builtin dataset.
+    let is_cache_resolvable = |name: &str| {
+        naming::is_internal_table(&naming::unquote_ident(name)) || name.starts_with("ggsql:")
+    };
+
+    let has_cache_ref = refs.iter().any(|r| is_cache_resolvable(r));
+    let primary_refs: Vec<&String> = refs.iter().filter(|r| !is_cache_resolvable(r)).collect();
+
+    // Only mixed bodies need staging.
+    if !has_cache_ref || primary_refs.is_empty() {
+        return Ok(sql.to_string());
+    }
+
+    let mut result = sql.to_string();
+    for (index, raw) in primary_refs.into_iter().enumerate() {
+        let staged = naming::staged_source_table(index);
+        reader.materialize_table(&staged, &[], &format!("SELECT * FROM {}", raw))?;
+        let target = naming::quote_ident(&staged);
+
+        // Rewrite FROM/JOIN/qualified references to the staged copy.
+        let patterns = [
+            (
+                format!(r"(?i)(\bFROM\s+){}(\s|,|\)|$)", regex::escape(raw)),
+                format!("${{1}}{}${{2}}", target),
+            ),
+            (
+                format!(r"(?i)(\bJOIN\s+){}(\s|,|\)|$)", regex::escape(raw)),
+                format!("${{1}}{}${{2}}", target),
+            ),
+            (
+                format!(r"(?i)\b{}(\.[a-zA-Z_][a-zA-Z0-9_]*)", regex::escape(raw)),
+                format!("{}${{1}}", target),
+            ),
+        ];
+        for (pattern, replacement) in patterns {
+            if let Ok(re) = regex::Regex::new(&pattern) {
+                result = re.replace_all(&result, replacement.as_str()).to_string();
+            }
+        }
+    }
+
+    Ok(result)
+}
+
 /// Materialize CTEs as temporary tables in the database
 ///
 /// Creates a temp table for each CTE in declaration order. When a CTE
@@ -143,6 +217,8 @@ pub fn materialize_ctes(ctes: &[CteDefinition], reader: &dyn Reader) -> Result<H
     for cte in ctes {
         // Transform the CTE body to replace references to earlier CTEs
         let transformed_body = transform_cte_references(&cte.body, &materialized);
+        // Stage any primary base tables this body joins against into the cache.
+        let transformed_body = transform_source_references(&transformed_body, reader)?;
 
         let temp_table_name = naming::cte_table(&cte.name);
 
@@ -213,7 +289,8 @@ pub fn split_with_query(source_tree: &SourceTree) -> Option<(String, String)> {
 pub fn transform_global_sql(
     source_tree: &SourceTree,
     materialized_ctes: &HashSet<String>,
-) -> (Vec<String>, Option<String>) {
+    reader: &dyn Reader,
+) -> Result<(Vec<String>, Option<String>)> {
     // Collect side-effect statements (CREATE, INSERT, UPDATE, DELETE) that
     // need to run before the main query. These appear alongside a trailing
     // SELECT or VISUALISE FROM.
@@ -245,14 +322,15 @@ pub fn transform_global_sql(
         });
 
     if let Some(select_sql) = select_sql {
-        return (
+        let select_sql = transform_cte_references(&select_sql, materialized_ctes);
+        return Ok((
             side_effects,
-            Some(transform_cte_references(&select_sql, materialized_ctes)),
-        );
+            Some(transform_source_references(&select_sql, reader)?),
+        ));
     }
 
     if !has_executable_sql(source_tree) {
-        return (vec![], None);
+        return Ok((vec![], None));
     }
 
     // We have non-SELECT executable SQL and/or VISUALISE FROM.
@@ -266,18 +344,24 @@ pub fn transform_global_sql(
         )
         .map(|table| {
             let q = format!("SELECT * FROM {}", table);
-            transform_cte_references(&q, materialized_ctes)
-        });
+            let q = transform_cte_references(&q, materialized_ctes);
+            transform_source_references(&q, reader)
+        })
+        .transpose()?;
 
     if !side_effects.is_empty() || viz_from_query.is_some() {
-        (side_effects, viz_from_query)
+        Ok((side_effects, viz_from_query))
     } else {
         // has_executable_sql was true but we found no specific statements or
         // VISUALISE FROM — fall back to extract_sql as the query.
         let query = source_tree
             .extract_sql()
-            .map(|s| transform_cte_references(&s, materialized_ctes));
-        (vec![], query)
+            .map(|s| {
+                let s = transform_cte_references(&s, materialized_ctes);
+                transform_source_references(&s, reader)
+            })
+            .transpose()?;
+        Ok((vec![], query))
     }
 }
 
@@ -469,6 +553,130 @@ mod tests {
         }
     }
 
+    /// Minimal reader that records `materialize_table` calls, used to unit-test
+    /// source-reference staging without a database.
+    struct MockReader {
+        caches: bool,
+        staged: std::cell::RefCell<Vec<(String, String)>>,
+    }
+
+    impl MockReader {
+        fn new(caches: bool) -> Self {
+            Self {
+                caches,
+                staged: std::cell::RefCell::new(Vec::new()),
+            }
+        }
+    }
+
+    impl Reader for MockReader {
+        fn execute_sql(&self, _sql: &str) -> Result<crate::DataFrame> {
+            unreachable!("staging must not touch execute_sql in these tests")
+        }
+        fn register(&self, _name: &str, _df: crate::DataFrame, _replace: bool) -> Result<()> {
+            Ok(())
+        }
+        fn execute(&self, _query: &str) -> Result<crate::reader::Spec> {
+            unreachable!()
+        }
+        fn caches_sources(&self) -> bool {
+            self.caches
+        }
+        fn materialize_table(
+            &self,
+            name: &str,
+            _column_aliases: &[String],
+            body_sql: &str,
+        ) -> Result<()> {
+            self.staged
+                .borrow_mut()
+                .push((name.to_string(), body_sql.to_string()));
+            Ok(())
+        }
+    }
+
+    #[test]
+    fn test_transform_source_references_quoted_primary_ref() {
+        let reader = MockReader::new(true);
+        // A quoted primary base table joined against a cache-resident CTE temp.
+        let sql = "SELECT * FROM \"__ggsql_cte_t__\" JOIN \"my base\" ON 1 = 1";
+        let out = transform_source_references(sql, &reader).unwrap();
+
+        let staged = reader.staged.borrow();
+        assert_eq!(staged.len(), 1);
+        assert_eq!(staged[0].1, "SELECT * FROM \"my base\"");
+        assert!(out.contains("__ggsql_staged_0_"));
+        assert!(!out.contains("JOIN \"my base\""));
+    }
+
+    #[test]
+    fn test_transform_source_references_non_caching_reader_unchanged() {
+        let reader = MockReader::new(false);
+        let sql = "SELECT * FROM \"__ggsql_cte_t__\" JOIN base ON base.k = 1";
+        let out = transform_source_references(sql, &reader).unwrap();
+        assert_eq!(out, sql);
+        assert!(reader.staged.borrow().is_empty());
+    }
+
+    #[test]
+    fn test_transform_source_references_all_primary_unchanged() {
+        let reader = MockReader::new(true);
+        let out = transform_source_references("SELECT * FROM a JOIN base ON a.k = base.k", &reader)
+            .unwrap();
+        assert_eq!(out, "SELECT * FROM a JOIN base ON a.k = base.k");
+        assert!(reader.staged.borrow().is_empty());
+    }
+
+    #[test]
+    fn test_transform_source_references_stages_mixed_body() {
+        let reader = MockReader::new(true);
+        // A cache-resident CTE temp joined against a primary base table.
+        let sql = "SELECT * FROM \"__ggsql_cte_t__\" JOIN base ON base.k = 1";
+        let out = transform_source_references(sql, &reader).unwrap();
+
+        let staged = reader.staged.borrow();
+        assert_eq!(staged.len(), 1, "base should be staged exactly once");
+        assert!(staged[0].0.starts_with("__ggsql_staged_0_"));
+        assert_eq!(staged[0].1, "SELECT * FROM base");
+
+        assert!(out.contains("__ggsql_staged_0_"));
+        assert!(out.contains("\"__ggsql_cte_t__\"")); // cte ref preserved
+        assert!(!out.contains("JOIN base"));
+    }
+
+    #[test]
+    fn test_transform_source_references_reversed_from_join() {
+        let reader = MockReader::new(true);
+        // Primary base table in FROM, cache-resident CTE temp in JOIN.
+        let sql = "SELECT * FROM base JOIN \"__ggsql_cte_t__\" ON base.k = 1";
+        let out = transform_source_references(sql, &reader).unwrap();
+
+        assert_eq!(reader.staged.borrow().len(), 1);
+        assert!(out.contains("FROM \"__ggsql_staged_0_"));
+        assert!(!out.contains("FROM base"));
+    }
+
+    #[test]
+    fn test_transform_source_references_schema_qualified() {
+        let reader = MockReader::new(true);
+        let sql = "SELECT * FROM \"__ggsql_cte_t__\" JOIN myschema.base ON base.k = 1";
+        let out = transform_source_references(sql, &reader).unwrap();
+
+        let staged = reader.staged.borrow();
+        assert_eq!(staged.len(), 1);
+        assert_eq!(staged[0].1, "SELECT * FROM myschema.base");
+        assert!(out.contains("__ggsql_staged_0_"));
+        assert!(!out.contains("JOIN myschema.base"));
+    }
+
+    #[test]
+    fn test_transform_source_references_same_ref_staged_once() {
+        let reader = MockReader::new(true);
+        let sql = "SELECT * FROM base JOIN \"__ggsql_cte_t__\" ON base.k = base.j";
+        let _ = transform_source_references(sql, &reader).unwrap();
+        assert_eq!(reader.staged.borrow().len(), 1);
+    }
+
     #[test]
     fn test_split_with_query_basic() {
         let sql = "WITH cte AS (SELECT * FROM x) SELECT * FROM cte";
diff --git a/src/execute/mod.rs b/src/execute/mod.rs
index 2baa2032e..0e6379d01 100644
--- a/src/execute/mod.rs
+++ b/src/execute/mod.rs
@@ -1152,7 +1152,8 @@ pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result<Prep
     // The global result is stored as a temp table so filtered layers can query it efficiently.
     let mut has_global_table = false;
     if sql_part.is_some() {
-        let (side_effects, query) = cte::transform_global_sql(&source_tree, &materialized_ctes);
+        let (side_effects, query) =
+            cte::transform_global_sql(&source_tree, &materialized_ctes, reader)?;
 
         for stmt in &side_effects {
             reader.execute_sql(stmt)?;
diff --git a/src/naming.rs b/src/naming.rs
index dfcf6d5e4..95a11536d 100644
--- a/src/naming.rs
+++ b/src/naming.rs
@@ -183,6 +183,27 @@ pub fn layer_source_table(index: usize) -> String {
     )
 }
 
+/// Generate temp table name for a primary base table staged into the cache.
+///
+/// Format: `__ggsql_staged_<index>_<uuid>__`
+///
+/// # Example
+/// ```
+/// use ggsql::naming;
+/// let table = naming::staged_source_table(1);
+/// assert!(table.starts_with("__ggsql_staged_1_"));
+/// assert!(table.ends_with("__"));
+/// ```
+pub fn staged_source_table(index: usize) -> String {
+    format!(
+        "{}staged_{}_{}{}",
+        GGSQL_PREFIX,
+        index,
+        session_id(),
+        GGSQL_SUFFIX
+    )
+}
+
 /// Generate table name for a builtin dataset.
 ///
 /// Used when rewriting `ggsql:penguins` to the internal table name.
@@ -328,6 +349,20 @@ pub fn quote_literal(s: &str) -> String {
 // Detection Functions
 // ============================================================================
 
+/// Check if a name refers to an internal ggsql table (CTE temp, staged source,
+/// global, builtin dataset, cache result, …).
+///
+/// # Example
+/// ```
+/// use ggsql::naming;
+/// assert!(naming::is_internal_table("__ggsql_cte_sales_abc__"));
+/// assert!(naming::is_internal_table("__ggsql_staged_0_abc__"));
+/// assert!(!naming::is_internal_table("sales"));
+/// ```
+pub fn is_internal_table(name: &str) -> bool {
+    name.starts_with(GGSQL_PREFIX)
+}
+
 /// Check if a column name is a synthetic constant column.
 ///
 /// # Example
diff --git a/src/reader/cache.rs b/src/reader/cache.rs
index 3e4e962bc..6ed5dc6c2 100644
--- a/src/reader/cache.rs
+++ b/src/reader/cache.rs
@@ -718,6 +718,84 @@ mod behavior_tests {
         );
     }
 
+    #[test]
+    fn test_cte_joined_against_primary_base_table() {
+        // A materialized CTE (cache-resident) joined against a primary-only base
+        // table: the global query is mixed, so the base table must be staged
+        // into the cache for the join to resolve there.
+        let base = DuckDBReader::new_in_memory().unwrap();
+        base.register(
+            "base",
+            df! { "k" => vec![1_i64, 2], "w" => vec![10_i64, 20] }.unwrap(),
+            true,
+        )
+        .unwrap();
+        let primary = Box::new(ReadOnlyReader::new(Box::new(base)));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache, "test://primary");
+
+        let spec = reader.execute(
+            "WITH t AS (SELECT 1 AS k, 100 AS v) \
+             SELECT t.v, base.w FROM t JOIN base ON t.k = base.k \
+             VISUALISE v AS x, w AS y DRAW point",
+        );
+        assert!(
+            spec.is_ok(),
+            "CTE joined against a primary base table should succeed: {:?}",
+            spec.err()
+        );
+    }
+
+    #[test]
+    fn test_schema_qualified_base_table_join() {
+        // Same as above, but the primary base table is schema-qualified.
+        let base = DuckDBReader::new_in_memory().unwrap();
+        base.execute_sql("CREATE SCHEMA myschema").unwrap();
+        base.execute_sql("CREATE TABLE myschema.base AS SELECT 1 AS k, 10 AS w")
+            .unwrap();
+        let primary = Box::new(ReadOnlyReader::new(Box::new(base)));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache, "test://primary");
+
+        let spec = reader.execute(
+            "WITH t AS (SELECT 1 AS k, 100 AS v) \
+             SELECT t.v, myschema.base.w FROM t JOIN myschema.base ON t.k = myschema.base.k \
+             VISUALISE v AS x, w AS y DRAW point",
+        );
+        assert!(
+            spec.is_ok(),
+            "CTE joined against a schema-qualified base table should succeed: {:?}",
+            spec.err()
+        );
+    }
+
+    #[test]
+    fn test_mixed_later_cte_body_joins_primary() {
+        // A later CTE whose body references an earlier (cache-resident) CTE and a
+        // primary-only base table: the CTE body is mixed and must stage the base.
+        let base = DuckDBReader::new_in_memory().unwrap();
+        base.register(
+            "base",
+            df! { "k" => vec![1_i64], "w" => vec![10_i64] }.unwrap(),
+            true,
+        )
+        .unwrap();
+        let primary = Box::new(ReadOnlyReader::new(Box::new(base)));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache, "test://primary");
+
+        let spec = reader.execute(
+            "WITH a AS (SELECT 1 AS k, 5 AS p), \
+                  b AS (SELECT a.p, base.w FROM a JOIN base ON a.k = base.k) \
+             SELECT * FROM b VISUALISE p AS x, w AS y DRAW point",
+        );
+        assert!(
+            spec.is_ok(),
+            "mixed later-CTE body joining a primary table should succeed: {:?}",
+            spec.err()
+        );
+    }
+
     #[test]
     fn test_meta_table_records_and_serves_memo() {
         // A memoized read is recorded in the metadata table and served back from

From 536a392028984af35856f5bd2dfd53816c3aad06 Mon Sep 17 00:00:00 2001
From: George Stagg <george.stagg@posit.co>
Date: Wed, 1 Jul 2026 15:19:01 +0100
Subject: [PATCH 14/15] Update ggsql grammar for JOIN and qualified window
 function

---
 tree-sitter-ggsql/grammar.js            |  92 ++++++++--
 tree-sitter-ggsql/test/corpus/basic.txt | 221 ++++++++++++++++++++++++
 2 files changed, 300 insertions(+), 13 deletions(-)

diff --git a/tree-sitter-ggsql/grammar.js b/tree-sitter-ggsql/grammar.js
index c19938fcc..521f9de40 100644
--- a/tree-sitter-ggsql/grammar.js
+++ b/tree-sitter-ggsql/grammar.js
@@ -310,16 +310,7 @@ module.exports = grammar({
 
     non_from_sql_keyword: $ => choice(
       caseInsensitive('WHERE'),
-      caseInsensitive('JOIN'),
-      caseInsensitive('LEFT'),
-      caseInsensitive('RIGHT'),
-      caseInsensitive('INNER'),
-      caseInsensitive('OUTER'),
       caseInsensitive('LATERAL'),
-      caseInsensitive('CROSS'),
-      caseInsensitive('NATURAL'),
-      caseInsensitive('FULL'),
-      caseInsensitive('ON'),
       caseInsensitive('AND'),
       caseInsensitive('OR'),
       caseInsensitive('NOT'),
@@ -518,8 +509,8 @@ module.exports = grammar({
     window_partition_clause: $ => seq(
       caseInsensitive('PARTITION'),
       caseInsensitive('BY'),
-      $.identifier,
-      repeat(seq(',', $.identifier))
+      $._dotted_column,
+      repeat(seq(',', $._dotted_column))
     ),
 
     window_order_clause: $ => seq(
@@ -529,12 +520,19 @@ module.exports = grammar({
       repeat(seq(',', $.order_item))
     ),
 
+    // ORDER BY
     order_item: $ => seq(
-      $.identifier,
+      $._dotted_column,
       optional(choice(caseInsensitive('ASC'), caseInsensitive('DESC'))),
       optional(seq(caseInsensitive('NULLS'), choice(caseInsensitive('FIRST'), caseInsensitive('LAST'))))
     ),
 
+    // A (possibly dotted) column reference used as a window sort/partition key.
+    _dotted_column: $ => prec.right(seq(
+      $.identifier,
+      repeat(seq('.', $.identifier))
+    )),
+
     frame_clause: $ => seq(
       choice(caseInsensitive('ROWS'), caseInsensitive('RANGE')),
       choice(
@@ -576,9 +574,77 @@ module.exports = grammar({
     from_clause: $ => prec.right(1, seq(
       token(prec(1, caseInsensitive('FROM'))),
       $.table_ref,
-      repeat(seq(',', $.table_ref))
+      repeat(choice(
+        seq(',', $.table_ref),
+        $.join_clause
+      ))
+    )),
+
+    // ANSI join: an operator, the target table, and an ON or USING condition.
+    join_clause: $ => prec.right(seq(
+      $.join_operator,
+      $.table_ref,
+      optional($.join_condition)
     )),
 
+    // ANSI join operators.
+    join_operator: $ => choice(
+      seq(caseInsensitive('CROSS'), caseInsensitive('JOIN')),
+      seq(
+        optional(caseInsensitive('NATURAL')),
+        optional(choice(
+          caseInsensitive('INNER'),
+          seq(
+            choice(
+              caseInsensitive('LEFT'),
+              caseInsensitive('RIGHT'),
+              caseInsensitive('FULL')
+            ),
+            optional(caseInsensitive('OUTER'))
+          )
+        )),
+        caseInsensitive('JOIN')
+      )
+    ),
+
+    join_condition: $ => choice(
+      seq(caseInsensitive('ON'), $.join_condition_expression),
+      seq(
+        caseInsensitive('USING'),
+        '(',
+        $.identifier,
+        repeat(seq(',', $.identifier)),
+        ')'
+      )
+    ),
+
+    // The ON search condition.
+    join_condition_expression: $ => prec.right(repeat1(choice(
+      $.case_expression,
+      $.cast_expression,
+      $.function_call,
+      $.jinja_template,
+      $.join_condition_keyword,
+      $.string,
+      $.number,
+      '.', '*', '=', '<', '>', '!', '+', '-', '/', '%', '|', '&', '^', '~', '::',
+      $.subquery,
+      $.identifier
+    ))),
+
+    // Expression-level keywords allowed inside an ON condition.
+    join_condition_keyword: $ => choice(
+      caseInsensitive('AND'),
+      caseInsensitive('OR'),
+      caseInsensitive('NOT'),
+      caseInsensitive('IN'),
+      caseInsensitive('IS'),
+      caseInsensitive('NULL'),
+      caseInsensitive('EXISTS'),
+      caseInsensitive('BETWEEN'),
+      caseInsensitive('LIKE')
+    ),
+
     // VISUALISE/VISUALIZE [global_mapping] [FROM source] with clauses
     // Global mapping sets default aesthetics for all layers
     // FROM source can be an identifier (table/CTE) or string (file path)
diff --git a/tree-sitter-ggsql/test/corpus/basic.txt b/tree-sitter-ggsql/test/corpus/basic.txt
index 43834daf7..ee1fdf991 100644
--- a/tree-sitter-ggsql/test/corpus/basic.txt
+++ b/tree-sitter-ggsql/test/corpus/basic.txt
@@ -941,6 +941,158 @@ DRAW bar
       (draw_clause
         (geom_type)))))
 
+================================================================================
+SELECT with JOIN
+================================================================================
+
+SELECT * FROM x JOIN y ON x.id = y.id
+VISUALISE a AS x
+DRAW bar
+
+--------------------------------------------------------------------------------
+
+(query
+  (sql_portion
+    (sql_statement
+      (select_statement
+        (select_body
+          (from_clause
+            (table_ref
+              table: (qualified_name
+                (identifier
+                  (bare_identifier))))
+            (join_clause
+              (join_operator)
+              (table_ref
+                table: (qualified_name
+                  (identifier
+                    (bare_identifier))))
+              (join_condition
+                (join_condition_expression
+                  (identifier
+                    (bare_identifier))
+                  (identifier
+                    (bare_identifier))
+                  (identifier
+                    (bare_identifier))
+                  (identifier
+                    (bare_identifier))))))))))
+  (visualise_statement
+    (visualise_keyword)
+    (global_mapping
+      (mapping_list
+        (mapping_element
+          (explicit_mapping
+            value: (mapping_value
+              (column_reference
+                (identifier
+                  (bare_identifier))))
+            name: (aesthetic_name)))))
+    (viz_clause
+      (draw_clause
+        (geom_type)))))
+
+================================================================================
+SELECT with LEFT OUTER JOIN, aliases and ON condition
+================================================================================
+
+SELECT * FROM orders o LEFT OUTER JOIN customers c ON o.cid = c.id
+VISUALISE a AS x
+DRAW bar
+
+--------------------------------------------------------------------------------
+
+(query
+  (sql_portion
+    (sql_statement
+      (select_statement
+        (select_body
+          (from_clause
+            (table_ref
+              table: (qualified_name
+                (identifier
+                  (bare_identifier)))
+              alias: (identifier
+                (bare_identifier)))
+            (join_clause
+              (join_operator)
+              (table_ref
+                table: (qualified_name
+                  (identifier
+                    (bare_identifier)))
+                alias: (identifier
+                  (bare_identifier)))
+              (join_condition
+                (join_condition_expression
+                  (identifier
+                    (bare_identifier))
+                  (identifier
+                    (bare_identifier))
+                  (identifier
+                    (bare_identifier))
+                  (identifier
+                    (bare_identifier))))))))))
+  (visualise_statement
+    (visualise_keyword)
+    (global_mapping
+      (mapping_list
+        (mapping_element
+          (explicit_mapping
+            value: (mapping_value
+              (column_reference
+                (identifier
+                  (bare_identifier))))
+            name: (aesthetic_name)))))
+    (viz_clause
+      (draw_clause
+        (geom_type)))))
+
+================================================================================
+SELECT with INNER JOIN USING column list
+================================================================================
+
+SELECT * FROM a INNER JOIN b USING (k, id)
+VISUALISE a AS x
+DRAW bar
+
+--------------------------------------------------------------------------------
+
+(query
+  (sql_portion
+    (sql_statement
+      (select_statement
+        (select_body
+          (from_clause
+            (table_ref
+              table: (qualified_name
+                (identifier
+                  (bare_identifier))))
+            (join_clause
+              (join_operator)
+              (table_ref
+                table: (qualified_name
+                  (identifier
+                    (bare_identifier))))
+              (join_condition
+                (identifier
+                  (bare_identifier))
+                (identifier
+                  (bare_identifier)))))))))
+  (visualise_statement
+    (visualise_keyword)
+    (global_mapping
+      (mapping_list
+        (mapping_element
+          (explicit_mapping
+            value: (mapping_value
+              (column_reference
+                (identifier
+                  (bare_identifier))))
+            name: (aesthetic_name)))))
+    (viz_clause
+      (draw_clause
+        (geom_type)))))
+
 ================================================================================
 Literal value in MAPPING
 ================================================================================
@@ -1339,6 +1491,75 @@ DRAW point MAPPING x AS x, rnk AS y
                 name: (aesthetic_name)))))))))
 
 ================================================================================
+Window function with qualified PARTITION BY and ORDER BY
+================================================================================
+
+SELECT ROW_NUMBER() OVER (PARTITION BY a.k ORDER BY b.w DESC) AS rn FROM a JOIN b ON a.k = b.k
+VISUALISE
+DRAW point MAPPING rn AS x
+
+--------------------------------------------------------------------------------
+
+(query
+  (sql_portion
+    (sql_statement
+      (select_statement
+        (select_body
+          (window_function
+            function: (identifier
+              (bare_identifier))
+            (window_specification
+              (window_partition_clause
+                (identifier
+                  (bare_identifier))
+                (identifier
+                  (bare_identifier)))
+              (window_order_clause
+                (order_item
+                  (identifier
+                    (bare_identifier))
+                  (identifier
+                    (bare_identifier))))))
+          (identifier
+            (bare_identifier))
+          (identifier
+            (bare_identifier))
+          (from_clause
+            (table_ref
+              table: (qualified_name
+                (identifier
+                  (bare_identifier))))
+            (join_clause
+              (join_operator)
+              (table_ref
+                table: (qualified_name
+                  (identifier
+                    (bare_identifier))))
+              (join_condition
+                (join_condition_expression
+                  (identifier
+                    (bare_identifier))
+                  (identifier
+                    (bare_identifier))
+                  (identifier
+                    (bare_identifier))
+                  (identifier
+                    (bare_identifier))))))))))
+  (visualise_statement
+    (visualise_keyword)
+    (viz_clause
+      (draw_clause
+        (geom_type)
+        (mapping_clause
+          (mapping_list
+            (mapping_element
+              (explicit_mapping
+                value: (mapping_value
+                  (column_reference
+                    (identifier
+                      (bare_identifier))))
+                name: (aesthetic_name)))))))))
+================================================================================
 Window function with frame clause
 ================================================================================
 

From a42350150b377eabc1c0f2a9d72275bb32839654 Mon Sep 17 00:00:00 2001
From: George Stagg <george.stagg@posit.co>
Date: Wed, 1 Jul 2026 15:55:42 +0100
Subject: [PATCH 15/15] Improve SQL table-reference parsing

---
 src/CLAUDE.md        |   4 +-
 src/execute/cte.rs   | 490 +++++++++++++++++++++++++++++++++----------
 src/execute/mod.rs   |  16 +-
 src/parser/mod.rs    |   5 +
 src/parser/sql.rs    | 222 ++++++++++++++++++++
 src/reader/cache.rs  | 152 +++++++++++++-
 src/reader/data.rs   | 248 +---------------------
 src/reader/duckdb.rs |   4 +-
 src/reader/sqlite.rs |   4 +-
 9 files changed, 764 insertions(+), 381 deletions(-)
 create mode 100644 src/parser/sql.rs

diff --git a/src/CLAUDE.md b/src/CLAUDE.md
index c674b9808..c32ea6975 100644
--- a/src/CLAUDE.md
+++ b/src/CLAUDE.md
@@ -34,6 +34,7 @@ src/
 - `mod.rs` exposes `parse_query()` which builds a `Vec<Plot>` from a query string.
 - `source_tree.rs` is the parse-once wrapper: holds the tree-sitter `Tree`, source text, and language; offers a declarative query API (`find_node`, `find_text`, …) plus lazy `extract_sql()` / `extract_visualise()` extractors. It also handles the `VISUALISE FROM <source>` shorthand by injecting `SELECT * FROM <source>`.
 - `builder.rs` walks the CST and produces typed `Plot` values. This is where new grammar nodes become `Plot` fields.
+- `sql.rs` extracts structure from SQL fragments over the parse tree.
 
 Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding syntax, edit `grammar.js`, regenerate, then teach `builder.rs` about the new nodes.
 
@@ -48,7 +49,8 @@ Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding
 | `odbc.rs` | ODBC | `odbc` (default) |
 | `cache.rs` | `CachingReader` — wraps any primary `Reader` with an in-memory cache | `duckdb` or `sqlite` |
 | `connection.rs` | Connection-string parsing for all of the above | — |
-| `data.rs`, `spec.rs` | `Spec` type returned by `execute()`, plus DataFrame conversion | — |
+| `spec.rs` | `Spec` type returned by `execute()`, plus DataFrame conversion | — |
+| `data.rs` | Bundled sample datasets — the `ggsql:` builtins | `builtin-data` |
 
 `SqlDialect` trait in `mod.rs` lets each driver supply its own type names, information-schema queries, and spatial helper methods (`sql_st_transform`, `sql_geometry_to_wkb`, `sql_geometry_bbox`, `sql_ensure_geometry`, `sql_select_replace`, `sql_spatial_setup`).
 
diff --git a/src/execute/cte.rs b/src/execute/cte.rs
index 5b4522282..bcfc1cf7a 100644
--- a/src/execute/cte.rs
+++ b/src/execute/cte.rs
@@ -79,57 +79,120 @@ pub(crate) fn get_node_text<'a>(node: &Node, source: &'a str) -> &'a str {
     &source[node.start_byte()..node.end_byte()]
 }
 
-/// Transform CTE references in SQL to use temp table names
+/// Transform CTE references in SQL to use temp table names.
 ///
-/// Replaces references to CTEs (e.g., `FROM sales`, `JOIN sales`) with
-/// the corresponding temp table names (e.g., `FROM __ggsql_cte_sales__`).
+/// Replaces references to CTEs (e.g. `FROM sales`, `JOIN sales`, `sales.col`)
+/// with the corresponding temp table names (e.g. `__ggsql_cte_sales__`).
 ///
-/// This handles table references after FROM and JOIN keywords, being careful
-/// to only replace whole word matches (not substrings).
+/// Table references are found via the parser; column references are rewritten
+/// tolerant of whitespace around the dot and never inside string literals.
 pub fn transform_cte_references(sql: &str, cte_names: &HashSet<String>) -> String {
     if cte_names.is_empty() {
         return sql.to_string();
     }
 
-    let mut result = sql.to_string();
+    // On a parse failure leave the SQL unchanged.
+    let Ok(sites) = crate::parser::extract_table_ref_sites(sql) else {
+        return sql.to_string();
+    };
+    let string_ranges = crate::parser::string_literal_ranges(sql);
+    let in_string = |pos: usize| string_ranges.iter().any(|&(s, e)| pos >= s && pos < e);
+
+    // Match CTE names against the unquoted reference text, mirroring the
+    // definition names.
+    let temp_of = |raw: &str| -> Option<String> {
+        let name = naming::unquote_ident(raw);
+        cte_names
+            .iter()
+            .find(|c| naming::unquote_ident(c).eq_ignore_ascii_case(&name))
+            .map(|c| naming::quote_ident(&naming::cte_table(c)))
+    };
 
-    for cte_name in cte_names {
-        let temp_table_name = naming::quote_ident(&naming::cte_table(cte_name));
+    let mut replacements: Vec<(usize, usize, String)> = Vec::new();
 
-        // Replace table references: FROM cte_name, JOIN cte_name, cte_name.column
-        // Use word boundary matching to avoid replacing substrings
-        // Pattern: (FROM|JOIN)\s+<cte_name>(\s|,|)|$)
-        let patterns = [
-            // FROM cte_name (case insensitive)
-            (
-                format!(r"(?i)(\bFROM\s+){}(\s|,|\)|$)", regex::escape(cte_name)),
-                format!("${{1}}{}${{2}}", temp_table_name),
-            ),
-            // JOIN cte_name (case insensitive) - handles LEFT JOIN, RIGHT JOIN, etc.
-            (
-                format!(r"(?i)(\bJOIN\s+){}(\s|,|\)|$)", regex::escape(cte_name)),
-                format!("${{1}}{}${{2}}", temp_table_name),
-            ),
-            // Qualified column references: cte_name.column (case insensitive)
-            (
-                format!(
-                    r"(?i)\b{}(\.[a-zA-Z_][a-zA-Z0-9_]*)",
-                    regex::escape(cte_name)
-                ),
-                format!("{}${{1}}", temp_table_name),
-            ),
-        ];
+    // Rewrite each table_ref that names a CTE to its temp table.
+    for site in &sites {
+        if let Some(temp) = temp_of(&site.raw) {
+            replacements.push((site.start, site.end, temp));
+        }
+    }
 
-        for (pattern, replacement) in patterns {
-            if let Ok(re) = regex::Regex::new(&pattern) {
-                result = re.replace_all(&result, replacement.as_str()).to_string();
+    // Rewrite qualified column references `cte.col` -> `temp.col`.
+    let site_starts: HashSet<usize> = sites.iter().map(|s| s.start).collect();
+    for cte in cte_names {
+        let temp = naming::quote_ident(&naming::cte_table(cte));
+        let bare = naming::unquote_ident(cte);
+        let pattern = format!(r"((?i:{}))\s*\.", regex::escape(&bare));
+        let Ok(re) = regex::Regex::new(&pattern) else {
+            continue;
+        };
+        for caps in re.captures_iter(sql) {
+            let name = caps.get(1).unwrap();
+            let start = name.start();
+            if site_starts.contains(&start) || in_string(start) {
+                continue;
+            }
+            // Require a boundary before the name.
+            let ok_prefix = sql[..start]
+                .chars()
+                .next_back()
+                .is_none_or(|c| !(c.is_alphanumeric() || c == '_' || c == '.' || c == '"'));
+            if ok_prefix {
+                replacements.push((name.start(), name.end(), temp.clone()));
             }
         }
     }
 
+    // Apply in reverse byte order so earlier offsets stay valid.
+    let mut result = sql.to_string();
+    replacements.sort_by_key(|(start, _, _)| std::cmp::Reverse(*start));
+    // Distinct offsets only.
+    replacements.dedup_by_key(|(start, _, _)| *start);
+
+    for (start, end, replacement) in replacements {
+        result.replace_range(start..end, &replacement);
+    }
+
     result
 }
 
+/// Byte offsets of the `.` separators in a (possibly quoted) dotted identifier.
+fn unquoted_dot_positions(raw: &str) -> Vec<usize> {
+    let mut positions = Vec::new();
+    let mut in_quote = false;
+    for (i, b) in raw.bytes().enumerate() {
+        match b {
+            b'"' => in_quote = !in_quote,
+            b'.' if !in_quote => positions.push(i),
+            _ => {}
+        }
+    }
+    positions
+}
+
+/// Split a (possibly quoted) dotted identifier into its components, trimming
+/// surrounding whitespace: `schema . base` → `["schema","base"]`.
+fn identifier_components(raw: &str) -> Vec<&str> {
+    let mut parts = Vec::new();
+    let mut start = 0;
+    for pos in unquoted_dot_positions(raw) {
+        parts.push(raw[start..pos].trim());
+        start = pos + 1;
+    }
+    parts.push(raw[start..].trim());
+    parts
+}
+
+/// The final component of a (possibly quoted) dotted identifier: `base` for
+/// `schema.base`, `"base"` for `"schema"."base"`, and the whole (trimmed) string
+/// for a single (possibly quoted) name.
+fn last_identifier_component(raw: &str) -> &str {
+    match unquoted_dot_positions(raw).last() {
+        Some(&pos) => raw[pos + 1..].trim(),
+        None => raw.trim(),
+    }
+}
+
 /// Stage the primary base tables in a body destined for the cache.
 ///
 /// A body that is entirely primary (no cache-resident reference) or entirely
@@ -139,68 +202,130 @@ pub fn transform_source_references(sql: &str, reader: &dyn Reader) -> Result<Str
         return Ok(sql.to_string());
     }
 
-    // Discover the names referenced after FROM/JOIN — including quoted (`"foo"`)
-    // and schema-qualified (`schema.table`) names. Subqueries contribute no name.
-    let re = match regex::Regex::new(
-        r#"(?i)\b(?:FROM|JOIN)\s+("(?:[^"]|"")*"|[A-Za-z_][A-Za-z0-9_$]*(?:\.[A-Za-z_][A-Za-z0-9_$]*)*)"#,
-    ) {
-        Ok(re) => re,
+    // Discover table references via the parser.
+    let sites = match crate::parser::extract_table_ref_sites(sql) {
+        Ok(sites) => sites,
         Err(_) => return Ok(sql.to_string()),
     };
 
-    let mut seen = HashSet::new();
-    let mut refs: Vec<String> = Vec::new();
-    for caps in re.captures_iter(sql) {
-        if let Some(m) = caps.get(1) {
-            let name = m.as_str().to_string();
-            if seen.insert(name.clone()) {
-                refs.push(name);
-            }
-        }
-    }
-
-    // A name resolves against the cache backend (rather than the primary) if it
-    // is an internal `__ggsql_*` table or a `ggsql:` builtin dataset.
-    let is_cache_resolvable = |name: &str| {
-        naming::is_internal_table(&naming::unquote_ident(name)) || name.starts_with("ggsql:")
+    // A ref resolves against the cache backend (rather than the primary) when it
+    // is an internal `__ggsql_*` table, a `ggsql:` builtin, or a file-path string.
+    let is_cache_resolvable = |raw: &str| {
+        raw.starts_with('\'')
+            || raw.starts_with("ggsql:")
+            || naming::is_internal_table(&naming::unquote_ident(raw))
     };
 
-    let has_cache_ref = refs.iter().any(|r| is_cache_resolvable(r));
-    let primary_refs: Vec<&String> = refs.iter().filter(|r| !is_cache_resolvable(r)).collect();
+    let has_cache_ref = sites.iter().any(|s| is_cache_resolvable(&s.raw));
+    let primary_sites: Vec<&crate::parser::TableRefSite> = sites
+        .iter()
+        .filter(|s| !is_cache_resolvable(&s.raw))
+        .collect();
 
     // Only mixed bodies need staging.
-    if !has_cache_ref || primary_refs.is_empty() {
+    if !has_cache_ref || primary_sites.is_empty() {
         return Ok(sql.to_string());
     }
 
-    let mut result = sql.to_string();
-    for (index, raw) in primary_refs.into_iter().enumerate() {
-        let staged = naming::staged_source_table(index);
-        reader.materialize_table(&staged, &[], &format!("SELECT * FROM {}", raw))?;
-        let target = naming::quote_ident(&staged);
+    // The final identifier component of a ref. This is how an unaliased ref
+    // is spelled at column sites, so it doubles as the alias to attach.
+    let last_of = |raw: &str| -> String { last_identifier_component(raw).to_string() };
 
-        // Rewrite FROM/JOIN/qualified references to the staged copy.
-        let patterns = [
-            (
-                format!(r"(?i)(\bFROM\s+){}(\s|,|\)|$)", regex::escape(raw)),
-                format!("${{1}}{}${{2}}", target),
-            ),
-            (
-                format!(r"(?i)(\bJOIN\s+){}(\s|,|\)|$)", regex::escape(raw)),
-                format!("${{1}}{}${{2}}", target),
-            ),
-            (
-                format!(r"(?i)\b{}(\.[a-zA-Z_][a-zA-Z0-9_]*)", regex::escape(raw)),
-                format!("{}${{1}}", target),
-            ),
-        ];
-        for (pattern, replacement) in patterns {
-            if let Ok(re) = regex::Regex::new(&pattern) {
-                result = re.replace_all(&result, replacement.as_str()).to_string();
+    // Stage each distinct primary table into the cache once.
+    let mut staged_for: std::collections::HashMap<String, String> =
+        std::collections::HashMap::new();
+    for site in &primary_sites {
+        if !staged_for.contains_key(&site.raw) {
+            let t = naming::staged_source_table(staged_for.len());
+            reader.materialize_table(&t, &[], &format!("SELECT * FROM {}", site.raw))?;
+            staged_for.insert(site.raw.clone(), t);
+        }
+    }
+
+    // An unaliased ref is aliased back to its last component so `base.col` keeps
+    // resolving. Two unaliased refs sharing a last component (`a.base`+`b.base`)
+    // would collide, so those use their unique staged name instead.
+    let mut last_counts: std::collections::HashMap<String, usize> =
+        std::collections::HashMap::new();
+    let mut seen_unaliased: HashSet<&str> = HashSet::new();
+    for site in &primary_sites {
+        if !site.has_alias && seen_unaliased.insert(site.raw.as_str()) {
+            *last_counts.entry(last_of(&site.raw)).or_default() += 1;
+        }
+    }
+    let last_collides = |raw: &str| last_counts.get(&last_of(raw)).copied().unwrap_or(0) > 1;
+
+    let string_ranges = crate::parser::string_literal_ranges(sql);
+    let in_string = |pos: usize| string_ranges.iter().any(|&(s, e)| pos >= s && pos < e);
+    let site_starts: HashSet<usize> = primary_sites.iter().map(|s| s.start).collect();
+
+    let mut replacements: Vec<(usize, usize, String)> = Vec::new();
+
+    // Rewrite each table_ref occurrence to the staged table.
+    for site in &primary_sites {
+        let quoted = naming::quote_ident(&staged_for[&site.raw]);
+        let replacement = if site.has_alias || last_collides(&site.raw) {
+            quoted
+        } else {
+            format!("{} AS {}", quoted, last_of(&site.raw))
+        };
+        replacements.push((site.start, site.end, replacement));
+    }
+
+    // Rewrite full column qualifiers (`schema.base.col`) to the alias, or the
+    // staged table when it collides — skipping string literals and table_ref sites.
+    for raw in staged_for.keys() {
+        // Only multi-part refs can appear as a full qualifier at column sites.
+        if unquoted_dot_positions(raw).is_empty() {
+            continue;
+        }
+        let target = if last_collides(raw) {
+            naming::quote_ident(&staged_for[raw])
+        } else {
+            last_of(raw)
+        };
+        // Match the qualifier tolerant of whitespace around its dots.
+        let pattern = format!(
+            r"({})\s*\.",
+            identifier_components(raw)
+                .iter()
+                .map(|c| {
+                    if c.starts_with('"') {
+                        regex::escape(c)
+                    } else {
+                        format!("(?i:{})", regex::escape(c))
+                    }
+                })
+                .collect::<Vec<_>>()
+                .join(r"\s*\.\s*")
+        );
+        let Ok(re) = regex::Regex::new(&pattern) else {
+            continue;
+        };
+        for caps in re.captures_iter(sql) {
+            let qualifier = caps.get(1).unwrap();
+            let start = qualifier.start();
+            if site_starts.contains(&start) || in_string(start) {
+                continue;
+            }
+            // Require a boundary before the qualifier.
+            let ok_prefix = sql[..start]
+                .chars()
+                .next_back()
+                .is_none_or(|c| !(c.is_alphanumeric() || c == '_' || c == '.' || c == '"'));
+            if ok_prefix {
+                replacements.push((start, qualifier.end(), target.clone()));
             }
         }
     }
 
+    // Apply in reverse byte order so earlier offsets stay valid.
+    let mut result = sql.to_string();
+    replacements.sort_by_key(|(start, _, _)| std::cmp::Reverse(*start));
+    for (start, end, replacement) in replacements {
+        result.replace_range(start..end, &replacement);
+    }
+
     Ok(result)
 }
 
@@ -281,24 +406,13 @@ pub fn split_with_query(source_tree: &SourceTree) -> Option<(String, String)> {
     Some((cte_prefix, trailing))
 }
 
-/// Transform global SQL for execution with temp tables.
+/// Collect side-effect statements (CREATE, INSERT, UPDATE, DELETE) that
+/// need to run before the main query.
 ///
-/// Returns statements to execute directly as side effects (CREATE, INSERT, …)
-/// and an optional query whose result should be wrapped as the global temp
-/// table.
-pub fn transform_global_sql(
-    source_tree: &SourceTree,
-    materialized_ctes: &HashSet<String>,
-    reader: &dyn Reader,
-) -> Result<(Vec<String>, Option<String>)> {
-    // Collect side-effect statements (CREATE, INSERT, UPDATE, DELETE) that
-    // need to run before the main query. These appear alongside a trailing
-    // SELECT or VISUALISE FROM.
-    //
-    // Only structured DML is handled here — other_sql_statement nodes
-    // (INSTALL, LOAD, SET, …) are pre-executed in prepare_data_with_reader.
+/// Only structured DML is handled here — other_sql_statement nodes
+/// (INSTALL, LOAD, SET, …) are pre-executed in prepare_data_with_reader.
+pub fn extract_side_effects(source_tree: &SourceTree) -> Vec<String> {
     let root = source_tree.root();
-
     let side_effect_stmts = r#"
         (sql_statement
           [(create_statement)
@@ -306,12 +420,24 @@ pub fn transform_global_sql(
            (update_statement)
            (delete_statement)] @stmt)
     "#;
-    let side_effects: Vec<String> = source_tree
+    source_tree
         .find_texts(&root, side_effect_stmts)
         .into_iter()
-        .map(|s| transform_cte_references(s.trim(), materialized_ctes))
+        .map(|s| s.trim().to_string())
         .filter(|s| !s.is_empty())
-        .collect();
+        .collect()
+}
+
+/// Transform global SQL for execution as the global temp table.
+///
+/// Returns the query whose result should be wrapped as the global temp table,
+/// with CTE references rewritten and primary base tables staged.
+pub fn transform_global_sql(
+    source_tree: &SourceTree,
+    materialized_ctes: &HashSet<String>,
+    reader: &dyn Reader,
+) -> Result<Option<String>> {
+    let root = source_tree.root();
 
     // Try to extract trailing SELECT (WITH...SELECT or direct SELECT)
     let select_sql = split_with_query(source_tree)
@@ -323,20 +449,17 @@ pub fn transform_global_sql(
 
     if let Some(select_sql) = select_sql {
         let select_sql = transform_cte_references(&select_sql, materialized_ctes);
-        return Ok((
-            side_effects,
-            Some(transform_source_references(&select_sql, reader)?),
-        ));
+        return Ok(Some(transform_source_references(&select_sql, reader)?));
     }
 
     if !has_executable_sql(source_tree) {
-        return Ok((vec![], None));
+        return Ok(None);
     }
 
-    // We have non-SELECT executable SQL and/or VISUALISE FROM.
-    // Side-effects run directly, VISUALISE FROM becomes the queryable part.
-    // A bare WITH clause without a trailing statement is not executable on
-    // its own (its CTEs are already materialized separately).
+    // We have non-SELECT executable SQL and/or VISUALISE FROM. VISUALISE FROM
+    // becomes the queryable part; a bare WITH clause without a trailing
+    // statement is not executable on its own (its CTEs are materialized
+    // separately).
     let viz_from_query = source_tree
         .find_text(
             &root,
@@ -349,19 +472,18 @@ pub fn transform_global_sql(
         })
         .transpose()?;
 
-    if !side_effects.is_empty() || viz_from_query.is_some() {
-        Ok((side_effects, viz_from_query))
+    if viz_from_query.is_some() || !extract_side_effects(source_tree).is_empty() {
+        Ok(viz_from_query)
     } else {
         // has_executable_sql was true but we found no specific statements or
         // VISUALISE FROM — fall back to extract_sql as the query.
-        let query = source_tree
+        source_tree
             .extract_sql()
             .map(|s| {
                 let s = transform_cte_references(&s, materialized_ctes);
                 transform_source_references(&s, reader)
             })
-            .transpose()?;
-        Ok((vec![], query))
+            .transpose()
     }
 }
 
@@ -553,6 +675,46 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_transform_cte_references_comma_join_second_position() {
+        // A CTE in a non-first comma position must still be rewritten.
+        let ctes: HashSet<String> = ["cte"].iter().map(|s| s.to_string()).collect();
+        let out = transform_cte_references("SELECT * FROM base, cte WHERE base.k = cte.k", &ctes);
+        assert!(
+            !out.contains("FROM base, cte "),
+            "cte table ref not rewritten: {out}"
+        );
+        assert!(out.contains("__ggsql_cte_cte_"));
+        // Both the table ref and the qualified column ref are rewritten.
+        assert_eq!(out.matches("__ggsql_cte_cte_").count(), 2);
+    }
+
+    #[test]
+    fn test_transform_cte_references_preserves_string_literals() {
+        // A CTE name inside a string literal must not be rewritten.
+        let ctes: HashSet<String> = ["cte"].iter().map(|s| s.to_string()).collect();
+        let out = transform_cte_references("SELECT cte.k, 'cte.k' AS lit FROM cte", &ctes);
+        assert!(out.contains("'cte.k'"), "literal was corrupted: {out}");
+        // The real qualifier and table ref are still rewritten.
+        assert_eq!(out.matches("__ggsql_cte_cte_").count(), 2);
+    }
+
+    #[test]
+    fn test_transform_cte_references_whitespace_around_dot() {
+        let ctes: HashSet<String> = ["cte"].iter().map(|s| s.to_string()).collect();
+        let out = transform_cte_references("SELECT cte . v FROM cte", &ctes);
+        // The whitespace-separated qualifier is rewritten too.
+        assert!(!out.contains("cte . v"), "qualifier not rewritten: {out}");
+        assert_eq!(out.matches("__ggsql_cte_cte_").count(), 2);
+    }
+
+    #[test]
+    fn test_transform_cte_references_case_insensitive() {
+        let ctes: HashSet<String> = ["cte"].iter().map(|s| s.to_string()).collect();
+        let out = transform_cte_references("SELECT CTE.v FROM CTE", &ctes);
+        assert_eq!(out.matches("__ggsql_cte_cte_").count(), 2);
+    }
+
     /// Minimal reader that records `materialize_table` calls, used to unit-test
     /// source-reference staging without a database.
     struct MockReader {
@@ -656,6 +818,65 @@ mod tests {
         assert!(!out.contains("FROM base"));
     }
 
+    #[test]
+    fn test_transform_source_references_case_insensitive_qualifier() {
+        let reader = MockReader::new(true);
+        // The full qualifier is spelled in a different case than the table_ref;
+        // unquoted identifiers fold, so it must still be rewritten.
+        let sql = "SELECT MYSCHEMA.BASE.w FROM \"__ggsql_cte_t__\" \
+                   JOIN myschema.base ON MYSCHEMA.BASE.k = 1";
+        let out = transform_source_references(sql, &reader).unwrap();
+
+        assert_eq!(reader.staged.borrow().len(), 1);
+        assert!(
+            !out.contains("MYSCHEMA"),
+            "case-variant qualifier not rewritten: {out}"
+        );
+    }
+
+    #[test]
+    fn test_last_identifier_component() {
+        assert_eq!(last_identifier_component("base"), "base");
+        assert_eq!(last_identifier_component("schema.base"), "base");
+        assert_eq!(last_identifier_component("cat.schema.base"), "base");
+        assert_eq!(last_identifier_component("\"schema\".\"base\""), "\"base\"");
+        assert_eq!(last_identifier_component("schema.\"base\""), "\"base\"");
+        // A dot inside a quoted component is not a separator.
+        assert_eq!(last_identifier_component("\"my.base\""), "\"my.base\"");
+    }
+
+    #[test]
+    fn test_transform_source_references_fully_quoted_qualified() {
+        let reader = MockReader::new(true);
+        // `"s"."t"` must stage and alias to the (quoted) last component.
+        let sql = "SELECT \"myschema\".\"base\".w FROM \"__ggsql_cte_t__\" \
+                   JOIN \"myschema\".\"base\" ON \"myschema\".\"base\".k = 1";
+        let out = transform_source_references(sql, &reader).unwrap();
+
+        let staged = reader.staged.borrow();
+        assert_eq!(staged.len(), 1);
+        assert_eq!(staged[0].1, "SELECT * FROM \"myschema\".\"base\"");
+        // The staged table is aliased AS "base" and full qualifiers rewritten.
+        assert!(out.contains("AS \"base\""));
+        assert!(!out.contains("\"myschema\".\"base\""));
+    }
+
+    #[test]
+    fn test_transform_source_references_whitespace_around_dot() {
+        let reader = MockReader::new(true);
+        // Whitespace/newlines around the dots (in both the table_ref and the full
+        // column qualifier) must not defeat staging or the qualifier rewrite.
+        let sql = "SELECT myschema .\nbase . w FROM \"__ggsql_cte_t__\" \
+                   JOIN myschema . base ON myschema . base . k = 1";
+        let out = transform_source_references(sql, &reader).unwrap();
+
+        let staged = reader.staged.borrow();
+        assert_eq!(staged.len(), 1);
+        // The full column qualifiers are rewritten to the alias despite spacing.
+        assert!(!out.contains("myschema"), "qualifier not rewritten: {out}");
+        assert!(out.contains("AS base"));
+    }
+
     #[test]
     fn test_transform_source_references_schema_qualified() {
         let reader = MockReader::new(true);
@@ -677,6 +898,47 @@ mod tests {
         assert_eq!(reader.staged.borrow().len(), 1);
     }
 
+    #[test]
+    fn test_transform_source_references_comma_join() {
+        // A comma join between a cache-resident temp and a primary base table:
+        // the primary side must be staged (the old FROM/JOIN regex missed this).
+        let reader = MockReader::new(true);
+        let sql = "SELECT * FROM \"__ggsql_cte_t__\", base";
+        let out = transform_source_references(sql, &reader).unwrap();
+
+        let staged = reader.staged.borrow();
+        assert_eq!(staged.len(), 1);
+        assert_eq!(staged[0].1, "SELECT * FROM base");
+        assert!(out.contains("__ggsql_staged_0_"));
+        assert!(out.contains("\"__ggsql_cte_t__\""));
+    }
+
+    #[test]
+    fn test_transform_source_references_preserves_string_literals() {
+        // A string literal that happens to look like a qualified reference must
+        // not be rewritten.
+        let reader = MockReader::new(true);
+        let sql = "SELECT * FROM \"__ggsql_cte_t__\" JOIN myschema.base \
+                   ON note = 'myschema.base.k'";
+        let out = transform_source_references(sql, &reader).unwrap();
+
+        assert_eq!(reader.staged.borrow().len(), 1);
+        // The literal is untouched; the real qualifier in the table_ref is staged.
+        assert!(out.contains("'myschema.base.k'"));
+        assert!(!out.contains("JOIN myschema.base "));
+    }
+
+    #[test]
+    fn test_transform_source_references_builtin_not_staged() {
+        // A `ggsql:` builtin resolves against the cache, so a JOIN against it and
+        // a resident temp is entirely cache-side and needs no staging.
+        let reader = MockReader::new(true);
+        let sql = "SELECT * FROM \"__ggsql_cte_t__\" JOIN ggsql:penguins ON 1 = 1";
+        let out = transform_source_references(sql, &reader).unwrap();
+        assert!(reader.staged.borrow().is_empty());
+        assert_eq!(out, sql);
+    }
+
     #[test]
     fn test_split_with_query_basic() {
         let sql = "WITH cte AS (SELECT * FROM x) SELECT * FROM cte";
diff --git a/src/execute/mod.rs b/src/execute/mod.rs
index 0e6379d01..51b90b94b 100644
--- a/src/execute/mod.rs
+++ b/src/execute/mod.rs
@@ -1135,6 +1135,13 @@ pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result<Prep
         reader.execute_sql(&stmt)?;
     }
 
+    // Run structured DML (CREATE, INSERT, UPDATE, DELETE) before CTE
+    // materialization and the global query, so any table they create or
+    // populate exists before it is read.
+    for stmt in cte::extract_side_effects(&source_tree) {
+        reader.execute_sql(&stmt)?;
+    }
+
     // Extract CTE definitions from the source tree (in declaration order)
     let ctes = cte::extract_ctes(&source_tree);
 
@@ -1152,12 +1159,9 @@ pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result<Prep
     // The global result is stored as a temp table so filtered layers can query it efficiently.
     let mut has_global_table = false;
     if sql_part.is_some() {
-        let (side_effects, query) =
-            cte::transform_global_sql(&source_tree, &materialized_ctes, reader)?;
-
-        for stmt in &side_effects {
-            reader.execute_sql(stmt)?;
-        }
+        // Side-effect DML has already run above; this yields just the global
+        // query (CTE references rewritten, primary sources staged).
+        let query = cte::transform_global_sql(&source_tree, &materialized_ctes, reader)?;
 
         if let Some(query) = query {
             // Materialize the global result as a temp table. For a plain reader
diff --git a/src/parser/mod.rs b/src/parser/mod.rs
index 3fae71627..a9171c64f 100644
--- a/src/parser/mod.rs
+++ b/src/parser/mod.rs
@@ -42,9 +42,14 @@ use crate::{Plot, Result};
 
 pub mod builder;
 pub mod source_tree;
+pub mod sql;
 
 pub use builder::build_ast;
 pub use source_tree::SourceTree;
+pub use sql::{
+    extract_builtin_dataset_names, extract_table_ref_sites, extract_table_refs,
+    rewrite_namespaced_sql, string_literal_ranges, TableRefSite,
+};
 
 /// Main entry point for parsing ggsql queries
 ///
diff --git a/src/parser/sql.rs b/src/parser/sql.rs
new file mode 100644
index 000000000..da2a96f32
--- /dev/null
+++ b/src/parser/sql.rs
@@ -0,0 +1,222 @@
+//! SQL structural extraction over the tree-sitter parse tree.
+//!
+//! These helpers parse a SQL fragment and pull structure out of it for
+//! the reader and execution layers.
+
+use crate::parser::SourceTree;
+use crate::{naming, Result};
+
+/// A `table_ref` occurrence in a SQL query.
+pub struct TableRefSite {
+    /// Byte offset of the start of the `table` node.
+    pub start: usize,
+    /// Byte offset of the end of the `table` node.
+    pub end: usize,
+    /// Raw source text of the `table` node.
+    pub raw: String,
+    /// Whether the ref carries an explicit `alias`.
+    pub has_alias: bool,
+}
+
+/// Locate every `table_ref` in a SQL query, with the byte range of its `table`
+/// target and whether it is aliased.
+///
+/// Subquery sources have no `table` field and are skipped.
+pub fn extract_table_ref_sites(sql: &str) -> Result<Vec<TableRefSite>> {
+    let source_tree = SourceTree::new(sql)?;
+    let root = source_tree.root();
+
+    let mut sites = Vec::new();
+    for node in source_tree.find_nodes(&root, "(table_ref) @ref") {
+        let Some(table) = node.child_by_field_name("table") else {
+            continue;
+        };
+        sites.push(TableRefSite {
+            start: table.start_byte(),
+            end: table.end_byte(),
+            raw: source_tree.get_text(&table),
+            has_alias: node.child_by_field_name("alias").is_some(),
+        });
+    }
+    Ok(sites)
+}
+
+/// Extract the table names referenced in a SQL query's `FROM`/`JOIN` clauses.
+///
+/// Returns the `table_ref` targets, with surrounding quotes stripped. Subquery
+/// sources contribute no name.
+pub fn extract_table_refs(sql: &str) -> Result<Vec<String>> {
+    let source_tree = SourceTree::new(sql)?;
+    let root = source_tree.root();
+
+    let mut names: Vec<String> = source_tree
+        .find_texts(&root, "(table_ref table: (_) @table)")
+        .iter()
+        .map(|t| naming::unquote_ident(t))
+        .collect();
+    names.sort_unstable();
+    names.dedup();
+    Ok(names)
+}
+
+/// Byte ranges of string literals in `sql`.
+pub fn string_literal_ranges(sql: &str) -> Vec<(usize, usize)> {
+    let Ok(source_tree) = SourceTree::new(sql) else {
+        return Vec::new();
+    };
+    let root = source_tree.root();
+    source_tree
+        .find_nodes(&root, "(string) @s")
+        .iter()
+        .map(|n| (n.start_byte(), n.end_byte()))
+        .collect()
+}
+
+/// Extract builtin dataset names from SQL containing namespaced identifiers.
+pub fn extract_builtin_dataset_names(sql: &str) -> Result<Vec<String>> {
+    let source_tree = SourceTree::new(sql)?;
+    let root = source_tree.root();
+
+    let mut names: Vec<String> = source_tree
+        .find_texts(&root, "(namespaced_identifier) @select")
+        .iter()
+        .filter_map(|token| token.strip_prefix("ggsql:").map(|s| s.to_string()))
+        .collect();
+    names.sort_unstable();
+    names.dedup();
+    Ok(names)
+}
+
+/// Rewrite SQL to replace namespaced identifiers with internal table names.
+///
+/// e.g. `SELECT * FROM ggsql:penguins` → `SELECT * FROM __ggsql_data_penguins__`.
+///
+/// Uses the parse tree to find the positions of namespaced identifiers, then
+/// replaces them.
+pub fn rewrite_namespaced_sql(sql: &str) -> Result<String> {
+    let source_tree = SourceTree::new(sql)?;
+    let root = source_tree.root();
+
+    // Collect (start_byte, end_byte, replacement) tuples.
+    let mut replacements: Vec<(usize, usize, String)> = Vec::new();
+    for node in source_tree.find_nodes(&root, "(namespaced_identifier) @select") {
+        let full_text = source_tree.get_text(&node);
+        if let Some(name) = full_text.strip_prefix("ggsql:") {
+            replacements.push((
+                node.start_byte(),
+                node.end_byte(),
+                naming::quote_ident(&naming::builtin_data_table(name)),
+            ));
+        }
+    }
+
+    if replacements.is_empty() {
+        return Ok(sql.to_string());
+    }
+
+    // Apply replacements in reverse byte order to preserve earlier offsets.
+    let mut result = sql.to_string();
+    replacements.sort_by_key(|(start, _, _)| std::cmp::Reverse(*start));
+    for (start, end, replacement) in replacements {
+        result.replace_range(start..end, &replacement);
+    }
+    Ok(result)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_extract_builtin_dataset_names_single() {
+        let sql = "SELECT * FROM ggsql:penguins VISUALISE DRAW point MAPPING x AS x";
+        assert_eq!(
+            extract_builtin_dataset_names(sql).unwrap(),
+            vec!["penguins"]
+        );
+    }
+
+    #[test]
+    fn test_extract_builtin_dataset_names_multiple() {
+        let sql =
+            "SELECT * FROM ggsql:penguins, ggsql:airquality VISUALISE DRAW point MAPPING x AS x";
+        let names = extract_builtin_dataset_names(sql).unwrap();
+        assert_eq!(names.len(), 2);
+        assert!(names.contains(&"airquality".to_string()));
+        assert!(names.contains(&"penguins".to_string()));
+    }
+
+    #[test]
+    fn test_extract_builtin_dataset_names_dedup() {
+        let sql = "SELECT * FROM ggsql:penguins p1, ggsql:penguins p2 VISUALISE DRAW point MAPPING x AS x";
+        assert_eq!(
+            extract_builtin_dataset_names(sql).unwrap(),
+            vec!["penguins"]
+        );
+    }
+
+    #[test]
+    fn test_extract_builtin_dataset_names_none() {
+        let sql = "SELECT * FROM regular_table VISUALISE DRAW point MAPPING x AS x";
+        assert!(extract_builtin_dataset_names(sql).unwrap().is_empty());
+    }
+
+    #[test]
+    fn test_rewrite_namespaced_sql_simple() {
+        let sql = "SELECT * FROM ggsql:penguins";
+        assert_eq!(
+            rewrite_namespaced_sql(sql).unwrap(),
+            "SELECT * FROM \"__ggsql_data_penguins__\""
+        );
+    }
+
+    #[test]
+    fn test_rewrite_namespaced_sql_multiple() {
+        let sql = "SELECT * FROM ggsql:penguins p, ggsql:airquality a WHERE p.id = a.id";
+        assert_eq!(
+            rewrite_namespaced_sql(sql).unwrap(),
+            "SELECT * FROM \"__ggsql_data_penguins__\" p, \"__ggsql_data_airquality__\" a WHERE p.id = a.id"
+        );
+    }
+
+    #[test]
+    fn test_rewrite_namespaced_sql_no_change() {
+        let sql = "SELECT * FROM regular_table WHERE x > 5";
+        assert_eq!(rewrite_namespaced_sql(sql).unwrap(), sql);
+    }
+
+    #[test]
+    fn test_rewrite_namespaced_sql_with_visualise() {
+        let sql = "SELECT * FROM ggsql:penguins VISUALISE DRAW point MAPPING bill_len AS x, bill_dep AS y";
+        let rewritten = rewrite_namespaced_sql(sql).unwrap();
+        assert!(rewritten.starts_with("SELECT * FROM \"__ggsql_data_penguins__\""));
+        assert!(!rewritten.contains("ggsql:"));
+    }
+
+    #[test]
+    fn test_extract_table_refs_basic() {
+        let mut refs = extract_table_refs("SELECT * FROM a JOIN \"b c\" ON a.k = 1").unwrap();
+        refs.sort();
+        assert_eq!(refs, vec!["a".to_string(), "b c".to_string()]);
+    }
+
+    #[test]
+    fn test_extract_table_ref_sites_alias_and_range() {
+        let sites =
+            extract_table_ref_sites("SELECT * FROM orders o JOIN items ON o.k = 1").unwrap();
+        assert_eq!(sites.len(), 2);
+        assert_eq!(sites[0].raw, "orders");
+        assert!(sites[0].has_alias);
+        assert_eq!(sites[1].raw, "items");
+        assert!(!sites[1].has_alias);
+    }
+
+    #[test]
+    fn test_string_literal_ranges_finds_literals() {
+        let sql = "SELECT * FROM t WHERE note = 'hello'";
+        let ranges = string_literal_ranges(sql);
+        assert_eq!(ranges.len(), 1);
+        let (s, e) = ranges[0];
+        assert_eq!(&sql[s..e], "'hello'");
+    }
+}
diff --git a/src/reader/cache.rs b/src/reader/cache.rs
index 6ed5dc6c2..fcab27329 100644
--- a/src/reader/cache.rs
+++ b/src/reader/cache.rs
@@ -308,16 +308,24 @@ impl CachingReader {
         }
     }
 
-    /// Whether `sql` references a cache-resident table by exact name.
+    /// Whether `sql` reads something the cache backend owns: a cache-resident
+    /// table, the metadata table, or a `ggsql:` builtin dataset.
     ///
-    /// Parses the query's `table_ref` targets and tests them against `resident`.
+    /// Determined from the parsed `table_ref`/namespaced-identifier targets.
     /// On a parse failure we conservatively return `false`.
-    fn references_resident(&self, sql: &str) -> bool {
-        let Ok(refs) = super::data::extract_table_refs(sql) else {
+    fn references_cache_resident(&self, sql: &str) -> bool {
+        if crate::parser::extract_builtin_dataset_names(sql)
+            .map(|d| !d.is_empty())
+            .unwrap_or(false)
+        {
+            return true;
+        }
+        let Ok(refs) = crate::parser::extract_table_refs(sql) else {
             return false;
         };
         let resident = self.resident.borrow();
-        refs.iter().any(|t| resident.contains(t))
+        refs.iter()
+            .any(|t| t == naming::CACHE_META_TABLE || resident.contains(t))
     }
 
     /// Drop every memoized result, one entry at a time.
@@ -357,10 +365,7 @@ impl Reader for CachingReader {
     fn execute_sql(&self, sql: &str) -> Result<DataFrame> {
         // Route to the cache when the read targets cache-resident objects, the
         // metadata table, or a builtin dataset.
-        if sql.contains("ggsql:")
-            || sql.contains(naming::CACHE_META_TABLE)
-            || self.references_resident(sql)
-        {
+        if self.references_cache_resident(sql) {
             return self.cache.execute_sql(sql);
         }
 
@@ -769,6 +774,135 @@ mod behavior_tests {
         );
     }
 
+    #[test]
+    fn test_multiple_primary_joins_in_chain() {
+        // A chain of joins bringing in two distinct primary tables must stage both.
+        let base = DuckDBReader::new_in_memory().unwrap();
+        base.execute_sql("CREATE TABLE base AS SELECT 1 AS k, 10 AS w")
+            .unwrap();
+        base.execute_sql("CREATE TABLE base2 AS SELECT 1 AS k, 20 AS z")
+            .unwrap();
+        let primary = Box::new(ReadOnlyReader::new(Box::new(base)));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache, "test://primary");
+
+        let spec = reader.execute(
+            "WITH t AS (SELECT 1 AS k) \
+             SELECT base.w, base2.z FROM t JOIN base ON t.k = base.k JOIN base2 ON t.k = base2.k \
+             VISUALISE w AS x, z AS y DRAW point",
+        );
+        assert!(
+            spec.is_ok(),
+            "two primaries in a join chain should both be staged: {:?}",
+            spec.err()
+        );
+    }
+
+    #[test]
+    fn test_same_named_schema_tables_joined() {
+        // Two tables sharing a final name across schemas, joined and referenced
+        // by full qualifier: they must not collapse to the same alias.
+        let base = DuckDBReader::new_in_memory().unwrap();
+        base.execute_sql("CREATE SCHEMA a").unwrap();
+        base.execute_sql("CREATE SCHEMA b").unwrap();
+        base.execute_sql("CREATE TABLE a.base AS SELECT 1 AS k, 10 AS w")
+            .unwrap();
+        base.execute_sql("CREATE TABLE b.base AS SELECT 1 AS k, 20 AS w")
+            .unwrap();
+        let primary = Box::new(ReadOnlyReader::new(Box::new(base)));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache, "test://primary");
+
+        let spec = reader.execute(
+            "WITH t AS (SELECT 1 AS k) \
+             SELECT a.base.w AS aw, b.base.w AS bw \
+             FROM t JOIN a.base ON t.k = a.base.k JOIN b.base ON t.k = b.base.k \
+             VISUALISE aw AS x, bw AS y DRAW point",
+        );
+        assert!(
+            spec.is_ok(),
+            "same-named schema tables should stage to distinct aliases: {:?}",
+            spec.err()
+        );
+    }
+
+    #[test]
+    fn test_setup_dml_runs_before_staging() {
+        // A query that CREATEs a primary table and then joins it against a CTE
+        // (mixed) must run the CREATE before staging reads the table.
+        let primary = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache, "test://primary");
+
+        let spec = reader.execute(
+            "CREATE TABLE sales AS SELECT * FROM (VALUES (1, 10), (2, 20)) t(k, w); \
+             WITH c AS (SELECT 1 AS k UNION ALL SELECT 2) \
+             SELECT sales.k, sales.w FROM sales JOIN c ON sales.k = c.k \
+             VISUALISE k AS x, w AS y DRAW point",
+        );
+        assert!(
+            spec.is_ok(),
+            "setup DML must run before staging reads the created table: {:?}",
+            spec.err()
+        );
+    }
+
+    #[test]
+    fn test_multi_statement_dml_ordering_before_staging() {
+        // Multiple dependent side-effects (CREATE, INSERT, UPDATE) must run in
+        // order before staging, and the staged data must reflect them.
+        let primary = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache, "test://primary");
+
+        let prepared = crate::execute::prepare_data_with_reader(
+            "CREATE TABLE t(k INTEGER, w INTEGER); \
+             INSERT INTO t VALUES (1, 10), (2, 20); \
+             UPDATE t SET w = w + 5 WHERE k = 1; \
+             WITH c AS (SELECT 1 AS k UNION ALL SELECT 2) \
+             SELECT t.k, t.w FROM t JOIN c ON t.k = c.k \
+             VISUALISE k AS x, w AS y DRAW point",
+            &reader,
+        )
+        .expect("multi-statement setup DML + staging should succeed");
+
+        let df = prepared.data.get(&naming::layer_key(0)).unwrap();
+        // Row for k=1 must reflect the UPDATE (w = 15), k=2 unchanged (w = 20).
+        assert_eq!(df.height(), 2);
+        let ws = df.column("__ggsql_aes_pos2__").unwrap();
+        let mut vals: Vec<String> = (0..df.height())
+            .map(|i| crate::array_util::value_to_string(ws, i))
+            .collect();
+        vals.sort();
+        assert_eq!(vals, vec!["15".to_string(), "20".to_string()]);
+    }
+
+    #[test]
+    fn test_fully_quoted_schema_qualified_join() {
+        // A fully double-quoted schema-qualified primary (`"s"."t"`) joined with
+        // a cache-resident CTE: the last component must be extracted correctly so
+        // the staged alias is valid.
+        let base = DuckDBReader::new_in_memory().unwrap();
+        base.execute_sql("CREATE SCHEMA myschema").unwrap();
+        base.execute_sql("CREATE TABLE myschema.base AS SELECT 1 AS k, 10 AS w")
+            .unwrap();
+        let primary = Box::new(ReadOnlyReader::new(Box::new(base)));
+        let cache = Box::new(DuckDBReader::new_in_memory().unwrap());
+        let reader = CachingReader::new(primary, cache, "test://primary");
+
+        let spec = reader.execute(
+            "WITH t AS (SELECT 1 AS k) \
+             SELECT \"myschema\".\"base\".w FROM t JOIN \"myschema\".\"base\" \
+             ON t.k = \"myschema\".\"base\".k \
+             VISUALISE w AS x DRAW point",
+        );
+        assert!(
+            spec.is_ok(),
+            "fully-quoted schema-qualified join should succeed: {:?}",
+            spec.err()
+        );
+    }
+
     #[test]
     fn test_mixed_later_cte_body_joins_primary() {
         // A later CTE whose body references an earlier (cache-resident) CTE and a
diff --git a/src/reader/data.rs b/src/reader/data.rs
index 384d944ff..bc9966826 100644
--- a/src/reader/data.rs
+++ b/src/reader/data.rs
@@ -1,6 +1,4 @@
-use tree_sitter::{Parser, Query, StreamingIterator};
-
-use crate::{naming, GgsqlError};
+use crate::GgsqlError;
 
 // =============================================================================
 // Embedded dataset bytes
@@ -119,250 +117,6 @@ pub fn is_known_builtin(name: &str) -> bool {
     KNOWN_DATASETS.contains(&name)
 }
 
-// =============================================================================
-// SQL namespace rewriting (always available, including WASM)
-// =============================================================================
-
-/// Extract builtin dataset names from SQL containing namespaced identifiers.
-///
-/// Finds `ggsql:X` patterns via tree-sitter and returns the dataset names
-/// (without the `ggsql:` prefix), deduplicated.
-pub fn extract_builtin_dataset_names(sql: &str) -> Result<Vec<String>, GgsqlError> {
-    let token_def = r#"(namespaced_identifier) @select"#;
-    let mut tokens = tokens_from_tree(sql, token_def, "select")?;
-
-    if tokens.is_empty() {
-        return Ok(Vec::new());
-    }
-
-    tokens.sort_unstable();
-    tokens.dedup();
-
-    let datasets: Vec<String> = tokens
-        .iter()
-        .filter_map(|token| token.strip_prefix("ggsql:").map(|s| s.to_string()))
-        .collect();
-
-    Ok(datasets)
-}
-
-/// Extract the table names referenced in a SQL query's `FROM`/`JOIN` clauses.
-///
-/// Returns the `table_ref` targets (deduplicated), with surrounding quotes
-/// stripped via [`naming::unquote_ident`] so they can be compared against
-/// unquoted registered table names. Subquery sources contribute no name.
-pub fn extract_table_refs(sql: &str) -> Result<Vec<String>, GgsqlError> {
-    let token_def = r#"(table_ref table: (_) @table)"#;
-    let mut tokens = tokens_from_tree(sql, token_def, "table")?
-        .iter()
-        .map(|t| naming::unquote_ident(t))
-        .collect::<Vec<_>>();
-    tokens.sort_unstable();
-    tokens.dedup();
-    Ok(tokens)
-}
-
-/// Rewrite SQL to replace namespaced identifiers with internal table names.
-///
-/// e.g., `SELECT * FROM ggsql:penguins` -> `SELECT * FROM __ggsql_data_penguins__`
-///
-/// Uses tree-sitter to find the exact byte positions of namespaced identifiers,
-/// then replaces them in reverse order to preserve offsets.
-pub fn rewrite_namespaced_sql(sql: &str) -> Result<String, GgsqlError> {
-    let token_def = r#"(namespaced_identifier) @select"#;
-
-    // Parse to get byte positions
-    let mut parser = Parser::new();
-    parser
-        .set_language(&tree_sitter_ggsql::language())
-        .map_err(|e| GgsqlError::ParseError(format!("Failed to initialise parser: {}", e)))?;
-
-    let tree = parser
-        .parse(sql, None)
-        .ok_or_else(|| GgsqlError::ParseError(format!("Failed to parse query: {}", sql)))?;
-
-    let query = Query::new(&tree.language(), token_def)
-        .map_err(|e| GgsqlError::ParseError(format!("Failed to initialise tree_query: {}", e)))?;
-
-    let index = query
-        .capture_index_for_name("select")
-        .ok_or_else(|| GgsqlError::ParseError("Failed to capture index".to_string()))?;
-
-    let mut cursor = tree_sitter::QueryCursor::new();
-    let mut matches = cursor.matches(&query, tree.root_node(), sql.as_bytes());
-
-    // Collect (start_byte, end_byte, replacement) tuples
-    let mut replacements: Vec<(usize, usize, String)> = Vec::new();
-    while let Some(matching) = matches.next() {
-        for item in matching.captures {
-            if item.index != index {
-                continue;
-            }
-            let node = item.node;
-            let full_text = &sql[node.start_byte()..node.end_byte()];
-            if let Some(name) = full_text.strip_prefix("ggsql:") {
-                replacements.push((
-                    node.start_byte(),
-                    node.end_byte(),
-                    naming::quote_ident(&naming::builtin_data_table(name)),
-                ));
-            }
-        }
-    }
-
-    if replacements.is_empty() {
-        return Ok(sql.to_string());
-    }
-
-    // Apply replacements in reverse byte order to preserve earlier offsets
-    let mut result = sql.to_string();
-    replacements.sort_by_key(|(start, _, _)| std::cmp::Reverse(*start));
-    for (start, end, replacement) in replacements {
-        result.replace_range(start..end, &replacement);
-    }
-
-    Ok(result)
-}
-
-// =============================================================================
-// Shared tree-sitter helpers
-// =============================================================================
-
-fn tokens_from_tree(
-    sql_query: &str,
-    tree_query: &str,
-    name: &str,
-) -> Result<Vec<String>, GgsqlError> {
-    // Setup parser
-    let mut parser = Parser::new();
-    if let Err(e) = parser.set_language(&tree_sitter_ggsql::language()) {
-        return Err(GgsqlError::ParseError(format!(
-            "Failed to initialise parser: {}",
-            e
-        )));
-    }
-
-    // Digest SQL to tree
-    let tree = parser.parse(sql_query, None);
-    if tree.is_none() {
-        return Err(GgsqlError::ParseError(format!(
-            "Failed to parse query: {}",
-            sql_query
-        )));
-    }
-    let tree = tree.unwrap();
-
-    // Setup query for tree
-    let query = Query::new(&tree.language(), tree_query);
-    if let Err(e) = query {
-        return Err(GgsqlError::ParseError(format!(
-            "Failed to initialise `tree_query`: {}",
-            e
-        )));
-    }
-    let query = query.unwrap();
-
-    // Find `name` in `tree_query`
-    let index = query.capture_index_for_name(name);
-    if index.is_none() {
-        return Err(GgsqlError::ParseError(
-            "Failed to capture index for `tree_query`".to_string(),
-        ));
-    }
-    let index = index.unwrap();
-
-    // Find matches of `tree_query` in the parsed tree
-    let mut cursor = tree_sitter::QueryCursor::new();
-    let mut matches = cursor.matches(&query, tree.root_node(), sql_query.as_bytes());
-
-    // Collect results
-    let mut result: Vec<String> = Vec::new();
-    while let Some(matching) = matches.next() {
-        for item in matching.captures {
-            if item.index != index {
-                // We have a match with a different @keyword than the one defined by `name`.
-                continue;
-            }
-            let node = item.node;
-            let token = &sql_query[node.start_byte()..node.end_byte()];
-            result.push(token.to_string());
-        }
-    }
-    Ok(result)
-}
-
-// =============================================================================
-// Tests
-// =============================================================================
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_extract_builtin_dataset_names_single() {
-        let sql = "SELECT * FROM ggsql:penguins VISUALISE DRAW point MAPPING x AS x";
-        let names = extract_builtin_dataset_names(sql).unwrap();
-        assert_eq!(names, vec!["penguins"]);
-    }
-
-    #[test]
-    fn test_extract_builtin_dataset_names_multiple() {
-        let sql =
-            "SELECT * FROM ggsql:penguins, ggsql:airquality VISUALISE DRAW point MAPPING x AS x";
-        let names = extract_builtin_dataset_names(sql).unwrap();
-        assert_eq!(names.len(), 2);
-        assert!(names.contains(&"airquality".to_string()));
-        assert!(names.contains(&"penguins".to_string()));
-    }
-
-    #[test]
-    fn test_extract_builtin_dataset_names_dedup() {
-        let sql = "SELECT * FROM ggsql:penguins p1, ggsql:penguins p2 VISUALISE DRAW point MAPPING x AS x";
-        let names = extract_builtin_dataset_names(sql).unwrap();
-        assert_eq!(names, vec!["penguins"]);
-    }
-
-    #[test]
-    fn test_extract_builtin_dataset_names_none() {
-        let sql = "SELECT * FROM regular_table VISUALISE DRAW point MAPPING x AS x";
-        let names = extract_builtin_dataset_names(sql).unwrap();
-        assert!(names.is_empty());
-    }
-
-    #[test]
-    fn test_rewrite_namespaced_sql_simple() {
-        let sql = "SELECT * FROM ggsql:penguins";
-        let rewritten = rewrite_namespaced_sql(sql).unwrap();
-        assert_eq!(rewritten, "SELECT * FROM \"__ggsql_data_penguins__\"");
-    }
-
-    #[test]
-    fn test_rewrite_namespaced_sql_multiple() {
-        let sql = "SELECT * FROM ggsql:penguins p, ggsql:airquality a WHERE p.id = a.id";
-        let rewritten = rewrite_namespaced_sql(sql).unwrap();
-        assert_eq!(
-            rewritten,
-            "SELECT * FROM \"__ggsql_data_penguins__\" p, \"__ggsql_data_airquality__\" a WHERE p.id = a.id"
-        );
-    }
-
-    #[test]
-    fn test_rewrite_namespaced_sql_no_change() {
-        let sql = "SELECT * FROM regular_table WHERE x > 5";
-        let rewritten = rewrite_namespaced_sql(sql).unwrap();
-        assert_eq!(rewritten, sql);
-    }
-
-    #[test]
-    fn test_rewrite_namespaced_sql_with_visualise() {
-        let sql = "SELECT * FROM ggsql:penguins VISUALISE DRAW point MAPPING bill_len AS x, bill_dep AS y";
-        let rewritten = rewrite_namespaced_sql(sql).unwrap();
-        assert!(rewritten.starts_with("SELECT * FROM \"__ggsql_data_penguins__\""));
-        assert!(!rewritten.contains("ggsql:"));
-    }
-}
-
 #[cfg(all(feature = "duckdb", feature = "builtin-data"))]
 #[cfg(test)]
 mod duckdb_tests {
diff --git a/src/reader/duckdb.rs b/src/reader/duckdb.rs
index 5badc5324..cde53e32e 100644
--- a/src/reader/duckdb.rs
+++ b/src/reader/duckdb.rs
@@ -25,7 +25,7 @@ use std::sync::Arc;
 fn register_builtin_datasets_duckdb(sql: &str, conn: &Connection) -> Result<()> {
     use std::{env, fs};
 
-    let dataset_names = super::data::extract_builtin_dataset_names(sql)?;
+    let dataset_names = crate::parser::extract_builtin_dataset_names(sql)?;
 
     // Load spatial extension before registering datasets that contain
     // geometry columns, so that spatial features are available.
@@ -366,7 +366,7 @@ impl Reader for DuckDBReader {
         register_builtin_datasets_duckdb(sql, &self.conn)?;
 
         // Rewrite ggsql:name → __ggsql_data_name__ in SQL
-        let sql = super::data::rewrite_namespaced_sql(sql)?;
+        let sql = crate::parser::rewrite_namespaced_sql(sql)?;
 
         if !super::returns_rows(&sql) {
             self.conn
diff --git a/src/reader/sqlite.rs b/src/reader/sqlite.rs
index 245c65a96..6df181633 100644
--- a/src/reader/sqlite.rs
+++ b/src/reader/sqlite.rs
@@ -387,7 +387,7 @@ impl Reader for SqliteReader {
         // Handle ggsql:name namespaced identifiers (builtin datasets)
         #[cfg(all(feature = "builtin-data", feature = "parquet"))]
         {
-            let dataset_names = super::data::extract_builtin_dataset_names(sql)?;
+            let dataset_names = crate::parser::extract_builtin_dataset_names(sql)?;
             for name in &dataset_names {
                 let table_name = naming::builtin_data_table(name);
                 if !self.table_exists(&table_name) {
@@ -398,7 +398,7 @@ impl Reader for SqliteReader {
         }
 
         // Rewrite ggsql:name → __ggsql_data_name__ in SQL
-        let sql = super::data::rewrite_namespaced_sql(sql)?;
+        let sql = crate::parser::rewrite_namespaced_sql(sql)?;
 
         if !super::returns_rows(&sql) {
             self.conn