From ae2f41172f1bffd20c3868a577c5a27eecca04e1 Mon Sep 17 00:00:00 2001 From: George Stagg Date: Mon, 29 Jun 2026 14:23:29 +0100 Subject: [PATCH 01/15] Initial caching layer implementation --- ggsql-cli/CLAUDE.md | 2 + ggsql-cli/src/main.rs | 127 ++++---- ggsql-jupyter/CLAUDE.md | 2 +- ggsql-jupyter/src/executor.rs | 43 +-- src/CLAUDE.md | 3 + src/execute/cte.rs | 11 +- src/execute/mod.rs | 45 ++- src/naming.rs | 42 +++ src/reader/adbc.rs | 66 +---- src/reader/cache.rs | 504 ++++++++++++++++++++++++++++++++ src/reader/cache_equivalence.rs | 334 +++++++++++++++++++++ src/reader/connection.rs | 249 +++++++++------- src/reader/duckdb.rs | 36 ++- src/reader/mod.rs | 211 +++++++++++++ src/reader/sqlite.rs | 44 ++- 15 files changed, 1399 insertions(+), 320 deletions(-) create mode 100644 src/reader/cache.rs create mode 100644 src/reader/cache_equivalence.rs diff --git a/ggsql-cli/CLAUDE.md b/ggsql-cli/CLAUDE.md index c6ed59703..17dba8134 100644 --- a/ggsql-cli/CLAUDE.md +++ b/ggsql-cli/CLAUDE.md @@ -32,6 +32,8 @@ The binary name is `ggsql` (not `ggsql-cli`) — that's what release artifacts a Only public `ggsql::*` API is used (`reader`, `writer`, `validate`, `parser`, `VERSION`) — this crate has no awareness of internal modules. +`exec`/`run` build their reader via the library factory `ggsql::reader::connection::reader_from_uri`. They accept an in-memory caching layer (off by default) selected either by the composite connection scheme `+://…` (e.g. `odbc+duckdb://…`) or the `--cache ` flag; the two cannot be combined. + ## Build & install ```sh diff --git a/ggsql-cli/src/main.rs b/ggsql-cli/src/main.rs index d1b0cb0de..0a004fbce 100644 --- a/ggsql-cli/src/main.rs +++ b/ggsql-cli/src/main.rs @@ -34,10 +34,14 @@ pub enum Commands { /// The ggsql query to execute query: String, - /// Data source connection string (duckdb://, sqlite://, odbc://) + /// Data source connection string (duckdb://, sqlite://, odbc://). #[arg(long, default_value = "duckdb://memory")] reader: String, + /// In-memory cache backend wrapping the reader (duckdb, sqlite). Off by default. + #[arg(long)] + cache: Option, + /// Output format (vegalite) #[arg(long, default_value = "vegalite")] writer: String, @@ -56,10 +60,14 @@ pub enum Commands { /// Path to .sql file containing ggsql query file: PathBuf, - /// Data source connection string (duckdb://, sqlite://, odbc://) + /// Data source connection string (duckdb://, sqlite://, odbc://). #[arg(long, default_value = "duckdb://memory")] reader: String, + /// In-memory cache backend wrapping the reader (duckdb, sqlite). Off by default. + #[arg(long)] + cache: Option, + /// Output format (vegalite) #[arg(long, default_value = "vegalite")] writer: String, @@ -147,6 +155,7 @@ fn main() -> anyhow::Result<()> { Commands::Exec { query, reader, + cache, writer, output, verbose, @@ -154,12 +163,13 @@ fn main() -> anyhow::Result<()> { if verbose { eprintln!("Executing query: {}", query); } - cmd_exec(query, reader, writer, output, verbose); + cmd_exec(query, reader, cache, writer, output, verbose); } Commands::Run { file, reader, + cache, writer, output, verbose, @@ -167,7 +177,7 @@ fn main() -> anyhow::Result<()> { if verbose { eprintln!("Running query from file: {}", file.display()); } - cmd_run(file, reader, writer, output, verbose); + cmd_run(file, reader, cache, writer, output, verbose); } Commands::Parse { query, format } => { @@ -194,9 +204,16 @@ fn main() -> anyhow::Result<()> { Ok(()) } -fn cmd_run(file: PathBuf, reader: String, writer: String, output: Option, verbose: bool) { +fn cmd_run( + file: PathBuf, + reader: String, + cache: Option, + writer: String, + output: Option, + verbose: bool, +) { match std::fs::read_to_string(&file) { - Ok(query) => cmd_exec(query, reader, writer, output, verbose), + Ok(query) => cmd_exec(query, reader, cache, writer, output, verbose), Err(e) => { eprintln!("Failed to read file {}: {}", file.display(), e); std::process::exit(1); @@ -204,76 +221,64 @@ fn cmd_run(file: PathBuf, reader: String, writer: String, output: Option, verbose: bool) { +fn cmd_exec( + query: String, + reader: String, + cache: Option, + writer: String, + output: Option, + verbose: bool, +) { + use ggsql::reader::connection; + if verbose { eprintln!("Reader: {}", reader); + if let Some(ref cache) = cache { + eprintln!("Cache: {}", cache); + } eprintln!("Writer: {}", writer); if let Some(ref output_file) = output { eprintln!("Output: {}", output_file.display()); } } - if reader.starts_with("duckdb://") { - #[cfg(feature = "duckdb")] - { - let r = match ggsql::reader::DuckDBReader::from_connection_string(&reader) { - Ok(r) => r, - Err(e) => { - eprintln!("Failed to create reader: {}", e); - std::process::exit(1); - } - }; - exec_with_reader(&query, &r, &writer, output, verbose); - } - #[cfg(not(feature = "duckdb"))] - { - eprintln!("DuckDB reader not compiled in. Rebuild with --features duckdb"); - std::process::exit(1); - } - } else if reader.starts_with("sqlite://") { - #[cfg(feature = "sqlite")] - { - let r = match ggsql::reader::SqliteReader::from_connection_string(&reader) { - Ok(r) => r, - Err(e) => { - eprintln!("Failed to create reader: {}", e); - std::process::exit(1); + // Build the reader. A composite `+://` URI is handled by + // `reader_from_uri`; the `--cache` flag is an explicit alternative and may + // not be combined with a composite URI. + let built = match cache { + Some(cache_scheme) => { + if connection::split_cache_uri(&reader).is_some() { + eprintln!( + "Cannot combine --cache with a composite '+://' connection string" + ); + std::process::exit(1); + } + // `--cache ` is sugar for the composite `+://` URI. + match reader.split_once("://") { + Some((scheme, rest)) => { + connection::reader_from_uri(&format!("{scheme}+{cache_scheme}://{rest}")) } - }; - exec_with_reader(&query, &r, &writer, output, verbose); - } - #[cfg(not(feature = "sqlite"))] - { - eprintln!("SQLite reader not compiled in. Rebuild with --features sqlite"); - std::process::exit(1); - } - } else if reader.starts_with("odbc://") { - #[cfg(feature = "odbc")] - { - let r = match ggsql::reader::OdbcReader::from_connection_string(&reader) { - Ok(r) => r, - Err(e) => { - eprintln!("Failed to create reader: {}", e); + None => { + eprintln!("Invalid --reader connection string: {reader}"); std::process::exit(1); } - }; - exec_with_reader(&query, &r, &writer, output, verbose); + } } - #[cfg(not(feature = "odbc"))] - { - eprintln!("ODBC reader not compiled in. Rebuild with --features odbc"); + None => connection::reader_from_uri(&reader), + }; + + let reader = match built { + Ok(r) => r, + Err(e) => { + eprintln!("Failed to create reader: {}", e); std::process::exit(1); } - } else if reader.starts_with("postgres://") || reader.starts_with("postgresql://") { - eprintln!("PostgreSQL reader is not yet implemented"); - std::process::exit(1); - } else { - eprintln!("Unsupported connection string: {}", reader); - std::process::exit(1); - } + }; + + exec_with_reader(&query, reader.as_ref(), &writer, output, verbose); } -fn exec_with_reader( +fn exec_with_reader( query: &str, reader: &R, writer: &str, @@ -430,7 +435,7 @@ fn cmd_validate(query: String, _reader: Option) { } // Prints a CSV-like output to stdout with aligned columns -fn print_table_fallback(query: &str, reader: &R, max_rows: usize) { +fn print_table_fallback(query: &str, reader: &R, max_rows: usize) { let source_tree = match parser::SourceTree::new(query) { Ok(st) => st, Err(e) => { diff --git a/ggsql-jupyter/CLAUDE.md b/ggsql-jupyter/CLAUDE.md index adcb5ecd3..18868e3a9 100644 --- a/ggsql-jupyter/CLAUDE.md +++ b/ggsql-jupyter/CLAUDE.md @@ -32,7 +32,7 @@ ggsql-jupyter/ 1. `ggsql-jupyter --install` writes a kernelspec into the active Python environment (Jupyter, conda, uv, virtualenv — auto-detected). 2. `ggsql-jupyter ` is the entry point Jupyter invokes; it reads the connection JSON, opens the five ZMQ sockets (shell, control, iopub, stdin, heartbeat), and runs `kernel.rs`'s message loop. -3. Each `execute_request` is dispatched through `executor.rs` → `ggsql::reader::DuckDBReader::execute(...)`. The kernel keeps a single persistent in-memory DuckDB session so cells share state. +3. Each `execute_request` is dispatched through `executor.rs` → `ggsql::reader::DuckDBReader::execute(...)`. The kernel keeps a single persistent in-memory DuckDB session so cells share state. Readers are built via the library factory `ggsql::reader::connection::reader_from_uri`, so a composite `+://` connection string (e.g. via `-- @connect:`) wraps the reader in an in-memory caching layer — the persistent kernel session means repeated cells reuse cached remote reads. 4. The result is wrapped by `display.rs` into a Jupyter `display_data` message — Vega-Lite specs go through vega-embed in an HTML payload (works in classic Jupyter, JupyterLab, and Positron); pure SQL goes out as an HTML table. ## Positron-specific bits diff --git a/ggsql-jupyter/src/executor.rs b/ggsql-jupyter/src/executor.rs index 6f928e7ab..fb9d51bfb 100644 --- a/ggsql-jupyter/src/executor.rs +++ b/ggsql-jupyter/src/executor.rs @@ -7,8 +7,8 @@ use anyhow::Result; use ggsql::{ reader::{ - connection::{extract_odbc_value, parse_connection_string}, - DuckDBReader, Reader, + connection::{extract_odbc_value, reader_from_uri}, + Reader, }, validate::validate, writer::{VegaLiteWriter, Writer}, @@ -28,41 +28,6 @@ pub enum ExecutionResult { ConnectionChanged { uri: String, display_name: String }, } -/// Create a reader from a connection URI string. -/// -/// Supported schemes: -/// - `duckdb://memory` or `duckdb://` (always available) -/// - `sqlite://` (requires `sqlite` feature) -/// - `odbc://...` (requires `odbc` feature) -pub fn create_reader(uri: &str) -> Result> { - use ggsql::reader::connection::ConnectionInfo; - - let info = parse_connection_string(uri)?; - match info { - ConnectionInfo::DuckDBMemory => { - let reader = DuckDBReader::from_connection_string("duckdb://memory")?; - Ok(Box::new(reader)) - } - ConnectionInfo::DuckDBFile(path) => { - let reader = DuckDBReader::from_connection_string(&format!("duckdb://{}", path))?; - Ok(Box::new(reader)) - } - #[cfg(feature = "odbc")] - ConnectionInfo::ODBC(conn_str) => { - let reader = - ggsql::reader::OdbcReader::from_connection_string(&format!("odbc://{}", conn_str))?; - Ok(Box::new(reader)) - } - #[cfg(feature = "sqlite")] - ConnectionInfo::SQLite(path) => { - let reader = - ggsql::reader::SqliteReader::from_connection_string(&format!("sqlite://{}", path))?; - Ok(Box::new(reader)) - } - _ => anyhow::bail!("Unsupported reader type for connection string: {}", uri), - } -} - /// Generate a human-readable display name for a connection URI. pub fn display_name_for_uri(uri: &str) -> String { if uri == "duckdb://memory" { @@ -156,7 +121,7 @@ impl QueryExecutor { /// Create a new query executor with a given connection URI pub fn new_with_uri(uri: &str) -> Result { tracing::info!("Initializing query executor with reader: {}", uri); - let reader = create_reader(uri)?; + let reader = reader_from_uri(uri)?; let writer = VegaLiteWriter::new(); Ok(Self { @@ -184,7 +149,7 @@ impl QueryExecutor { /// Swap the reader to a new connection, returning the old URI pub fn swap_reader(&mut self, uri: &str) -> Result { - let new_reader = create_reader(uri)?; + let new_reader = reader_from_uri(uri)?; self.reader = new_reader; let old_uri = std::mem::replace(&mut self.reader_uri, uri.to_string()); Ok(old_uri) diff --git a/src/CLAUDE.md b/src/CLAUDE.md index b9b68dc91..8047c38a7 100644 --- a/src/CLAUDE.md +++ b/src/CLAUDE.md @@ -46,11 +46,14 @@ Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding | `duckdb.rs` | DuckDB (in-memory or file) | `duckdb` (default) | | `sqlite.rs` | SQLite | `sqlite` (default) | | `odbc.rs` | ODBC | `odbc` (default) | +| `cache.rs` | `CachingReader` — wraps any primary `Reader` with an in-memory cache | `duckdb` or `sqlite` | | `connection.rs` | Connection-string parsing for all of the above | — | | `data.rs`, `spec.rs` | `Spec` type returned by `execute()`, plus DataFrame conversion | — | `SqlDialect` trait in `mod.rs` lets each driver supply its own type names, information-schema queries, and spatial helper methods (`sql_st_transform`, `sql_geometry_to_wkb`, `sql_geometry_bbox`, `sql_ensure_geometry`, `sql_select_replace`, `sql_spatial_setup`). +**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`. It routes each `execute_sql`: cache-resident names and `ggsql:` builtins go to the cache, base reads go to the primary (and are memoized). The `Reader::materialize_table` seam (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to run the read and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's extra materialization of explicit per-layer sources. `dialect()` returns the **cache** dialect because every executor-generated query targets a cache-resident table. Selected via the composite `+://` connection scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default. + ### `execute/` The pipeline that takes a parsed `Plot` plus a `Reader` and produces a fully-resolved `Spec` (typed data per layer, scales resolved, casts applied). Submodules: diff --git a/src/execute/cte.rs b/src/execute/cte.rs index 9b544988d..e843fa074 100644 --- a/src/execute/cte.rs +++ b/src/execute/cte.rs @@ -146,16 +146,11 @@ pub fn materialize_ctes(ctes: &[CteDefinition], reader: &dyn Reader) -> Result Result Result = specs[0] + let mut layer_source_queries: Vec = specs[0] .layers .iter_mut() .map(|l| layer::layer_source_query(l, &materialized_ctes, has_global_table, dialect)) .collect::>>()?; + // When the reader stages sources into a cache (a caching layer is active), + // materialize explicit external layer sources into the cache and rewrite the + // layer to read the cached table. + if reader.caches_sources() { + let mut materialized_sources: HashMap = HashMap::new(); + for (idx, layer) in specs[0].layers.iter().enumerate() { + let is_external = match &layer.source { + Some(crate::DataSource::Identifier(name)) => !materialized_ctes.contains(name), + Some(crate::DataSource::FilePath(_)) => true, + _ => false, + }; + if !is_external { + continue; + } + let source_query = layer_source_queries[idx].clone(); + let table = match materialized_sources.get(&source_query) { + Some(t) => t.clone(), + None => { + let t = naming::layer_source_table(materialized_sources.len()); + reader.materialize_table(&t, &[], &source_query)?; + materialized_sources.insert(source_query, t.clone()); + t + } + }; + layer_source_queries[idx] = format!("SELECT * FROM {}", naming::quote_ident(&table)); + } + } + // Get types for each layer from source queries (Phase 1: types only, no min/max yet) let mut layer_type_info: Vec> = Vec::new(); for source_query in &layer_source_queries { diff --git a/src/naming.rs b/src/naming.rs index 6259f5bd3..1d1357cea 100644 --- a/src/naming.rs +++ b/src/naming.rs @@ -137,6 +137,48 @@ pub fn cte_table(cte_name: &str) -> String { ) } +/// Generate temp table name for a memoized primary query result in a cache. +/// +/// Format: `__ggsql_cache____` +/// +/// # Example +/// ``` +/// use ggsql::naming; +/// let table = naming::cache_result_table(0); +/// assert!(table.starts_with("__ggsql_cache_")); +/// assert!(table.ends_with("_0__")); +/// ``` +pub fn cache_result_table(id: u64) -> String { + format!( + "{}cache_{}_{}{}", + GGSQL_PREFIX, + session_id(), + id, + GGSQL_SUFFIX + ) +} + +/// Generate temp table name for a materialized explicit layer source. +/// +/// Format: `__ggsql_layer_src____` +/// +/// # Example +/// ``` +/// use ggsql::naming; +/// let table = naming::layer_source_table(2); +/// assert!(table.starts_with("__ggsql_layer_src_2_")); +/// assert!(table.ends_with("__")); +/// ``` +pub fn layer_source_table(index: usize) -> String { + format!( + "{}layer_src_{}_{}{}", + GGSQL_PREFIX, + index, + session_id(), + GGSQL_SUFFIX + ) +} + /// Generate table name for a builtin dataset. /// /// Used when rewriting `ggsql:penguins` to the internal table name. diff --git a/src/reader/adbc.rs b/src/reader/adbc.rs index 8de00b01d..203d31e26 100644 --- a/src/reader/adbc.rs +++ b/src/reader/adbc.rs @@ -687,71 +687,7 @@ mod equivalence_tests { .expect("SqliteReader at the same path") } - /// Compare two DataFrames by schema (field names + types) and by - /// per-column Arrow array contents. We don't use a blanket - /// `assert_eq!(df, df)` because `DataFrame` doesn't implement `PartialEq`; - /// going through schema + per-column equality is also more diagnostic - /// when one of them diverges. - fn assert_dataframes_equal( - adbc_df: &crate::DataFrame, - sqlite_df: &crate::DataFrame, - ctx: &str, - ) { - let adbc_schema = adbc_df.schema(); - let sqlite_schema = sqlite_df.schema(); - assert_eq!( - adbc_schema.fields().len(), - sqlite_schema.fields().len(), - "{}: column count mismatch (adbc={}, sqlite={})", - ctx, - adbc_schema.fields().len(), - sqlite_schema.fields().len(), - ); - for (i, (a, s)) in adbc_schema - .fields() - .iter() - .zip(sqlite_schema.fields().iter()) - .enumerate() - { - assert_eq!( - a.name(), - s.name(), - "{}: column {} name mismatch (adbc='{}', sqlite='{}')", - ctx, - i, - a.name(), - s.name(), - ); - assert_eq!( - a.data_type(), - s.data_type(), - "{}: column '{}' type mismatch (adbc={:?}, sqlite={:?})", - ctx, - a.name(), - a.data_type(), - s.data_type(), - ); - } - assert_eq!( - adbc_df.height(), - sqlite_df.height(), - "{}: row count mismatch (adbc={}, sqlite={})", - ctx, - adbc_df.height(), - sqlite_df.height(), - ); - for field in adbc_schema.fields() { - let a = adbc_df.column(field.name()).unwrap(); - let s = sqlite_df.column(field.name()).unwrap(); - assert_eq!( - a.as_ref(), - s.as_ref(), - "{}: column '{}' data mismatch", - ctx, - field.name(), - ); - } - } + use crate::reader::test_support::assert_dataframes_equal; #[test] #[ignore = "requires `dbc install sqlite`; see module docs"] diff --git a/src/reader/cache.rs b/src/reader/cache.rs new file mode 100644 index 000000000..015022282 --- /dev/null +++ b/src/reader/cache.rs @@ -0,0 +1,504 @@ +//! Caching reader: any primary [`Reader`] + an in-memory writable cache. +//! +//! [`CachingReader`] wraps a `primary` reader and a `cache` backend (an +//! in-memory [`CacheBackend`]). It sits between the primary reader and the +//! rest of ggsql: +//! +//! - [`Reader::register`] writes to the cache and records the name. +//! - [`Reader::materialize_table`] runs the body and `register`s the resulting +//! DataFrame into the cache, so the primary is only ever read from — base +//! reads happen via passthrough SQL, derived tables live in the cache. +//! - [`Reader::execute_sql`] routes each statement: queries referencing a +//! cache-resident name (or a `ggsql:` builtin) run on the cache; base reads +//! run on the primary and are memoized into the cache. +//! - [`Reader::dialect`] returns the **cache** dialect: every dialect-specific +//! query the executor builds targets a cache-resident table. + +use crate::reader::{execute_with_reader, ColumnInfo, Reader, Spec, SqlDialect, TableInfo}; +use crate::{naming, DataFrame, Result}; +use std::cell::{Cell, RefCell}; +use std::collections::{HashMap, HashSet}; + +pub struct CachingReader { + primary: Box, + cache: Box, + /// Names that currently live in the cache (registered tables / memo results). + cached_names: RefCell>, + /// Memo of base primary reads: SQL text -> cache table name holding the result. + result_cache: RefCell>, + /// Monotonic counter for unique memo table names. + next_id: Cell, +} + +impl CachingReader { + /// Construct a `CachingReader` from a primary reader and an in-memory cache + /// backend. The cache is owned by the `CachingReader` and dropped with it. + pub fn new(primary: Box, cache: Box) -> Self { + Self { + primary, + cache, + cached_names: RefCell::new(HashSet::new()), + result_cache: RefCell::new(HashMap::new()), + next_id: Cell::new(0), + } + } + + /// Whether `sql` is one of the cache dialect's spatial setup statements. + /// + /// ggsql-generated spatial setup (`INSTALL`/`LOAD spatial`, …) must run on + /// the cache, where the spatial SQL executes — not on the primary. + fn is_cache_spatial_setup(&self, sql: &str) -> bool { + self.cache + .dialect() + .sql_spatial_setup() + .iter() + .any(|stmt| stmt == sql) + } +} + +impl Reader for CachingReader { + fn execute_sql(&self, sql: &str) -> Result { + // Cache-resident names and `ggsql:` builtins are served by the cache. + if references_cached_name(sql, &self.cached_names.borrow()) || sql.contains("ggsql:") { + return self.cache.execute_sql(sql); + } + + // ggsql-generated spatial setup targets the cache (where spatial SQL runs). + if self.is_cache_spatial_setup(sql) { + return self.cache.execute_sql(sql); + } + + // Base read against the primary, with result memoization. + if let Some(table) = self.result_cache.borrow().get(sql) { + return self + .cache + .execute_sql(&format!("SELECT * FROM {}", naming::quote_ident(table))); + } + + let df = self.primary.execute_sql(sql)?; + + if super::returns_rows(sql) { + let id = self.next_id.get(); + self.next_id.set(id + 1); + let table = naming::cache_result_table(id); + self.cache.register(&table, df.clone(), true)?; + self.cached_names.borrow_mut().insert(table.clone()); + self.result_cache + .borrow_mut() + .insert(sql.to_string(), table); + } + + Ok(df) + } + + fn register(&self, name: &str, df: DataFrame, replace: bool) -> Result<()> { + self.cache.register(name, df, replace)?; + self.cached_names.borrow_mut().insert(name.to_string()); + Ok(()) + } + + fn unregister(&self, name: &str) -> Result<()> { + self.cache.unregister(name)?; + self.cached_names.borrow_mut().remove(name); + Ok(()) + } + + fn execute(&self, query: &str) -> Result { + execute_with_reader(self, query) + } + + fn dialect(&self) -> &dyn SqlDialect { + // All executor-generated SQL targets cache-resident tables. + self.cache.dialect() + } + + fn materialize_table( + &self, + name: &str, + column_aliases: &[String], + body_sql: &str, + ) -> Result<()> { + let body = super::wrap_with_column_aliases(body_sql, column_aliases); + let df = self.execute_sql(&body)?; + self.register(name, df, true) + } + + fn caches_sources(&self) -> bool { + true + } + + // Schema introspection describes the real data source, so delegate to the + // primary; the cache only holds synthetic `__ggsql_*` tables. + fn list_catalogs(&self) -> Result> { + self.primary.list_catalogs() + } + + fn list_schemas(&self, catalog: &str) -> Result> { + self.primary.list_schemas(catalog) + } + + fn list_tables(&self, catalog: &str, schema: &str) -> Result> { + self.primary.list_tables(catalog, schema) + } + + fn list_columns(&self, catalog: &str, schema: &str, table: &str) -> Result> { + self.primary.list_columns(catalog, schema, table) + } +} + +/// Check whether `sql` references any name in `cached_names` as a SQL +/// identifier (not part of a longer identifier, not inside a single-quoted +/// string literal). +/// +/// Matches a bare reference (`FROM orders`), a double-quoted identifier +/// (`FROM "orders"`), or a qualified reference (`FROM catalog.schema.orders`). +/// Does not match substrings of longer identifiers (`orders_detail`) or +/// string-literal contents (`'orders of magnitude'`). This is a lightweight +/// scanner, not a full SQL parser: a cached name appearing only inside a +/// comment is misrouted to the cache and fails with a clear error rather than +/// silently hitting the primary. +fn references_cached_name(sql: &str, cached_names: &HashSet) -> bool { + cached_names + .iter() + .any(|name| sql_references_identifier(sql, name)) +} + +fn sql_references_identifier(sql: &str, name: &str) -> bool { + let bytes = sql.as_bytes(); + let name_bytes = name.as_bytes(); + let n = name_bytes.len(); + if n == 0 { + return false; + } + let mut i = 0; + while i + n <= bytes.len() { + if &bytes[i..i + n] == name_bytes { + let before_ok = i == 0 || !is_identifier_byte(bytes[i - 1]); + let after_ok = i + n == bytes.len() || !is_identifier_byte(bytes[i + n]); + if before_ok && after_ok && !is_inside_string_literal(bytes, i) { + return true; + } + } + i += 1; + } + false +} + +fn is_identifier_byte(b: u8) -> bool { + b.is_ascii_alphanumeric() || b == b'_' +} + +/// Walk from start to `pos` tracking whether we're inside a single-quoted +/// string literal. SQL-standard doubled-single-quote (`''`) is an escape that +/// keeps us inside the literal. +fn is_inside_string_literal(bytes: &[u8], pos: usize) -> bool { + let mut inside = false; + let mut i = 0; + while i < pos && i < bytes.len() { + if bytes[i] == b'\'' { + if inside && i + 1 < bytes.len() && bytes[i + 1] == b'\'' { + i += 2; + continue; + } + inside = !inside; + } + i += 1; + } + inside +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_references_cached_name_empty_set() { + let set = HashSet::new(); + assert!(!references_cached_name("SELECT * FROM foo", &set)); + } + + #[test] + fn test_references_cached_name_match() { + let mut set = HashSet::new(); + set.insert("__ggsql_global_abc123__".to_string()); + assert!(references_cached_name( + "SELECT * FROM __ggsql_global_abc123__ WHERE x > 1", + &set + )); + } + + #[test] + fn test_references_cached_name_rejects_longer_identifier() { + let mut set = HashSet::new(); + set.insert("orders".to_string()); + assert!(!references_cached_name( + "SELECT * FROM orders_detail WHERE x > 1", + &set + )); + } + + #[test] + fn test_references_cached_name_rejects_prefix_of_longer_identifier() { + let mut set = HashSet::new(); + set.insert("col".to_string()); + assert!(!references_cached_name("SELECT col_id FROM users", &set)); + } + + #[test] + fn test_references_cached_name_rejects_inside_string_literal() { + let mut set = HashSet::new(); + set.insert("orders".to_string()); + assert!(!references_cached_name( + "SELECT 'orders of magnitude' AS label", + &set + )); + } + + #[test] + fn test_references_cached_name_matches_quoted_identifier() { + let mut set = HashSet::new(); + set.insert("orders".to_string()); + assert!(references_cached_name(r#"SELECT * FROM "orders""#, &set)); + } + + #[test] + fn test_references_cached_name_matches_qualified_reference() { + let mut set = HashSet::new(); + set.insert("orders".to_string()); + assert!(references_cached_name( + "SELECT * FROM catalog.schema.orders WHERE x > 1", + &set + )); + } + + #[test] + fn test_references_cached_name_handles_escaped_quotes_in_literal() { + let mut set = HashSet::new(); + set.insert("orders".to_string()); + assert!(references_cached_name( + "SELECT 'it''s fine' FROM orders", + &set + )); + } +} + +#[cfg(all(test, feature = "duckdb"))] +mod behavior_tests { + use super::*; + use crate::array_util::as_i64; + use crate::df; + use crate::reader::test_support::{ReadOnlyReader, SpyReader}; + use crate::reader::{CacheBackend, DuckDBReader}; + + #[test] + fn test_register_writes_to_cache_and_query_routes_there() { + let (primary, log) = SpyReader::wrap(Box::new(DuckDBReader::new_in_memory().unwrap())); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache); + + reader + .register("t", df! { "x" => vec![1_i64, 2, 3] }.unwrap(), true) + .unwrap(); + let out = reader.execute_sql("SELECT COUNT(*) AS n FROM t").unwrap(); + + assert_eq!(as_i64(out.column("n").unwrap()).unwrap().value(0), 3); + // The query routed to the cache; the primary was never touched. + assert!(log.lock().unwrap().is_empty()); + } + + #[test] + fn test_base_read_routes_to_primary_and_memoizes() { + let inner = DuckDBReader::new_in_memory().unwrap(); + inner + .register("base", df! { "y" => vec![1_i64, 2, 3] }.unwrap(), true) + .unwrap(); + let (primary, log) = SpyReader::wrap(Box::new(inner)); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache); + + let q = "SELECT y FROM base ORDER BY y"; + let d1 = reader.execute_sql(q).unwrap(); + let d2 = reader.execute_sql(q).unwrap(); + assert_eq!(d1.height(), 3); + assert_eq!(d2.height(), 3); + + // The primary executed the base read exactly once; the repeat was served + // from the cache memo. + let hits = log + .lock() + .unwrap() + .iter() + .filter(|s| s.as_str() == q) + .count(); + assert_eq!(hits, 1); + } + + #[test] + fn test_full_execute_keeps_computation_off_primary() { + let inner = DuckDBReader::new_in_memory().unwrap(); + inner + .register( + "sales", + df! { "x" => vec![1_i64, 2, 3, 4], "y" => vec![10_i64, 20, 30, 40] }.unwrap(), + true, + ) + .unwrap(); + let (primary, log) = SpyReader::wrap(Box::new(inner)); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache); + + reader + .execute("SELECT x, y FROM sales VISUALISE x, y DRAW point") + .unwrap(); + + let log = log.lock().unwrap(); + // The primary is only ever read from: no temp-table DDL, no derived + // `__ggsql_*` tables ever reach it. + assert!( + log.iter().all(|s| !s.to_uppercase().contains("TEMP TABLE")), + "primary must not be written to: {:?}", + *log + ); + assert!( + log.iter().all(|s| !s.contains("__ggsql_")), + "primary must not see derived tables: {:?}", + *log + ); + // It did see the base read. + assert!(log.iter().any(|s| s.contains("sales"))); + } + + #[test] + fn test_caching_makes_read_only_primary_usable() { + // The read-only-primary value proposition, exercised in every DuckDB + // build (including duckdb-only CI): a primary that refuses all writes is + // unusable on its own — materializing the global temp table fails — but + // works once wrapped in a caching layer, because every write goes to the + // cache and the primary is only read. + let query = "SELECT v FROM t VISUALISE v AS x DRAW histogram"; + + // Bare read-only primary: materialization must fail. + let bare_primary = DuckDBReader::new_in_memory().unwrap(); + bare_primary + .register( + "t", + df! { "v" => vec![1.0_f64, 2.0, 3.0, 4.0] }.unwrap(), + true, + ) + .unwrap(); + let bare = ReadOnlyReader::new(Box::new(bare_primary)); + assert!( + bare.execute(query).is_err(), + "a read-only primary with no cache must fail to materialize" + ); + + // Same read-only primary behind a cache: must succeed. + let primary = DuckDBReader::new_in_memory().unwrap(); + primary + .register( + "t", + df! { "v" => vec![1.0_f64, 2.0, 3.0, 4.0] }.unwrap(), + true, + ) + .unwrap(); + let cached = CachingReader::new( + Box::new(ReadOnlyReader::new(Box::new(primary))), + Box::new(DuckDBReader::new_in_memory().unwrap()), + ); + assert!( + cached.execute(query).is_ok(), + "caching should make a read-only primary usable" + ); + } + + #[test] + fn test_no_cache_path_materializes_on_the_reader() { + // A plain reader (no CachingReader) must keep today's behavior: + // derived tables are materialized on the reader itself. + let inner = DuckDBReader::new_in_memory().unwrap(); + inner + .register( + "sales", + df! { "x" => vec![1_i64, 2, 3], "y" => vec![10_i64, 20, 30] }.unwrap(), + true, + ) + .unwrap(); + let (reader, log) = SpyReader::wrap(Box::new(inner)); + + reader + .execute("SELECT x, y FROM sales VISUALISE x, y DRAW point") + .unwrap(); + + assert!( + log.lock() + .unwrap() + .iter() + .any(|s| s.to_uppercase().contains("TEMP TABLE")), + "default path must materialize on the reader" + ); + } + + #[cfg(feature = "spatial")] + #[test] + fn test_spatial_setup_routes_to_cache() { + let (primary, log) = SpyReader::wrap(Box::new(DuckDBReader::new_in_memory().unwrap())); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache); + + // ggsql-generated spatial setup must run on the cache (where the spatial + // SQL executes), never on the primary. + let setup = reader.dialect().sql_spatial_setup(); + assert!(!setup.is_empty(), "DuckDB cache should emit spatial setup"); + for stmt in &setup { + // Ignore execution result (may require the extension); we only assert routing. + let _ = reader.execute_sql(stmt); + } + let log = log.lock().unwrap(); + for stmt in &setup { + assert!( + !log.contains(stmt), + "spatial setup must route to the cache, not the primary: {}", + stmt + ); + } + } + + #[cfg(feature = "sqlite")] + #[test] + fn test_dialect_returns_cache_dialect() { + use crate::reader::SqliteReader; + // SQLite primary, DuckDB cache: dialect() must return DuckDB's (native + // GREATEST), not SQLite's (CASE fallback). + let primary = Box::new(SqliteReader::new().unwrap()); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache); + assert_eq!(reader.dialect().sql_greatest(&["a", "b"]), "GREATEST(a, b)"); + } + + #[cfg(feature = "sqlite")] + #[test] + fn test_explicit_layer_source_with_stat_heterogeneous() { + use crate::reader::SqliteReader; + // SQLite primary holds the table; DuckDB is the cache. A layer draws a + // histogram from the primary table, which generates DuckDB-dialect stat + // SQL. This only works because the layer source is materialized into the + // cache; otherwise DuckDB SQL would run against the SQLite primary. + let primary = SqliteReader::new().unwrap(); + primary + .register( + "tbl", + df! { "val" => vec![1.0_f64, 2.0, 2.0, 3.0, 3.0, 3.0, 9.0] }.unwrap(), + true, + ) + .unwrap(); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(Box::new(primary), cache); + + let spec = reader.execute("VISUALISE x DRAW histogram MAPPING val AS x FROM tbl"); + assert!( + spec.is_ok(), + "explicit-source histogram failed: {:?}", + spec.err() + ); + } +} diff --git a/src/reader/cache_equivalence.rs b/src/reader/cache_equivalence.rs new file mode 100644 index 000000000..6b8f808d2 --- /dev/null +++ b/src/reader/cache_equivalence.rs @@ -0,0 +1,334 @@ +//! Equivalence and read-only-safety tests for the caching layer. +//! +//! These exercise the whole mechanism (materialization, routing, the result +//! cache, builtin routing, layer-source staging, dialect selection). + +use super::CachingReader; +use crate::reader::test_support::{ReadOnlyReader, SpyReader}; +use crate::reader::{CacheBackend, DuckDBReader, Reader, SqliteReader}; +use crate::DataFrame; + +/// One corpus entry. `builtin_only` queries read only `ggsql:` datasets (which +/// route to the cache), so they can be compared exactly to plain DuckDB. +struct Case { + query: &'static str, + builtin_only: bool, +} + +/// A stat-heavy corpus — these generate the most cache-dialect SQL +/// (`sql_percentile`, `sql_greatest`/`sql_least`, `sql_generate_series`, +/// casts), where caching is most likely to diverge. +const CORPUS: &[Case] = &[ + // boxplot: quantiles / IQR + Case { + query: "VISUALISE species AS x, bill_len AS y FROM ggsql:penguins DRAW boxplot", + builtin_only: true, + }, + // histogram: binning + casts (global SELECT over a builtin) + Case { + query: "SELECT Temp FROM ggsql:airquality VISUALISE Temp AS x DRAW histogram", + builtin_only: true, + }, + // density: percentile + generate_series + stddev + Case { + query: "VISUALISE bill_len AS x, species AS colour FROM ggsql:penguins DRAW density", + builtin_only: true, + }, + // grouped aggregation + facet + discrete scale + Case { + query: "SELECT species, bill_len, island FROM ggsql:penguins \ + VISUALISE species AS x, bill_len AS y \ + DRAW bar SETTING aggregate => 'mean' FACET island", + builtin_only: true, + }, + // WITH CTE + multi-layer + FILTER (CTE materialization + global + per-layer routing) + Case { + query: "WITH hot AS (SELECT Date, Temp FROM ggsql:airquality WHERE Temp > 70) \ + SELECT Date, Temp FROM hot \ + VISUALISE Date AS x, Temp AS y \ + DRAW line DRAW point FILTER Temp > 80 SCALE x VIA date", + builtin_only: true, + }, + // explicit per-layer source from a seeded table → forces layer-source staging + // (`caches_sources`) and the read-from-primary + stat-on-cache path. + Case { + query: "VISUALISE species AS x, bill_len AS y \ + DRAW boxplot MAPPING species AS x, bill_len AS y FROM cache_eq_tbl", + builtin_only: false, + }, +]; + +/// Seed the table referenced by the non-builtin corpus entry. +fn seed(reader: &dyn Reader) { + let df = crate::df! { + "species" => vec!["A", "A", "B", "B", "B", "C"], + "bill_len" => vec![1.0_f64, 2.0, 3.0, 4.0, 5.0, 6.0], + } + .unwrap(); + reader.register("cache_eq_tbl", df, true).unwrap(); +} + +/// Stringify each row using the given (canonical, sorted) column order, so two +/// DataFrames can be compared as row multisets — ignoring both physical row order +/// (a query without `ORDER BY` may return rows in a different order on each +/// materialization path) and column order (aesthetic columns are emitted in +/// HashMap order; they bind to encoding channels by name, not position). +fn row_multiset(df: &DataFrame, names: &[String]) -> Vec { + (0..df.height()) + .map(|i| { + names + .iter() + .map(|n| crate::array_util::value_to_string(df.column(n).unwrap(), i)) + .collect::>() + .join("\u{1f}") + }) + .collect() +} + +/// Assert two layer DataFrames have the same columns (by name + type) and the +/// same set of rows — insensitive to row and column ordering. +fn assert_data_equivalent(a: &DataFrame, b: &DataFrame, ctx: &str) { + let mut na: Vec = a + .schema() + .fields() + .iter() + .map(|f| f.name().to_string()) + .collect(); + let mut nb: Vec = b + .schema() + .fields() + .iter() + .map(|f| f.name().to_string()) + .collect(); + na.sort(); + nb.sort(); + assert_eq!(na, nb, "{ctx}: column-name set"); + for name in &na { + assert_eq!( + a.column(name).unwrap().data_type(), + b.column(name).unwrap().data_type(), + "{ctx}: column '{name}' type", + ); + } + assert_eq!(a.height(), b.height(), "{ctx}: row count"); + let (mut ra, mut rb) = (row_multiset(a, &na), row_multiset(b, &na)); + ra.sort(); + rb.sort(); + assert_eq!(ra, rb, "{ctx}: row multiset mismatch"); +} + +/// Assert that a query produces equivalent output through `plain` and `cached`. +/// A query a backend can't run must fail the same way with or without the cache. +fn assert_equivalent(plain: &dyn Reader, cached: &dyn Reader, query: &str) { + let a = plain.execute(query); + let b = cached.execute(query); + assert_eq!( + a.is_ok(), + b.is_ok(), + "ok-mismatch for `{query}`: plain={:?} cached={:?}", + a.as_ref().err(), + b.as_ref().err(), + ); + let (Ok(sa), Ok(sb)) = (a, b) else { return }; + + assert_eq!( + sa.layer_count(), + sb.layer_count(), + "layer count for `{query}`" + ); + for i in 0..sa.layer_count() { + match (sa.layer_data(i), sb.layer_data(i)) { + (Some(da), Some(db)) => assert_data_equivalent(da, db, &format!("`{query}` layer {i}")), + (None, None) => {} + _ => panic!("layer {i} data-presence mismatch for `{query}`"), + } + } +} + +/// The builtin corpus through `{ ReadOnlyReader(SQLite), DuckDB }` matches plain +/// DuckDB exactly. The DuckDB cache does all reading and computing for `ggsql:` +/// sources; the SQLite primary stays idle and unwritten. +#[test] +fn mode1_builtin_equivalence_matches_plain_duckdb() { + if !cfg!(feature = "builtin-data") { + return; // builtin corpus needs the embedded datasets + } + for case in CORPUS.iter().filter(|c| c.builtin_only) { + let plain = DuckDBReader::new_in_memory().unwrap(); + let primary = ReadOnlyReader::new(Box::new(SqliteReader::new().unwrap())); + let cache = DuckDBReader::new_in_memory().unwrap(); + let cached = CachingReader::new(Box::new(primary), Box::new(cache)); + assert_equivalent(&plain, &cached, case.query); + } +} + +/// Read-only safety: the full corpus succeeds through the caching reader, +/// proving a read-only/remote primary is sufficient. +#[test] +fn mode1_read_only_primary_is_sufficient() { + for case in CORPUS { + if case.builtin_only && !cfg!(feature = "builtin-data") { + continue; + } + let sqlite = SqliteReader::new().unwrap(); + seed(&sqlite); + let primary = ReadOnlyReader::new(Box::new(sqlite)); + let cache = DuckDBReader::new_in_memory().unwrap(); + let cached = CachingReader::new(Box::new(primary), Box::new(cache)); + let r = cached.execute(case.query); + assert!( + r.is_ok(), + "read-only primary should suffice with caching for `{}`: {:?}", + case.query, + r.err(), + ); + } +} + +/// Cross-call memoization: a second identical execute does not re-read the +/// primary, because the base read is served from the cache memo. +#[test] +fn cross_call_memoization_avoids_second_primary_read() { + let sqlite = SqliteReader::new().unwrap(); + seed(&sqlite); + let (primary, log) = SpyReader::wrap(Box::new(sqlite)); + let cache = DuckDBReader::new_in_memory().unwrap(); + let cached = CachingReader::new(primary, Box::new(cache)); + + let query = "SELECT bill_len FROM cache_eq_tbl VISUALISE bill_len AS x DRAW histogram"; + cached.execute(query).unwrap(); + let after_first = log.lock().unwrap().len(); + cached.execute(query).unwrap(); + let after_second = log.lock().unwrap().len(); + + assert!(after_first >= 1, "first execute should read the primary"); + assert_eq!( + after_first, + after_second, + "second execute must not re-hit the primary; log: {:?}", + *log.lock().unwrap(), + ); +} + +/// The library factory builds working caching readers from composite URIs. +#[test] +fn factory_builds_caching_readers() { + use crate::reader::connection::reader_from_uri; + for uri in ["duckdb+sqlite://memory", "sqlite+duckdb://memory"] { + let r = reader_from_uri(uri).unwrap_or_else(|e| panic!("build `{uri}`: {e}")); + if cfg!(feature = "builtin-data") { + let spec = + r.execute("VISUALISE species AS x, bill_len AS y FROM ggsql:penguins DRAW boxplot"); + assert!( + spec.is_ok(), + "factory reader failed for `{uri}`: {:?}", + spec.err() + ); + } + } +} + +/// A real external ADBC SQLite primary + DuckDB cache, compared against a bare ADBC reader. +/// `#[ignore]` — requires `dbc install sqlite`. +#[cfg(feature = "adbc")] +mod adbc_mode { + use super::*; + use crate::reader::sqlite::SqliteDialect; + use crate::reader::test_support::assert_dataframes_equal; + use crate::reader::{AdbcReader, Spec, SqlDialect}; + use crate::{DataFrame, Result}; + use adbc_core::options::{AdbcVersion, OptionDatabase, OptionValue}; + use adbc_core::LOAD_FLAG_DEFAULT; + use adbc_driver_manager::ManagedDriver; + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::Arc; + use tempfile::NamedTempFile; + + fn make_adbc_reader(db_path: &str) -> AdbcReader { + let driver = ManagedDriver::load_from_name( + "sqlite", + None, + AdbcVersion::V110, + LOAD_FLAG_DEFAULT, + None, + ) + .expect("`dbc install sqlite` first; see adbc.rs::equivalence_tests docs"); + let dialect: Box = Box::new(SqliteDialect); + AdbcReader::new_with_database_opts( + driver, + dialect, + std::iter::once(( + OptionDatabase::Uri, + OptionValue::String(format!("file:{}", db_path)), + )), + ) + .expect("construct AdbcReader") + } + + fn seed_adbc(path: &str) { + let bare = make_adbc_reader(path); + let df = crate::df! { + "x" => vec![1_i64, 2, 3, 4, 5], + "y" => vec![10_i64, 20, 30, 40, 50], + } + .unwrap(); + bare.register("t", df, false).unwrap(); + } + + /// Counts `execute_sql` calls reaching the ADBC primary. + struct CountingAdbcReader { + inner: AdbcReader, + calls: Arc, + } + + impl Reader for CountingAdbcReader { + fn execute_sql(&self, sql: &str) -> Result { + self.calls.fetch_add(1, Ordering::SeqCst); + self.inner.execute_sql(sql) + } + fn register(&self, name: &str, df: DataFrame, replace: bool) -> Result<()> { + self.inner.register(name, df, replace) + } + fn unregister(&self, name: &str) -> Result<()> { + self.inner.unregister(name) + } + fn execute(&self, query: &str) -> Result { + crate::reader::execute_with_reader(self, query) + } + fn dialect(&self) -> &dyn SqlDialect { + self.inner.dialect() + } + } + + #[test] + #[ignore = "requires `dbc install sqlite`"] + fn mode2_adbc_primary_duckdb_cache_equiv_and_memo() { + let db = NamedTempFile::new().unwrap(); + let path = db.path().to_str().unwrap(); + seed_adbc(path); + + let sql = "SELECT x, y, x*y AS xy FROM t WHERE y > 15 ORDER BY x"; + let golden = make_adbc_reader(path).execute_sql(sql).unwrap(); + + let calls = Arc::new(AtomicUsize::new(0)); + let primary = CountingAdbcReader { + inner: make_adbc_reader(path), + calls: calls.clone(), + }; + let cache = DuckDBReader::new_in_memory().unwrap(); + let cached = CachingReader::new(Box::new(primary), Box::new(cache)); + + let miss = cached.execute_sql(sql).unwrap(); + assert_dataframes_equal(&golden, &miss, "adbc cache miss"); + let after_miss = calls.load(Ordering::SeqCst); + assert!(after_miss >= 1, "miss should reach the ADBC primary"); + + let hit = cached.execute_sql(sql).unwrap(); + assert_dataframes_equal(&golden, &hit, "adbc cache hit"); + let after_hit = calls.load(Ordering::SeqCst); + assert_eq!( + after_miss, after_hit, + "cache hit must not round-trip to the ADBC primary" + ); + } +} diff --git a/src/reader/connection.rs b/src/reader/connection.rs index 8f72c7c57..eebeffdf1 100644 --- a/src/reader/connection.rs +++ b/src/reader/connection.rs @@ -1,78 +1,125 @@ -//! Connection string parsing for data sources +//! Connection string handling for data sources. //! -//! Parses URI-style connection strings to determine database type and connection parameters. +//! Maps URI-style connection strings (`duckdb://…`, `sqlite://…`, `odbc://…`) and +//! the composite caching form (`+://…`) to readers. +use crate::reader::Reader; use crate::{GgsqlError, Result}; -/// Parsed connection information -#[derive(Debug, Clone, PartialEq)] -pub enum ConnectionInfo { - /// DuckDB in-memory database - DuckDBMemory, - /// DuckDB file-based database - DuckDBFile(String), - /// PostgreSQL connection - #[allow(dead_code)] - PostgreSQL(String), - /// SQLite file-based database - #[allow(dead_code)] - SQLite(String), - /// Generic ODBC connection (raw connection string after `odbc://` prefix) - #[allow(dead_code)] - ODBC(String), -} - -/// Parse a connection string into connection information +/// Split a composite cache URI `+://` into the primary +/// connection URI and the cache backend scheme. /// -/// # Supported Formats +/// Returns `None` when there is no `+` before `://` (a plain URI). /// -/// - `duckdb://memory` - DuckDB in-memory database -/// - `duckdb://...` - DuckDB path -/// - `postgres://...` - PostgreSQL connection string -/// - `sqlite://...` - SQLite file path +/// # Example +/// ``` +/// use ggsql::reader::connection::split_cache_uri; +/// assert_eq!( +/// split_cache_uri("odbc+duckdb://DSN=foo"), +/// Some(("odbc://DSN=foo".to_string(), "duckdb".to_string())) +/// ); +/// assert_eq!(split_cache_uri("duckdb://memory"), None); /// ``` -pub fn parse_connection_string(uri: &str) -> Result { - if uri == "duckdb://memory" { - return Ok(ConnectionInfo::DuckDBMemory); +pub fn split_cache_uri(uri: &str) -> Option<(String, String)> { + let (scheme, rest) = uri.split_once("://")?; + let (primary, cache) = scheme.split_once('+')?; + if primary.is_empty() || cache.is_empty() || cache.contains('+') { + return None; } + Some((format!("{}://{}", primary, rest), cache.to_string())) +} - if let Some(path) = uri.strip_prefix("duckdb://") { - if path.is_empty() { +/// Map a cache-backend scheme to its in-memory connection URI. +#[cfg(any(feature = "duckdb", feature = "sqlite"))] +fn cache_uri(scheme: &str) -> Result<&'static str> { + match scheme { + "duckdb" => Ok("duckdb://memory"), + "sqlite" => Ok("sqlite://memory"), + _ => Err(GgsqlError::ReaderError(format!( + "Unsupported cache backend '{}'. Supported: duckdb, sqlite", + scheme + ))), + } +} + +/// Build a reader from a non-composite connection URI +pub fn build_reader(uri: &str) -> Result> { + if uri.starts_with("duckdb://") { + #[cfg(feature = "duckdb")] + { + return Ok(Box::new( + crate::reader::DuckDBReader::from_connection_string(uri)?, + )); + } + #[cfg(not(feature = "duckdb"))] + { return Err(GgsqlError::ReaderError( - "DuckDB file path cannot be empty".to_string(), + "DuckDB reader not compiled in. Rebuild with --features duckdb".to_string(), )); } - return Ok(ConnectionInfo::DuckDBFile(path.to_string())); } - - if uri.starts_with("postgres://") || uri.starts_with("postgresql://") { - return Ok(ConnectionInfo::PostgreSQL(uri.to_string())); - } - - if let Some(path) = uri.strip_prefix("sqlite://") { - if path.is_empty() { + if uri.starts_with("sqlite://") { + #[cfg(feature = "sqlite")] + { + return Ok(Box::new( + crate::reader::SqliteReader::from_connection_string(uri)?, + )); + } + #[cfg(not(feature = "sqlite"))] + { return Err(GgsqlError::ReaderError( - "SQLite file path cannot be empty".to_string(), + "SQLite reader not compiled in. Rebuild with --features sqlite".to_string(), )); } - return Ok(ConnectionInfo::SQLite(path.to_string())); } - - if let Some(conn_str) = uri.strip_prefix("odbc://") { - if conn_str.is_empty() { + if uri.starts_with("odbc://") { + #[cfg(feature = "odbc")] + { + return Ok(Box::new(crate::reader::OdbcReader::from_connection_string( + uri, + )?)); + } + #[cfg(not(feature = "odbc"))] + { return Err(GgsqlError::ReaderError( - "ODBC connection string cannot be empty".to_string(), + "ODBC reader not compiled in. Rebuild with --features odbc".to_string(), )); } - return Ok(ConnectionInfo::ODBC(conn_str.to_string())); } - + if uri.starts_with("postgres://") || uri.starts_with("postgresql://") { + return Err(GgsqlError::ReaderError( + "PostgreSQL reader is not yet implemented".to_string(), + )); + } Err(GgsqlError::ReaderError(format!( - "Unsupported connection string format: {}. Supported: duckdb://, postgres://, sqlite://, odbc://", + "Unsupported connection string: {}. Supported: duckdb://, sqlite://, odbc://", uri ))) } +/// Construct a reader from a connection URI, wrapping it in a [`CachingReader`] +/// when the URI uses the composite `+://` form. +/// +/// [`CachingReader`]: crate::reader::CachingReader +pub fn reader_from_uri(uri: &str) -> Result> { + if let Some((primary_uri, cache_scheme)) = split_cache_uri(uri) { + #[cfg(any(feature = "duckdb", feature = "sqlite"))] + { + let primary = build_reader(&primary_uri)?; + let cache = build_reader(cache_uri(&cache_scheme)?)?; + return Ok(Box::new(crate::reader::CachingReader::new(primary, cache))); + } + #[cfg(not(any(feature = "duckdb", feature = "sqlite")))] + { + let _ = (&primary_uri, &cache_scheme); + return Err(GgsqlError::ReaderError( + "Caching layer requires the duckdb or sqlite feature".to_string(), + )); + } + } + build_reader(uri) +} + /// Extract a value from an ODBC connection string by key, stripping braces. pub fn extract_odbc_value(conn_str: &str, key: &str) -> Option { let lower = conn_str.to_lowercase(); @@ -93,91 +140,77 @@ mod tests { use super::*; #[test] - fn test_duckdb_memory() { - let info = parse_connection_string("duckdb://memory").unwrap(); - assert_eq!(info, ConnectionInfo::DuckDBMemory); + fn test_build_reader_unsupported_scheme() { + let err = build_reader("mysql://localhost/db") + .err() + .unwrap() + .to_string(); + assert!(err.contains("Unsupported connection string"), "got: {err}"); } #[test] - fn test_duckdb_file_relative() { - let info = parse_connection_string("duckdb://data.db").unwrap(); - assert_eq!(info, ConnectionInfo::DuckDBFile("data.db".to_string())); + fn test_build_reader_postgres_not_implemented() { + let err = build_reader("postgres://user@localhost/db") + .err() + .unwrap() + .to_string(); + assert!(err.contains("not yet implemented"), "got: {err}"); } + #[cfg(feature = "duckdb")] #[test] - fn test_duckdb_file_absolute() { - let info = parse_connection_string("duckdb:///tmp/data.db").unwrap(); - assert_eq!(info, ConnectionInfo::DuckDBFile("/tmp/data.db".to_string())); + fn test_build_reader_duckdb_memory_and_empty() { + assert!(build_reader("duckdb://memory").is_ok()); + assert!(build_reader("duckdb://").is_err()); } + #[cfg(feature = "sqlite")] #[test] - fn test_duckdb_file_nested() { - let info = parse_connection_string("duckdb://path/to/data.db").unwrap(); - assert_eq!( - info, - ConnectionInfo::DuckDBFile("path/to/data.db".to_string()) - ); + fn test_build_reader_sqlite_memory() { + assert!(build_reader("sqlite://memory").is_ok()); + assert!(build_reader("sqlite://:memory:").is_ok()); } + #[cfg(all(feature = "duckdb", feature = "sqlite"))] #[test] - fn test_postgres() { - let uri = "postgres://user:pass@localhost/db"; - let info = parse_connection_string(uri).unwrap(); - assert_eq!(info, ConnectionInfo::PostgreSQL(uri.to_string())); + fn test_reader_from_uri_composite_builds() { + assert!(reader_from_uri("duckdb+sqlite://memory").is_ok()); + assert!(reader_from_uri("sqlite+duckdb://memory").is_ok()); } #[test] - fn test_postgresql_alias() { - let uri = "postgresql://user:pass@localhost/db"; - let info = parse_connection_string(uri).unwrap(); - assert_eq!(info, ConnectionInfo::PostgreSQL(uri.to_string())); - } - - #[test] - fn test_sqlite() { - let info = parse_connection_string("sqlite://data.db").unwrap(); - assert_eq!(info, ConnectionInfo::SQLite("data.db".to_string())); - } - - #[test] - fn test_sqlite_absolute() { - let info = parse_connection_string("sqlite:///tmp/data.db").unwrap(); - assert_eq!(info, ConnectionInfo::SQLite("/tmp/data.db".to_string())); + fn test_split_cache_uri_odbc_duckdb() { + assert_eq!( + split_cache_uri("odbc+duckdb://Driver=Snowflake;Server=x"), + Some(( + "odbc://Driver=Snowflake;Server=x".to_string(), + "duckdb".to_string() + )) + ); } #[test] - fn test_empty_duckdb_path() { - let result = parse_connection_string("duckdb://"); - assert!(result.is_err()); + fn test_split_cache_uri_duckdb_sqlite_memory() { + assert_eq!( + split_cache_uri("duckdb+sqlite://memory"), + Some(("duckdb://memory".to_string(), "sqlite".to_string())) + ); } #[test] - fn test_odbc() { - let info = parse_connection_string( - "odbc://Driver=Snowflake;Server=myaccount.snowflakecomputing.com", - ) - .unwrap(); - assert_eq!( - info, - ConnectionInfo::ODBC( - "Driver=Snowflake;Server=myaccount.snowflakecomputing.com".to_string() - ) - ); + fn test_split_cache_uri_plain_is_none() { + assert_eq!(split_cache_uri("duckdb://memory"), None); + assert_eq!(split_cache_uri("odbc://DSN=x"), None); } #[test] - fn test_odbc_empty() { - let result = parse_connection_string("odbc://"); - assert!(result.is_err()); + fn test_split_cache_uri_rejects_multiple_plus() { + assert_eq!(split_cache_uri("a+b+c://x"), None); } #[test] - fn test_unsupported_scheme() { - let result = parse_connection_string("mysql://localhost/db"); - assert!(result.is_err()); - assert!(result - .unwrap_err() - .to_string() - .contains("Unsupported connection string")); + fn test_split_cache_uri_rejects_empty_parts() { + assert_eq!(split_cache_uri("+duckdb://x"), None); + assert_eq!(split_cache_uri("odbc+://x"), None); } } diff --git a/src/reader/duckdb.rs b/src/reader/duckdb.rs index bf459c155..5badc5324 100644 --- a/src/reader/duckdb.rs +++ b/src/reader/duckdb.rs @@ -2,7 +2,7 @@ //! //! Provides a reader for DuckDB databases with Arrow DataFrame integration. -use crate::reader::{connection::ConnectionInfo, Reader}; +use crate::reader::{CacheBackend, Reader}; use crate::{naming, DataFrame, GgsqlError, Result}; use arrow::compute::{cast, concat_batches}; use arrow::datatypes::{DataType, Field, Schema}; @@ -243,21 +243,25 @@ impl DuckDBReader { /// - The database file cannot be opened /// - DuckDB initialization fails pub fn from_connection_string(uri: &str) -> Result { - let conn_info = super::connection::parse_connection_string(uri)?; + let path = uri.strip_prefix("duckdb://").ok_or_else(|| { + GgsqlError::ReaderError(format!( + "DuckDB URI must start with duckdb://, got '{}'", + uri + )) + })?; - let conn = match conn_info { - ConnectionInfo::DuckDBMemory => Connection::open_in_memory().map_err(|e| { + let conn = if path == "memory" { + Connection::open_in_memory().map_err(|e| { GgsqlError::ReaderError(format!("Failed to open in-memory DuckDB: {}", e)) - })?, - ConnectionInfo::DuckDBFile(path) => Connection::open(&path).map_err(|e| { + })? + } else if path.is_empty() { + return Err(GgsqlError::ReaderError( + "DuckDB file path cannot be empty".to_string(), + )); + } else { + Connection::open(path).map_err(|e| { GgsqlError::ReaderError(format!("Failed to open DuckDB file '{}': {}", path, e)) - })?, - _ => { - return Err(GgsqlError::ReaderError(format!( - "Connection string '{}' is not supported by DuckDBReader", - uri - ))) - } + })? }; // https://github.com/duckdb/duckdb/issues/22133 @@ -349,6 +353,12 @@ fn normalize_arrow_types(batch: RecordBatch) -> Result { .map_err(|e| GgsqlError::ReaderError(format!("Failed to normalize types: {}", e))) } +impl CacheBackend for DuckDBReader { + fn new_in_memory() -> Result { + Self::from_connection_string("duckdb://memory") + } +} + impl Reader for DuckDBReader { fn execute_sql(&self, sql: &str) -> Result { // Register builtin datasets if referenced diff --git a/src/reader/mod.rs b/src/reader/mod.rs index a6609d0c8..db9ad7791 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -429,6 +429,12 @@ pub mod odbc; #[cfg(feature = "adbc")] pub mod adbc; +#[cfg(any(feature = "duckdb", feature = "sqlite"))] +pub mod cache; + +#[cfg(all(test, feature = "duckdb", feature = "sqlite"))] +mod cache_equivalence; + pub mod connection; pub mod data; mod spec; @@ -445,6 +451,9 @@ pub use odbc::OdbcReader; #[cfg(feature = "adbc")] pub use adbc::AdbcReader; +#[cfg(any(feature = "duckdb", feature = "sqlite"))] +pub use cache::CachingReader; + // ============================================================================ // Shared utilities // ============================================================================ @@ -489,6 +498,177 @@ pub(crate) fn returns_rows(sql: &str) -> bool { ) } +/// Shared test helpers for reader equivalence suites. +#[cfg(test)] +pub(crate) mod test_support { + use super::{ + execute_with_reader, returns_rows, ColumnInfo, Reader, Spec, SqlDialect, TableInfo, + }; + use crate::{DataFrame, GgsqlError, Result}; + use std::sync::{Arc, Mutex}; + + /// A `Reader` that records every `execute_sql` it receives, delegating + /// everything to an inner reader. + pub(crate) struct SpyReader { + inner: Box, + log: Arc>>, + } + + impl SpyReader { + /// Wrap `inner`, returning the boxed spy and a handle to its call log. + pub(crate) fn wrap( + inner: Box, + ) -> (Box, Arc>>) { + let log = Arc::new(Mutex::new(Vec::new())); + ( + Box::new(SpyReader { + inner, + log: log.clone(), + }), + log, + ) + } + } + + impl Reader for SpyReader { + fn execute_sql(&self, sql: &str) -> Result { + self.log.lock().unwrap().push(sql.to_string()); + self.inner.execute_sql(sql) + } + fn register(&self, name: &str, df: DataFrame, replace: bool) -> Result<()> { + self.inner.register(name, df, replace) + } + fn unregister(&self, name: &str) -> Result<()> { + self.inner.unregister(name) + } + fn execute(&self, query: &str) -> Result { + execute_with_reader(self, query) + } + fn dialect(&self) -> &dyn SqlDialect { + self.inner.dialect() + } + fn list_catalogs(&self) -> Result> { + self.inner.list_catalogs() + } + fn list_schemas(&self, c: &str) -> Result> { + self.inner.list_schemas(c) + } + fn list_tables(&self, c: &str, s: &str) -> Result> { + self.inner.list_tables(c, s) + } + fn list_columns(&self, c: &str, s: &str, t: &str) -> Result> { + self.inner.list_columns(c, s, t) + } + } + + /// A `Reader` that wraps an inner reader and **refuses every write**: any + /// `register`/`unregister` and any non-row-returning `execute_sql` + /// (CREATE/INSERT/DROP/…) returns an error. + pub(crate) struct ReadOnlyReader { + inner: Box, + } + + impl ReadOnlyReader { + pub(crate) fn new(inner: Box) -> Self { + Self { inner } + } + + fn refuse(op: &str) -> GgsqlError { + GgsqlError::ReaderError(format!("read-only primary: refused {op}")) + } + } + + impl Reader for ReadOnlyReader { + fn execute_sql(&self, sql: &str) -> Result { + if !returns_rows(sql) { + return Err(Self::refuse(&format!("write statement: {sql}"))); + } + self.inner.execute_sql(sql) + } + fn register(&self, _name: &str, _df: DataFrame, _replace: bool) -> Result<()> { + Err(Self::refuse("register")) + } + fn unregister(&self, _name: &str) -> Result<()> { + Err(Self::refuse("unregister")) + } + fn execute(&self, query: &str) -> Result { + execute_with_reader(self, query) + } + fn dialect(&self) -> &dyn SqlDialect { + self.inner.dialect() + } + fn list_catalogs(&self) -> Result> { + self.inner.list_catalogs() + } + fn list_schemas(&self, c: &str) -> Result> { + self.inner.list_schemas(c) + } + fn list_tables(&self, c: &str, s: &str) -> Result> { + self.inner.list_tables(c, s) + } + fn list_columns(&self, c: &str, s: &str, t: &str) -> Result> { + self.inner.list_columns(c, s, t) + } + } + + /// Compare two DataFrames by schema (field names + types) and by + /// per-column Arrow array contents. We don't use a blanket + /// `assert_eq!(df, df)` because `DataFrame` doesn't implement `PartialEq`; + /// going through schema + per-column equality is also more diagnostic + /// when one of them diverges. + #[cfg(feature = "adbc")] + pub(crate) fn assert_dataframes_equal(a: &DataFrame, b: &DataFrame, ctx: &str) { + let a_schema = a.schema(); + let b_schema = b.schema(); + assert_eq!( + a_schema.fields().len(), + b_schema.fields().len(), + "{ctx}: column count mismatch (a={}, b={})", + a_schema.fields().len(), + b_schema.fields().len(), + ); + for (i, (af, bf)) in a_schema + .fields() + .iter() + .zip(b_schema.fields().iter()) + .enumerate() + { + assert_eq!( + af.name(), + bf.name(), + "{ctx}: column {i} name mismatch (a='{}', b='{}')", + af.name(), + bf.name(), + ); + assert_eq!( + af.data_type(), + bf.data_type(), + "{ctx}: column '{}' type mismatch (a={:?}, b={:?})", + af.name(), + af.data_type(), + bf.data_type(), + ); + } + assert_eq!( + a.height(), + b.height(), + "{ctx}: row count mismatch (a={}, b={})", + a.height(), + b.height(), + ); + for field in a_schema.fields() { + let ac = a.column(field.name()).unwrap(); + let bc = b.column(field.name()).unwrap(); + assert_eq!( + ac.as_ref(), + bc.as_ref(), + "{ctx}: column '{}' data mismatch", + field.name(), + ); + } + } +} + // ============================================================================ // Spec - Result of reader.execute() // ============================================================================ @@ -642,6 +822,27 @@ pub trait Reader { &AnsiDialect } + /// Materialize the result of `body_sql` as a temporary table named `name`. + fn materialize_table( + &self, + name: &str, + column_aliases: &[String], + body_sql: &str, + ) -> Result<()> { + for stmt in self + .dialect() + .create_or_replace_temp_table_sql(name, column_aliases, body_sql) + { + self.execute_sql(&stmt)?; + } + Ok(()) + } + + /// Whether this reader stages external data sources into a separate cache. + fn caches_sources(&self) -> bool { + false + } + // ========================================================================= // Schema introspection // ========================================================================= @@ -721,6 +922,16 @@ pub trait Reader { } } +/// A reader that can serve as an in-memory, writable caching backend. +/// +/// Cache backends take no options: they are always a fresh in-memory, writable +/// database scoped to the process; consumed by [`CachingReader`]. +pub trait CacheBackend: Reader { + fn new_in_memory() -> Result + where + Self: Sized; +} + /// A table or view in the schema. pub struct TableInfo { pub name: String, diff --git a/src/reader/sqlite.rs b/src/reader/sqlite.rs index 8f1e1f9f6..245c65a96 100644 --- a/src/reader/sqlite.rs +++ b/src/reader/sqlite.rs @@ -3,7 +3,7 @@ //! Provides a reader for SQLite databases with Arrow DataFrame integration. //! Works on both native targets and wasm32-unknown-unknown (via sqlite-wasm-rs). -use crate::reader::Reader; +use crate::reader::{CacheBackend, Reader}; use crate::{naming, DataFrame, GgsqlError, Result}; use arrow::array::*; use arrow::datatypes::{DataType, TimeUnit}; @@ -161,21 +161,31 @@ impl SqliteReader { } /// Create a SQLite reader from a connection string + /// + /// `sqlite://memory` (or `sqlite://:memory:`) opens an in-memory database; + /// any other path opens that file. pub fn from_connection_string(uri: &str) -> Result { - let conn_info = super::connection::parse_connection_string(uri)?; + let path = uri.strip_prefix("sqlite://").ok_or_else(|| { + GgsqlError::ReaderError(format!( + "SQLite URI must start with sqlite://, got '{}'", + uri + )) + })?; - let conn = match conn_info { - super::connection::ConnectionInfo::SQLite(path) => { - Connection::open(&path).map_err(|e| { - GgsqlError::ReaderError(format!("Failed to open SQLite file '{}': {}", path, e)) - })? - } - _ => { - return Err(GgsqlError::ReaderError(format!( - "Connection string '{}' is not supported by SqliteReader", - uri - ))) - } + if path.is_empty() { + return Err(GgsqlError::ReaderError( + "SQLite file path cannot be empty".to_string(), + )); + } + + let conn = if path == "memory" || path == ":memory:" { + Connection::open_in_memory().map_err(|e| { + GgsqlError::ReaderError(format!("Failed to open in-memory SQLite: {}", e)) + })? + } else { + Connection::open(path).map_err(|e| { + GgsqlError::ReaderError(format!("Failed to open SQLite file '{}': {}", path, e)) + })? }; #[cfg(feature = "spatial")] @@ -366,6 +376,12 @@ fn to_sql_value(v: &dyn rusqlite::types::ToSql) -> Option Result { + Self::new() + } +} + impl Reader for SqliteReader { fn execute_sql(&self, sql: &str) -> Result { // Handle ggsql:name namespaced identifiers (builtin datasets) From 6bf435e0ed90cd94cc9b2678672d151d3d8995d1 Mon Sep 17 00:00:00 2001 From: George Stagg Date: Mon, 29 Jun 2026 15:02:19 +0100 Subject: [PATCH 02/15] Explicit compute/source split for caching layer --- src/CLAUDE.md | 2 +- src/execute/mod.rs | 9 +- src/reader/cache.rs | 240 ++++---------------------------- src/reader/cache_equivalence.rs | 32 ++++- src/reader/mod.rs | 26 +++- 5 files changed, 88 insertions(+), 221 deletions(-) diff --git a/src/CLAUDE.md b/src/CLAUDE.md index 8047c38a7..dd07af6e1 100644 --- a/src/CLAUDE.md +++ b/src/CLAUDE.md @@ -52,7 +52,7 @@ Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding `SqlDialect` trait in `mod.rs` lets each driver supply its own type names, information-schema queries, and spatial helper methods (`sql_st_transform`, `sql_geometry_to_wkb`, `sql_geometry_bbox`, `sql_ensure_geometry`, `sql_select_replace`, `sql_spatial_setup`). -**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`. It routes each `execute_sql`: cache-resident names and `ggsql:` builtins go to the cache, base reads go to the primary (and are memoized). The `Reader::materialize_table` seam (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to run the read and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's extra materialization of explicit per-layer sources. `dialect()` returns the **cache** dialect because every executor-generated query targets a cache-resident table. Selected via the composite `+://` connection scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default. +**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache. **`execute_sql_primary` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins and references to already-cached internal tables, which go to the cache. The executor (`prepare_data_with_reader`) decides per call site, passing an `execute_query` closure to derived sites and `execute_query_primary` to the setup/side-effect loops — so the routing is explicit, not guessed. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's extra materialization of explicit per-layer sources. `dialect()` returns the **cache** dialect. Selected via the composite `+://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default. ### `execute/` diff --git a/src/execute/mod.rs b/src/execute/mod.rs index cf205c505..a041aef48 100644 --- a/src/execute/mod.rs +++ b/src/execute/mod.rs @@ -1098,7 +1098,12 @@ pub struct PreparedData { /// * `query` - The full ggsql query string /// * `reader` - A Reader implementation for executing SQL pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result { + // Two execution surfaces: `execute_query` is the COMPUTE surface for + // derived/dialect-generated SQL over internal `__ggsql_*` tables; + // `execute_query_primary` is the SOURCE surface for base reads of the + // user's data and user setup/DML. let execute_query = |sql: &str| reader.execute_sql(sql); + let execute_query_primary = |sql: &str| reader.execute_sql_primary(sql); let dialect = reader.dialect(); // Parse once and create SourceTree @@ -1129,7 +1134,7 @@ pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result Result, + /// In-memory writeable cache: derived tables, registered data, memoized reads. cache: Box, - /// Names that currently live in the cache (registered tables / memo results). - cached_names: RefCell>, - /// Memo of base primary reads: SQL text -> cache table name holding the result. + /// Memo of base primary reads: source SQL text -> cache table holding the result. result_cache: RefCell>, /// Monotonic counter for unique memo table names. next_id: Cell, @@ -37,38 +34,24 @@ impl CachingReader { Self { primary, cache, - cached_names: RefCell::new(HashSet::new()), result_cache: RefCell::new(HashMap::new()), next_id: Cell::new(0), } } - - /// Whether `sql` is one of the cache dialect's spatial setup statements. - /// - /// ggsql-generated spatial setup (`INSTALL`/`LOAD spatial`, …) must run on - /// the cache, where the spatial SQL executes — not on the primary. - fn is_cache_spatial_setup(&self, sql: &str) -> bool { - self.cache - .dialect() - .sql_spatial_setup() - .iter() - .any(|stmt| stmt == sql) - } } impl Reader for CachingReader { + /// Compute surface: all derived/dialect-generated SQL runs on the cache. fn execute_sql(&self, sql: &str) -> Result { - // Cache-resident names and `ggsql:` builtins are served by the cache. - if references_cached_name(sql, &self.cached_names.borrow()) || sql.contains("ggsql:") { - return self.cache.execute_sql(sql); - } + self.cache.execute_sql(sql) + } - // ggsql-generated spatial setup targets the cache (where spatial SQL runs). - if self.is_cache_spatial_setup(sql) { + /// Source surface: base reads of the user's data (plus user setup/DML). + fn execute_sql_primary(&self, sql: &str) -> Result { + if sql.contains("ggsql:") || sql.contains("__ggsql_") { return self.cache.execute_sql(sql); } - // Base read against the primary, with result memoization. if let Some(table) = self.result_cache.borrow().get(sql) { return self .cache @@ -82,7 +65,6 @@ impl Reader for CachingReader { self.next_id.set(id + 1); let table = naming::cache_result_table(id); self.cache.register(&table, df.clone(), true)?; - self.cached_names.borrow_mut().insert(table.clone()); self.result_cache .borrow_mut() .insert(sql.to_string(), table); @@ -92,15 +74,11 @@ impl Reader for CachingReader { } fn register(&self, name: &str, df: DataFrame, replace: bool) -> Result<()> { - self.cache.register(name, df, replace)?; - self.cached_names.borrow_mut().insert(name.to_string()); - Ok(()) + self.cache.register(name, df, replace) } fn unregister(&self, name: &str) -> Result<()> { - self.cache.unregister(name)?; - self.cached_names.borrow_mut().remove(name); - Ok(()) + self.cache.unregister(name) } fn execute(&self, query: &str) -> Result { @@ -118,8 +96,10 @@ impl Reader for CachingReader { column_aliases: &[String], body_sql: &str, ) -> Result<()> { + // Read the body via the source surface, then register the result + // into the cache. let body = super::wrap_with_column_aliases(body_sql, column_aliases); - let df = self.execute_sql(&body)?; + let df = self.execute_sql_primary(&body)?; self.register(name, df, true) } @@ -146,142 +126,6 @@ impl Reader for CachingReader { } } -/// Check whether `sql` references any name in `cached_names` as a SQL -/// identifier (not part of a longer identifier, not inside a single-quoted -/// string literal). -/// -/// Matches a bare reference (`FROM orders`), a double-quoted identifier -/// (`FROM "orders"`), or a qualified reference (`FROM catalog.schema.orders`). -/// Does not match substrings of longer identifiers (`orders_detail`) or -/// string-literal contents (`'orders of magnitude'`). This is a lightweight -/// scanner, not a full SQL parser: a cached name appearing only inside a -/// comment is misrouted to the cache and fails with a clear error rather than -/// silently hitting the primary. -fn references_cached_name(sql: &str, cached_names: &HashSet) -> bool { - cached_names - .iter() - .any(|name| sql_references_identifier(sql, name)) -} - -fn sql_references_identifier(sql: &str, name: &str) -> bool { - let bytes = sql.as_bytes(); - let name_bytes = name.as_bytes(); - let n = name_bytes.len(); - if n == 0 { - return false; - } - let mut i = 0; - while i + n <= bytes.len() { - if &bytes[i..i + n] == name_bytes { - let before_ok = i == 0 || !is_identifier_byte(bytes[i - 1]); - let after_ok = i + n == bytes.len() || !is_identifier_byte(bytes[i + n]); - if before_ok && after_ok && !is_inside_string_literal(bytes, i) { - return true; - } - } - i += 1; - } - false -} - -fn is_identifier_byte(b: u8) -> bool { - b.is_ascii_alphanumeric() || b == b'_' -} - -/// Walk from start to `pos` tracking whether we're inside a single-quoted -/// string literal. SQL-standard doubled-single-quote (`''`) is an escape that -/// keeps us inside the literal. -fn is_inside_string_literal(bytes: &[u8], pos: usize) -> bool { - let mut inside = false; - let mut i = 0; - while i < pos && i < bytes.len() { - if bytes[i] == b'\'' { - if inside && i + 1 < bytes.len() && bytes[i + 1] == b'\'' { - i += 2; - continue; - } - inside = !inside; - } - i += 1; - } - inside -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_references_cached_name_empty_set() { - let set = HashSet::new(); - assert!(!references_cached_name("SELECT * FROM foo", &set)); - } - - #[test] - fn test_references_cached_name_match() { - let mut set = HashSet::new(); - set.insert("__ggsql_global_abc123__".to_string()); - assert!(references_cached_name( - "SELECT * FROM __ggsql_global_abc123__ WHERE x > 1", - &set - )); - } - - #[test] - fn test_references_cached_name_rejects_longer_identifier() { - let mut set = HashSet::new(); - set.insert("orders".to_string()); - assert!(!references_cached_name( - "SELECT * FROM orders_detail WHERE x > 1", - &set - )); - } - - #[test] - fn test_references_cached_name_rejects_prefix_of_longer_identifier() { - let mut set = HashSet::new(); - set.insert("col".to_string()); - assert!(!references_cached_name("SELECT col_id FROM users", &set)); - } - - #[test] - fn test_references_cached_name_rejects_inside_string_literal() { - let mut set = HashSet::new(); - set.insert("orders".to_string()); - assert!(!references_cached_name( - "SELECT 'orders of magnitude' AS label", - &set - )); - } - - #[test] - fn test_references_cached_name_matches_quoted_identifier() { - let mut set = HashSet::new(); - set.insert("orders".to_string()); - assert!(references_cached_name(r#"SELECT * FROM "orders""#, &set)); - } - - #[test] - fn test_references_cached_name_matches_qualified_reference() { - let mut set = HashSet::new(); - set.insert("orders".to_string()); - assert!(references_cached_name( - "SELECT * FROM catalog.schema.orders WHERE x > 1", - &set - )); - } - - #[test] - fn test_references_cached_name_handles_escaped_quotes_in_literal() { - let mut set = HashSet::new(); - set.insert("orders".to_string()); - assert!(references_cached_name( - "SELECT 'it''s fine' FROM orders", - &set - )); - } -} - #[cfg(all(test, feature = "duckdb"))] mod behavior_tests { use super::*; @@ -299,15 +143,16 @@ mod behavior_tests { reader .register("t", df! { "x" => vec![1_i64, 2, 3] }.unwrap(), true) .unwrap(); + // register writes to the cache; the compute surface reads it back. let out = reader.execute_sql("SELECT COUNT(*) AS n FROM t").unwrap(); assert_eq!(as_i64(out.column("n").unwrap()).unwrap().value(0), 3); - // The query routed to the cache; the primary was never touched. + // The primary was never touched. assert!(log.lock().unwrap().is_empty()); } #[test] - fn test_base_read_routes_to_primary_and_memoizes() { + fn test_source_read_hits_primary_and_memoizes() { let inner = DuckDBReader::new_in_memory().unwrap(); inner .register("base", df! { "y" => vec![1_i64, 2, 3] }.unwrap(), true) @@ -317,8 +162,8 @@ mod behavior_tests { let reader = CachingReader::new(primary, cache); let q = "SELECT y FROM base ORDER BY y"; - let d1 = reader.execute_sql(q).unwrap(); - let d2 = reader.execute_sql(q).unwrap(); + let d1 = reader.execute_sql_primary(q).unwrap(); + let d2 = reader.execute_sql_primary(q).unwrap(); assert_eq!(d1.height(), 3); assert_eq!(d2.height(), 3); @@ -438,31 +283,6 @@ mod behavior_tests { ); } - #[cfg(feature = "spatial")] - #[test] - fn test_spatial_setup_routes_to_cache() { - let (primary, log) = SpyReader::wrap(Box::new(DuckDBReader::new_in_memory().unwrap())); - let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); - let reader = CachingReader::new(primary, cache); - - // ggsql-generated spatial setup must run on the cache (where the spatial - // SQL executes), never on the primary. - let setup = reader.dialect().sql_spatial_setup(); - assert!(!setup.is_empty(), "DuckDB cache should emit spatial setup"); - for stmt in &setup { - // Ignore execution result (may require the extension); we only assert routing. - let _ = reader.execute_sql(stmt); - } - let log = log.lock().unwrap(); - for stmt in &setup { - assert!( - !log.contains(stmt), - "spatial setup must route to the cache, not the primary: {}", - stmt - ); - } - } - #[cfg(feature = "sqlite")] #[test] fn test_dialect_returns_cache_dialect() { diff --git a/src/reader/cache_equivalence.rs b/src/reader/cache_equivalence.rs index 6b8f808d2..c490919b0 100644 --- a/src/reader/cache_equivalence.rs +++ b/src/reader/cache_equivalence.rs @@ -228,6 +228,33 @@ fn factory_builds_caching_readers() { } } +/// Map projections run entirely on the cache. +#[cfg(all(feature = "spatial", feature = "builtin-data"))] +#[test] +fn map_projection_runs_on_cache_not_primary() { + let (primary, log) = SpyReader::wrap(Box::new(DuckDBReader::new_in_memory().unwrap())); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache); + + let spec = reader.execute("VISUALISE FROM ggsql:world DRAW spatial PROJECT TO orthographic"); + assert!( + spec.is_ok(), + "map projection via cache failed: {:?}", + spec.err() + ); + + // No dialect-specific spatial SQL, temp-table DDL, or `__ggsql_*` reference + // ever reached the primary. + let log = log.lock().unwrap(); + for stmt in log.iter() { + let upper = stmt.to_uppercase(); + assert!( + !upper.contains("ST_") && !upper.contains("TEMP TABLE") && !stmt.contains("__ggsql_"), + "derived spatial SQL leaked to the primary: {stmt}" + ); + } +} + /// A real external ADBC SQLite primary + DuckDB cache, compared against a bare ADBC reader. /// `#[ignore]` — requires `dbc install sqlite`. #[cfg(feature = "adbc")] @@ -318,12 +345,13 @@ mod adbc_mode { let cache = DuckDBReader::new_in_memory().unwrap(); let cached = CachingReader::new(Box::new(primary), Box::new(cache)); - let miss = cached.execute_sql(sql).unwrap(); + // Base reads go through the source surface; the cache memoizes them. + let miss = cached.execute_sql_primary(sql).unwrap(); assert_dataframes_equal(&golden, &miss, "adbc cache miss"); let after_miss = calls.load(Ordering::SeqCst); assert!(after_miss >= 1, "miss should reach the ADBC primary"); - let hit = cached.execute_sql(sql).unwrap(); + let hit = cached.execute_sql_primary(sql).unwrap(); assert_dataframes_equal(&golden, &hit, "adbc cache hit"); let after_hit = calls.load(Ordering::SeqCst); assert_eq!( diff --git a/src/reader/mod.rs b/src/reader/mod.rs index db9ad7791..2eef03bea 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -724,15 +724,27 @@ pub struct Metadata { /// let result = reader.execute_sql("SELECT * FROM sales WHERE amount > 100")?; /// ``` pub trait Reader { - /// Execute a SQL query and return the result as a DataFrame + /// Execute a SQL query and return the result as a DataFrame. /// - /// # Arguments + /// This is the **compute surface**: ggsql runs all dialect-generated/derived + /// SQL here. A plain reader runs it on its own connection; a + /// caching reader runs it on the in-memory cache, where all derived + /// `__ggsql_*` tables live. /// - /// * `sql` - The SQL query to execute + /// # Errors /// - /// # Returns + /// Returns `GgsqlError::ReaderError` if: + /// - The SQL is invalid + /// - The connection fails + /// - The table or columns don't exist + fn execute_sql(&self, sql: &str) -> Result; + + /// Execute SQL against the *source* surface — base reads of the user's data + /// plus user setup/DML. /// - /// A Polars DataFrame containing the query results + /// Defaults to the compute surface ([`Reader::execute_sql`]), so a plain + /// reader runs everything on one connection. A caching reader overrides this + /// to read the primary (with result memoization). /// /// # Errors /// @@ -740,7 +752,9 @@ pub trait Reader { /// - The SQL is invalid /// - The connection fails /// - The table or columns don't exist - fn execute_sql(&self, sql: &str) -> Result; + fn execute_sql_primary(&self, sql: &str) -> Result { + self.execute_sql(sql) + } /// Register a DataFrame as a queryable table (takes ownership) /// From 5a435999713ec23b39dab51b443cf0d939475151 Mon Sep 17 00:00:00 2001 From: George Stagg Date: Mon, 29 Jun 2026 15:03:53 +0100 Subject: [PATCH 03/15] Update changelog --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b9eb1a45c..327a651f7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ ## [Unreleased] +### Added + +- New caching layer that wraps any `Reader` with an in-memory, writeable cache + backend (currently duckdb or sqlite), making write-constrained databases + usable and avoiding repeated remote reads during interactive iteration. + ## 0.4.1 - 2026-06-22 ### Changed From 589f179d32169061682726f675ab0e390135998c Mon Sep 17 00:00:00 2001 From: George Stagg Date: Mon, 29 Jun 2026 16:16:44 +0100 Subject: [PATCH 04/15] Apply changes from code review --- ggsql-cli/src/main.rs | 2 +- ggsql-jupyter/src/executor.rs | 4 +- src/CLAUDE.md | 2 +- src/execute/mod.rs | 8 +- src/reader/cache.rs | 165 +++++++++++++++++++++++++++++++++- 5 files changed, 172 insertions(+), 9 deletions(-) diff --git a/ggsql-cli/src/main.rs b/ggsql-cli/src/main.rs index 0a004fbce..413dae6b4 100644 --- a/ggsql-cli/src/main.rs +++ b/ggsql-cli/src/main.rs @@ -446,7 +446,7 @@ fn print_table_fallback(query: &str, reader: &R, max_rows: u let sql_part = source_tree.extract_sql().unwrap_or_default(); - let data = reader.execute_sql(&sql_part); + let data = reader.execute_sql_primary(&sql_part); if let Err(e) = data { eprintln!("Failed to execute SQL query: {}", e); std::process::exit(1) diff --git a/ggsql-jupyter/src/executor.rs b/ggsql-jupyter/src/executor.rs index fb9d51bfb..668d5f3dd 100644 --- a/ggsql-jupyter/src/executor.rs +++ b/ggsql-jupyter/src/executor.rs @@ -177,8 +177,8 @@ impl QueryExecutor { // 2. Check if there's a visualization if !validated.has_visual() { - // Pure SQL query - execute directly and return DataFrame - let df = self.reader.execute_sql(code)?; + // Pure SQL query - execute directly and return DataFrame. + let df = self.reader.execute_sql_primary(code)?; tracing::info!( "Pure SQL executed: {} rows, {} cols", df.height(), diff --git a/src/CLAUDE.md b/src/CLAUDE.md index dd07af6e1..d0a12fba0 100644 --- a/src/CLAUDE.md +++ b/src/CLAUDE.md @@ -52,7 +52,7 @@ Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding `SqlDialect` trait in `mod.rs` lets each driver supply its own type names, information-schema queries, and spatial helper methods (`sql_st_transform`, `sql_geometry_to_wkb`, `sql_geometry_bbox`, `sql_ensure_geometry`, `sql_select_replace`, `sql_spatial_setup`). -**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache. **`execute_sql_primary` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins and references to already-cached internal tables, which go to the cache. The executor (`prepare_data_with_reader`) decides per call site, passing an `execute_query` closure to derived sites and `execute_query_primary` to the setup/side-effect loops — so the routing is explicit, not guessed. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's extra materialization of explicit per-layer sources. `dialect()` returns the **cache** dialect. Selected via the composite `+://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default. +**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache. **`execute_sql_primary` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins and reads that reference a cache-resident internal table, which go to the cache. Cache routing is by **membership** in the set of tables actually registered into the cache (the `resident` set: internal `__ggsql_*` tables plus user-registered tables). A non-row-returning source statement (DML/DDL on the primary) clears the read memo, so later identical reads don't return stale data. Pure/non-visual SQL (CLI table fallback, Jupyter) also goes through `execute_sql_primary` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `+://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default. ### `execute/` diff --git a/src/execute/mod.rs b/src/execute/mod.rs index a041aef48..acec6e512 100644 --- a/src/execute/mod.rs +++ b/src/execute/mod.rs @@ -1211,7 +1211,13 @@ pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result t.clone(), None => { let t = naming::layer_source_table(materialized_sources.len()); - reader.materialize_table(&t, &[], &source_query)?; + if matches!(layer.source, Some(crate::DataSource::FilePath(_))) { + // The cache backend reads local files + let df = reader.execute_sql(&source_query)?; + reader.register(&t, df, true)?; + } else { + reader.materialize_table(&t, &[], &source_query)?; + } materialized_sources.insert(source_query, t.clone()); t } diff --git a/src/reader/cache.rs b/src/reader/cache.rs index fdeb47233..e57393acf 100644 --- a/src/reader/cache.rs +++ b/src/reader/cache.rs @@ -14,7 +14,7 @@ use crate::reader::{execute_with_reader, ColumnInfo, Reader, Spec, SqlDialect, TableInfo}; use crate::{naming, DataFrame, Result}; use std::cell::{Cell, RefCell}; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; pub struct CachingReader { /// Primary backend — the real data source. @@ -25,6 +25,9 @@ pub struct CachingReader { result_cache: RefCell>, /// Monotonic counter for unique memo table names. next_id: Cell, + /// Names registered into the cache. A source read that references one + /// is routed to the cache rather than the primary. + resident: RefCell>, } impl CachingReader { @@ -36,6 +39,7 @@ impl CachingReader { cache, result_cache: RefCell::new(HashMap::new()), next_id: Cell::new(0), + resident: RefCell::new(HashSet::new()), } } } @@ -48,7 +52,12 @@ impl Reader for CachingReader { /// Source surface: base reads of the user's data (plus user setup/DML). fn execute_sql_primary(&self, sql: &str) -> Result { - if sql.contains("ggsql:") || sql.contains("__ggsql_") { + // Route to the cache when the read targets cache-resident objects + let references_cache_table = { + let resident = self.resident.borrow(); + resident.iter().any(|t| sql.contains(t.as_str())) + }; + if sql.contains("ggsql:") || references_cache_table { return self.cache.execute_sql(sql); } @@ -68,17 +77,25 @@ impl Reader for CachingReader { self.result_cache .borrow_mut() .insert(sql.to_string(), table); + } else { + // A source-side write or DDL on the primary can change the data + // behind a memoized read; drop the memo. + self.result_cache.borrow_mut().clear(); } Ok(df) } fn register(&self, name: &str, df: DataFrame, replace: bool) -> Result<()> { - self.cache.register(name, df, replace) + self.cache.register(name, df, replace)?; + self.resident.borrow_mut().insert(name.to_string()); + Ok(()) } fn unregister(&self, name: &str) -> Result<()> { - self.cache.unregister(name) + self.cache.unregister(name)?; + self.resident.borrow_mut().remove(name); + Ok(()) } fn execute(&self, query: &str) -> Result { @@ -321,4 +338,144 @@ mod behavior_tests { spec.err() ); } + + #[test] + fn test_aliased_cte_reading_primary_routes_to_primary() { + // A column-aliased CTE whose body reads a primary-only table must run on + // the primary. The `__ggsql_aliased__` column-alias wrapper must not + // misroute the read to the (empty) cache. + let base = DuckDBReader::new_in_memory().unwrap(); + base.register("base", df! { "v" => vec![1_i64, 2, 3] }.unwrap(), true) + .unwrap(); + let primary = Box::new(ReadOnlyReader::new(Box::new(base))); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache); + + let spec = reader.execute( + "WITH t(a) AS (SELECT v FROM base) SELECT a FROM t VISUALISE a AS x DRAW point", + ); + assert!( + spec.is_ok(), + "aliased CTE over a primary table should succeed: {:?}", + spec.err() + ); + } + + #[test] + fn test_aliased_cte_referencing_prior_cte_routes_to_cache() { + // A column-aliased CTE that references a *prior* CTE reads a table that + // lives in the cache, so its body must route to the cache, while the + // first CTE still reads the primary. + let base = DuckDBReader::new_in_memory().unwrap(); + base.register("base", df! { "v" => vec![1_i64, 2, 3] }.unwrap(), true) + .unwrap(); + let primary = Box::new(ReadOnlyReader::new(Box::new(base))); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache); + + let spec = reader.execute( + "WITH a(p) AS (SELECT v FROM base), b(q) AS (SELECT p FROM a) \ + SELECT q FROM b VISUALISE q AS x DRAW point", + ); + assert!( + spec.is_ok(), + "dependent aliased CTE should succeed: {:?}", + spec.err() + ); + } + + #[test] + fn test_source_write_invalidates_memo() { + // Memoize a base read, mutate the primary, then re-read: the memo must be + // invalidated by the non-row-returning statement so the second read is fresh. + let primary = DuckDBReader::new_in_memory().unwrap(); + primary + .register("t", df! { "v" => vec![1_i64, 2, 3] }.unwrap(), true) + .unwrap(); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(Box::new(primary), cache); + + let q = "SELECT v FROM t"; + let d1 = reader.execute_sql_primary(q).unwrap(); + assert_eq!(d1.height(), 3); + + reader + .execute_sql_primary("INSERT INTO t VALUES (4)") + .unwrap(); + + let d2 = reader.execute_sql_primary(q).unwrap(); + assert_eq!( + d2.height(), + 4, + "re-read must see the inserted row, not the memo" + ); + } + + #[test] + fn test_pure_sql_reads_primary_not_cache() { + // The pure-SQL display path uses `execute_sql_primary`, which reads the + // primary; `execute_sql` would hit the empty cache and fail. + let primary = DuckDBReader::new_in_memory().unwrap(); + primary + .register("t", df! { "v" => vec![1_i64, 2, 3] }.unwrap(), true) + .unwrap(); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(Box::new(primary), cache); + + let df = reader.execute_sql_primary("SELECT v FROM t").unwrap(); + assert_eq!(df.height(), 3); + assert!( + reader.execute_sql("SELECT v FROM t").is_err(), + "compute surface should not find the primary-only table" + ); + } + + #[test] + fn test_cache_resident_table_as_layer_source() { + // A table registered directly on the CachingReader lives only in the + // cache. + let primary = Box::new(DuckDBReader::new_in_memory().unwrap()); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache); + reader + .register( + "only_in_cache", + df! { "val" => vec![1.0_f64, 2.0, 2.0, 3.0, 3.0, 3.0, 9.0] }.unwrap(), + true, + ) + .unwrap(); + + let spec = reader.execute("VISUALISE x DRAW histogram MAPPING val AS x FROM only_in_cache"); + assert!( + spec.is_ok(), + "cache-resident layer source should succeed: {:?}", + spec.err() + ); + } + + #[cfg(feature = "sqlite")] + #[test] + fn test_file_layer_source_staged_via_cache() { + use crate::reader::SqliteReader; + // A file source must be staged on the cache surface. + let dir = std::env::temp_dir(); + let path = dir.join(format!("ggsql_cache_file_test_{}.csv", std::process::id())); + std::fs::write(&path, "val\n1.0\n2.0\n2.0\n3.0\n3.0\n3.0\n9.0\n").unwrap(); + let path_str = path.to_str().unwrap().to_string(); + + let primary = Box::new(SqliteReader::new().unwrap()); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache); + + let spec = reader.execute(&format!( + "VISUALISE x DRAW histogram MAPPING val AS x FROM '{}'", + path_str + )); + let _ = std::fs::remove_file(&path); + assert!( + spec.is_ok(), + "file layer source via cache should succeed: {:?}", + spec.err() + ); + } } From 78c856ea4c743866c2d3b306029d0b41b430681f Mon Sep 17 00:00:00 2001 From: George Stagg Date: Tue, 30 Jun 2026 10:13:41 +0100 Subject: [PATCH 05/15] Store cache metadata in the cache reader --- src/naming.rs | 15 ++- src/reader/cache.rs | 227 +++++++++++++++++++++++++++----- src/reader/cache_equivalence.rs | 10 +- src/reader/connection.rs | 6 +- 4 files changed, 215 insertions(+), 43 deletions(-) diff --git a/src/naming.rs b/src/naming.rs index 1d1357cea..dfcf6d5e4 100644 --- a/src/naming.rs +++ b/src/naming.rs @@ -85,6 +85,9 @@ pub const ORDER_COLUMN: &str = concatcp!(GGSQL_PREFIX, "order", GGSQL_SUFFIX); /// Used with Vega-Lite filter transforms to select per-layer data. pub const SOURCE_COLUMN: &str = concatcp!(GGSQL_PREFIX, "source", GGSQL_SUFFIX); +/// Name of the caching layer's metadata table, held in the cache backend. +pub const CACHE_META_TABLE: &str = concatcp!(GGSQL_PREFIX, "cache_meta", GGSQL_SUFFIX); + /// Alias for schema extraction queries pub const SCHEMA_ALIAS: &str = concatcp!(GGSQL_SUFFIX, "schema", GGSQL_SUFFIX); @@ -139,21 +142,22 @@ pub fn cte_table(cte_name: &str) -> String { /// Generate temp table name for a memoized primary query result in a cache. /// -/// Format: `__ggsql_cache____` +/// The cache key is a hex hash of the primary URI and SQL. +/// Format: `__ggsql_cache____` /// /// # Example /// ``` /// use ggsql::naming; -/// let table = naming::cache_result_table(0); +/// let table = naming::cache_result_table("deadbeef"); /// assert!(table.starts_with("__ggsql_cache_")); -/// assert!(table.ends_with("_0__")); +/// assert!(table.ends_with("_deadbeef__")); /// ``` -pub fn cache_result_table(id: u64) -> String { +pub fn cache_result_table(key: &str) -> String { format!( "{}cache_{}_{}{}", GGSQL_PREFIX, session_id(), - id, + key, GGSQL_SUFFIX ) } @@ -536,6 +540,7 @@ mod tests { assert_eq!(ORDER_COLUMN, "__ggsql_order__"); assert_eq!(SOURCE_COLUMN, "__ggsql_source__"); assert_eq!(SCHEMA_ALIAS, "__schema__"); + assert_eq!(CACHE_META_TABLE, "__ggsql_cache_meta__"); } #[test] diff --git a/src/reader/cache.rs b/src/reader/cache.rs index e57393acf..9cb27affc 100644 --- a/src/reader/cache.rs +++ b/src/reader/cache.rs @@ -14,34 +14,124 @@ use crate::reader::{execute_with_reader, ColumnInfo, Reader, Spec, SqlDialect, TableInfo}; use crate::{naming, DataFrame, Result}; use std::cell::{Cell, RefCell}; -use std::collections::{HashMap, HashSet}; +use std::collections::hash_map::DefaultHasher; +use std::collections::HashSet; +use std::hash::Hasher; pub struct CachingReader { /// Primary backend — the real data source. primary: Box, /// In-memory writeable cache: derived tables, registered data, memoized reads. cache: Box, - /// Memo of base primary reads: source SQL text -> cache table holding the result. - result_cache: RefCell>, - /// Monotonic counter for unique memo table names. - next_id: Cell, + /// Connection URI of the primary. + primary_uri: String, + /// Whether the metadata table has been created in the cache backend. + meta_ready: Cell, /// Names registered into the cache. A source read that references one /// is routed to the cache rather than the primary. resident: RefCell>, } impl CachingReader { - /// Construct a `CachingReader` from a primary reader and an in-memory cache - /// backend. The cache is owned by the `CachingReader` and dropped with it. - pub fn new(primary: Box, cache: Box) -> Self { + /// Construct a `CachingReader` from a primary reader, an in-memory cache + /// backend, and the primary's connection URI. The cache is owned by the + /// `CachingReader` and dropped with it. + pub fn new( + primary: Box, + cache: Box, + primary_uri: impl Into, + ) -> Self { Self { primary, cache, - result_cache: RefCell::new(HashMap::new()), - next_id: Cell::new(0), + primary_uri: primary_uri.into(), + meta_ready: Cell::new(false), resident: RefCell::new(HashSet::new()), } } + + /// Derive a stable cache key from the primary URI and the SQL text. + fn cache_key(&self, sql: &str) -> String { + let mut hasher = DefaultHasher::new(); + hasher.write(self.primary_uri.as_bytes()); + hasher.write(b"\n"); + hasher.write(sql.as_bytes()); + format!("{:016x}", hasher.finish()) + } + + /// Create the metadata table in the cache backend if it doesn't exist yet. + fn ensure_meta_table(&self) -> Result<()> { + if self.meta_ready.get() { + return Ok(()); + } + let sql = format!( + "CREATE TABLE IF NOT EXISTS {} \ + (cache_key VARCHAR PRIMARY KEY, sql VARCHAR NOT NULL, table_name VARCHAR NOT NULL)", + naming::quote_ident(naming::CACHE_META_TABLE) + ); + self.cache.execute_sql(&sql)?; + self.meta_ready.set(true); + Ok(()) + } + + /// Look up the cache table holding the memoized result for `key`. + fn lookup_memo(&self, key: &str) -> Result> { + let sql = format!( + "SELECT table_name FROM {} WHERE cache_key = {}", + naming::quote_ident(naming::CACHE_META_TABLE), + naming::quote_literal(key), + ); + let df = self.cache.execute_sql(&sql)?; + if df.height() == 0 { + return Ok(None); + } + let table = crate::array_util::as_str(df.column("table_name")?)? + .value(0) + .to_string(); + Ok(Some(table)) + } + + /// Record a memoized read in the metadata table. + fn insert_memo(&self, key: &str, sql: &str, table: &str) -> Result<()> { + let stmt = format!( + "INSERT OR REPLACE INTO {} (cache_key, sql, table_name) VALUES ({}, {}, {})", + naming::quote_ident(naming::CACHE_META_TABLE), + naming::quote_literal(key), + naming::quote_literal(sql), + naming::quote_literal(table), + ); + self.cache.execute_sql(&stmt)?; + Ok(()) + } + + /// The cache tables holding all currently-memoized results. + fn memo_tables(&self) -> Result> { + let sql = format!( + "SELECT table_name FROM {}", + naming::quote_ident(naming::CACHE_META_TABLE) + ); + let df = self.cache.execute_sql(&sql)?; + let n = df.height(); + if n == 0 { + return Ok(Vec::new()); + } + let col = crate::array_util::as_str(df.column("table_name")?)?; + Ok((0..n).map(|i| col.value(i).to_string()).collect()) + } + + /// Drop every memoized result and empty the metadata table. + fn clear_memo(&self) -> Result<()> { + self.ensure_meta_table()?; + for table in self.memo_tables()? { + let _ = self.cache.unregister(&table); + } + let del = format!( + "DELETE FROM {}", + naming::quote_ident(naming::CACHE_META_TABLE) + ); + self.cache.execute_sql(&del)?; + Ok(()) + } } impl Reader for CachingReader { @@ -52,35 +142,38 @@ impl Reader for CachingReader { /// Source surface: base reads of the user's data (plus user setup/DML). fn execute_sql_primary(&self, sql: &str) -> Result { - // Route to the cache when the read targets cache-resident objects + // Route to the cache when the read targets cache-resident objects, the + // metadata table, or a builtin dataset. let references_cache_table = { let resident = self.resident.borrow(); resident.iter().any(|t| sql.contains(t.as_str())) }; - if sql.contains("ggsql:") || references_cache_table { + if sql.contains("ggsql:") + || sql.contains(naming::CACHE_META_TABLE) + || references_cache_table + { return self.cache.execute_sql(sql); } - if let Some(table) = self.result_cache.borrow().get(sql) { + self.ensure_meta_table()?; + let key = self.cache_key(sql); + + if let Some(table) = self.lookup_memo(&key)? { return self .cache - .execute_sql(&format!("SELECT * FROM {}", naming::quote_ident(table))); + .execute_sql(&format!("SELECT * FROM {}", naming::quote_ident(&table))); } let df = self.primary.execute_sql(sql)?; if super::returns_rows(sql) { - let id = self.next_id.get(); - self.next_id.set(id + 1); - let table = naming::cache_result_table(id); + let table = naming::cache_result_table(&key); self.cache.register(&table, df.clone(), true)?; - self.result_cache - .borrow_mut() - .insert(sql.to_string(), table); + self.insert_memo(&key, sql, &table)?; } else { // A source-side write or DDL on the primary can change the data // behind a memoized read; drop the memo. - self.result_cache.borrow_mut().clear(); + self.clear_memo()?; } Ok(df) @@ -155,7 +248,7 @@ mod behavior_tests { fn test_register_writes_to_cache_and_query_routes_there() { let (primary, log) = SpyReader::wrap(Box::new(DuckDBReader::new_in_memory().unwrap())); let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); - let reader = CachingReader::new(primary, cache); + let reader = CachingReader::new(primary, cache, "test://primary"); reader .register("t", df! { "x" => vec![1_i64, 2, 3] }.unwrap(), true) @@ -176,7 +269,7 @@ mod behavior_tests { .unwrap(); let (primary, log) = SpyReader::wrap(Box::new(inner)); let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); - let reader = CachingReader::new(primary, cache); + let reader = CachingReader::new(primary, cache, "test://primary"); let q = "SELECT y FROM base ORDER BY y"; let d1 = reader.execute_sql_primary(q).unwrap(); @@ -207,7 +300,7 @@ mod behavior_tests { .unwrap(); let (primary, log) = SpyReader::wrap(Box::new(inner)); let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); - let reader = CachingReader::new(primary, cache); + let reader = CachingReader::new(primary, cache, "test://primary"); reader .execute("SELECT x, y FROM sales VISUALISE x, y DRAW point") @@ -266,6 +359,7 @@ mod behavior_tests { let cached = CachingReader::new( Box::new(ReadOnlyReader::new(Box::new(primary))), Box::new(DuckDBReader::new_in_memory().unwrap()), + "test://primary", ); assert!( cached.execute(query).is_ok(), @@ -308,7 +402,7 @@ mod behavior_tests { // GREATEST), not SQLite's (CASE fallback). let primary = Box::new(SqliteReader::new().unwrap()); let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); - let reader = CachingReader::new(primary, cache); + let reader = CachingReader::new(primary, cache, "test://primary"); assert_eq!(reader.dialect().sql_greatest(&["a", "b"]), "GREATEST(a, b)"); } @@ -329,7 +423,7 @@ mod behavior_tests { ) .unwrap(); let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); - let reader = CachingReader::new(Box::new(primary), cache); + let reader = CachingReader::new(Box::new(primary), cache, "test://primary"); let spec = reader.execute("VISUALISE x DRAW histogram MAPPING val AS x FROM tbl"); assert!( @@ -349,7 +443,7 @@ mod behavior_tests { .unwrap(); let primary = Box::new(ReadOnlyReader::new(Box::new(base))); let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); - let reader = CachingReader::new(primary, cache); + let reader = CachingReader::new(primary, cache, "test://primary"); let spec = reader.execute( "WITH t(a) AS (SELECT v FROM base) SELECT a FROM t VISUALISE a AS x DRAW point", @@ -371,7 +465,7 @@ mod behavior_tests { .unwrap(); let primary = Box::new(ReadOnlyReader::new(Box::new(base))); let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); - let reader = CachingReader::new(primary, cache); + let reader = CachingReader::new(primary, cache, "test://primary"); let spec = reader.execute( "WITH a(p) AS (SELECT v FROM base), b(q) AS (SELECT p FROM a) \ @@ -384,6 +478,75 @@ mod behavior_tests { ); } + #[test] + fn test_meta_table_records_and_serves_memo() { + // A memoized read is recorded in the metadata table and served back from + // the cache on repeat, without touching the primary again. + let inner = DuckDBReader::new_in_memory().unwrap(); + inner + .register("base", df! { "y" => vec![1_i64, 2, 3] }.unwrap(), true) + .unwrap(); + let (primary, log) = SpyReader::wrap(Box::new(inner)); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache, "test://primary"); + + let q = "SELECT y FROM base ORDER BY y"; + reader.execute_sql_primary(q).unwrap(); + + // The metadata table now has exactly one row for this read. + let meta = reader + .execute_sql(&format!("SELECT sql FROM {}", naming::CACHE_META_TABLE)) + .unwrap(); + assert_eq!(meta.height(), 1); + assert_eq!( + crate::array_util::as_str(meta.column("sql").unwrap()) + .unwrap() + .value(0), + q + ); + + // The repeat read is served from the cache, not the primary. + reader.execute_sql_primary(q).unwrap(); + let hits = log + .lock() + .unwrap() + .iter() + .filter(|s| s.as_str() == q) + .count(); + assert_eq!(hits, 1); + } + + #[cfg(feature = "sqlite")] + #[test] + fn test_sqlite_cache_backend_memoizes() { + use crate::reader::SqliteReader; + // A SQLite cache backend must support the metadata table DDL/DML + // (CREATE TABLE IF NOT EXISTS, INSERT OR REPLACE) and serve memoized reads. + let inner = DuckDBReader::new_in_memory().unwrap(); + inner + .register("base", df! { "y" => vec![1_i64, 2, 3] }.unwrap(), true) + .unwrap(); + let (primary, log) = SpyReader::wrap(Box::new(inner)); + let cache = Box::new(SqliteReader::new().unwrap()); + let reader = CachingReader::new(primary, cache, "test://primary"); + + let q = "SELECT y FROM base ORDER BY y"; + let d1 = reader.execute_sql_primary(q).unwrap(); + let d2 = reader.execute_sql_primary(q).unwrap(); + assert_eq!(d1.height(), 3); + assert_eq!(d2.height(), 3); + let hits = log + .lock() + .unwrap() + .iter() + .filter(|s| s.as_str() == q) + .count(); + assert_eq!( + hits, 1, + "SQLite cache should serve the repeat from the memo" + ); + } + #[test] fn test_source_write_invalidates_memo() { // Memoize a base read, mutate the primary, then re-read: the memo must be @@ -393,7 +556,7 @@ mod behavior_tests { .register("t", df! { "v" => vec![1_i64, 2, 3] }.unwrap(), true) .unwrap(); let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); - let reader = CachingReader::new(Box::new(primary), cache); + let reader = CachingReader::new(Box::new(primary), cache, "test://primary"); let q = "SELECT v FROM t"; let d1 = reader.execute_sql_primary(q).unwrap(); @@ -420,7 +583,7 @@ mod behavior_tests { .register("t", df! { "v" => vec![1_i64, 2, 3] }.unwrap(), true) .unwrap(); let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); - let reader = CachingReader::new(Box::new(primary), cache); + let reader = CachingReader::new(Box::new(primary), cache, "test://primary"); let df = reader.execute_sql_primary("SELECT v FROM t").unwrap(); assert_eq!(df.height(), 3); @@ -436,7 +599,7 @@ mod behavior_tests { // cache. let primary = Box::new(DuckDBReader::new_in_memory().unwrap()); let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); - let reader = CachingReader::new(primary, cache); + let reader = CachingReader::new(primary, cache, "test://primary"); reader .register( "only_in_cache", @@ -465,7 +628,7 @@ mod behavior_tests { let primary = Box::new(SqliteReader::new().unwrap()); let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); - let reader = CachingReader::new(primary, cache); + let reader = CachingReader::new(primary, cache, "test://primary"); let spec = reader.execute(&format!( "VISUALISE x DRAW histogram MAPPING val AS x FROM '{}'", diff --git a/src/reader/cache_equivalence.rs b/src/reader/cache_equivalence.rs index c490919b0..819d92d65 100644 --- a/src/reader/cache_equivalence.rs +++ b/src/reader/cache_equivalence.rs @@ -157,7 +157,7 @@ fn mode1_builtin_equivalence_matches_plain_duckdb() { let plain = DuckDBReader::new_in_memory().unwrap(); let primary = ReadOnlyReader::new(Box::new(SqliteReader::new().unwrap())); let cache = DuckDBReader::new_in_memory().unwrap(); - let cached = CachingReader::new(Box::new(primary), Box::new(cache)); + let cached = CachingReader::new(Box::new(primary), Box::new(cache), "test://primary"); assert_equivalent(&plain, &cached, case.query); } } @@ -174,7 +174,7 @@ fn mode1_read_only_primary_is_sufficient() { seed(&sqlite); let primary = ReadOnlyReader::new(Box::new(sqlite)); let cache = DuckDBReader::new_in_memory().unwrap(); - let cached = CachingReader::new(Box::new(primary), Box::new(cache)); + let cached = CachingReader::new(Box::new(primary), Box::new(cache), "test://primary"); let r = cached.execute(case.query); assert!( r.is_ok(), @@ -193,7 +193,7 @@ fn cross_call_memoization_avoids_second_primary_read() { seed(&sqlite); let (primary, log) = SpyReader::wrap(Box::new(sqlite)); let cache = DuckDBReader::new_in_memory().unwrap(); - let cached = CachingReader::new(primary, Box::new(cache)); + let cached = CachingReader::new(primary, Box::new(cache), "test://primary"); let query = "SELECT bill_len FROM cache_eq_tbl VISUALISE bill_len AS x DRAW histogram"; cached.execute(query).unwrap(); @@ -234,7 +234,7 @@ fn factory_builds_caching_readers() { fn map_projection_runs_on_cache_not_primary() { let (primary, log) = SpyReader::wrap(Box::new(DuckDBReader::new_in_memory().unwrap())); let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); - let reader = CachingReader::new(primary, cache); + let reader = CachingReader::new(primary, cache, "test://primary"); let spec = reader.execute("VISUALISE FROM ggsql:world DRAW spatial PROJECT TO orthographic"); assert!( @@ -343,7 +343,7 @@ mod adbc_mode { calls: calls.clone(), }; let cache = DuckDBReader::new_in_memory().unwrap(); - let cached = CachingReader::new(Box::new(primary), Box::new(cache)); + let cached = CachingReader::new(Box::new(primary), Box::new(cache), "test://primary"); // Base reads go through the source surface; the cache memoizes them. let miss = cached.execute_sql_primary(sql).unwrap(); diff --git a/src/reader/connection.rs b/src/reader/connection.rs index eebeffdf1..b699e60f3 100644 --- a/src/reader/connection.rs +++ b/src/reader/connection.rs @@ -107,7 +107,11 @@ pub fn reader_from_uri(uri: &str) -> Result> { { let primary = build_reader(&primary_uri)?; let cache = build_reader(cache_uri(&cache_scheme)?)?; - return Ok(Box::new(crate::reader::CachingReader::new(primary, cache))); + return Ok(Box::new(crate::reader::CachingReader::new( + primary, + cache, + primary_uri, + ))); } #[cfg(not(any(feature = "duckdb", feature = "sqlite")))] { From bd2b1552bd92f1c18178fee974654ac265b77393 Mon Sep 17 00:00:00 2001 From: George Stagg Date: Tue, 30 Jun 2026 11:59:59 +0100 Subject: [PATCH 06/15] Back cache memo with a meta table --- ggsql-cli/src/main.rs | 2 +- ggsql-jupyter/src/executor.rs | 2 +- src/CLAUDE.md | 2 +- src/execute/mod.rs | 16 +++--- src/reader/cache.rs | 98 ++++++++++++++++++++++----------- src/reader/cache_equivalence.rs | 4 +- src/reader/data.rs | 16 ++++++ src/reader/mod.rs | 19 +++---- 8 files changed, 104 insertions(+), 55 deletions(-) diff --git a/ggsql-cli/src/main.rs b/ggsql-cli/src/main.rs index 413dae6b4..0a004fbce 100644 --- a/ggsql-cli/src/main.rs +++ b/ggsql-cli/src/main.rs @@ -446,7 +446,7 @@ fn print_table_fallback(query: &str, reader: &R, max_rows: u let sql_part = source_tree.extract_sql().unwrap_or_default(); - let data = reader.execute_sql_primary(&sql_part); + let data = reader.execute_sql(&sql_part); if let Err(e) = data { eprintln!("Failed to execute SQL query: {}", e); std::process::exit(1) diff --git a/ggsql-jupyter/src/executor.rs b/ggsql-jupyter/src/executor.rs index 668d5f3dd..fab09a9e4 100644 --- a/ggsql-jupyter/src/executor.rs +++ b/ggsql-jupyter/src/executor.rs @@ -178,7 +178,7 @@ impl QueryExecutor { // 2. Check if there's a visualization if !validated.has_visual() { // Pure SQL query - execute directly and return DataFrame. - let df = self.reader.execute_sql_primary(code)?; + let df = self.reader.execute_sql(code)?; tracing::info!( "Pure SQL executed: {} rows, {} cols", df.height(), diff --git a/src/CLAUDE.md b/src/CLAUDE.md index d0a12fba0..5394c90fd 100644 --- a/src/CLAUDE.md +++ b/src/CLAUDE.md @@ -52,7 +52,7 @@ Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding `SqlDialect` trait in `mod.rs` lets each driver supply its own type names, information-schema queries, and spatial helper methods (`sql_st_transform`, `sql_geometry_to_wkb`, `sql_geometry_bbox`, `sql_ensure_geometry`, `sql_select_replace`, `sql_spatial_setup`). -**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache. **`execute_sql_primary` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins and reads that reference a cache-resident internal table, which go to the cache. Cache routing is by **membership** in the set of tables actually registered into the cache (the `resident` set: internal `__ggsql_*` tables plus user-registered tables). A non-row-returning source statement (DML/DDL on the primary) clears the read memo, so later identical reads don't return stale data. Pure/non-visual SQL (CLI table fallback, Jupyter) also goes through `execute_sql_primary` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `+://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default. +**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins, the `__ggsql_cache_meta__` table, and reads that reference a cache-resident internal table, which go to the cache. **`execute_sql_cached` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache; it defaults to `execute_sql` so a plain reader runs everything on one connection. Cache routing is by **exact-identifier membership** in the set of tables registered into the cache. Memoization keys on `hash(primary_uri + sql)` and is tracked in the `__ggsql_cache_meta__` table inside the cache backend. A non-row-returning source statement (DML/DDL on the primary) clears the read memo, so later identical reads don't return stale data. Pure/non-visual SQL (CLI table fallback, Jupyter) goes through `execute_sql` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `+://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default. ### `execute/` diff --git a/src/execute/mod.rs b/src/execute/mod.rs index acec6e512..2baa2032e 100644 --- a/src/execute/mod.rs +++ b/src/execute/mod.rs @@ -1098,12 +1098,10 @@ pub struct PreparedData { /// * `query` - The full ggsql query string /// * `reader` - A Reader implementation for executing SQL pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result { - // Two execution surfaces: `execute_query` is the COMPUTE surface for - // derived/dialect-generated SQL over internal `__ggsql_*` tables; - // `execute_query_primary` is the SOURCE surface for base reads of the - // user's data and user setup/DML. - let execute_query = |sql: &str| reader.execute_sql(sql); - let execute_query_primary = |sql: &str| reader.execute_sql_primary(sql); + // `execute_query` is the COMPUTE surface for derived/dialect-generated SQL + // over internal `__ggsql_*` tables. Base source reads (user setup/DML, the + // global query) call `reader.execute_sql(...)` directly. + let execute_query = |sql: &str| reader.execute_sql_cached(sql); let dialect = reader.dialect(); // Parse once and create SourceTree @@ -1134,7 +1132,7 @@ pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result Result Result bool { + let Ok(refs) = super::data::extract_table_refs(sql) else { + return false; + }; + let resident = self.resident.borrow(); + refs.iter().any(|t| resident.contains(t)) + } + /// Drop every memoized result and empty the metadata table. fn clear_memo(&self) -> Result<()> { self.ensure_meta_table()?; @@ -135,22 +147,13 @@ impl CachingReader { } impl Reader for CachingReader { - /// Compute surface: all derived/dialect-generated SQL runs on the cache. - fn execute_sql(&self, sql: &str) -> Result { - self.cache.execute_sql(sql) - } - /// Source surface: base reads of the user's data (plus user setup/DML). - fn execute_sql_primary(&self, sql: &str) -> Result { + fn execute_sql(&self, sql: &str) -> Result { // Route to the cache when the read targets cache-resident objects, the // metadata table, or a builtin dataset. - let references_cache_table = { - let resident = self.resident.borrow(); - resident.iter().any(|t| sql.contains(t.as_str())) - }; if sql.contains("ggsql:") || sql.contains(naming::CACHE_META_TABLE) - || references_cache_table + || self.references_resident(sql) { return self.cache.execute_sql(sql); } @@ -179,6 +182,11 @@ impl Reader for CachingReader { Ok(df) } + /// Compute surface: derived/dialect-generated SQL runs on the cache. + fn execute_sql_cached(&self, sql: &str) -> Result { + self.cache.execute_sql(sql) + } + fn register(&self, name: &str, df: DataFrame, replace: bool) -> Result<()> { self.cache.register(name, df, replace)?; self.resident.borrow_mut().insert(name.to_string()); @@ -209,7 +217,7 @@ impl Reader for CachingReader { // Read the body via the source surface, then register the result // into the cache. let body = super::wrap_with_column_aliases(body_sql, column_aliases); - let df = self.execute_sql_primary(&body)?; + let df = self.execute_sql(&body)?; self.register(name, df, true) } @@ -254,7 +262,9 @@ mod behavior_tests { .register("t", df! { "x" => vec![1_i64, 2, 3] }.unwrap(), true) .unwrap(); // register writes to the cache; the compute surface reads it back. - let out = reader.execute_sql("SELECT COUNT(*) AS n FROM t").unwrap(); + let out = reader + .execute_sql_cached("SELECT COUNT(*) AS n FROM t") + .unwrap(); assert_eq!(as_i64(out.column("n").unwrap()).unwrap().value(0), 3); // The primary was never touched. @@ -272,8 +282,8 @@ mod behavior_tests { let reader = CachingReader::new(primary, cache, "test://primary"); let q = "SELECT y FROM base ORDER BY y"; - let d1 = reader.execute_sql_primary(q).unwrap(); - let d2 = reader.execute_sql_primary(q).unwrap(); + let d1 = reader.execute_sql(q).unwrap(); + let d2 = reader.execute_sql(q).unwrap(); assert_eq!(d1.height(), 3); assert_eq!(d2.height(), 3); @@ -491,7 +501,7 @@ mod behavior_tests { let reader = CachingReader::new(primary, cache, "test://primary"); let q = "SELECT y FROM base ORDER BY y"; - reader.execute_sql_primary(q).unwrap(); + reader.execute_sql(q).unwrap(); // The metadata table now has exactly one row for this read. let meta = reader @@ -506,7 +516,7 @@ mod behavior_tests { ); // The repeat read is served from the cache, not the primary. - reader.execute_sql_primary(q).unwrap(); + reader.execute_sql(q).unwrap(); let hits = log .lock() .unwrap() @@ -531,8 +541,8 @@ mod behavior_tests { let reader = CachingReader::new(primary, cache, "test://primary"); let q = "SELECT y FROM base ORDER BY y"; - let d1 = reader.execute_sql_primary(q).unwrap(); - let d2 = reader.execute_sql_primary(q).unwrap(); + let d1 = reader.execute_sql(q).unwrap(); + let d2 = reader.execute_sql(q).unwrap(); assert_eq!(d1.height(), 3); assert_eq!(d2.height(), 3); let hits = log @@ -547,6 +557,34 @@ mod behavior_tests { ); } + #[test] + fn test_resident_substring_not_false_matched() { + // A primary-only table whose name *contains* a cache-resident table name + // as a substring must still route to the primary. Exact-identifier + // matching distinguishes `orders` (resident) from `orders_archive` + // (primary-only). + let primary = DuckDBReader::new_in_memory().unwrap(); + primary + .register( + "orders_archive", + df! { "v" => vec![1_i64, 2, 3] }.unwrap(), + true, + ) + .unwrap(); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(Box::new(primary), cache, "test://primary"); + + // `orders` lives only in the cache. + reader + .register("orders", df! { "v" => vec![9_i64] }.unwrap(), true) + .unwrap(); + + // Reading the primary-only `orders_archive` must hit the primary (3 rows), + // not the resident `orders` (1 row). + let df = reader.execute_sql("SELECT v FROM orders_archive").unwrap(); + assert_eq!(df.height(), 3); + } + #[test] fn test_source_write_invalidates_memo() { // Memoize a base read, mutate the primary, then re-read: the memo must be @@ -559,14 +597,12 @@ mod behavior_tests { let reader = CachingReader::new(Box::new(primary), cache, "test://primary"); let q = "SELECT v FROM t"; - let d1 = reader.execute_sql_primary(q).unwrap(); + let d1 = reader.execute_sql(q).unwrap(); assert_eq!(d1.height(), 3); - reader - .execute_sql_primary("INSERT INTO t VALUES (4)") - .unwrap(); + reader.execute_sql("INSERT INTO t VALUES (4)").unwrap(); - let d2 = reader.execute_sql_primary(q).unwrap(); + let d2 = reader.execute_sql(q).unwrap(); assert_eq!( d2.height(), 4, @@ -576,8 +612,8 @@ mod behavior_tests { #[test] fn test_pure_sql_reads_primary_not_cache() { - // The pure-SQL display path uses `execute_sql_primary`, which reads the - // primary; `execute_sql` would hit the empty cache and fail. + // The pure-SQL display path uses `execute_sql` (source), which reads the + // primary; `execute_sql_cached` (compute) would hit the empty cache and fail. let primary = DuckDBReader::new_in_memory().unwrap(); primary .register("t", df! { "v" => vec![1_i64, 2, 3] }.unwrap(), true) @@ -585,10 +621,10 @@ mod behavior_tests { let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); let reader = CachingReader::new(Box::new(primary), cache, "test://primary"); - let df = reader.execute_sql_primary("SELECT v FROM t").unwrap(); + let df = reader.execute_sql("SELECT v FROM t").unwrap(); assert_eq!(df.height(), 3); assert!( - reader.execute_sql("SELECT v FROM t").is_err(), + reader.execute_sql_cached("SELECT v FROM t").is_err(), "compute surface should not find the primary-only table" ); } diff --git a/src/reader/cache_equivalence.rs b/src/reader/cache_equivalence.rs index 819d92d65..52960a4a7 100644 --- a/src/reader/cache_equivalence.rs +++ b/src/reader/cache_equivalence.rs @@ -346,12 +346,12 @@ mod adbc_mode { let cached = CachingReader::new(Box::new(primary), Box::new(cache), "test://primary"); // Base reads go through the source surface; the cache memoizes them. - let miss = cached.execute_sql_primary(sql).unwrap(); + let miss = cached.execute_sql(sql).unwrap(); assert_dataframes_equal(&golden, &miss, "adbc cache miss"); let after_miss = calls.load(Ordering::SeqCst); assert!(after_miss >= 1, "miss should reach the ADBC primary"); - let hit = cached.execute_sql_primary(sql).unwrap(); + let hit = cached.execute_sql(sql).unwrap(); assert_dataframes_equal(&golden, &hit, "adbc cache hit"); let after_hit = calls.load(Ordering::SeqCst); assert_eq!( diff --git a/src/reader/data.rs b/src/reader/data.rs index abf9e9604..384d944ff 100644 --- a/src/reader/data.rs +++ b/src/reader/data.rs @@ -146,6 +146,22 @@ pub fn extract_builtin_dataset_names(sql: &str) -> Result, GgsqlErro Ok(datasets) } +/// Extract the table names referenced in a SQL query's `FROM`/`JOIN` clauses. +/// +/// Returns the `table_ref` targets (deduplicated), with surrounding quotes +/// stripped via [`naming::unquote_ident`] so they can be compared against +/// unquoted registered table names. Subquery sources contribute no name. +pub fn extract_table_refs(sql: &str) -> Result, GgsqlError> { + let token_def = r#"(table_ref table: (_) @table)"#; + let mut tokens = tokens_from_tree(sql, token_def, "table")? + .iter() + .map(|t| naming::unquote_ident(t)) + .collect::>(); + tokens.sort_unstable(); + tokens.dedup(); + Ok(tokens) +} + /// Rewrite SQL to replace namespaced identifiers with internal table names. /// /// e.g., `SELECT * FROM ggsql:penguins` -> `SELECT * FROM __ggsql_data_penguins__` diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 2eef03bea..34a63f306 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -726,10 +726,9 @@ pub struct Metadata { pub trait Reader { /// Execute a SQL query and return the result as a DataFrame. /// - /// This is the **compute surface**: ggsql runs all dialect-generated/derived - /// SQL here. A plain reader runs it on its own connection; a - /// caching reader runs it on the in-memory cache, where all derived - /// `__ggsql_*` tables live. + /// This is the **source surface**: base reads of the user's data plus user + /// setup/DML. A plain reader runs everything on its one connection; a caching + /// reader reads the primary here (with result memoization). /// /// # Errors /// @@ -739,12 +738,12 @@ pub trait Reader { /// - The table or columns don't exist fn execute_sql(&self, sql: &str) -> Result; - /// Execute SQL against the *source* surface — base reads of the user's data - /// plus user setup/DML. + /// Execute SQL against the *compute surface* — dialect-generated/derived SQL + /// over internal `__ggsql_*` tables. /// - /// Defaults to the compute surface ([`Reader::execute_sql`]), so a plain - /// reader runs everything on one connection. A caching reader overrides this - /// to read the primary (with result memoization). + /// Defaults to the source surface ([`Reader::execute_sql`]), so a plain reader + /// runs everything on one connection. A caching reader overrides this to run + /// on the in-memory cache, where all derived `__ggsql_*` tables live. /// /// # Errors /// @@ -752,7 +751,7 @@ pub trait Reader { /// - The SQL is invalid /// - The connection fails /// - The table or columns don't exist - fn execute_sql_primary(&self, sql: &str) -> Result { + fn execute_sql_cached(&self, sql: &str) -> Result { self.execute_sql(sql) } From 9e6958e828f03fc4e50365bf637795b712b5ea1f Mon Sep 17 00:00:00 2001 From: George Stagg Date: Tue, 30 Jun 2026 15:03:25 +0100 Subject: [PATCH 07/15] Add TTL + LRU byte-budget eviction to the cache --- src/CLAUDE.md | 2 +- src/reader/cache.rs | 538 ++++++++++++++++++++++++++++++++++++++++---- src/reader/mod.rs | 5 + 3 files changed, 500 insertions(+), 45 deletions(-) diff --git a/src/CLAUDE.md b/src/CLAUDE.md index 5394c90fd..5f94c2589 100644 --- a/src/CLAUDE.md +++ b/src/CLAUDE.md @@ -52,7 +52,7 @@ Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding `SqlDialect` trait in `mod.rs` lets each driver supply its own type names, information-schema queries, and spatial helper methods (`sql_st_transform`, `sql_geometry_to_wkb`, `sql_geometry_bbox`, `sql_ensure_geometry`, `sql_select_replace`, `sql_spatial_setup`). -**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins, the `__ggsql_cache_meta__` table, and reads that reference a cache-resident internal table, which go to the cache. **`execute_sql_cached` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache; it defaults to `execute_sql` so a plain reader runs everything on one connection. Cache routing is by **exact-identifier membership** in the set of tables registered into the cache. Memoization keys on `hash(primary_uri + sql)` and is tracked in the `__ggsql_cache_meta__` table inside the cache backend. A non-row-returning source statement (DML/DDL on the primary) clears the read memo, so later identical reads don't return stale data. Pure/non-visual SQL (CLI table fallback, Jupyter) goes through `execute_sql` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `+://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default. +**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins, the `__ggsql_cache_meta__` table, and reads that reference a cache-resident internal table, which go to the cache. **`execute_sql_cached` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache; it defaults to `execute_sql` so a plain reader runs everything on one connection. Cache routing is by **exact-identifier membership** in the set of tables registered into the cache. Memoization keys on `hash(primary_uri + sql)` and is tracked in the `__ggsql_cache_meta__` table inside the cache backend. Each memoized read is bounded by a **TTL** (default 300s) and the whole memo by an **LRU byte budget** (default 512 MB); both are configurable via `CacheConfig` (env `GGSQL_CACHE_DISABLED`/`GGSQL_CACHE_TTL`/`GGSQL_CACHE_MAX_BYTES`). Pure/non-visual SQL (CLI table fallback, Jupyter) goes through `execute_sql` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `+://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default. ### `execute/` diff --git a/src/reader/cache.rs b/src/reader/cache.rs index b951b4154..7b7a14ce9 100644 --- a/src/reader/cache.rs +++ b/src/reader/cache.rs @@ -11,12 +11,123 @@ //! `register`s the result into the cache. //! - [`Reader::dialect`] returns the cache dialect. +use crate::array_util::{as_i64, as_str}; use crate::reader::{execute_with_reader, ColumnInfo, Reader, Spec, SqlDialect, TableInfo}; use crate::{naming, DataFrame, Result}; +use arrow::array::Array; use std::cell::{Cell, RefCell}; use std::collections::hash_map::DefaultHasher; use std::collections::HashSet; use std::hash::Hasher; +use std::time::{SystemTime, UNIX_EPOCH}; + +/// Runtime configuration for the result memo: TTL and LRU byte-budget. +#[derive(Debug, Clone)] +pub struct CacheConfig { + /// When `false`, reads always hit the primary. + pub enabled: bool, + /// Entries older than this are treated as misses and re-fetched. + pub ttl_secs: u64, + /// Cumulative byte budget across all memo entries before LRU eviction. + pub max_bytes: u64, +} + +impl Default for CacheConfig { + fn default() -> Self { + Self { + enabled: true, + ttl_secs: 300, + max_bytes: 512 * 1024 * 1024, + } + } +} + +/// Per-field overrides applied on top of an env-derived [`CacheConfig`]. +#[derive(Debug, Clone, Default)] +pub struct CacheConfigOverride { + pub enabled: Option, + pub ttl_secs: Option, + pub max_bytes: Option, +} + +impl CacheConfig { + /// Read configuration from the environment, falling back to defaults. + /// + /// - `GGSQL_CACHE_DISABLED` — set, non-empty and not `0` disables the cache. + /// - `GGSQL_CACHE_TTL` — TTL in seconds. + /// - `GGSQL_CACHE_MAX_BYTES` — byte budget, accepts `512mb`/`1gb`/bytes. + pub fn from_env() -> Self { + let mut cfg = Self::default(); + if std::env::var("GGSQL_CACHE_DISABLED") + .ok() + .filter(|v| !v.is_empty() && v != "0") + .is_some() + { + cfg.enabled = false; + } + if let Ok(v) = std::env::var("GGSQL_CACHE_TTL") { + if let Ok(secs) = v.trim().parse::() { + cfg.ttl_secs = secs; + } + } + if let Ok(v) = std::env::var("GGSQL_CACHE_MAX_BYTES") { + if let Some(bytes) = parse_human_bytes(&v) { + cfg.max_bytes = bytes; + } + } + cfg + } + + /// Apply per-field overrides; each `Some` value wins over `self`. + pub fn merge(self, over: CacheConfigOverride) -> Self { + Self { + enabled: over.enabled.unwrap_or(self.enabled), + ttl_secs: over.ttl_secs.unwrap_or(self.ttl_secs), + max_bytes: over.max_bytes.unwrap_or(self.max_bytes), + } + } +} + +/// Parse a byte count, accepting an optional `kb`/`mb`/`gb` suffix. +pub fn parse_human_bytes(s: &str) -> Option { + let s = s.trim(); + let lower = s.to_ascii_lowercase(); + let (num, mult) = if let Some(n) = lower.strip_suffix("gb") { + (n, 1024 * 1024 * 1024) + } else if let Some(n) = lower.strip_suffix("mb") { + (n, 1024 * 1024) + } else if let Some(n) = lower.strip_suffix("kb") { + (n, 1024) + } else { + (lower.as_str(), 1) + }; + num.trim().parse::().ok().map(|n| n * mult) +} + +/// Wall-clock milliseconds since the UNIX epoch. On a clock earlier than the +/// epoch (a misconfigured system clock) we return `i64::MAX` so existing +/// entries appear ancient and force a re-fetch. +fn now_ms() -> i64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_millis() as i64) + .unwrap_or(i64::MAX) +} + +/// Estimate the in-memory size of a DataFrame as the sum of its Arrow columns' +/// memory footprints. +fn estimate_bytes(df: &DataFrame) -> i64 { + df.get_columns() + .iter() + .map(|col| col.get_array_memory_size()) + .sum::() as i64 +} + +/// One memo row, trimmed to the fields consulted at runtime. +struct MemoEntry { + table_name: String, + fetched_at_epoch_ms: i64, +} pub struct CachingReader { /// Primary backend — the real data source. @@ -25,6 +136,8 @@ pub struct CachingReader { cache: Box, /// Connection URI of the primary. primary_uri: String, + /// TTL + byte-budget configuration for the result memo. + config: CacheConfig, /// Whether the metadata table has been created in the cache backend. meta_ready: Cell, /// Names registered into the cache. A source read that references one @@ -34,22 +147,39 @@ pub struct CachingReader { impl CachingReader { /// Construct a `CachingReader` from a primary reader, an in-memory cache - /// backend, and the primary's connection URI. The cache is owned by the - /// `CachingReader` and dropped with it. + /// backend, and the primary's connection URI, using environment-derived + /// cache configuration. The cache is owned by the `CachingReader` and + /// dropped with it. pub fn new( primary: Box, cache: Box, primary_uri: impl Into, + ) -> Self { + Self::with_config(primary, cache, primary_uri, CacheConfig::from_env()) + } + + /// Construct a `CachingReader` with explicit cache configuration. + pub fn with_config( + primary: Box, + cache: Box, + primary_uri: impl Into, + config: CacheConfig, ) -> Self { Self { primary, cache, primary_uri: primary_uri.into(), + config, meta_ready: Cell::new(false), resident: RefCell::new(HashSet::new()), } } + /// The active cache configuration. + pub fn cache_config(&self) -> &CacheConfig { + &self.config + } + /// Derive a stable cache key from the primary URI and the SQL text. fn cache_key(&self, sql: &str) -> String { let mut hasher = DefaultHasher::new(); @@ -65,8 +195,10 @@ impl CachingReader { return Ok(()); } let sql = format!( - "CREATE TABLE IF NOT EXISTS {} \ - (cache_key VARCHAR PRIMARY KEY, sql VARCHAR NOT NULL, table_name VARCHAR NOT NULL)", + "CREATE TABLE IF NOT EXISTS {} (\ + cache_key VARCHAR PRIMARY KEY, sql VARCHAR NOT NULL, table_name VARCHAR NOT NULL, \ + fetched_at_epoch_ms BIGINT NOT NULL, last_accessed_epoch_ms BIGINT NOT NULL, \ + byte_estimate BIGINT NOT NULL, row_count BIGINT NOT NULL)", naming::quote_ident(naming::CACHE_META_TABLE) ); self.cache.execute_sql(&sql)?; @@ -74,10 +206,10 @@ impl CachingReader { Ok(()) } - /// Look up the cache table holding the memoized result for `key`. - fn lookup_memo(&self, key: &str) -> Result> { + /// Look up the memo entry for `key`. + fn lookup_memo(&self, key: &str) -> Result> { let sql = format!( - "SELECT table_name FROM {} WHERE cache_key = {}", + "SELECT table_name, fetched_at_epoch_ms FROM {} WHERE cache_key = {}", naming::quote_ident(naming::CACHE_META_TABLE), naming::quote_literal(key), ); @@ -85,38 +217,95 @@ impl CachingReader { if df.height() == 0 { return Ok(None); } - let table = crate::array_util::as_str(df.column("table_name")?)? - .value(0) - .to_string(); - Ok(Some(table)) + let table_name = as_str(df.column("table_name")?)?.value(0).to_string(); + let fetched_at_epoch_ms = as_i64(df.column("fetched_at_epoch_ms")?)?.value(0); + Ok(Some(MemoEntry { + table_name, + fetched_at_epoch_ms, + })) } /// Record a memoized read in the metadata table. - fn insert_memo(&self, key: &str, sql: &str, table: &str) -> Result<()> { + fn insert_memo( + &self, + key: &str, + sql: &str, + table: &str, + byte_estimate: i64, + row_count: i64, + ) -> Result<()> { + let now = now_ms(); let stmt = format!( - "INSERT OR REPLACE INTO {} (cache_key, sql, table_name) VALUES ({}, {}, {})", + "INSERT OR REPLACE INTO {} \ + (cache_key, sql, table_name, fetched_at_epoch_ms, last_accessed_epoch_ms, \ + byte_estimate, row_count) \ + VALUES ({}, {}, {}, {}, {}, {}, {})", naming::quote_ident(naming::CACHE_META_TABLE), naming::quote_literal(key), naming::quote_literal(sql), naming::quote_literal(table), + now, + now, + byte_estimate, + row_count, ); self.cache.execute_sql(&stmt)?; Ok(()) } - /// The cache tables holding all currently-memoized results. - fn memo_tables(&self) -> Result> { - let sql = format!( - "SELECT table_name FROM {}", + /// Advance the last-accessed timestamp for `key` (LRU bookkeeping). + fn touch(&self, key: &str) -> Result<()> { + let stmt = format!( + "UPDATE {} SET last_accessed_epoch_ms = {} WHERE cache_key = {}", + naming::quote_ident(naming::CACHE_META_TABLE), + now_ms(), + naming::quote_literal(key), + ); + self.cache.execute_sql(&stmt)?; + Ok(()) + } + + /// Drop a single memo entry: delete its meta row and unregister the table. + fn drop_entry(&self, key: &str, table: &str) -> Result<()> { + let del = format!( + "DELETE FROM {} WHERE cache_key = {}", + naming::quote_ident(naming::CACHE_META_TABLE), + naming::quote_literal(key), + ); + self.cache.execute_sql(&del)?; + let _ = self.cache.unregister(table); + Ok(()) + } + + /// Evict LRU entries until the cumulative byte estimate is within `max_bytes`. + fn evict_over_budget(&self) -> Result<()> { + // Cast the SUM to BIGINT. + let sum_sql = format!( + "SELECT CAST(COALESCE(SUM(byte_estimate), 0) AS BIGINT) AS n FROM {}", naming::quote_ident(naming::CACHE_META_TABLE) ); - let df = self.cache.execute_sql(&sql)?; - let n = df.height(); - if n == 0 { - return Ok(Vec::new()); + loop { + let df = self.cache.execute_sql(&sum_sql)?; + let total = if df.height() == 0 { + 0 + } else { + as_i64(df.column("n")?)?.value(0) + }; + if total <= self.config.max_bytes as i64 { + return Ok(()); + } + let pick = format!( + "SELECT cache_key, table_name FROM {} ORDER BY last_accessed_epoch_ms ASC LIMIT 1", + naming::quote_ident(naming::CACHE_META_TABLE) + ); + let df = self.cache.execute_sql(&pick)?; + if df.height() == 0 { + return Ok(()); + } + let key = as_str(df.column("cache_key")?)?.value(0).to_string(); + let table = as_str(df.column("table_name")?)?.value(0).to_string(); + self.drop_entry(&key, &table)?; } - let col = crate::array_util::as_str(df.column("table_name")?)?; - Ok((0..n).map(|i| col.value(i).to_string()).collect()) } /// Whether `sql` references a cache-resident table by exact name. @@ -131,17 +320,34 @@ impl CachingReader { refs.iter().any(|t| resident.contains(t)) } - /// Drop every memoized result and empty the metadata table. + /// Drop every memoized result, one entry at a time. fn clear_memo(&self) -> Result<()> { self.ensure_meta_table()?; - for table in self.memo_tables()? { - let _ = self.cache.unregister(&table); - } - let del = format!( - "DELETE FROM {}", + let df = self.cache.execute_sql(&format!( + "SELECT cache_key, table_name FROM {}", naming::quote_ident(naming::CACHE_META_TABLE) - ); - self.cache.execute_sql(&del)?; + ))?; + let n = df.height(); + if n == 0 { + return Ok(()); + } + let keys = as_str(df.column("cache_key")?)?; + let tables = as_str(df.column("table_name")?)?; + let mut failures: Vec = Vec::new(); + for i in 0..n { + let key = keys.value(i); + let table = tables.value(i); + if let Err(e) = self.drop_entry(key, table) { + failures.push(format!("{key}: {e}")); + } + } + if !failures.is_empty() { + return Err(crate::GgsqlError::ReaderError(format!( + "clear_cache: {} cache entries failed to drop: {}", + failures.len(), + failures.join("; ") + ))); + } Ok(()) } } @@ -159,24 +365,44 @@ impl Reader for CachingReader { } self.ensure_meta_table()?; + + // With caching disabled the memo is never consulted or written. + if !self.config.enabled { + return self.primary.execute_sql(sql); + } + let key = self.cache_key(sql); - if let Some(table) = self.lookup_memo(&key)? { - return self - .cache - .execute_sql(&format!("SELECT * FROM {}", naming::quote_ident(&table))); + // Serve a fresh, still-present entry; on a stale or vanished entry drop + // it and fall through to a primary re-fetch. + if let Some(entry) = self.lookup_memo(&key)? { + let age_ms = (now_ms() - entry.fetched_at_epoch_ms).max(0); + let ttl_ms = (self.config.ttl_secs as i64).saturating_mul(1000); + if age_ms < ttl_ms { + let select = format!("SELECT * FROM {}", naming::quote_ident(&entry.table_name)); + if let Ok(df) = self.cache.execute_sql(&select) { + self.touch(&key)?; + return Ok(df); + } + } + self.drop_entry(&key, &entry.table_name)?; } let df = self.primary.execute_sql(sql)?; - if super::returns_rows(sql) { + // Cache row-returning reads only. + if super::returns_rows(sql) && df.width() > 0 { let table = naming::cache_result_table(&key); + let byte_estimate = estimate_bytes(&df); + let row_count = df.height() as i64; self.cache.register(&table, df.clone(), true)?; - self.insert_memo(&key, sql, &table)?; - } else { - // A source-side write or DDL on the primary can change the data - // behind a memoized read; drop the memo. - self.clear_memo()?; + if let Err(e) = self.insert_memo(&key, sql, &table, byte_estimate, row_count) { + // The result table registered but the meta row didn't: drop the + // orphan so it can't leak, then surface the error. + let _ = self.cache.unregister(&table); + return Err(e); + } + self.evict_over_budget()?; } Ok(df) @@ -225,6 +451,10 @@ impl Reader for CachingReader { true } + fn clear_cache(&self) -> Result<()> { + self.clear_memo() + } + // Schema introspection describes the real data source, so delegate to the // primary; the cache only holds synthetic `__ggsql_*` tables. fn list_catalogs(&self) -> Result> { @@ -603,11 +833,231 @@ mod behavior_tests { reader.execute_sql("INSERT INTO t VALUES (4)").unwrap(); let d2 = reader.execute_sql(q).unwrap(); + assert_eq!(d2.height(), 3, "the memo is served despite the write"); + + // After an explicit clear, the re-read sees the inserted row. + reader.clear_cache().unwrap(); + let d3 = reader.execute_sql(q).unwrap(); + assert_eq!(d3.height(), 4, "clear_cache forces a fresh primary read"); + } + + #[test] + fn test_default_config_enabled_ttl_300() { + let reader = CachingReader::with_config( + Box::new(DuckDBReader::new_in_memory().unwrap()), + Box::new(DuckDBReader::new_in_memory().unwrap()), + "test://primary", + CacheConfig::default(), + ); + assert!(reader.cache_config().enabled); + assert_eq!(reader.cache_config().ttl_secs, 300); + } + + #[test] + fn test_repeat_query_hits_primary_once() { + let inner = DuckDBReader::new_in_memory().unwrap(); + inner + .register("base", df! { "y" => vec![1_i64, 2, 3] }.unwrap(), true) + .unwrap(); + let (primary, log) = SpyReader::wrap(Box::new(inner)); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache, "test://primary"); + + let q = "SELECT y FROM base ORDER BY y"; + reader.execute_sql(q).unwrap(); + reader.execute_sql(q).unwrap(); + + let hits = log + .lock() + .unwrap() + .iter() + .filter(|s| s.as_str() == q) + .count(); + assert_eq!(hits, 1, "the repeat read is served from the memo"); + } + + #[test] + fn test_ttl_zero_always_misses() { + let inner = DuckDBReader::new_in_memory().unwrap(); + inner + .register("base", df! { "y" => vec![1_i64, 2, 3] }.unwrap(), true) + .unwrap(); + let (primary, log) = SpyReader::wrap(Box::new(inner)); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::with_config( + primary, + cache, + "test://primary", + CacheConfig { + enabled: true, + ttl_secs: 0, + max_bytes: 512 * 1024 * 1024, + }, + ); + + let q = "SELECT y FROM base ORDER BY y"; + reader.execute_sql(q).unwrap(); + reader.execute_sql(q).unwrap(); + + let hits = log + .lock() + .unwrap() + .iter() + .filter(|s| s.as_str() == q) + .count(); + assert_eq!(hits, 2, "ttl=0 must miss on every read"); + } + + #[test] + fn test_disabled_always_hits_primary() { + let inner = DuckDBReader::new_in_memory().unwrap(); + inner + .register("base", df! { "y" => vec![1_i64, 2, 3] }.unwrap(), true) + .unwrap(); + let (primary, log) = SpyReader::wrap(Box::new(inner)); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::with_config( + primary, + cache, + "test://primary", + CacheConfig { + enabled: false, + ttl_secs: 300, + max_bytes: 512 * 1024 * 1024, + }, + ); + + let q = "SELECT y FROM base ORDER BY y"; + reader.execute_sql(q).unwrap(); + reader.execute_sql(q).unwrap(); + + let hits = log + .lock() + .unwrap() + .iter() + .filter(|s| s.as_str() == q) + .count(); + assert_eq!(hits, 2, "a disabled cache always hits the primary"); + // The memo was never written. + let meta = reader + .execute_sql(&format!("SELECT * FROM {}", naming::CACHE_META_TABLE)) + .unwrap(); + assert_eq!(meta.height(), 0); + } + + #[test] + fn test_lru_evicts_oldest_when_over_budget() { + // A 1-byte budget forces eviction after every insert, so each cached + // read is gone by the next time it's requested. + let inner = DuckDBReader::new_in_memory().unwrap(); + inner + .register( + "base", + df! { "a" => vec![1_i64, 2, 3], "b" => vec![4_i64, 5, 6] }.unwrap(), + true, + ) + .unwrap(); + let (primary, log) = SpyReader::wrap(Box::new(inner)); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::with_config( + primary, + cache, + "test://primary", + CacheConfig { + enabled: true, + ttl_secs: 300, + max_bytes: 1, + }, + ); + + let q1 = "SELECT a FROM base ORDER BY a"; + let q2 = "SELECT b FROM base ORDER BY b"; + reader.execute_sql(q1).unwrap(); + reader.execute_sql(q2).unwrap(); + // q1 was evicted when q2 was inserted, so re-reading q1 hits the primary + // again. + reader.execute_sql(q1).unwrap(); + + let q1_hits = log + .lock() + .unwrap() + .iter() + .filter(|s| s.as_str() == q1) + .count(); assert_eq!( - d2.height(), - 4, - "re-read must see the inserted row, not the memo" + q1_hits, 2, + "the evicted entry is re-fetched from the primary" ); + // The budget is enforced: at most one entry resides. + let meta = reader + .execute_sql(&format!("SELECT * FROM {}", naming::CACHE_META_TABLE)) + .unwrap(); + assert!(meta.height() <= 1, "over-budget entries are evicted"); + } + + #[test] + fn test_missing_cached_table_self_heals() { + // If the cached result table vanishes out from under the memo, the entry + // is dropped and the read falls through to the primary instead of erroring. + let inner = DuckDBReader::new_in_memory().unwrap(); + inner + .register("base", df! { "y" => vec![1_i64, 2, 3] }.unwrap(), true) + .unwrap(); + let (primary, log) = SpyReader::wrap(Box::new(inner)); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache, "test://primary"); + + let q = "SELECT y FROM base ORDER BY y"; + reader.execute_sql(q).unwrap(); + + // Drop the cached table directly, leaving a dangling meta row. + let table = reader + .execute_sql(&format!( + "SELECT table_name FROM {}", + naming::CACHE_META_TABLE + )) + .unwrap(); + let table_name = crate::array_util::as_str(table.column("table_name").unwrap()) + .unwrap() + .value(0) + .to_string(); + reader + .cache + .execute_sql(&format!("DROP TABLE {}", naming::quote_ident(&table_name))) + .unwrap(); + + // The next read self-heals: it re-fetches from the primary. + let df = reader.execute_sql(q).unwrap(); + assert_eq!(df.height(), 3); + let hits = log + .lock() + .unwrap() + .iter() + .filter(|s| s.as_str() == q) + .count(); + assert_eq!(hits, 2, "the missing table forced a primary re-fetch"); + } + + #[test] + fn test_parse_human_bytes() { + assert_eq!(parse_human_bytes("1024"), Some(1024)); + assert_eq!(parse_human_bytes("512mb"), Some(512 * 1024 * 1024)); + assert_eq!(parse_human_bytes("1GB"), Some(1024 * 1024 * 1024)); + assert_eq!(parse_human_bytes(" 2kb "), Some(2 * 1024)); + assert_eq!(parse_human_bytes("nonsense"), None); + } + + #[test] + fn test_config_merge_uri_wins() { + let base = CacheConfig::default(); + let merged = base.merge(CacheConfigOverride { + enabled: Some(false), + ttl_secs: Some(60), + max_bytes: None, + }); + assert!(!merged.enabled); + assert_eq!(merged.ttl_secs, 60); + assert_eq!(merged.max_bytes, 512 * 1024 * 1024); } #[test] diff --git a/src/reader/mod.rs b/src/reader/mod.rs index 34a63f306..aaa961a83 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -856,6 +856,11 @@ pub trait Reader { false } + /// Clear any cached query results held by this reader. + fn clear_cache(&self) -> Result<()> { + Ok(()) + } + // ========================================================================= // Schema introspection // ========================================================================= From fc162a6dddaefb8083c6d9448657ba78c3715493 Mon Sep 17 00:00:00 2001 From: George Stagg Date: Tue, 30 Jun 2026 15:31:35 +0100 Subject: [PATCH 08/15] Add -- @uncache and make cell meta-commands line-oriented --- ggsql-jupyter/src/executor.rs | 163 +++++++++++++++++++++++++++++----- ggsql-jupyter/src/kernel.rs | 15 ++-- 2 files changed, 151 insertions(+), 27 deletions(-) diff --git a/ggsql-jupyter/src/executor.rs b/ggsql-jupyter/src/executor.rs index fab09a9e4..18f6e0116 100644 --- a/ggsql-jupyter/src/executor.rs +++ b/ggsql-jupyter/src/executor.rs @@ -2,7 +2,8 @@ //! //! This module handles the execution of ggsql queries using the existing //! ggsql library components (parser, DuckDB reader, Vega-Lite writer). -//! It supports dynamic reader switching via `-- @connect:` meta-commands. +//! It supports leading `--` meta-command lines. Each occupies its own comment +//! line, so a cell may stack them above a query that then runs as normal. use anyhow::Result; use ggsql::{ @@ -25,7 +26,7 @@ pub enum ExecutionResult { spec: String, // Vega-Lite JSON }, /// Connection changed via meta-command - ConnectionChanged { uri: String, display_name: String }, + ConnectionChanged { display_name: String }, } /// Generate a human-readable display name for a connection URI. @@ -101,13 +102,49 @@ pub fn host_for_uri(uri: &str) -> String { /// The `-- @connect:` meta-command prefix. const META_CONNECT_PREFIX: &str = "-- @connect:"; +/// The `-- @uncache` meta-command prefix. +const META_UNCACHE_PREFIX: &str = "-- @uncache"; + +/// A leading cell directive expressed as a `--` line comment. +#[derive(Debug, PartialEq, Eq)] +pub enum MetaCommand { + /// Switch the active reader to the given connection URI. + Connect(String), + /// Clear the active reader's cache. + Uncache, +} + +/// Split `code` into its first line and the remainder. +/// Handles `\n`, `\r\n`, and a lone `\r`. +fn split_first_line(code: &str) -> (&str, &str) { + match code.find(['\n', '\r']) { + None => (code, ""), + Some(i) => { + let line = &code[..i]; + let rest = &code[i..]; + let rest = rest + .strip_prefix("\r\n") + .or_else(|| rest.strip_prefix('\n')) + .or_else(|| rest.strip_prefix('\r')) + .unwrap_or(rest); + (line, rest) + } + } +} -/// Parse a `-- @connect: ` meta-command, returning the URI if present. -pub fn parse_meta_command(code: &str) -> Option { - let trimmed = code.trim(); - trimmed - .strip_prefix(META_CONNECT_PREFIX) - .map(|rest| rest.trim().to_string()) +/// Peel a single leading meta-command from `code`, returning it together with +/// the rest of the cell to process next. +pub fn take_leading_meta(code: &str) -> Option<(MetaCommand, &str)> { + let trimmed = code.trim_start(); + let (line, rest) = split_first_line(trimmed); + let line = line.trim(); + if let Some(uri) = line.strip_prefix(META_CONNECT_PREFIX) { + return Some((MetaCommand::Connect(uri.trim().to_string()), rest)); + } + if line == META_UNCACHE_PREFIX { + return Some((MetaCommand::Uncache, rest)); + } + None } /// Query executor maintaining persistent database connection @@ -158,18 +195,38 @@ impl QueryExecutor { /// Execute a ggsql query or meta-command /// /// This handles: - /// - `-- @connect: ` meta-commands for switching readers + /// - `-- @` meta-commands /// - Pure SQL queries (no VISUALISE) /// - ggsql queries with VISUALISE clauses pub fn execute(&mut self, code: &str) -> Result { tracing::debug!("Executing query: {} chars", code.len()); - // Check for meta-commands first - if let Some(uri) = parse_meta_command(code) { - tracing::info!("Meta-command: switching reader to {}", uri); - self.swap_reader(&uri)?; - let display_name = display_name_for_uri(&uri); - return Ok(ExecutionResult::ConnectionChanged { uri, display_name }); + // Apply any leading meta-command lines, then run whatever SQL remains. + let mut code = code; + let mut last_connect: Option = None; + while let Some((cmd, rest)) = take_leading_meta(code) { + match cmd { + MetaCommand::Connect(uri) => { + tracing::info!("Meta-command: switching reader to {}", uri); + self.swap_reader(&uri)?; + last_connect = Some(uri); + } + MetaCommand::Uncache => { + tracing::info!("Meta-command: clearing cache"); + self.reader.clear_cache()?; + } + } + code = rest; + } + + // A cell that was nothing but meta-commands. + if code.trim().is_empty() { + if let Some(uri) = last_connect { + let display_name = display_name_for_uri(&uri); + return Ok(ExecutionResult::ConnectionChanged { display_name }); + } + // An empty DataFrame renders no cell output. + return Ok(ExecutionResult::DataFrame(DataFrame::empty())); } // 1. Validate to check if there's a visualization @@ -238,16 +295,44 @@ mod tests { } #[test] - fn test_parse_meta_command() { + fn test_take_leading_meta_connect() { + // `-- @connect:` takes the rest of its line as the URI; the next line is + // the remainder. + assert_eq!( + take_leading_meta("-- @connect: duckdb://memory"), + Some((MetaCommand::Connect("duckdb://memory".to_string()), "")) + ); + assert_eq!( + take_leading_meta(" -- @connect: duckdb://my.db \nSELECT 1"), + Some(( + MetaCommand::Connect("duckdb://my.db".to_string()), + "SELECT 1" + )) + ); + } + + #[test] + fn test_take_leading_meta_uncache() { + assert_eq!( + take_leading_meta("-- @uncache"), + Some((MetaCommand::Uncache, "")) + ); assert_eq!( - parse_meta_command("-- @connect: duckdb://memory"), - Some("duckdb://memory".to_string()) + take_leading_meta("-- @uncache\nSELECT 1"), + Some((MetaCommand::Uncache, "SELECT 1")) ); assert_eq!( - parse_meta_command(" -- @connect: duckdb://my.db "), - Some("duckdb://my.db".to_string()) + take_leading_meta("-- @uncache \r\nSELECT 1"), + Some((MetaCommand::Uncache, "SELECT 1")) ); - assert_eq!(parse_meta_command("SELECT 1"), None); + // `-- @uncache foo` on one line is an ordinary SQL comment, not the directive. + assert_eq!(take_leading_meta("-- @uncache foo"), None); + } + + #[test] + fn test_take_leading_meta_non_directive() { + assert_eq!(take_leading_meta("SELECT 1"), None); + assert_eq!(take_leading_meta("-- a normal comment\nSELECT 1"), None); } #[test] @@ -259,6 +344,42 @@ mod tests { assert!(matches!(result, ExecutionResult::ConnectionChanged { .. })); } + #[test] + fn test_connect_then_runs_remaining_query() { + // A leading `-- @connect:` switches the reader and still runs the query + // below it in the same cell. + let mut executor = QueryExecutor::new().unwrap(); + let result = executor + .execute( + "-- @connect: duckdb://memory\nSELECT 1 AS x, 2 AS y VISUALISE x, y DRAW point", + ) + .unwrap(); + assert_eq!(executor.reader_uri(), "duckdb://memory"); + assert!(matches!(result, ExecutionResult::Visualization { .. })); + } + + #[test] + fn test_uncache_meta_command_clears_cache() { + // On the default reader (no cache) `clear_cache` is a no-op; this proves + // the dispatch arm is wired and yields an empty DataFrame. + let mut executor = QueryExecutor::new().unwrap(); + let result = executor.execute("-- @uncache").unwrap(); + match result { + ExecutionResult::DataFrame(df) => assert_eq!(df.width(), 0), + other => panic!("expected empty DataFrame, got {other:?}"), + } + } + + #[test] + fn test_uncache_then_runs_remaining_query() { + // A leading `-- @uncache` clears the cache and still runs the query below. + let mut executor = QueryExecutor::new().unwrap(); + let result = executor + .execute("-- @uncache\nSELECT 1 AS x, 2 AS y VISUALISE x, y DRAW point") + .unwrap(); + assert!(matches!(result, ExecutionResult::Visualization { .. })); + } + #[test] fn test_display_name_for_uri() { assert_eq!(display_name_for_uri("duckdb://memory"), "DuckDB (memory)"); diff --git a/ggsql-jupyter/src/kernel.rs b/ggsql-jupyter/src/kernel.rs index b9844c602..85e82d0ad 100644 --- a/ggsql-jupyter/src/kernel.rs +++ b/ggsql-jupyter/src/kernel.rs @@ -323,21 +323,24 @@ impl KernelServer { } // Execute the query + let uri_before = self.executor.reader_uri().to_string(); let result = self.executor.execute(code); + let connection_changed = self.executor.reader_uri() != uri_before; + if connection_changed { + let uri = self.executor.reader_uri().to_string(); + self.open_connection_comm(&uri).await?; + } match result { Ok(exec_result) => { - // If the connection changed, open a new connection comm - let is_connection_changed = + // A bare connection change renders nothing. + let suppress_output = matches!(&exec_result, ExecutionResult::ConnectionChanged { .. }); - if let ExecutionResult::ConnectionChanged { ref uri, .. } = &exec_result { - self.open_connection_comm(uri).await?; - } // Send execute_result (not display_data) // Per Jupyter spec: execute_result includes execution_count // Only send if there's something to display (DDL returns None) - if !silent && !is_connection_changed { + if !silent && !suppress_output { if let Some(display_data) = format_display_data(exec_result, &hints) { // Build message content, including output_location if present let mut content = json!({ From 4f2cd554d2cb2066268ef6a53c3e6e07acc64c92 Mon Sep 17 00:00:00 2001 From: George Stagg Date: Tue, 30 Jun 2026 15:55:30 +0100 Subject: [PATCH 09/15] Configure the cache via URI query parameters --- CHANGELOG.md | 2 + src/CLAUDE.md | 2 +- src/reader/connection.rs | 99 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 101 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 327a651f7..916238567 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ - New caching layer that wraps any `Reader` with an in-memory, writeable cache backend (currently duckdb or sqlite), making write-constrained databases usable and avoiding repeated remote reads during interactive iteration. + Memoized reads are bounded by a TTL and an LRU byte budget, configurable per + connection. The cache can be cleared mid-session with the `-- @uncache` meta-command. ## 0.4.1 - 2026-06-22 diff --git a/src/CLAUDE.md b/src/CLAUDE.md index 5f94c2589..321a2fdcf 100644 --- a/src/CLAUDE.md +++ b/src/CLAUDE.md @@ -52,7 +52,7 @@ Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding `SqlDialect` trait in `mod.rs` lets each driver supply its own type names, information-schema queries, and spatial helper methods (`sql_st_transform`, `sql_geometry_to_wkb`, `sql_geometry_bbox`, `sql_ensure_geometry`, `sql_select_replace`, `sql_spatial_setup`). -**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins, the `__ggsql_cache_meta__` table, and reads that reference a cache-resident internal table, which go to the cache. **`execute_sql_cached` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache; it defaults to `execute_sql` so a plain reader runs everything on one connection. Cache routing is by **exact-identifier membership** in the set of tables registered into the cache. Memoization keys on `hash(primary_uri + sql)` and is tracked in the `__ggsql_cache_meta__` table inside the cache backend. Each memoized read is bounded by a **TTL** (default 300s) and the whole memo by an **LRU byte budget** (default 512 MB); both are configurable via `CacheConfig` (env `GGSQL_CACHE_DISABLED`/`GGSQL_CACHE_TTL`/`GGSQL_CACHE_MAX_BYTES`). Pure/non-visual SQL (CLI table fallback, Jupyter) goes through `execute_sql` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `+://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default. +**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins, the `__ggsql_cache_meta__` table, and reads that reference a cache-resident internal table, which go to the cache. **`execute_sql_cached` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache; it defaults to `execute_sql` so a plain reader runs everything on one connection. Cache routing is by **exact-identifier membership** in the set of tables registered into the cache. Memoization keys on `hash(primary_uri + sql)` and is tracked in the `__ggsql_cache_meta__` table inside the cache backend. Each memoized read is bounded by a **TTL** (default 300s) and the whole memo by an **LRU byte budget** (default 512 MB); both are configurable via `CacheConfig` (env `GGSQL_CACHE_DISABLED`/`GGSQL_CACHE_TTL`/`GGSQL_CACHE_MAX_BYTES`, or per-connection URI query parameters `?ttl=…&max_bytes=…&disabled=…`. The `__ggsql_cache_meta__` table is queryable for introspection (`SELECT * FROM __ggsql_cache_meta__`). Pure/non-visual SQL (CLI table fallback, Jupyter) goes through `execute_sql` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `+://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default. ### `execute/` diff --git a/src/reader/connection.rs b/src/reader/connection.rs index b699e60f3..7c1fba364 100644 --- a/src/reader/connection.rs +++ b/src/reader/connection.rs @@ -29,6 +29,45 @@ pub fn split_cache_uri(uri: &str) -> Option<(String, String)> { Some((format!("{}://{}", primary, rest), cache.to_string())) } +/// Cache-config keys recognised in a connection URI's trailing `?` query string. +#[cfg(any(feature = "duckdb", feature = "sqlite"))] +const KNOWN_CACHE_PARAMS: &[&str] = &["ttl", "max_bytes", "disabled"]; + +/// Pull cache-config keys out of a connection URI's trailing `?key=value&…` +/// query string, returning the URI with those keys removed plus the overrides. +#[cfg(any(feature = "duckdb", feature = "sqlite"))] +fn strip_cache_params(uri: &str) -> (String, crate::reader::cache::CacheConfigOverride) { + use crate::reader::cache::{parse_human_bytes, CacheConfigOverride}; + + let mut over = CacheConfigOverride::default(); + let Some((body, query)) = uri.split_once('?') else { + return (uri.to_string(), over); + }; + + let mut kept: Vec<&str> = Vec::new(); + for segment in query.split('&') { + match segment.split_once('=') { + Some((key, value)) if KNOWN_CACHE_PARAMS.contains(&key) => match key { + "ttl" => over.ttl_secs = value.trim().parse::().ok(), + "max_bytes" => over.max_bytes = parse_human_bytes(value), + "disabled" => { + let v = value.trim().to_ascii_lowercase(); + over.enabled = Some(!matches!(v.as_str(), "1" | "true" | "yes")); + } + _ => unreachable!("validated against KNOWN_CACHE_PARAMS"), + }, + // Not a cache key: keep it on the URI for the primary reader. + _ => kept.push(segment), + } + } + + if kept.is_empty() { + (body.to_string(), over) + } else { + (format!("{}?{}", body, kept.join("&")), over) + } +} + /// Map a cache-backend scheme to its in-memory connection URI. #[cfg(any(feature = "duckdb", feature = "sqlite"))] fn cache_uri(scheme: &str) -> Result<&'static str> { @@ -105,12 +144,17 @@ pub fn reader_from_uri(uri: &str) -> Result> { if let Some((primary_uri, cache_scheme)) = split_cache_uri(uri) { #[cfg(any(feature = "duckdb", feature = "sqlite"))] { + use crate::reader::cache::CacheConfig; + + let (primary_uri, over) = strip_cache_params(&primary_uri); + let config = CacheConfig::from_env().merge(over); let primary = build_reader(&primary_uri)?; let cache = build_reader(cache_uri(&cache_scheme)?)?; - return Ok(Box::new(crate::reader::CachingReader::new( + return Ok(Box::new(crate::reader::CachingReader::with_config( primary, cache, primary_uri, + config, ))); } #[cfg(not(any(feature = "duckdb", feature = "sqlite")))] @@ -217,4 +261,57 @@ mod tests { assert_eq!(split_cache_uri("+duckdb://x"), None); assert_eq!(split_cache_uri("odbc+://x"), None); } + + #[cfg(any(feature = "duckdb", feature = "sqlite"))] + #[test] + fn test_strip_cache_params_parses_known_keys() { + let (uri, over) = strip_cache_params("duckdb://memory?ttl=600"); + assert_eq!(uri, "duckdb://memory"); + assert_eq!(over.ttl_secs, Some(600)); + assert_eq!(over.max_bytes, None); + assert_eq!(over.enabled, None); + + let (uri, over) = strip_cache_params("duckdb://memory?max_bytes=256mb&disabled=true"); + assert_eq!(uri, "duckdb://memory"); + assert_eq!(over.max_bytes, Some(256 * 1024 * 1024)); + assert_eq!(over.enabled, Some(false)); + } + + #[cfg(any(feature = "duckdb", feature = "sqlite"))] + #[test] + fn test_strip_cache_params_keeps_non_cache_segments() { + // A non-cache `?key=` tail contributes no overrides and is left in place. + let (uri, over) = strip_cache_params("odbc://DSN=foo?warehouse=PROD"); + assert_eq!(uri, "odbc://DSN=foo?warehouse=PROD"); + assert_eq!(over.ttl_secs, None); + + // ODBC body with `=`/`;` and no `?` is returned verbatim. + let (uri, over) = strip_cache_params("odbc://Driver=Snowflake;Server=x"); + assert_eq!(uri, "odbc://Driver=Snowflake;Server=x"); + assert_eq!(over.enabled, None); + + // Cache keys are extracted; other params (e.g. ODBC settings) are kept. + let (uri, over) = strip_cache_params("odbc://DSN=foo?warehouse=PROD&ttl=10&max_bytes=8mb"); + assert_eq!(uri, "odbc://DSN=foo?warehouse=PROD"); + assert_eq!(over.ttl_secs, Some(10)); + assert_eq!(over.max_bytes, Some(8 * 1024 * 1024)); + + // When every param is a cache key, the `?` is dropped entirely. + let (uri, over) = strip_cache_params("duckdb://memory?ttl=10&disabled=1"); + assert_eq!(uri, "duckdb://memory"); + assert_eq!(over.ttl_secs, Some(10)); + assert_eq!(over.enabled, Some(false)); + + // Plain URI, no query string. + let (uri, over) = strip_cache_params("duckdb://memory"); + assert_eq!(uri, "duckdb://memory"); + assert_eq!(over.ttl_secs, None); + } + + #[cfg(all(feature = "duckdb", feature = "sqlite"))] + #[test] + fn test_reader_from_uri_applies_uri_cache_params() { + // A composite URI with a cache-param tail builds a CachingReader + assert!(reader_from_uri("duckdb+sqlite://memory?ttl=600&max_bytes=64mb").is_ok()); + } } From 55b04c22469d1a739cb0ffb2a3cfc0bd99fbd1b7 Mon Sep 17 00:00:00 2001 From: George Stagg Date: Wed, 1 Jul 2026 09:38:06 +0100 Subject: [PATCH 10/15] Namespace cache query params --- src/CLAUDE.md | 2 +- src/reader/connection.rs | 24 ++++++++++++++---------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/CLAUDE.md b/src/CLAUDE.md index 321a2fdcf..c674b9808 100644 --- a/src/CLAUDE.md +++ b/src/CLAUDE.md @@ -52,7 +52,7 @@ Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding `SqlDialect` trait in `mod.rs` lets each driver supply its own type names, information-schema queries, and spatial helper methods (`sql_st_transform`, `sql_geometry_to_wkb`, `sql_geometry_bbox`, `sql_ensure_geometry`, `sql_select_replace`, `sql_spatial_setup`). -**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins, the `__ggsql_cache_meta__` table, and reads that reference a cache-resident internal table, which go to the cache. **`execute_sql_cached` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache; it defaults to `execute_sql` so a plain reader runs everything on one connection. Cache routing is by **exact-identifier membership** in the set of tables registered into the cache. Memoization keys on `hash(primary_uri + sql)` and is tracked in the `__ggsql_cache_meta__` table inside the cache backend. Each memoized read is bounded by a **TTL** (default 300s) and the whole memo by an **LRU byte budget** (default 512 MB); both are configurable via `CacheConfig` (env `GGSQL_CACHE_DISABLED`/`GGSQL_CACHE_TTL`/`GGSQL_CACHE_MAX_BYTES`, or per-connection URI query parameters `?ttl=…&max_bytes=…&disabled=…`. The `__ggsql_cache_meta__` table is queryable for introspection (`SELECT * FROM __ggsql_cache_meta__`). Pure/non-visual SQL (CLI table fallback, Jupyter) goes through `execute_sql` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `+://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default. +**Caching layer.** `CachingReader` (`cache.rs`) wraps a primary reader plus an in-memory `CacheBackend`, splitting work across two `Reader` surfaces. **`execute_sql` = source**: base reads of the user's data plus user setup/DML run on the primary (with result memoization), except `ggsql:` builtins, the `__ggsql_cache_meta__` table, and reads that reference a cache-resident internal table, which go to the cache. **`execute_sql_cached` = compute**: all dialect-generated/derived SQL (schema probes, stats, projection/map transforms, spatial setup, final layer queries — everything operating on `__ggsql_*` tables) runs on the cache; it defaults to `execute_sql` so a plain reader runs everything on one connection. Cache routing is by **exact-identifier membership** in the set of tables registered into the cache. Memoization keys on `hash(primary_uri + sql)` and is tracked in the `__ggsql_cache_meta__` table inside the cache backend. Each memoized read is bounded by a **TTL** (default 300s) and the whole memo by an **LRU byte budget** (default 512 MB); both are configurable via `CacheConfig` (env `GGSQL_CACHE_DISABLED`/`GGSQL_CACHE_TTL`/`GGSQL_CACHE_MAX_BYTES`, or per-connection URI query parameters `?cache_ttl=…&cache_max_bytes=…&cache_disabled=…`). The `__ggsql_cache_meta__` table is queryable for introspection (`SELECT * FROM __ggsql_cache_meta__`). Pure/non-visual SQL (CLI table fallback, Jupyter) goes through `execute_sql` so it reads the primary rather than the empty cache. `Reader::materialize_table` (default = `CREATE TEMP TABLE` on the reader, no Rust roundtrip) is overridden to read the body via the source surface and `register()` the result into the cache, so the primary is never written to; `Reader::caches_sources()` (default `false`, `true` for `CachingReader`) gates the executor's per-layer source staging: file sources are staged on the cache surface, while identifiers go through `materialize_table`, which routes the read to the cache (CTEs, builtins, cache-resident tables) or the primary as needed. `dialect()` returns the **cache** dialect. Selected via the composite `+://` scheme (`reader_from_uri` / `split_cache_uri`) or the CLI `--cache` flag; off by default. ### `execute/` diff --git a/src/reader/connection.rs b/src/reader/connection.rs index 7c1fba364..557712c73 100644 --- a/src/reader/connection.rs +++ b/src/reader/connection.rs @@ -31,7 +31,7 @@ pub fn split_cache_uri(uri: &str) -> Option<(String, String)> { /// Cache-config keys recognised in a connection URI's trailing `?` query string. #[cfg(any(feature = "duckdb", feature = "sqlite"))] -const KNOWN_CACHE_PARAMS: &[&str] = &["ttl", "max_bytes", "disabled"]; +const KNOWN_CACHE_PARAMS: &[&str] = &["cache_ttl", "cache_max_bytes", "cache_disabled"]; /// Pull cache-config keys out of a connection URI's trailing `?key=value&…` /// query string, returning the URI with those keys removed plus the overrides. @@ -48,9 +48,9 @@ fn strip_cache_params(uri: &str) -> (String, crate::reader::cache::CacheConfigOv for segment in query.split('&') { match segment.split_once('=') { Some((key, value)) if KNOWN_CACHE_PARAMS.contains(&key) => match key { - "ttl" => over.ttl_secs = value.trim().parse::().ok(), - "max_bytes" => over.max_bytes = parse_human_bytes(value), - "disabled" => { + "cache_ttl" => over.ttl_secs = value.trim().parse::().ok(), + "cache_max_bytes" => over.max_bytes = parse_human_bytes(value), + "cache_disabled" => { let v = value.trim().to_ascii_lowercase(); over.enabled = Some(!matches!(v.as_str(), "1" | "true" | "yes")); } @@ -265,13 +265,14 @@ mod tests { #[cfg(any(feature = "duckdb", feature = "sqlite"))] #[test] fn test_strip_cache_params_parses_known_keys() { - let (uri, over) = strip_cache_params("duckdb://memory?ttl=600"); + let (uri, over) = strip_cache_params("duckdb://memory?cache_ttl=600"); assert_eq!(uri, "duckdb://memory"); assert_eq!(over.ttl_secs, Some(600)); assert_eq!(over.max_bytes, None); assert_eq!(over.enabled, None); - let (uri, over) = strip_cache_params("duckdb://memory?max_bytes=256mb&disabled=true"); + let (uri, over) = + strip_cache_params("duckdb://memory?cache_max_bytes=256mb&cache_disabled=true"); assert_eq!(uri, "duckdb://memory"); assert_eq!(over.max_bytes, Some(256 * 1024 * 1024)); assert_eq!(over.enabled, Some(false)); @@ -291,13 +292,14 @@ mod tests { assert_eq!(over.enabled, None); // Cache keys are extracted; other params (e.g. ODBC settings) are kept. - let (uri, over) = strip_cache_params("odbc://DSN=foo?warehouse=PROD&ttl=10&max_bytes=8mb"); - assert_eq!(uri, "odbc://DSN=foo?warehouse=PROD"); + let (uri, over) = + strip_cache_params("odbc://DSN=foo?ttl=99&cache_ttl=10&cache_max_bytes=8mb"); + assert_eq!(uri, "odbc://DSN=foo?ttl=99"); assert_eq!(over.ttl_secs, Some(10)); assert_eq!(over.max_bytes, Some(8 * 1024 * 1024)); // When every param is a cache key, the `?` is dropped entirely. - let (uri, over) = strip_cache_params("duckdb://memory?ttl=10&disabled=1"); + let (uri, over) = strip_cache_params("duckdb://memory?cache_ttl=10&cache_disabled=1"); assert_eq!(uri, "duckdb://memory"); assert_eq!(over.ttl_secs, Some(10)); assert_eq!(over.enabled, Some(false)); @@ -312,6 +314,8 @@ mod tests { #[test] fn test_reader_from_uri_applies_uri_cache_params() { // A composite URI with a cache-param tail builds a CachingReader - assert!(reader_from_uri("duckdb+sqlite://memory?ttl=600&max_bytes=64mb").is_ok()); + assert!( + reader_from_uri("duckdb+sqlite://memory?cache_ttl=600&cache_max_bytes=64mb").is_ok() + ); } } From e8b6991fce907a93d2fbd52bba0ec2e248477caa Mon Sep 17 00:00:00 2001 From: George Stagg Date: Wed, 1 Jul 2026 09:38:25 +0100 Subject: [PATCH 11/15] Remove now-irrelevant test --- src/reader/cache.rs | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/src/reader/cache.rs b/src/reader/cache.rs index 7b7a14ce9..1d57ba0af 100644 --- a/src/reader/cache.rs +++ b/src/reader/cache.rs @@ -815,32 +815,6 @@ mod behavior_tests { assert_eq!(df.height(), 3); } - #[test] - fn test_source_write_invalidates_memo() { - // Memoize a base read, mutate the primary, then re-read: the memo must be - // invalidated by the non-row-returning statement so the second read is fresh. - let primary = DuckDBReader::new_in_memory().unwrap(); - primary - .register("t", df! { "v" => vec![1_i64, 2, 3] }.unwrap(), true) - .unwrap(); - let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); - let reader = CachingReader::new(Box::new(primary), cache, "test://primary"); - - let q = "SELECT v FROM t"; - let d1 = reader.execute_sql(q).unwrap(); - assert_eq!(d1.height(), 3); - - reader.execute_sql("INSERT INTO t VALUES (4)").unwrap(); - - let d2 = reader.execute_sql(q).unwrap(); - assert_eq!(d2.height(), 3, "the memo is served despite the write"); - - // After an explicit clear, the re-read sees the inserted row. - reader.clear_cache().unwrap(); - let d3 = reader.execute_sql(q).unwrap(); - assert_eq!(d3.height(), 4, "clear_cache forces a fresh primary read"); - } - #[test] fn test_default_config_enabled_ttl_300() { let reader = CachingReader::with_config( From 70d387224896362905de65bdc7457535bab094cc Mon Sep 17 00:00:00 2001 From: George Stagg Date: Wed, 1 Jul 2026 09:38:58 +0100 Subject: [PATCH 12/15] Switch ordering in drop_entry --- src/reader/cache.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/reader/cache.rs b/src/reader/cache.rs index 1d57ba0af..3e4e962bc 100644 --- a/src/reader/cache.rs +++ b/src/reader/cache.rs @@ -265,15 +265,15 @@ impl CachingReader { Ok(()) } - /// Drop a single memo entry: delete its meta row and unregister the table. + /// Drop a single memo entry: unregister the table, then delete its meta row. fn drop_entry(&self, key: &str, table: &str) -> Result<()> { + self.cache.unregister(table)?; let del = format!( "DELETE FROM {} WHERE cache_key = {}", naming::quote_ident(naming::CACHE_META_TABLE), naming::quote_literal(key), ); self.cache.execute_sql(&del)?; - let _ = self.cache.unregister(table); Ok(()) } From e750c14bec5ff26c5ecce3ae8a08a5ef70803022 Mon Sep 17 00:00:00 2001 From: George Stagg Date: Wed, 1 Jul 2026 11:59:32 +0100 Subject: [PATCH 13/15] Stage primary sources when executing a query with mixed residency tables --- src/execute/cte.rs | 228 ++++++++++++++++++++++++++++++++++++++++++-- src/execute/mod.rs | 3 +- src/naming.rs | 35 +++++++ src/reader/cache.rs | 78 +++++++++++++++ 4 files changed, 333 insertions(+), 11 deletions(-) diff --git a/src/execute/cte.rs b/src/execute/cte.rs index e843fa074..5b4522282 100644 --- a/src/execute/cte.rs +++ b/src/execute/cte.rs @@ -130,6 +130,80 @@ pub fn transform_cte_references(sql: &str, cte_names: &HashSet) -> Strin result } +/// Stage the primary base tables in a body destined for the cache. +/// +/// A body that is entirely primary (no cache-resident reference) or entirely +/// cache-resident is returned unchanged. +pub fn transform_source_references(sql: &str, reader: &dyn Reader) -> Result { + if !reader.caches_sources() { + return Ok(sql.to_string()); + } + + // Discover the names referenced after FROM/JOIN — including quoted (`"foo"`) + // and schema-qualified (`schema.table`) names. Subqueries contribute no name. + let re = match regex::Regex::new( + r#"(?i)\b(?:FROM|JOIN)\s+("(?:[^"]|"")*"|[A-Za-z_][A-Za-z0-9_$]*(?:\.[A-Za-z_][A-Za-z0-9_$]*)*)"#, + ) { + Ok(re) => re, + Err(_) => return Ok(sql.to_string()), + }; + + let mut seen = HashSet::new(); + let mut refs: Vec = Vec::new(); + for caps in re.captures_iter(sql) { + if let Some(m) = caps.get(1) { + let name = m.as_str().to_string(); + if seen.insert(name.clone()) { + refs.push(name); + } + } + } + + // A name resolves against the cache backend (rather than the primary) if it + // is an internal `__ggsql_*` table or a `ggsql:` builtin dataset. + let is_cache_resolvable = |name: &str| { + naming::is_internal_table(&naming::unquote_ident(name)) || name.starts_with("ggsql:") + }; + + let has_cache_ref = refs.iter().any(|r| is_cache_resolvable(r)); + let primary_refs: Vec<&String> = refs.iter().filter(|r| !is_cache_resolvable(r)).collect(); + + // Only mixed bodies need staging. + if !has_cache_ref || primary_refs.is_empty() { + return Ok(sql.to_string()); + } + + let mut result = sql.to_string(); + for (index, raw) in primary_refs.into_iter().enumerate() { + let staged = naming::staged_source_table(index); + reader.materialize_table(&staged, &[], &format!("SELECT * FROM {}", raw))?; + let target = naming::quote_ident(&staged); + + // Rewrite FROM/JOIN/qualified references to the staged copy. + let patterns = [ + ( + format!(r"(?i)(\bFROM\s+){}(\s|,|\)|$)", regex::escape(raw)), + format!("${{1}}{}${{2}}", target), + ), + ( + format!(r"(?i)(\bJOIN\s+){}(\s|,|\)|$)", regex::escape(raw)), + format!("${{1}}{}${{2}}", target), + ), + ( + format!(r"(?i)\b{}(\.[a-zA-Z_][a-zA-Z0-9_]*)", regex::escape(raw)), + format!("{}${{1}}", target), + ), + ]; + for (pattern, replacement) in patterns { + if let Ok(re) = regex::Regex::new(&pattern) { + result = re.replace_all(&result, replacement.as_str()).to_string(); + } + } + } + + Ok(result) +} + /// Materialize CTEs as temporary tables in the database /// /// Creates a temp table for each CTE in declaration order. When a CTE @@ -143,6 +217,8 @@ pub fn materialize_ctes(ctes: &[CteDefinition], reader: &dyn Reader) -> Result Option<(String, String)> { pub fn transform_global_sql( source_tree: &SourceTree, materialized_ctes: &HashSet, -) -> (Vec, Option) { + reader: &dyn Reader, +) -> Result<(Vec, Option)> { // Collect side-effect statements (CREATE, INSERT, UPDATE, DELETE) that // need to run before the main query. These appear alongside a trailing // SELECT or VISUALISE FROM. @@ -245,14 +322,15 @@ pub fn transform_global_sql( }); if let Some(select_sql) = select_sql { - return ( + let select_sql = transform_cte_references(&select_sql, materialized_ctes); + return Ok(( side_effects, - Some(transform_cte_references(&select_sql, materialized_ctes)), - ); + Some(transform_source_references(&select_sql, reader)?), + )); } if !has_executable_sql(source_tree) { - return (vec![], None); + return Ok((vec![], None)); } // We have non-SELECT executable SQL and/or VISUALISE FROM. @@ -266,18 +344,24 @@ pub fn transform_global_sql( ) .map(|table| { let q = format!("SELECT * FROM {}", table); - transform_cte_references(&q, materialized_ctes) - }); + let q = transform_cte_references(&q, materialized_ctes); + transform_source_references(&q, reader) + }) + .transpose()?; if !side_effects.is_empty() || viz_from_query.is_some() { - (side_effects, viz_from_query) + Ok((side_effects, viz_from_query)) } else { // has_executable_sql was true but we found no specific statements or // VISUALISE FROM — fall back to extract_sql as the query. let query = source_tree .extract_sql() - .map(|s| transform_cte_references(&s, materialized_ctes)); - (vec![], query) + .map(|s| { + let s = transform_cte_references(&s, materialized_ctes); + transform_source_references(&s, reader) + }) + .transpose()?; + Ok((vec![], query)) } } @@ -469,6 +553,130 @@ mod tests { } } + /// Minimal reader that records `materialize_table` calls, used to unit-test + /// source-reference staging without a database. + struct MockReader { + caches: bool, + staged: std::cell::RefCell>, + } + + impl MockReader { + fn new(caches: bool) -> Self { + Self { + caches, + staged: std::cell::RefCell::new(Vec::new()), + } + } + } + + impl Reader for MockReader { + fn execute_sql(&self, _sql: &str) -> Result { + unreachable!("staging must not touch execute_sql in these tests") + } + fn register(&self, _name: &str, _df: crate::DataFrame, _replace: bool) -> Result<()> { + Ok(()) + } + fn execute(&self, _query: &str) -> Result { + unreachable!() + } + fn caches_sources(&self) -> bool { + self.caches + } + fn materialize_table( + &self, + name: &str, + _column_aliases: &[String], + body_sql: &str, + ) -> Result<()> { + self.staged + .borrow_mut() + .push((name.to_string(), body_sql.to_string())); + Ok(()) + } + } + + #[test] + fn test_transform_source_references_quoted_primary_ref() { + let reader = MockReader::new(true); + // A quoted primary base table joined against a cache-resident CTE temp. + let sql = "SELECT * FROM \"__ggsql_cte_t__\" JOIN \"my base\" ON 1 = 1"; + let out = transform_source_references(sql, &reader).unwrap(); + + let staged = reader.staged.borrow(); + assert_eq!(staged.len(), 1); + assert_eq!(staged[0].1, "SELECT * FROM \"my base\""); + assert!(out.contains("__ggsql_staged_0_")); + assert!(!out.contains("JOIN \"my base\"")); + } + + #[test] + fn test_transform_source_references_non_caching_reader_unchanged() { + let reader = MockReader::new(false); + let sql = "SELECT * FROM \"__ggsql_cte_t__\" JOIN base ON base.k = 1"; + let out = transform_source_references(sql, &reader).unwrap(); + assert_eq!(out, sql); + assert!(reader.staged.borrow().is_empty()); + } + + #[test] + fn test_transform_source_references_all_primary_unchanged() { + let reader = MockReader::new(true); + let out = transform_source_references("SELECT * FROM a JOIN base ON a.k = base.k", &reader) + .unwrap(); + assert_eq!(out, "SELECT * FROM a JOIN base ON a.k = base.k"); + assert!(reader.staged.borrow().is_empty()); + } + + #[test] + fn test_transform_source_references_stages_mixed_body() { + let reader = MockReader::new(true); + // A cache-resident CTE temp joined against a primary base table. + let sql = "SELECT * FROM \"__ggsql_cte_t__\" JOIN base ON base.k = 1"; + let out = transform_source_references(sql, &reader).unwrap(); + + let staged = reader.staged.borrow(); + assert_eq!(staged.len(), 1, "base should be staged exactly once"); + assert!(staged[0].0.starts_with("__ggsql_staged_0_")); + assert_eq!(staged[0].1, "SELECT * FROM base"); + + assert!(out.contains("__ggsql_staged_0_")); + assert!(out.contains("\"__ggsql_cte_t__\"")); // cte ref preserved + assert!(!out.contains("JOIN base")); + } + + #[test] + fn test_transform_source_references_reversed_from_join() { + let reader = MockReader::new(true); + // Primary base table in FROM, cache-resident CTE temp in JOIN. + let sql = "SELECT * FROM base JOIN \"__ggsql_cte_t__\" ON base.k = 1"; + let out = transform_source_references(sql, &reader).unwrap(); + + assert_eq!(reader.staged.borrow().len(), 1); + assert!(out.contains("FROM \"__ggsql_staged_0_")); + assert!(!out.contains("FROM base")); + } + + #[test] + fn test_transform_source_references_schema_qualified() { + let reader = MockReader::new(true); + let sql = "SELECT * FROM \"__ggsql_cte_t__\" JOIN myschema.base ON base.k = 1"; + let out = transform_source_references(sql, &reader).unwrap(); + + let staged = reader.staged.borrow(); + assert_eq!(staged.len(), 1); + assert_eq!(staged[0].1, "SELECT * FROM myschema.base"); + assert!(out.contains("__ggsql_staged_0_")); + assert!(!out.contains("JOIN myschema.base")); + } + + #[test] + fn test_transform_source_references_same_ref_staged_once() { + let reader = MockReader::new(true); + let sql = "SELECT * FROM base JOIN \"__ggsql_cte_t__\" ON base.k = base.j"; + let _ = transform_source_references(sql, &reader).unwrap(); + assert_eq!(reader.staged.borrow().len(), 1); + } + #[test] fn test_split_with_query_basic() { let sql = "WITH cte AS (SELECT * FROM x) SELECT * FROM cte"; diff --git a/src/execute/mod.rs b/src/execute/mod.rs index 2baa2032e..0e6379d01 100644 --- a/src/execute/mod.rs +++ b/src/execute/mod.rs @@ -1152,7 +1152,8 @@ pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result String { ) } +/// Generate temp table name for a primary base table staged into the cache. +/// +/// Format: `__ggsql_staged____` +/// +/// # Example +/// ``` +/// use ggsql::naming; +/// let table = naming::staged_source_table(1); +/// assert!(table.starts_with("__ggsql_staged_1_")); +/// assert!(table.ends_with("__")); +/// ``` +pub fn staged_source_table(index: usize) -> String { + format!( + "{}staged_{}_{}{}", + GGSQL_PREFIX, + index, + session_id(), + GGSQL_SUFFIX + ) +} + /// Generate table name for a builtin dataset. /// /// Used when rewriting `ggsql:penguins` to the internal table name. @@ -328,6 +349,20 @@ pub fn quote_literal(s: &str) -> String { // Detection Functions // ============================================================================ +/// Check if a name refers to an internal ggsql table (CTE temp, staged source, +/// global, builtin dataset, cache result, …). +/// +/// # Example +/// ``` +/// use ggsql::naming; +/// assert!(naming::is_internal_table("__ggsql_cte_sales_abc__")); +/// assert!(naming::is_internal_table("__ggsql_staged_0_abc__")); +/// assert!(!naming::is_internal_table("sales")); +/// ``` +pub fn is_internal_table(name: &str) -> bool { + name.starts_with(GGSQL_PREFIX) +} + /// Check if a column name is a synthetic constant column. /// /// # Example diff --git a/src/reader/cache.rs b/src/reader/cache.rs index 3e4e962bc..6ed5dc6c2 100644 --- a/src/reader/cache.rs +++ b/src/reader/cache.rs @@ -718,6 +718,84 @@ mod behavior_tests { ); } + #[test] + fn test_cte_joined_against_primary_base_table() { + // A materialized CTE (cache-resident) joined against a primary-only base + // table: the global query is mixed, so the base table must be staged + // into the cache for the join to resolve there. + let base = DuckDBReader::new_in_memory().unwrap(); + base.register( + "base", + df! { "k" => vec![1_i64, 2], "w" => vec![10_i64, 20] }.unwrap(), + true, + ) + .unwrap(); + let primary = Box::new(ReadOnlyReader::new(Box::new(base))); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache, "test://primary"); + + let spec = reader.execute( + "WITH t AS (SELECT 1 AS k, 100 AS v) \ + SELECT t.v, base.w FROM t JOIN base ON t.k = base.k \ + VISUALISE v AS x, w AS y DRAW point", + ); + assert!( + spec.is_ok(), + "CTE joined against a primary base table should succeed: {:?}", + spec.err() + ); + } + + #[test] + fn test_schema_qualified_base_table_join() { + // Same as above, but the primary base table is schema-qualified. + let base = DuckDBReader::new_in_memory().unwrap(); + base.execute_sql("CREATE SCHEMA myschema").unwrap(); + base.execute_sql("CREATE TABLE myschema.base AS SELECT 1 AS k, 10 AS w") + .unwrap(); + let primary = Box::new(ReadOnlyReader::new(Box::new(base))); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache, "test://primary"); + + let spec = reader.execute( + "WITH t AS (SELECT 1 AS k, 100 AS v) \ + SELECT t.v, myschema.base.w FROM t JOIN myschema.base ON t.k = myschema.base.k \ + VISUALISE v AS x, w AS y DRAW point", + ); + assert!( + spec.is_ok(), + "CTE joined against a schema-qualified base table should succeed: {:?}", + spec.err() + ); + } + + #[test] + fn test_mixed_later_cte_body_joins_primary() { + // A later CTE whose body references an earlier (cache-resident) CTE and a + // primary-only base table: the CTE body is mixed and must stage the base. + let base = DuckDBReader::new_in_memory().unwrap(); + base.register( + "base", + df! { "k" => vec![1_i64], "w" => vec![10_i64] }.unwrap(), + true, + ) + .unwrap(); + let primary = Box::new(ReadOnlyReader::new(Box::new(base))); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache, "test://primary"); + + let spec = reader.execute( + "WITH a AS (SELECT 1 AS k, 5 AS p), \ + b AS (SELECT a.p, base.w FROM a JOIN base ON a.k = base.k) \ + SELECT * FROM b VISUALISE p AS x, w AS y DRAW point", + ); + assert!( + spec.is_ok(), + "mixed later-CTE body joining a primary table should succeed: {:?}", + spec.err() + ); + } + #[test] fn test_meta_table_records_and_serves_memo() { // A memoized read is recorded in the metadata table and served back from From 536a392028984af35856f5bd2dfd53816c3aad06 Mon Sep 17 00:00:00 2001 From: George Stagg Date: Wed, 1 Jul 2026 15:19:01 +0100 Subject: [PATCH 14/15] Update ggsql grammar for JOIN and qualified window function --- tree-sitter-ggsql/grammar.js | 92 ++++++++-- tree-sitter-ggsql/test/corpus/basic.txt | 221 ++++++++++++++++++++++++ 2 files changed, 300 insertions(+), 13 deletions(-) diff --git a/tree-sitter-ggsql/grammar.js b/tree-sitter-ggsql/grammar.js index c19938fcc..521f9de40 100644 --- a/tree-sitter-ggsql/grammar.js +++ b/tree-sitter-ggsql/grammar.js @@ -310,16 +310,7 @@ module.exports = grammar({ non_from_sql_keyword: $ => choice( caseInsensitive('WHERE'), - caseInsensitive('JOIN'), - caseInsensitive('LEFT'), - caseInsensitive('RIGHT'), - caseInsensitive('INNER'), - caseInsensitive('OUTER'), caseInsensitive('LATERAL'), - caseInsensitive('CROSS'), - caseInsensitive('NATURAL'), - caseInsensitive('FULL'), - caseInsensitive('ON'), caseInsensitive('AND'), caseInsensitive('OR'), caseInsensitive('NOT'), @@ -518,8 +509,8 @@ module.exports = grammar({ window_partition_clause: $ => seq( caseInsensitive('PARTITION'), caseInsensitive('BY'), - $.identifier, - repeat(seq(',', $.identifier)) + $._dotted_column, + repeat(seq(',', $._dotted_column)) ), window_order_clause: $ => seq( @@ -529,12 +520,19 @@ module.exports = grammar({ repeat(seq(',', $.order_item)) ), + // ORDER BY order_item: $ => seq( - $.identifier, + $._dotted_column, optional(choice(caseInsensitive('ASC'), caseInsensitive('DESC'))), optional(seq(caseInsensitive('NULLS'), choice(caseInsensitive('FIRST'), caseInsensitive('LAST')))) ), + // A (possibly dotted) column reference used as a window sort/partition key. + _dotted_column: $ => prec.right(seq( + $.identifier, + repeat(seq('.', $.identifier)) + )), + frame_clause: $ => seq( choice(caseInsensitive('ROWS'), caseInsensitive('RANGE')), choice( @@ -576,9 +574,77 @@ module.exports = grammar({ from_clause: $ => prec.right(1, seq( token(prec(1, caseInsensitive('FROM'))), $.table_ref, - repeat(seq(',', $.table_ref)) + repeat(choice( + seq(',', $.table_ref), + $.join_clause + )) + )), + + // ANSI join: an operator, the target table, and an ON or USING condition. + join_clause: $ => prec.right(seq( + $.join_operator, + $.table_ref, + optional($.join_condition) )), + // ANSI join operators. + join_operator: $ => choice( + seq(caseInsensitive('CROSS'), caseInsensitive('JOIN')), + seq( + optional(caseInsensitive('NATURAL')), + optional(choice( + caseInsensitive('INNER'), + seq( + choice( + caseInsensitive('LEFT'), + caseInsensitive('RIGHT'), + caseInsensitive('FULL') + ), + optional(caseInsensitive('OUTER')) + ) + )), + caseInsensitive('JOIN') + ) + ), + + join_condition: $ => choice( + seq(caseInsensitive('ON'), $.join_condition_expression), + seq( + caseInsensitive('USING'), + '(', + $.identifier, + repeat(seq(',', $.identifier)), + ')' + ) + ), + + // The ON search condition. + join_condition_expression: $ => prec.right(repeat1(choice( + $.case_expression, + $.cast_expression, + $.function_call, + $.jinja_template, + $.join_condition_keyword, + $.string, + $.number, + '.', '*', '=', '<', '>', '!', '+', '-', '/', '%', '|', '&', '^', '~', '::', + $.subquery, + $.identifier + ))), + + // Expression-level keywords allowed inside an ON condition. + join_condition_keyword: $ => choice( + caseInsensitive('AND'), + caseInsensitive('OR'), + caseInsensitive('NOT'), + caseInsensitive('IN'), + caseInsensitive('IS'), + caseInsensitive('NULL'), + caseInsensitive('EXISTS'), + caseInsensitive('BETWEEN'), + caseInsensitive('LIKE') + ), + // VISUALISE/VISUALIZE [global_mapping] [FROM source] with clauses // Global mapping sets default aesthetics for all layers // FROM source can be an identifier (table/CTE) or string (file path) diff --git a/tree-sitter-ggsql/test/corpus/basic.txt b/tree-sitter-ggsql/test/corpus/basic.txt index 43834daf7..ee1fdf991 100644 --- a/tree-sitter-ggsql/test/corpus/basic.txt +++ b/tree-sitter-ggsql/test/corpus/basic.txt @@ -941,6 +941,158 @@ DRAW bar (draw_clause (geom_type))))) +================================================================================ +SELECT with JOIN +================================================================================ + +SELECT * FROM x JOIN y ON x.id = y.id +VISUALISE a AS x +DRAW bar + +-------------------------------------------------------------------------------- + +(query + (sql_portion + (sql_statement + (select_statement + (select_body + (from_clause + (table_ref + table: (qualified_name + (identifier + (bare_identifier)))) + (join_clause + (join_operator) + (table_ref + table: (qualified_name + (identifier + (bare_identifier)))) + (join_condition + (join_condition_expression + (identifier + (bare_identifier)) + (identifier + (bare_identifier)) + (identifier + (bare_identifier)) + (identifier + (bare_identifier)))))))))) + (visualise_statement + (visualise_keyword) + (global_mapping + (mapping_list + (mapping_element + (explicit_mapping + value: (mapping_value + (column_reference + (identifier + (bare_identifier)))) + name: (aesthetic_name))))) + (viz_clause + (draw_clause + (geom_type))))) + +================================================================================ +SELECT with LEFT OUTER JOIN, aliases and ON condition +================================================================================ + +SELECT * FROM orders o LEFT OUTER JOIN customers c ON o.cid = c.id +VISUALISE a AS x +DRAW bar + +-------------------------------------------------------------------------------- + +(query + (sql_portion + (sql_statement + (select_statement + (select_body + (from_clause + (table_ref + table: (qualified_name + (identifier + (bare_identifier))) + alias: (identifier + (bare_identifier))) + (join_clause + (join_operator) + (table_ref + table: (qualified_name + (identifier + (bare_identifier))) + alias: (identifier + (bare_identifier))) + (join_condition + (join_condition_expression + (identifier + (bare_identifier)) + (identifier + (bare_identifier)) + (identifier + (bare_identifier)) + (identifier + (bare_identifier)))))))))) + (visualise_statement + (visualise_keyword) + (global_mapping + (mapping_list + (mapping_element + (explicit_mapping + value: (mapping_value + (column_reference + (identifier + (bare_identifier)))) + name: (aesthetic_name))))) + (viz_clause + (draw_clause + (geom_type))))) + +================================================================================ +SELECT with INNER JOIN USING column list +================================================================================ + +SELECT * FROM a INNER JOIN b USING (k, id) +VISUALISE a AS x +DRAW bar + +-------------------------------------------------------------------------------- + +(query + (sql_portion + (sql_statement + (select_statement + (select_body + (from_clause + (table_ref + table: (qualified_name + (identifier + (bare_identifier)))) + (join_clause + (join_operator) + (table_ref + table: (qualified_name + (identifier + (bare_identifier)))) + (join_condition + (identifier + (bare_identifier)) + (identifier + (bare_identifier))))))))) + (visualise_statement + (visualise_keyword) + (global_mapping + (mapping_list + (mapping_element + (explicit_mapping + value: (mapping_value + (column_reference + (identifier + (bare_identifier)))) + name: (aesthetic_name))))) + (viz_clause + (draw_clause + (geom_type))))) + ================================================================================ Literal value in MAPPING ================================================================================ @@ -1339,6 +1491,75 @@ DRAW point MAPPING x AS x, rnk AS y name: (aesthetic_name))))))))) ================================================================================ +Window function with qualified PARTITION BY and ORDER BY +================================================================================ + +SELECT ROW_NUMBER() OVER (PARTITION BY a.k ORDER BY b.w DESC) AS rn FROM a JOIN b ON a.k = b.k +VISUALISE +DRAW point MAPPING rn AS x + +-------------------------------------------------------------------------------- + +(query + (sql_portion + (sql_statement + (select_statement + (select_body + (window_function + function: (identifier + (bare_identifier)) + (window_specification + (window_partition_clause + (identifier + (bare_identifier)) + (identifier + (bare_identifier))) + (window_order_clause + (order_item + (identifier + (bare_identifier)) + (identifier + (bare_identifier)))))) + (identifier + (bare_identifier)) + (identifier + (bare_identifier)) + (from_clause + (table_ref + table: (qualified_name + (identifier + (bare_identifier)))) + (join_clause + (join_operator) + (table_ref + table: (qualified_name + (identifier + (bare_identifier)))) + (join_condition + (join_condition_expression + (identifier + (bare_identifier)) + (identifier + (bare_identifier)) + (identifier + (bare_identifier)) + (identifier + (bare_identifier)))))))))) + (visualise_statement + (visualise_keyword) + (viz_clause + (draw_clause + (geom_type) + (mapping_clause + (mapping_list + (mapping_element + (explicit_mapping + value: (mapping_value + (column_reference + (identifier + (bare_identifier)))) + name: (aesthetic_name))))))))) +================================================================================ Window function with frame clause ================================================================================ From a42350150b377eabc1c0f2a9d72275bb32839654 Mon Sep 17 00:00:00 2001 From: George Stagg Date: Wed, 1 Jul 2026 15:55:42 +0100 Subject: [PATCH 15/15] Improve SQL table-reference parsing --- src/CLAUDE.md | 4 +- src/execute/cte.rs | 490 +++++++++++++++++++++++++++++++++---------- src/execute/mod.rs | 16 +- src/parser/mod.rs | 5 + src/parser/sql.rs | 222 ++++++++++++++++++++ src/reader/cache.rs | 152 +++++++++++++- src/reader/data.rs | 248 +--------------------- src/reader/duckdb.rs | 4 +- src/reader/sqlite.rs | 4 +- 9 files changed, 764 insertions(+), 381 deletions(-) create mode 100644 src/parser/sql.rs diff --git a/src/CLAUDE.md b/src/CLAUDE.md index c674b9808..c32ea6975 100644 --- a/src/CLAUDE.md +++ b/src/CLAUDE.md @@ -34,6 +34,7 @@ src/ - `mod.rs` exposes `parse_query()` which builds a `Vec` from a query string. - `source_tree.rs` is the parse-once wrapper: holds the tree-sitter `Tree`, source text, and language; offers a declarative query API (`find_node`, `find_text`, …) plus lazy `extract_sql()` / `extract_visualise()` extractors. It also handles the `VISUALISE FROM ` shorthand by injecting `SELECT * FROM `. - `builder.rs` walks the CST and produces typed `Plot` values. This is where new grammar nodes become `Plot` fields. +- `sql.rs` extracts structure from SQL fragments over the parse tree. Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding syntax, edit `grammar.js`, regenerate, then teach `builder.rs` about the new nodes. @@ -48,7 +49,8 @@ Grammar lives in [`/tree-sitter-ggsql/`](../tree-sitter-ggsql/) — when adding | `odbc.rs` | ODBC | `odbc` (default) | | `cache.rs` | `CachingReader` — wraps any primary `Reader` with an in-memory cache | `duckdb` or `sqlite` | | `connection.rs` | Connection-string parsing for all of the above | — | -| `data.rs`, `spec.rs` | `Spec` type returned by `execute()`, plus DataFrame conversion | — | +| `spec.rs` | `Spec` type returned by `execute()`, plus DataFrame conversion | — | +| `data.rs` | Bundled sample datasets — the `ggsql:` builtins | `builtin-data` | `SqlDialect` trait in `mod.rs` lets each driver supply its own type names, information-schema queries, and spatial helper methods (`sql_st_transform`, `sql_geometry_to_wkb`, `sql_geometry_bbox`, `sql_ensure_geometry`, `sql_select_replace`, `sql_spatial_setup`). diff --git a/src/execute/cte.rs b/src/execute/cte.rs index 5b4522282..bcfc1cf7a 100644 --- a/src/execute/cte.rs +++ b/src/execute/cte.rs @@ -79,57 +79,120 @@ pub(crate) fn get_node_text<'a>(node: &Node, source: &'a str) -> &'a str { &source[node.start_byte()..node.end_byte()] } -/// Transform CTE references in SQL to use temp table names +/// Transform CTE references in SQL to use temp table names. /// -/// Replaces references to CTEs (e.g., `FROM sales`, `JOIN sales`) with -/// the corresponding temp table names (e.g., `FROM __ggsql_cte_sales__`). +/// Replaces references to CTEs (e.g. `FROM sales`, `JOIN sales`, `sales.col`) +/// with the corresponding temp table names (e.g. `__ggsql_cte_sales__`). /// -/// This handles table references after FROM and JOIN keywords, being careful -/// to only replace whole word matches (not substrings). +/// Table references are found via the parser; column references are rewritten +/// tolerant of whitespace around the dot and never inside string literals. pub fn transform_cte_references(sql: &str, cte_names: &HashSet) -> String { if cte_names.is_empty() { return sql.to_string(); } - let mut result = sql.to_string(); + // On a parse failure leave the SQL unchanged. + let Ok(sites) = crate::parser::extract_table_ref_sites(sql) else { + return sql.to_string(); + }; + let string_ranges = crate::parser::string_literal_ranges(sql); + let in_string = |pos: usize| string_ranges.iter().any(|&(s, e)| pos >= s && pos < e); + + // Match CTE names against the unquoted reference text, mirroring the + // definition names. + let temp_of = |raw: &str| -> Option { + let name = naming::unquote_ident(raw); + cte_names + .iter() + .find(|c| naming::unquote_ident(c).eq_ignore_ascii_case(&name)) + .map(|c| naming::quote_ident(&naming::cte_table(c))) + }; - for cte_name in cte_names { - let temp_table_name = naming::quote_ident(&naming::cte_table(cte_name)); + let mut replacements: Vec<(usize, usize, String)> = Vec::new(); - // Replace table references: FROM cte_name, JOIN cte_name, cte_name.column - // Use word boundary matching to avoid replacing substrings - // Pattern: (FROM|JOIN)\s+(\s|,|)|$) - let patterns = [ - // FROM cte_name (case insensitive) - ( - format!(r"(?i)(\bFROM\s+){}(\s|,|\)|$)", regex::escape(cte_name)), - format!("${{1}}{}${{2}}", temp_table_name), - ), - // JOIN cte_name (case insensitive) - handles LEFT JOIN, RIGHT JOIN, etc. - ( - format!(r"(?i)(\bJOIN\s+){}(\s|,|\)|$)", regex::escape(cte_name)), - format!("${{1}}{}${{2}}", temp_table_name), - ), - // Qualified column references: cte_name.column (case insensitive) - ( - format!( - r"(?i)\b{}(\.[a-zA-Z_][a-zA-Z0-9_]*)", - regex::escape(cte_name) - ), - format!("{}${{1}}", temp_table_name), - ), - ]; + // Rewrite each table_ref that names a CTE to its temp table. + for site in &sites { + if let Some(temp) = temp_of(&site.raw) { + replacements.push((site.start, site.end, temp)); + } + } - for (pattern, replacement) in patterns { - if let Ok(re) = regex::Regex::new(&pattern) { - result = re.replace_all(&result, replacement.as_str()).to_string(); + // Rewrite qualified column references `cte.col` -> `temp.col`. + let site_starts: HashSet = sites.iter().map(|s| s.start).collect(); + for cte in cte_names { + let temp = naming::quote_ident(&naming::cte_table(cte)); + let bare = naming::unquote_ident(cte); + let pattern = format!(r"((?i:{}))\s*\.", regex::escape(&bare)); + let Ok(re) = regex::Regex::new(&pattern) else { + continue; + }; + for caps in re.captures_iter(sql) { + let name = caps.get(1).unwrap(); + let start = name.start(); + if site_starts.contains(&start) || in_string(start) { + continue; + } + // Require a boundary before the name. + let ok_prefix = sql[..start] + .chars() + .next_back() + .is_none_or(|c| !(c.is_alphanumeric() || c == '_' || c == '.' || c == '"')); + if ok_prefix { + replacements.push((name.start(), name.end(), temp.clone())); } } } + // Apply in reverse byte order so earlier offsets stay valid. + let mut result = sql.to_string(); + replacements.sort_by_key(|(start, _, _)| std::cmp::Reverse(*start)); + // Distinct offsets only. + replacements.dedup_by_key(|(start, _, _)| *start); + + for (start, end, replacement) in replacements { + result.replace_range(start..end, &replacement); + } + result } +/// Byte offsets of the `.` separators in a (possibly quoted) dotted identifier. +fn unquoted_dot_positions(raw: &str) -> Vec { + let mut positions = Vec::new(); + let mut in_quote = false; + for (i, b) in raw.bytes().enumerate() { + match b { + b'"' => in_quote = !in_quote, + b'.' if !in_quote => positions.push(i), + _ => {} + } + } + positions +} + +/// Split a (possibly quoted) dotted identifier into its components, trimming +/// surrounding whitespace: `schema . base` → `["schema","base"]`. +fn identifier_components(raw: &str) -> Vec<&str> { + let mut parts = Vec::new(); + let mut start = 0; + for pos in unquoted_dot_positions(raw) { + parts.push(raw[start..pos].trim()); + start = pos + 1; + } + parts.push(raw[start..].trim()); + parts +} + +/// The final component of a (possibly quoted) dotted identifier: `base` for +/// `schema.base`, `"base"` for `"schema"."base"`, and the whole (trimmed) string +/// for a single (possibly quoted) name. +fn last_identifier_component(raw: &str) -> &str { + match unquoted_dot_positions(raw).last() { + Some(&pos) => raw[pos + 1..].trim(), + None => raw.trim(), + } +} + /// Stage the primary base tables in a body destined for the cache. /// /// A body that is entirely primary (no cache-resident reference) or entirely @@ -139,68 +202,130 @@ pub fn transform_source_references(sql: &str, reader: &dyn Reader) -> Result re, + // Discover table references via the parser. + let sites = match crate::parser::extract_table_ref_sites(sql) { + Ok(sites) => sites, Err(_) => return Ok(sql.to_string()), }; - let mut seen = HashSet::new(); - let mut refs: Vec = Vec::new(); - for caps in re.captures_iter(sql) { - if let Some(m) = caps.get(1) { - let name = m.as_str().to_string(); - if seen.insert(name.clone()) { - refs.push(name); - } - } - } - - // A name resolves against the cache backend (rather than the primary) if it - // is an internal `__ggsql_*` table or a `ggsql:` builtin dataset. - let is_cache_resolvable = |name: &str| { - naming::is_internal_table(&naming::unquote_ident(name)) || name.starts_with("ggsql:") + // A ref resolves against the cache backend (rather than the primary) when it + // is an internal `__ggsql_*` table, a `ggsql:` builtin, or a file-path string. + let is_cache_resolvable = |raw: &str| { + raw.starts_with('\'') + || raw.starts_with("ggsql:") + || naming::is_internal_table(&naming::unquote_ident(raw)) }; - let has_cache_ref = refs.iter().any(|r| is_cache_resolvable(r)); - let primary_refs: Vec<&String> = refs.iter().filter(|r| !is_cache_resolvable(r)).collect(); + let has_cache_ref = sites.iter().any(|s| is_cache_resolvable(&s.raw)); + let primary_sites: Vec<&crate::parser::TableRefSite> = sites + .iter() + .filter(|s| !is_cache_resolvable(&s.raw)) + .collect(); // Only mixed bodies need staging. - if !has_cache_ref || primary_refs.is_empty() { + if !has_cache_ref || primary_sites.is_empty() { return Ok(sql.to_string()); } - let mut result = sql.to_string(); - for (index, raw) in primary_refs.into_iter().enumerate() { - let staged = naming::staged_source_table(index); - reader.materialize_table(&staged, &[], &format!("SELECT * FROM {}", raw))?; - let target = naming::quote_ident(&staged); + // The final identifier component of a ref. This is how an unaliased ref + // is spelled at column sites, so it doubles as the alias to attach. + let last_of = |raw: &str| -> String { last_identifier_component(raw).to_string() }; - // Rewrite FROM/JOIN/qualified references to the staged copy. - let patterns = [ - ( - format!(r"(?i)(\bFROM\s+){}(\s|,|\)|$)", regex::escape(raw)), - format!("${{1}}{}${{2}}", target), - ), - ( - format!(r"(?i)(\bJOIN\s+){}(\s|,|\)|$)", regex::escape(raw)), - format!("${{1}}{}${{2}}", target), - ), - ( - format!(r"(?i)\b{}(\.[a-zA-Z_][a-zA-Z0-9_]*)", regex::escape(raw)), - format!("{}${{1}}", target), - ), - ]; - for (pattern, replacement) in patterns { - if let Ok(re) = regex::Regex::new(&pattern) { - result = re.replace_all(&result, replacement.as_str()).to_string(); + // Stage each distinct primary table into the cache once. + let mut staged_for: std::collections::HashMap = + std::collections::HashMap::new(); + for site in &primary_sites { + if !staged_for.contains_key(&site.raw) { + let t = naming::staged_source_table(staged_for.len()); + reader.materialize_table(&t, &[], &format!("SELECT * FROM {}", site.raw))?; + staged_for.insert(site.raw.clone(), t); + } + } + + // An unaliased ref is aliased back to its last component so `base.col` keeps + // resolving. Two unaliased refs sharing a last component (`a.base`+`b.base`) + // would collide, so those use their unique staged name instead. + let mut last_counts: std::collections::HashMap = + std::collections::HashMap::new(); + let mut seen_unaliased: HashSet<&str> = HashSet::new(); + for site in &primary_sites { + if !site.has_alias && seen_unaliased.insert(site.raw.as_str()) { + *last_counts.entry(last_of(&site.raw)).or_default() += 1; + } + } + let last_collides = |raw: &str| last_counts.get(&last_of(raw)).copied().unwrap_or(0) > 1; + + let string_ranges = crate::parser::string_literal_ranges(sql); + let in_string = |pos: usize| string_ranges.iter().any(|&(s, e)| pos >= s && pos < e); + let site_starts: HashSet = primary_sites.iter().map(|s| s.start).collect(); + + let mut replacements: Vec<(usize, usize, String)> = Vec::new(); + + // Rewrite each table_ref occurrence to the staged table. + for site in &primary_sites { + let quoted = naming::quote_ident(&staged_for[&site.raw]); + let replacement = if site.has_alias || last_collides(&site.raw) { + quoted + } else { + format!("{} AS {}", quoted, last_of(&site.raw)) + }; + replacements.push((site.start, site.end, replacement)); + } + + // Rewrite full column qualifiers (`schema.base.col`) to the alias, or the + // staged table when it collides — skipping string literals and table_ref sites. + for raw in staged_for.keys() { + // Only multi-part refs can appear as a full qualifier at column sites. + if unquoted_dot_positions(raw).is_empty() { + continue; + } + let target = if last_collides(raw) { + naming::quote_ident(&staged_for[raw]) + } else { + last_of(raw) + }; + // Match the qualifier tolerant of whitespace around its dots. + let pattern = format!( + r"({})\s*\.", + identifier_components(raw) + .iter() + .map(|c| { + if c.starts_with('"') { + regex::escape(c) + } else { + format!("(?i:{})", regex::escape(c)) + } + }) + .collect::>() + .join(r"\s*\.\s*") + ); + let Ok(re) = regex::Regex::new(&pattern) else { + continue; + }; + for caps in re.captures_iter(sql) { + let qualifier = caps.get(1).unwrap(); + let start = qualifier.start(); + if site_starts.contains(&start) || in_string(start) { + continue; + } + // Require a boundary before the qualifier. + let ok_prefix = sql[..start] + .chars() + .next_back() + .is_none_or(|c| !(c.is_alphanumeric() || c == '_' || c == '.' || c == '"')); + if ok_prefix { + replacements.push((start, qualifier.end(), target.clone())); } } } + // Apply in reverse byte order so earlier offsets stay valid. + let mut result = sql.to_string(); + replacements.sort_by_key(|(start, _, _)| std::cmp::Reverse(*start)); + for (start, end, replacement) in replacements { + result.replace_range(start..end, &replacement); + } + Ok(result) } @@ -281,24 +406,13 @@ pub fn split_with_query(source_tree: &SourceTree) -> Option<(String, String)> { Some((cte_prefix, trailing)) } -/// Transform global SQL for execution with temp tables. +/// Collect side-effect statements (CREATE, INSERT, UPDATE, DELETE) that +/// need to run before the main query. /// -/// Returns statements to execute directly as side effects (CREATE, INSERT, …) -/// and an optional query whose result should be wrapped as the global temp -/// table. -pub fn transform_global_sql( - source_tree: &SourceTree, - materialized_ctes: &HashSet, - reader: &dyn Reader, -) -> Result<(Vec, Option)> { - // Collect side-effect statements (CREATE, INSERT, UPDATE, DELETE) that - // need to run before the main query. These appear alongside a trailing - // SELECT or VISUALISE FROM. - // - // Only structured DML is handled here — other_sql_statement nodes - // (INSTALL, LOAD, SET, …) are pre-executed in prepare_data_with_reader. +/// Only structured DML is handled here — other_sql_statement nodes +/// (INSTALL, LOAD, SET, …) are pre-executed in prepare_data_with_reader. +pub fn extract_side_effects(source_tree: &SourceTree) -> Vec { let root = source_tree.root(); - let side_effect_stmts = r#" (sql_statement [(create_statement) @@ -306,12 +420,24 @@ pub fn transform_global_sql( (update_statement) (delete_statement)] @stmt) "#; - let side_effects: Vec = source_tree + source_tree .find_texts(&root, side_effect_stmts) .into_iter() - .map(|s| transform_cte_references(s.trim(), materialized_ctes)) + .map(|s| s.trim().to_string()) .filter(|s| !s.is_empty()) - .collect(); + .collect() +} + +/// Transform global SQL for execution as the global temp table. +/// +/// Returns the query whose result should be wrapped as the global temp table, +/// with CTE references rewritten and primary base tables staged. +pub fn transform_global_sql( + source_tree: &SourceTree, + materialized_ctes: &HashSet, + reader: &dyn Reader, +) -> Result> { + let root = source_tree.root(); // Try to extract trailing SELECT (WITH...SELECT or direct SELECT) let select_sql = split_with_query(source_tree) @@ -323,20 +449,17 @@ pub fn transform_global_sql( if let Some(select_sql) = select_sql { let select_sql = transform_cte_references(&select_sql, materialized_ctes); - return Ok(( - side_effects, - Some(transform_source_references(&select_sql, reader)?), - )); + return Ok(Some(transform_source_references(&select_sql, reader)?)); } if !has_executable_sql(source_tree) { - return Ok((vec![], None)); + return Ok(None); } - // We have non-SELECT executable SQL and/or VISUALISE FROM. - // Side-effects run directly, VISUALISE FROM becomes the queryable part. - // A bare WITH clause without a trailing statement is not executable on - // its own (its CTEs are already materialized separately). + // We have non-SELECT executable SQL and/or VISUALISE FROM. VISUALISE FROM + // becomes the queryable part; a bare WITH clause without a trailing + // statement is not executable on its own (its CTEs are materialized + // separately). let viz_from_query = source_tree .find_text( &root, @@ -349,19 +472,18 @@ pub fn transform_global_sql( }) .transpose()?; - if !side_effects.is_empty() || viz_from_query.is_some() { - Ok((side_effects, viz_from_query)) + if viz_from_query.is_some() || !extract_side_effects(source_tree).is_empty() { + Ok(viz_from_query) } else { // has_executable_sql was true but we found no specific statements or // VISUALISE FROM — fall back to extract_sql as the query. - let query = source_tree + source_tree .extract_sql() .map(|s| { let s = transform_cte_references(&s, materialized_ctes); transform_source_references(&s, reader) }) - .transpose()?; - Ok((vec![], query)) + .transpose() } } @@ -553,6 +675,46 @@ mod tests { } } + #[test] + fn test_transform_cte_references_comma_join_second_position() { + // A CTE in a non-first comma position must still be rewritten. + let ctes: HashSet = ["cte"].iter().map(|s| s.to_string()).collect(); + let out = transform_cte_references("SELECT * FROM base, cte WHERE base.k = cte.k", &ctes); + assert!( + !out.contains("FROM base, cte "), + "cte table ref not rewritten: {out}" + ); + assert!(out.contains("__ggsql_cte_cte_")); + // Both the table ref and the qualified column ref are rewritten. + assert_eq!(out.matches("__ggsql_cte_cte_").count(), 2); + } + + #[test] + fn test_transform_cte_references_preserves_string_literals() { + // A CTE name inside a string literal must not be rewritten. + let ctes: HashSet = ["cte"].iter().map(|s| s.to_string()).collect(); + let out = transform_cte_references("SELECT cte.k, 'cte.k' AS lit FROM cte", &ctes); + assert!(out.contains("'cte.k'"), "literal was corrupted: {out}"); + // The real qualifier and table ref are still rewritten. + assert_eq!(out.matches("__ggsql_cte_cte_").count(), 2); + } + + #[test] + fn test_transform_cte_references_whitespace_around_dot() { + let ctes: HashSet = ["cte"].iter().map(|s| s.to_string()).collect(); + let out = transform_cte_references("SELECT cte . v FROM cte", &ctes); + // The whitespace-separated qualifier is rewritten too. + assert!(!out.contains("cte . v"), "qualifier not rewritten: {out}"); + assert_eq!(out.matches("__ggsql_cte_cte_").count(), 2); + } + + #[test] + fn test_transform_cte_references_case_insensitive() { + let ctes: HashSet = ["cte"].iter().map(|s| s.to_string()).collect(); + let out = transform_cte_references("SELECT CTE.v FROM CTE", &ctes); + assert_eq!(out.matches("__ggsql_cte_cte_").count(), 2); + } + /// Minimal reader that records `materialize_table` calls, used to unit-test /// source-reference staging without a database. struct MockReader { @@ -656,6 +818,65 @@ mod tests { assert!(!out.contains("FROM base")); } + #[test] + fn test_transform_source_references_case_insensitive_qualifier() { + let reader = MockReader::new(true); + // The full qualifier is spelled in a different case than the table_ref; + // unquoted identifiers fold, so it must still be rewritten. + let sql = "SELECT MYSCHEMA.BASE.w FROM \"__ggsql_cte_t__\" \ + JOIN myschema.base ON MYSCHEMA.BASE.k = 1"; + let out = transform_source_references(sql, &reader).unwrap(); + + assert_eq!(reader.staged.borrow().len(), 1); + assert!( + !out.contains("MYSCHEMA"), + "case-variant qualifier not rewritten: {out}" + ); + } + + #[test] + fn test_last_identifier_component() { + assert_eq!(last_identifier_component("base"), "base"); + assert_eq!(last_identifier_component("schema.base"), "base"); + assert_eq!(last_identifier_component("cat.schema.base"), "base"); + assert_eq!(last_identifier_component("\"schema\".\"base\""), "\"base\""); + assert_eq!(last_identifier_component("schema.\"base\""), "\"base\""); + // A dot inside a quoted component is not a separator. + assert_eq!(last_identifier_component("\"my.base\""), "\"my.base\""); + } + + #[test] + fn test_transform_source_references_fully_quoted_qualified() { + let reader = MockReader::new(true); + // `"s"."t"` must stage and alias to the (quoted) last component. + let sql = "SELECT \"myschema\".\"base\".w FROM \"__ggsql_cte_t__\" \ + JOIN \"myschema\".\"base\" ON \"myschema\".\"base\".k = 1"; + let out = transform_source_references(sql, &reader).unwrap(); + + let staged = reader.staged.borrow(); + assert_eq!(staged.len(), 1); + assert_eq!(staged[0].1, "SELECT * FROM \"myschema\".\"base\""); + // The staged table is aliased AS "base" and full qualifiers rewritten. + assert!(out.contains("AS \"base\"")); + assert!(!out.contains("\"myschema\".\"base\"")); + } + + #[test] + fn test_transform_source_references_whitespace_around_dot() { + let reader = MockReader::new(true); + // Whitespace/newlines around the dots (in both the table_ref and the full + // column qualifier) must not defeat staging or the qualifier rewrite. + let sql = "SELECT myschema .\nbase . w FROM \"__ggsql_cte_t__\" \ + JOIN myschema . base ON myschema . base . k = 1"; + let out = transform_source_references(sql, &reader).unwrap(); + + let staged = reader.staged.borrow(); + assert_eq!(staged.len(), 1); + // The full column qualifiers are rewritten to the alias despite spacing. + assert!(!out.contains("myschema"), "qualifier not rewritten: {out}"); + assert!(out.contains("AS base")); + } + #[test] fn test_transform_source_references_schema_qualified() { let reader = MockReader::new(true); @@ -677,6 +898,47 @@ mod tests { assert_eq!(reader.staged.borrow().len(), 1); } + #[test] + fn test_transform_source_references_comma_join() { + // A comma join between a cache-resident temp and a primary base table: + // the primary side must be staged (the old FROM/JOIN regex missed this). + let reader = MockReader::new(true); + let sql = "SELECT * FROM \"__ggsql_cte_t__\", base"; + let out = transform_source_references(sql, &reader).unwrap(); + + let staged = reader.staged.borrow(); + assert_eq!(staged.len(), 1); + assert_eq!(staged[0].1, "SELECT * FROM base"); + assert!(out.contains("__ggsql_staged_0_")); + assert!(out.contains("\"__ggsql_cte_t__\"")); + } + + #[test] + fn test_transform_source_references_preserves_string_literals() { + // A string literal that happens to look like a qualified reference must + // not be rewritten. + let reader = MockReader::new(true); + let sql = "SELECT * FROM \"__ggsql_cte_t__\" JOIN myschema.base \ + ON note = 'myschema.base.k'"; + let out = transform_source_references(sql, &reader).unwrap(); + + assert_eq!(reader.staged.borrow().len(), 1); + // The literal is untouched; the real qualifier in the table_ref is staged. + assert!(out.contains("'myschema.base.k'")); + assert!(!out.contains("JOIN myschema.base ")); + } + + #[test] + fn test_transform_source_references_builtin_not_staged() { + // A `ggsql:` builtin resolves against the cache, so a JOIN against it and + // a resident temp is entirely cache-side and needs no staging. + let reader = MockReader::new(true); + let sql = "SELECT * FROM \"__ggsql_cte_t__\" JOIN ggsql:penguins ON 1 = 1"; + let out = transform_source_references(sql, &reader).unwrap(); + assert!(reader.staged.borrow().is_empty()); + assert_eq!(out, sql); + } + #[test] fn test_split_with_query_basic() { let sql = "WITH cte AS (SELECT * FROM x) SELECT * FROM cte"; diff --git a/src/execute/mod.rs b/src/execute/mod.rs index 0e6379d01..51b90b94b 100644 --- a/src/execute/mod.rs +++ b/src/execute/mod.rs @@ -1135,6 +1135,13 @@ pub fn prepare_data_with_reader(query: &str, reader: &dyn Reader) -> Result Result Result> { + let source_tree = SourceTree::new(sql)?; + let root = source_tree.root(); + + let mut sites = Vec::new(); + for node in source_tree.find_nodes(&root, "(table_ref) @ref") { + let Some(table) = node.child_by_field_name("table") else { + continue; + }; + sites.push(TableRefSite { + start: table.start_byte(), + end: table.end_byte(), + raw: source_tree.get_text(&table), + has_alias: node.child_by_field_name("alias").is_some(), + }); + } + Ok(sites) +} + +/// Extract the table names referenced in a SQL query's `FROM`/`JOIN` clauses. +/// +/// Returns the `table_ref` targets, with surrounding quotes stripped. Subquery +/// sources contribute no name. +pub fn extract_table_refs(sql: &str) -> Result> { + let source_tree = SourceTree::new(sql)?; + let root = source_tree.root(); + + let mut names: Vec = source_tree + .find_texts(&root, "(table_ref table: (_) @table)") + .iter() + .map(|t| naming::unquote_ident(t)) + .collect(); + names.sort_unstable(); + names.dedup(); + Ok(names) +} + +/// Byte ranges of string literals in `sql`. +pub fn string_literal_ranges(sql: &str) -> Vec<(usize, usize)> { + let Ok(source_tree) = SourceTree::new(sql) else { + return Vec::new(); + }; + let root = source_tree.root(); + source_tree + .find_nodes(&root, "(string) @s") + .iter() + .map(|n| (n.start_byte(), n.end_byte())) + .collect() +} + +/// Extract builtin dataset names from SQL containing namespaced identifiers. +pub fn extract_builtin_dataset_names(sql: &str) -> Result> { + let source_tree = SourceTree::new(sql)?; + let root = source_tree.root(); + + let mut names: Vec = source_tree + .find_texts(&root, "(namespaced_identifier) @select") + .iter() + .filter_map(|token| token.strip_prefix("ggsql:").map(|s| s.to_string())) + .collect(); + names.sort_unstable(); + names.dedup(); + Ok(names) +} + +/// Rewrite SQL to replace namespaced identifiers with internal table names. +/// +/// e.g. `SELECT * FROM ggsql:penguins` → `SELECT * FROM __ggsql_data_penguins__`. +/// +/// Uses the parse tree to find the positions of namespaced identifiers, then +/// replaces them. +pub fn rewrite_namespaced_sql(sql: &str) -> Result { + let source_tree = SourceTree::new(sql)?; + let root = source_tree.root(); + + // Collect (start_byte, end_byte, replacement) tuples. + let mut replacements: Vec<(usize, usize, String)> = Vec::new(); + for node in source_tree.find_nodes(&root, "(namespaced_identifier) @select") { + let full_text = source_tree.get_text(&node); + if let Some(name) = full_text.strip_prefix("ggsql:") { + replacements.push(( + node.start_byte(), + node.end_byte(), + naming::quote_ident(&naming::builtin_data_table(name)), + )); + } + } + + if replacements.is_empty() { + return Ok(sql.to_string()); + } + + // Apply replacements in reverse byte order to preserve earlier offsets. + let mut result = sql.to_string(); + replacements.sort_by_key(|(start, _, _)| std::cmp::Reverse(*start)); + for (start, end, replacement) in replacements { + result.replace_range(start..end, &replacement); + } + Ok(result) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_extract_builtin_dataset_names_single() { + let sql = "SELECT * FROM ggsql:penguins VISUALISE DRAW point MAPPING x AS x"; + assert_eq!( + extract_builtin_dataset_names(sql).unwrap(), + vec!["penguins"] + ); + } + + #[test] + fn test_extract_builtin_dataset_names_multiple() { + let sql = + "SELECT * FROM ggsql:penguins, ggsql:airquality VISUALISE DRAW point MAPPING x AS x"; + let names = extract_builtin_dataset_names(sql).unwrap(); + assert_eq!(names.len(), 2); + assert!(names.contains(&"airquality".to_string())); + assert!(names.contains(&"penguins".to_string())); + } + + #[test] + fn test_extract_builtin_dataset_names_dedup() { + let sql = "SELECT * FROM ggsql:penguins p1, ggsql:penguins p2 VISUALISE DRAW point MAPPING x AS x"; + assert_eq!( + extract_builtin_dataset_names(sql).unwrap(), + vec!["penguins"] + ); + } + + #[test] + fn test_extract_builtin_dataset_names_none() { + let sql = "SELECT * FROM regular_table VISUALISE DRAW point MAPPING x AS x"; + assert!(extract_builtin_dataset_names(sql).unwrap().is_empty()); + } + + #[test] + fn test_rewrite_namespaced_sql_simple() { + let sql = "SELECT * FROM ggsql:penguins"; + assert_eq!( + rewrite_namespaced_sql(sql).unwrap(), + "SELECT * FROM \"__ggsql_data_penguins__\"" + ); + } + + #[test] + fn test_rewrite_namespaced_sql_multiple() { + let sql = "SELECT * FROM ggsql:penguins p, ggsql:airquality a WHERE p.id = a.id"; + assert_eq!( + rewrite_namespaced_sql(sql).unwrap(), + "SELECT * FROM \"__ggsql_data_penguins__\" p, \"__ggsql_data_airquality__\" a WHERE p.id = a.id" + ); + } + + #[test] + fn test_rewrite_namespaced_sql_no_change() { + let sql = "SELECT * FROM regular_table WHERE x > 5"; + assert_eq!(rewrite_namespaced_sql(sql).unwrap(), sql); + } + + #[test] + fn test_rewrite_namespaced_sql_with_visualise() { + let sql = "SELECT * FROM ggsql:penguins VISUALISE DRAW point MAPPING bill_len AS x, bill_dep AS y"; + let rewritten = rewrite_namespaced_sql(sql).unwrap(); + assert!(rewritten.starts_with("SELECT * FROM \"__ggsql_data_penguins__\"")); + assert!(!rewritten.contains("ggsql:")); + } + + #[test] + fn test_extract_table_refs_basic() { + let mut refs = extract_table_refs("SELECT * FROM a JOIN \"b c\" ON a.k = 1").unwrap(); + refs.sort(); + assert_eq!(refs, vec!["a".to_string(), "b c".to_string()]); + } + + #[test] + fn test_extract_table_ref_sites_alias_and_range() { + let sites = + extract_table_ref_sites("SELECT * FROM orders o JOIN items ON o.k = 1").unwrap(); + assert_eq!(sites.len(), 2); + assert_eq!(sites[0].raw, "orders"); + assert!(sites[0].has_alias); + assert_eq!(sites[1].raw, "items"); + assert!(!sites[1].has_alias); + } + + #[test] + fn test_string_literal_ranges_finds_literals() { + let sql = "SELECT * FROM t WHERE note = 'hello'"; + let ranges = string_literal_ranges(sql); + assert_eq!(ranges.len(), 1); + let (s, e) = ranges[0]; + assert_eq!(&sql[s..e], "'hello'"); + } +} diff --git a/src/reader/cache.rs b/src/reader/cache.rs index 6ed5dc6c2..fcab27329 100644 --- a/src/reader/cache.rs +++ b/src/reader/cache.rs @@ -308,16 +308,24 @@ impl CachingReader { } } - /// Whether `sql` references a cache-resident table by exact name. + /// Whether `sql` reads something the cache backend owns: a cache-resident + /// table, the metadata table, or a `ggsql:` builtin dataset. /// - /// Parses the query's `table_ref` targets and tests them against `resident`. + /// Determined from the parsed `table_ref`/namespaced-identifier targets. /// On a parse failure we conservatively return `false`. - fn references_resident(&self, sql: &str) -> bool { - let Ok(refs) = super::data::extract_table_refs(sql) else { + fn references_cache_resident(&self, sql: &str) -> bool { + if crate::parser::extract_builtin_dataset_names(sql) + .map(|d| !d.is_empty()) + .unwrap_or(false) + { + return true; + } + let Ok(refs) = crate::parser::extract_table_refs(sql) else { return false; }; let resident = self.resident.borrow(); - refs.iter().any(|t| resident.contains(t)) + refs.iter() + .any(|t| t == naming::CACHE_META_TABLE || resident.contains(t)) } /// Drop every memoized result, one entry at a time. @@ -357,10 +365,7 @@ impl Reader for CachingReader { fn execute_sql(&self, sql: &str) -> Result { // Route to the cache when the read targets cache-resident objects, the // metadata table, or a builtin dataset. - if sql.contains("ggsql:") - || sql.contains(naming::CACHE_META_TABLE) - || self.references_resident(sql) - { + if self.references_cache_resident(sql) { return self.cache.execute_sql(sql); } @@ -769,6 +774,135 @@ mod behavior_tests { ); } + #[test] + fn test_multiple_primary_joins_in_chain() { + // A chain of joins bringing in two distinct primary tables must stage both. + let base = DuckDBReader::new_in_memory().unwrap(); + base.execute_sql("CREATE TABLE base AS SELECT 1 AS k, 10 AS w") + .unwrap(); + base.execute_sql("CREATE TABLE base2 AS SELECT 1 AS k, 20 AS z") + .unwrap(); + let primary = Box::new(ReadOnlyReader::new(Box::new(base))); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache, "test://primary"); + + let spec = reader.execute( + "WITH t AS (SELECT 1 AS k) \ + SELECT base.w, base2.z FROM t JOIN base ON t.k = base.k JOIN base2 ON t.k = base2.k \ + VISUALISE w AS x, z AS y DRAW point", + ); + assert!( + spec.is_ok(), + "two primaries in a join chain should both be staged: {:?}", + spec.err() + ); + } + + #[test] + fn test_same_named_schema_tables_joined() { + // Two tables sharing a final name across schemas, joined and referenced + // by full qualifier: they must not collapse to the same alias. + let base = DuckDBReader::new_in_memory().unwrap(); + base.execute_sql("CREATE SCHEMA a").unwrap(); + base.execute_sql("CREATE SCHEMA b").unwrap(); + base.execute_sql("CREATE TABLE a.base AS SELECT 1 AS k, 10 AS w") + .unwrap(); + base.execute_sql("CREATE TABLE b.base AS SELECT 1 AS k, 20 AS w") + .unwrap(); + let primary = Box::new(ReadOnlyReader::new(Box::new(base))); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache, "test://primary"); + + let spec = reader.execute( + "WITH t AS (SELECT 1 AS k) \ + SELECT a.base.w AS aw, b.base.w AS bw \ + FROM t JOIN a.base ON t.k = a.base.k JOIN b.base ON t.k = b.base.k \ + VISUALISE aw AS x, bw AS y DRAW point", + ); + assert!( + spec.is_ok(), + "same-named schema tables should stage to distinct aliases: {:?}", + spec.err() + ); + } + + #[test] + fn test_setup_dml_runs_before_staging() { + // A query that CREATEs a primary table and then joins it against a CTE + // (mixed) must run the CREATE before staging reads the table. + let primary = Box::new(DuckDBReader::new_in_memory().unwrap()); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache, "test://primary"); + + let spec = reader.execute( + "CREATE TABLE sales AS SELECT * FROM (VALUES (1, 10), (2, 20)) t(k, w); \ + WITH c AS (SELECT 1 AS k UNION ALL SELECT 2) \ + SELECT sales.k, sales.w FROM sales JOIN c ON sales.k = c.k \ + VISUALISE k AS x, w AS y DRAW point", + ); + assert!( + spec.is_ok(), + "setup DML must run before staging reads the created table: {:?}", + spec.err() + ); + } + + #[test] + fn test_multi_statement_dml_ordering_before_staging() { + // Multiple dependent side-effects (CREATE, INSERT, UPDATE) must run in + // order before staging, and the staged data must reflect them. + let primary = Box::new(DuckDBReader::new_in_memory().unwrap()); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache, "test://primary"); + + let prepared = crate::execute::prepare_data_with_reader( + "CREATE TABLE t(k INTEGER, w INTEGER); \ + INSERT INTO t VALUES (1, 10), (2, 20); \ + UPDATE t SET w = w + 5 WHERE k = 1; \ + WITH c AS (SELECT 1 AS k UNION ALL SELECT 2) \ + SELECT t.k, t.w FROM t JOIN c ON t.k = c.k \ + VISUALISE k AS x, w AS y DRAW point", + &reader, + ) + .expect("multi-statement setup DML + staging should succeed"); + + let df = prepared.data.get(&naming::layer_key(0)).unwrap(); + // Row for k=1 must reflect the UPDATE (w = 15), k=2 unchanged (w = 20). + assert_eq!(df.height(), 2); + let ws = df.column("__ggsql_aes_pos2__").unwrap(); + let mut vals: Vec = (0..df.height()) + .map(|i| crate::array_util::value_to_string(ws, i)) + .collect(); + vals.sort(); + assert_eq!(vals, vec!["15".to_string(), "20".to_string()]); + } + + #[test] + fn test_fully_quoted_schema_qualified_join() { + // A fully double-quoted schema-qualified primary (`"s"."t"`) joined with + // a cache-resident CTE: the last component must be extracted correctly so + // the staged alias is valid. + let base = DuckDBReader::new_in_memory().unwrap(); + base.execute_sql("CREATE SCHEMA myschema").unwrap(); + base.execute_sql("CREATE TABLE myschema.base AS SELECT 1 AS k, 10 AS w") + .unwrap(); + let primary = Box::new(ReadOnlyReader::new(Box::new(base))); + let cache = Box::new(DuckDBReader::new_in_memory().unwrap()); + let reader = CachingReader::new(primary, cache, "test://primary"); + + let spec = reader.execute( + "WITH t AS (SELECT 1 AS k) \ + SELECT \"myschema\".\"base\".w FROM t JOIN \"myschema\".\"base\" \ + ON t.k = \"myschema\".\"base\".k \ + VISUALISE w AS x DRAW point", + ); + assert!( + spec.is_ok(), + "fully-quoted schema-qualified join should succeed: {:?}", + spec.err() + ); + } + #[test] fn test_mixed_later_cte_body_joins_primary() { // A later CTE whose body references an earlier (cache-resident) CTE and a diff --git a/src/reader/data.rs b/src/reader/data.rs index 384d944ff..bc9966826 100644 --- a/src/reader/data.rs +++ b/src/reader/data.rs @@ -1,6 +1,4 @@ -use tree_sitter::{Parser, Query, StreamingIterator}; - -use crate::{naming, GgsqlError}; +use crate::GgsqlError; // ============================================================================= // Embedded dataset bytes @@ -119,250 +117,6 @@ pub fn is_known_builtin(name: &str) -> bool { KNOWN_DATASETS.contains(&name) } -// ============================================================================= -// SQL namespace rewriting (always available, including WASM) -// ============================================================================= - -/// Extract builtin dataset names from SQL containing namespaced identifiers. -/// -/// Finds `ggsql:X` patterns via tree-sitter and returns the dataset names -/// (without the `ggsql:` prefix), deduplicated. -pub fn extract_builtin_dataset_names(sql: &str) -> Result, GgsqlError> { - let token_def = r#"(namespaced_identifier) @select"#; - let mut tokens = tokens_from_tree(sql, token_def, "select")?; - - if tokens.is_empty() { - return Ok(Vec::new()); - } - - tokens.sort_unstable(); - tokens.dedup(); - - let datasets: Vec = tokens - .iter() - .filter_map(|token| token.strip_prefix("ggsql:").map(|s| s.to_string())) - .collect(); - - Ok(datasets) -} - -/// Extract the table names referenced in a SQL query's `FROM`/`JOIN` clauses. -/// -/// Returns the `table_ref` targets (deduplicated), with surrounding quotes -/// stripped via [`naming::unquote_ident`] so they can be compared against -/// unquoted registered table names. Subquery sources contribute no name. -pub fn extract_table_refs(sql: &str) -> Result, GgsqlError> { - let token_def = r#"(table_ref table: (_) @table)"#; - let mut tokens = tokens_from_tree(sql, token_def, "table")? - .iter() - .map(|t| naming::unquote_ident(t)) - .collect::>(); - tokens.sort_unstable(); - tokens.dedup(); - Ok(tokens) -} - -/// Rewrite SQL to replace namespaced identifiers with internal table names. -/// -/// e.g., `SELECT * FROM ggsql:penguins` -> `SELECT * FROM __ggsql_data_penguins__` -/// -/// Uses tree-sitter to find the exact byte positions of namespaced identifiers, -/// then replaces them in reverse order to preserve offsets. -pub fn rewrite_namespaced_sql(sql: &str) -> Result { - let token_def = r#"(namespaced_identifier) @select"#; - - // Parse to get byte positions - let mut parser = Parser::new(); - parser - .set_language(&tree_sitter_ggsql::language()) - .map_err(|e| GgsqlError::ParseError(format!("Failed to initialise parser: {}", e)))?; - - let tree = parser - .parse(sql, None) - .ok_or_else(|| GgsqlError::ParseError(format!("Failed to parse query: {}", sql)))?; - - let query = Query::new(&tree.language(), token_def) - .map_err(|e| GgsqlError::ParseError(format!("Failed to initialise tree_query: {}", e)))?; - - let index = query - .capture_index_for_name("select") - .ok_or_else(|| GgsqlError::ParseError("Failed to capture index".to_string()))?; - - let mut cursor = tree_sitter::QueryCursor::new(); - let mut matches = cursor.matches(&query, tree.root_node(), sql.as_bytes()); - - // Collect (start_byte, end_byte, replacement) tuples - let mut replacements: Vec<(usize, usize, String)> = Vec::new(); - while let Some(matching) = matches.next() { - for item in matching.captures { - if item.index != index { - continue; - } - let node = item.node; - let full_text = &sql[node.start_byte()..node.end_byte()]; - if let Some(name) = full_text.strip_prefix("ggsql:") { - replacements.push(( - node.start_byte(), - node.end_byte(), - naming::quote_ident(&naming::builtin_data_table(name)), - )); - } - } - } - - if replacements.is_empty() { - return Ok(sql.to_string()); - } - - // Apply replacements in reverse byte order to preserve earlier offsets - let mut result = sql.to_string(); - replacements.sort_by_key(|(start, _, _)| std::cmp::Reverse(*start)); - for (start, end, replacement) in replacements { - result.replace_range(start..end, &replacement); - } - - Ok(result) -} - -// ============================================================================= -// Shared tree-sitter helpers -// ============================================================================= - -fn tokens_from_tree( - sql_query: &str, - tree_query: &str, - name: &str, -) -> Result, GgsqlError> { - // Setup parser - let mut parser = Parser::new(); - if let Err(e) = parser.set_language(&tree_sitter_ggsql::language()) { - return Err(GgsqlError::ParseError(format!( - "Failed to initialise parser: {}", - e - ))); - } - - // Digest SQL to tree - let tree = parser.parse(sql_query, None); - if tree.is_none() { - return Err(GgsqlError::ParseError(format!( - "Failed to parse query: {}", - sql_query - ))); - } - let tree = tree.unwrap(); - - // Setup query for tree - let query = Query::new(&tree.language(), tree_query); - if let Err(e) = query { - return Err(GgsqlError::ParseError(format!( - "Failed to initialise `tree_query`: {}", - e - ))); - } - let query = query.unwrap(); - - // Find `name` in `tree_query` - let index = query.capture_index_for_name(name); - if index.is_none() { - return Err(GgsqlError::ParseError( - "Failed to capture index for `tree_query`".to_string(), - )); - } - let index = index.unwrap(); - - // Find matches of `tree_query` in the parsed tree - let mut cursor = tree_sitter::QueryCursor::new(); - let mut matches = cursor.matches(&query, tree.root_node(), sql_query.as_bytes()); - - // Collect results - let mut result: Vec = Vec::new(); - while let Some(matching) = matches.next() { - for item in matching.captures { - if item.index != index { - // We have a match with a different @keyword than the one defined by `name`. - continue; - } - let node = item.node; - let token = &sql_query[node.start_byte()..node.end_byte()]; - result.push(token.to_string()); - } - } - Ok(result) -} - -// ============================================================================= -// Tests -// ============================================================================= - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_extract_builtin_dataset_names_single() { - let sql = "SELECT * FROM ggsql:penguins VISUALISE DRAW point MAPPING x AS x"; - let names = extract_builtin_dataset_names(sql).unwrap(); - assert_eq!(names, vec!["penguins"]); - } - - #[test] - fn test_extract_builtin_dataset_names_multiple() { - let sql = - "SELECT * FROM ggsql:penguins, ggsql:airquality VISUALISE DRAW point MAPPING x AS x"; - let names = extract_builtin_dataset_names(sql).unwrap(); - assert_eq!(names.len(), 2); - assert!(names.contains(&"airquality".to_string())); - assert!(names.contains(&"penguins".to_string())); - } - - #[test] - fn test_extract_builtin_dataset_names_dedup() { - let sql = "SELECT * FROM ggsql:penguins p1, ggsql:penguins p2 VISUALISE DRAW point MAPPING x AS x"; - let names = extract_builtin_dataset_names(sql).unwrap(); - assert_eq!(names, vec!["penguins"]); - } - - #[test] - fn test_extract_builtin_dataset_names_none() { - let sql = "SELECT * FROM regular_table VISUALISE DRAW point MAPPING x AS x"; - let names = extract_builtin_dataset_names(sql).unwrap(); - assert!(names.is_empty()); - } - - #[test] - fn test_rewrite_namespaced_sql_simple() { - let sql = "SELECT * FROM ggsql:penguins"; - let rewritten = rewrite_namespaced_sql(sql).unwrap(); - assert_eq!(rewritten, "SELECT * FROM \"__ggsql_data_penguins__\""); - } - - #[test] - fn test_rewrite_namespaced_sql_multiple() { - let sql = "SELECT * FROM ggsql:penguins p, ggsql:airquality a WHERE p.id = a.id"; - let rewritten = rewrite_namespaced_sql(sql).unwrap(); - assert_eq!( - rewritten, - "SELECT * FROM \"__ggsql_data_penguins__\" p, \"__ggsql_data_airquality__\" a WHERE p.id = a.id" - ); - } - - #[test] - fn test_rewrite_namespaced_sql_no_change() { - let sql = "SELECT * FROM regular_table WHERE x > 5"; - let rewritten = rewrite_namespaced_sql(sql).unwrap(); - assert_eq!(rewritten, sql); - } - - #[test] - fn test_rewrite_namespaced_sql_with_visualise() { - let sql = "SELECT * FROM ggsql:penguins VISUALISE DRAW point MAPPING bill_len AS x, bill_dep AS y"; - let rewritten = rewrite_namespaced_sql(sql).unwrap(); - assert!(rewritten.starts_with("SELECT * FROM \"__ggsql_data_penguins__\"")); - assert!(!rewritten.contains("ggsql:")); - } -} - #[cfg(all(feature = "duckdb", feature = "builtin-data"))] #[cfg(test)] mod duckdb_tests { diff --git a/src/reader/duckdb.rs b/src/reader/duckdb.rs index 5badc5324..cde53e32e 100644 --- a/src/reader/duckdb.rs +++ b/src/reader/duckdb.rs @@ -25,7 +25,7 @@ use std::sync::Arc; fn register_builtin_datasets_duckdb(sql: &str, conn: &Connection) -> Result<()> { use std::{env, fs}; - let dataset_names = super::data::extract_builtin_dataset_names(sql)?; + let dataset_names = crate::parser::extract_builtin_dataset_names(sql)?; // Load spatial extension before registering datasets that contain // geometry columns, so that spatial features are available. @@ -366,7 +366,7 @@ impl Reader for DuckDBReader { register_builtin_datasets_duckdb(sql, &self.conn)?; // Rewrite ggsql:name → __ggsql_data_name__ in SQL - let sql = super::data::rewrite_namespaced_sql(sql)?; + let sql = crate::parser::rewrite_namespaced_sql(sql)?; if !super::returns_rows(&sql) { self.conn diff --git a/src/reader/sqlite.rs b/src/reader/sqlite.rs index 245c65a96..6df181633 100644 --- a/src/reader/sqlite.rs +++ b/src/reader/sqlite.rs @@ -387,7 +387,7 @@ impl Reader for SqliteReader { // Handle ggsql:name namespaced identifiers (builtin datasets) #[cfg(all(feature = "builtin-data", feature = "parquet"))] { - let dataset_names = super::data::extract_builtin_dataset_names(sql)?; + let dataset_names = crate::parser::extract_builtin_dataset_names(sql)?; for name in &dataset_names { let table_name = naming::builtin_data_table(name); if !self.table_exists(&table_name) { @@ -398,7 +398,7 @@ impl Reader for SqliteReader { } // Rewrite ggsql:name → __ggsql_data_name__ in SQL - let sql = super::data::rewrite_namespaced_sql(sql)?; + let sql = crate::parser::rewrite_namespaced_sql(sql)?; if !super::returns_rows(&sql) { self.conn