Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ palette = { version = "0.7", default-features = false, features = ["std", "appro
# Spatial
geozero = { version = "0.14", default-features = false }

# Online data (download + cache)
ureq = "3"

# Utilities
regex = "1.10"
chrono = "0.4"
Expand Down
21 changes: 19 additions & 2 deletions ggsql-wasm/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -320,8 +320,8 @@ impl GgsqlContext {

/// Register all known builtin datasets (e.g. ggsql:penguins)
pub async fn register_builtin_datasets(&self) -> Result<(), JsValue> {
for &name in ggsql::reader::data::KNOWN_DATASETS {
if let Some(bytes) = ggsql::reader::data::builtin_parquet_bytes(name) {
for &name in ggsql::reader::builtin_data::KNOWN_DATASETS {
if let Some(bytes) = ggsql::reader::builtin_data::builtin_parquet_bytes(name) {
let table_name = ggsql::naming::builtin_data_table(name);
let columns_js = convert_parquet_js(bytes).await.map_err(|e| {
JsValue::from_str(&format!("Parquet error for '{}': {:?}", name, e))
Expand All @@ -336,6 +336,23 @@ impl GgsqlContext {
Ok(())
}

// TODO: implement `register_online_datasets` to support `online:world` etc.
//
// Expected signature:
// pub async fn register_online_datasets(&self, sql: &str) -> Result<(), JsValue>
//
// Implementation:
// 1. Extract names via:
// ggsql::reader::builtin_data::extract_prefixed_dataset_names(sql, "online")
// 2. For each name, skip if already registered (check table_exists on reader).
// 3. Resolve URL via ggsql::reader::online_data::resolve_online_dataset(name).
// 4. Fetch bytes via browser fetch() (wasm_bindgen_futures + web_sys::Request).
// 5. Parse with convert_parquet_js(bytes).await + columns_js_to_dataframe().
// 6. Register under ggsql::naming::online_data_table(name).
//
// Then call this from `execute()` and `execute_sql()` before the reader runs,
// which requires making those methods async (or calling from JS before execute).

/// Load a previously installed SQLite extension.
///
/// `entry_point` is the C init function name. If omitted, SQLite
Expand Down
6 changes: 4 additions & 2 deletions src/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ adbc_core = { version = "0.23", optional = true }
# Spatial
geozero = { workspace = true, optional = true, features = ["with-wkb", "with-wkt", "with-geojson"] }

# Online data (download + cache) — native only, pulled in with parquet
ureq = { workspace = true, optional = true }

# Serialization
serde.workspace = true
serde_json.workspace = true
Expand All @@ -57,14 +60,13 @@ uuid.workspace = true
[dev-dependencies]
jsonschema = { version = "0.44", default-features = false, features = ["resolve-file"] }
tempfile = "3.8"
ureq = "3"
adbc_datafusion = "0.23"
adbc_driver_manager = "0.23"

[features]
default = ["adbc", "duckdb", "sqlite", "vegalite", "parquet", "builtin-data", "odbc", "spatial"]
duckdb = ["dep:duckdb"]
parquet = ["dep:parquet"]
parquet = ["dep:parquet", "dep:ureq"]
sqlite = ["dep:rusqlite"]
adbc = ["dep:adbc_core"]
odbc = ["dep:toml_edit", "dep:libloading"]
Expand Down
20 changes: 20 additions & 0 deletions src/naming.rs
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,26 @@ pub fn builtin_data_table(name: &str) -> String {
format!("{}{}{}", DATA_PREFIX, name, GGSQL_SUFFIX)
}

/// Build the internal table name under which an online dataset is registered.
///
/// This is the identifier an `online:<name>` reference (for example
/// `online:world`) is rewritten to. The result has the shape
/// `__ggsql_online_<name>__`, with every `-` in `name` replaced by `_`.
///
/// # Example
/// ```
/// use ggsql::naming;
/// assert_eq!(naming::online_data_table("world"), "__ggsql_online_world__");
/// assert_eq!(naming::online_data_table("us-counties"), "__ggsql_online_us_counties__");
/// ```
pub fn online_data_table(name: &str) -> String {
    // Dataset names may contain hyphens; the table name uses underscores instead.
    let sanitized = name.replace('-', "_");
    format!("{}online_{}{}", GGSQL_PREFIX, sanitized, GGSQL_SUFFIX)
}

/// Generate column name for a constant aesthetic value.
///
/// Used when a single layer has a literal aesthetic value that needs
Expand Down
101 changes: 72 additions & 29 deletions src/reader/data.rs → src/reader/builtin_data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,36 +64,27 @@ pub fn builtin_parquet_bytes(name: &str) -> Option<&'static [u8]> {
// Arrow-based builtin data loading
// =============================================================================

#[cfg(all(feature = "builtin-data", feature = "parquet"))]
pub fn load_builtin_dataframe(name: &str) -> Result<crate::DataFrame, GgsqlError> {
/// Convert raw parquet bytes into a DataFrame.
#[cfg(feature = "parquet")]
pub fn dataframe_from_parquet_bytes(
label: &str,
bytes: bytes::Bytes,
) -> Result<crate::DataFrame, GgsqlError> {
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;

let parquet_bytes = match name {
"penguins" => PENGUINS,
"airquality" => AIRQUALITY,
"world" => WORLD,
_ => {
return Err(GgsqlError::ReaderError(format!(
"Unknown builtin dataset: '{}'",
name
)))
}
};

let bytes = bytes::Bytes::from_static(parquet_bytes);
let reader = ParquetRecordBatchReaderBuilder::try_new(bytes)
.map_err(|e| {
GgsqlError::ReaderError(format!("Failed to read builtin dataset '{}': {}", name, e))
GgsqlError::ReaderError(format!("Failed to read parquet for '{}': {}", label, e))
})?
.build()
.map_err(|e| {
GgsqlError::ReaderError(format!("Failed to build reader for '{}': {}", name, e))
GgsqlError::ReaderError(format!("Failed to build reader for '{}': {}", label, e))
})?;

let batches: Vec<_> = reader
.collect::<std::result::Result<Vec<_>, _>>()
.map_err(|e| {
GgsqlError::ReaderError(format!("Failed to load builtin dataset '{}': {}", name, e))
GgsqlError::ReaderError(format!("Failed to load dataset '{}': {}", label, e))
})?;

if batches.is_empty() {
Expand All @@ -104,13 +95,30 @@ pub fn load_builtin_dataframe(name: &str) -> Result<crate::DataFrame, GgsqlError
batches.into_iter().next().unwrap()
} else {
arrow::compute::concat_batches(&batches[0].schema(), &batches).map_err(|e| {
GgsqlError::ReaderError(format!("Failed to concat batches for '{}': {}", name, e))
GgsqlError::ReaderError(format!("Failed to concat batches for '{}': {}", label, e))
})?
};

Ok(crate::DataFrame::from_record_batch(rb))
}

/// Load one of the embedded builtin datasets as a DataFrame.
///
/// `name` is matched against the bare dataset names `penguins`,
/// `airquality` and `world`; the matching embedded parquet bytes are handed
/// to `dataframe_from_parquet_bytes`, with `name` used as the error label.
///
/// # Errors
/// Returns `GgsqlError::ReaderError` when `name` is not one of the names
/// above, and propagates any error from `dataframe_from_parquet_bytes`.
#[cfg(all(feature = "builtin-data", feature = "parquet"))]
pub fn load_builtin_dataframe(name: &str) -> Result<crate::DataFrame, GgsqlError> {
    // Look up the embedded bytes first, then handle the unknown-name case once.
    let embedded = match name {
        "penguins" => Some(PENGUINS),
        "airquality" => Some(AIRQUALITY),
        "world" => Some(WORLD),
        _ => None,
    };

    let Some(parquet_bytes) = embedded else {
        return Err(GgsqlError::ReaderError(format!(
            "Unknown builtin dataset: '{}'",
            name
        )));
    };

    dataframe_from_parquet_bytes(name, bytes::Bytes::from_static(parquet_bytes))
}

/// Known builtin dataset names in the ggsql namespace.
///
/// NOTE(review): `load_builtin_dataframe` matches on this same set of names;
/// keep the two in sync when adding or removing a dataset.
pub const KNOWN_DATASETS: &[&str] = &["penguins", "airquality", "world"];

Expand All @@ -123,11 +131,11 @@ pub fn is_known_builtin(name: &str) -> bool {
// SQL namespace rewriting (always available, including WASM)
// =============================================================================

/// Extract builtin dataset names from SQL containing namespaced identifiers.
/// Extract dataset names with a given namespace prefix from SQL.
///
/// Finds `ggsql:X` patterns via tree-sitter and returns the dataset names
/// (without the `ggsql:` prefix), deduplicated.
pub fn extract_builtin_dataset_names(sql: &str) -> Result<Vec<String>, GgsqlError> {
/// Finds `prefix:X` patterns via tree-sitter and returns the dataset names
/// (without the prefix), deduplicated.
pub fn extract_prefixed_dataset_names(sql: &str, prefix: &str) -> Result<Vec<String>, GgsqlError> {
let token_def = r#"(namespaced_identifier) @select"#;
let mut tokens = tokens_from_tree(sql, token_def, "select")?;

Expand All @@ -138,17 +146,20 @@ pub fn extract_builtin_dataset_names(sql: &str) -> Result<Vec<String>, GgsqlErro
tokens.sort_unstable();
tokens.dedup();

let prefix_colon = format!("{}:", prefix);
let datasets: Vec<String> = tokens
.iter()
.filter_map(|token| token.strip_prefix("ggsql:").map(|s| s.to_string()))
.filter_map(|token| token.strip_prefix(&prefix_colon).map(|s| s.to_string()))
.collect();

Ok(datasets)
}

/// Rewrite SQL to replace namespaced identifiers with internal table names.
///
/// e.g., `SELECT * FROM ggsql:penguins` -> `SELECT * FROM __ggsql_data_penguins__`
/// Handles both builtin and online dataset prefixes:
/// - `ggsql:penguins` -> `__ggsql_data_penguins__`
/// - `online:world` -> `__ggsql_online_world__`
///
/// Uses tree-sitter to find the exact byte positions of namespaced identifiers,
/// then replaces them in reverse order to preserve offsets.
Expand Down Expand Up @@ -190,6 +201,12 @@ pub fn rewrite_namespaced_sql(sql: &str) -> Result<String, GgsqlError> {
node.end_byte(),
naming::quote_ident(&naming::builtin_data_table(name)),
));
} else if let Some(name) = full_text.strip_prefix("online:") {
replacements.push((
node.start_byte(),
node.end_byte(),
naming::quote_ident(&naming::online_data_table(name)),
));
}
}
}
Expand Down Expand Up @@ -286,15 +303,15 @@ mod tests {
#[test]
fn test_extract_builtin_dataset_names_single() {
    // A single `ggsql:` reference yields exactly that dataset name, with the
    // prefix stripped. Only the prefix-parameterised extractor is called; the
    // earlier call to the pre-rename function was dead (immediately shadowed).
    let sql = "SELECT * FROM ggsql:penguins VISUALISE DRAW point MAPPING x AS x";
    let names = extract_prefixed_dataset_names(sql, "ggsql").unwrap();
    assert_eq!(names, vec!["penguins"]);
}

#[test]
fn test_extract_builtin_dataset_names_multiple() {
let sql =
"SELECT * FROM ggsql:penguins, ggsql:airquality VISUALISE DRAW point MAPPING x AS x";
let names = extract_builtin_dataset_names(sql).unwrap();
let names = extract_prefixed_dataset_names(sql, "ggsql").unwrap();
assert_eq!(names.len(), 2);
assert!(names.contains(&"airquality".to_string()));
assert!(names.contains(&"penguins".to_string()));
Expand All @@ -303,14 +320,14 @@ mod tests {
#[test]
fn test_extract_builtin_dataset_names_dedup() {
    // The same dataset referenced twice (under two aliases) must be reported
    // once. Only the prefix-parameterised extractor is called; the earlier call
    // to the pre-rename function was dead (immediately shadowed).
    let sql = "SELECT * FROM ggsql:penguins p1, ggsql:penguins p2 VISUALISE DRAW point MAPPING x AS x";
    let names = extract_prefixed_dataset_names(sql, "ggsql").unwrap();
    assert_eq!(names, vec!["penguins"]);
}

#[test]
fn test_extract_builtin_dataset_names_none() {
    // SQL without any namespaced identifier produces an empty list. Only the
    // prefix-parameterised extractor is called; the earlier call to the
    // pre-rename function was dead (immediately shadowed).
    let sql = "SELECT * FROM regular_table VISUALISE DRAW point MAPPING x AS x";
    let names = extract_prefixed_dataset_names(sql, "ggsql").unwrap();
    assert!(names.is_empty());
}

Expand Down Expand Up @@ -345,6 +362,32 @@ mod tests {
assert!(rewritten.starts_with("SELECT * FROM \"__ggsql_data_penguins__\""));
assert!(!rewritten.contains("ggsql:"));
}

#[test]
fn test_rewrite_online_dataset() {
    // A bare `online:` reference is replaced by the quoted internal table name.
    let rewritten = rewrite_namespaced_sql("SELECT * FROM online:world").unwrap();
    assert_eq!(rewritten, "SELECT * FROM \"__ggsql_online_world__\"");
}

#[test]
fn test_rewrite_mixed_namespaces() {
    // JOIN not yet supported by tree-sitter grammar for namespaced identifiers;
    // comma-separated FROM works.
    let rewritten =
        rewrite_namespaced_sql("SELECT * FROM ggsql:penguins, online:world").unwrap();

    // Both references are replaced by their quoted internal table names...
    assert!(rewritten.contains("\"__ggsql_data_penguins__\""));
    assert!(rewritten.contains("\"__ggsql_online_world__\""));

    // ...and neither namespace prefix survives the rewrite.
    assert!(!rewritten.contains("ggsql:"));
    assert!(!rewritten.contains("online:"));
}

#[test]
fn test_extract_online_dataset_names() {
    // Extraction with the `online` prefix returns the name without the prefix.
    let query = "SELECT * FROM online:world VISUALISE DRAW polygon MAPPING geom AS geometry";
    let found = extract_prefixed_dataset_names(query, "online").unwrap();
    assert_eq!(found, vec!["world"]);
}
}

#[cfg(all(feature = "duckdb", feature = "builtin-data"))]
Expand Down
19 changes: 16 additions & 3 deletions src/reader/duckdb.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ use std::sync::Arc;
fn register_builtin_datasets_duckdb(sql: &str, conn: &Connection) -> Result<()> {
use std::{env, fs};

let dataset_names = super::data::extract_builtin_dataset_names(sql)?;
let dataset_names = super::builtin_data::extract_prefixed_dataset_names(sql, "ggsql")?;

// Load spatial extension before registering datasets that contain
// geometry columns, so that spatial features are available.
Expand All @@ -34,7 +34,7 @@ fn register_builtin_datasets_duckdb(sql: &str, conn: &Connection) -> Result<()>
}

for name in dataset_names {
let Some(parquet_bytes) = super::data::builtin_parquet_bytes(&name) else {
let Some(parquet_bytes) = super::builtin_data::builtin_parquet_bytes(&name) else {
continue;
};

Expand Down Expand Up @@ -355,8 +355,21 @@ impl Reader for DuckDBReader {
#[cfg(feature = "builtin-data")]
register_builtin_datasets_duckdb(sql, &self.conn)?;

// Register online datasets if referenced
#[cfg(feature = "parquet")]
{
let online_names = super::builtin_data::extract_prefixed_dataset_names(sql, "online")?;
for name in &online_names {
let table_name = naming::online_data_table(name);
if !self.table_exists(&table_name)? {
let df = super::online_data::load_online_dataframe(name)?;
self.register(&table_name, df, true)?;
}
}
}

// Rewrite ggsql:name → __ggsql_data_name__ and online:name → __ggsql_online_name__ in SQL
let sql = super::data::rewrite_namespaced_sql(sql)?;
let sql = super::builtin_data::rewrite_namespaced_sql(sql)?;

if !super::returns_rows(&sql) {
self.conn
Expand Down
3 changes: 2 additions & 1 deletion src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -429,8 +429,9 @@ pub mod odbc;
#[cfg(feature = "adbc")]
pub mod adbc;

pub mod builtin_data;
pub mod connection;
pub mod data;
pub mod online_data;
mod spec;

#[cfg(feature = "duckdb")]
Expand Down
Loading
Loading