diff --git a/Cargo.lock b/Cargo.lock index 590a06fa2..9021700fb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2967,6 +2967,7 @@ dependencies = [ "alloy", "amp-data-store", "amp-datasets-registry", + "amp-object-store", "amp-providers-registry", "arrow", "async-stream", @@ -2983,22 +2984,26 @@ dependencies = [ "evm-rpc-datasets", "firehose-datasets", "futures", + "indoc", "itertools 0.14.0", "js-runtime", "metadata-db", "object_store", "parking_lot", + "pgtemp", "rand 0.9.2", "regex", "serde", "serde_json", "solana-datasets", + "tempfile", "thiserror 2.0.18", "tokio", "tokio-stream", "tokio-util", "tracing", "url", + "uuid", ] [[package]] @@ -10016,7 +10021,6 @@ dependencies = [ "datafusion", "datasets-common", "futures", - "js-runtime", "metadata-db", "monitoring", "prost", diff --git a/crates/core/common/Cargo.toml b/crates/core/common/Cargo.toml index c7df7541b..2c68e7b7f 100644 --- a/crates/core/common/Cargo.toml +++ b/crates/core/common/Cargo.toml @@ -40,6 +40,11 @@ tokio-stream.workspace = true tokio-util.workspace = true tracing.workspace = true url.workspace = true +uuid.workspace = true [dev-dependencies] +amp-object-store = { path = "../object-store" } chrono.workspace = true +indoc.workspace = true +pgtemp.workspace = true +tempfile.workspace = true diff --git a/crates/core/common/src/amp_catalog_provider.rs b/crates/core/common/src/amp_catalog_provider.rs new file mode 100644 index 000000000..5ae810e29 --- /dev/null +++ b/crates/core/common/src/amp_catalog_provider.rs @@ -0,0 +1,214 @@ +//! Catalog provider for datasets. +//! +//! Resolves datasets at the catalog level and creates +//! [`DatasetSchemaProvider`] instances for schema-only table access. 
+ +use std::{any::Any, collections::BTreeMap, sync::Arc}; + +use async_trait::async_trait; +use datafusion::{ + catalog::{ + AsyncCatalogProvider as TableAsyncCatalogProvider, + AsyncSchemaProvider as TableAsyncSchemaProvider, CatalogProvider as TableCatalogProvider, + SchemaProvider as TableSchemaProvider, + }, + error::DataFusionError, +}; +use datasets_common::{ + hash_reference::HashReference, partial_reference::PartialReference, reference::Reference, +}; +use datasets_derived::deps::SELF_REF_KEYWORD; +use js_runtime::isolate_pool::IsolatePool; + +use crate::{ + dataset_schema_provider::DatasetSchemaProvider, + dataset_store::DatasetStore, + func_catalog::{ + catalog_provider::{ + AsyncCatalogProvider as FuncAsyncCatalogProvider, + CatalogProvider as FuncCatalogProvider, + }, + schema_provider::{ + AsyncSchemaProvider as FuncAsyncSchemaProvider, SchemaProvider as FuncSchemaProvider, + }, + }, +}; + +/// Combined async schema provider for both tables and functions. +/// +/// Blanket-implemented for any type that implements both +/// [`TableAsyncSchemaProvider`] and [`FuncAsyncSchemaProvider`]. +pub trait AsyncSchemaProvider: TableAsyncSchemaProvider + FuncAsyncSchemaProvider {} +impl AsyncSchemaProvider for T {} + +/// The catalog name used to register Amp dataset providers. +pub const AMP_CATALOG_NAME: &str = "amp"; + +/// Catalog provider for datasets. +/// +/// Resolves datasets and creates [`DatasetSchemaProvider`] instances +/// that provide schema information without requiring a data store. +#[derive(Clone)] +pub struct AmpCatalogProvider { + store: DatasetStore, + isolate_pool: IsolatePool, + /// Optional dependency alias overrides. When set, bare names matching + /// a key are resolved directly to the corresponding [`HashReference`] + /// instead of going through `PartialReference` → `Reference` → `resolve_revision`. + dep_aliases: BTreeMap, + /// Optional self-schema provider for the `"self"` virtual schema. 
+ /// Checked before store-based resolution in [`resolve_schema`]. + self_schema: Option>, +} + +impl AmpCatalogProvider { + /// Creates a new catalog provider. + pub fn new(store: DatasetStore, isolate_pool: IsolatePool) -> Self { + Self { + store, + isolate_pool, + dep_aliases: Default::default(), + self_schema: None, + } + } + + /// Sets the dependency alias map for direct resolution of bare names. + pub fn with_dep_aliases(mut self, dep_aliases: BTreeMap) -> Self { + self.dep_aliases = dep_aliases; + self + } + + /// Sets the self-schema provider for the `"self"` virtual schema. + /// + /// The self-schema is checked before store-based resolution in + /// [`resolve_schema`]. It doesn't correspond to a dataset in the store + /// but still needs to resolve tables and functions (e.g., during dump). + pub fn with_self_schema(mut self, provider: Arc) -> Self { + self.self_schema = Some(provider); + self + } + + /// Resolves a schema name to an [`AsyncSchemaProvider`]. + /// + /// Resolution order: + /// 1. Self-schema provider (for the `"self"` virtual schema) + /// 2. Dep alias overrides (pinned hash, no store resolution) + /// 3. Store lookup via `PartialReference` → `Reference` → `resolve_revision` + /// + /// Returns `Ok(None)` when the name doesn't match any provider. + /// Only actual I/O or storage errors produce `Err(...)`. + pub(crate) async fn resolve_schema( + &self, + name: &str, + ) -> Result>, DataFusionError> { + // 1. Self-schema (e.g., "self" for self fn/table refs during dump). + if name == SELF_REF_KEYWORD + && let Some(provider) = &self.self_schema + { + return Ok(Some(provider.clone())); + } + + // 2. Dep alias overrides — pinned hash, no store resolution needed. 
+ if let Some(hash_ref) = self.dep_aliases.get(name) { + let dataset = self + .store + .get_dataset(hash_ref) + .await + .map_err(|err| DataFusionError::External(Box::new(err)))?; + + let provider: Arc = Arc::new(DatasetSchemaProvider::new( + name.to_string(), + dataset, + self.store.clone(), + self.isolate_pool.clone(), + )); + return Ok(Some(provider)); + } + + // 3. Store lookup via PartialReference → Reference → resolve_revision. + let Ok(partial_ref) = name.parse::() else { + return Ok(None); + }; + + let reference: Reference = partial_ref.into(); + + let Some(hash_ref) = self + .store + .resolve_revision(&reference) + .await + .map_err(|err| DataFusionError::External(Box::new(err)))? + else { + return Ok(None); + }; + + let dataset = self + .store + .get_dataset(&hash_ref) + .await + .map_err(|err| DataFusionError::External(Box::new(err)))?; + + let provider: Arc = Arc::new(DatasetSchemaProvider::new( + name.to_string(), + dataset, + self.store.clone(), + self.isolate_pool.clone(), + )); + Ok(Some(provider)) + } +} + +impl std::fmt::Debug for AmpCatalogProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("AmpCatalogProvider").finish() + } +} + +impl TableCatalogProvider for AmpCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + vec![] + } + + fn schema(&self, _name: &str) -> Option> { + None + } +} + +#[async_trait] +impl TableAsyncCatalogProvider for AmpCatalogProvider { + async fn schema( + &self, + name: &str, + ) -> Result>, DataFusionError> { + let schema = self.resolve_schema(name).await?; + Ok(schema.map(|s| s as _)) + } +} + +impl FuncCatalogProvider for AmpCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + vec![] + } + + fn schema(&self, _name: &str) -> Option> { + None + } +} + +#[async_trait] +impl FuncAsyncCatalogProvider for AmpCatalogProvider { + async fn schema( + &self, + name: &str, + ) -> Result>, 
DataFusionError> { + let schema = self.resolve_schema(name).await?; + Ok(schema.map(|s| s as _)) + } +} diff --git a/crates/core/common/src/catalog/logical.rs b/crates/core/common/src/catalog/logical.rs index ee13a67c6..49df07ecc 100644 --- a/crates/core/common/src/catalog/logical.rs +++ b/crates/core/common/src/catalog/logical.rs @@ -1,25 +1,3 @@ -use datafusion::logical_expr::ScalarUDF; - -pub mod for_admin_api; -pub mod for_dump; -pub mod for_manifest_validation; -pub mod for_query; pub mod table; pub use table::LogicalTable; - -#[derive(Clone, Debug)] -pub struct LogicalCatalog { - pub tables: Vec, - /// UDFs specific to the datasets corresponding to the resolved tables. - pub udfs: Vec, -} - -impl LogicalCatalog { - pub fn from_tables<'a>(tables: impl Iterator) -> Self { - Self { - tables: tables.cloned().collect(), - udfs: Vec::new(), - } - } -} diff --git a/crates/core/common/src/catalog/logical/for_admin_api.rs b/crates/core/common/src/catalog/logical/for_admin_api.rs deleted file mode 100644 index 4458352d9..000000000 --- a/crates/core/common/src/catalog/logical/for_admin_api.rs +++ /dev/null @@ -1,421 +0,0 @@ -//! Schema inference catalog construction for derived dataset validation -//! -//! This module provides catalog creation for schema inference when validating -//! derived dataset manifests via the admin API. -//! -//! ## Key Functions -//! -//! - [`create`] - Creates a LogicalCatalog for SQL validation -//! - [`resolve_tables_with_deps`] - Resolves table references using pre-resolved dependencies -//! 
- [`resolve_udfs_with_deps`] - Resolves function references to ScalarUDF instances - -use std::{ - collections::{BTreeMap, btree_map::Entry}, - sync::Arc, -}; - -use datafusion::logical_expr::{ScalarUDF, async_udf::AsyncScalarUDF}; -use datasets_common::{hash::Hash, hash_reference::HashReference, table_name::TableName}; -use datasets_derived::{ - dataset::Dataset as DerivedDataset, - deps::{DepAlias, DepAliasOrSelfRef, SELF_REF_KEYWORD}, - func_name::{ETH_CALL_FUNCTION_NAME, FuncName}, - manifest::Function, -}; -use js_runtime::{isolate_pool::IsolatePool, js_udf::JsUdf}; - -use crate::{ - catalog::logical::{LogicalCatalog, LogicalTable}, - dataset_store::{DatasetStore, EthCallForDatasetError, GetDatasetError}, - sql::{FunctionReference, TableReference}, -}; - -/// Map of table names to (table references, function references) -pub type TableReferencesMap = BTreeMap< - TableName, - ( - Vec>, - Vec>, - ), ->; - -/// Creates a LogicalCatalog for SQL validation with pre-resolved dependencies. -/// -/// This function is used during derived dataset manifest validation to create a logical -/// catalog that can validate SQL queries against specific dataset versions. -/// -/// ## Precondition -/// -/// All dependency aliases referenced in `refs` (table and function references) must exist as keys -/// in `manifest_deps`. Callers must validate this before calling `create`; violation will panic. -/// -/// ## Process -/// -/// 1. Flattens table references from the references map -/// 2. Resolves all table references to LogicalTable instances -/// 3. Flattens function references from the references map -/// 4. Resolves all function references to ScalarUDF instances -/// 5. 
Creates and returns a LogicalCatalog -/// -/// ## Related Functions -/// -/// - [`resolve_tables`] - Resolves table references to LogicalTable instances -/// - [`resolve_udfs`] - Resolves function references to UDFs -pub async fn create( - dataset_store: &DatasetStore, - isolate_pool: IsolatePool, - manifest_deps: BTreeMap, - manifest_udfs: BTreeMap, - refs: TableReferencesMap, -) -> Result { - let table_refs: Vec<_> = refs - .iter() - .flat_map(|(name, (table_refs, _))| { - table_refs.iter().map(move |table_ref| (name, table_ref)) - }) - .collect(); - - let tables = resolve_tables(dataset_store, &manifest_deps, table_refs) - .await - .map_err(CreateLogicalCatalogError::ResolveTables)?; - - let func_refs: Vec<_> = refs - .iter() - .flat_map(|(name, (_, func_refs))| func_refs.iter().map(move |func_ref| (name, func_ref))) - .collect(); - - let udfs = resolve_udfs( - dataset_store, - isolate_pool, - &manifest_deps, - &manifest_udfs, - func_refs, - ) - .await - .map_err(CreateLogicalCatalogError::ResolveUdfs)?; - - Ok(LogicalCatalog { tables, udfs }) -} - -#[derive(Debug, thiserror::Error)] -pub enum CreateLogicalCatalogError { - /// Failed to resolve table references to LogicalTable instances - #[error(transparent)] - ResolveTables(ResolveTablesError), - - /// Failed to resolve function references to UDF instances - #[error(transparent)] - ResolveUdfs(ResolveUdfsError), -} - -/// Resolves table references to LogicalTable instances using pre-resolved dependencies. -/// -/// Processes each table reference across all tables, looks up datasets by hash, finds tables -/// within datasets, and creates LogicalTable instances for catalog construction. 
-async fn resolve_tables<'a>( - dataset_store: &DatasetStore, - manifest_deps: &BTreeMap, - refs: impl IntoIterator)> + 'a, -) -> Result, ResolveTablesError> { - // Use hash-based map to deduplicate datasets across ALL tables - // Inner map: table_ref -> LogicalTable (deduplicates table references) - let mut tables: BTreeMap, LogicalTable>> = - BTreeMap::new(); - - // Process all table references - fail fast on first error - for (table_name, table_ref) in refs { - match table_ref { - TableReference::Bare { .. } => { - return Err(ResolveTablesError::UnqualifiedTable { - table_name: table_name.clone(), - table_ref: table_ref.to_string(), - }); - } - TableReference::Partial { schema, table } => { - // Schema is already parsed as DepAlias, lookup in dependencies map - let dataset_ref = manifest_deps - .get(schema.as_ref()) - .expect("dep alias validated before catalog creation"); - - // Skip if table reference is already resolved (optimization to avoid redundant dataset loading) - let Entry::Vacant(entry) = tables - .entry(dataset_ref.hash().clone()) - .or_default() - .entry(table_ref.clone()) - else { - continue; - }; - - // Load dataset by hash (cached by dataset_store) - let dataset = dataset_store - .get_dataset(dataset_ref) - .await - .map_err(|err| ResolveTablesError::GetDataset { - table_name: table_name.clone(), - reference: dataset_ref.clone(), - source: err, - })?; - - // Find table in dataset - let dataset_table = dataset - .tables() - .iter() - .find(|t| t.name() == table) - .ok_or_else(|| ResolveTablesError::TableNotFoundInDataset { - table_name: table_name.clone(), - referenced_table_name: table.as_ref().clone(), - reference: dataset_ref.clone(), - })?; - - let resolved_table = LogicalTable::new( - schema.to_string(), - dataset_ref.clone(), - dataset_table.clone(), - ); - - // Insert into vacant entry - entry.insert(resolved_table); - } - } - } - - // Flatten to Vec - Ok(tables - .into_values() - .flat_map(|map| map.into_values()) - .collect()) -} - -/// 
Resolves function references to ScalarUDF instances using pre-resolved dependencies. -/// -/// Processes each function reference across all tables: -/// - For external dependencies (dep.function): loads dataset and retrieves UDF -/// - For self-references (self.function): creates JsUdf from the manifest's function definition -/// - Skips bare functions (built-in DataFusion/Amp functions) -async fn resolve_udfs<'a>( - dataset_store: &DatasetStore, - isolate_pool: IsolatePool, - manifest_deps: &BTreeMap, - manifest_udfs: &BTreeMap, - refs: impl IntoIterator)> + 'a, -) -> Result, ResolveUdfsError> { - // Track UDFs from external dependencies - outer key: dataset hash, inner key: function reference - // Inner map ensures deduplication: multiple function references to the same UDF share one instance - let mut udfs: BTreeMap, ScalarUDF>> = - BTreeMap::new(); - // Track UDFs defined in this manifest (bare functions and self-references) - separate from dependency functions - // Ensures deduplication: multiple references to the same function share one instance - let mut self_udfs: BTreeMap, ScalarUDF> = BTreeMap::new(); - - // Process all function references - fail fast on first error - for (table_name, func_ref) in refs { - match func_ref { - // Skip bare functions - they are assumed to be built-in functions (Amp or DataFusion) - FunctionReference::Bare { function: _ } => { - continue; - } - FunctionReference::Qualified { schema, function } => { - // Match on schema type: DepAlias (external dependency) or SelfRef (same-dataset function) - match schema.as_ref() { - DepAliasOrSelfRef::DepAlias(dep_alias) => { - // External dependency reference - lookup in dependencies map - let dataset_ref = manifest_deps - .get(dep_alias) - .expect("dep alias validated before catalog creation"); - - // Check vacancy BEFORE loading dataset - let Entry::Vacant(entry) = udfs - .entry(dataset_ref.hash().clone()) - .or_default() - .entry(func_ref.clone()) - else { - continue; - }; - - // Only 
load dataset if UDF not already resolved (cached by dataset_store) - let dataset = - dataset_store - .get_dataset(dataset_ref) - .await - .map_err(|err| ResolveUdfsError::GetDataset { - table_name: table_name.clone(), - reference: dataset_ref.clone(), - source: err, - })?; - - // Get the UDF for this function reference - let udf = if function.as_ref() == ETH_CALL_FUNCTION_NAME { - dataset_store - .eth_call_for_dataset(&schema.to_string(), dataset.as_ref()) - .await - .map_err(|err| ResolveUdfsError::EthCallUdfCreation { - table_name: table_name.clone(), - reference: dataset_ref.clone(), - source: err, - })? - .ok_or_else(|| ResolveUdfsError::EthCallNotAvailable { - table_name: table_name.clone(), - reference: dataset_ref.clone(), - })? - } else { - dataset - .downcast_ref::() - .and_then(|d| { - d.function_by_name( - schema.to_string(), - function, - IsolatePool::dummy(), - ) - }) - .ok_or_else(|| ResolveUdfsError::FunctionNotFoundInDataset { - table_name: table_name.clone(), - function_name: (**function).clone(), - reference: dataset_ref.clone(), - })? 
- }; - - entry.insert(udf); - } - DepAliasOrSelfRef::SelfRef => { - // Same-dataset function reference (self.function_name) - // Look up function in the functions map (defined in this dataset) - let func_def = manifest_udfs.get(function).ok_or_else(|| { - ResolveUdfsError::SelfReferencedFunctionNotFound { - table_name: table_name.clone(), - function_name: (**function).clone(), - } - })?; - - // Skip if function reference is already resolved (optimization) - let Entry::Vacant(entry) = self_udfs.entry(func_ref.clone()) else { - continue; - }; - - // Create UDF from Function definition using JsUdf - // Use "self" as schema qualifier to preserve case sensitivity - let udf = AsyncScalarUDF::new(Arc::new(JsUdf::new( - isolate_pool.clone(), - Some(SELF_REF_KEYWORD.to_string()), // Schema = "self" - func_def.source.source.clone(), - func_def.source.filename.clone().into(), - Arc::from(function.as_ref().as_str()), - func_def - .input_types - .iter() - .map(|dt| dt.clone().into_arrow()) - .collect(), - func_def.output_type.clone().into_arrow(), - ))) - .into_scalar_udf(); - - entry.insert(udf); - } - } - } - } - } - - // Flatten to Vec - Ok(self_udfs - .into_values() - .chain(udfs.into_values().flat_map(|map| map.into_values())) - .collect()) -} - -#[derive(Debug, thiserror::Error)] -pub enum ResolveTablesError { - /// Table is not qualified with a schema/dataset name - #[error( - "In table '{table_name}': Unqualified table '{table_ref}', all tables must be qualified with a dataset" - )] - UnqualifiedTable { - /// The table being processed when the error occurred - table_name: TableName, - /// The unqualified table reference string - table_ref: String, - }, - - /// Failed to retrieve dataset from store when loading dataset for table reference - #[error("In table '{table_name}': Failed to retrieve dataset '{reference}'")] - GetDataset { - /// The table being processed when the error occurred - table_name: TableName, - /// The hash reference of the dataset that failed to load 
- reference: HashReference, - #[source] - source: GetDatasetError, - }, - - /// Table not found in dataset - #[error( - "In table '{table_name}': Table '{referenced_table_name}' not found in dataset '{reference}'" - )] - TableNotFoundInDataset { - /// The table being processed when the error occurred - table_name: TableName, - /// The name of the table that was not found in the dataset - referenced_table_name: TableName, - /// The hash reference of the dataset where the table was not found - reference: HashReference, - }, -} - -#[derive(Debug, thiserror::Error)] -pub enum ResolveUdfsError { - /// Failed to retrieve dataset from store when loading dataset for function - #[error("In table '{table_name}': Failed to retrieve dataset '{reference}' for function")] - GetDataset { - /// The table being processed when the error occurred - table_name: TableName, - /// The hash reference of the dataset that failed to load - reference: HashReference, - #[source] - source: GetDatasetError, - }, - - /// Failed to create ETH call UDF for dataset referenced in function name - #[error( - "In table '{table_name}': Failed to create ETH call UDF for dataset '{reference}' for function" - )] - EthCallUdfCreation { - /// The table being processed when the error occurred - table_name: TableName, - /// The hash reference of the dataset for which the eth_call UDF creation failed - reference: HashReference, - #[source] - source: EthCallForDatasetError, - }, - - /// eth_call function not available for dataset - #[error("In table '{table_name}': Function 'eth_call' not available for dataset '{reference}'")] - EthCallNotAvailable { - /// The table being processed when the error occurred - table_name: TableName, - /// The hash reference of the dataset that does not support eth_call - reference: HashReference, - }, - - /// Function not found in dataset - #[error( - "In table '{table_name}': Function '{function_name}' not found in dataset '{reference}'" - )] - FunctionNotFoundInDataset { - /// The 
table being processed when the error occurred - table_name: TableName, - /// The name of the function that was not found - function_name: FuncName, - /// The hash reference of the dataset where the function was not found - reference: HashReference, - }, - - /// Self-referenced function not found in manifest's functions map. - #[error( - "In table '{table_name}': Self-referenced function '{function_name}' not found in manifest functions" - )] - SelfReferencedFunctionNotFound { - /// The table containing the SQL query with the invalid reference - table_name: TableName, - /// The function name that was referenced but not defined - function_name: FuncName, - }, -} diff --git a/crates/core/common/src/catalog/logical/for_dump.rs b/crates/core/common/src/catalog/logical/for_dump.rs deleted file mode 100644 index 9e14ad84f..000000000 --- a/crates/core/common/src/catalog/logical/for_dump.rs +++ /dev/null @@ -1,402 +0,0 @@ -//! Derived dataset logical catalog construction with pre-resolved dependencies. -//! -//! This module creates LogicalCatalog for derived dataset SQL validation. -//! Uses static dependency resolution (DepAlias → Hash mappings) for deterministic execution. - -use std::{ - collections::{BTreeMap, btree_map::Entry}, - sync::Arc, -}; - -use datafusion::logical_expr::{ScalarUDF, async_udf::AsyncScalarUDF}; -use datasets_common::{hash::Hash, hash_reference::HashReference, table_name::TableName}; -use datasets_derived::{ - dataset::Dataset as DerivedDataset, - deps::{DepAlias, DepAliasOrSelfRef, SELF_REF_KEYWORD}, - func_name::{ETH_CALL_FUNCTION_NAME, FuncName}, - manifest::Function, -}; -use js_runtime::{isolate_pool::IsolatePool, js_udf::JsUdf}; - -use crate::{ - catalog::logical::{LogicalCatalog, LogicalTable}, - dataset_store::{DatasetStore, EthCallForDatasetError, GetDatasetError}, - sql::{FunctionReference, TableReference}, -}; - -/// Resolved SQL references tuple (table refs, function refs) for derived dataset execution. 
-pub type ResolvedReferences = ( - Vec>, - Vec>, -); - -/// Creates a logical catalog for derived dataset SQL validation without physical data access. -/// -/// This function builds a logical catalog with schemas only, enabling query plan generation -/// and schema inference without accessing physical parquet files. -/// -/// ## Precondition -/// -/// All dependency aliases referenced in `refs` (table and function references) must exist as keys -/// in `manifest_deps`. Callers must validate this before calling `create`; violation will panic. -/// -/// ## Where Used -/// -/// This function is used during derived dataset dump execution when only logical validation -/// is needed (such as during query planning phases). -/// -/// ## Implementation -/// -/// The function: -/// 1. Destructures the references tuple into table and function references -/// 2. Resolves table references to LogicalTable instances using pre-resolved dependencies -/// 3. Resolves function references to ScalarUDF instances -/// 4. Returns a LogicalCatalog containing tables and UDFs -pub async fn create( - dataset_store: &DatasetStore, - isolate_pool: &IsolatePool, - manifest_deps: &BTreeMap, - manifest_udfs: &BTreeMap, - refs: ResolvedReferences, -) -> Result { - let (table_refs, func_refs) = refs; - - let tables = resolve_tables(dataset_store, manifest_deps, table_refs) - .await - .map_err(CreateCatalogError::ResolveTables)?; - let udfs = resolve_udfs( - dataset_store, - isolate_pool, - manifest_deps, - manifest_udfs, - func_refs, - ) - .await - .map_err(CreateCatalogError::ResolveUdfs)?; - - Ok(LogicalCatalog { tables, udfs }) -} - -/// Resolves table references to LogicalTable instances using pre-resolved dependencies. -/// -/// Processes each table reference, looks up the dataset by hash, finds the table -/// within the dataset, and creates a LogicalTable for catalog construction. 
-async fn resolve_tables( - dataset_store: &DatasetStore, - manifest_deps: &BTreeMap, - refs: impl IntoIterator>, -) -> Result, ResolveTablesError> { - // Use hash-based map to deduplicate datasets and collect resolved tables - // Inner map: table_ref -> LogicalTable (deduplicates table references) - let mut tables: BTreeMap, LogicalTable>> = - BTreeMap::new(); - - for table_ref in refs { - match &table_ref { - TableReference::Bare { .. } => { - return Err(ResolveTablesError::UnqualifiedTable { - table_ref: table_ref.to_string(), - }); - } - TableReference::Partial { schema, table } => { - // Schema is already parsed as DepAlias, lookup in dependencies map - let dataset_ref = manifest_deps - .get(schema.as_ref()) - .expect("dep alias validated before catalog creation"); - - // Skip if table reference is already resolved (optimization to avoid redundant dataset loading) - let Entry::Vacant(entry) = tables - .entry(dataset_ref.hash().clone()) - .or_default() - .entry(table_ref.clone()) - else { - continue; - }; - - // Load dataset by hash (cached by dataset_store) - let dataset = dataset_store - .get_dataset(dataset_ref) - .await - .map_err(|err| ResolveTablesError::GetDataset { - reference: dataset_ref.clone(), - source: err, - })?; - - // Find table in dataset - let dataset_table = dataset - .tables() - .iter() - .find(|t| t.name() == table) - .ok_or_else(|| ResolveTablesError::TableNotFoundInDataset { - table_name: table.as_ref().clone(), - reference: dataset_ref.clone(), - })?; - - // Create LogicalTable - let resolved_table = LogicalTable::new( - schema.to_string(), - dataset_ref.clone(), - dataset_table.clone(), - ); - - // Insert into vacant entry - entry.insert(resolved_table); - } - } - } - - // Flatten to Vec - Ok(tables - .into_values() - .flat_map(|map| map.into_values()) - .collect()) -} - -/// Resolves function references to ScalarUDF instances using pre-resolved dependencies. 
-/// -/// Processes each function reference: -/// - For external dependencies (dep.function): loads dataset and retrieves UDF -/// - For self-references (self.function): creates JsUdf from the manifest's function definition -/// - Skips bare functions (built-in DataFusion/Amp functions) -async fn resolve_udfs( - dataset_store: &DatasetStore, - isolate_pool: &IsolatePool, - manifest_deps: &BTreeMap, - manifest_udfs: &BTreeMap, - refs: impl IntoIterator>, -) -> Result, ResolveUdfsError> { - // Track UDFs from external dependencies - outer key: dataset hash, inner key: function reference - // Inner map ensures deduplication: multiple function references to the same UDF share one instance - let mut udfs: BTreeMap, ScalarUDF>> = - BTreeMap::new(); - // Track UDFs defined in this manifest (bare functions and self-references) - separate from dependency functions - // Ensures deduplication: multiple references to the same function share one instance - let mut self_udfs: BTreeMap, ScalarUDF> = BTreeMap::new(); - - for func_ref in refs { - match &func_ref { - // Skip bare functions - they are assumed to be built-in functions (Amp or DataFusion) - FunctionReference::Bare { function: _ } => continue, - FunctionReference::Qualified { schema, function } => { - // Match on schema type: DepAlias (external dependency) or SelfRef (same-dataset function) - match schema.as_ref() { - DepAliasOrSelfRef::DepAlias(dep_alias) => { - // External dependency reference - lookup in dependencies map - let dataset_ref = manifest_deps - .get(dep_alias) - .expect("dep alias validated before catalog creation"); - - // Check vacancy BEFORE loading dataset - let Entry::Vacant(entry) = udfs - .entry(dataset_ref.hash().clone()) - .or_default() - .entry(func_ref.clone()) - else { - continue; - }; - - // Only load dataset if UDF not already resolved - let dataset = - dataset_store - .get_dataset(dataset_ref) - .await - .map_err(|err| ResolveUdfsError::GetDataset { - reference: dataset_ref.clone(), - 
source: err, - })?; - - // Get the UDF for this function reference - let udf = if function.as_ref() == ETH_CALL_FUNCTION_NAME { - dataset_store - .eth_call_for_dataset(&schema.to_string(), dataset.as_ref()) - .await - .map_err(|err| ResolveUdfsError::EthCallUdfCreation { - reference: dataset_ref.clone(), - source: err, - })? - .ok_or_else(|| ResolveUdfsError::EthCallNotAvailable { - reference: dataset_ref.clone(), - })? - } else { - dataset - .downcast_ref::() - .and_then(|d| { - d.function_by_name( - schema.to_string(), - function, - isolate_pool.clone(), - ) - }) - .ok_or_else(|| ResolveUdfsError::FunctionNotFoundInDataset { - function_name: (**function).clone(), - reference: dataset_ref.clone(), - })? - }; - - entry.insert(udf); - } - DepAliasOrSelfRef::SelfRef => { - // Same-dataset function reference (self.function_name) - // Look up function in the functions map (defined in this dataset) - let func_def = manifest_udfs.get(function).ok_or_else(|| { - ResolveUdfsError::SelfReferencedFunctionNotFound { - function_name: (**function).clone(), - } - })?; - - // Skip if function reference is already resolved (optimization) - let Entry::Vacant(entry) = self_udfs.entry(func_ref.clone()) else { - continue; - }; - - // Create UDF from Function definition using JsUdf - // Use "self" as schema qualifier to preserve case sensitivity - let udf = AsyncScalarUDF::new(Arc::new(JsUdf::new( - isolate_pool.clone(), - Some(SELF_REF_KEYWORD.to_string()), // Schema = "self" - func_def.source.source.clone(), - func_def.source.filename.clone().into(), - Arc::from(function.as_ref().as_str()), - func_def - .input_types - .iter() - .map(|dt| dt.clone().into_arrow()) - .collect(), - func_def.output_type.clone().into_arrow(), - ))) - .into_scalar_udf(); - - entry.insert(udf); - } - } - } - } - } - - // Flatten and combine UDFs - Ok(self_udfs - .into_values() - .chain(udfs.into_values().flat_map(|map| map.into_values())) - .collect()) -} - -/// Errors specific to create operations. 
-/// -/// This error type is used by `create()` to create -/// a logical catalog for derived dataset execution. -#[derive(Debug, thiserror::Error)] -pub enum CreateCatalogError { - /// Failed to resolve table references to LogicalTable instances. - #[error(transparent)] - ResolveTables(ResolveTablesError), - - /// Failed to resolve function references to UDF instances. - #[error(transparent)] - ResolveUdfs(ResolveUdfsError), -} - -/// Errors that can occur when resolving table references with dependencies. -#[derive(Debug, thiserror::Error)] -pub enum ResolveTablesError { - /// Table is not qualified with a schema/dataset name. - /// - /// All tables must be qualified with a dataset reference in the schema portion. - /// Unqualified tables (e.g., just `table_name`) are not allowed. - #[error("Unqualified table '{table_ref}', all tables must be qualified with a dataset")] - UnqualifiedTable { - /// The unqualified table reference string - table_ref: String, - }, - - /// Failed to retrieve dataset from store when loading dataset for table reference. - /// - /// This occurs when loading a dataset definition fails: - /// - Dataset not found in the store - /// - Dataset manifest is invalid or corrupted - /// - Unsupported dataset kind - /// - Storage backend errors when reading the dataset - #[error("Failed to retrieve dataset '{reference}' for table reference")] - GetDataset { - /// The hash reference of the dataset that failed to load - reference: HashReference, - #[source] - source: GetDatasetError, - }, - - /// Table not found in dataset. - /// - /// This occurs when the table name is referenced in the SQL query but the - /// dataset does not contain a table with that name. 
- #[error("Table '{table_name}' not found in dataset '{reference}'")] - TableNotFoundInDataset { - /// The name of the table that was not found - table_name: TableName, - /// The hash reference of the dataset that was searched - reference: HashReference, - }, -} - -/// Errors that can occur when resolving UDF references with dependencies. -#[derive(Debug, thiserror::Error)] -pub enum ResolveUdfsError { - /// Failed to retrieve dataset from store when loading dataset for function. - /// - /// This occurs when loading a dataset definition for a function fails: - /// - Dataset not found in the store - /// - Dataset manifest is invalid or corrupted - /// - Unsupported dataset kind - /// - Storage backend errors when reading the dataset - #[error("Failed to retrieve dataset '{reference}' for function reference")] - GetDataset { - /// The hash reference of the dataset that failed to load - reference: HashReference, - #[source] - source: GetDatasetError, - }, - - /// Failed to create ETH call UDF for dataset referenced in function name. - /// - /// This occurs when creating the eth_call user-defined function for a function fails: - /// - Invalid provider configuration for the dataset - /// - Provider connection issues - /// - Dataset is not an EVM RPC dataset but eth_call was requested - #[error("Failed to create ETH call UDF for dataset '{reference}' for function reference")] - EthCallUdfCreation { - /// The hash reference of the dataset for which eth_call UDF creation failed - reference: HashReference, - #[source] - source: EthCallForDatasetError, - }, - - /// eth_call function not available for dataset. - /// - /// This occurs when the eth_call function is referenced in SQL but the - /// dataset does not support eth_call (not an EVM RPC dataset or no provider configured). 
- #[error("Function 'eth_call' not available for dataset '{reference}'")] - EthCallNotAvailable { - /// The hash reference of the dataset that does not support eth_call - reference: HashReference, - }, - - /// Function not found in dataset. - /// - /// This occurs when a function is referenced in the SQL query but the - /// dataset does not contain a function with that name. - #[error("Function '{function_name}' not found in dataset '{reference}'")] - FunctionNotFoundInDataset { - /// The name of the function that was not found - function_name: FuncName, - /// The hash reference of the dataset that was searched - reference: HashReference, - }, - - /// Self-referenced function not found in manifest's functions map. - /// - /// This occurs when a SQL query uses `self.function_name` syntax but the - /// function is not defined in the manifest's `functions` section. - #[error("Self-referenced function '{function_name}' not found in manifest functions")] - SelfReferencedFunctionNotFound { - /// The function name that was referenced but not defined - function_name: FuncName, - }, -} diff --git a/crates/core/common/src/catalog/logical/for_manifest_validation.rs b/crates/core/common/src/catalog/logical/for_manifest_validation.rs deleted file mode 100644 index 6e1e34da2..000000000 --- a/crates/core/common/src/catalog/logical/for_manifest_validation.rs +++ /dev/null @@ -1,378 +0,0 @@ -//! Validation-specific catalog construction for derived dataset manifests -//! -//! This module provides catalog creation for validating derived dataset manifests -//! during the manifest compilation phase. -//! -//! ## Key Functions -//! -//! - [`create`] - Creates a LogicalCatalog for SQL validation -//! - [`resolve_tables`] - Resolves table references using pre-resolved dependencies -//! 
- [`resolve_udfs`] - Resolves function references to ScalarUDF instances - -use std::{ - collections::{BTreeMap, btree_map::Entry}, - sync::Arc, -}; - -use datafusion::logical_expr::{ScalarUDF, async_udf::AsyncScalarUDF}; -use datasets_common::{hash::Hash, hash_reference::HashReference, table_name::TableName}; -use datasets_derived::{ - dataset::Dataset as DerivedDataset, - deps::{DepAlias, DepAliasOrSelfRef, SELF_REF_KEYWORD}, - func_name::{ETH_CALL_FUNCTION_NAME, FuncName}, - manifest::Function, -}; -use js_runtime::{isolate_pool::IsolatePool, js_udf::JsUdf}; - -use crate::{ - catalog::logical::{LogicalCatalog, LogicalTable}, - dataset_store::{DatasetStore, EthCallForDatasetError, GetDatasetError}, - sql::{FunctionReference, TableReference}, -}; - -/// Map of table names to their SQL references (table refs and function refs) using dependency aliases or self-references. -pub type TableReferencesMap = BTreeMap< - TableName, - ( - Vec>, - Vec>, - ), ->; - -/// Creates a logical catalog for SQL validation using pre-resolved dependencies and functions. -/// -/// Builds a unified logical catalog from table and function references across multiple tables, -/// resolving dependency aliases to datasets for schema-only validation (no physical data access). -/// -/// ## Precondition -/// -/// All dependency aliases referenced in `refs` (table and function references) must exist as keys -/// in `manifest_deps`. Callers must validate this before calling `create`; violation will panic. 
-/// -/// ## Delegates to -/// -/// - [`resolve_tables`] - Resolves table references to `LogicalTable` instances -/// - [`resolve_udfs`] - Resolves function references to UDFs -pub async fn create( - dataset_store: &DatasetStore, - isolate_pool: IsolatePool, - manifest_deps: BTreeMap, - manifest_udfs: BTreeMap, - refs: TableReferencesMap, -) -> Result { - let table_refs: Vec<_> = refs - .iter() - .flat_map(|(name, (table_refs, _))| { - table_refs.iter().map(move |table_ref| (name, table_ref)) - }) - .collect(); - - let tables = resolve_tables(dataset_store, &manifest_deps, table_refs) - .await - .map_err(CreateLogicalCatalogError::ResolveTables)?; - - let func_refs: Vec<_> = refs - .iter() - .flat_map(|(name, (_, func_refs))| func_refs.iter().map(move |func_ref| (name, func_ref))) - .collect(); - - let udfs = resolve_udfs( - dataset_store, - isolate_pool, - &manifest_deps, - &manifest_udfs, - func_refs, - ) - .await - .map_err(CreateLogicalCatalogError::ResolveUdfs)?; - - Ok(LogicalCatalog { tables, udfs }) -} - -#[derive(Debug, thiserror::Error)] -pub enum CreateLogicalCatalogError { - /// Failed to resolve table references to LogicalTable instances - #[error(transparent)] - ResolveTables(ResolveTablesError), - - /// Failed to resolve function references to UDF instances - #[error(transparent)] - ResolveUdfs(ResolveUdfsError), -} - -/// Resolves table references to LogicalTable instances using pre-resolved dependencies. -async fn resolve_tables<'a>( - dataset_store: &DatasetStore, - manifest_deps: &BTreeMap, - refs: impl IntoIterator)> + 'a, -) -> Result, ResolveTablesError> { - let mut tables: BTreeMap, LogicalTable>> = - BTreeMap::new(); - - for (table_name, table_ref) in refs { - match table_ref { - TableReference::Bare { .. 
} => { - return Err(ResolveTablesError::UnqualifiedTable { - table_name: table_name.clone(), - table_ref: table_ref.to_string(), - }); - } - TableReference::Partial { schema, table } => { - let dataset_ref = manifest_deps - .get(schema.as_ref()) - .expect("dep alias validated before catalog creation"); - - let Entry::Vacant(entry) = tables - .entry(dataset_ref.hash().clone()) - .or_default() - .entry(table_ref.clone()) - else { - continue; - }; - - let dataset = dataset_store - .get_dataset(dataset_ref) - .await - .map_err(|err| ResolveTablesError::GetDataset { - table_name: table_name.clone(), - reference: dataset_ref.clone(), - source: err, - })?; - - let dataset_table = dataset - .tables() - .iter() - .find(|t| t.name() == table) - .ok_or_else(|| ResolveTablesError::TableNotFoundInDataset { - table_name: table_name.clone(), - referenced_table_name: table.as_ref().clone(), - reference: dataset_ref.clone(), - })?; - - let resolved_table = LogicalTable::new( - schema.to_string(), - dataset_ref.clone(), - dataset_table.clone(), - ); - - entry.insert(resolved_table); - } - } - } - - Ok(tables - .into_values() - .flat_map(|map| map.into_values()) - .collect()) -} - -/// Resolves function references to ScalarUDF instances using pre-resolved dependencies. 
-async fn resolve_udfs<'a>( - dataset_store: &DatasetStore, - isolate_pool: IsolatePool, - manifest_deps: &BTreeMap, - manifest_udfs: &BTreeMap, - refs: impl IntoIterator)> + 'a, -) -> Result, ResolveUdfsError> { - let mut udfs: BTreeMap, ScalarUDF>> = - BTreeMap::new(); - let mut self_udfs: BTreeMap, ScalarUDF> = BTreeMap::new(); - - for (table_name, func_ref) in refs { - match func_ref { - FunctionReference::Bare { function: _ } => { - continue; - } - FunctionReference::Qualified { schema, function } => match schema.as_ref() { - DepAliasOrSelfRef::DepAlias(dep_alias) => { - let dataset_ref = manifest_deps - .get(dep_alias) - .expect("dep alias validated before catalog creation"); - - // Check vacancy BEFORE loading dataset - let Entry::Vacant(entry) = udfs - .entry(dataset_ref.hash().clone()) - .or_default() - .entry(func_ref.clone()) - else { - continue; - }; - - // Only load dataset if UDF not already resolved - let dataset = dataset_store - .get_dataset(dataset_ref) - .await - .map_err(|err| ResolveUdfsError::GetDataset { - table_name: table_name.clone(), - reference: dataset_ref.clone(), - source: err, - })?; - - let udf = if function.as_ref() == ETH_CALL_FUNCTION_NAME { - dataset_store - .eth_call_for_dataset(&schema.to_string(), dataset.as_ref()) - .await - .map_err(|err| ResolveUdfsError::EthCallUdfCreation { - table_name: table_name.clone(), - reference: dataset_ref.clone(), - source: err, - })? - .ok_or_else(|| ResolveUdfsError::EthCallNotAvailable { - table_name: table_name.clone(), - reference: dataset_ref.clone(), - })? - } else { - dataset - .downcast_ref::() - .and_then(|d| { - d.function_by_name( - schema.to_string(), - function, - IsolatePool::dummy(), - ) - }) - .ok_or_else(|| ResolveUdfsError::FunctionNotFoundInDataset { - table_name: table_name.clone(), - function_name: (**function).clone(), - reference: dataset_ref.clone(), - })? 
- }; - - entry.insert(udf); - } - DepAliasOrSelfRef::SelfRef => { - let func_def = manifest_udfs.get(function).ok_or_else(|| { - ResolveUdfsError::SelfReferencedFunctionNotFound { - table_name: table_name.clone(), - function_name: (**function).clone(), - } - })?; - - let Entry::Vacant(entry) = self_udfs.entry(func_ref.clone()) else { - continue; - }; - - let udf = AsyncScalarUDF::new(Arc::new(JsUdf::new( - isolate_pool.clone(), - Some(SELF_REF_KEYWORD.to_string()), - func_def.source.source.clone(), - func_def.source.filename.clone().into(), - Arc::from(function.as_ref().as_str()), - func_def - .input_types - .iter() - .map(|dt| dt.clone().into_arrow()) - .collect(), - func_def.output_type.clone().into_arrow(), - ))) - .into_scalar_udf(); - - entry.insert(udf); - } - }, - } - } - - Ok(self_udfs - .into_values() - .chain(udfs.into_values().flat_map(|map| map.into_values())) - .collect()) -} - -#[derive(Debug, thiserror::Error)] -pub enum ResolveTablesError { - /// Table is not qualified with a schema/dataset name - #[error( - "In table '{table_name}': Unqualified table '{table_ref}', all tables must be qualified with a dataset" - )] - UnqualifiedTable { - /// The table being processed when the error occurred - table_name: TableName, - /// The unqualified table reference string - table_ref: String, - }, - - /// Failed to retrieve dataset from store when loading dataset for table reference - #[error("In table '{table_name}': Failed to retrieve dataset '{reference}'")] - GetDataset { - /// The table being processed when the error occurred - table_name: TableName, - /// The hash reference of the dataset that failed to load - reference: HashReference, - #[source] - source: GetDatasetError, - }, - - /// Table not found in dataset - #[error( - "In table '{table_name}': Table '{referenced_table_name}' not found in dataset '{reference}'" - )] - TableNotFoundInDataset { - /// The table being processed when the error occurred - table_name: TableName, - /// The name of the table 
that was not found in the dataset - referenced_table_name: TableName, - /// The hash reference of the dataset where the table was not found - reference: HashReference, - }, -} - -#[derive(Debug, thiserror::Error)] -pub enum ResolveUdfsError { - /// Failed to retrieve dataset from store when loading dataset for function - #[error("In table '{table_name}': Failed to retrieve dataset '{reference}' for function")] - GetDataset { - /// The table being processed when the error occurred - table_name: TableName, - /// The hash reference of the dataset that failed to load - reference: HashReference, - #[source] - source: GetDatasetError, - }, - - /// Failed to create ETH call UDF for dataset referenced in function name - #[error( - "In table '{table_name}': Failed to create ETH call UDF for dataset '{reference}' for function" - )] - EthCallUdfCreation { - /// The table being processed when the error occurred - table_name: TableName, - /// The hash reference of the dataset for which the eth_call UDF creation failed - reference: HashReference, - #[source] - source: EthCallForDatasetError, - }, - - /// eth_call function not available for dataset - #[error("In table '{table_name}': Function 'eth_call' not available for dataset '{reference}'")] - EthCallNotAvailable { - /// The table being processed when the error occurred - table_name: TableName, - /// The hash reference of the dataset that does not support eth_call - reference: HashReference, - }, - - /// Function not found in dataset - #[error( - "In table '{table_name}': Function '{function_name}' not found in dataset '{reference}'" - )] - FunctionNotFoundInDataset { - /// The table being processed when the error occurred - table_name: TableName, - /// The name of the function that was not found - function_name: FuncName, - /// The hash reference of the dataset where the function was not found - reference: HashReference, - }, - - /// Self-referenced function not found in manifest's functions map. 
- #[error( - "In table '{table_name}': Self-referenced function '{function_name}' not found in manifest functions" - )] - SelfReferencedFunctionNotFound { - /// The table containing the SQL query with the invalid reference - table_name: TableName, - /// The function name that was referenced but not defined - function_name: FuncName, - }, -} diff --git a/crates/core/common/src/catalog/logical/for_query.rs b/crates/core/common/src/catalog/logical/for_query.rs deleted file mode 100644 index 3b910fd2a..000000000 --- a/crates/core/common/src/catalog/logical/for_query.rs +++ /dev/null @@ -1,406 +0,0 @@ -//! Arrow Flight logical catalog construction with dynamic resolution. -//! -//! This module creates LogicalCatalog for Arrow Flight SQL query planning. -//! Uses dynamic resolution (PartialReference) supporting version tags and "latest". - -use std::collections::{BTreeMap, btree_map::Entry}; - -use datafusion::logical_expr::ScalarUDF; -use datasets_common::{ - hash::Hash, hash_reference::HashReference, partial_reference::PartialReference, - reference::Reference, table_name::TableName, -}; -use datasets_derived::{dataset::Dataset as DerivedDataset, func_name::ETH_CALL_FUNCTION_NAME}; -use js_runtime::isolate_pool::IsolatePool; - -use crate::{ - catalog::logical::{LogicalCatalog, LogicalTable}, - dataset_store::{DatasetStore, EthCallForDatasetError, GetDatasetError, ResolveRevisionError}, - sql::{FunctionReference, TableReference}, -}; - -/// Resolved SQL references tuple (table refs, function refs) using partial references. -pub type ResolvedReferences = ( - Vec>, - Vec>, -); - -/// Creates a logical catalog for SQL query planning without physical data access. -/// -/// This function builds a logical catalog with schemas only, enabling query plan generation -/// and schema inference without accessing physical parquet files. 
-/// -/// ## Where Used -/// -/// This function is used exclusively in the **Query Execution Path** for the planning phase: -/// -/// - **Arrow Flight GetFlightInfo** (`crates/services/server/src/flight.rs`): -/// - Called to generate query plan and return schema to clients -/// - Fast response without accessing physical data files -/// - Precedes actual query execution which uses `catalog_for_sql` -/// -/// ## Implementation -/// -/// The function analyzes the SQL query to: -/// 1. Extract table references and function names from the query -/// 2. Resolve dataset names to hashes via the dataset store -/// 3. Build logical catalog with schemas and UDFs -/// 4. Return logical catalog for use with `PlanContext` -/// -/// Unlike `catalog_for_sql`, this does not query the metadata database for physical -/// parquet locations, making it faster for planning-only operations. -pub async fn create( - dataset_store: &DatasetStore, - isolate_pool: &IsolatePool, - refs: ResolvedReferences, -) -> Result { - let (table_refs, func_refs) = refs; - - // Resolve logical catalog using shared helpers - let tables = resolve_tables(dataset_store, table_refs) - .await - .map_err(CreateCatalogError::ResolveTables)?; - let udfs = resolve_udfs(dataset_store, isolate_pool, func_refs) - .await - .map_err(CreateCatalogError::ResolveUdfs)?; - - Ok(LogicalCatalog { tables, udfs }) -} - -/// Resolves table references to LogicalTable instances using dynamic resolution. -/// -/// Processes each table reference, resolves the dataset reference to a hash, -/// loads the dataset, finds the table, and creates a LogicalTable for catalog construction. 
-async fn resolve_tables( - dataset_store: &DatasetStore, - refs: impl IntoIterator>, -) -> Result, ResolveTablesError> { - // Use hash-based map to deduplicate datasets and collect resolved tables - // Inner map: table_ref -> LogicalTable (deduplicates table references) - let mut tables: BTreeMap, LogicalTable>> = - BTreeMap::new(); - - for table_ref in refs { - match &table_ref { - TableReference::Bare { .. } => { - return Err(ResolveTablesError::UnqualifiedTable { - table_ref: table_ref.to_string(), - }); - } - TableReference::Partial { schema, table } => { - // Schema is already parsed as PartialReference, convert to Reference - let reference: Reference = schema.as_ref().clone().into(); - - // Resolve reference to hash reference - let dataset_ref = dataset_store - .resolve_revision(&reference) - .await - .map_err(|err| ResolveTablesError::ResolveDatasetReference { - reference: reference.clone(), - source: err, - })? - .ok_or_else(|| ResolveTablesError::DatasetNotFound { - reference: reference.clone(), - })?; - - // Skip if table reference is already resolved (optimization to avoid redundant dataset loading) - let Entry::Vacant(entry) = tables - .entry(dataset_ref.hash().clone()) - .or_default() - .entry(table_ref.clone()) - else { - continue; - }; - - // Load dataset by hash reference (cached by dataset_store) - let dataset = dataset_store - .get_dataset(&dataset_ref) - .await - .map_err(|err| ResolveTablesError::LoadDataset { - reference: dataset_ref.clone(), - source: err, - })?; - - // Find table in dataset - let dataset_table = dataset - .tables() - .iter() - .find(|t| t.name() == table) - .ok_or_else(|| ResolveTablesError::TableNotFoundInDataset { - table_name: table.as_ref().clone(), - reference: dataset_ref.clone(), - })?; - - let resolved_table = LogicalTable::new( - schema.to_string(), - dataset_ref.clone(), - dataset_table.clone(), - ); - - // Insert into vacant entry - entry.insert(resolved_table); - } - } - } - - // Flatten to Vec - Ok(tables - 
.into_values() - .flat_map(|map| map.into_values()) - .collect()) -} - -/// Resolves function references to ScalarUDF instances using dynamic resolution. -/// -/// Processes each function reference, resolves the dataset reference, -/// loads the dataset, and retrieves or creates the UDF. -async fn resolve_udfs( - dataset_store: &DatasetStore, - isolate_pool: &IsolatePool, - refs: impl IntoIterator>, -) -> Result, ResolveUdfsError> { - // Track UDFs from external dependencies - outer key: dataset hash, inner key: function reference - // Inner map ensures deduplication: multiple function references to the same UDF share one instance - let mut udfs: BTreeMap, ScalarUDF>> = - BTreeMap::new(); - - for func_ref in refs { - match &func_ref { - // Skip bare functions - they are assumed to be built-in functions (Amp or DataFusion) - FunctionReference::Bare { .. } => continue, - FunctionReference::Qualified { schema, function } => { - // Schema is already parsed as PartialReference, convert to Reference - let reference: Reference = schema.as_ref().clone().into(); - - // Resolve reference to hash reference - let dataset_ref = dataset_store - .resolve_revision(&reference) - .await - .map_err(|err| ResolveUdfsError::ResolveDatasetReference { - reference: reference.clone(), - source: err, - })? 
- .ok_or_else(|| ResolveUdfsError::DatasetNotFound { - reference: reference.clone(), - })?; - - // Check vacancy BEFORE loading dataset - let Entry::Vacant(entry) = udfs - .entry(dataset_ref.hash().clone()) - .or_default() - .entry(func_ref.clone()) - else { - continue; - }; - - // Only load dataset if UDF not already resolved - let dataset = dataset_store - .get_dataset(&dataset_ref) - .await - .map_err(|err| ResolveUdfsError::LoadDataset { - reference: dataset_ref.clone(), - source: err, - })?; - - // Get the UDF for this function reference - let udf = if function.as_ref() == ETH_CALL_FUNCTION_NAME { - dataset_store - .eth_call_for_dataset(&schema.to_string(), dataset.as_ref()) - .await - .map_err(|err| ResolveUdfsError::EthCallUdfCreation { - reference: dataset_ref.clone(), - source: err, - })? - .ok_or_else(|| ResolveUdfsError::EthCallNotAvailable { - reference: dataset_ref.clone(), - })? - } else { - dataset - .downcast_ref::() - .and_then(|d| { - d.function_by_name(schema.to_string(), function, isolate_pool.clone()) - }) - .ok_or_else(|| ResolveUdfsError::FunctionNotFoundInDataset { - function_name: func_ref.to_string(), - reference: dataset_ref, - })? - }; - - entry.insert(udf); - } - } - } - - // Flatten to Vec - Ok(udfs - .into_values() - .flat_map(|map| map.into_values()) - .collect()) -} - -/// Errors specific to create operations -/// -/// This error type is used by `create()` to create -/// a logical catalog for Arrow Flight query planning (GetFlightInfo). -#[derive(Debug, thiserror::Error)] -pub enum CreateCatalogError { - /// Failed to resolve table references to LogicalTable instances. - #[error(transparent)] - ResolveTables(ResolveTablesError), - - /// Failed to resolve function references to UDF instances. - #[error(transparent)] - ResolveUdfs(ResolveUdfsError), -} - -/// Errors that can occur when resolving table references. -#[derive(Debug, thiserror::Error)] -pub enum ResolveTablesError { - /// Table is not qualified with a schema/dataset name. 
- /// - /// All tables must be qualified with a dataset reference in the schema portion. - /// Unqualified tables (e.g., just `table_name`) are not allowed. - #[error("Unqualified table '{table_ref}', all tables must be qualified with a dataset")] - UnqualifiedTable { - /// The unqualified table reference string - table_ref: String, - }, - - /// Dataset not found. - /// - /// This occurs when a dataset reference resolves to None, meaning the dataset - /// does not exist in the metadata store. - #[error("Dataset '{reference}' not found")] - DatasetNotFound { - /// The dataset reference that was not found - reference: Reference, - }, - - /// Failed to resolve dataset reference to a hash reference. - /// - /// This occurs when the dataset store cannot resolve a reference to its - /// corresponding content hash. Common causes include: - /// - Dataset does not exist in the store - /// - Version tag not found - /// - Storage backend errors - /// - Invalid reference format - /// - Database connection issues - #[error("Failed to resolve dataset reference '{reference}'")] - ResolveDatasetReference { - /// The dataset reference that failed to resolve - reference: Reference, - #[source] - source: ResolveRevisionError, - }, - - /// Failed to load dataset from the dataset store. - /// - /// This occurs when loading a dataset definition fails. Common causes include: - /// - Dataset does not exist in the store - /// - Dataset manifest is invalid or corrupted - /// - Unsupported dataset kind - /// - Storage backend errors when reading the dataset - /// - Manifest file not found in object store - #[error("Failed to load dataset '{reference}'")] - LoadDataset { - /// The hash reference of the dataset that failed to load - reference: HashReference, - #[source] - source: GetDatasetError, - }, - - /// Table not found in dataset. - /// - /// This occurs when the table name is referenced in the SQL query but the - /// dataset does not contain a table with that name. 
- #[error("Table '{table_name}' not found in dataset '{reference}'")] - TableNotFoundInDataset { - /// The name of the table that was not found - table_name: TableName, - /// The hash reference of the dataset that was searched - reference: HashReference, - }, -} - -/// Errors that can occur when resolving UDF references. -#[derive(Debug, thiserror::Error)] -pub enum ResolveUdfsError { - /// Dataset not found. - /// - /// This occurs when a dataset reference resolves to None, meaning the dataset - /// does not exist in the metadata store. - #[error("Dataset '{reference}' not found")] - DatasetNotFound { - /// The dataset reference that was not found - reference: Reference, - }, - - /// Failed to resolve dataset reference to a hash reference. - /// - /// This occurs when the dataset store cannot resolve a reference to its - /// corresponding content hash. Common causes include: - /// - Dataset does not exist in the store - /// - Version tag not found - /// - Storage backend errors - /// - Invalid reference format - /// - Database connection issues - #[error("Failed to resolve dataset reference '{reference}'")] - ResolveDatasetReference { - /// The dataset reference that failed to resolve - reference: Reference, - #[source] - source: ResolveRevisionError, - }, - - /// Failed to load dataset from the dataset store. - /// - /// This occurs when loading a dataset definition fails. Common causes include: - /// - Dataset does not exist in the store - /// - Dataset manifest is invalid or corrupted - /// - Unsupported dataset kind - /// - Storage backend errors when reading the dataset - /// - Manifest file not found in object store - #[error("Failed to load dataset '{reference}'")] - LoadDataset { - /// The hash reference of the dataset that failed to load - reference: HashReference, - #[source] - source: GetDatasetError, - }, - - /// Failed to create ETH call UDF for dataset referenced in function name. 
- /// - /// This occurs when creating the eth_call user-defined function for a function fails: - /// - Invalid provider configuration for the dataset - /// - Provider connection issues - /// - Dataset is not an EVM RPC dataset but eth_call was requested - #[error("Failed to create ETH call UDF for dataset '{reference}'")] - EthCallUdfCreation { - /// The hash reference of the dataset for which eth_call UDF creation failed - reference: HashReference, - #[source] - source: EthCallForDatasetError, - }, - - /// eth_call function not available for dataset. - /// - /// This occurs when the eth_call function is referenced in SQL but the - /// dataset does not support eth_call (not an EVM RPC dataset or no provider configured). - #[error("Function 'eth_call' not available for dataset '{reference}'")] - EthCallNotAvailable { - /// The hash reference of the dataset that does not support eth_call - reference: HashReference, - }, - - /// Function not found in dataset. - /// - /// This occurs when a function is referenced in the SQL query but the - /// dataset does not contain a function with that name. 
- #[error("Function '{function_name}' not found in dataset '{reference}'")] - FunctionNotFoundInDataset { - /// The name of the function that was not found - function_name: String, - /// The hash reference of the dataset that was searched - reference: HashReference, - }, -} diff --git a/crates/core/common/src/catalog/physical/catalog.rs b/crates/core/common/src/catalog/physical/catalog.rs index 40008f79f..f08440a8b 100644 --- a/crates/core/common/src/catalog/physical/catalog.rs +++ b/crates/core/common/src/catalog/physical/catalog.rs @@ -1,24 +1,45 @@ -use std::sync::Arc; +use std::{collections::BTreeMap, sync::Arc}; + +use datafusion::logical_expr::ScalarUDF; +use datasets_common::hash_reference::HashReference; use crate::{ BlockNum, - catalog::logical::LogicalCatalog, + catalog::logical::LogicalTable, physical_table::{MultiNetworkSegmentsError, SnapshotError, table::PhysicalTable}, - sql::TableReference, }; -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct Catalog { - /// The logical catalog describing dataset schemas and metadata. - logical: LogicalCatalog, + /// Logical tables describing dataset schemas and metadata. + tables: Vec, + /// UDFs specific to the datasets corresponding to the resolved tables. + udfs: Vec, /// The physical catalog entries, each pairing a physical table with SQL naming. entries: Vec, + /// Dependency alias to hash reference mappings for lazy resolution. + dep_aliases: BTreeMap, } impl Catalog { - /// Creates a new catalog from the given entries and logical catalog. - pub fn new(logical: LogicalCatalog, entries: Vec) -> Self { - Catalog { logical, entries } + /// Creates a new catalog from the given entries, logical tables, and UDFs. + pub fn new( + tables: Vec, + udfs: Vec, + entries: Vec, + dep_aliases: BTreeMap, + ) -> Self { + Catalog { + tables, + udfs, + entries, + dep_aliases, + } + } + + /// Returns the dependency alias to hash reference mappings. 
+ pub fn dep_aliases(&self) -> &BTreeMap { + &self.dep_aliases } /// Returns the catalog entries. @@ -32,14 +53,19 @@ impl Catalog { self.entries.iter().map(|entry| entry.physical_table()) } - /// Returns a reference to the logical catalog. - pub fn logical(&self) -> &LogicalCatalog { - &self.logical + /// Returns the logical tables. + pub fn tables(&self) -> &[LogicalTable] { + &self.tables } - /// Consumes the catalog, returning its entries and logical catalog. - pub fn into_parts(self) -> (Vec, LogicalCatalog) { - (self.entries, self.logical) + /// Returns the user-defined functions. + pub fn udfs(&self) -> &[ScalarUDF] { + &self.udfs + } + + /// Consumes the catalog, returning its entries, logical tables, and UDFs. + pub fn into_parts(self) -> (Vec, Vec, Vec) { + (self.entries, self.tables, self.udfs) } /// Returns the earliest synced block number across all tables in this catalog. @@ -133,12 +159,4 @@ impl CatalogTable { pub fn sql_schema_name(&self) -> &str { &self.sql_schema_name } - - /// Qualified table reference in the format `dataset_name.table_name`. - pub fn table_ref(&self) -> TableReference { - TableReference::partial( - self.sql_schema_name.clone(), - self.physical_table.table_name().clone(), - ) - } } diff --git a/crates/core/common/src/catalog/physical/for_dump.rs b/crates/core/common/src/catalog/physical/for_dump.rs index 7d2baccfb..658fa5b2d 100644 --- a/crates/core/common/src/catalog/physical/for_dump.rs +++ b/crates/core/common/src/catalog/physical/for_dump.rs @@ -1,48 +1,105 @@ //! Derived dataset physical catalog construction. //! //! This module provides physical catalog creation for derived dataset execution. -//! It accepts a pre-created logical catalog and adds physical parquet locations. +//! It resolves dependency tables from manifest deps and SQL table references, +//! then adds physical parquet locations. 
+ +use std::collections::{BTreeMap, btree_map::Entry}; use amp_data_store::DataStore; -use datasets_common::{hash_reference::HashReference, table_name::TableName}; +use datafusion::logical_expr::ScalarUDF; +use datasets_common::{hash::Hash, hash_reference::HashReference, table_name::TableName}; +use datasets_derived::deps::DepAlias; use super::catalog::{Catalog, CatalogTable}; use crate::{ - catalog::logical::LogicalCatalog, + catalog::logical::LogicalTable, dataset_store::{DatasetStore, GetDatasetError}, physical_table::table::PhysicalTable, + sql::TableReference, }; /// Creates a full catalog with physical data access for derived dataset dumps. /// -/// This function builds a complete catalog by adding physical parquet file locations -/// to an existing logical catalog. +/// This function resolves dependency tables from manifest deps and SQL table references, +/// loads dataset metadata, builds physical table entries, and constructs the catalog. /// /// ## Parameters /// /// - `dataset_store`: Used to retrieve dataset metadata including start_block /// - `data_store`: Used to query metadata database for physical parquet locations -/// - `logical`: Pre-created logical catalog containing table schemas and UDFs -/// -/// ## Where Used -/// -/// Called exclusively by `dump_derived_dataset` in `crates/core/dump/src/derived_dataset.rs` -/// during the dump execution phase (NOT during validation). -/// -/// ## Implementation -/// -/// The function: -/// 1. Iterates through tables in the logical catalog -/// 2. Queries metadata database for physical parquet locations -/// 3. Retrieves dataset metadata to get start_block -/// 4. 
Constructs physical catalog for query execution +/// - `manifest_deps`: Dependency alias → hash reference mappings from the manifest +/// - `table_refs`: Parsed SQL table references with dep alias schemas +/// - `udfs`: Pre-resolved self-ref UDFs (from logical catalog) pub async fn create( dataset_store: &DatasetStore, data_store: &DataStore, - logical: LogicalCatalog, + manifest_deps: &BTreeMap, + table_refs: Vec>, + udfs: Vec, ) -> Result { + // Resolve table references to LogicalTable instances + let mut tables_by_hash: BTreeMap, LogicalTable>> = + Default::default(); + + for table_ref in &table_refs { + match table_ref { + TableReference::Bare { .. } => { + return Err(CreateCatalogError::UnqualifiedTable { + table_ref: table_ref.to_string(), + }); + } + TableReference::Partial { schema, table } => { + let dataset_ref = manifest_deps.get(schema.as_ref()).ok_or_else(|| { + CreateCatalogError::DependencyAliasNotFound { + alias: schema.as_ref().clone(), + } + })?; + + let Entry::Vacant(entry) = tables_by_hash + .entry(dataset_ref.hash().clone()) + .or_default() + .entry(table_ref.clone()) + else { + continue; + }; + + let dataset = dataset_store + .get_dataset(dataset_ref) + .await + .map_err(|err| CreateCatalogError::GetDataset { + reference: dataset_ref.clone(), + source: err, + })?; + + let dataset_table = dataset + .tables() + .iter() + .find(|t| t.name() == table) + .ok_or_else(|| CreateCatalogError::TableNotFoundInDataset { + table_name: table.as_ref().clone(), + reference: dataset_ref.clone(), + })?; + + let resolved_table = LogicalTable::new( + schema.to_string(), + dataset_ref.clone(), + dataset_table.clone(), + ); + + entry.insert(resolved_table); + } + } + } + + let logical_tables: Vec = tables_by_hash + .into_values() + .flat_map(|map| map.into_values()) + .collect(); + + // Build physical catalog entries from resolved logical tables let mut entries = Vec::new(); - for table in &logical.tables { + for table in &logical_tables { let dataset_ref = 
table.dataset_reference(); let table_name = table.name(); @@ -59,7 +116,6 @@ pub async fn create( table: table_name.clone(), })?; - // Retrieve dataset to get start_block let dataset = dataset_store .get_dataset(dataset_ref) .await @@ -79,7 +135,13 @@ pub async fn create( entries.push(CatalogTable::new(physical_table.into(), sql_schema_name)); } - Ok(Catalog::new(logical, entries)) + // Build dep_aliases map + let dep_aliases: BTreeMap = manifest_deps + .iter() + .map(|(alias, hash_ref)| (alias.to_string(), hash_ref.clone())) + .collect(); + + Ok(Catalog::new(logical_tables, udfs, entries, dep_aliases)) } /// Errors that can occur when creating a physical catalog. @@ -87,6 +149,40 @@ pub async fn create( /// Returned by [`create`] when catalog creation fails. #[derive(Debug, thiserror::Error)] pub enum CreateCatalogError { + /// Table is not qualified with a schema/dataset name. + #[error("Unqualified table '{table_ref}', all tables must be qualified with a dataset")] + UnqualifiedTable { + /// The unqualified table reference string + table_ref: String, + }, + + /// Dependency alias not found when processing table reference. + #[error( + "Dependency alias '{alias}' referenced in table reference but not provided in dependencies" + )] + DependencyAliasNotFound { + /// The dependency alias that was not found in the dependencies map + alias: DepAlias, + }, + + /// Failed to retrieve dataset from store when loading dataset for table reference. + #[error("Failed to retrieve dataset '{reference}' for table reference")] + GetDataset { + /// The hash reference of the dataset that failed to load + reference: HashReference, + #[source] + source: GetDatasetError, + }, + + /// Table not found in dataset. 
+ #[error("Table '{table_name}' not found in dataset '{reference}'")] + TableNotFoundInDataset { + /// The name of the table that was not found + table_name: TableName, + /// The hash reference of the dataset that was searched + reference: HashReference, + }, + /// Failed to retrieve physical table metadata from the metadata database. /// /// This occurs when querying the metadata database for the active physical diff --git a/crates/core/common/src/catalog/physical/for_query.rs b/crates/core/common/src/catalog/physical/for_query.rs index 192978afd..39ca5e982 100644 --- a/crates/core/common/src/catalog/physical/for_query.rs +++ b/crates/core/common/src/catalog/physical/for_query.rs @@ -1,87 +1,116 @@ //! Arrow Flight physical catalog construction. //! //! This module provides physical catalog creation for Arrow Flight query execution. -//! It accepts a pre-created logical catalog and adds physical parquet locations. +//! It resolves tables directly from parsed SQL table references, bypassing the +//! logical catalog. + +use std::collections::BTreeSet; use amp_data_store::DataStore; -use datasets_common::{hash_reference::HashReference, table_name::TableName}; +use datasets_common::{ + hash_reference::HashReference, partial_reference::PartialReference, reference::Reference, + table_name::TableName, +}; use super::catalog::{Catalog, CatalogTable}; use crate::{ - catalog::logical::LogicalCatalog, - dataset_store::{DatasetStore, GetDatasetError}, + dataset_store::{DatasetStore, GetDatasetError, ResolveRevisionError}, physical_table::table::PhysicalTable, + sql::TableReference, }; /// Creates a full catalog with physical data access for SQL query execution. /// -/// This function builds a complete catalog by adding physical parquet file locations -/// to an existing logical catalog, enabling actual query execution with DataFusion. 
+/// This function resolves tables directly from parsed SQL table references, +/// converting partial references to hash references, loading dataset metadata, +/// and building physical table entries for query execution. /// /// ## Parameters /// -/// - `dataset_store`: Used to retrieve dataset metadata including start_block +/// - `dataset_store`: Used to resolve references and retrieve dataset metadata /// - `data_store`: Used to query metadata database for physical parquet locations -/// - `logical`: Pre-created logical catalog containing table schemas and UDFs -/// -/// ## Where Used -/// -/// This function is used exclusively in the **Query Execution Path**: -/// -/// - **Arrow Flight DoGet** (`crates/services/server/src/flight.rs`): -/// - Called during Arrow Flight `DoGet` phase to execute user queries -/// - Provides physical catalog for streaming query results to clients -/// -/// ## Implementation -/// -/// The function: -/// 1. Iterates through tables in the logical catalog -/// 2. Queries metadata database for physical parquet locations -/// 3. Retrieves dataset metadata to get start_block -/// 4. 
Constructs physical catalog for query execution +/// - `table_refs`: Parsed SQL table references with partial dataset references pub async fn create( dataset_store: &DatasetStore, data_store: &DataStore, - logical: LogicalCatalog, + table_refs: Vec>, ) -> Result { let mut entries = Vec::new(); - for table in &logical.tables { - let dataset_ref = table.dataset_reference(); + let mut seen = BTreeSet::new(); - let revision = data_store - .get_table_active_revision(dataset_ref, table.name()) + for table_ref in &table_refs { + let TableReference::Partial { schema, table } = table_ref else { + continue; + }; + + let reference: Reference = PartialReference::clone(schema).into(); + + // Resolve to hash reference + let hash_ref = dataset_store + .resolve_revision(&reference) .await - .map_err(|source| CreateCatalogError::PhysicalTableRetrieval { - dataset: dataset_ref.clone(), - table: table.name().clone(), + .map_err(|source| CreateCatalogError::ResolveRevision { + reference: reference.clone(), source, })? - .ok_or_else(|| CreateCatalogError::TableNotSynced { - dataset: dataset_ref.clone(), - table: table.name().clone(), + .ok_or_else(|| CreateCatalogError::DatasetNotFound { + reference: reference.clone(), })?; - // Retrieve dataset to get start_block + // Deduplicate by (schema_name, hash, table_name) — different SQL aliases + // for the same dataset+table need separate physical entries because + // attach_to matches by sql_schema_name. 
+ let sql_schema_name = schema.to_string(); + let dedup_key = (sql_schema_name.clone(), hash_ref.clone(), table.clone()); + if !seen.insert(dedup_key) { + continue; + } + + // Load dataset let dataset = dataset_store - .get_dataset(dataset_ref) + .get_dataset(&hash_ref) .await .map_err(|source| CreateCatalogError::DatasetRetrieval { - dataset: dataset_ref.clone(), + dataset: hash_ref.clone(), source, })?; - let sql_schema_name = table.sql_schema_name().to_string(); + // Find the table in the dataset + let dataset_table = dataset + .tables() + .iter() + .find(|t| t.name() == table.as_ref()) + .ok_or_else(|| CreateCatalogError::DatasetTableNotFound { + dataset: hash_ref.clone(), + table: (**table).clone(), + })? + .clone(); + + // Get physical revision + let revision = data_store + .get_table_active_revision(&hash_ref, table) + .await + .map_err(|source| CreateCatalogError::PhysicalTableRetrieval { + dataset: hash_ref.clone(), + table: (**table).clone(), + source, + })? + .ok_or_else(|| CreateCatalogError::TableNotSynced { + dataset: hash_ref.clone(), + table: (**table).clone(), + })?; + let physical_table = PhysicalTable::from_revision( data_store.clone(), - table.dataset_reference().clone(), + hash_ref, dataset.start_block(), - table.table().clone(), + dataset_table, revision, ); entries.push(CatalogTable::new(physical_table.into(), sql_schema_name)); } - Ok(Catalog::new(logical, entries)) + Ok(Catalog::new(vec![], vec![], entries, Default::default())) } /// Errors that can occur when creating a physical catalog. @@ -89,6 +118,31 @@ pub async fn create( /// Returned by [`create`] when catalog creation fails. #[derive(Debug, thiserror::Error)] pub enum CreateCatalogError { + /// Failed to resolve a dataset reference to a hash reference. 
+ #[error("Failed to resolve dataset reference {reference}")] + ResolveRevision { + /// The reference that failed to resolve + reference: Reference, + #[source] + source: ResolveRevisionError, + }, + + /// Dataset reference resolved but no dataset was found. + #[error("Dataset not found: {reference}")] + DatasetNotFound { + /// The reference that was not found + reference: Reference, + }, + + /// Table not found in dataset definition. + #[error("Table {table} not found in dataset {dataset}")] + DatasetTableNotFound { + /// The hash reference of the dataset + dataset: HashReference, + /// The table name that was not found + table: TableName, + }, + /// Failed to retrieve physical table metadata from the metadata database. /// /// This occurs when querying the metadata database for the active physical diff --git a/crates/core/common/src/catalog/physical/snapshot.rs b/crates/core/common/src/catalog/physical/snapshot.rs index 0fa1eb46e..7095fa1f7 100644 --- a/crates/core/common/src/catalog/physical/snapshot.rs +++ b/crates/core/common/src/catalog/physical/snapshot.rs @@ -23,7 +23,7 @@ use futures::{Stream, StreamExt as _}; use super::{catalog::CatalogTable, reader}; use crate::{ BlockRange, - catalog::logical::LogicalCatalog, + catalog::logical::LogicalTable, physical_table::{ MultiNetworkSegmentsError, SnapshotError, resolved::ResolvedFile, snapshot::TableSnapshot as PhyTableSnapshot, table::PhysicalTable, @@ -38,8 +38,10 @@ use crate::{ /// execution layer (via `QueryableSnapshot`). #[derive(Debug, Clone)] pub struct CatalogSnapshot { - /// The logical catalog describing dataset schemas and metadata. - logical: LogicalCatalog, + /// Logical tables describing dataset schemas and metadata. + tables: Vec, + /// UDFs specific to the datasets corresponding to the resolved tables. + udfs: Vec, /// Each snapshot is paired with its SQL table ref schema string. 
table_snapshots: Vec<(Arc, String)>, } @@ -50,7 +52,8 @@ impl CatalogSnapshot { /// When `ignore_canonical_segments` is `true`, canonical chain filtering is /// skipped during snapshot creation. pub async fn from_catalog( - logical: LogicalCatalog, + tables: Vec, + udfs: Vec, entries: &[CatalogTable], ignore_canonical_segments: bool, ) -> Result { @@ -65,14 +68,15 @@ impl CatalogSnapshot { } Ok(Self { - logical, + tables, + udfs, table_snapshots, }) } - /// Returns a reference to the logical catalog. - pub fn logical(&self) -> &LogicalCatalog { - &self.logical + /// Returns the logical tables. + pub fn tables(&self) -> &[LogicalTable] { + &self.tables } /// Returns the table snapshots paired with their SQL table ref schema strings. @@ -82,9 +86,9 @@ impl CatalogSnapshot { .map(|(s, sql)| (s, sql.as_str())) } - /// Returns the user-defined functions registered in the logical catalog. + /// Returns the user-defined functions. pub fn udfs(&self) -> &[ScalarUDF] { - &self.logical.udfs + &self.udfs } /// Reconstructs `CatalogTable` entries from the snapshotted data. 
diff --git a/crates/core/common/src/context.rs b/crates/core/common/src/context.rs index ad7fcb949..0feaeb117 100644 --- a/crates/core/common/src/context.rs +++ b/crates/core/common/src/context.rs @@ -1,3 +1,5 @@ mod common; pub mod exec; pub mod plan; +pub mod session; +pub mod session_state; diff --git a/crates/core/common/src/context/common.rs b/crates/core/common/src/context/common.rs index 76511fb90..dfcc8a2c2 100644 --- a/crates/core/common/src/context/common.rs +++ b/crates/core/common/src/context/common.rs @@ -1,57 +1,15 @@ -use datafusion::{ - error::DataFusionError, - logical_expr::{LogicalPlan, ScalarUDF}, - prelude::{SQLOptions, SessionContext}, - sql::parser, -}; +use datafusion::{error::DataFusionError, logical_expr::LogicalPlan, prelude::SQLOptions}; -use crate::{ - evm::udfs::{ - EvmDecodeHex, EvmDecodeLog, EvmDecodeParams, EvmDecodeType, EvmEncodeHex, EvmEncodeParams, - EvmEncodeType, EvmTopic, ShiftUnits, - }, - plan_visitors::forbid_underscore_prefixed_aliases, -}; - -/// Returns the built-in scalar UDFs registered in every session context. -pub fn builtin_udfs() -> Vec { - vec![ - EvmDecodeLog::new().into(), - EvmDecodeLog::new().with_deprecated_name().into(), - EvmTopic::new().into(), - EvmEncodeParams::new().into(), - EvmDecodeParams::new().into(), - EvmEncodeType::new().into(), - EvmDecodeType::new().into(), - EvmEncodeHex::new().into(), - EvmDecodeHex::new().into(), - ShiftUnits::new().into(), - ] -} - -/// Converts a parsed SQL statement into a validated logical plan. +/// Context string used to tag user-input errors when flattening to `DataFusionError`. /// -/// Validates that the plan does not use reserved underscore-prefixed aliases -/// and enforces read-only constraints (no DDL/DML). 
-#[tracing::instrument(skip_all, err)] -pub async fn sql_to_plan( - ctx: &SessionContext, - query: parser::Statement, -) -> Result { - let plan = ctx - .state() - .statement_to_plan(query) - .await - .map_err(SqlToPlanError::StatementToPlan)?; - - forbid_underscore_prefixed_aliases(&plan).map_err(SqlToPlanError::ForbiddenAliases)?; - read_only_check(&plan).map_err(SqlToPlanError::ReadOnlyCheck)?; - - Ok(plan) -} +/// Errors tagged with this context represent invalid user input (e.g., malformed +/// SQL references, forbidden aliases, read-only violations) and should be surfaced +/// as `BAD_REQUEST` / `invalid_argument` to the client. Internal errors are not +/// tagged and default to `INTERNAL_SERVER_ERROR` / `internal`. +pub(super) const INVALID_INPUT_CONTEXT: &str = "amp::invalid_input"; /// Verifies that the logical plan contains no DDL, DML, or statement operations. -pub fn read_only_check(plan: &LogicalPlan) -> Result<(), ReadOnlyCheckError> { +pub(super) fn read_only_check(plan: &LogicalPlan) -> Result<(), ReadOnlyCheckError> { SQLOptions::new() .with_allow_ddl(false) .with_allow_dml(false) @@ -60,44 +18,6 @@ pub fn read_only_check(plan: &LogicalPlan) -> Result<(), ReadOnlyCheckError> { .map_err(ReadOnlyCheckError) } -/// Failed to convert SQL statement to a logical plan -/// -/// This error covers all failure modes during the `sql_to_plan` pipeline: -/// statement-to-plan conversion, alias validation, and read-only enforcement. 
-#[derive(Debug, thiserror::Error)] -pub enum SqlToPlanError { - /// DataFusion failed to convert the SQL statement into a logical plan - /// - /// Possible causes: - /// - SQL syntax is valid but semantically incorrect (e.g., type mismatch) - /// - Referenced columns do not exist in the schema - /// - Unsupported SQL features - #[error("failed to convert SQL statement to logical plan: {0}")] - StatementToPlan(#[source] DataFusionError), - - /// Query uses underscore-prefixed column aliases which are reserved - /// - /// Column aliases starting with `_` are reserved for special columns - /// like `_block_num`. Queries must not use these aliases. - #[error("query uses forbidden underscore-prefixed aliases")] - ForbiddenAliases(#[source] DataFusionError), - - /// Query plan violates read-only constraints - /// - /// The plan contains DDL (CREATE, DROP, ALTER), DML (INSERT, UPDATE, DELETE), - /// or statement operations that are not allowed in a read-only context. - #[error("query plan violates read-only constraints")] - ReadOnlyCheck(#[source] ReadOnlyCheckError), -} - -impl SqlToPlanError { - /// Returns `true` if this error represents an invalid plan due to user input - /// (forbidden aliases or read-only violations) rather than an internal failure. 
- pub fn is_invalid_plan(&self) -> bool { - matches!(self, Self::ForbiddenAliases(_) | Self::ReadOnlyCheck(_)) - } -} - /// Query plan violates read-only constraints /// /// This occurs when a query plan contains DDL, DML, or statement operations diff --git a/crates/core/common/src/context/exec.rs b/crates/core/common/src/context/exec.rs index 7e8e1105e..ae35cfc08 100644 --- a/crates/core/common/src/context/exec.rs +++ b/crates/core/common/src/context/exec.rs @@ -1,24 +1,28 @@ use std::{ - collections::BTreeMap, + collections::{BTreeMap, BTreeSet}, pin::Pin, sync::{Arc, LazyLock}, task::{Context, Poll}, }; +use amp_data_store::DataStore; use arrow::{array::ArrayRef, compute::concat_batches, datatypes::SchemaRef}; use datafusion::{ self, arrow::array::RecordBatch, - catalog::MemorySchemaProvider, + catalog::{AsyncCatalogProvider as TableAsyncCatalogProvider, MemorySchemaProvider}, error::DataFusionError, execution::{ - RecordBatchStream, SendableRecordBatchStream, SessionStateBuilder, config::SessionConfig, - context::SessionContext, memory_pool::human_readable_size, runtime_env::RuntimeEnv, + RecordBatchStream, SendableRecordBatchStream, TaskContext, + cache::cache_manager::CacheManager, + config::SessionConfig, + disk_manager::DiskManager, + memory_pool::{MemoryPool, human_readable_size}, + object_store::ObjectStoreRegistry, }, logical_expr::LogicalPlan, physical_optimizer::PhysicalOptimizerRule, physical_plan::{ExecutionPlan, displayable, execute_stream, stream::RecordBatchStreamAdapter}, - physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner as _}, sql::parser, }; use datafusion_tracing::{ @@ -26,6 +30,7 @@ use datafusion_tracing::{ }; use datasets_common::network_id::NetworkId; use futures::{Stream, TryStreamExt, stream}; +use js_runtime::isolate_pool::IsolatePool; use regex::Regex; use tracing::field; @@ -35,13 +40,19 @@ use crate::{ Catalog, snapshot::{CatalogSnapshot, FromCatalogError, QueryableSnapshot}, }, - context::common::{ - ReadOnlyCheckError, 
SqlToPlanError, builtin_udfs, read_only_check, sql_to_plan, + context::{ + common::{INVALID_INPUT_CONTEXT, ReadOnlyCheckError, read_only_check}, + session::{SessionContext, SessionState, SessionStateBuilder}, }, + dataset_store::DatasetStore, exec_env::ExecEnv, + func_catalog::catalog_provider::AsyncCatalogProvider as FuncAsyncCatalogProvider, memory_pool::{MemoryPoolKind, TieredMemoryPool, make_memory_pool}, physical_table::MultiNetworkSegmentsError, - plan_visitors::{extract_table_references_from_plan, forbid_duplicate_field_names}, + plan_visitors::{ + extract_table_references_from_plan, forbid_duplicate_field_names, + forbid_underscore_prefixed_aliases, + }, sql::{TableReference, TableReferenceConversionError}, }; @@ -49,57 +60,17 @@ use crate::{ #[derive(Clone)] pub struct ExecContext { pub env: ExecEnv, - session_config: SessionConfig, physical_table: CatalogSnapshot, query_snapshots: Vec>, /// Per-query memory pool (if per-query limits are enabled) tiered_memory_pool: Arc, + /// Custom session context that owns the runtime environment and optimizer + /// rules. All session creation goes through `session_ctx.state()` + /// so that runtime configuration is never duplicated across call sites. + session_ctx: SessionContext, } impl ExecContext { - /// Creates a query context from a physical catalog. - /// - /// `ignore_canonical` controls whether to use only the canonical chain (`false`, default) - /// or all physically present segments (`true`, used for forked-chain contexts). 
- pub async fn for_catalog( - env: ExecEnv, - catalog: Catalog, - ignore_canonical: bool, - ) -> Result { - // Create a catalog snapshot with canonical chain locked in - let (entries, logical) = catalog.into_parts(); - let physical_table = CatalogSnapshot::from_catalog(logical, &entries, ignore_canonical) - .await - .map_err(CreateContextError::CatalogSnapshot)?; - - let query_snapshots = physical_table - .table_snapshots() - .map(|(s, sql_schema_name)| { - QueryableSnapshot::from_snapshot(s, env.store.clone(), sql_schema_name.to_string()) - .map(Arc::new) - .map_err(CreateContextError::MultiNetworkSegments) - }) - .collect::, _>>()?; - - let tiered_memory_pool: Arc = { - let per_query_bytes = env.query_max_mem_mb * 1024 * 1024; - let child_pool = make_memory_pool(MemoryPoolKind::Greedy, per_query_bytes); - - Arc::new(TieredMemoryPool::new( - env.global_memory_pool.clone(), - child_pool, - )) - }; - - Ok(Self { - session_config: env.session_config.clone(), - env, - physical_table, - query_snapshots, - tiered_memory_pool, - }) - } - /// Returns the physical catalog snapshot backing this query context. /// /// Exposes segment-level data for streaming query consumers that need to @@ -110,18 +81,38 @@ impl ExecContext { } /// Converts a parsed SQL statement into a logical plan against the physical catalog. - pub async fn plan_sql(&self, query: parser::Statement) -> Result { - let ctx = new_session_ctx( - self.session_config.clone(), - &self.tiered_memory_pool, + /// + /// Performs async pre-resolution, registers physical catalog tables, + /// then enforces consumer-level policies: forbidden underscore-prefixed + /// aliases and read-only constraints. 
+ pub async fn statement_to_plan( + &self, + query: parser::Statement, + ) -> Result { + let mut state = self + .session_ctx + .resolved_state(&query) + .await + .map_err(SqlError::Planning)?; + register_catalog( &self.env, - ); - register_catalog(&self.env, &ctx, &self.physical_table, &self.query_snapshots) - .map_err(SqlError::RegisterTable)?; + &mut state, + &self.physical_table, + &self.query_snapshots, + ) + .map_err(SqlError::RegisterTable)?; + + let plan = state.statement_to_plan(query).await.map_err(|err| { + SqlError::Planning(StatementToPlanError::StatementToPlan(err).into_datafusion_error()) + })?; + + read_only_check(&plan).map_err(|err| { + SqlError::Planning(StatementToPlanError::ReadOnlyCheck(err).into_datafusion_error()) + })?; + forbid_underscore_prefixed_aliases(&plan).map_err(|err| { + SqlError::Planning(StatementToPlanError::ForbiddenAliases(err).into_datafusion_error()) + })?; - let plan = sql_to_plan(&ctx, query) - .await - .map_err(SqlError::SqlToPlan)?; Ok(plan) } @@ -131,15 +122,16 @@ impl ExecContext { plan: LogicalPlan, logical_optimize: bool, ) -> Result { - let ctx = new_session_ctx( - self.session_config.clone(), - &self.tiered_memory_pool, + let mut state = self.session_ctx.state(); + register_catalog( &self.env, - ); - register_catalog(&self.env, &ctx, &self.physical_table, &self.query_snapshots) - .map_err(ExecutePlanError::RegisterTable)?; + &mut state, + &self.physical_table, + &self.query_snapshots, + ) + .map_err(ExecutePlanError::RegisterTable)?; - let result = execute_plan(&ctx, plan, logical_optimize) + let result = execute_plan(&state, plan, logical_optimize) .await .map_err(ExecutePlanError::Execute)?; @@ -155,22 +147,24 @@ impl ExecContext { plan: LogicalPlan, ) -> Result { let schema = plan.schema().inner().clone(); - let ctx = new_session_ctx( - self.session_config.clone(), - &self.tiered_memory_pool, + let mut state = self.session_ctx.state(); + register_catalog( &self.env, - ); - register_catalog(&self.env, &ctx, 
&self.physical_table, &self.query_snapshots) - .map_err(ExecuteAndConcatError::RegisterTable)?; + &mut state, + &self.physical_table, + &self.query_snapshots, + ) + .map_err(ExecuteAndConcatError::RegisterTable)?; - let batch_stream = execute_plan(&ctx, plan, true) + let batch_stream = execute_plan(&state, plan, true) .await .map_err(ExecuteAndConcatError::Execute)? .try_collect::>() .await .map_err(ExecuteAndConcatError::CollectResults)?; - Ok(concat_batches(&schema, &batch_stream).unwrap()) + concat_batches(&schema, &batch_stream) + .map_err(|err| ExecuteAndConcatError::ConcatBatches(err.into())) } /// Looks up a queryable snapshot by reference. Fails if the table is not in the catalog. @@ -195,7 +189,7 @@ impl ExecContext { &self, plan: &LogicalPlan, ) -> Result, CommonRangesError> { - let mut ranges_by_network: BTreeMap = BTreeMap::new(); + let mut ranges_by_network: BTreeMap = Default::default(); for df_table_ref in extract_table_references_from_plan(plan) .map_err(CommonRangesError::ExtractTableReferences)? { @@ -267,24 +261,244 @@ pub enum CreateContextError { MultiNetworkSegments(#[source] MultiNetworkSegmentsError), } +/// Builder for [`ExecContext`]. +/// +/// Accepts the decomposed components that were previously combined into an +/// umbrella [`ExecEnv`] object. Internally composes a [`SessionStateBuilder`] +/// for session construction; no session builder internals are exposed through +/// the public API. +/// +/// Construction is a two-step process: configure the builder with +/// [`new`](Self::new), then call [`for_catalog`](Self::for_catalog) to +/// asynchronously create the [`ExecContext`] (catalog snapshot creation +/// requires async canonical-chain resolution that cannot happen in a sync +/// builder). +/// +/// # Async catalog providers (planning-only) +/// +/// The builder exposes [`with_table_catalog`](Self::with_table_catalog) and +/// [`with_func_catalog`](Self::with_func_catalog) for registering async +/// catalog providers. 
These feed into the session's +/// `resolved_state()` pre-resolution pipeline, which is +/// invoked by [`ExecContext::statement_to_plan`]. +/// +/// **Current production state**: no production call site wires async catalog +/// providers on exec. The eager [`CatalogSnapshot`] path (physical tables +/// registered via [`register_catalog`]) is the primary exec-path table/function +/// source. When both catalog maps are empty the session's pre-resolution +/// early-outs to the eager path, so production exec planning incurs no async +/// overhead. The async APIs are retained for testing and future use. +pub struct ExecContextBuilder { + session_config: SessionConfig, + store: DataStore, + dataset_store: DatasetStore, + isolate_pool: IsolatePool, + global_memory_pool: Arc, + query_max_mem_mb: usize, + disk_manager: Arc, + cache_manager: Arc, + object_store_registry: Arc, + table_catalogs: BTreeMap>, + func_catalogs: BTreeMap>, +} + +impl ExecContextBuilder { + /// Creates a new builder from an [`ExecEnv`]. + /// + /// All mandatory execution components are sourced from the environment. + /// Optional async catalog providers can be added via + /// [`with_table_catalog`](Self::with_table_catalog) and + /// [`with_func_catalog`](Self::with_func_catalog). + pub fn new(env: ExecEnv) -> Self { + Self { + session_config: env.session_config, + store: env.store, + dataset_store: env.dataset_store, + isolate_pool: env.isolate_pool, + global_memory_pool: env.global_memory_pool, + query_max_mem_mb: env.query_max_mem_mb, + disk_manager: env.disk_manager, + cache_manager: env.cache_manager, + object_store_registry: env.object_store_registry, + table_catalogs: Default::default(), + func_catalogs: Default::default(), + } + } + + /// Registers a named async table catalog provider for SQL pre-resolution. + /// + /// Providers are consulted by the custom session during `statement_to_plan` before + /// SQL-to-plan conversion. Only catalogs referenced by the query are + /// resolved. 
The `name` must match the `SessionConfig` default catalog + /// for the provider to be reachable — see [`SessionStateBuilder`] docs. + /// Not wired at production call sites — see struct-level docs. + pub fn with_table_catalog( + mut self, + name: impl Into, + provider: Arc, + ) -> Self { + self.table_catalogs.insert(name.into(), provider); + self + } + + /// Registers a named async function catalog provider for SQL pre-resolution. + /// + /// Providers are consulted by the custom session during `statement_to_plan` before + /// SQL-to-plan conversion. Only schema-qualified function references trigger + /// async function resolution. The `name` must match the `SessionConfig` + /// default catalog for the provider to be reachable — see + /// [`SessionStateBuilder`] docs. Not wired at production call sites — + /// see struct-level docs. + pub fn with_func_catalog( + mut self, + name: impl Into, + provider: Arc, + ) -> Self { + self.func_catalogs.insert(name.into(), provider); + self + } + + /// Creates an [`ExecContext`] backed by a physical catalog. + /// + /// This is the async construction step that cannot be performed in the + /// synchronous builder: it locks in the canonical chain via + /// [`CatalogSnapshot::from_catalog`] and assembles the full execution + /// context. + /// + /// Internally composes a [`SessionStateBuilder`] from the stored + /// components (including the per-query tiered memory pool). The session + /// builder and its internals are not exposed through the + /// [`ExecContextBuilder`] API. + pub async fn for_catalog( + self, + catalog: Catalog, + ignore_canonical_segments: bool, + ) -> Result { + // Create the per-query tiered memory pool from global + per-query limit. 
+ let per_query_bytes = self.query_max_mem_mb * 1024 * 1024; + let child_pool = make_memory_pool(MemoryPoolKind::Greedy, per_query_bytes); + let tiered_memory_pool = Arc::new(TieredMemoryPool::new( + self.global_memory_pool.clone(), + child_pool, + )); + + // Create the catalog snapshot (the async step that requires canonical + // chain resolution from segment metadata). + let (entries, tables, udfs) = catalog.into_parts(); + let physical_table = + CatalogSnapshot::from_catalog(tables, udfs, &entries, ignore_canonical_segments) + .await + .map_err(CreateContextError::CatalogSnapshot)?; + + let query_snapshots = physical_table + .table_snapshots() + .map(|(s, sql_schema_name)| { + QueryableSnapshot::from_snapshot(s, self.store.clone(), sql_schema_name.to_string()) + .map(Arc::new) + .map_err(CreateContextError::MultiNetworkSegments) + }) + .collect::, _>>()?; + + let env = ExecEnv { + session_config: self.session_config.clone(), + global_memory_pool: self.global_memory_pool.clone(), + disk_manager: self.disk_manager.clone(), + cache_manager: self.cache_manager.clone(), + object_store_registry: self.object_store_registry.clone(), + isolate_pool: self.isolate_pool, + query_max_mem_mb: self.query_max_mem_mb, + store: self.store, + dataset_store: self.dataset_store, + }; + + // Compose a SessionStateBuilder from the stored components (including + // the per-query tiered pool). All session creation in ExecContext goes + // through `self.session_ctx.state()`. 
+ let mut session_builder = SessionStateBuilder::new(self.session_config) + .with_memory_pool(tiered_memory_pool.clone()) + .with_disk_manager(env.disk_manager.clone()) + .with_cache_manager(env.cache_manager.clone()) + .with_object_store_registry(env.object_store_registry.clone()) + .with_physical_optimizer_rule(create_instrumentation_rule()); + + for (name, provider) in self.table_catalogs { + session_builder = session_builder.with_table_catalog(name, provider); + } + for (name, provider) in self.func_catalogs { + session_builder = session_builder.with_func_catalog(name, provider); + } + + let session_ctx = SessionContext::new_with_state(session_builder.build()); + + Ok(ExecContext { + env, + physical_table, + query_snapshots, + tiered_memory_pool, + session_ctx, + }) + } +} + /// Failed to plan a SQL query in the query context /// -/// This error covers failures during `ExecContext::plan_sql()`. +/// This error covers failures during `ExecContext::statement_to_plan()`. #[derive(Debug, thiserror::Error)] pub enum SqlError { - /// Failed to create a exec session context + /// Failed during SQL planning (pre-resolution or plan conversion) /// - /// This occurs when building a `SessionContext` for query execution fails, - /// typically due to a table registration error during context setup. + /// Covers async catalog pre-resolution failures and SQL-to-logical-plan + /// conversion errors. Use [`is_user_input_error`](super::session::is_user_input_error) + /// on the inner `DataFusionError` to classify user-input vs internal errors. + #[error("failed to plan SQL query")] + Planning(#[source] DataFusionError), + + /// Failed to create an exec session context + /// + /// This occurs when building a `SessionState` for query execution fails, + /// typically due to a table registration error during state setup. 
#[error("failed to create exec session context")] RegisterTable(#[source] RegisterTableError), +} - /// Failed to convert SQL to a logical plan - /// - /// This occurs during SQL-to-logical-plan conversion, including - /// statement parsing, alias validation, and read-only enforcement. - #[error("failed to convert SQL to logical plan: {0}")] - SqlToPlan(#[source] SqlToPlanError), +/// Failed to convert a SQL statement into a validated logical plan. +/// +/// Covers all failure modes during policy-enforcing SQL planning in +/// [`ExecContext::statement_to_plan`]: statement conversion, alias validation, +/// and read-only enforcement. Flattened to [`DataFusionError`] before wrapping +/// in [`SqlError::Planning`]. +#[derive(Debug, thiserror::Error)] +enum StatementToPlanError { + /// DataFusion failed to convert the SQL statement into a logical plan + #[error("failed to convert SQL statement to logical plan: {0}")] + StatementToPlan(#[source] DataFusionError), + + /// Query uses underscore-prefixed column aliases which are reserved + #[error("query uses forbidden underscore-prefixed aliases")] + ForbiddenAliases(#[source] DataFusionError), + + /// Query plan violates read-only constraints + #[error("query plan violates read-only constraints")] + ReadOnlyCheck(#[source] ReadOnlyCheckError), +} + +impl StatementToPlanError { + /// Converts into a [`DataFusionError`] with appropriate user-input tagging. 
+ fn into_datafusion_error(self) -> DataFusionError { + match self { + Self::StatementToPlan(err) => { + err.context("failed to convert SQL statement to logical plan") + } + Self::ForbiddenAliases(err) => DataFusionError::Plan(format!( + "query uses forbidden underscore-prefixed aliases: {err}" + )) + .context(INVALID_INPUT_CONTEXT), + Self::ReadOnlyCheck(err) => { + DataFusionError::Plan(format!("query plan violates read-only constraints: {err}")) + .context(INVALID_INPUT_CONTEXT) + } + } + } } /// Errors that occur during inner `execute_plan` function @@ -316,6 +530,13 @@ pub enum ExecuteError { /// Failed to collect explain results #[error("failed to collect explain results")] CollectExplainResults(#[source] DataFusionError), + + /// Failed to concatenate collected explain result batches + /// + /// This occurs when schema mismatch or allocation failure prevents + /// concatenation of EXPLAIN output batches. + #[error("failed to concatenate explain result batches")] + ConcatExplainResults(#[source] DataFusionError), } /// Failed to execute a plan via `ExecContext::execute_plan` @@ -323,7 +544,7 @@ pub enum ExecuteError { /// This error wraps session context creation and inner execution errors. #[derive(Debug, thiserror::Error)] pub enum ExecutePlanError { - /// Failed to create a exec session context + /// Failed to create an exec session context #[error("failed to create exec session context")] RegisterTable(#[source] RegisterTableError), @@ -337,7 +558,7 @@ pub enum ExecutePlanError { /// This error covers `ExecContext::execute_and_concat()`. #[derive(Debug, thiserror::Error)] pub enum ExecuteAndConcatError { - /// Failed to create a exec session context + /// Failed to create an exec session context #[error("failed to create exec session context")] RegisterTable(#[source] RegisterTableError), @@ -351,6 +572,13 @@ pub enum ExecuteAndConcatError { /// execution when materializing record batches from the result stream. 
#[error("failed to collect query results")] CollectResults(#[source] DataFusionError), + + /// Failed to concatenate collected result batches into a single batch + /// + /// This occurs when schema mismatch or allocation failure prevents + /// concatenation of the materialized record batches. + #[error("failed to concatenate query result batches")] + ConcatBatches(#[source] DataFusionError), } /// Referenced table does not exist in the catalog @@ -380,144 +608,153 @@ pub enum CommonRangesError { TableNotFound(#[source] TableNotFoundError), } -/// Creates a bare DataFusion session context with the query runtime environment and builtin -/// UDFs but no catalog tables. +/// Registers the tables and UDFs from a [`CatalogSnapshot`] into a [`SessionState`]. /// -/// Call [`register_catalog`] on the returned context to populate it with tables and UDFs -/// from a [`CatalogSnapshot`]. -fn new_session_ctx( - config: SessionConfig, - tiered_memory_pool: &Arc, - env: &ExecEnv, -) -> SessionContext { - let runtime_env = Arc::new(RuntimeEnv { - memory_pool: tiered_memory_pool.clone(), - disk_manager: env.disk_manager.clone(), - cache_manager: env.cache_manager.clone(), - object_store_registry: env.object_store_registry.clone(), - }); - - let state = SessionStateBuilder::new() - .with_config(config) - .with_runtime_env(runtime_env) - .with_default_features() - .with_physical_optimizer_rule(create_instrumentation_rule()) - .build(); - - let ctx = SessionContext::new_with_state(state); - - // Register the builtin UDFs - for udf in builtin_udfs() { - ctx.register_udf(udf); - } - - ctx -} - -/// Registers the tables and UDFs from a [`CatalogSnapshot`] into a [`SessionContext`]. +/// For each unique schema name a [`MemorySchemaProvider`] is created when the +/// schema does not already exist **or** when the existing schema is a read-only +/// provider (e.g. `ResolvedSchemaProvider` left by async pre-resolution). 
+/// Physical catalog tables are then registered into the writable schema. fn register_catalog( env: &ExecEnv, - ctx: &SessionContext, + state: &mut SessionState, catalog: &CatalogSnapshot, query_snapshots: &[Arc], ) -> Result<(), RegisterTableError> { - for table in query_snapshots { - // The catalog schema needs to be explicitly created or table creation will fail. - let schema_name = table.sql_schema_name(); - if ctx - .catalog(&ctx.catalog_names()[0]) - .unwrap() - .schema(schema_name) - .is_none() - { + let default_catalog_name = state.config().options().catalog.default_catalog.clone(); + let default_catalog = state + .catalog_list() + .catalog(&default_catalog_name) + .ok_or_else(|| RegisterTableError::MissingDefaultCatalog { + catalog: default_catalog_name.clone(), + })?; + + // Always register fresh schemas to ensure idempotent table registration. + // The catalog's `CatalogProviderList` is `Arc`-shared across `DfSessionState` + // clones, so schemas from a prior planning phase (`register_logical_catalog`) + // may persist with their planning-only tables. A fresh `MemorySchemaProvider` + // avoids "table already exists" errors from DF 52's strict duplicate rejection. + { + let schema_names = query_snapshots + .iter() + .map(|t| t.sql_schema_name()) + .collect::>(); + for schema_name in schema_names { let schema = Arc::new(MemorySchemaProvider::new()); - ctx.catalog(&ctx.catalog_names()[0]) - .unwrap() + default_catalog .register_schema(schema_name, schema) - .unwrap(); + .map_err(|source| RegisterTableError::RegisterSchema { + catalog: default_catalog_name.clone(), + schema: schema_name.to_string(), + source, + })?; } + } + for table in query_snapshots { let table_ref = table.table_ref(); // This may overwrite a previously registered store, but that should not make a difference. // The only segment of the `table.url()` that matters here is the schema and bucket name. 
- ctx.register_object_store( + state.runtime_env().register_object_store( table.physical_table().url(), env.store.as_datafusion_object_store().clone(), ); - ctx.register_table(table_ref, table.clone()) - .map_err(RegisterTableError)?; + let table_name = table_ref.table().to_string(); + let table_ref: datafusion::common::TableReference = table_ref.into(); + let schema_provider = state + .schema_for_ref(table_ref) + .map_err(RegisterTableError::RegisterTable)?; + + schema_provider + .register_table(table_name, table.clone()) + .map_err(RegisterTableError::RegisterTable)?; } // Register catalog UDFs for udf in catalog.udfs() { - ctx.register_udf(udf.clone()); + state + .register_udf(Arc::new(udf.clone())) + .map_err(RegisterTableError::RegisterUdf)?; } Ok(()) } -/// Failed to register a dataset table with the exec session context -/// -/// This occurs when DataFusion rejects a table registration during query -/// session creation, typically because a table with the same name already -/// exists or the table metadata is invalid. 
+/// Failed to register catalog content with the exec session context #[derive(Debug, thiserror::Error)] -#[error("Failed to register dataset table with exec session context")] -pub struct RegisterTableError(#[source] DataFusionError); +pub enum RegisterTableError { + /// The configured default catalog is missing from the DataFusion session + #[error("default catalog '{catalog}' is not registered in exec session context")] + MissingDefaultCatalog { catalog: String }, + + /// Failed to create a schema in the configured default catalog + #[error("failed to register schema '{schema}' in default catalog '{catalog}'")] + RegisterSchema { + catalog: String, + schema: String, + #[source] + source: DataFusionError, + }, + + /// Failed to register a dataset table in the exec session context + #[error("failed to register dataset table with exec session context")] + RegisterTable(#[source] DataFusionError), + + /// Failed to register a catalog UDF in the exec session context + #[error("failed to register catalog UDF in exec session context")] + RegisterUdf(#[source] DataFusionError), +} /// `logical_optimize` controls whether logical optimizations should be applied to `plan`. 
#[tracing::instrument(skip_all, err)] async fn execute_plan( - ctx: &SessionContext, + state: &SessionState, mut plan: LogicalPlan, logical_optimize: bool, ) -> Result { read_only_check(&plan).map_err(ExecuteError::ReadOnlyCheck)?; - tracing::debug!("logical plan: {}", plan.to_string().replace('\n', "\\n")); + tracing::debug!(logical_plan = %plan.to_string().replace('\n', "\\n"), "planned SQL"); if logical_optimize { - plan = ctx - .state() - .optimize(&plan) - .map_err(ExecuteError::Optimize)?; + plan = state.optimize(&plan).map_err(ExecuteError::Optimize)?; } let is_explain = matches!(plan, LogicalPlan::Explain(_) | LogicalPlan::Analyze(_)); - let planner = DefaultPhysicalPlanner::default(); - let physical_plan = planner - .create_physical_plan(&plan, &ctx.state()) + let physical_plan = state + .create_physical_plan(&plan) .await .map_err(ExecuteError::CreatePhysicalPlan)?; forbid_duplicate_field_names(&physical_plan, &plan) .map_err(ExecuteError::DuplicateFieldNames)?; - tracing::debug!("physical plan: {}", print_physical_plan(&*physical_plan)); + tracing::debug!(physical_plan = %print_physical_plan(&*physical_plan), "optimized plan"); + let task_ctx = state.task_ctx(); match is_explain { - false => execute_stream(physical_plan, ctx.task_ctx()).map_err(ExecuteError::ExecuteStream), - true => execute_explain(physical_plan, ctx).await, + false => execute_stream(physical_plan, task_ctx).map_err(ExecuteError::ExecuteStream), + true => execute_explain(physical_plan, task_ctx).await, } } // We do special handling for `Explain` plans to ensure that the output is sanitized from full paths. 
async fn execute_explain( physical_plan: Arc, - ctx: &SessionContext, + task_ctx: Arc, ) -> Result { use datafusion::physical_plan::execution_plan; let schema = physical_plan.schema().clone(); - let output = execution_plan::collect(physical_plan, ctx.task_ctx()) + let output = execution_plan::collect(physical_plan, task_ctx) .await .map_err(ExecuteError::CollectExplainResults)?; - let concatenated = concat_batches(&schema, &output).unwrap(); - let sanitized = sanitize_explain(&concatenated); + let concatenated = concat_batches(&schema, &output) + .map_err(|err| ExecuteError::ConcatExplainResults(err.into()))?; + let sanitized = sanitize_explain(concatenated); let stream = RecordBatchStreamAdapter::new(schema, stream::iter(std::iter::once(Ok(sanitized)))); @@ -534,16 +771,24 @@ fn sanitize_parquet_paths(text: &str) -> String { PARQUET_PATH_REGEX.replace_all(text, "$1").into_owned() } -// Sanitize the explain output by removing full paths and and keeping only the filenames. -fn sanitize_explain(batch: &RecordBatch) -> RecordBatch { +// Sanitize the explain output by removing full paths and keeping only the filenames. +// +// Uses best-effort passthrough: if the expected "plan" column is absent, is not a +// StringArray, or if constructing the output batch fails (e.g., on a future DataFusion +// EXPLAIN schema change), the original batch is returned unchanged rather than panicking. 
+fn sanitize_explain(batch: RecordBatch) -> RecordBatch { use arrow::array::StringArray; - let plan_idx = batch.schema().index_of("plan").unwrap(); - let plan_column = batch + let Ok(plan_idx) = batch.schema().index_of("plan") else { + return batch; + }; + let Some(plan_column) = batch .column(plan_idx) .as_any() .downcast_ref::() - .unwrap(); + else { + return batch; + }; let transformed: StringArray = plan_column .iter() @@ -553,7 +798,7 @@ fn sanitize_explain(batch: &RecordBatch) -> RecordBatch { let mut columns: Vec = batch.columns().to_vec(); columns[plan_idx] = Arc::new(transformed); - RecordBatch::try_new(batch.schema(), columns).unwrap() + RecordBatch::try_new(batch.schema(), columns).unwrap_or(batch) } /// Prints the physical plan to a single line, for logging. @@ -622,3 +867,268 @@ pub fn create_instrumentation_rule() -> Arc::new()))], + ) + .expect("empty test batch should be constructible"); + + //* When + let result = sanitize_explain(batch); + + //* Then + assert_eq!(result.num_rows(), 0, "empty batch should remain empty"); + } + + #[test] + fn sanitize_explain_with_plan_column_sanitizes_parquet_paths() { + //* Given + let schema = Arc::new(Schema::new(vec![Field::new("plan", DataType::Utf8, false)])); + let plan_text = "ParquetExec: file_groups={1 group: [/data/store/subdir/file.parquet]}"; + let batch = + RecordBatch::try_new(schema, vec![Arc::new(StringArray::from(vec![plan_text]))]) + .expect("test batch should be constructible"); + + //* When + let result = sanitize_explain(batch); + + //* Then + let plan_col = result + .column(0) + .as_any() + .downcast_ref::() + .expect("plan column should remain StringArray after sanitization"); + let sanitized = plan_col.value(0); + assert!( + !sanitized.contains("/data/store/subdir/"), + "full directory path should be removed from explain output: {sanitized}" + ); + assert!( + sanitized.contains("file.parquet"), + "parquet filename should be preserved: {sanitized}" + ); + } + + #[test] + fn 
sanitize_explain_with_multiple_columns_preserves_non_plan_columns() { + //* Given + let schema = Arc::new(Schema::new(vec![ + Field::new("type", DataType::Utf8, false), + Field::new("plan", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(StringArray::from(vec!["physical"])), + Arc::new(StringArray::from(vec!["Scan: /long/path/to/data.parquet"])), + ], + ) + .expect("multi-column test batch should be constructible"); + + //* When + let result = sanitize_explain(batch); + + //* Then + let type_col = result + .column(0) + .as_any() + .downcast_ref::() + .expect("type column should remain StringArray"); + assert_eq!( + type_col.value(0), + "physical", + "non-plan columns should be unchanged" + ); + + let plan_col = result + .column(1) + .as_any() + .downcast_ref::() + .expect("plan column should remain StringArray"); + assert!( + plan_col.value(0).contains("data.parquet"), + "parquet filename should be preserved in plan column" + ); + } + + #[test] + fn sanitize_explain_with_null_values_in_plan_column_preserves_nulls() { + //* Given + let schema = Arc::new(Schema::new(vec![Field::new("plan", DataType::Utf8, true)])); + let batch = RecordBatch::try_new( + schema, + vec![Arc::new(StringArray::from(vec![ + Some("Scan: /path/to/data.parquet"), + None, + Some("Filter: col > 0"), + ]))], + ) + .expect("test batch with nulls should be constructible"); + + //* When + let result = sanitize_explain(batch); + + //* Then + let plan_col = result + .column(0) + .as_any() + .downcast_ref::() + .expect("plan column should remain StringArray"); + assert_eq!(result.num_rows(), 3, "all rows should be preserved"); + assert!( + plan_col.value(0).contains("data.parquet"), + "non-null parquet path should be sanitized" + ); + assert!(plan_col.is_null(1), "null values should be preserved"); + assert_eq!( + plan_col.value(2), + "Filter: col > 0", + "rows without parquet paths should be unchanged" + ); + } + } + + mod sanitize_parquet_paths_tests { 
+ use super::*; + + #[test] + fn sanitize_parquet_paths_with_full_path_returns_filename_only() { + //* When + let result = sanitize_parquet_paths("/data/store/subdir/file.parquet"); + + //* Then + assert_eq!( + result, "file.parquet", + "full path should be replaced with filename only" + ); + } + + #[test] + fn sanitize_parquet_paths_with_filename_only_returns_unchanged() { + //* When + let result = sanitize_parquet_paths("file.parquet"); + + //* Then + assert_eq!(result, "file.parquet", "bare filename should be unchanged"); + } + + #[test] + fn sanitize_parquet_paths_with_no_parquet_returns_unchanged() { + //* Given + let text = "Filter: column > 0"; + + //* When + let result = sanitize_parquet_paths(text); + + //* Then + assert_eq!( + result, text, + "text without parquet paths should be unchanged" + ); + } + + #[test] + fn sanitize_parquet_paths_with_multiple_paths_sanitizes_all() { + //* Given + let text = "files: [/a/b/one.parquet, /c/d/two.parquet]"; + + //* When + let result = sanitize_parquet_paths(text); + + //* Then + assert!( + result.contains("one.parquet"), + "first filename should be preserved: {result}" + ); + assert!( + result.contains("two.parquet"), + "second filename should be preserved: {result}" + ); + assert!( + !result.contains("/a/b/"), + "first directory path should be removed: {result}" + ); + assert!( + !result.contains("/c/d/"), + "second directory path should be removed: {result}" + ); + } + + #[test] + fn sanitize_parquet_paths_with_empty_string_returns_empty() { + //* When + let result = sanitize_parquet_paths(""); + + //* Then + assert_eq!(result, "", "empty string should remain empty"); + } + } +} diff --git a/crates/core/common/src/context/plan.rs b/crates/core/common/src/context/plan.rs index a93d6abdb..32c72170e 100644 --- a/crates/core/common/src/context/plan.rs +++ b/crates/core/common/src/context/plan.rs @@ -1,199 +1,156 @@ use std::sync::Arc; use datafusion::{ - catalog::MemorySchemaProvider, - common::DFSchemaRef, - 
error::DataFusionError, - execution::{SessionStateBuilder, config::SessionConfig, context::SessionContext}, - sql::parser, + catalog::AsyncCatalogProvider as TableAsyncCatalogProvider, common::DFSchemaRef, + error::DataFusionError, execution::config::SessionConfig, sql::parser, }; +pub use crate::context::session::is_user_input_error; use crate::{ - catalog::logical::{LogicalCatalog, LogicalTable}, - context::common::{SqlToPlanError, builtin_udfs, sql_to_plan}, + context::{ + common::{INVALID_INPUT_CONTEXT, ReadOnlyCheckError, read_only_check}, + session::{SessionContext, SessionStateBuilder}, + }, detached_logical_plan::DetachedLogicalPlan, - plan_table::PlanTable, + func_catalog::catalog_provider::AsyncCatalogProvider as FuncAsyncCatalogProvider, + plan_visitors::forbid_underscore_prefixed_aliases, }; /// A context for planning SQL queries. +/// +/// Delegates SQL planning and schema inference to a custom [`SessionContext`] +/// that performs async catalog pre-resolution before calling DataFusion. +/// Async-resolved catalogs are registered via `with_table_catalog` and +/// `with_func_catalog`. pub struct PlanContext { - session_config: SessionConfig, - catalog: LogicalCatalog, + session_ctx: SessionContext, } impl PlanContext { - /// Creates a planning context from a logical catalog. - pub fn new(session_config: SessionConfig, catalog: LogicalCatalog) -> Self { - Self { - session_config, - catalog, - } - } - - /// Returns the logical tables registered in this planning context. - pub fn logical_tables(&self) -> &[LogicalTable] { - &self.catalog.tables - } - /// Infers the output schema of the query by planning it against empty tables. 
pub async fn sql_output_schema( &self, query: parser::Statement, - ) -> Result { - let ctx = new_session_ctx(self.session_config.clone()); - register_catalog(&ctx, &self.catalog).map_err(SqlError::RegisterTable)?; - let plan = sql_to_plan(&ctx, query) - .await - .map_err(SqlError::SqlToPlan)?; + ) -> Result { + let plan = self.statement_to_plan(query).await?; Ok(plan.schema().clone()) } - /// Converts a parsed SQL statement into a detached logical plan. - pub async fn plan_sql( + /// Plans a SQL statement into a validated [`LogicalPlan`]. + /// + /// Delegates to the session for pure DF planning, then enforces + /// consumer-level policies: forbidden underscore-prefixed aliases + /// and read-only constraints. + pub async fn statement_to_plan( &self, query: parser::Statement, - ) -> Result { - let ctx = new_session_ctx(self.session_config.clone()); - register_catalog(&ctx, &self.catalog).map_err(SqlError::RegisterTable)?; - let plan = sql_to_plan(&ctx, query) - .await - .map_err(SqlError::SqlToPlan)?; + ) -> Result { + let plan = self.session_ctx.statement_to_plan(query).await?; + + forbid_underscore_prefixed_aliases(&plan) + .map_err(|err| StatementToPlanError::ForbiddenAliases(err).into_datafusion_error())?; + read_only_check(&plan) + .map_err(|err| StatementToPlanError::ReadOnlyCheck(err).into_datafusion_error())?; + Ok(DetachedLogicalPlan::new(plan)) } /// Applies DataFusion logical optimizations to a detached plan. #[tracing::instrument(skip_all, err)] - pub async fn optimize_plan( + pub fn optimize( &self, plan: &DetachedLogicalPlan, - ) -> Result { - let ctx = new_session_ctx(self.session_config.clone()); - register_catalog(&ctx, &self.catalog).map_err(OptimizePlanError::RegisterTable)?; - ctx.state() + ) -> Result { + self.session_ctx .optimize(plan) - .map_err(OptimizePlanError::Optimize) .map(DetachedLogicalPlan::new) } } -/// Creates a bare DataFusion session context with builtin UDFs but no catalog tables. +/// Builder for [`PlanContext`]. 
/// -/// Call [`register_catalog`] on the returned context to populate it with tables and UDFs -/// from a [`LogicalCatalog`]. -fn new_session_ctx(config: SessionConfig) -> SessionContext { - let ctx = { - let state = SessionStateBuilder::new() - .with_config(config) - .with_runtime_env(Default::default()) - .with_default_features() - .build(); - SessionContext::new_with_state(state) - }; - - // Register the builtin UDFs - for udf in builtin_udfs() { - ctx.register_udf(udf); - } - - ctx +/// Composes a [`SessionStateBuilder`] internally. The exposed API covers +/// async table catalog registration (`with_table_catalog`) and async function +/// catalog registration (`with_func_catalog`). No internals of the state +/// builder are passed through directly. +pub struct PlanContextBuilder { + state_builder: SessionStateBuilder, } -/// Registers the tables and UDFs from a [`LogicalCatalog`] into a [`SessionContext`]. -fn register_catalog( - ctx: &SessionContext, - catalog: &LogicalCatalog, -) -> Result<(), RegisterTableError> { - // Register tables first to ensure schemas are created before UDF registration - for table in catalog.tables.iter() { - let schema_name = table.sql_schema_name(); - - // The catalog schema needs to be explicitly created or table creation will fail. - if ctx - .catalog(&ctx.catalog_names()[0]) - .unwrap() - .schema(schema_name) - .is_none() - { - let schema = Arc::new(MemorySchemaProvider::new()); - ctx.catalog(&ctx.catalog_names()[0]) - .unwrap() - .register_schema(schema_name, schema) - .unwrap(); +impl PlanContextBuilder { + /// Creates a new builder for configuring a [`PlanContext`]. + pub fn new(session_config: SessionConfig) -> Self { + Self { + state_builder: SessionStateBuilder::new(session_config), } - - // Register the table with a planning-only provider that exposes the schema but cannot be scanned. 
- let table_schema = table.schema().clone(); - ctx.register_table( - table.table_ref().clone(), - Arc::new(PlanTable::new(table_schema)), - ) - .map_err(RegisterTableError)?; - } - - // Register UDFs after tables to ensure any schema dependencies are resolved - for udf in catalog.udfs.iter() { - ctx.register_udf(udf.clone()); } - Ok(()) -} - -/// Failed to register a catalog table with the planning session context -/// -/// This occurs when DataFusion rejects a table registration during planning -/// session creation, typically because a table with the same name already -/// exists or the table metadata is invalid. -#[derive(Debug, thiserror::Error)] -#[error("Failed to register catalog table with planning session context")] -pub struct RegisterTableError(#[source] DataFusionError); - -/// Failed to plan a SQL query against the planning context -/// -/// This error is shared by `plan_sql` and `sql_output_schema` because they -/// produce the exact same error variants. -#[derive(Debug, thiserror::Error)] -pub enum SqlError { - /// Failed to create a planning session context + /// Registers a named async table catalog provider. /// - /// This occurs when building a `SessionContext` for SQL planning fails, - /// typically due to a table registration error during context setup. - #[error("failed to create planning session context")] - RegisterTable(#[source] RegisterTableError), + /// Referenced catalogs are resolved before SQL planning via the + /// pre-resolution pipeline in the custom session. The `name` must + /// match the `SessionConfig` default catalog for the provider to + /// be reachable — see [`SessionContextBuilder`] docs. + pub fn with_table_catalog( + mut self, + name: impl Into, + provider: Arc, + ) -> Self { + self.state_builder = self.state_builder.with_table_catalog(name, provider); + self + } - /// Failed to convert SQL to a logical plan + /// Registers a named async function catalog provider. 
/// - /// This occurs during SQL-to-logical-plan conversion, including - /// statement parsing, alias validation, and read-only enforcement. - #[error("failed to convert SQL to logical plan: {0}")] - SqlToPlan(#[source] SqlToPlanError), -} + /// Referenced qualified function catalogs are resolved before SQL planning + /// and registered as `ScalarUDF`s. The `name` must match the + /// `SessionConfig` default catalog for the provider to be reachable — + /// see [`SessionContextBuilder`] docs. + pub fn with_func_catalog( + mut self, + name: impl Into, + provider: Arc, + ) -> Self { + self.state_builder = self.state_builder.with_func_catalog(name, provider); + self + } -impl SqlError { - /// Returns `true` if this error represents an invalid plan due to user input - /// (forbidden aliases or read-only violations) rather than an internal failure. - pub fn is_invalid_plan(&self) -> bool { - matches!(self, Self::SqlToPlan(err) if err.is_invalid_plan()) + /// Builds the [`PlanContext`]. + pub fn build(self) -> PlanContext { + PlanContext { + session_ctx: SessionContext::new_with_state(self.state_builder.build()), + } } } -/// Failed to optimize a logical plan +/// Failed to convert a SQL statement into a validated logical plan. /// -/// This error covers failures during the logical optimization phase. +/// Covers all failure modes during policy-enforcing SQL planning in +/// [`PlanContext::statement_to_plan`]: alias validation and read-only +/// enforcement. Flattened to [`DataFusionError`] before leaving the method. #[derive(Debug, thiserror::Error)] -pub enum OptimizePlanError { - /// Failed to create a planning session context - /// - /// This occurs when building a `SessionContext` for optimization fails, - /// typically due to a table registration error during context setup. 
- #[error("failed to create planning session context")] - RegisterTable(#[source] RegisterTableError), +enum StatementToPlanError { + /// Query uses underscore-prefixed column aliases which are reserved + #[error("query uses forbidden underscore-prefixed aliases")] + ForbiddenAliases(#[source] DataFusionError), + + /// Query plan violates read-only constraints + #[error("query plan violates read-only constraints")] + ReadOnlyCheck(#[source] ReadOnlyCheckError), +} - /// DataFusion optimizer failed to process the plan - /// - /// Possible causes: - /// - Optimizer rule failure during logical optimization - /// - Type inference errors - /// - Schema inconsistencies - #[error("failed to optimize logical plan")] - Optimize(#[source] DataFusionError), +impl StatementToPlanError { + /// Converts into a [`DataFusionError`] with appropriate user-input tagging. + fn into_datafusion_error(self) -> DataFusionError { + match self { + Self::ForbiddenAliases(err) => DataFusionError::Plan(format!( + "query uses forbidden underscore-prefixed aliases: {err}" + )) + .context(INVALID_INPUT_CONTEXT), + Self::ReadOnlyCheck(err) => { + DataFusionError::Plan(format!("query plan violates read-only constraints: {err}")) + .context(INVALID_INPUT_CONTEXT) + } + } + } } diff --git a/crates/core/common/src/context/session.rs b/crates/core/common/src/context/session.rs new file mode 100644 index 000000000..0b399cc27 --- /dev/null +++ b/crates/core/common/src/context/session.rs @@ -0,0 +1,93 @@ +//! Session context wrapping shared session state with session metadata. +//! +//! Parallels DataFusion's `SessionContext` which wraps `Arc>` +//! with a session ID and start time. 
+
+use std::sync::Arc;
+
+use chrono::{DateTime, Utc};
+use datafusion::{error::DataFusionError, logical_expr::LogicalPlan, sql::parser};
+use parking_lot::RwLock;
+
+pub use crate::context::session_state::{SessionState, SessionStateBuilder, is_user_input_error};
+
+/// A session context that wraps shared [`SessionState`] with session metadata.
+///
+/// Parallels DataFusion's `SessionContext` structure: holds a session ID,
+/// start time, and an `Arc<RwLock<SessionState>>` for shared mutable access
+/// to the underlying session state.
+#[derive(Clone)]
+pub struct SessionContext {
+    session_id: String,
+    session_start_time: DateTime<Utc>,
+    state: Arc<RwLock<SessionState>>,
+}
+
+impl Default for SessionContext {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SessionContext {
+    /// Creates a new session context with default [`SessionState`].
+    pub fn new() -> Self {
+        Self::new_with_state(SessionState::default())
+    }
+
+    /// Creates a new session context wrapping the given [`SessionState`].
+    pub fn new_with_state(state: SessionState) -> Self {
+        Self {
+            session_id: uuid::Uuid::now_v7().to_string(),
+            session_start_time: Utc::now(),
+            state: Arc::new(RwLock::new(state)),
+        }
+    }
+
+    /// Returns the session ID.
+    pub fn session_id(&self) -> &str {
+        &self.session_id
+    }
+
+    /// Returns the session start time.
+    pub fn session_start_time(&self) -> DateTime<Utc> {
+        self.session_start_time
+    }
+
+    /// Returns a clone of the inner [`SessionState`].
+    ///
+    /// Mirrors DataFusion's `SessionContext::state() -> SessionState` pattern.
+    pub(crate) fn state(&self) -> SessionState {
+        self.state.read().clone()
+    }
+
+    /// Returns a new [`SessionState`] with async catalog pre-resolution
+    /// applied for the given SQL statement.
+    ///
+    /// Delegates to [`SessionState::resolved_state`].
+    pub(crate) async fn resolved_state(
+        &self,
+        stmt: &parser::Statement,
+    ) -> Result<SessionState, DataFusionError> {
+        let state = self.state.read().clone();
+        state.resolved_state(stmt).await
+    }
+
+    /// Plans a SQL statement into a [`LogicalPlan`].
+    ///
+    /// Delegates to [`SessionState::statement_to_plan`].
+    pub async fn statement_to_plan(
+        &self,
+        stmt: parser::Statement,
+    ) -> Result<LogicalPlan, DataFusionError> {
+        let state = self.state.read().clone();
+        state.statement_to_plan(stmt).await
+    }
+
+    /// Applies DataFusion logical optimizations to an existing plan.
+    ///
+    /// Delegates to [`SessionState::optimize`].
+    pub fn optimize(&self, plan: &LogicalPlan) -> Result<LogicalPlan, DataFusionError> {
+        self.state.read().optimize(plan)
+    }
+}
diff --git a/crates/core/common/src/context/session_state.rs b/crates/core/common/src/context/session_state.rs
new file mode 100644
index 000000000..839573cd7
--- /dev/null
+++ b/crates/core/common/src/context/session_state.rs
@@ -0,0 +1,698 @@
+//! Session state for SQL planning and execution.
+//!
+//! Provides a re-implementation of session state management that performs async
+//! catalog pre-resolution before SQL planning. Async pre-resolution extracts
+//! table and function references from the SQL statement, resolves only the
+//! referenced catalogs concurrently, and registers the results into a
+//! transient DataFusion `SessionState` before planning.
+ +use std::{collections::BTreeMap, sync::Arc}; + +use datafusion::{ + catalog::{ + AsyncCatalogProvider as TableAsyncCatalogProvider, CatalogProviderList, SchemaProvider, + }, + common::TableReference, + error::DataFusionError, + execution::{ + FunctionRegistry, SessionState as DfSessionState, + SessionStateBuilder as DfSessionStateBuilder, TaskContext, + cache::cache_manager::CacheManager, config::SessionConfig, disk_manager::DiskManager, + memory_pool::MemoryPool, object_store::ObjectStoreRegistry, runtime_env::RuntimeEnv, + }, + logical_expr::{LogicalPlan, ScalarUDF}, + physical_optimizer::PhysicalOptimizerRule, + physical_plan::ExecutionPlan as DfExecutionPlan, + physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner}, + sql::parser, +}; +use futures::future; + +use crate::{ + context::common::INVALID_INPUT_CONTEXT, + evm::udfs::{ + EvmDecodeHex, EvmDecodeLog, EvmDecodeParams, EvmDecodeType, EvmEncodeHex, EvmEncodeParams, + EvmEncodeType, EvmTopic, ShiftUnits, + }, + func_catalog::catalog_provider::AsyncCatalogProvider as FuncAsyncCatalogProvider, + sql::{FunctionReference, resolve_function_references, resolve_table_references}, +}; + +/// Session state for planning and executing SQL queries. +/// +/// Stores a base `SessionState` built once at construction (with config, +/// runtime environment, builtin UDFs, and physical optimizer rules) and +/// clones it for each planning/execution operation. SQL planning entry +/// points (`statement_to_plan`) perform async catalog +/// pre-resolution on top of the cloned base state. +#[derive(Clone)] +pub struct SessionState { + /// Base DataFusion session state built once at construction. + /// + /// Contains the session config, runtime environment, builtin UDFs, and + /// physical optimizer rules. Cloned for each planning/execution operation; + /// callers register additional catalogs or tables on the clone before use. 
+ state: DfSessionState, + + /// Named async table catalog providers consulted during SQL pre-resolution. + /// + /// Keyed by catalog name (e.g. `"amp"`). Only providers whose names match + /// catalogs referenced by the SQL statement are resolved. When both this + /// map and `func_catalogs` are empty, pre-resolution is skipped entirely + /// and the eager logical catalog path is used. + table_catalogs: BTreeMap>, + + /// Named async function catalog providers consulted during SQL pre-resolution. + /// + /// Keyed by catalog name. Only schema-qualified function references trigger + /// resolution; bare function names + /// map entirely (spec requirement 3/4). + func_catalogs: BTreeMap>, +} + +impl SessionState { + /// Returns the session ID. + /// + /// Mirrors [`DfSessionState::session_id`]. + pub fn session_id(&self) -> &str { + self.state.session_id() + } + + /// Returns the session configuration. + /// + /// Mirrors [`DfSessionState::config`]. + pub fn config(&self) -> &SessionConfig { + self.state.config() + } + + /// Returns the runtime environment. + /// + /// Mirrors [`DfSessionState::runtime_env`]. + pub fn runtime_env(&self) -> &Arc { + self.state.runtime_env() + } + + /// Returns the catalog list. + /// + /// Mirrors [`DfSessionState::catalog_list`]. + pub fn catalog_list(&self) -> &Arc { + self.state.catalog_list() + } + + /// Resolves a [`TableReference`] to its [`SchemaProvider`]. + /// + /// Mirrors [`DfSessionState::schema_for_ref`]. + pub fn schema_for_ref( + &self, + table_ref: impl Into, + ) -> Result, DataFusionError> { + self.state.schema_for_ref(table_ref) + } + + /// Registers a scalar UDF. + /// + /// Mirrors [`DfSessionState::register_udf`] via [`FunctionRegistry`]. + pub fn register_udf( + &mut self, + udf: Arc, + ) -> Result>, DataFusionError> { + self.state.register_udf(udf) + } + + /// Creates a new [`TaskContext`] for this session. + /// + /// Mirrors [`DfSessionState::task_ctx`]. 
+ pub fn task_ctx(&self) -> Arc { + self.state.task_ctx() + } + + /// Applies DataFusion logical optimizations to an existing plan. + /// + /// Does not trigger SQL pre-resolution; the plan is assumed to be fully + /// resolved already. + /// + /// Mirrors [`DfSessionState::optimize`] (synchronous). + #[tracing::instrument(skip_all, err)] + pub fn optimize(&self, plan: &LogicalPlan) -> Result { + self.state.optimize(plan) + } + + /// Returns a new `SessionState` with async catalog pre-resolution + /// applied for the given SQL statement. + /// + /// The returned state has all async catalogs resolved into the DF + /// state; its async catalog maps are empty (resolution is complete). + pub(crate) async fn resolved_state( + &self, + stmt: &parser::Statement, + ) -> Result { + let df_state = self.resolve_for_statement(stmt).await?; + Ok(SessionState { + state: df_state, + table_catalogs: Default::default(), + func_catalogs: Default::default(), + }) + } + + /// Creates a physical execution plan from a logical plan. + /// + /// Wraps DataFusion's `DefaultPhysicalPlanner` so that `DfSessionState` + /// stays internal to `SessionState`. + pub(crate) async fn create_physical_plan( + &self, + plan: &LogicalPlan, + ) -> Result, DataFusionError> { + DefaultPhysicalPlanner::default() + .create_physical_plan(plan, &self.state) + .await + } + + /// Plans a SQL statement into a [`LogicalPlan`]. + /// + /// Performs async pre-resolution of referenced dataset catalogs and + /// functions before planning. Returns the raw logical plan without + /// policy enforcement (alias validation, read-only checks). + /// + /// Consumer-level policies (forbidden aliases, read-only constraints) + /// are enforced by [`PlanContext`] and [`ExecContext`], not here. + /// + /// Aligns with DataFusion's [`SessionState::statement_to_plan`](datafusion::execution::session_state::SessionState::statement_to_plan). 
+ pub async fn statement_to_plan( + &self, + stmt: parser::Statement, + ) -> Result { + let resolved = self.resolved_state(&stmt).await?; + resolved.state.statement_to_plan(stmt).await + } + + /// Resolves async catalogs for a SQL statement and returns a transient `DfSessionState`. + /// + /// This is the SQL-only pre-resolution path used by `statement_to_plan`, + /// `statement_to_schema`, and exec SQL planning in `ExecContext`. It: + /// 1. Extracts table and function references from the SQL statement. + /// 2. Filters bare function refs (built-ins, not async-resolved). + /// 3. Resolves only the catalogs that are actually referenced. + /// 4. Registers resolved tables into the `DfSessionState`. + /// 5. Registers resolved functions as `ScalarUDF`s in deterministic order. + /// + /// Errors are flattened to `DataFusionError` at this boundary. + async fn resolve_for_statement( + &self, + stmt: &parser::Statement, + ) -> Result { + // Early-out: if no async providers are registered, skip resolution. + if self.table_catalogs.is_empty() && self.func_catalogs.is_empty() { + return Ok(self.state.clone()); + } + + // Extract table references from the SQL statement (per-query cache via + // the resolve() default impls; we collect all refs here once). + let table_refs = resolve_table_references::(stmt).map_err(|err| { + DataFusionError::Plan(format!("failed to extract table references: {err}")) + .context(INVALID_INPUT_CONTEXT) + })?; + + // Extract function references; bare functions are built-ins and bypass + // async resolution (spec requirement 3/4). + let all_func_refs = resolve_function_references::(stmt).map_err(|err| { + DataFusionError::Plan(format!("failed to extract function references: {err}")) + .context(INVALID_INPUT_CONTEXT) + })?; + let qualified_func_refs: Vec<_> = all_func_refs + .into_iter() + .filter(|r| matches!(r, FunctionReference::Qualified { .. })) + .collect(); + + // Convert to DataFusion TableReference for the resolve() API. 
+ // All our refs are Bare or Partial (catalog-qualified refs are rejected + // by resolve_table_references), so the catalog is always the default. + let df_table_refs: Vec = table_refs + .into_iter() + .map(datafusion::common::TableReference::from) + .collect(); + + // Function references use TableReference::partial(schema, function_name) + // because FuncReference is an alias for datafusion::common::TableReference. + let df_func_refs: Vec = qualified_func_refs + .into_iter() + .filter_map(|r| match r { + FunctionReference::Qualified { schema, function } => Some( + datafusion::common::TableReference::partial(schema.as_str(), function.as_str()), + ), + FunctionReference::Bare { .. } => None, + }) + .collect(); + + // All our refs use the default catalog (catalog-qualified refs are + // rejected upstream). Resolve only providers registered under names + // that appear in the references, to avoid unnecessary I/O. + // LoD exception: navigating DataFusion's config API (third-party struct). + let default_catalog = self + .state + .config() + .options() + .catalog + .default_catalog + .clone(); + let config = self.state.config().clone(); + + // Identify table catalog providers referenced in the statement. + let referenced_table_catalogs: Vec<(String, Arc)> = + if df_table_refs.is_empty() { + Vec::new() + } else { + self.table_catalogs + .iter() + .filter(|(name, _)| { + df_table_refs + .iter() + .any(|r| r.catalog().unwrap_or(&default_catalog) == name.as_str()) + }) + .map(|(name, provider)| (name.clone(), provider.clone())) + .collect() + }; + + // Identify function catalog providers referenced in the statement. 
+ let referenced_func_catalogs: Vec<(String, Arc)> = + if df_func_refs.is_empty() { + Vec::new() + } else { + self.func_catalogs + .iter() + .filter(|(name, _)| { + df_func_refs + .iter() + .any(|r| r.catalog().unwrap_or(&default_catalog) == name.as_str()) + }) + .map(|(name, provider)| (name.clone(), provider.clone())) + .collect() + }; + + // Resolve table and function catalogs concurrently. Both categories + // of futures overlap so that function resolution can start before all + // table resolves complete. + let table_futures = + referenced_table_catalogs + .into_iter() + .map(|(catalog_name, provider)| { + let config = config.clone(); + let df_table_refs = df_table_refs.clone(); + async move { + let resolved = provider + .resolve(&df_table_refs, &config, &catalog_name) + .await?; + Ok::<_, DataFusionError>((catalog_name, resolved)) + } + }); + + let func_futures = referenced_func_catalogs + .into_iter() + .map(|(catalog_name, provider)| { + let config = config.clone(); + let df_func_refs = df_func_refs.clone(); + async move { + let resolved = provider + .resolve(&df_func_refs, &config, &catalog_name) + .await?; + Ok::<_, DataFusionError>((catalog_name, resolved)) + } + }); + + let (resolved_table_catalogs, resolved_func_catalogs) = future::try_join( + future::try_join_all(table_futures), + future::try_join_all(func_futures), + ) + .await?; + + let mut state = self.state.clone(); + + // Register resolved table catalogs into the session state. + for (catalog_name, catalog_provider) in resolved_table_catalogs { + if let Some(existing) = state.catalog_list().catalog(&catalog_name) { + // Merge: add/replace individual schemas. Async-resolved schemas + // override overlapping eager registrations; non-overlapping eager + // schemas are kept. 
+ let mut schema_names = catalog_provider.schema_names(); + schema_names.sort(); + for schema_name in schema_names { + if let Some(schema) = catalog_provider.schema(&schema_name) { + existing.register_schema(&schema_name, schema)?; + } + } + } else { + state + .catalog_list() + .register_catalog(catalog_name, catalog_provider); + } + } + + // Register resolved functions as ScalarUDFs. + // Sort schema names and function names for deterministic registration + // order (allows overwrites; last write wins for duplicate names). + for (catalog_name, func_catalog) in &resolved_func_catalogs { + let mut schema_names = func_catalog.schema_names(); + schema_names.sort(); + + for schema_name in schema_names { + let Some(schema) = func_catalog.schema(&schema_name) else { + continue; + }; + + let mut function_names = schema.function_names(); + function_names.sort(); + + for function_name in function_names { + let maybe_function = schema.function(&function_name).await.map_err(|err| { + DataFusionError::Plan(format!( + "failed to load resolved function '{schema_name}.{function_name}' from catalog '{catalog_name}': {err}" + )) + })?; + if let Some(func_provider) = maybe_function { + state.register_udf(func_provider.scalar_udf())?; + } + } + } + } + + Ok(state) + } +} + +impl Default for SessionState { + fn default() -> Self { + SessionStateBuilder::new(SessionConfig::default()).build() + } +} + +/// Returns the built-in scalar UDFs registered in every session state. +fn builtin_udfs() -> Vec { + vec![ + EvmDecodeLog::new().into(), + EvmDecodeLog::new().with_deprecated_name().into(), + EvmTopic::new().into(), + EvmEncodeParams::new().into(), + EvmDecodeParams::new().into(), + EvmEncodeType::new().into(), + EvmDecodeType::new().into(), + EvmEncodeHex::new().into(), + EvmDecodeHex::new().into(), + ShiftUnits::new().into(), + ] +} + +/// Builder for [`SessionState`]. +/// +/// Requires a [`SessionConfig`] as the mandatory constructor input. 
+/// Optional runtime environment components can be supplied for exec usage; +/// plan-only usage defaults to `RuntimeEnv::default()`. +/// +/// Physical optimizer rules (needed for exec's instrumentation) are added +/// via [`with_physical_optimizer_rule`](Self::with_physical_optimizer_rule) +/// and applied in registration order. +/// +/// # Async provider name invariant +/// +/// Async table and function providers registered via +/// [`with_table_catalog`](Self::with_table_catalog) and +/// [`with_func_catalog`](Self::with_func_catalog) are matched against +/// catalog names extracted from SQL references at planning time. +/// Because `resolve_table_references` rejects catalog-qualified +/// references (`catalog.schema.table`), all extracted references +/// resolve their catalog via the `SessionConfig` default catalog +/// (`datafusion.catalog.default_catalog`). As a result, only +/// providers registered under a name that matches the default catalog +/// can ever be consulted during pre-resolution. Providers registered +/// under any other name are silently unreachable and will never +/// resolve any tables or functions — the query will degrade to a +/// late "table/function not found" planning error. +/// +/// [`build`](Self::build) emits a warning for each provider name +/// that does not match the default catalog, making the misconfiguration +/// explicit. +#[derive(Clone)] +pub struct SessionStateBuilder { + /// Session configuration folded into [`SessionState::base_state`] at build time. + session_config: SessionConfig, + /// Memory pool component for the runtime environment in [`SessionState::base_state`]. + memory_pool: Option>, + /// Disk manager component for the runtime environment in [`SessionState::base_state`]. + disk_manager: Option>, + /// Cache manager component for the runtime environment in [`SessionState::base_state`]. + cache_manager: Option>, + /// Object store registry component for the runtime environment in [`SessionState::base_state`]. 
+ object_store_registry: Option>, + /// Physical optimizer rules folded into [`SessionState::base_state`] at build time. + physical_optimizer_rules: Vec>, + /// See [`SessionState::table_catalogs`]. + table_catalogs: BTreeMap>, + /// See [`SessionState::func_catalogs`]. + func_catalogs: BTreeMap>, +} + +impl SessionStateBuilder { + /// Creates a new builder with the mandatory session configuration. + pub fn new(session_config: SessionConfig) -> Self { + Self { + session_config, + memory_pool: None, + disk_manager: None, + cache_manager: None, + object_store_registry: None, + physical_optimizer_rules: Vec::new(), + table_catalogs: Default::default(), + func_catalogs: Default::default(), + } + } + + /// Sets the memory pool for the runtime environment in [`SessionState::base_state`]. + pub fn with_memory_pool(mut self, pool: Arc) -> Self { + self.memory_pool = Some(pool); + self + } + + /// Sets the disk manager for the runtime environment in [`SessionState::base_state`]. + pub fn with_disk_manager(mut self, disk_manager: Arc) -> Self { + self.disk_manager = Some(disk_manager); + self + } + + /// Sets the cache manager for the runtime environment in [`SessionState::base_state`]. + pub fn with_cache_manager(mut self, cache_manager: Arc) -> Self { + self.cache_manager = Some(cache_manager); + self + } + + /// Sets the object store registry for the runtime environment in [`SessionState::base_state`]. + pub fn with_object_store_registry(mut self, registry: Arc) -> Self { + self.object_store_registry = Some(registry); + self + } + + /// Appends a physical optimizer rule to [`SessionState::base_state`]. + pub fn with_physical_optimizer_rule( + mut self, + rule: Arc, + ) -> Self { + self.physical_optimizer_rules.push(rule); + self + } + + /// Inserts a provider into [`SessionState::table_catalogs`]. + /// + /// The `name` must match the `SessionConfig` default catalog for the + /// provider to be reachable during SQL pre-resolution. 
See the + /// [async provider name invariant](Self#async-provider-name-invariant) + /// on the builder docs. [`build`](Self::build) warns on mismatches. + pub fn with_table_catalog( + mut self, + name: impl Into, + provider: Arc, + ) -> Self { + self.table_catalogs.insert(name.into(), provider); + self + } + + /// Inserts a provider into [`SessionState::func_catalogs`]. + /// + /// The `name` must match the `SessionConfig` default catalog for the + /// provider to be reachable during SQL pre-resolution. See the + /// [async provider name invariant](Self#async-provider-name-invariant) + /// on the builder docs. [`build`](Self::build) warns on mismatches. + pub fn with_func_catalog( + mut self, + name: impl Into, + provider: Arc, + ) -> Self { + self.func_catalogs.insert(name.into(), provider); + self + } + + /// Builds the [`SessionState`]. + /// + /// Warns for each async provider name that does not match the + /// `SessionConfig` default catalog, since such providers can never + /// be reached during SQL pre-resolution (see + /// [async provider name invariant](Self#async-provider-name-invariant)). + pub fn build(self) -> SessionState { + let runtime_env = build_runtime_env( + self.memory_pool, + self.disk_manager, + self.cache_manager, + self.object_store_registry, + ); + + // Build the base SessionState once: config + runtime + optimizer rules + builtin UDFs. + let mut builder = DfSessionStateBuilder::new() + .with_config(self.session_config) + .with_runtime_env(runtime_env) + .with_default_features(); + + for rule in &self.physical_optimizer_rules { + builder = builder.with_physical_optimizer_rule(rule.clone()); + } + + let mut base_state = builder.build(); + + // Register builtin UDFs after build to preserve DataFusion's default + // scalar functions (abs, concat, etc.). 
+ for udf in builtin_udfs() { + base_state + .register_udf(Arc::new(udf)) + .expect("builtin UDF registration should never fail"); + } + + SessionState { + state: base_state, + table_catalogs: self.table_catalogs, + func_catalogs: self.func_catalogs, + } + } +} + +/// Constructs an `Arc` from optional individual components. +/// +/// If no components are provided, returns `Arc::default()` (same as plan-only +/// usage). If any component is provided, it overrides the corresponding field +/// in a default `RuntimeEnv`, leaving the rest at their defaults. +fn build_runtime_env( + memory_pool: Option>, + disk_manager: Option>, + cache_manager: Option>, + object_store_registry: Option>, +) -> Arc { + let mut env = RuntimeEnv::default(); + if let Some(value) = memory_pool { + env.memory_pool = value; + } + if let Some(value) = disk_manager { + env.disk_manager = value; + } + if let Some(value) = cache_manager { + env.cache_manager = value; + } + if let Some(value) = object_store_registry { + env.object_store_registry = value; + } + Arc::new(env) +} + +/// Returns `true` if the error represents invalid user input. +/// +/// Walks the full [`DataFusionError::Context`] chain and returns `true` as +/// soon as the `amp::invalid_input` tag is found at any depth. This is +/// robust to future callers that add an additional `.context("…")` wrapper +/// around an already-tagged error: the tag remains detectable even when it +/// is not the outermost context layer. 
+/// +/// Used by transport layers (flight, admin-api) to map errors to appropriate +/// HTTP/gRPC status codes: +/// - `true` → `BAD_REQUEST` / `invalid_argument` +/// - `false` → `INTERNAL_SERVER_ERROR` / `internal` +pub fn is_user_input_error(err: &DataFusionError) -> bool { + let mut current = err; + loop { + match current { + DataFusionError::Context(ctx, inner) => { + if ctx == INVALID_INPUT_CONTEXT { + return true; + } + current = inner; + } + _ => return false, + } + } +} + +#[cfg(test)] +mod tests { + use datafusion::common::DataFusionError; + + use super::*; + + /// Verifies that `is_user_input_error` returns `true` when the + /// `amp::invalid_input` tag is wrapped inside an additional + /// `DataFusionError::Context` layer. + /// + /// A future caller adding `.context("outer message")` to an already-tagged + /// error must not accidentally downgrade a user-input error to an internal + /// error. The classification must survive arbitrary depths of context + /// wrapping. + #[test] + fn is_user_input_error_detects_tag_under_nested_context_wrapper() { + //* Given — tag is at depth 2 + let tagged = DataFusionError::Plan("invalid table reference".to_string()) + .context("amp::invalid_input"); + let wrapped = tagged.context("failed to convert SQL statement to logical plan"); + + //* When + let result = is_user_input_error(&wrapped); + + //* Then + assert!( + result, + "is_user_input_error should return true even when the amp::invalid_input tag is \ + wrapped by an outer context: {wrapped:?}" + ); + } + + /// Verifies that `is_user_input_error` returns `true` at any arbitrary depth + /// of context wrapping (depth 3 in this case). 
+ #[test] + fn is_user_input_error_detects_tag_under_multiple_nested_context_wrappers() { + //* Given — tag is at depth 3 + let tagged = + DataFusionError::Plan("forbidden alias".to_string()).context("amp::invalid_input"); + let wrapped_once = tagged.context("outer context 1"); + let wrapped_twice = wrapped_once.context("outer context 2"); + + //* When + let result = is_user_input_error(&wrapped_twice); + + //* Then + assert!( + result, + "is_user_input_error should return true regardless of nesting depth: {wrapped_twice:?}" + ); + } + + /// Verifies that `is_user_input_error` returns `false` when only non-tag + /// context wrappers are present (no `amp::invalid_input` in the chain). + #[test] + fn is_user_input_error_returns_false_for_untagged_nested_context() { + //* Given — multiple context wrappers but no tag + let inner = DataFusionError::Plan("provider lookup failed".to_string()); + let wrapped = inner + .context("failed to resolve catalog") + .context("planning failed"); + + //* When + let result = is_user_input_error(&wrapped); + + //* Then + assert!( + !result, + "is_user_input_error should return false when the tag is absent from all context \ + layers: {wrapped:?}" + ); + } +} diff --git a/crates/core/common/src/dataset_schema_provider.rs b/crates/core/common/src/dataset_schema_provider.rs new file mode 100644 index 000000000..8c054c824 --- /dev/null +++ b/crates/core/common/src/dataset_schema_provider.rs @@ -0,0 +1,224 @@ +//! Schema provider for a dataset. +//! +//! Provides table and function resolution from a pre-resolved dataset without +//! requiring a data store. Tables are resolved as [`PlanTable`] instances +//! that expose schema information only. 
+ +use std::{ + any::Any, + collections::{BTreeMap, BTreeSet}, + sync::Arc, +}; + +use async_trait::async_trait; +use datafusion::{ + catalog::{ + AsyncSchemaProvider as TableAsyncSchemaProvider, SchemaProvider as TableSchemaProvider, + TableProvider, + }, + error::DataFusionError, + logical_expr::ScalarUDF, +}; +use datasets_common::{dataset::Dataset, table_name::TableName}; +use datasets_derived::{dataset::Dataset as DerivedDataset, func_name::ETH_CALL_FUNCTION_NAME}; +use js_runtime::isolate_pool::IsolatePool; +use parking_lot::RwLock; + +use crate::{ + dataset_store::DatasetStore, + func_catalog::{ + function_provider::{FunctionProvider, ScalarFunctionProvider}, + schema_provider::{ + AsyncSchemaProvider as FuncAsyncSchemaProvider, SchemaProvider as FuncSchemaProvider, + }, + }, + plan_table::PlanTable, +}; + +/// Schema provider for a dataset. +/// +/// Resolves tables as [`PlanTable`] instances (schema-only, no data access) +/// and functions using the provided isolate pool. +pub struct DatasetSchemaProvider { + schema_name: String, + dataset: Arc, + dataset_store: DatasetStore, + isolate_pool: IsolatePool, + tables: RwLock>>, + functions: RwLock>>, +} + +impl DatasetSchemaProvider { + /// Creates a new provider for the given dataset, schema name, and isolate pool. 
+ pub(crate) fn new( + schema_name: String, + dataset: Arc, + dataset_store: DatasetStore, + isolate_pool: IsolatePool, + ) -> Self { + Self { + schema_name, + dataset, + dataset_store, + isolate_pool, + tables: RwLock::new(Default::default()), + functions: RwLock::new(Default::default()), + } + } +} + +#[async_trait] +impl TableSchemaProvider for DatasetSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + let mut names: BTreeSet = self.tables.read().keys().cloned().collect(); + names.extend(self.dataset.tables().iter().map(|t| t.name().to_string())); + names.into_iter().collect() + } + + async fn table(&self, name: &str) -> Result>, DataFusionError> { + // Check cache first + { + let tables = self.tables.read(); + if let Some(table) = tables.get(name) { + return Ok(Some(table.clone())); + } + } + + let table_name: TableName = name.parse().map_err(|err| { + DataFusionError::Plan(format!("Invalid table name '{}': {}", name, err)) + })?; + + // Find table in dataset + let Some(dataset_table) = self + .dataset + .tables() + .iter() + .find(|t| t.name() == &table_name) + else { + return Ok(None); + }; + + let table_schema = dataset_table.schema().clone(); + + let table_provider: Arc = Arc::new(PlanTable::new(table_schema)); + + // Cache table provider + self.tables + .write() + .insert(name.to_string(), table_provider.clone()); + + Ok(Some(table_provider)) + } + + fn table_exist(&self, name: &str) -> bool { + if self.tables.read().contains_key(name) { + return true; + } + + let Ok(table_name) = name.parse::() else { + return false; + }; + + self.dataset + .tables() + .iter() + .any(|t| t.name() == &table_name) + } +} + +#[async_trait] +impl TableAsyncSchemaProvider for DatasetSchemaProvider { + async fn table(&self, name: &str) -> Result>, DataFusionError> { + ::table(self, name).await + } +} + +#[async_trait] +impl FuncSchemaProvider for DatasetSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn 
function_names(&self) -> Vec { + let functions = self.functions.read(); + functions.keys().cloned().collect() + } + + async fn function( + &self, + name: &str, + ) -> Result>, DataFusionError> { + // Check cache first + { + let functions = self.functions.read(); + if let Some(func) = functions.get(name) { + return Ok(Some(Arc::new(ScalarFunctionProvider::from(func.clone())))); + } + } + + // Check for eth_call function + if name == ETH_CALL_FUNCTION_NAME { + let udf = self + .dataset_store + .eth_call_for_dataset(&self.schema_name, self.dataset.as_ref()) + .await + .map_err(|err| DataFusionError::External(Box::new(err)))?; + + if let Some(udf) = udf { + let udf = Arc::new(udf); + self.functions.write().insert(name.to_string(), udf.clone()); + return Ok(Some(Arc::new(ScalarFunctionProvider::from(udf)))); + } + } + + // Try to get UDF from derived dataset + let udf = self.dataset.downcast_ref::().and_then(|d| { + d.function_by_name(self.schema_name.clone(), name, self.isolate_pool.clone()) + }); + + if let Some(udf) = udf { + let udf = Arc::new(udf); + self.functions.write().insert(name.to_string(), udf.clone()); + return Ok(Some(Arc::new(ScalarFunctionProvider::from(udf)))); + } + + Ok(None) + } + + /// Returns whether the function is known **from the cache only**. + /// + /// This deliberately does not probe the dataset or the store because: + /// - `eth_call` resolution requires async I/O (`dataset_store.eth_call_for_dataset`), + /// which cannot be performed in this synchronous trait method without blocking. + /// - Derived-dataset UDF lookup (`function_by_name`) is sync but allocates a + /// full `ScalarUDF` as a side effect, which is inappropriate for an existence check. + /// + /// Callers that need authoritative existence checks should use the async + /// `function()` method instead. 
+ fn function_exist(&self, name: &str) -> bool { + let functions = self.functions.read(); + functions.contains_key(name) + } +} + +#[async_trait] +impl FuncAsyncSchemaProvider for DatasetSchemaProvider { + async fn function( + &self, + name: &str, + ) -> Result>, DataFusionError> { + ::function(self, name).await + } +} + +impl std::fmt::Debug for DatasetSchemaProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DatasetSchemaProvider") + .field("schema_name", &self.schema_name) + .finish() + } +} diff --git a/crates/core/common/src/dataset_store.rs b/crates/core/common/src/dataset_store.rs index d042dbb89..f14c79288 100644 --- a/crates/core/common/src/dataset_store.rs +++ b/crates/core/common/src/dataset_store.rs @@ -45,6 +45,12 @@ pub struct DatasetStore { dataset_cache: Arc>>>, } +impl std::fmt::Debug for DatasetStore { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DatasetStore").finish_non_exhaustive() + } +} + impl DatasetStore { /// Creates a new dataset store with in-memory caching for datasets and eth_call UDFs. pub fn new(datasets_registry: DatasetsRegistry, providers_registry: ProvidersRegistry) -> Self { diff --git a/crates/core/common/src/exec_env.rs b/crates/core/common/src/exec_env.rs index a8da9c246..e5eefbcdf 100644 --- a/crates/core/common/src/exec_env.rs +++ b/crates/core/common/src/exec_env.rs @@ -14,11 +14,15 @@ use datafusion::{ }; use js_runtime::isolate_pool::IsolatePool; -use crate::memory_pool::{MemoryPoolKind, make_memory_pool}; +use crate::{ + amp_catalog_provider::AMP_CATALOG_NAME, + dataset_store::DatasetStore, + memory_pool::{MemoryPoolKind, make_memory_pool}, +}; /// Returns the default DataFusion catalog name used across all session contexts. fn default_catalog_name() -> ScalarValue { - ScalarValue::Utf8(Some("amp".to_string())) + ScalarValue::Utf8(Some(AMP_CATALOG_NAME.to_string())) } /// Creates a [`SessionConfig`] with project-wide defaults. 
@@ -74,6 +78,9 @@ pub struct ExecEnv { /// The data store used for query execution. pub store: DataStore, + + /// The dataset store used for dataset resolution and loading. + pub dataset_store: DatasetStore, } /// Creates a ExecEnv with specified memory and cache configuration @@ -85,6 +92,7 @@ pub fn create( query_max_mem_mb: usize, spill_location: &[PathBuf], store: DataStore, + dataset_store: DatasetStore, ) -> Result { let spill_allowed = !spill_location.is_empty(); let disk_manager_mode = if spill_allowed { @@ -120,5 +128,6 @@ pub fn create( isolate_pool, query_max_mem_mb, store, + dataset_store, }) } diff --git a/crates/core/common/src/func_catalog/catalog_list_provider.rs b/crates/core/common/src/func_catalog/catalog_list_provider.rs index a691cfef0..c23a537ca 100644 --- a/crates/core/common/src/func_catalog/catalog_list_provider.rs +++ b/crates/core/common/src/func_catalog/catalog_list_provider.rs @@ -3,7 +3,7 @@ use std::{any::Any, collections::BTreeMap, fmt::Debug, sync::Arc}; use async_trait::async_trait; use parking_lot::RwLock; -use super::catalog_provider::CatalogProvider as FuncCatalogProvider; +use super::catalog_provider::AsyncCatalogProvider; /// List of named function catalogs. /// @@ -18,20 +18,20 @@ pub trait CatalogProviderList: Debug + Sync + Send { fn register_catalog( &self, name: String, - catalog: Arc, - ) -> Option>; + catalog: Arc, + ) -> Option>; /// Returns all registered catalog names. fn catalog_names(&self) -> Vec; /// Looks up a catalog by name. - async fn catalog(&self, name: &str) -> Option>; + async fn catalog(&self, name: &str) -> Option>; } /// In-memory function catalog list. 
#[derive(Debug, Default)] pub struct MemoryCatalogProviderList { - catalogs: RwLock>>, + catalogs: RwLock>>, } impl MemoryCatalogProviderList { @@ -50,8 +50,8 @@ impl CatalogProviderList for MemoryCatalogProviderList { fn register_catalog( &self, name: String, - catalog: Arc, - ) -> Option> { + catalog: Arc, + ) -> Option> { self.catalogs.write().insert(name, catalog) } @@ -59,33 +59,28 @@ impl CatalogProviderList for MemoryCatalogProviderList { self.catalogs.read().keys().cloned().collect() } - async fn catalog(&self, name: &str) -> Option> { + async fn catalog(&self, name: &str) -> Option> { self.catalogs.read().get(name).cloned() } } #[cfg(test)] mod tests { + use datafusion::error::DataFusionError; + use super::*; - use crate::func_catalog::schema_provider::SchemaProvider as FuncSchemaProvider; + use crate::func_catalog::schema_provider::AsyncSchemaProvider as FuncAsyncSchemaProvider; #[derive(Debug)] - struct DummyCatalog { - id: &'static str, - } + struct DummyCatalog; #[async_trait] - impl FuncCatalogProvider for DummyCatalog { - fn as_any(&self) -> &dyn Any { - self - } - - fn schema_names(&self) -> Vec { - Vec::new() - } - - async fn schema(&self, _name: &str) -> Option> { - None + impl AsyncCatalogProvider for DummyCatalog { + async fn schema( + &self, + _name: &str, + ) -> Result>, DataFusionError> { + Ok(None) } } @@ -93,7 +88,7 @@ mod tests { fn register_catalog_with_new_name_returns_none() { //* Given let name = "amp".to_string(); - let catalog = Arc::new(DummyCatalog { id: "first" }) as Arc; + let catalog = Arc::new(DummyCatalog); let list = MemoryCatalogProviderList::new(); @@ -112,9 +107,8 @@ mod tests { //* Given let name = "amp".to_string(); - let first_catalog = Arc::new(DummyCatalog { id: "first" }) as Arc; - let second_catalog = - Arc::new(DummyCatalog { id: "second" }) as Arc; + let first_catalog = Arc::new(DummyCatalog) as Arc; + let second_catalog = Arc::new(DummyCatalog) as Arc; let list = MemoryCatalogProviderList::new(); 
list.register_catalog(name.clone(), first_catalog); @@ -127,27 +121,19 @@ mod tests { replaced.is_some(), "should return previously registered catalog" ); - let current = list + let _current = list .catalog(&name) .await .expect("catalog should exist after replacement"); - let current = current - .as_any() - .downcast_ref::() - .expect("should downcast to DummyCatalog"); - assert_eq!( - current.id, "second", - "catalog should be the newly registered one" - ); } #[test] fn catalog_names_with_multiple_catalogs_returns_sorted_names() { //* Given let alpha_name = "alpha".to_string(); - let alpha_catalog = Arc::new(DummyCatalog { id: "a" }) as Arc; + let alpha_catalog = Arc::new(DummyCatalog) as Arc; let beta_name = "beta".to_string(); - let beta_catalog = Arc::new(DummyCatalog { id: "b" }) as Arc; + let beta_catalog = Arc::new(DummyCatalog) as Arc; let list = MemoryCatalogProviderList::new(); list.register_catalog(alpha_name, alpha_catalog); diff --git a/crates/core/common/src/func_catalog/catalog_provider.rs b/crates/core/common/src/func_catalog/catalog_provider.rs index 1d2d3a111..a807a127a 100644 --- a/crates/core/common/src/func_catalog/catalog_provider.rs +++ b/crates/core/common/src/func_catalog/catalog_provider.rs @@ -1,33 +1,40 @@ -use std::{any::Any, fmt::Debug, sync::Arc}; +use std::{any::Any, collections::HashMap, fmt::Debug, sync::Arc}; use async_trait::async_trait; -use datafusion::common::{Result, not_impl_err}; +use datafusion::{ + common::{Result, not_impl_err}, + error::DataFusionError, + execution::config::SessionConfig, +}; -use super::schema_provider::SchemaProvider as FuncSchemaProvider; +use super::schema_provider::{ + AsyncSchemaProvider as FuncAsyncSchemaProvider, FuncReference, ResolvedSchemaProviderBuilder, + SchemaProvider as FuncSchemaProvider, +}; /// Represents a catalog, comprising a number of named function schemas. /// /// Parallel to DataFusion's [`CatalogProvider`] but for functions instead of tables. 
+/// This is the cached/resolved trait used during query planning. /// /// [`CatalogProvider`]: datafusion::catalog::CatalogProvider -#[async_trait] pub trait CatalogProvider: Debug + Sync + Send { - /// Returns the catalog provider as [`Any`] - /// so that it can be downcast to a specific implementation. + /// Returns this `CatalogProvider` as [`Any`] so that it can be + /// downcast to a specific implementation. fn as_any(&self) -> &dyn Any; - /// Retrieves the list of available schema names in this catalog. + /// Returns the list of available schema names in this catalog. fn schema_names(&self) -> Vec; - /// Retrieves a specific schema from the catalog by name, provided it exists. - async fn schema(&self, name: &str) -> Option>; + /// Retrieves a specific schema from the catalog by name, if it exists. + fn schema(&self, name: &str) -> Option>; /// Adds a new schema to this catalog. /// /// If a schema of the same name existed before, it is replaced in /// the catalog and returned. /// - /// By default returns a "Not Implemented" error + /// By default returns a "Not Implemented" error. fn register_schema( &self, name: &str, @@ -47,12 +54,105 @@ pub trait CatalogProvider: Debug + Sync + Send { /// Implementations of this method should return None if schema with `name` /// does not exist. /// - /// By default returns a "Not Implemented" error + /// By default returns a "Not Implemented" error. fn deregister_schema( &self, _name: &str, _cascade: bool, ) -> Result>> { - not_impl_err!("Deregistering new schemas is not supported") + not_impl_err!("Deregistering schemas is not supported") + } +} + +/// Resolves function schemas lazily before query planning. +/// +/// Parallel to DataFusion's `AsyncCatalogProvider` but for functions instead of tables. +/// Implementors provide lazy resolution that can perform I/O; the `resolve` method +/// caches matching schemas into a [`CatalogProvider`] for use during planning. 
+#[async_trait] +pub trait AsyncCatalogProvider: Debug + Send + Sync { + /// Retrieves a specific async schema from the catalog by name, if it exists. + async fn schema( + &self, + name: &str, + ) -> Result>, DataFusionError>; + + /// Resolves the referenced functions and returns a cached [`CatalogProvider`]. + /// + /// The default implementation resolves each schema referenced and resolves + /// the individual function references within each schema. + async fn resolve( + &self, + references: &[FuncReference], + config: &SessionConfig, + catalog_name: &str, + ) -> Result, DataFusionError> { + let mut cached_schemas = HashMap::>::new(); + + for reference in references { + let ref_catalog_name = reference + .catalog() + .unwrap_or(&config.options().catalog.default_catalog); + + if ref_catalog_name != catalog_name { + continue; + } + + let schema_name = reference + .schema() + .unwrap_or(&config.options().catalog.default_schema); + + let schema = if let Some(schema) = cached_schemas.get_mut(schema_name) { + schema + } else { + let resolved_schema = self.schema(schema_name).await?; + let resolved_schema = resolved_schema.map(ResolvedSchemaProviderBuilder::new); + cached_schemas.insert(schema_name.to_string(), resolved_schema); + let Some(schema) = cached_schemas.get_mut(schema_name) else { + continue; + }; + schema + }; + + let Some(schema) = schema.as_mut() else { + continue; + }; + + schema.resolve_function(reference.table()).await?; + } + + let cached_schemas = cached_schemas + .into_iter() + .filter_map(|(key, maybe_builder)| { + maybe_builder.map(|schema_builder| (key, schema_builder.finish())) + }) + .collect::>(); + + Ok(Arc::new(ResolvedCatalogProvider { + schemas: cached_schemas, + })) + } +} + +/// A [`CatalogProvider`] backed by a pre-resolved set of schemas. +/// +/// Created by [`AsyncCatalogProvider::resolve`] to cache schemas for use +/// during query planning. 
+#[derive(Debug)] +pub struct ResolvedCatalogProvider { + schemas: HashMap>, +} + +impl CatalogProvider for ResolvedCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + self.schemas.keys().cloned().collect() + } + + fn schema(&self, name: &str) -> Option> { + self.schemas.get(name).cloned() } } diff --git a/crates/core/common/src/func_catalog/schema_provider.rs b/crates/core/common/src/func_catalog/schema_provider.rs index 8a802905f..955df68eb 100644 --- a/crates/core/common/src/func_catalog/schema_provider.rs +++ b/crates/core/common/src/func_catalog/schema_provider.rs @@ -1,16 +1,21 @@ -use std::{any::Any, fmt::Debug, sync::Arc}; +use std::{any::Any, collections::HashMap, fmt::Debug, sync::Arc}; use async_trait::async_trait; use datafusion::{ - common::{Result, exec_err}, + common::{Result, TableReference, exec_err}, error::DataFusionError, + execution::config::SessionConfig, }; use super::function_provider::FunctionProvider; +/// Type alias for function references, mirroring DataFusion's [`TableReference`]. +pub type FuncReference = TableReference; + /// Represents a schema, comprising a number of named functions. /// /// Parallel to DataFusion's [`SchemaProvider`] but for functions instead of tables. +/// This is the cached/resolved trait used during query planning. /// /// Note that [`SchemaProvider::function`] is `async` in order to simplify /// implementing providers where resolving a function requires I/O (e.g., creating @@ -62,3 +67,130 @@ pub trait SchemaProvider: Debug + Sync + Send { /// Returns true if a function exists in this schema provider, false otherwise. fn function_exist(&self, name: &str) -> bool; } + +/// Resolves functions lazily before query planning. +/// +/// Parallel to DataFusion's `AsyncSchemaProvider` but for functions instead of tables. 
+/// Implementors provide lazy resolution that can perform I/O; the `resolve` method +/// caches matching functions into a [`SchemaProvider`] for use during planning. +#[async_trait] +pub trait AsyncSchemaProvider: Send + Sync { + /// Retrieves a specific function from the schema by name, if it exists. + async fn function( + &self, + name: &str, + ) -> Result>, DataFusionError>; + + /// Resolves the referenced functions and returns a cached [`SchemaProvider`]. + /// + /// The default implementation resolves each matching function reference by + /// calling [`function`](Self::function) and caches the results. + async fn resolve( + &self, + references: &[FuncReference], + config: &SessionConfig, + catalog_name: &str, + schema_name: &str, + ) -> Result, DataFusionError> { + let mut cached_functions = HashMap::>>::new(); + + for reference in references { + let ref_catalog_name = reference + .catalog() + .unwrap_or(&config.options().catalog.default_catalog); + + if ref_catalog_name != catalog_name { + continue; + } + + let ref_schema_name = reference + .schema() + .unwrap_or(&config.options().catalog.default_schema); + + if ref_schema_name != schema_name { + continue; + } + + if !cached_functions.contains_key(reference.table()) { + let resolved = self.function(reference.table()).await?; + cached_functions.insert(reference.table().to_string(), resolved); + } + } + + let functions = cached_functions + .into_iter() + .filter_map(|(key, maybe_value)| maybe_value.map(|value| (key, value))) + .collect(); + + Ok(Arc::new(ResolvedSchemaProvider { functions })) + } +} + +/// A [`SchemaProvider`] backed by a pre-resolved set of functions. +/// +/// Created by [`AsyncSchemaProvider::resolve`] to cache functions for use +/// during query planning. 
+#[derive(Debug)] +pub struct ResolvedSchemaProvider { + functions: HashMap>, +} + +#[async_trait] +impl SchemaProvider for ResolvedSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn function_names(&self) -> Vec { + self.functions.keys().cloned().collect() + } + + async fn function( + &self, + name: &str, + ) -> Result>, DataFusionError> { + Ok(self.functions.get(name).cloned()) + } + + fn function_exist(&self, name: &str) -> bool { + self.functions.contains_key(name) + } +} + +/// Builder for [`ResolvedSchemaProvider`]. +/// +/// Used internally by [`AsyncCatalogProvider::resolve`] to resolve and cache +/// functions for a single schema before building the provider. +pub(crate) struct ResolvedSchemaProviderBuilder { + async_provider: Arc, + cached_functions: HashMap>>, +} + +impl ResolvedSchemaProviderBuilder { + /// Creates a new builder wrapping the given async schema provider. + pub(crate) fn new(async_provider: Arc) -> Self { + Self { + async_provider, + cached_functions: HashMap::new(), + } + } + + /// Resolves and caches the named function from the async provider, skipping if already cached. + pub(crate) async fn resolve_function(&mut self, name: &str) -> Result<(), DataFusionError> { + if !self.cached_functions.contains_key(name) { + let resolved = self.async_provider.function(name).await?; + self.cached_functions.insert(name.to_string(), resolved); + } + Ok(()) + } + + /// Consumes the builder and returns a [`ResolvedSchemaProvider`] containing all resolved functions. 
+ pub(crate) fn finish(self) -> Arc { + let functions = self + .cached_functions + .into_iter() + .filter_map(|(key, maybe_value)| maybe_value.map(|value| (key, value))) + .collect(); + Arc::new(ResolvedSchemaProvider { functions }) + } +} diff --git a/crates/core/common/src/lib.rs b/crates/core/common/src/lib.rs index b20fec170..83c17e5eb 100644 --- a/crates/core/common/src/lib.rs +++ b/crates/core/common/src/lib.rs @@ -5,9 +5,11 @@ use arrow::{array::FixedSizeBinaryArray, datatypes::DataType}; pub use datafusion::{arrow, parquet}; pub use datasets_common::{block_num::BlockNum, block_range::BlockRange, end_block::EndBlock}; +pub mod amp_catalog_provider; pub mod catalog; pub mod context; pub mod cursor; +pub mod dataset_schema_provider; pub mod dataset_store; pub mod datasets_derived; pub mod detached_logical_plan; @@ -21,6 +23,7 @@ pub mod physical_table; pub mod plan_table; pub mod plan_visitors; pub mod retryable; +pub mod self_schema_provider; pub mod sql; pub mod stream_helpers; pub mod streaming_query; diff --git a/crates/core/common/src/physical_table/table.rs b/crates/core/common/src/physical_table/table.rs index cedac3b05..02b762c01 100644 --- a/crates/core/common/src/physical_table/table.rs +++ b/crates/core/common/src/physical_table/table.rs @@ -22,7 +22,7 @@ use crate::{ #[derive(Debug, Clone)] pub struct PhysicalTable { /// Core storage information from data-store - pub(crate) revision: PhyTableRevision, + revision: PhyTableRevision, /// Dataset reference (namespace, name, hash). dataset_reference: HashReference, @@ -37,7 +37,7 @@ pub struct PhysicalTable { network: NetworkId, /// Data store for accessing metadata database and object storage. - pub(crate) store: DataStore, + store: DataStore, /// Table definition (schema, network, sorted_by). 
table: Table, diff --git a/crates/core/common/src/plan_visitors.rs b/crates/core/common/src/plan_visitors.rs index 7b162fc07..b00b83215 100644 --- a/crates/core/common/src/plan_visitors.rs +++ b/crates/core/common/src/plan_visitors.rs @@ -1,6 +1,5 @@ use std::{ collections::{BTreeMap, BTreeSet}, - fmt, sync::Arc, }; @@ -17,11 +16,14 @@ use datafusion::{ }, physical_plan::ExecutionPlan, prelude::{Expr, col, lit}, - sql::{TableReference, utils::UNNEST_PLACEHOLDER}, + sql::{TableReference as DFTableReference, utils::UNNEST_PLACEHOLDER}, }; use datasets_common::{block_num::RESERVED_BLOCK_NUM_COLUMN_NAME, network_id::NetworkId}; -use crate::incrementalizer::{NonIncrementalQueryError, incremental_op_kind}; +use crate::{ + incrementalizer::{NonIncrementalQueryError, incremental_op_kind}, + sql::TableReference, +}; /// Helper function to create a column reference to `_block_num` fn block_num_col() -> Expr { @@ -189,7 +191,7 @@ impl TreeNodeRewriter for BlockNumPropagator { // Both the user expression and the generated one are simple column references to `_block_num`. // But they were not equal, probably due to qualifiers. If there is only one input table, we can ignore the qualifier difference. 
let input_schema = projection.input.schema(); - let input_qualifiers: BTreeSet<&TableReference> = + let input_qualifiers: BTreeSet<&DFTableReference> = input_schema.iter().filter_map(|x| x.0).collect(); if input_qualifiers.len() <= 1 { @@ -254,7 +256,7 @@ impl TreeNodeRewriter for BlockNumPropagator { Ok(Transformed::yes(LogicalPlan::SubqueryAlias(rebuilt))) } - // These nodes do not cache schema and are not leaves, so we can leave them as-is + // These DfTableReferencenodes do not cache schema and are not leaves, so we can leave them as-is Filter(_) | Repartition(_) | Subquery(_) | Explain(_) | Analyze(_) | DescribeTable(_) | Unnest(_) => Ok(Transformed::no(node)), @@ -320,8 +322,8 @@ pub enum NonIncrementalOp { RecursiveQuery, } -impl fmt::Display for NonIncrementalOp { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl std::fmt::Display for NonIncrementalOp { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use NonIncrementalOp::*; match self { Limit => write!(f, "Limit"), @@ -358,7 +360,7 @@ pub fn is_incremental(plan: &LogicalPlan) -> Result<(), NonIncrementalQueryError pub fn extract_table_references_from_plan( plan: &LogicalPlan, -) -> Result, DataFusionError> { +) -> Result, DataFusionError> { let mut refs = BTreeSet::new(); plan.apply(|node| { @@ -380,11 +382,11 @@ pub fn extract_table_references_from_plan( #[derive(Debug, Clone)] pub struct CrossNetworkJoinInfo { /// Networks involved in the cross-network join - pub networks: BTreeSet, + pub networks: BTreeSet, } -impl fmt::Display for CrossNetworkJoinInfo { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { +impl std::fmt::Display for CrossNetworkJoinInfo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "join across multiple networks: {:?}", self.networks) } } @@ -398,10 +400,16 @@ pub fn find_cross_network_join( plan: &LogicalPlan, catalog: &crate::catalog::physical::Catalog, ) -> Result, DataFusionError> { - let 
table_to_network: BTreeMap = catalog + let table_to_network: BTreeMap = catalog .entries() .iter() - .map(|t| (t.table_ref().into(), t.physical_table().network().clone())) + .map(|t| { + let table_ref = TableReference::Partial { + schema: Arc::new(t.sql_schema_name().to_owned()), + table: Arc::new(t.physical_table().table_name().clone()), + }; + (table_ref.into(), t.physical_table().network().clone()) + }) .collect(); let reference_networks = diff --git a/crates/core/common/src/self_schema_provider.rs b/crates/core/common/src/self_schema_provider.rs new file mode 100644 index 000000000..62da84591 --- /dev/null +++ b/crates/core/common/src/self_schema_provider.rs @@ -0,0 +1,221 @@ +//! Schema provider for virtual schemas like `"self"`. +//! +//! Handles schema names that don't correspond to any dataset in the store but +//! still need to resolve tables and functions. + +use std::{any::Any, collections::BTreeMap, sync::Arc}; + +use async_trait::async_trait; +use datafusion::{ + catalog::{ + AsyncSchemaProvider as TableAsyncSchemaProvider, SchemaProvider as TableSchemaProvider, + TableProvider, + }, + error::DataFusionError, + logical_expr::{ScalarUDF, async_udf::AsyncScalarUDF}, +}; +use datasets_common::table_name::TableName; +use datasets_derived::{deps::SELF_REF_KEYWORD, func_name::FuncName, manifest::Function}; +use js_runtime::{isolate_pool::IsolatePool, js_udf::JsUdf}; +use parking_lot::RwLock; + +use crate::{ + catalog::logical::LogicalTable, + func_catalog::{ + function_provider::{FunctionProvider, ScalarFunctionProvider}, + schema_provider::{ + AsyncSchemaProvider as FuncAsyncSchemaProvider, SchemaProvider as FuncSchemaProvider, + }, + }, + plan_table::PlanTable, +}; + +/// Schema provider for virtual schemas (e.g., `"self"`) that resolve tables +/// and functions without requiring a backing dataset in the store. 
+pub struct SelfSchemaProvider { + schema_name: String, + logical_tables: Vec, + udfs: Vec, + table_cache: RwLock>>, + function_cache: RwLock>>, +} + +impl SelfSchemaProvider { + /// Creates a provider from pre-built tables and UDFs. + pub fn new(schema_name: String, tables: Vec, udfs: Vec) -> Self { + Self { + schema_name, + logical_tables: tables, + udfs, + table_cache: RwLock::new(Default::default()), + function_cache: RwLock::new(Default::default()), + } + } + + /// Returns the UDFs held by this provider. + pub fn udfs(&self) -> &[ScalarUDF] { + &self.udfs + } + + /// Creates a provider from manifest function definitions (no tables). + /// + /// Builds UDFs from all manifest functions. + pub fn from_manifest_udfs( + schema_name: String, + isolate_pool: IsolatePool, + manifest_udfs: &BTreeMap, + ) -> Self { + let udfs: Vec = manifest_udfs + .iter() + .map(|(func_name, func_def)| { + AsyncScalarUDF::new(Arc::new(JsUdf::new( + isolate_pool.clone(), + Some(SELF_REF_KEYWORD.to_string()), + func_def.source.source.clone(), + func_def.source.filename.clone().into(), + Arc::from(func_name.as_str()), + func_def + .input_types + .iter() + .map(|dt| dt.clone().into_arrow()) + .collect(), + func_def.output_type.clone().into_arrow(), + ))) + .into_scalar_udf() + }) + .collect(); + + Self::new(schema_name, vec![], udfs) + } +} + +#[async_trait] +impl TableSchemaProvider for SelfSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + self.logical_tables + .iter() + .filter(|t| t.sql_schema_name() == self.schema_name) + .map(|t| t.name().to_string()) + .collect() + } + + async fn table(&self, name: &str) -> Result>, DataFusionError> { + // Check cache first + { + let tables = self.table_cache.read(); + if let Some(table) = tables.get(name) { + return Ok(Some(table.clone())); + } + } + + let table_name: TableName = name.parse().map_err(|err| { + DataFusionError::Plan(format!("Invalid table name '{}': {}", name, err)) + })?; + + let 
Some(lt) = self + .logical_tables + .iter() + .find(|t| t.sql_schema_name() == self.schema_name && t.name() == &table_name) + else { + return Ok(None); + }; + + let table_provider: Arc = Arc::new(PlanTable::new(lt.schema().clone())); + self.table_cache + .write() + .insert(name.to_string(), table_provider.clone()); + Ok(Some(table_provider)) + } + + fn table_exist(&self, name: &str) -> bool { + if self.table_cache.read().contains_key(name) { + return true; + } + + let Ok(table_name) = name.parse::() else { + return false; + }; + + self.logical_tables + .iter() + .any(|t| t.sql_schema_name() == self.schema_name && t.name() == &table_name) + } +} + +#[async_trait] +impl TableAsyncSchemaProvider for SelfSchemaProvider { + async fn table(&self, name: &str) -> Result>, DataFusionError> { + ::table(self, name).await + } +} + +#[async_trait] +impl FuncSchemaProvider for SelfSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn function_names(&self) -> Vec { + let prefix = format!("{}.", self.schema_name); + self.udfs + .iter() + .filter_map(|u| u.name().strip_prefix(&prefix).map(String::from)) + .collect() + } + + async fn function( + &self, + name: &str, + ) -> Result>, DataFusionError> { + // Check cache first + { + let functions = self.function_cache.read(); + if let Some(func) = functions.get(name) { + return Ok(Some(Arc::new(ScalarFunctionProvider::from(func.clone())))); + } + } + + // UDF names are stored as "schema.function" (e.g., "self.addSuffix"). + // Match by checking the schema-qualified prefix. 
+ let qualified_name = format!("{}.{}", self.schema_name, name); + let Some(udf) = self.udfs.iter().find(|u| u.name() == qualified_name) else { + return Ok(None); + }; + + let udf = Arc::new(udf.clone()); + self.function_cache + .write() + .insert(name.to_string(), udf.clone()); + Ok(Some(Arc::new(ScalarFunctionProvider::from(udf)))) + } + + fn function_exist(&self, name: &str) -> bool { + if self.function_cache.read().contains_key(name) { + return true; + } + let qualified_name = format!("{}.{}", self.schema_name, name); + self.udfs.iter().any(|u| u.name() == qualified_name) + } +} + +#[async_trait] +impl FuncAsyncSchemaProvider for SelfSchemaProvider { + async fn function( + &self, + name: &str, + ) -> Result>, DataFusionError> { + ::function(self, name).await + } +} + +impl std::fmt::Debug for SelfSchemaProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("SelfSchemaProvider") + .field("schema_name", &self.schema_name) + .finish() + } +} diff --git a/crates/core/common/src/streaming_query.rs b/crates/core/common/src/streaming_query.rs index d5925564c..3421d54cb 100644 --- a/crates/core/common/src/streaming_query.rs +++ b/crates/core/common/src/streaming_query.rs @@ -13,7 +13,7 @@ use datasets_common::{ block_num::RESERVED_BLOCK_NUM_COLUMN_NAME, dataset::Dataset, hash_reference::HashReference, network_id::NetworkId, }; -use datasets_derived::dataset::Dataset as DerivedDataset; +use datasets_derived::{dataset::Dataset as DerivedDataset, deps::SELF_REF_KEYWORD}; use datasets_raw::dataset::Dataset as RawDataset; use futures::stream::{self, BoxStream, StreamExt}; use message_stream_with_block_complete::MessageStreamWithBlockComplete; @@ -30,15 +30,19 @@ use tracing::{Instrument, instrument}; use self::message_stream_with_block_complete::MessageStreamError; use crate::{ BlockNum, BlockRange, + amp_catalog_provider::{AMP_CATALOG_NAME, AmpCatalogProvider, AsyncSchemaProvider}, arrow::{ array::{RecordBatch, 
TimestampNanosecondArray}, datatypes::SchemaRef, }, catalog::{ - logical::{LogicalCatalog, LogicalTable}, + logical::LogicalTable, physical::{Catalog, CatalogTable}, }, - context::{exec::ExecContext, plan::PlanContext}, + context::{ + exec::{ExecContext, ExecContextBuilder}, + plan::PlanContextBuilder, + }, cursor::{Cursor, CursorNetworkNotFoundError, NetworkCursor, Watermark}, dataset_store::{DatasetStore, ResolveRevisionError}, detached_logical_plan::DetachedLogicalPlan, @@ -48,6 +52,8 @@ use crate::{ plan_visitors::{ find_cross_network_join, order_by_block_num, unproject_special_block_num_column, }, + self_schema_provider::SelfSchemaProvider, + sql::TableReference, sql_str::SqlStr, }; @@ -90,7 +96,7 @@ pub enum SpawnError { /// Optimization failures prevent the streaming query from starting with an /// efficient execution plan. #[error("failed to optimize query plan")] - OptimizePlan(#[source] crate::context::plan::OptimizePlanError), + OptimizePlan(#[source] DataFusionError), /// Query contains a join across tables from different blockchain networks /// @@ -332,7 +338,6 @@ impl StreamingQuery { pub async fn spawn( exec_env: ExecEnv, catalog: Catalog, - dataset_store: &DatasetStore, plan: DetachedLogicalPlan, start_block: BlockNum, end_block: Option, @@ -353,9 +358,9 @@ impl StreamingQuery { .any(|f| f.name() == RESERVED_BLOCK_NUM_COLUMN_NAME); // Prevent streaming cross-network joins (check runs before plan optimization). - if let Some(info) = find_cross_network_join(&plan, &catalog).map_err(|err| { - SpawnError::OptimizePlan(crate::context::plan::OptimizePlanError::Optimize(err)) - })? { + if let Some(info) = + find_cross_network_join(&plan, &catalog).map_err(SpawnError::OptimizePlan)? 
+ { return Err(SpawnError::CrossNetworkJoin { info }); } @@ -367,10 +372,28 @@ impl StreamingQuery { .propagate_block_num() .map_err(SpawnError::PropagateBlockNum)?; - let ctx = PlanContext::new(exec_env.session_config.clone(), catalog.logical().clone()); - ctx.optimize_plan(&plan) - .await - .map_err(SpawnError::OptimizePlan)? + // Use dep alias map from the catalog so AmpCatalogProvider + // resolves dep aliases to pinned hash references. + let dep_alias_map = catalog.dep_aliases().clone(); + + let self_schema: Arc = Arc::new(SelfSchemaProvider::new( + SELF_REF_KEYWORD.to_string(), + catalog.tables().to_vec(), + catalog.udfs().to_vec(), + )); + let amp_catalog = Arc::new( + AmpCatalogProvider::new( + exec_env.dataset_store.clone(), + exec_env.isolate_pool.clone(), + ) + .with_dep_aliases(dep_alias_map) + .with_self_schema(self_schema), + ); + let ctx = PlanContextBuilder::new(exec_env.session_config.clone()) + .with_table_catalog(AMP_CATALOG_NAME, amp_catalog.clone()) + .with_func_catalog(AMP_CATALOG_NAME, amp_catalog) + .build(); + ctx.optimize(&plan).map_err(SpawnError::OptimizePlan)? }; // Resolve the network by walking dataset dependencies to find a raw dataset, @@ -382,7 +405,7 @@ impl StreamingQuery { .collect(); let raw_dataset = - resolve_raw_dataset_from_dependencies(dataset_store, unique_refs.iter()) + resolve_raw_dataset_from_dependencies(&exec_env.dataset_store, unique_refs.iter()) .await .map_err(SpawnError::ResolveRawDataset)?; @@ -433,7 +456,8 @@ impl StreamingQuery { self.table_updates.changed().await; // The table snapshots to execute the microbatch against. 
- let ctx = ExecContext::for_catalog(self.exec_env.clone(), self.catalog.clone(), false) + let ctx = ExecContextBuilder::new(self.exec_env.clone()) + .for_catalog(self.catalog.clone(), false) .await .map_err(StreamingQueryExecutionError::CreateExecContext)?; @@ -533,10 +557,15 @@ impl StreamingQuery { table.physical_table().dataset_reference().clone(), table.physical_table().table().clone(), ); - let logical = LogicalCatalog::from_tables(std::iter::once(&resolved_table)); - Catalog::new(logical, vec![self.blocks_table.clone()]) + Catalog::new( + vec![resolved_table], + vec![], + vec![self.blocks_table.clone()], + Default::default(), + ) }; - ExecContext::for_catalog(self.exec_env.clone(), catalog, false) + ExecContextBuilder::new(self.exec_env.clone()) + .for_catalog(catalog, false) .await .map_err(NextMicrobatchRangeError::CreateExecContext)? }; @@ -701,10 +730,13 @@ impl StreamingQuery { // context for querying forked blocks let fork_ctx = { let catalog = Catalog::new( - ctx.physical_table().logical().clone(), + ctx.physical_table().tables().to_vec(), + ctx.physical_table().udfs().to_vec(), ctx.physical_table().catalog_entries(), + Default::default(), ); - ExecContext::for_catalog(ctx.env.clone(), catalog, true) + ExecContextBuilder::new(ctx.env.clone()) + .for_catalog(catalog, true) .await .map_err(ReorgBaseError::CreateExecContext)? 
}; @@ -808,7 +840,11 @@ impl StreamingQuery { .unwrap_or_default(); let sql = format!( "SELECT hash, parent_hash, timestamp FROM {} WHERE block_num = {} {} LIMIT 1", - self.blocks_table.table_ref().to_quoted_string(), + TableReference::Partial { + schema: Arc::new(self.blocks_table.sql_schema_name().to_owned()), + table: Arc::new(self.blocks_table.physical_table().table_name().clone()), + } + .to_quoted_string(), number, hash_constraint, ); @@ -818,7 +854,7 @@ impl StreamingQuery { let sql_str = SqlStr::new_unchecked(sql); let query = crate::sql::parse(&sql_str).map_err(BlocksTableFetchError::ParseSql)?; let plan = ctx - .plan_sql(query) + .statement_to_plan(query) .await .map_err(BlocksTableFetchError::PlanSql)?; let results = ctx diff --git a/crates/core/common/tests/it_session_async_resolution.rs b/crates/core/common/tests/it_session_async_resolution.rs new file mode 100644 index 000000000..ce457fb02 --- /dev/null +++ b/crates/core/common/tests/it_session_async_resolution.rs @@ -0,0 +1,991 @@ +use std::{ + any::Any, + collections::BTreeMap, + sync::{Arc, Mutex}, +}; + +use amp_data_store::{ + DataStore, PhyTableRevision, + physical_table::{PhyTableRevisionPath, PhyTableUrl}, +}; +use amp_datasets_registry::{DatasetsRegistry, manifests::DatasetManifestsStore}; +use amp_object_store::url::ObjectStoreUrl; +use amp_providers_registry::{ProviderConfigsStore, ProvidersRegistry}; +use async_trait::async_trait; +use common::{ + catalog::physical::{Catalog, CatalogTable}, + context::{ + exec::ExecContextBuilder, + session::{SessionStateBuilder, is_user_input_error}, + }, + dataset_store::DatasetStore, + exec_env::{ExecEnv, default_session_config}, + func_catalog::{ + catalog_provider::{ + AsyncCatalogProvider as FuncAsyncCatalogProvider, + CatalogProvider as FuncCatalogProvider, + }, + function_provider::{FunctionProvider, ScalarFunctionProvider}, + schema_provider::{ + AsyncSchemaProvider as FuncAsyncSchemaProvider, SchemaProvider as FuncSchemaProvider, + }, + }, + 
physical_table::PhysicalTable, + sql, + sql_str::SqlStr, +}; +use datafusion::{ + arrow::datatypes::{DataType, Field, Schema}, + catalog::{ + AsyncCatalogProvider as TableAsyncCatalogProvider, + AsyncSchemaProvider as TableAsyncSchemaProvider, CatalogProvider as TableCatalogProvider, + SchemaProvider as TableSchemaProvider, TableProvider, + }, + common::{DFSchemaRef, DataFusionError}, + datasource::empty::EmptyTable, + execution::runtime_env::RuntimeEnv, + logical_expr::{ColumnarValue, ScalarUDF, Volatility, create_udf}, +}; +use datasets_common::{ + dataset::Table as DatasetTable, hash_reference::HashReference, network_id::NetworkId, + table_name::TableName, +}; +use js_runtime::isolate_pool::IsolatePool; +use metadata_db::{config::DEFAULT_POOL_MAX_CONNECTIONS, physical_table_revision::LocationId}; +use object_store::{ObjectStore, memory::InMemory}; +use pgtemp::PgTempDB; +type RequestLog = Arc>>; +type TableCatalogFixture = (Arc, RequestLog, RequestLog); +type FuncCatalogFixture = (Arc, RequestLog, RequestLog); + +#[tokio::test] +async fn statement_to_plan_with_qualified_function_resolves_async_catalogs() { + //* Given + let (amp_table_catalog, amp_table_schema_requests, amp_table_requests) = + create_mock_table_catalog("test_schema", "blocks"); + let (unused_table_catalog, unused_table_schema_requests) = create_empty_mock_table_catalog(); + + let (amp_func_catalog, amp_func_schema_requests, amp_func_requests) = + create_mock_func_catalog("test_schema", "identity_udf"); + let (unused_func_catalog, unused_func_schema_requests) = create_empty_mock_func_catalog(); + + let session_config = default_session_config().expect("default session config should be valid"); + let session_ctx = SessionStateBuilder::new(session_config) + .with_table_catalog( + "amp", + amp_table_catalog as Arc, + ) + .with_table_catalog( + "unused", + unused_table_catalog as Arc, + ) + .with_func_catalog("amp", amp_func_catalog as Arc) + .with_func_catalog( + "unused", + unused_func_catalog as 
Arc, + ) + .build(); + + let query = indoc::indoc! {r#" + SELECT test_schema.identity_udf(value) AS projected + FROM test_schema.blocks + "#}; + let stmt = parse_statement(query); + + //* When + let plan_result = session_ctx.statement_to_plan(stmt).await; + + //* Then + assert!( + plan_result.is_ok(), + "planning should succeed with async-resolved table and function: {plan_result:?}" + ); + + let amp_table_schemas = amp_table_schema_requests + .lock() + .expect("table schema request mutex should not be poisoned") + .clone(); + assert!( + !amp_table_schemas.is_empty(), + "referenced table catalog should be resolved" + ); + assert!( + amp_table_schemas.iter().all(|name| name == "test_schema"), + "table catalog resolution should only request referenced schema" + ); + + let amp_tables = amp_table_requests + .lock() + .expect("table request mutex should not be poisoned") + .clone(); + assert!( + amp_tables.iter().any(|name| name == "blocks"), + "table resolution should load the referenced table" + ); + + let amp_func_schemas = amp_func_schema_requests + .lock() + .expect("function schema request mutex should not be poisoned") + .clone(); + assert!( + !amp_func_schemas.is_empty(), + "referenced function catalog should be resolved" + ); + assert!( + amp_func_schemas.iter().all(|name| name == "test_schema"), + "function catalog resolution should only request referenced schema" + ); + + let amp_functions = amp_func_requests + .lock() + .expect("function request mutex should not be poisoned") + .clone(); + assert!( + amp_functions.iter().any(|name| name == "identity_udf"), + "function resolution should load the referenced function" + ); + + let unused_table_schemas = unused_table_schema_requests + .lock() + .expect("unused table schema request mutex should not be poisoned") + .clone(); + assert!( + unused_table_schemas.is_empty(), + "unreferenced table catalog should not be resolved" + ); + + let unused_func_schemas = unused_func_schema_requests + .lock() + .expect("unused 
function schema request mutex should not be poisoned") + .clone(); + assert!( + unused_func_schemas.is_empty(), + "unreferenced function catalog should not be resolved" + ); +} + +#[tokio::test] +async fn statement_to_schema_with_qualified_function_resolves_async_catalogs() { + //* Given + let (amp_table_catalog, amp_table_schema_requests, amp_table_requests) = + create_mock_table_catalog("test_schema", "blocks"); + let (unused_table_catalog, unused_table_schema_requests) = create_empty_mock_table_catalog(); + + let (amp_func_catalog, amp_func_schema_requests, amp_func_requests) = + create_mock_func_catalog("test_schema", "identity_udf"); + let (unused_func_catalog, unused_func_schema_requests) = create_empty_mock_func_catalog(); + + let session_config = default_session_config().expect("default session config should be valid"); + let session_ctx = SessionStateBuilder::new(session_config) + .with_table_catalog( + "amp", + amp_table_catalog as Arc, + ) + .with_table_catalog( + "unused", + unused_table_catalog as Arc, + ) + .with_func_catalog("amp", amp_func_catalog as Arc) + .with_func_catalog( + "unused", + unused_func_catalog as Arc, + ) + .build(); + + let query = indoc::indoc! 
{r#" + SELECT test_schema.identity_udf(value) AS projected + FROM test_schema.blocks + "#}; + let stmt = parse_statement(query); + + //* When + let schema_result = session_ctx + .statement_to_plan(stmt) + .await + .map(|p| p.schema().clone()); + + //* Then + let output_schema = schema_result + .expect("schema inference should succeed with async-resolved table and function"); + assert_schema_contains_projected_field(&output_schema, "projected"); + + let amp_table_schemas = amp_table_schema_requests + .lock() + .expect("table schema request mutex should not be poisoned") + .clone(); + assert!( + !amp_table_schemas.is_empty(), + "referenced table catalog should be resolved" + ); + assert!( + amp_table_schemas.iter().all(|name| name == "test_schema"), + "table catalog resolution should only request referenced schema" + ); + + let amp_tables = amp_table_requests + .lock() + .expect("table request mutex should not be poisoned") + .clone(); + assert!( + amp_tables.iter().any(|name| name == "blocks"), + "table resolution should load the referenced table" + ); + + let amp_func_schemas = amp_func_schema_requests + .lock() + .expect("function schema request mutex should not be poisoned") + .clone(); + assert!( + !amp_func_schemas.is_empty(), + "referenced function catalog should be resolved" + ); + assert!( + amp_func_schemas.iter().all(|name| name == "test_schema"), + "function catalog resolution should only request referenced schema" + ); + + let amp_functions = amp_func_requests + .lock() + .expect("function request mutex should not be poisoned") + .clone(); + assert!( + amp_functions.iter().any(|name| name == "identity_udf"), + "function resolution should load the referenced function" + ); + + let unused_table_schemas = unused_table_schema_requests + .lock() + .expect("unused table schema request mutex should not be poisoned") + .clone(); + assert!( + unused_table_schemas.is_empty(), + "unreferenced table catalog should not be resolved" + ); + + let unused_func_schemas = 
unused_func_schema_requests + .lock() + .expect("unused function schema request mutex should not be poisoned") + .clone(); + assert!( + unused_func_schemas.is_empty(), + "unreferenced function catalog should not be resolved" + ); +} + +#[tokio::test] +async fn statement_to_plan_with_bare_function_does_not_trigger_async_function_resolution() { + //* Given + let (amp_table_catalog, _amp_table_schema_requests, _amp_table_requests) = + create_mock_table_catalog("test_schema", "blocks"); + let (amp_func_catalog, amp_func_schema_requests, amp_func_requests) = + create_mock_func_catalog("test_schema", "identity_udf"); + + let session_config = default_session_config().expect("default session config should be valid"); + let session_ctx = SessionStateBuilder::new(session_config) + .with_table_catalog( + "amp", + amp_table_catalog as Arc, + ) + .with_func_catalog("amp", amp_func_catalog as Arc) + .build(); + + let query = "SELECT abs(value) AS projected FROM test_schema.blocks"; + let stmt = parse_statement(query); + + //* When + let plan_result = session_ctx.statement_to_plan(stmt).await; + + //* Then + assert!( + plan_result.is_ok(), + "planning should succeed with built-in bare function" + ); + + let resolved_function_schemas = amp_func_schema_requests + .lock() + .expect("function schema request mutex should not be poisoned") + .clone(); + assert!( + resolved_function_schemas.is_empty(), + "bare built-in functions should not trigger async function catalog resolution" + ); + + let resolved_functions = amp_func_requests + .lock() + .expect("function request mutex should not be poisoned") + .clone(); + assert!( + resolved_functions.is_empty(), + "no async function fetch should run for bare built-in functions" + ); +} + +#[tokio::test] +async fn statement_to_plan_with_catalog_qualified_table_in_pre_resolution_is_invalid_plan_error() { + //* Given + let (amp_table_catalog, _amp_table_schema_requests) = create_empty_mock_table_catalog(); + let session_config = 
default_session_config().expect("default session config should be valid"); + let session_ctx = SessionStateBuilder::new(session_config) + .with_table_catalog( + "amp", + amp_table_catalog as Arc, + ) + .build(); + + let stmt = parse_statement("SELECT * FROM amp.test_schema.blocks"); + + //* When + let result = session_ctx.statement_to_plan(stmt).await; + + //* Then + let error = result.expect_err("catalog-qualified table should fail pre-resolution"); + assert!( + is_user_input_error(&error), + "catalog-qualified table references should be classified as invalid user input: {error:?}" + ); +} + +#[tokio::test] +async fn statement_to_schema_with_catalog_qualified_function_in_pre_resolution_is_invalid_plan_error() + { + //* Given + let (amp_func_catalog, _amp_func_schema_requests) = create_empty_mock_func_catalog(); + let session_config = default_session_config().expect("default session config should be valid"); + let session_ctx = SessionStateBuilder::new(session_config) + .with_func_catalog("amp", amp_func_catalog as Arc) + .build(); + + let stmt = parse_statement("SELECT amp.test_schema.identity_udf(1) AS projected"); + + //* When + let result = session_ctx + .statement_to_plan(stmt) + .await + .map(|p| p.schema().clone()); + + //* Then + let error = result.expect_err("catalog-qualified function should fail pre-resolution"); + assert!( + is_user_input_error(&error), + "catalog-qualified function references should be classified as invalid user input: {error:?}" + ); +} + +#[tokio::test] +async fn exec_statement_to_plan_with_qualified_function_uses_async_pre_resolution_flow() { + //* Given + let temp_db = PgTempDB::new(); + let metadata_db = metadata_db::connect_pool_with_retry( + &temp_db.connection_uri(), + DEFAULT_POOL_MAX_CONNECTIONS, + ) + .await + .expect("metadata database should connect"); + + let data_dir = tempfile::tempdir().expect("temporary data directory should be created"); + let object_store_url = 
ObjectStoreUrl::new(data_dir.path().to_string_lossy().to_string()) + .expect("object store URL should be created from temp dir"); + let data_store = + DataStore::new(metadata_db.clone(), object_store_url, 16).expect("data store should build"); + + let manifests_store = + DatasetManifestsStore::new(Arc::new(InMemory::new()) as Arc); + let datasets_registry = DatasetsRegistry::new(metadata_db.clone(), manifests_store); + let provider_configs = + ProviderConfigsStore::new(Arc::new(InMemory::new()) as Arc); + let providers_registry = ProvidersRegistry::new(provider_configs); + let dataset_store = DatasetStore::new(datasets_registry, providers_registry); + + let runtime_env: Arc = Default::default(); + let exec_env = ExecEnv { + session_config: default_session_config().expect("default session config should be valid"), + global_memory_pool: runtime_env.memory_pool.clone(), + disk_manager: runtime_env.disk_manager.clone(), + cache_manager: runtime_env.cache_manager.clone(), + object_store_registry: runtime_env.object_store_registry.clone(), + isolate_pool: IsolatePool::new(), + query_max_mem_mb: 64, + store: data_store, + dataset_store, + }; + + let (amp_table_catalog, amp_table_schema_requests, _amp_table_requests) = + create_mock_table_catalog("test_schema", "blocks"); + let (amp_func_catalog, amp_func_schema_requests, _amp_func_requests) = + create_mock_func_catalog("test_schema", "identity_udf"); + + let catalog = Catalog::default(); + + let query_ctx = ExecContextBuilder::new(exec_env) + .with_table_catalog( + "amp", + amp_table_catalog as Arc, + ) + .with_func_catalog("amp", amp_func_catalog as Arc) + .for_catalog(catalog, false) + .await + .expect("exec context should build"); + + let query = indoc::indoc! 
{r#" + SELECT test_schema.identity_udf(value) AS projected + FROM test_schema.blocks + "#}; + let stmt = parse_statement(query); + + //* When + let plan_result = query_ctx.statement_to_plan(stmt).await; + + //* Then + assert!( + plan_result.is_ok(), + "exec SQL planning should succeed through async pre-resolution flow: {plan_result:?}" + ); + + let table_schema_requests = amp_table_schema_requests + .lock() + .expect("table schema request mutex should not be poisoned") + .clone(); + assert!( + !table_schema_requests.is_empty(), + "exec path should resolve referenced async table catalog" + ); + + let function_schema_requests = amp_func_schema_requests + .lock() + .expect("function schema request mutex should not be poisoned") + .clone(); + assert!( + !function_schema_requests.is_empty(), + "exec path should resolve referenced async function catalog" + ); +} + +/// Verifies that exec SQL planning succeeds when async pre-resolution and +/// physical catalog registration reference the same table names. +/// +/// The overlap is handled by the `needs_writable_schema` check in +/// `register_catalog`. Async pre-resolution registers a +/// `ResolvedSchemaProvider` (read-only) for planning-only tables. When +/// `register_catalog` encounters an existing schema that is not a +/// `MemorySchemaProvider`, it replaces the entire schema with a fresh empty +/// `MemorySchemaProvider`, discarding the planning-only tables. Physical +/// tables then register on the new empty schema without conflict. 
+#[tokio::test] +async fn exec_statement_to_plan_with_overlapping_async_and_physical_tables_succeeds() { + //* Given + let temp_db = PgTempDB::new(); + let metadata_db = metadata_db::connect_pool_with_retry( + &temp_db.connection_uri(), + DEFAULT_POOL_MAX_CONNECTIONS, + ) + .await + .expect("metadata database should connect"); + + let data_dir = tempfile::tempdir().expect("temporary data directory should be created"); + let object_store_url = ObjectStoreUrl::new(data_dir.path().to_string_lossy().to_string()) + .expect("object store URL should be created from temp dir"); + let data_store = + DataStore::new(metadata_db.clone(), object_store_url, 16).expect("data store should build"); + + let manifests_store = + DatasetManifestsStore::new(Arc::new(InMemory::new()) as Arc); + let datasets_registry = DatasetsRegistry::new(metadata_db.clone(), manifests_store); + let provider_configs = + ProviderConfigsStore::new(Arc::new(InMemory::new()) as Arc); + let providers_registry = ProvidersRegistry::new(provider_configs); + let dataset_store = DatasetStore::new(datasets_registry, providers_registry); + + let runtime_env: Arc = Default::default(); + let exec_env = ExecEnv { + session_config: default_session_config().expect("default session config should be valid"), + global_memory_pool: runtime_env.memory_pool.clone(), + disk_manager: runtime_env.disk_manager.clone(), + cache_manager: runtime_env.cache_manager.clone(), + object_store_registry: runtime_env.object_store_registry.clone(), + isolate_pool: IsolatePool::new(), + query_max_mem_mb: 64, + store: data_store.clone(), + dataset_store, + }; + + // Create a physical table under "test_schema.blocks" — the same name + // that the async table catalog will also resolve. 
+ let table_schema = Arc::new(Schema::new(vec![Field::new( + "value", + DataType::Int64, + false, + )])); + let table_name: TableName = "blocks".parse().expect("table name should be valid"); + let network = NetworkId::new_unchecked("mainnet".to_string()); + let dataset_table = DatasetTable::new(table_name, table_schema, network, vec![]); + + let hash_ref: HashReference = + "_/test_dataset@b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9" + .parse() + .expect("hash reference should be valid"); + + let revision_path: PhyTableRevisionPath = + "test_dataset/blocks/00000000-0000-4000-8000-000000000001" + .parse() + .expect("revision path should be valid"); + let revision_url = PhyTableUrl::new(data_store.url(), &revision_path); + let location_id = LocationId::try_from(999_999_i64).expect("location ID should be valid"); + let revision = PhyTableRevision { + location_id, + path: revision_path, + url: revision_url, + }; + + let physical_table = Arc::new(PhysicalTable::from_revision( + data_store.clone(), + hash_ref, + None, + dataset_table, + revision, + )); + + // Async table catalog resolves the same "test_schema.blocks" table. + let (amp_table_catalog, _, _) = create_mock_table_catalog("test_schema", "blocks"); + + let catalog = Catalog::new( + vec![], + vec![], + vec![CatalogTable::new(physical_table, "test_schema".to_string())], + Default::default(), + ); + + // Use ignore_canonical_segments=true so that the empty revision + // produces an empty (but valid) TableSnapshot. 
+ let query_ctx = ExecContextBuilder::new(exec_env) + .with_table_catalog( + "amp", + amp_table_catalog as Arc, + ) + .for_catalog(catalog, true) + .await + .expect("exec context should build with overlapping catalogs"); + + let query = "SELECT value FROM test_schema.blocks"; + let stmt = parse_statement(query); + + //* When + let plan_result = query_ctx.statement_to_plan(stmt).await; + + //* Then + assert!( + plan_result.is_ok(), + "exec planning should succeed when async pre-resolution and physical catalog \ + register the same table name: {plan_result:?}" + ); +} + +/// Verifies that a table provider registered under a name that does not +/// match the session's default catalog is never consulted during +/// pre-resolution, causing the query to fail with a "table not found" +/// planning error. +/// +/// Misconfigured provider names silently degrade to late planning errors. +/// The builder emits a warning at build time; this test locks the runtime +/// behavior. +#[tokio::test] +async fn statement_to_plan_with_mismatched_table_provider_name_fails_with_table_not_found() { + //* Given + let (table_catalog, schema_requests, _table_requests) = + create_mock_table_catalog("test_schema", "blocks"); + + let session_config = default_session_config().expect("default session config should be valid"); + // Register the provider under "wrong_catalog" while the default catalog is "amp". 
+ let session_ctx = SessionStateBuilder::new(session_config) + .with_table_catalog( + "wrong_catalog", + table_catalog as Arc, + ) + .build(); + + let query = "SELECT value FROM test_schema.blocks"; + let stmt = parse_statement(query); + + //* When + let result = session_ctx.statement_to_plan(stmt).await; + + //* Then + assert!( + result.is_err(), + "planning should fail because the mismatched provider is never consulted" + ); + + let schemas_requested = schema_requests + .lock() + .expect("schema request mutex should not be poisoned") + .clone(); + assert!( + schemas_requested.is_empty(), + "mismatched provider should never be consulted during pre-resolution" + ); +} + +/// Verifies that a function provider registered under a name that does +/// not match the session's default catalog is never consulted, causing +/// qualified function calls to fail at planning time. +#[tokio::test] +async fn statement_to_plan_with_mismatched_func_provider_name_fails_with_function_not_found() { + //* Given + let (table_catalog, _, _) = create_mock_table_catalog("test_schema", "blocks"); + let (func_catalog, func_schema_requests, _func_requests) = + create_mock_func_catalog("test_schema", "identity_udf"); + + let session_config = default_session_config().expect("default session config should be valid"); + // Table provider matches the default catalog ("amp"), so the table resolves. + // Function provider is registered under "wrong_catalog", so the function won't resolve. 
+ let session_ctx = SessionStateBuilder::new(session_config) + .with_table_catalog("amp", table_catalog as Arc) + .with_func_catalog( + "wrong_catalog", + func_catalog as Arc, + ) + .build(); + + let query = "SELECT test_schema.identity_udf(value) AS projected FROM test_schema.blocks"; + let stmt = parse_statement(query); + + //* When + let result = session_ctx.statement_to_plan(stmt).await; + + //* Then + assert!( + result.is_err(), + "planning should fail because the mismatched function provider is never consulted" + ); + + let func_schemas_requested = func_schema_requests + .lock() + .expect("function schema request mutex should not be poisoned") + .clone(); + assert!( + func_schemas_requested.is_empty(), + "mismatched function provider should never be consulted during pre-resolution" + ); +} + +#[derive(Debug)] +struct MockTableSchemaProvider { + tables: BTreeMap>, + requested_tables: Arc>>, +} + +impl MockTableSchemaProvider { + fn new( + tables: BTreeMap>, + requested_tables: Arc>>, + ) -> Self { + Self { + tables, + requested_tables, + } + } +} + +#[async_trait] +impl TableSchemaProvider for MockTableSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + self.tables.keys().cloned().collect() + } + + async fn table(&self, name: &str) -> Result>, DataFusionError> { + self.requested_tables + .lock() + .expect("table request mutex should not be poisoned") + .push(name.to_string()); + Ok(self.tables.get(name).cloned()) + } + + fn table_exist(&self, name: &str) -> bool { + self.tables.contains_key(name) + } +} + +#[async_trait] +impl TableAsyncSchemaProvider for MockTableSchemaProvider { + async fn table(&self, name: &str) -> Result>, DataFusionError> { + ::table(self, name).await + } +} + +#[derive(Debug)] +struct MockTableCatalogProvider { + schemas: BTreeMap>, + requested_schemas: Arc>>, +} + +impl MockTableCatalogProvider { + fn new( + schemas: BTreeMap>, + requested_schemas: Arc>>, + ) -> Self { + Self { + schemas, + 
requested_schemas, + } + } +} + +impl TableCatalogProvider for MockTableCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + self.schemas.keys().cloned().collect() + } + + fn schema(&self, name: &str) -> Option> { + self.schemas + .get(name) + .cloned() + .map(|schema| schema as Arc) + } +} + +#[async_trait] +impl TableAsyncCatalogProvider for MockTableCatalogProvider { + async fn schema( + &self, + name: &str, + ) -> Result>, DataFusionError> { + self.requested_schemas + .lock() + .expect("schema request mutex should not be poisoned") + .push(name.to_string()); + Ok(self + .schemas + .get(name) + .cloned() + .map(|schema| schema as Arc)) + } +} + +#[derive(Debug)] +struct MockFuncSchemaProvider { + functions: BTreeMap>, + requested_functions: Arc>>, +} + +impl MockFuncSchemaProvider { + fn new( + functions: BTreeMap>, + requested_functions: Arc>>, + ) -> Self { + Self { + functions, + requested_functions, + } + } +} + +#[async_trait] +impl FuncSchemaProvider for MockFuncSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn function_names(&self) -> Vec { + self.functions.keys().cloned().collect() + } + + async fn function( + &self, + name: &str, + ) -> Result>, DataFusionError> { + self.requested_functions + .lock() + .expect("function request mutex should not be poisoned") + .push(name.to_string()); + Ok(self.functions.get(name).cloned()) + } + + fn function_exist(&self, name: &str) -> bool { + self.functions.contains_key(name) + } +} + +#[async_trait] +impl FuncAsyncSchemaProvider for MockFuncSchemaProvider { + async fn function( + &self, + name: &str, + ) -> Result>, DataFusionError> { + ::function(self, name).await + } +} + +#[derive(Debug)] +struct MockFuncCatalogProvider { + schemas: BTreeMap>, + requested_schemas: Arc>>, +} + +impl MockFuncCatalogProvider { + fn new( + schemas: BTreeMap>, + requested_schemas: Arc>>, + ) -> Self { + Self { + schemas, + requested_schemas, + } + } +} + +impl 
FuncCatalogProvider for MockFuncCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + self.schemas.keys().cloned().collect() + } + + fn schema(&self, name: &str) -> Option> { + self.schemas + .get(name) + .cloned() + .map(|schema| schema as Arc) + } +} + +#[async_trait] +impl FuncAsyncCatalogProvider for MockFuncCatalogProvider { + async fn schema( + &self, + name: &str, + ) -> Result>, DataFusionError> { + self.requested_schemas + .lock() + .expect("function schema request mutex should not be poisoned") + .push(name.to_string()); + Ok(self + .schemas + .get(name) + .cloned() + .map(|schema| schema as Arc)) + } +} + +fn parse_statement(sql_text: &str) -> datafusion::sql::parser::Statement { + let sql = sql_text + .parse::() + .expect("SQL should be valid SqlStr"); + sql::parse(&sql).expect("SQL should parse into a statement") +} + +fn create_identity_udf(name: &str) -> Arc { + Arc::new(create_udf( + name, + vec![DataType::Int64], + DataType::Int64, + Volatility::Immutable, + Arc::new(|args: &[ColumnarValue]| match args.first() { + Some(first) => Ok(first.clone()), + None => Err(DataFusionError::Execution( + "identity UDF expects one argument".to_string(), + )), + }), + )) +} + +fn create_mock_table_catalog(schema_name: &str, table_name: &str) -> TableCatalogFixture { + let schema_requests = Arc::new(Mutex::new(Vec::new())); + let table_requests = Arc::new(Mutex::new(Vec::new())); + + let table_schema = Arc::new(Schema::new(vec![Field::new( + "value", + DataType::Int64, + false, + )])); + let table_provider: Arc = Arc::new(EmptyTable::new(table_schema)); + + let mut tables = BTreeMap::new(); + tables.insert(table_name.to_string(), table_provider); + + let schema_provider = Arc::new(MockTableSchemaProvider::new(tables, table_requests.clone())); + let mut schemas = BTreeMap::new(); + schemas.insert(schema_name.to_string(), schema_provider); + + ( + Arc::new(MockTableCatalogProvider::new( + schemas, + 
schema_requests.clone(), + )), + schema_requests, + table_requests, + ) +} + +fn create_empty_mock_table_catalog() -> (Arc, Arc>>) { + let schema_requests = Arc::new(Mutex::new(Vec::new())); + ( + Arc::new(MockTableCatalogProvider::new( + Default::default(), + schema_requests.clone(), + )), + schema_requests, + ) +} + +fn create_mock_func_catalog(schema_name: &str, function_name: &str) -> FuncCatalogFixture { + let schema_requests = Arc::new(Mutex::new(Vec::new())); + let function_requests = Arc::new(Mutex::new(Vec::new())); + + let udf = create_identity_udf(&format!("{schema_name}.{function_name}")); + let function_provider: Arc = Arc::new(ScalarFunctionProvider::from(udf)); + + let mut functions = BTreeMap::new(); + functions.insert(function_name.to_string(), function_provider); + + let schema_provider = Arc::new(MockFuncSchemaProvider::new( + functions, + function_requests.clone(), + )); + let mut schemas = BTreeMap::new(); + schemas.insert(schema_name.to_string(), schema_provider); + + ( + Arc::new(MockFuncCatalogProvider::new( + schemas, + schema_requests.clone(), + )), + schema_requests, + function_requests, + ) +} + +fn create_empty_mock_func_catalog() -> (Arc, Arc>>) { + let schema_requests = Arc::new(Mutex::new(Vec::new())); + ( + Arc::new(MockFuncCatalogProvider::new( + Default::default(), + schema_requests.clone(), + )), + schema_requests, + ) +} + +fn assert_schema_contains_projected_field(schema: &DFSchemaRef, field_name: &str) { + assert_eq!(schema.fields().len(), 1, "expected one projected field"); + + let projected = schema.field(0); + assert_eq!( + projected.name(), + field_name, + "projected field name should match alias" + ); + assert_eq!( + projected.data_type(), + &DataType::Int64, + "projected field type should be int64" + ); +} diff --git a/crates/core/worker-datasets-derived/src/dataset.rs b/crates/core/worker-datasets-derived/src/dataset.rs index e27202129..b89064810 100644 --- a/crates/core/worker-datasets-derived/src/dataset.rs +++ 
b/crates/core/worker-datasets-derived/src/dataset.rs @@ -115,11 +115,9 @@ use amp_worker_core::{ }; use common::{ BlockNum, - catalog::{ - logical::for_dump as logical_catalog, - physical::{Catalog, EarliestBlockError, for_dump as physical_for_dump}, - }, - context::{exec::ExecContext, plan::PlanContext}, + amp_catalog_provider::{AMP_CATALOG_NAME, AmpCatalogProvider}, + catalog::physical::{Catalog, EarliestBlockError, for_dump as physical_for_dump}, + context::{exec::ExecContextBuilder, plan::PlanContextBuilder}, cursor::Cursor, dataset_store::ResolveRevisionError, detached_logical_plan::DetachedLogicalPlan, @@ -128,10 +126,8 @@ use common::{ parquet::errors::ParquetError, physical_table::{CanonicalChainError, PhysicalTable}, retryable::RetryableErrorExt as _, - sql::{ - ParseSqlError, ResolveFunctionReferencesError, ResolveTableReferencesError, - resolve_function_references, resolve_table_references, - }, + self_schema_provider::SelfSchemaProvider, + sql::{ParseSqlError, ResolveTableReferencesError, resolve_table_references}, streaming_query::{ QueryMessage, StreamingQuery, message_stream_with_block_complete::MessageStreamError, }, @@ -139,7 +135,7 @@ use common::{ use datasets_common::hash_reference::HashReference; use datasets_derived::{ Manifest as DerivedManifest, - deps::{DepAlias, DepAliasError, DepAliasOrSelfRef, DepAliasOrSelfRefError}, + deps::{DepAlias, DepAliasError}, manifest::TableInput, }; use futures::StreamExt as _; @@ -245,6 +241,7 @@ pub async fn dump( ctx.config.query_max_mem_mb, &ctx.config.spill_location, ctx.data_store.clone(), + ctx.dataset_store.clone(), ) .map_err(Error::CreateQueryEnv)?; for (table, compactor) in &tables { @@ -472,7 +469,7 @@ async fn dump_table( // Resolve all dependencies from the manifest to HashReference // This ensures SQL schema references map to the exact dataset versions // specified in the manifest's dependencies - let mut dependencies: BTreeMap = BTreeMap::new(); + let mut dependencies: BTreeMap = 
Default::default(); for (alias, dep_reference) in &manifest.dependencies { // Convert DepReference to Reference for resolution @@ -494,30 +491,45 @@ async fn dump_table( let mut join_set = tasks::FailFastJoinSet::>::new(); + let self_schema_provider = SelfSchemaProvider::from_manifest_udfs( + datasets_derived::deps::SELF_REF_KEYWORD.to_string(), + env.isolate_pool.clone(), + &manifest.functions, + ); + let catalog = { let table_refs = resolve_table_references::(&query) .map_err(DumpTableError::ResolveTableReferences)?; - let func_refs = resolve_function_references::(&query) - .map_err(DumpTableError::ResolveFunctionReferences)?; - let logical = logical_catalog::create( + physical_for_dump::create( &ctx.dataset_store, - &env.isolate_pool, + &ctx.data_store, &dependencies, - &manifest.functions, - (table_refs, func_refs), + table_refs, + self_schema_provider.udfs().to_vec(), ) .await - .map_err(DumpTableError::CreateCatalog)?; - physical_for_dump::create(&ctx.dataset_store, &ctx.data_store, logical) - .await - .map_err(DumpTableError::CreatePhysicalCatalog)? + .map_err(DumpTableError::CreatePhysicalCatalog)? }; - let planning_ctx = PlanContext::new(env.session_config.clone(), catalog.logical().clone()); + let dep_alias_map = catalog.dep_aliases().clone(); + + // Planning context: tables resolved lazily by AmpCatalogProvider with dep aliases. + // UDFs are kept in the self-schema provider for self-refs and eth_call. 
+ let self_schema: Arc = + Arc::new(self_schema_provider); + let amp_catalog = Arc::new( + AmpCatalogProvider::new(ctx.dataset_store.clone(), env.isolate_pool.clone()) + .with_dep_aliases(dep_alias_map) + .with_self_schema(self_schema), + ); + let planning_ctx = PlanContextBuilder::new(env.session_config.clone()) + .with_table_catalog(AMP_CATALOG_NAME, amp_catalog.clone()) + .with_func_catalog(AMP_CATALOG_NAME, amp_catalog) + .build(); join_set.spawn( async move { let plan = planning_ctx - .plan_sql(query.clone()) + .statement_to_plan(query.clone()) .await .map_err(DumpTableSpawnError::PlanSql)?; if let Err(err) = plan.is_incremental() { @@ -541,8 +553,9 @@ async fn dump_table( let start = dependency_earliest_block; let resolved = resolve_end_block(&end, start, async { - let query_ctx = - ExecContext::for_catalog(env.clone(), catalog.clone(), false).await?; + let query_ctx = ExecContextBuilder::new(env.clone()) + .for_catalog(catalog.clone(), false) + .await?; let max_end_blocks = query_ctx .max_end_blocks(&plan.clone().attach_to(&query_ctx)?) .await?; @@ -689,20 +702,6 @@ pub enum DumpTableError { #[error("failed to resolve table references: {0}")] ResolveTableReferences(#[source] ResolveTableReferencesError), - /// Failed to resolve function references from the SQL query - /// - /// This occurs when extracting and resolving function references from the - /// parsed SQL query fails. - #[error("failed to resolve function references: {0}")] - ResolveFunctionReferences(#[source] ResolveFunctionReferencesError), - - /// Failed to create the logical catalog for query execution - /// - /// This occurs when building the logical catalog from the resolved - /// table and function references fails. 
- #[error("failed to create catalog: {0}")] - CreateCatalog(#[source] logical_catalog::CreateCatalogError), - /// Failed to create the physical catalog for query execution /// /// This occurs when building the physical catalog from the logical @@ -727,8 +726,6 @@ impl RetryableErrorExt for DumpTableError { Self::ParseSql(_) => false, Self::DependencyNotFound { .. } => false, Self::ResolveTableReferences(_) => false, - Self::ResolveFunctionReferences(_) => false, - Self::CreateCatalog(_) => false, Self::CreatePhysicalCatalog(_) => false, // Transient DB failure — recoverable @@ -750,7 +747,7 @@ pub enum DumpTableSpawnError { /// This occurs when DataFusion cannot create an execution plan /// from the parsed SQL query. #[error("failed to plan SQL query: {0}")] - PlanSql(#[source] common::context::plan::SqlError), + PlanSql(#[source] datafusion::error::DataFusionError), /// The query is not incremental and cannot be synced /// @@ -840,7 +837,6 @@ async fn dump_sql_query( StreamingQuery::spawn( env.clone(), catalog.clone(), - &ctx.dataset_store, query, start, end, diff --git a/crates/core/worker-datasets-raw/src/dataset.rs b/crates/core/worker-datasets-raw/src/dataset.rs index fc7ff9c36..944f1078a 100644 --- a/crates/core/worker-datasets-raw/src/dataset.rs +++ b/crates/core/worker-datasets-raw/src/dataset.rs @@ -106,7 +106,7 @@ use amp_worker_core::{ use common::{ BlockNum, catalog::{ - logical::{LogicalCatalog, LogicalTable}, + logical::LogicalTable, physical::{Catalog, CatalogTable}, }, parquet::errors::ParquetError, @@ -219,13 +219,14 @@ pub async fn dump( ) }) .collect(); - let logical = LogicalCatalog::from_tables(resolved_tables.iter()); let catalog = Catalog::new( - logical, + resolved_tables, + vec![], tables .iter() .map(|(t, _)| CatalogTable::new(Arc::clone(t), sql_schema_name.clone())) .collect(), + Default::default(), ); // Ensure consistency before starting the dump procedure. 
@@ -1075,7 +1076,7 @@ fn spawn_freshness_tracker( let mut subscriptions: BTreeMap< LocationId, (Arc, tokio::sync::watch::Receiver<()>), - > = BTreeMap::new(); + > = Default::default(); for table in catalog.physical_tables() { let location_id = table.location_id(); let receiver = multiplexer.subscribe(location_id).await; diff --git a/crates/services/admin-api/src/handlers/common.rs b/crates/services/admin-api/src/handlers/common.rs index c1053b31d..4b7ef19ed 100644 --- a/crates/services/admin-api/src/handlers/common.rs +++ b/crates/services/admin-api/src/handlers/common.rs @@ -1,18 +1,16 @@ //! Common utilities for HTTP handlers -use std::collections::BTreeMap; +use std::{collections::BTreeMap, sync::Arc}; use amp_data_store::{DataStore, PhyTableRevision}; use amp_datasets_registry::error::ResolveRevisionError; use common::{ - catalog::logical::for_manifest_validation::{ - self as catalog, CreateLogicalCatalogError, ResolveTablesError, ResolveUdfsError, - TableReferencesMap, - }, - context::plan::{PlanContext, SqlError as PlanSqlError}, + amp_catalog_provider::{AMP_CATALOG_NAME, AmpCatalogProvider}, + context::plan::PlanContextBuilder, dataset_store::{DatasetStore, GetDatasetError}, exec_env::default_session_config, metadata::{AmpMetadataFromParquetError, amp_metadata_from_parquet_file}, + self_schema_provider::SelfSchemaProvider, sql::{ FunctionReference, ResolveFunctionReferencesError, ResolveTableReferencesError, TableReference, resolve_function_references, resolve_table_references, @@ -28,6 +26,15 @@ use datasets_derived::{ use futures::{StreamExt as _, stream}; use js_runtime::isolate_pool::IsolatePool; +/// Map of table names to their SQL references (table refs and function refs) using dependency aliases or self-references. 
+type TableReferencesMap = BTreeMap< + TableName, + ( + Vec>, + Vec>, + ), +>; + /// A string wrapper that ensures the value is not empty or whitespace-only /// /// This invariant-holding _new-type_ validates that strings contain at least one non-whitespace character. @@ -232,7 +239,7 @@ pub async fn validate_derived_manifest( ) -> Result<(), ManifestValidationError> { // Step 1: Resolve all dependencies to HashReference // This must happen first to ensure all dependencies exist before parsing SQL - let mut dependencies: BTreeMap = BTreeMap::new(); + let mut dependencies: BTreeMap = Default::default(); for (alias, dep_reference) in &manifest.dependencies { // Convert DepReference to Reference for resolution @@ -258,8 +265,8 @@ pub async fn validate_derived_manifest( // Step 2: Parse all SQL queries and extract references // Store parsed statements to avoid re-parsing in Step 4 - let mut statements: BTreeMap = BTreeMap::new(); - let mut references: TableReferencesMap = BTreeMap::new(); + let mut statements: BTreeMap = Default::default(); + let mut references: TableReferencesMap = Default::default(); for (table_name, table) in &manifest.tables { let TableInput::View(View { sql }) = &table.input; @@ -334,41 +341,25 @@ pub async fn validate_derived_manifest( // - Schema compatibility across dependencies let session_config = default_session_config().map_err(ManifestValidationError::SessionConfig)?; - let planning_ctx = catalog::create( - store, - IsolatePool::dummy(), // For manifest validation only (no JS execution) - dependencies, - manifest.functions.clone(), - references, - ) - .await - .map(|catalog| PlanContext::new(session_config, catalog)) - .map_err(|err| match &err { - CreateLogicalCatalogError::ResolveTables(resolve_error) => match resolve_error { - ResolveTablesError::UnqualifiedTable { .. } => { - ManifestValidationError::UnqualifiedTable(err) - } - ResolveTablesError::GetDataset { .. 
} => ManifestValidationError::GetDataset(err), - ResolveTablesError::TableNotFoundInDataset { .. } => { - ManifestValidationError::TableNotFoundInDataset(err) - } - }, - CreateLogicalCatalogError::ResolveUdfs(resolve_error) => match resolve_error { - ResolveUdfsError::GetDataset { .. } => ManifestValidationError::GetDataset(err), - ResolveUdfsError::EthCallUdfCreation { .. } => { - ManifestValidationError::EthCallUdfCreation(err) - } - ResolveUdfsError::EthCallNotAvailable { .. } => { - ManifestValidationError::EthCallNotAvailable(err) - } - ResolveUdfsError::FunctionNotFoundInDataset { .. } => { - ManifestValidationError::FunctionNotFoundInDataset(err) - } - ResolveUdfsError::SelfReferencedFunctionNotFound { .. } => { - ManifestValidationError::FunctionNotFoundInDataset(err) - } - }, - })?; + let dep_aliases: BTreeMap = dependencies + .iter() + .map(|(alias, hash_ref)| (alias.to_string(), hash_ref.clone())) + .collect(); + let self_schema: Arc = + Arc::new(SelfSchemaProvider::from_manifest_udfs( + datasets_derived::deps::SELF_REF_KEYWORD.to_string(), + IsolatePool::dummy(), + &manifest.functions, + )); + let amp_catalog = Arc::new( + AmpCatalogProvider::new(store.clone(), IsolatePool::dummy()) + .with_dep_aliases(dep_aliases) + .with_self_schema(self_schema), + ); + let planning_ctx = PlanContextBuilder::new(session_config) + .with_table_catalog(AMP_CATALOG_NAME, amp_catalog.clone()) + .with_func_catalog(AMP_CATALOG_NAME, amp_catalog) + .build(); // Step 4: Validate that all table SQL queries are incremental. // Incremental processing is required for derived datasets to efficiently update @@ -376,7 +367,7 @@ pub async fn validate_derived_manifest( // Use cached parsed statements from Step 2 to avoid re-parsing. 
for (table_name, stmt) in statements { // Plan the SQL query to a logical plan - let plan = planning_ctx.plan_sql(stmt).await.map_err(|err| { + let plan = planning_ctx.statement_to_plan(stmt).await.map_err(|err| { ManifestValidationError::SqlPlanningError { table_name: table_name.clone(), source: err, @@ -482,13 +473,6 @@ pub enum ManifestValidationError { source: ResolveTableReferencesError, }, - /// Unqualified table reference - /// - /// All tables must be qualified with a dataset reference in the schema portion. - /// Unqualified tables (e.g., just `table_name`) are not allowed. - #[error("Unqualified table reference: {0}")] - UnqualifiedTable(#[source] CreateLogicalCatalogError), - /// Invalid table name /// /// Table name does not conform to SQL identifier rules (must start with letter/underscore, @@ -496,39 +480,6 @@ pub enum ManifestValidationError { #[error("Invalid table name in SQL query: {0}")] InvalidTableName(#[source] ResolveTableReferencesError), - /// Failed to retrieve dataset from store - /// - /// This occurs when loading a dataset definition fails due to: - /// - Invalid or corrupted manifest - /// - Unsupported dataset kind - /// - Storage backend errors - #[error("Failed to retrieve dataset from store: {0}")] - GetDataset(#[source] CreateLogicalCatalogError), - - /// Failed to create ETH call UDF - /// - /// This occurs when creating the eth_call user-defined function fails. - #[error("Failed to create ETH call UDF: {0}")] - EthCallUdfCreation(#[source] CreateLogicalCatalogError), - - /// Table not found in dataset - /// - /// The referenced table does not exist in the dataset. - #[error("Table not found in dataset: {0}")] - TableNotFoundInDataset(#[source] CreateLogicalCatalogError), - - /// Function not found in dataset - /// - /// The referenced function does not exist in the dataset. 
- #[error("Function not found in dataset: {0}")] - FunctionNotFoundInDataset(#[source] CreateLogicalCatalogError), - - /// eth_call function not available - /// - /// The eth_call function is not available for the referenced dataset. - #[error("eth_call function not available: {0}")] - EthCallNotAvailable(#[source] CreateLogicalCatalogError), - /// Dependency alias not found /// /// A table or function reference uses an alias that was not provided in the dependencies map. @@ -587,7 +538,7 @@ pub enum ManifestValidationError { /// The table whose SQL query failed to plan table_name: TableName, #[source] - source: PlanSqlError, + source: datafusion::error::DataFusionError, }, /// Failed to create DataFusion session configuration diff --git a/crates/services/admin-api/src/handlers/schema.rs b/crates/services/admin-api/src/handlers/schema.rs index c744ab06b..5c7653d2c 100644 --- a/crates/services/admin-api/src/handlers/schema.rs +++ b/crates/services/admin-api/src/handlers/schema.rs @@ -1,4 +1,4 @@ -use std::collections::BTreeMap; +use std::{collections::BTreeMap, sync::Arc}; use axum::{ Json, @@ -6,19 +6,12 @@ use axum::{ http::StatusCode, }; use common::{ - catalog::logical::for_admin_api::{ - self as catalog, CreateLogicalCatalogError, ResolveTablesError, ResolveUdfsError, - TableReferencesMap, - }, - context::plan::{PlanContext, SqlError as PlanSqlError}, - dataset_store::GetDatasetError, + amp_catalog_provider::{AMP_CATALOG_NAME, AmpCatalogProvider}, + context::plan::{PlanContextBuilder, is_user_input_error}, exec_env::default_session_config, incrementalizer::NonIncrementalQueryError, plan_visitors::prepend_special_block_num_field, - sql::{ - FunctionReference, ResolveFunctionReferencesError, ResolveTableReferencesError, - TableReference, resolve_function_references, resolve_table_references, - }, + self_schema_provider::SelfSchemaProvider, sql_str::SqlStr, }; use datafusion::sql::parser::Statement; @@ -26,10 +19,7 @@ use datasets_common::{ 
hash_reference::HashReference, network_id::NetworkId, table_name::TableName, }; use datasets_derived::{ - deps::{ - DepAlias, DepAliasError, DepAliasOrSelfRef, DepAliasOrSelfRefError, DepReference, - HashOrVersion, - }, + deps::{DepAlias, DepReference, HashOrVersion}, func_name::FuncName, manifest::{Function, TableSchema}, }; @@ -141,14 +131,14 @@ pub async fn handler( // 2. Checking if the provided function names match what's used in SQL // 3. Warning or erroring if functions are defined but never used return Ok(Json(SchemaResponse { - schemas: BTreeMap::new(), + schemas: Default::default(), })); } // Resolve all dependencies to their manifest hashes // This must happen before parsing SQL to ensure all dependencies exist let dependencies = { - let mut resolved: BTreeMap = BTreeMap::new(); + let mut resolved: BTreeMap = Default::default(); for (alias, dep_ref) in dependencies { let (fqn, hash_or_version) = dep_ref.clone().into_fqn_and_hash_or_version(); @@ -199,9 +189,8 @@ pub async fn handler( }; // Parse all SQL queries from tables and extract table references and function names - let (statements, references) = { - let mut statements: BTreeMap = BTreeMap::new(); - let mut references = TableReferencesMap::new(); + let statements = { + let mut statements: BTreeMap = Default::default(); for (table_name, sql_query) in tables { let stmt = common::sql::parse(&sql_query).map_err(|err| Error::InvalidTableSql { @@ -209,137 +198,46 @@ pub async fn handler( source: err, })?; - // Extract table references from the statement - let table_refs = - resolve_table_references::(&stmt).map_err(|err| match &err { - ResolveTableReferencesError::InvalidTableName { .. } => { - Error::InvalidTableName(err) - } - ResolveTableReferencesError::CatalogQualifiedTable { .. } => { - Error::CatalogQualifiedTable { - table_name: table_name.clone(), - source: err, - } - } - ResolveTableReferencesError::InvalidSchemaFormat { .. 
} => { - Error::InvalidDependencyAliasForTableRef { - table_name: table_name.clone(), - source: err, - } - } - _ => Error::TableReferenceResolution { - table_name: table_name.clone(), - source: err, - }, - })?; - - // Validate dependency aliases in table references before catalog creation - for table_ref in &table_refs { - if let TableReference::Partial { schema, .. } = table_ref - && !dependencies.contains_key(schema.as_ref()) - { - return Err(Error::DependencyAliasNotFound { - table_name: table_name.clone(), - alias: schema.to_string(), - } - .into()); - } - } - - // Extract function references from the statement (supports both external deps and self-references) - let func_refs = resolve_function_references::(&stmt).map_err( - |err| match &err { - ResolveFunctionReferencesError::InvalidSchemaFormat { .. } => { - Error::InvalidDependencyAliasForFunctionRef { - table_name: table_name.clone(), - source: err, - } - } - ResolveFunctionReferencesError::CatalogQualifiedFunction { .. } => { - Error::CatalogQualifiedFunction { - table_name: table_name.clone(), - source: err, - } - } - _ => Error::FunctionReferenceResolution { - table_name: table_name.clone(), - source: err, - }, - }, - )?; - - // Validate dependency aliases in function references before catalog creation - for func_ref in &func_refs { - if let FunctionReference::Qualified { schema, .. 
} = func_ref - && let DepAliasOrSelfRef::DepAlias(dep_alias) = schema.as_ref() - && !dependencies.contains_key(dep_alias) - { - return Err(Error::DependencyAliasNotFound { - table_name: table_name.clone(), - alias: dep_alias.to_string(), - } - .into()); - } - } - - statements.insert(table_name.clone(), stmt); - references.insert(table_name, (table_refs, func_refs)); + statements.insert(table_name, stmt); } - (statements, references) + statements }; - // Create logical catalog using resolved dependencies - let catalog = catalog::create( - &ctx.dataset_store, - IsolatePool::dummy(), // For schema validation only (no JS execution) - dependencies, - functions, - references, - ) - .await - .map_err(|err| match &err { - CreateLogicalCatalogError::ResolveTables(inner) => match inner { - ResolveTablesError::UnqualifiedTable { .. } => Error::UnqualifiedTable(err), - ResolveTablesError::GetDataset { - source: GetDatasetError::DatasetNotFound(_), - .. - } => Error::DatasetNotFound(err), - ResolveTablesError::GetDataset { .. } => Error::GetDataset(err), - ResolveTablesError::TableNotFoundInDataset { .. } => Error::TableNotFoundInDataset(err), - }, - CreateLogicalCatalogError::ResolveUdfs(inner) => match inner { - ResolveUdfsError::GetDataset { - source: GetDatasetError::DatasetNotFound(_), - .. - } => Error::DatasetNotFound(err), - ResolveUdfsError::GetDataset { .. } => Error::GetDataset(err), - ResolveUdfsError::EthCallUdfCreation { .. } => Error::EthCallUdfCreation(err), - ResolveUdfsError::EthCallNotAvailable { .. } => Error::EthCallNotAvailable(err), - ResolveUdfsError::FunctionNotFoundInDataset { .. } => { - Error::FunctionNotFoundInDataset(err) - } - ResolveUdfsError::SelfReferencedFunctionNotFound { .. 
} => { - Error::FunctionNotFoundInDataset(err) - } - }, - })?; + // Build dep_aliases for AmpCatalogProvider before dependencies is consumed + let dep_aliases: BTreeMap = dependencies + .iter() + .map(|(alias, hash_ref)| (alias.to_string(), hash_ref.clone())) + .collect(); - // Create planning context from catalog + // Create planning context with self-schema provider let session_config = default_session_config().map_err(Error::SessionConfig)?; - let planning_ctx = PlanContext::new(session_config, catalog); + let self_schema: Arc = + Arc::new(SelfSchemaProvider::from_manifest_udfs( + datasets_derived::deps::SELF_REF_KEYWORD.to_string(), + IsolatePool::dummy(), + &functions, + )); + let amp_catalog = Arc::new( + AmpCatalogProvider::new(ctx.dataset_store.clone(), IsolatePool::dummy()) + .with_dep_aliases(dep_aliases.clone()) + .with_self_schema(self_schema), + ); + let planning_ctx = PlanContextBuilder::new(session_config) + .with_table_catalog(AMP_CATALOG_NAME, amp_catalog.clone()) + .with_func_catalog(AMP_CATALOG_NAME, amp_catalog) + .build(); // Infer schema for each table and extract networks let mut schemas = BTreeMap::new(); for (table_name, stmt) in statements { - let plan = - planning_ctx - .plan_sql(stmt.clone()) - .await - .map_err(|err| Error::SchemaInference { - table_name: table_name.clone(), - source: err, - })?; + let plan = planning_ctx + .statement_to_plan(stmt.clone()) + .await + .map_err(|err| Error::SchemaPlanInference { + table_name: table_name.clone(), + source: err, + })?; // Return error if query is non-incremental plan.is_incremental() .map_err(|err| Error::NonIncrementalQuery { @@ -347,14 +245,7 @@ pub async fn handler( source: err, })?; // Infer schema using the planning context - let schema = - planning_ctx - .sql_output_schema(stmt) - .await - .map_err(|err| Error::SchemaInference { - table_name: table_name.clone(), - source: err, - })?; + let schema = plan.schema(); // Prepend the special block number field let schema = 
prepend_special_block_num_field(&schema); @@ -475,35 +366,6 @@ enum Error { source: NonIncrementalQueryError, }, - /// Failed to resolve table references in SQL query - /// - /// This occurs when: - /// - Table references contain invalid identifiers - /// - Table references have unsupported format (not 1-3 parts) - /// - Table names don't conform to identifier rules - #[error("Failed to resolve table references for table '{table_name}': {source}")] - TableReferenceResolution { - /// The table name that contains the invalid references - table_name: TableName, - /// The underlying resolution error - #[source] - source: ResolveTableReferencesError, - }, - - /// Failed to resolve function references from SQL query - /// - /// This occurs when: - /// - Function references cannot be extracted from the parsed SQL statement - /// - Unsupported DML statements are encountered - #[error("Failed to resolve function references for table '{table_name}': {source}")] - FunctionReferenceResolution { - /// The table name that contains the invalid functions - table_name: TableName, - /// The underlying extraction error - #[source] - source: ResolveFunctionReferencesError, - }, - /// Dependency not found in dataset store /// /// This occurs when: @@ -547,149 +409,21 @@ enum Error { source: amp_datasets_registry::error::ResolveRevisionError, }, - /// Catalog-qualified table reference not supported - /// - /// Only dataset-qualified tables are supported (e.g., `dataset.table`). - /// Catalog-qualified tables (e.g., `catalog.schema.table`) are not supported. - /// - /// This error is detected during table reference resolution from SQL. 
- #[error("Catalog-qualified table in '{table_name}': {source}")] - CatalogQualifiedTable { - /// The table name that contains the catalog-qualified reference - table_name: TableName, - /// The underlying resolution error - #[source] - source: ResolveTableReferencesError, - }, - - /// Unqualified table reference - /// - /// All tables must be qualified with a dataset reference in the schema portion. - /// Unqualified tables (e.g., just `table_name`) are not allowed. - #[error(transparent)] - UnqualifiedTable(CreateLogicalCatalogError), - - /// Invalid table name - /// - /// Table name does not conform to SQL identifier rules (must start with letter/underscore, - /// contain only alphanumeric/underscore/dollar, and be <= 63 bytes). - #[error("Invalid table name in SQL query: {0}")] - InvalidTableName(#[source] ResolveTableReferencesError), - - /// Dataset reference not found - /// - /// The referenced dataset does not exist in the store. - #[error(transparent)] - DatasetNotFound(CreateLogicalCatalogError), - - /// Failed to retrieve dataset from store - /// - /// This occurs when loading a dataset definition fails due to: - /// - Invalid or corrupted manifest - /// - Unsupported dataset kind - /// - Storage backend errors - #[error(transparent)] - GetDataset(CreateLogicalCatalogError), - - /// Failed to create ETH call UDF - /// - /// This occurs when creating the eth_call user-defined function fails. - #[error(transparent)] - EthCallUdfCreation(CreateLogicalCatalogError), - - /// Table not found in dataset - /// - /// The referenced table does not exist in the dataset. - #[error(transparent)] - TableNotFoundInDataset(CreateLogicalCatalogError), - - /// Function not found in dataset - /// - /// The referenced function does not exist in the dataset. - #[error(transparent)] - FunctionNotFoundInDataset(CreateLogicalCatalogError), - - /// eth_call function not available - /// - /// The eth_call function is not available for the referenced dataset. 
- #[error(transparent)] - EthCallNotAvailable(CreateLogicalCatalogError), - - /// Invalid dependency alias in table reference - /// - /// The dependency alias in a table reference does not conform to alias rules - /// (must start with letter, contain only alphanumeric/underscore, and be <= 63 bytes). - /// - /// This error is detected during table reference resolution from SQL. - #[error("Invalid dependency alias in table reference in '{table_name}': {source}")] - InvalidDependencyAliasForTableRef { - /// The table name that contains the invalid alias - table_name: TableName, - /// The underlying resolution error - #[source] - source: ResolveTableReferencesError, - }, - - /// Invalid dependency alias in function reference - /// - /// The dependency alias in a function reference does not conform to alias rules - /// (must start with letter, contain only alphanumeric/underscore, and be <= 63 bytes). - /// - /// This error is detected during function reference resolution from SQL. - #[error("Invalid dependency alias in function reference in '{table_name}': {source}")] - InvalidDependencyAliasForFunctionRef { - /// The table name that contains the invalid alias - table_name: TableName, - /// The underlying resolution error - #[source] - source: ResolveFunctionReferencesError, - }, - - /// Catalog-qualified function reference not supported - /// - /// Only dataset-qualified functions are supported (e.g., `dataset.function`). - /// Catalog-qualified functions (e.g., `catalog.schema.function`) are not supported. - /// - /// This error is detected during function reference resolution from SQL. 
- #[error("Catalog-qualified function in '{table_name}': {source}")] - CatalogQualifiedFunction { - /// The table name that contains the catalog-qualified function reference - table_name: TableName, - /// The underlying resolution error - #[source] - source: ResolveFunctionReferencesError, - }, - - /// Dependency alias not found - /// - /// A table or function reference uses an alias that was not provided in the dependencies map. - #[error( - "Dependency alias not found: In table '{table_name}': Dependency alias '{alias}' referenced in table but not provided in dependencies" - )] - DependencyAliasNotFound { - /// The table being processed when the error occurred - table_name: TableName, - /// The dependency alias that was not found - alias: String, - }, - /// Failed to create DataFusion session configuration #[error("failed to create session config")] SessionConfig(#[source] datafusion::error::DataFusionError), - /// Failed to infer schema for table + /// Failed to plan SQL during schema inference for a table /// - /// This occurs when: - /// - Query planning fails due to invalid references - /// - Type inference fails for the query - /// - Schema determination encounters errors - #[error("Failed to infer schema for table '{table_name}': {source}")] - SchemaInference { + /// This occurs when SQL planning fails for a table query during schema + /// inference (e.g., invalid references, type mismatches). + #[error("Failed to plan SQL for table '{table_name}': {source}")] + SchemaPlanInference { /// The table name that failed schema inference table_name: TableName, - /// The underlying query context error + /// The underlying SQL planning error #[source] - source: PlanSqlError, + source: datafusion::error::DataFusionError, }, } @@ -720,30 +454,14 @@ impl IntoErrorResponse for Error { Error::EmptyTablesAndFunctions => "EMPTY_TABLES_AND_FUNCTIONS", Error::InvalidTableSql { .. } => "INVALID_TABLE_SQL", Error::NonIncrementalQuery { .. 
} => "NON_INCREMENTAL_QUERY", - Error::TableReferenceResolution { .. } => "TABLE_REFERENCE_RESOLUTION", - Error::FunctionReferenceResolution { .. } => "FUNCTION_REFERENCE_RESOLUTION", Error::DependencyNotFound { .. } => "DEPENDENCY_NOT_FOUND", Error::DependencyManifestLinkCheck { .. } => "DEPENDENCY_MANIFEST_LINK_CHECK", Error::DependencyVersionResolution { .. } => "DEPENDENCY_VERSION_RESOLUTION", - Error::CatalogQualifiedTable { .. } => "CATALOG_QUALIFIED_TABLE", - Error::UnqualifiedTable(_) => "UNQUALIFIED_TABLE", - Error::InvalidTableName(_) => "INVALID_TABLE_NAME", - Error::InvalidDependencyAliasForTableRef { .. } => { - "INVALID_DEPENDENCY_ALIAS_FOR_TABLE_REF" - } - Error::InvalidDependencyAliasForFunctionRef { .. } => { - "INVALID_DEPENDENCY_ALIAS_FOR_FUNCTION_REF" - } - Error::CatalogQualifiedFunction { .. } => "CATALOG_QUALIFIED_FUNCTION", - Error::DatasetNotFound(_) => "DATASET_NOT_FOUND", - Error::GetDataset(_) => "GET_DATASET_ERROR", - Error::EthCallUdfCreation(_) => "ETH_CALL_UDF_CREATION_ERROR", - Error::TableNotFoundInDataset(_) => "TABLE_NOT_FOUND_IN_DATASET", - Error::FunctionNotFoundInDataset(_) => "FUNCTION_NOT_FOUND_IN_DATASET", - Error::EthCallNotAvailable(_) => "ETH_CALL_NOT_AVAILABLE", - Error::DependencyAliasNotFound { .. } => "DEPENDENCY_ALIAS_NOT_FOUND", Error::SessionConfig(_) => "SESSION_CONFIG_ERROR", - Error::SchemaInference { .. } => "SCHEMA_INFERENCE", + Error::SchemaPlanInference { source, .. } if is_user_input_error(source) => { + "INVALID_PLAN" + } + Error::SchemaPlanInference { .. } => "SCHEMA_INFERENCE", } } @@ -753,26 +471,14 @@ impl IntoErrorResponse for Error { Error::EmptyTablesAndFunctions => StatusCode::BAD_REQUEST, Error::InvalidTableSql { .. } => StatusCode::BAD_REQUEST, Error::NonIncrementalQuery { .. } => StatusCode::BAD_REQUEST, - Error::TableReferenceResolution { .. } => StatusCode::BAD_REQUEST, - Error::FunctionReferenceResolution { .. } => StatusCode::BAD_REQUEST, Error::DependencyNotFound { .. 
} => StatusCode::NOT_FOUND, Error::DependencyManifestLinkCheck { .. } => StatusCode::INTERNAL_SERVER_ERROR, Error::DependencyVersionResolution { .. } => StatusCode::INTERNAL_SERVER_ERROR, - Error::CatalogQualifiedTable { .. } => StatusCode::BAD_REQUEST, - Error::UnqualifiedTable(_) => StatusCode::BAD_REQUEST, - Error::InvalidTableName(_) => StatusCode::BAD_REQUEST, - Error::InvalidDependencyAliasForTableRef { .. } => StatusCode::BAD_REQUEST, - Error::InvalidDependencyAliasForFunctionRef { .. } => StatusCode::BAD_REQUEST, - Error::CatalogQualifiedFunction { .. } => StatusCode::BAD_REQUEST, - Error::DatasetNotFound(_) => StatusCode::NOT_FOUND, - Error::GetDataset(_) => StatusCode::INTERNAL_SERVER_ERROR, - Error::EthCallUdfCreation(_) => StatusCode::INTERNAL_SERVER_ERROR, - Error::TableNotFoundInDataset(_) => StatusCode::NOT_FOUND, - Error::FunctionNotFoundInDataset(_) => StatusCode::NOT_FOUND, - Error::EthCallNotAvailable(_) => StatusCode::NOT_FOUND, - Error::DependencyAliasNotFound { .. } => StatusCode::BAD_REQUEST, Error::SessionConfig(_) => StatusCode::INTERNAL_SERVER_ERROR, - Error::SchemaInference { .. } => StatusCode::INTERNAL_SERVER_ERROR, + Error::SchemaPlanInference { source, .. } if is_user_input_error(source) => { + StatusCode::BAD_REQUEST + } + Error::SchemaPlanInference { .. 
} => StatusCode::INTERNAL_SERVER_ERROR, } } } diff --git a/crates/services/server/Cargo.toml b/crates/services/server/Cargo.toml index 87a8513ce..4e3054a06 100644 --- a/crates/services/server/Cargo.toml +++ b/crates/services/server/Cargo.toml @@ -16,7 +16,6 @@ common = { path = "../../core/common" } datafusion.workspace = true datasets-common = { path = "../../core/datasets-common", features = ["bincode"] } futures.workspace = true -js-runtime = { path = "../../core/js-runtime" } metadata-db = { path = "../../core/metadata-db" } monitoring = { path = "../../core/monitoring" } prost.workspace = true diff --git a/crates/services/server/src/flight.rs b/crates/services/server/src/flight.rs index 58fea6856..ed56a3548 100644 --- a/crates/services/server/src/flight.rs +++ b/crates/services/server/src/flight.rs @@ -21,39 +21,29 @@ use axum::{Router, http::StatusCode, response::IntoResponse}; use bytes::{BufMut, Bytes, BytesMut}; use common::{ BlockNum, BlockRange, + amp_catalog_provider::{AMP_CATALOG_NAME, AmpCatalogProvider}, arrow::{ self, array::RecordBatch, datatypes::SchemaRef, ipc::writer::{CompressionContext, DictionaryTracker, IpcDataGenerator, IpcWriteOptions}, }, - catalog::{ - logical::{ - self, - for_query::{ - CreateCatalogError as CreateLogicalCatalogError, create as create_logical_catalog, - }, - }, - physical::{ - Catalog, EarliestBlockError, - for_query::{ - CreateCatalogError as PhysicalCreateCatalogError, - create as create_physical_table_catalog, - }, + catalog::physical::{ + Catalog, EarliestBlockError, + for_query::{ + CreateCatalogError as PhysicalCreateCatalogError, + create as create_physical_table_catalog, }, }, context::{ - exec::{self, ExecContext}, - plan::{self, PlanContext}, + exec::{self, ExecContextBuilder}, + plan::{self, PlanContextBuilder}, }, cursor::Cursor, dataset_store::{DatasetStore, GetDatasetError}, detached_logical_plan::{AttachPlanError, DetachedLogicalPlan}, exec_env::ExecEnv, - sql::{ - ResolveFunctionReferencesError, 
ResolveTableReferencesError, resolve_function_references, - resolve_table_references, - }, + sql::{ResolveFunctionReferencesError, ResolveTableReferencesError, resolve_table_references}, sql_str::SqlStr, streaming_query::{QueryMessage, StreamingQuery}, }; @@ -68,7 +58,6 @@ use futures::{ Stream, StreamExt as _, TryStreamExt, stream::{self, BoxStream}, }; -use js_runtime::isolate_pool::IsolatePool; use metadata_db::{MetadataDb, NotificationMultiplexerHandle, notification_multiplexer}; use monitoring::telemetry::metrics::Meter; use prost::Message as _; @@ -88,7 +77,6 @@ type TonicStream = Pin> + Send + 'sta pub struct Service { config: Arc, env: ExecEnv, - dataset_store: DatasetStore, notification_multiplexer: Arc, metrics: Option>, } @@ -106,6 +94,7 @@ impl Service { config.query_max_mem_mb, &config.spill_location, data_store, + dataset_store, ) .map_err(InitError::ExecEnv)?; let notification_multiplexer = @@ -115,7 +104,6 @@ impl Service { Ok(Self { config, env, - dataset_store, notification_multiplexer, metrics, }) @@ -128,31 +116,32 @@ impl Service { cursor: Option, ) -> Result { let query = common::sql::parse(sql.as_ref()).map_err(Error::SqlParse)?; - let catalog = { - let table_refs = resolve_table_references::(&query) - .map_err(Error::TableReferenceResolution)?; - let func_refs = resolve_function_references::(&query) - .map_err(Error::FunctionReferenceResolution)?; - let logical = create_logical_catalog( - &self.dataset_store, - &self.env.isolate_pool, - (table_refs, func_refs), - ) - .await - .map_err(Error::CreateLogicalCatalogError)?; - create_physical_table_catalog(&self.dataset_store, &self.env.store, logical) - .await - .map_err(Error::PhysicalCatalogError) - }?; + let physical_table_refs = resolve_table_references::(&query) + .map_err(Error::TableReferenceResolution)?; + let catalog = create_physical_table_catalog( + &self.env.dataset_store, + &self.env.store, + physical_table_refs, + ) + .await + .map_err(Error::PhysicalCatalogError)?; - let ctx = 
PlanContext::new(self.env.session_config.clone(), catalog.logical().clone()); - let plan = ctx.plan_sql(query.clone()).await.map_err(Error::PlanSql)?; + let amp_catalog = Arc::new(AmpCatalogProvider::new( + self.env.dataset_store.clone(), + self.env.isolate_pool.clone(), + )); + let ctx = PlanContextBuilder::new(self.env.session_config.clone()) + .with_table_catalog(AMP_CATALOG_NAME, amp_catalog.clone()) + .with_func_catalog(AMP_CATALOG_NAME, amp_catalog) + .build(); + let plan = ctx + .statement_to_plan(query.clone()) + .await + .map_err(Error::PlanSql)?; let is_streaming = is_streaming.unwrap_or_else(|| common::stream_helpers::is_streaming(&query)); - let result = self - .execute_plan(catalog, &self.dataset_store, plan, is_streaming, cursor) - .await; + let result = self.execute_plan(catalog, plan, is_streaming, cursor).await; // Record execution error if result.is_err() @@ -173,7 +162,6 @@ impl Service { pub async fn execute_plan( &self, catalog: Catalog, - dataset_store: &DatasetStore, plan: DetachedLogicalPlan, is_streaming: bool, cursor: Option, @@ -187,7 +175,8 @@ impl Service { // If not streaming or metadata db is not available, execute once if !is_streaming { - let ctx = ExecContext::for_catalog(self.env.clone(), catalog, false) + let ctx = ExecContextBuilder::new(self.env.clone()) + .for_catalog(catalog, false) .await .map_err(Error::CreateExecContext)?; let plan = plan.attach_to(&ctx).map_err(Error::AttachPlan)?; @@ -249,7 +238,6 @@ impl Service { let query = StreamingQuery::spawn( self.env.clone(), catalog, - dataset_store, plan, earliest_block, None, @@ -313,19 +301,14 @@ impl Service { let query = common::sql::parse(&sql_str).map_err(Error::SqlParse)?; let plan_ctx = { - let table_refs = resolve_table_references::(&query) - .map_err(Error::TableReferenceResolution)?; - let func_refs = resolve_function_references::(&query) - .map_err(Error::FunctionReferenceResolution)?; - - let catalog = create_logical_catalog( - &self.dataset_store, - 
&IsolatePool::dummy(), - (table_refs, func_refs), - ) - .await - .map_err(Error::CreateLogicalCatalogError)?; - PlanContext::new(self.env.session_config.clone(), catalog) + let amp_catalog = Arc::new(AmpCatalogProvider::new( + self.env.dataset_store.clone(), + self.env.isolate_pool.clone(), + )); + PlanContextBuilder::new(self.env.session_config.clone()) + .with_table_catalog(AMP_CATALOG_NAME, amp_catalog.clone()) + .with_func_catalog(AMP_CATALOG_NAME, amp_catalog) + .build() }; let is_streaming = streaming_override @@ -333,7 +316,7 @@ impl Service { let schema = plan_ctx .sql_output_schema(query) .await - .map_err(Error::PlanSql)?; + .map_err(Error::SqlToSchema)?; let ticket = AmpTicket { query: sql_query.query, is_streaming, @@ -938,63 +921,129 @@ fn split_batch_for_grpc_response( #[derive(Error, Debug)] #[expect(clippy::enum_variant_names)] pub enum Error { + /// Failed to decode a Protocol Buffers message from the client request. + /// + /// Returned as `invalid_argument` to the client. #[error("ProtocolBuffers decoding error: {0}")] PbDecodeError(String), + /// Client sent a flight descriptor with an unrecognized descriptor type. + /// + /// Returned as `invalid_argument` to the client. #[error("unsupported flight descriptor type: {0}")] UnsupportedFlightDescriptorType(String), + /// Client sent a flight descriptor with an unsupported `Any`-typed command. + /// + /// Returned as `invalid_argument` to the client. #[error("unsupported flight descriptor command: {0}")] UnsupportedFlightDescriptorCommand(String), + /// DataFusion execution engine error during query processing. + /// + /// Maps to gRPC status based on the underlying DataFusion error kind + /// (e.g., `resource_exhausted` for memory limits). #[error("query execution error")] ExecutionError(#[source] DataFusionError), + /// Failed to look up datasets required for query planning. + /// + /// Indicates a dataset store backend failure. 
#[error("error looking up datasets")] DatasetStoreError(#[source] GetDatasetError), - #[error("error creating logical catalog")] - CreateLogicalCatalogError(#[source] CreateLogicalCatalogError), - + /// Failed to create the physical catalog for query execution. + /// + /// Indicates a backend failure during catalog snapshot construction. #[error("error creating physical catalog")] PhysicalCatalogError(#[source] PhysicalCreateCatalogError), + /// SQL text contains table references that cannot be parsed as dataset identifiers. + /// + /// Returned as `invalid_argument` to the client. #[error("Failed to resolve table references from SQL")] TableReferenceResolution(#[source] ResolveTableReferencesError), + /// SQL text contains function references that cannot be parsed as dataset identifiers. + /// + /// Returned as `invalid_argument` to the client. #[error("Failed to resolve function references from SQL")] FunctionReferenceResolution(#[source] ResolveFunctionReferencesError), + /// Client SQL text failed to parse as a valid SQL statement. + /// + /// Returned as `BAD_REQUEST` / `invalid_argument` to the client. #[error("SQL parse error")] SqlParse(#[source] common::sql::ParseSqlError), + /// SQL logical planning failed. + /// + /// When [`is_user_input_error`](plan::is_user_input_error) is true the + /// error is surfaced as `invalid_argument`; otherwise it is treated as + /// an internal failure. #[error("failed to plan SQL query")] - PlanSql(#[source] plan::SqlError), - + PlanSql(#[source] DataFusionError), + + /// Failed to infer the output schema from a SQL statement. + /// + /// When [`is_user_input_error`](plan::is_user_input_error) is true the + /// error is surfaced as `invalid_argument`; otherwise it is treated as + /// an internal failure. + #[error("failed to infer SQL output schema")] + SqlToSchema(#[source] DataFusionError), + + /// Failed to construct the execution context (catalog snapshot assembly). 
+ /// + /// Indicates a backend failure during exec context creation. #[error("failed to create exec context")] CreateExecContext(#[source] exec::CreateContextError), + /// Failed to attach a detached logical plan to an execution context. + /// + /// Occurs when the plan's table providers cannot be re-bound. #[error("failed to attach plan to query context")] AttachPlan(#[source] AttachPlanError), + /// Failed to compute the common block ranges across query tables. + /// + /// Transport mapping per sub-error: + /// - `TableNotFound` → `NOT_FOUND` / `not_found` (table absent from catalog at execution time) + /// - `ExtractTableReferences` → `INTERNAL_SERVER_ERROR` / `internal` (plan traversal failure) + /// - `TableReferenceConversion` → `INTERNAL_SERVER_ERROR` / `internal` (post-plan reference conversion failure) #[error("failed to compute common ranges")] QueryCommonRanges(#[source] exec::CommonRangesError), + /// Physical plan execution failed (table registration or DataFusion execution). + /// + /// Sub-errors distinguish registration failures from core execution failures. #[error("failed to execute plan")] QueryExecutePlan(#[source] exec::ExecutePlanError), + /// Client query is structurally invalid (e.g., missing required clauses). + /// + /// Returned as `BAD_REQUEST` / `invalid_argument` to the client. #[error("invalid query: {0}")] InvalidQuery(String), + /// Error during streaming query execution. + /// + /// Wraps errors from the streaming query pipeline + /// (microbatch iteration, reorg handling). #[error("streaming query execution error: {0}")] StreamingExecutionError(String), #[error("failed to determine streaming start block")] StreamingEarliestBlock(#[source] EarliestBlockError), + /// Failed to encode a Flight ticket for the client. + /// + /// Returned as `invalid_argument` to the client. #[error("ticket encoding error")] TicketEncodingError(#[source] DataFusionError), + /// Failed to decode a Flight ticket received from the client. 
+ /// + /// Returned as `invalid_argument` to the client. #[error("ticket decoding error")] TicketDecodingError(#[source] DataFusionError), } @@ -1007,13 +1056,14 @@ impl Error { Error::UnsupportedFlightDescriptorCommand(_) => "UNSUPPORTED_FLIGHT_DESCRIPTOR_COMMAND", Error::ExecutionError(_) => "EXECUTION_ERROR", Error::DatasetStoreError(_) => "DATASET_STORE_ERROR", - Error::CreateLogicalCatalogError(_) => "CREATE_LOGICAL_CATALOG_ERROR", Error::PhysicalCatalogError(_) => "PHYSICAL_CATALOG_ERROR", Error::TableReferenceResolution(_) => "TABLE_REFERENCE_RESOLUTION", Error::FunctionReferenceResolution(_) => "FUNCTION_REFERENCE_RESOLUTION", Error::SqlParse(_) => "SQL_PARSE_ERROR", - Error::PlanSql(e) if e.is_invalid_plan() => "INVALID_PLAN", + Error::PlanSql(e) if plan::is_user_input_error(e) => "INVALID_PLAN", Error::PlanSql(_) => "PLANNING_ERROR", + Error::SqlToSchema(e) if plan::is_user_input_error(e) => "INVALID_PLAN", + Error::SqlToSchema(_) => "PLANNING_ERROR", Error::CreateExecContext(exec::CreateContextError::CatalogSnapshot(_)) => { "CATALOG_SNAPSHOT_ERROR" } @@ -1063,23 +1113,26 @@ impl IntoResponse for Error { fn into_response(self) -> axum::response::Response { let status_code = match &self { Error::SqlParse(_) => StatusCode::BAD_REQUEST, - Error::PlanSql(e) if e.is_invalid_plan() => StatusCode::BAD_REQUEST, + Error::PlanSql(e) if plan::is_user_input_error(e) => StatusCode::BAD_REQUEST, Error::PlanSql(_) => StatusCode::INTERNAL_SERVER_ERROR, + Error::SqlToSchema(e) if plan::is_user_input_error(e) => StatusCode::BAD_REQUEST, + Error::SqlToSchema(_) => StatusCode::INTERNAL_SERVER_ERROR, Error::CreateExecContext(_) => StatusCode::INTERNAL_SERVER_ERROR, Error::AttachPlan(_) => StatusCode::INTERNAL_SERVER_ERROR, Error::QueryCommonRanges(exec::CommonRangesError::TableNotFound(_)) => { StatusCode::NOT_FOUND } - Error::QueryCommonRanges(_) => StatusCode::INTERNAL_SERVER_ERROR, + Error::QueryCommonRanges(exec::CommonRangesError::ExtractTableReferences(_)) => { + 
StatusCode::INTERNAL_SERVER_ERROR + } + Error::QueryCommonRanges(exec::CommonRangesError::TableReferenceConversion(_)) => { + StatusCode::INTERNAL_SERVER_ERROR + } Error::QueryExecutePlan(_) => StatusCode::INTERNAL_SERVER_ERROR, Error::ExecutionError(_) => StatusCode::INTERNAL_SERVER_ERROR, Error::StreamingExecutionError(_) => StatusCode::INTERNAL_SERVER_ERROR, Error::StreamingEarliestBlock(_) => StatusCode::INTERNAL_SERVER_ERROR, Error::DatasetStoreError(_) => StatusCode::INTERNAL_SERVER_ERROR, - Error::CreateLogicalCatalogError(CreateLogicalCatalogError::ResolveTables( - logical::for_query::ResolveTablesError::TableNotFoundInDataset { .. }, - )) => StatusCode::NOT_FOUND, - Error::CreateLogicalCatalogError(_) => StatusCode::INTERNAL_SERVER_ERROR, Error::PhysicalCatalogError(_) => StatusCode::INTERNAL_SERVER_ERROR, Error::TableReferenceResolution(_) => StatusCode::BAD_REQUEST, Error::FunctionReferenceResolution(_) => StatusCode::BAD_REQUEST, @@ -1108,19 +1161,27 @@ impl From for Status { Error::UnsupportedFlightDescriptorCommand(_) => Status::invalid_argument(message), Error::DatasetStoreError(_) => Status::internal(message), Error::SqlParse(_) => Status::invalid_argument(message), - Error::PlanSql(e) if e.is_invalid_plan() => Status::invalid_argument(message), + Error::PlanSql(e) if plan::is_user_input_error(e) => Status::invalid_argument(message), Error::PlanSql(_) => Status::internal(message), + Error::SqlToSchema(e) if plan::is_user_input_error(e) => { + Status::invalid_argument(message) + } + Error::SqlToSchema(_) => Status::internal(message), Error::CreateExecContext(_) => Status::internal(message), Error::AttachPlan(_) => Status::internal(message), Error::QueryCommonRanges(exec::CommonRangesError::TableNotFound(_)) => { Status::not_found(message) } - Error::QueryCommonRanges(_) => Status::internal(message), + Error::QueryCommonRanges(exec::CommonRangesError::ExtractTableReferences(_)) => { + Status::internal(message) + } + 
Error::QueryCommonRanges(exec::CommonRangesError::TableReferenceConversion(_)) => { + Status::internal(message) + } Error::QueryExecutePlan(_) => Status::internal(message), Error::ExecutionError(df) => datafusion_error_to_status(df, message), Error::StreamingExecutionError(_) => Status::internal(message), Error::StreamingEarliestBlock(_) => Status::internal(message), - Error::CreateLogicalCatalogError(_) => Status::internal(message), Error::PhysicalCatalogError(_) => Status::internal(message), Error::TableReferenceResolution(_) => Status::invalid_argument(message), Error::FunctionReferenceResolution(_) => Status::invalid_argument(message), @@ -1161,11 +1222,14 @@ fn error_with_causes(err: &dyn std::error::Error) -> String { #[cfg(test)] mod tests { + use axum::{http::StatusCode, response::IntoResponse}; use common::{ catalog::physical::EarliestBlockError, context::exec, physical_table::{CanonicalChainError, MultiNetworkSegmentsError, SnapshotError}, }; + use datafusion::error::DataFusionError; + use tonic::Code; use super::*; @@ -1219,4 +1283,205 @@ mod tests { //* Then assert_eq!(code, "STREAMING_EXECUTION_ERROR"); } + + #[test] + fn from_error_with_plan_sql_user_input_error_returns_invalid_argument() { + //* Given + let error = Error::PlanSql( + DataFusionError::Plan( + "failed to extract table references: Catalog-qualified table references are not supported: amp.public.blocks" + .to_string(), + ) + .context("amp::invalid_input"), + ); + + //* When + let status = Status::from(error); + + //* Then + assert_eq!(status.code(), Code::InvalidArgument); + } + + #[test] + fn into_response_with_sql_to_schema_user_input_error_returns_bad_request() { + //* Given + let error = Error::SqlToSchema( + DataFusionError::Plan( + "failed to extract function references: Catalog-qualified function references are not supported: amp.public.identity_udf" + .to_string(), + ) + .context("amp::invalid_input"), + ); + + //* When + let response = error.into_response(); + + //* Then + 
assert_eq!(response.status(), StatusCode::BAD_REQUEST); + } + + /// Regression: classification is based on context tag, not message text. + /// Changing the diagnostic message must not affect the status code. + #[test] + fn from_error_with_plan_sql_user_input_different_message_returns_invalid_argument() { + //* Given + let error = Error::PlanSql( + DataFusionError::Plan("completely different diagnostic text".to_string()) + .context("amp::invalid_input"), + ); + + //* When + let status = Status::from(error); + + //* Then + assert_eq!(status.code(), Code::InvalidArgument); + } + + /// Regression: internal resolution errors remain internal, regardless of message. + #[test] + fn from_error_with_plan_sql_internal_error_returns_internal() { + //* Given + let error = Error::PlanSql(DataFusionError::Plan( + "failed to resolve async catalog provider".to_string(), + )); + + //* When + let status = Status::from(error); + + //* Then + assert_eq!(status.code(), Code::Internal); + } + + /// Regression (T29): user-input tag inside a nested context wrapper must + /// still map to `invalid_argument`. A future caller that adds an outer + /// `.context("…")` to an already-tagged error must not accidentally + /// produce an `internal` gRPC status. + #[test] + fn from_error_with_plan_sql_nested_user_input_tag_returns_invalid_argument() { + //* Given — amp::invalid_input is wrapped by an outer context layer + let tagged = DataFusionError::Plan("invalid table reference".to_string()) + .context("amp::invalid_input"); + let error = + Error::PlanSql(tagged.context("failed to convert SQL statement to logical plan")); + + //* When + let status = Status::from(error); + + //* Then + assert_eq!(status.code(), Code::InvalidArgument); + } + + /// Regression (T29): same as above, but for the HTTP transport path and + /// `SqlToSchema` errors. 
+ #[test] + fn into_response_with_sql_to_schema_nested_user_input_tag_returns_bad_request() { + //* Given — amp::invalid_input is wrapped by an outer context layer + let tagged = DataFusionError::Plan("invalid function reference".to_string()) + .context("amp::invalid_input"); + let error = Error::SqlToSchema(tagged.context("failed to infer output schema")); + + //* When + let response = error.into_response(); + + //* Then + assert_eq!(response.status(), StatusCode::BAD_REQUEST); + } + + fn query_common_ranges_table_not_found_error() -> Error { + use common::sql::TableReference; + let table_name = "blocks".parse().expect("should parse valid table name"); + let table_ref = TableReference::::Bare { + table: Arc::new(table_name), + }; + Error::QueryCommonRanges(exec::CommonRangesError::TableNotFound( + exec::TableNotFoundError(table_ref), + )) + } + + fn query_common_ranges_extract_table_references_error() -> Error { + Error::QueryCommonRanges(exec::CommonRangesError::ExtractTableReferences( + DataFusionError::Plan("failed to traverse plan tree".to_string()), + )) + } + + fn query_common_ranges_table_reference_conversion_error() -> Error { + use common::sql::TableReferenceConversionError; + Error::QueryCommonRanges(exec::CommonRangesError::TableReferenceConversion( + TableReferenceConversionError::CatalogQualifiedTable { + table_ref: "catalog.schema.table".to_string(), + }, + )) + } + + #[test] + fn into_response_with_query_common_ranges_table_not_found_returns_not_found() { + //* Given + let error = query_common_ranges_table_not_found_error(); + + //* When + let response = error.into_response(); + + //* Then + assert_eq!(response.status(), StatusCode::NOT_FOUND); + } + + #[test] + fn from_error_with_query_common_ranges_table_not_found_returns_not_found() { + //* Given + let error = query_common_ranges_table_not_found_error(); + + //* When + let status = Status::from(error); + + //* Then + assert_eq!(status.code(), Code::NotFound); + } + + #[test] + fn 
into_response_with_query_common_ranges_extract_table_references_returns_internal() { + //* Given + let error = query_common_ranges_extract_table_references_error(); + + //* When + let response = error.into_response(); + + //* Then + assert_eq!(response.status(), StatusCode::INTERNAL_SERVER_ERROR); + } + + #[test] + fn from_error_with_query_common_ranges_extract_table_references_returns_internal() { + //* Given + let error = query_common_ranges_extract_table_references_error(); + + //* When + let status = Status::from(error); + + //* Then + assert_eq!(status.code(), Code::Internal); + } + + #[test] + fn into_response_with_query_common_ranges_table_reference_conversion_returns_internal() { + //* Given + let error = query_common_ranges_table_reference_conversion_error(); + + //* When + let response = error.into_response(); + + //* Then + assert_eq!(response.status(), StatusCode::INTERNAL_SERVER_ERROR); + } + + #[test] + fn from_error_with_query_common_ranges_table_reference_conversion_returns_internal() { + //* Given + let error = query_common_ranges_table_reference_conversion_error(); + + //* When + let status = Status::from(error); + + //* Then + assert_eq!(status.code(), Code::Internal); + } } diff --git a/tests/src/testlib/helpers.rs b/tests/src/testlib/helpers.rs index 202c5169b..3a9ef3a44 100644 --- a/tests/src/testlib/helpers.rs +++ b/tests/src/testlib/helpers.rs @@ -15,7 +15,7 @@ use anyhow::{Result, anyhow}; use common::{ BlockRange, catalog::{ - logical::{LogicalCatalog, LogicalTable}, + logical::LogicalTable, physical::{Catalog, CatalogTable}, }, dataset_store::DatasetStore, @@ -375,8 +375,12 @@ pub async fn catalog_for_dataset( ) }) .collect(); - let logical = LogicalCatalog::from_tables(resolved_tables.iter()); - Ok(Catalog::new(logical, tables)) + Ok(Catalog::new( + resolved_tables, + vec![], + tables, + Default::default(), + )) } /// Create a test metrics context for validating metrics collection. 
diff --git a/tests/src/tests/it_admin_api_schema.rs b/tests/src/tests/it_admin_api_schema.rs index b98103a73..d3f5fad5e 100644 --- a/tests/src/tests/it_admin_api_schema.rs +++ b/tests/src/tests/it_admin_api_schema.rs @@ -5,7 +5,7 @@ //! ## Test Coverage //! //! ### Table Qualification Levels -//! - Unqualified: `blocks` → ERROR (BAD_REQUEST) +//! - Unqualified: `blocks` → ERROR (INTERNAL_SERVER_ERROR, lazy resolution) //! - Catalog-qualified: `catalog.eth_firehose.blocks` → ERROR (BAD_REQUEST) //! //! ### Error Conditions @@ -34,7 +34,7 @@ async fn resolve_schema_with_unqualified_table_fails() { //* Then assert_eq!( resp.status(), - StatusCode::BAD_REQUEST, + StatusCode::INTERNAL_SERVER_ERROR, "schema resolution should fail for unqualified table reference" ); @@ -44,13 +44,8 @@ async fn resolve_schema_with_unqualified_table_fails() { .expect("failed to parse error response JSON"); assert_eq!( - response.error_code, "UNQUALIFIED_TABLE", - "should return UNQUALIFIED_TABLE for unqualified table" - ); - assert!( - response.error_message.contains("Unqualified table"), - "error message should mention unqualified table, got: {}", - response.error_message + response.error_code, "SCHEMA_INFERENCE", + "should return SCHEMA_INFERENCE for unqualified table (lazy resolution)" ); } @@ -82,11 +77,13 @@ async fn resolve_schema_with_catalog_qualified_table_fails() { .expect("failed to parse error response JSON"); assert_eq!( - response.error_code, "CATALOG_QUALIFIED_TABLE", - "should return CATALOG_QUALIFIED_TABLE for catalog-qualified table" + response.error_code, "INVALID_PLAN", + "should return INVALID_PLAN for catalog-qualified table" ); assert!( - response.error_message.contains("Catalog-qualified table"), + response + .error_message + .contains("Catalog-qualified table references are not supported"), "error message should mention catalog-qualified table, got: {}", response.error_message ); @@ -607,7 +604,7 @@ async fn multiple_tables_unqualified_table_fails() { //* Then 
assert_eq!( resp.status(), - StatusCode::BAD_REQUEST, + StatusCode::INTERNAL_SERVER_ERROR, "schema resolution should fail with unqualified table" ); @@ -617,8 +614,8 @@ async fn multiple_tables_unqualified_table_fails() { .expect("failed to parse error response JSON"); assert_eq!( - response.error_code, "UNQUALIFIED_TABLE", - "should return UNQUALIFIED_TABLE" + response.error_code, "SCHEMA_INFERENCE", + "should return SCHEMA_INFERENCE for unqualified table (lazy resolution)" ); } @@ -655,8 +652,8 @@ async fn multiple_tables_catalog_qualified_fails() { .expect("failed to parse error response JSON"); assert_eq!( - response.error_code, "CATALOG_QUALIFIED_TABLE", - "should return CATALOG_QUALIFIED_TABLE" + response.error_code, "INVALID_PLAN", + "should return INVALID_PLAN for catalog-qualified table" ); } @@ -716,7 +713,7 @@ async fn multiple_tables_undefined_alias_fails() { //* Then assert_eq!( resp.status(), - StatusCode::BAD_REQUEST, + StatusCode::INTERNAL_SERVER_ERROR, "schema resolution should fail with undefined alias" ); @@ -726,8 +723,8 @@ async fn multiple_tables_undefined_alias_fails() { .expect("failed to parse error response JSON"); assert_eq!( - response.error_code, "DEPENDENCY_ALIAS_NOT_FOUND", - "should return DEPENDENCY_ALIAS_NOT_FOUND" + response.error_code, "SCHEMA_INFERENCE", + "should return SCHEMA_INFERENCE for undefined alias (lazy resolution)" ); } @@ -1190,7 +1187,7 @@ async fn multiple_tables_table_not_in_dataset_fails() { //* Then assert_eq!( resp.status(), - StatusCode::NOT_FOUND, + StatusCode::INTERNAL_SERVER_ERROR, "schema resolution should fail with table not in dataset" ); @@ -1200,8 +1197,8 @@ async fn multiple_tables_table_not_in_dataset_fails() { .expect("failed to parse error response JSON"); assert_eq!( - response.error_code, "TABLE_NOT_FOUND_IN_DATASET", - "should return TABLE_NOT_FOUND_IN_DATASET" + response.error_code, "SCHEMA_INFERENCE", + "should return SCHEMA_INFERENCE for table not in dataset (lazy resolution)" ); } @@ -1410,8 
+1407,8 @@ async fn function_not_in_dataset_fails_at_catalog_construction() { //* Then assert_eq!( resp.status(), - StatusCode::NOT_FOUND, - "schema resolution should fail at catalog construction with 404" + StatusCode::INTERNAL_SERVER_ERROR, + "schema resolution should fail for nonexistent function (lazy resolution)" ); let response: ErrorResponse = resp @@ -1420,14 +1417,8 @@ async fn function_not_in_dataset_fails_at_catalog_construction() { .expect("failed to parse error response JSON"); assert_eq!( - response.error_code, "FUNCTION_NOT_FOUND_IN_DATASET", - "should return FUNCTION_NOT_FOUND_IN_DATASET error" - ); - assert!( - response.error_message.contains("nonexistent_function") - && response.error_message.contains("_/eth_firehose"), - "error message should indicate function and dataset, got: {}", - response.error_message + response.error_code, "SCHEMA_INFERENCE", + "should return SCHEMA_INFERENCE for nonexistent function (lazy resolution)" ); } @@ -1458,7 +1449,7 @@ async fn multiple_tables_with_missing_function_fails_on_first() { //* Then assert_eq!( resp.status(), - StatusCode::NOT_FOUND, + StatusCode::INTERNAL_SERVER_ERROR, "schema resolution should fail when a table references missing function" ); @@ -1468,14 +1459,8 @@ async fn multiple_tables_with_missing_function_fails_on_first() { .expect("failed to parse error response JSON"); assert_eq!( - response.error_code, "FUNCTION_NOT_FOUND_IN_DATASET", - "should return FUNCTION_NOT_FOUND_IN_DATASET error" - ); - // Should fail on table2 which references fake_decode - assert!( - response.error_message.contains("table2") && response.error_message.contains("fake_decode"), - "error message should reference the failing table and function, got: {}", - response.error_message + response.error_code, "SCHEMA_INFERENCE", + "should return SCHEMA_INFERENCE for missing function (lazy resolution)" ); } @@ -1550,17 +1535,10 @@ async fn function_with_catalog_qualification_fails() { .await .expect("failed to parse error 
response JSON"); - // The error should indicate catalog-qualified function (not supported) + // The error should indicate invalid plan (catalog-qualified functions caught by planner) assert_eq!( - response.error_code, "CATALOG_QUALIFIED_FUNCTION", - "should return CATALOG_QUALIFIED_FUNCTION for catalog-qualified function" - ); - assert!( - response - .error_message - .contains("Catalog-qualified function references are not supported"), - "error message should mention catalog-qualified functions are not supported, got: {}", - response.error_message + response.error_code, "INVALID_PLAN", + "should return INVALID_PLAN for catalog-qualified function" ); } @@ -1596,15 +1574,10 @@ async fn function_with_invalid_format_fails() { .await .expect("failed to parse error response JSON"); - // The error should indicate function reference resolution failure + // The error should indicate invalid plan (invalid function format caught by planner) assert_eq!( - response.error_code, "FUNCTION_REFERENCE_RESOLUTION", - "should return FUNCTION_REFERENCE_RESOLUTION for invalid function format" - ); - assert!( - response.error_message.contains("Invalid function format"), - "error message should mention invalid function format, got: {}", - response.error_message + response.error_code, "INVALID_PLAN", + "should return INVALID_PLAN for invalid function format" ); } @@ -1631,7 +1604,7 @@ async fn function_with_undefined_alias_fails() { //* Then assert_eq!( resp.status(), - StatusCode::BAD_REQUEST, + StatusCode::INTERNAL_SERVER_ERROR, "schema resolution should fail with undefined function alias" ); @@ -1641,13 +1614,8 @@ async fn function_with_undefined_alias_fails() { .expect("failed to parse error response JSON"); assert_eq!( - response.error_code, "DEPENDENCY_ALIAS_NOT_FOUND", - "should return DEPENDENCY_ALIAS_NOT_FOUND for undefined function alias" - ); - assert!( - response.error_message.contains("undefined_alias"), - "error message should mention the undefined alias, got: {}", - 
response.error_message + response.error_code, "SCHEMA_INFERENCE", + "should return SCHEMA_INFERENCE for undefined function alias (lazy resolution)" ); } @@ -1711,7 +1679,7 @@ async fn multiple_functions_mixed_validity_fails() { //* Then assert_eq!( resp.status(), - StatusCode::NOT_FOUND, + StatusCode::INTERNAL_SERVER_ERROR, "schema resolution should fail due to invalid function despite valid built-ins" ); @@ -1721,13 +1689,8 @@ async fn multiple_functions_mixed_validity_fails() { .expect("failed to parse error response JSON"); assert_eq!( - response.error_code, "FUNCTION_NOT_FOUND_IN_DATASET", - "should return FUNCTION_NOT_FOUND_IN_DATASET error" - ); - assert!( - response.error_message.contains("nonexistent_fn"), - "error message should indicate the invalid function, got: {}", - response.error_message + response.error_code, "SCHEMA_INFERENCE", + "should return SCHEMA_INFERENCE for nonexistent function (lazy resolution)" ); } diff --git a/tests/src/tests/it_reorg.rs b/tests/src/tests/it_reorg.rs index a6b86c030..cefeccbc1 100644 --- a/tests/src/tests/it_reorg.rs +++ b/tests/src/tests/it_reorg.rs @@ -4,10 +4,8 @@ use alloy::primitives::BlockHash; use arrow_flight::FlightData; use common::{ BlockNum, BlockRange, - catalog::{ - logical::for_query as logical_catalog, physical::for_query as physical_table_catalog, - }, - sql::{self, resolve_function_references, resolve_table_references}, + catalog::physical::for_query as physical_table_catalog, + sql::{self, resolve_table_references}, sql_str::SqlStr, }; use datasets_common::{partial_reference::PartialReference, reference::Reference}; @@ -476,29 +474,13 @@ impl ReorgTestCtx { let test_env = &self.ctx; let sql_query = SqlStr::new_unchecked(format!("select * from {}.blocks", dataset)); let sql = sql::parse(&sql_query).expect("Failed to parse SQL for dataset.blocks"); - let env = common::exec_env::create( - test_env.daemon_server().config().max_mem_mb, - test_env.daemon_server().config().query_max_mem_mb, - 
&test_env.daemon_server().config().spill_location, - test_env.daemon_server().data_store().clone(), - ) - .expect("Failed to create query environment"); let catalog = { let table_refs = resolve_table_references::(&sql) .expect("Failed to resolve table references"); - let func_refs = resolve_function_references::(&sql) - .expect("Failed to resolve function references"); - let logical = logical_catalog::create( - test_env.daemon_server().dataset_store(), - &env.isolate_pool, - (table_refs, func_refs), - ) - .await - .expect("Failed to create logical catalog"); physical_table_catalog::create( test_env.daemon_server().dataset_store(), test_env.daemon_server().data_store(), - logical, + table_refs, ) .await .expect("Failed to create physical catalog for SQL query") @@ -645,7 +627,7 @@ async fn check_batch(flight_client: &mut FlightClient, take: usize) { let blocks: Vec = serde_json::from_value(blocks).expect("Failed to deserialize blocks"); - let mut by_number: BTreeMap> = BTreeMap::new(); + let mut by_number: BTreeMap> = Default::default(); for block in blocks { by_number.entry(block.block_num).or_default().push(block); } diff --git a/tests/src/tests/it_server_sql_references.rs b/tests/src/tests/it_server_sql_references.rs new file mode 100644 index 000000000..59fccfd6c --- /dev/null +++ b/tests/src/tests/it_server_sql_references.rs @@ -0,0 +1,357 @@ +use std::time::Duration; + +use datasets_common::reference::Reference; +use monitoring::logging; + +use crate::testlib::{ + self, ctx::TestCtxBuilder, fixtures::DatasetPackage, helpers as test_helpers, +}; + +#[tokio::test] +async fn dataset_query_with_semver_ref_returns_data() { + //* Given + let mut ctx = TestCtx::new("dataset_ref_semver").await; + ctx.restore_eth_firehose_snapshot().await; + ctx.register_multi_version_dataset("0.0.1").await; + ctx.materialize_multi_version_dataset("_/multi_version@0.0.1") + .await; + + //* When + let (results, _) = ctx + .query(r#"SELECT block_num FROM "_/multi_version@0.0.1".blocks 
ORDER BY block_num LIMIT 1"#) + .await; + + //* Then + assert_eq!(results, serde_json::json!([{"block_num": 15000000}])); +} + +#[tokio::test] +async fn dataset_query_with_bare_name_ref_returns_latest() { + //* Given + let mut ctx = TestCtx::new("dataset_ref_latest").await; + ctx.restore_eth_firehose_snapshot().await; + ctx.register_multi_version_dataset("0.0.1").await; + ctx.materialize_multi_version_dataset("_/multi_version@0.0.1") + .await; + + //* When + let (results, _) = ctx + .query("SELECT block_num FROM multi_version.blocks ORDER BY block_num LIMIT 1") + .await; + + //* Then + assert_eq!(results, serde_json::json!([{"block_num": 15000000}])); +} + +#[tokio::test] +async fn dataset_query_with_explicit_latest_ref_returns_data() { + //* Given + let mut ctx = TestCtx::new("dataset_ref_latest_explicit").await; + ctx.restore_eth_firehose_snapshot().await; + ctx.register_multi_version_dataset("0.0.1").await; + ctx.materialize_multi_version_dataset("_/multi_version@0.0.1") + .await; + + //* When + let (results, _) = ctx + .query( + r#"SELECT block_num FROM "_/multi_version@latest".blocks ORDER BY block_num LIMIT 1"#, + ) + .await; + + //* Then + assert_eq!(results, serde_json::json!([{"block_num": 15000000}])); +} + +#[tokio::test] +async fn dataset_query_with_dev_ref_returns_data() { + //* Given + let mut ctx = TestCtx::new("dataset_ref_dev").await; + ctx.restore_eth_firehose_snapshot().await; + ctx.register_multi_version_dataset_dev_revision().await; + ctx.materialize_multi_version_dataset("_/multi_version@dev") + .await; + + //* When + let (results, _) = ctx + .query(r#"SELECT block_num FROM "_/multi_version@dev".blocks ORDER BY block_num LIMIT 1"#) + .await; + + //* Then + assert_eq!(results, serde_json::json!([{"block_num": 15000000}])); +} + +#[tokio::test] +async fn dataset_query_with_hash_ref_returns_data() { + //* Given + let mut ctx = TestCtx::new("dataset_ref_hash").await; + ctx.restore_eth_firehose_snapshot().await; + 
ctx.register_multi_version_dataset("0.0.1").await; + ctx.materialize_multi_version_dataset("_/multi_version@0.0.1") + .await; + let hash = ctx.get_manifest_hash("_/multi_version@0.0.1").await; + + //* When + let (results, _) = ctx + .query(&format!( + r#"SELECT block_num FROM "_/multi_version@{hash}".blocks ORDER BY block_num LIMIT 1"# + )) + .await; + + //* Then + assert_eq!(results, serde_json::json!([{"block_num": 15000000}])); +} + +#[tokio::test] +async fn dataset_query_with_underscore_namespace_ref_returns_data() { + //* Given + let mut ctx = TestCtx::new("dataset_ref_underscore_ns").await; + ctx.restore_eth_firehose_snapshot().await; + ctx.register_multi_version_dataset("0.0.1").await; + ctx.materialize_multi_version_dataset("_/multi_version@0.0.1") + .await; + + //* When + let (results, _) = ctx + .query(r#"SELECT block_num FROM "_/multi_version@0.0.1".blocks ORDER BY block_num LIMIT 1"#) + .await; + + //* Then + assert_eq!(results, serde_json::json!([{"block_num": 15000000}])); +} + +#[tokio::test] +async fn function_query_with_semver_ref_returns_result() { + //* Given + let mut ctx = TestCtx::new("function_ref_semver").await; + ctx.register_basic_function_dataset("0.0.0").await; + + //* When + let (results, _) = ctx + .query(r#"SELECT "_/basic_function@0.0.0".testString()"#) + .await; + + //* Then + let row = results.as_array().expect("should be array")[0] + .as_object() + .expect("should be object"); + let value = row.values().next().expect("should have a value"); + assert_eq!(value, "I'm a function"); +} + +#[tokio::test] +async fn function_query_with_bare_name_ref_returns_result() { + //* Given + let mut ctx = TestCtx::new("function_ref_latest").await; + ctx.register_basic_function_dataset("0.0.0").await; + + //* When + let (results, _) = ctx.query("SELECT basic_function.testString()").await; + + //* Then + let row = results.as_array().expect("should be array")[0] + .as_object() + .expect("should be object"); + let value = 
row.values().next().expect("should have a value"); + assert_eq!(value, "I'm a function"); +} + +#[tokio::test] +async fn function_query_with_explicit_latest_ref_returns_result() { + //* Given + let mut ctx = TestCtx::new("function_ref_latest_explicit").await; + ctx.register_basic_function_dataset("0.0.0").await; + + //* When + let (results, _) = ctx + .query(r#"SELECT "_/basic_function@latest".testString()"#) + .await; + + //* Then + let row = results.as_array().expect("should be array")[0] + .as_object() + .expect("should be object"); + let value = row.values().next().expect("should have a value"); + assert_eq!(value, "I'm a function"); +} + +#[tokio::test] +async fn function_query_with_dev_ref_returns_result() { + //* Given + let mut ctx = TestCtx::new("function_ref_dev").await; + ctx.register_basic_function_dataset_dev_revision().await; + + //* When + let (results, _) = ctx + .query(r#"SELECT "_/basic_function@dev".testString()"#) + .await; + + //* Then + let row = results.as_array().expect("should be array")[0] + .as_object() + .expect("should be object"); + let value = row.values().next().expect("should have a value"); + assert_eq!(value, "I'm a function"); +} + +#[tokio::test] +async fn function_query_with_hash_ref_returns_result() { + //* Given + let mut ctx = TestCtx::new("function_ref_hash").await; + ctx.register_basic_function_dataset("0.0.0").await; + let hash = ctx.get_manifest_hash("_/basic_function@0.0.0").await; + + //* When + let (results, _) = ctx + .query(&format!(r#"SELECT "_/basic_function@{hash}".testString()"#)) + .await; + + //* Then + let row = results.as_array().expect("should be array")[0] + .as_object() + .expect("should be object"); + let value = row.values().next().expect("should have a value"); + assert_eq!(value, "I'm a function"); +} + +#[tokio::test] +async fn function_query_with_underscore_namespace_ref_returns_result() { + //* Given + let mut ctx = TestCtx::new("function_ref_underscore_ns").await; + 
ctx.register_basic_function_dataset("0.0.0").await;
+
+    //* When
+    let (results, _) = ctx
+        .query(r#"SELECT "_/basic_function@0.0.0".testString()"#)
+        .await;
+
+    //* Then
+    let row = results.as_array().expect("should be array")[0]
+        .as_object()
+        .expect("should be object");
+    let value = row.values().next().expect("should have a value");
+    assert_eq!(value, "I'm a function");
+}
+
+/// Test context for SQL reference tests.
+///
+/// Wraps the test environment and flight client, providing infallible
+/// helper methods for common setup steps (snapshot restoration, package
+/// registration, dataset dumping, and querying).
+struct TestCtx {
+    ctx: testlib::ctx::TestCtx,
+    amp_cli: testlib::fixtures::AmpCli,
+    flight_client: testlib::fixtures::FlightClient,
+}
+
+impl TestCtx {
+    /// Create a new test context with `eth_firehose` manifest, snapshot, and provider.
+    async fn new(test_name: &str) -> Self {
+        logging::init();
+
+        let ctx = TestCtxBuilder::new(test_name)
+            .with_dataset_manifest("eth_firehose")
+            .with_dataset_snapshot("eth_firehose")
+            .with_provider_config("firehose_eth_mainnet")
+            .build()
+            .await
+            .expect("failed to create test environment");
+
+        let amp_cli = ctx.new_amp_cli();
+
+        let flight_client = ctx
+            .new_flight_client()
+            .await
+            .expect("failed to create flight client");
+
+        Self {
+            ctx,
+            amp_cli,
+            flight_client,
+        }
+    }
+
+    /// Restore the `eth_firehose` snapshot so dependency data is available for queries.
+    async fn restore_eth_firehose_snapshot(&self) {
+        let ampctl = self.ctx.new_ampctl();
+        let eth_firehose_ref: Reference = "_/eth_firehose@0.0.0"
+            .parse()
+            .expect("should parse eth_firehose reference");
+        test_helpers::restore_dataset_snapshot(
+            &ampctl,
+            self.ctx.daemon_controller().dataset_store(),
+            self.ctx.daemon_server().data_store(),
+            &eth_firehose_ref,
+        )
+        .await
+        .expect("failed to restore eth_firehose snapshot");
+    }
+
+    /// Register the `multi_version` package with a semver tag.
+    async fn register_multi_version_dataset(&self, tag: &str) {
+        let package = DatasetPackage::new("multi_version", Some("v1.config.ts"));
+        package
+            .register(&self.amp_cli, tag)
+            .await
+            .expect("failed to register multi_version package");
+    }
+
+    /// Register the `multi_version` package as a dev revision (no semver tag).
+    async fn register_multi_version_dataset_dev_revision(&self) {
+        let package = DatasetPackage::new("multi_version", Some("v1.config.ts"));
+        package
+            .register(&self.amp_cli, None)
+            .await
+            .expect("failed to register multi_version package");
+    }
+
+    /// Materialize the `multi_version` derived dataset at block 15000000.
+    async fn materialize_multi_version_dataset(&self, reference: &str) {
+        let dump_ref: Reference = reference
+            .parse()
+            .expect("should parse multi_version reference");
+        let ampctl = self.ctx.new_ampctl();
+        test_helpers::deploy_and_wait(&ampctl, &dump_ref, Some(15000000), Duration::from_secs(60))
+            .await
+            .expect("failed to dump multi_version dataset");
+    }
+
+    /// Register the `basic_function` package with a semver tag.
+    async fn register_basic_function_dataset(&self, tag: &str) {
+        let package = DatasetPackage::new("basic_function", None);
+        package
+            .register(&self.amp_cli, tag)
+            .await
+            .expect("failed to register basic_function package");
+    }
+
+    /// Register the `basic_function` package as a dev revision (no semver tag).
+    async fn register_basic_function_dataset_dev_revision(&self) {
+        let package = DatasetPackage::new("basic_function", None);
+        package
+            .register(&self.amp_cli, None)
+            .await
+            .expect("failed to register basic_function package");
+    }
+
+    /// Execute a SQL query via the Flight SQL client.
+    async fn query(&mut self, sql: &str) -> (serde_json::Value, usize) {
+        self.flight_client
+            .run_query(sql, None)
+            .await
+            .expect("query should succeed")
+    }
+
+    /// Get the latest manifest hash for the given dataset reference.
+ async fn get_manifest_hash(&self, dataset: &str) -> String { + let dataset_ref: Reference = dataset + .parse() + .expect("should parse dataset reference for hash lookup"); + self.ctx + .new_ampctl() + .get_latest_manifest_hash(&dataset_ref) + .await + .expect("should get manifest hash") + .to_string() + } +} diff --git a/tests/src/tests/mod.rs b/tests/src/tests/mod.rs index dcf968a96..75d5a8bb3 100644 --- a/tests/src/tests/mod.rs +++ b/tests/src/tests/mod.rs @@ -17,6 +17,7 @@ mod it_multi_network_batch; mod it_multi_table_continuous; mod it_non_incremental; mod it_reorg; +mod it_server_sql_references; mod it_solana_historical_to_json_rpc_transition; mod it_sql; mod it_sql_dataset_batch_size;