diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index d04261456d600..de558331e5b26 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -15,23 +15,34 @@ // specific language governing permissions and limitations // under the License. -//! NB: This module is a work in progress. -//! We merged it early in -//! with the skeleton and snapshots matching the current state, -//! but the actual implementation is pending further development. -//! There may be comments or code that are incomplete or inaccurate. //! Two-pass optimizer pipeline that pushes cheap expressions (like struct field //! access `user['status']`) closer to data sources, enabling early data reduction //! and source-level optimizations (e.g., Parquet column pruning). See //! [`ExtractLeafExpressions`] (pass 1) and [`PushDownLeafProjections`] (pass 2). -use datafusion_common::Result; -use datafusion_common::tree_node::Transformed; +use indexmap::{IndexMap, IndexSet}; +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion_common::alias::AliasGenerator; +use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion}; +use datafusion_common::{Column, DFSchema, Result, qualified_name}; use datafusion_expr::logical_plan::LogicalPlan; +use datafusion_expr::{Expr, ExpressionPlacement, Projection}; use crate::optimizer::ApplyOrder; +use crate::push_down_filter::replace_cols_by_name; +use crate::utils::has_all_column_refs; use crate::{OptimizerConfig, OptimizerRule}; +/// Prefix for aliases generated by the extraction optimizer passes. +/// +/// This prefix is **reserved for internal optimizer use**. User-defined aliases +/// starting with this prefix may be misidentified as optimizer-generated +/// extraction aliases, leading to unexpected behavior. Do not use this prefix +/// in user queries. +const EXTRACTED_EXPR_PREFIX: &str = "__datafusion_extracted"; + /// Extracts `MoveTowardsLeafNodes` sub-expressions from non-projection nodes /// into **extraction projections** (pass 1 of 2). /// @@ -72,7 +83,8 @@ use crate::{OptimizerConfig, OptimizerRule}; /// ``` /// /// **Important:** The `PushDownFilter` rule is aware of projections created by this rule -/// and will not push filters through them. See `is_extracted_expr_projection` in utils.rs. +/// and will not push filters through them. It uses `ExpressionPlacement` to detect +/// `MoveTowardsLeafNodes` expressions and skip filter pushdown past them. #[derive(Default, Debug)] pub struct ExtractLeafExpressions {} @@ -95,9 +107,533 @@ impl OptimizerRule for ExtractLeafExpressions { fn rewrite( &self, plan: LogicalPlan, - _config: &dyn OptimizerConfig, + config: &dyn OptimizerConfig, ) -> Result> { - Ok(Transformed::no(plan)) + let alias_generator = config.alias_generator(); + extract_from_plan(plan, alias_generator) + } +} + +/// Extracts `MoveTowardsLeafNodes` sub-expressions from a plan node. +/// +/// Works for any number of inputs (0, 1, 2, …N). For multi-input nodes +/// like Join, each extracted sub-expression is routed to the correct input +/// by checking which input's schema contains all of the expression's column +/// references. +fn extract_from_plan( + plan: LogicalPlan, + alias_generator: &Arc, +) -> Result> { + // Only extract from plan types whose output schema is predictable after + // expression rewriting. Nodes like Window derive column names from + // their expressions, so rewriting `get_field` inside a window function + // changes the output schema and breaks the recovery projection. + if !matches!( + &plan, + LogicalPlan::Aggregate(_) + | LogicalPlan::Filter(_) + | LogicalPlan::Sort(_) + | LogicalPlan::Limit(_) + | LogicalPlan::Join(_) + ) { + return Ok(Transformed::no(plan)); + } + + let inputs = plan.inputs(); + if inputs.is_empty() { + return Ok(Transformed::no(plan)); + } + + // Save original output schema before any transformation + let original_schema = Arc::clone(plan.schema()); + + // Build per-input schemas from borrowed inputs (before plan is consumed + // by map_expressions). We only need schemas and column sets for routing; + // the actual inputs are cloned later only if extraction succeeds. + let input_schemas: Vec> = + inputs.iter().map(|i| Arc::clone(i.schema())).collect(); + + // Build per-input extractors + let mut extractors: Vec = input_schemas + .iter() + .map(|schema| LeafExpressionExtractor::new(schema.as_ref(), alias_generator)) + .collect(); + + // Build per-input column sets for routing expressions to the correct input + let input_column_sets: Vec> = input_schemas + .iter() + .map(|schema| schema_columns(schema.as_ref())) + .collect(); + + // Transform expressions via map_expressions with routing + let transformed = plan.map_expressions(|expr| { + routing_extract(expr, &mut extractors, &input_column_sets) + })?; + + // If no expressions were rewritten, nothing was extracted + if !transformed.transformed { + return Ok(transformed); + } + + // Clone inputs now that we know extraction succeeded. Wrap in Arc + // upfront since build_extraction_projection expects &Arc. + let owned_inputs: Vec> = transformed + .data + .inputs() + .into_iter() + .map(|i| Arc::new(i.clone())) + .collect(); + + // Build per-input extraction projections (None means no extractions for that input) + let new_inputs: Vec = owned_inputs + .into_iter() + .zip(extractors.iter()) + .map(|(input_arc, extractor)| { + match extractor.build_extraction_projection(&input_arc)? { + Some(plan) => Ok(plan), + // No extractions for this input — recover the LogicalPlan + // without cloning (refcount is 1 since build returned None). + None => { + Ok(Arc::try_unwrap(input_arc).unwrap_or_else(|arc| (*arc).clone())) + } + } + }) + .collect::>>()?; + + // Rebuild the plan keeping its rewritten expressions but replacing + // inputs with the new extraction projections. + let new_plan = transformed + .data + .with_new_exprs(transformed.data.expressions(), new_inputs)?; + + // Add recovery projection if the output schema changed + let recovered = build_recovery_projection(original_schema.as_ref(), new_plan)?; + + Ok(Transformed::yes(recovered)) +} + +/// Given an expression, returns the index of the input whose columns fully +/// cover the expression's column references. +/// Returns `None` if the expression references columns from multiple inputs +/// or if multiple inputs match (ambiguous, e.g. unqualified columns present +/// in both sides of a join). +fn find_owning_input( + expr: &Expr, + input_column_sets: &[std::collections::HashSet], +) -> Option { + let mut found = None; + for (idx, cols) in input_column_sets.iter().enumerate() { + if has_all_column_refs(expr, cols) { + if found.is_some() { + // Ambiguous — multiple inputs match + return None; + } + found = Some(idx); + } + } + found +} + +/// Walks an expression tree top-down, extracting `MoveTowardsLeafNodes` +/// sub-expressions and routing each to the correct per-input extractor. +fn routing_extract( + expr: Expr, + extractors: &mut [LeafExpressionExtractor], + input_column_sets: &[std::collections::HashSet], +) -> Result> { + expr.transform_down(|e| { + // Skip expressions already aliased with extracted expression pattern + if let Expr::Alias(alias) = &e + && alias.name.starts_with(EXTRACTED_EXPR_PREFIX) + { + return Ok(Transformed { + data: e, + transformed: false, + tnr: TreeNodeRecursion::Jump, + }); + } + + // Don't extract Alias nodes directly — preserve the alias and let + // transform_down recurse into the inner expression + if matches!(&e, Expr::Alias(_)) { + return Ok(Transformed::no(e)); + } + + match e.placement() { + ExpressionPlacement::MoveTowardsLeafNodes => { + if let Some(idx) = find_owning_input(&e, input_column_sets) { + let col_ref = extractors[idx].add_extracted(e)?; + Ok(Transformed::yes(col_ref)) + } else { + // References columns from multiple inputs — cannot extract + Ok(Transformed::no(e)) + } + } + ExpressionPlacement::Column => { + // Track columns that the parent node references so the + // extraction projection includes them as pass-through. + // Without this, the extraction projection would only + // contain __datafusion_extracted_N aliases, and the parent couldn't + // resolve its other column references. + if let Expr::Column(col) = &e + && let Some(idx) = find_owning_input(&e, input_column_sets) + { + extractors[idx].columns_needed.insert(col.clone()); + } + Ok(Transformed::no(e)) + } + _ => Ok(Transformed::no(e)), + } + }) +} + +/// Returns all columns in the schema (both qualified and unqualified forms) +fn schema_columns(schema: &DFSchema) -> std::collections::HashSet { + schema + .iter() + .flat_map(|(qualifier, field)| { + [ + Column::new(qualifier.cloned(), field.name()), + Column::new_unqualified(field.name()), + ] + }) + .collect() +} + +/// Rewrites extraction pairs and column references from one qualifier +/// space to another. +/// +/// Builds a replacement map by zipping `from_schema` (whose qualifiers +/// currently appear in `pairs` / `columns`) with `to_schema` (the +/// qualifiers we want), then applies `replace_cols_by_name`. +/// +/// Used for SubqueryAlias (alias-space -> input-space) and Union +/// (union output-space -> per-branch input-space). +fn remap_pairs_and_columns( + pairs: &[(Expr, String)], + columns: &IndexSet, + from_schema: &DFSchema, + to_schema: &DFSchema, +) -> Result { + let mut replace_map = HashMap::new(); + for ((from_q, from_f), (to_q, to_f)) in from_schema.iter().zip(to_schema.iter()) { + replace_map.insert( + qualified_name(from_q, from_f.name()), + Expr::Column(Column::new(to_q.cloned(), to_f.name())), + ); + } + let remapped_pairs: Vec<(Expr, String)> = pairs + .iter() + .map(|(expr, alias)| { + Ok(( + replace_cols_by_name(expr.clone(), &replace_map)?, + alias.clone(), + )) + }) + .collect::>()?; + let remapped_columns: IndexSet = columns + .iter() + .filter_map(|col| { + let rewritten = + replace_cols_by_name(Expr::Column(col.clone()), &replace_map).ok()?; + if let Expr::Column(c) = rewritten { + Some(c) + } else { + Some(col.clone()) + } + }) + .collect(); + Ok(ExtractionTarget { + pairs: remapped_pairs, + columns: remapped_columns, + }) +} + +// ============================================================================= +// Helper Types & Functions for Extraction Targeting +// ============================================================================= + +/// A bundle of extraction pairs (expression + alias) and standalone columns +/// that need to be pushed through a plan node. +struct ExtractionTarget { + /// Extracted expressions paired with their generated aliases. + pairs: Vec<(Expr, String)>, + /// Standalone column references needed by the parent node. + columns: IndexSet, +} + +/// Build a replacement map from a projection: output_column_name -> underlying_expr. +/// +/// This is used to resolve column references through a renaming projection. +/// For example, if a projection has `user AS x`, this maps `x` -> `col("user")`. +fn build_projection_replace_map(projection: &Projection) -> HashMap { + projection + .schema + .iter() + .zip(projection.expr.iter()) + .map(|((qualifier, field), expr)| { + let key = Column::from((qualifier, field)).flat_name(); + (key, expr.clone().unalias()) + }) + .collect() +} + +/// Build a recovery projection to restore the original output schema. +/// +/// After extraction, a node's output schema may differ from the original: +/// +/// - **Schema-preserving nodes** (Filter/Sort/Limit): the extraction projection +/// below adds extra `__datafusion_extracted_N` columns that bubble up through +/// the node. Recovery selects only the original columns to hide the extras. +/// ```text +/// Original schema: [id, user] +/// After extraction: [__datafusion_extracted_1, id, user] ← extra column leaked through +/// Recovery: SELECT id, user FROM ... ← hides __datafusion_extracted_1 +/// ``` +/// +/// - **Schema-defining nodes** (Aggregate): same number of columns but names +/// may differ because extracted aliases replaced the original expressions. +/// Recovery maps positionally, aliasing where names changed. +/// ```text +/// Original: [SUM(user['balance'])] +/// After: [SUM(__datafusion_extracted_1)] ← name changed +/// Recovery: SUM(__datafusion_extracted_1) AS "SUM(user['balance'])" +/// ``` +/// +/// - **Schemas identical** → no recovery projection needed. +fn build_recovery_projection( + original_schema: &DFSchema, + input: LogicalPlan, +) -> Result { + let new_schema = input.schema(); + let orig_len = original_schema.fields().len(); + let new_len = new_schema.fields().len(); + + if orig_len == new_len { + // Same number of fields — check if schemas are identical + let schemas_match = original_schema.iter().zip(new_schema.iter()).all( + |((orig_q, orig_f), (new_q, new_f))| { + orig_f.name() == new_f.name() && orig_q == new_q + }, + ); + if schemas_match { + return Ok(input); + } + + // Schema-defining nodes (Aggregate, Join): names may differ at some + // positions because extracted aliases replaced the original expressions. + // Map positionally, aliasing where the name changed. + // + // Invariant: `with_new_exprs` on all supported node types (Aggregate, + // Filter, Sort, Limit, Join) preserves column order, so positional + // mapping is safe here. + debug_assert!( + orig_len == new_len, + "build_recovery_projection: positional mapping requires same field count, \ + got original={orig_len} vs new={new_len}" + ); + let mut proj_exprs = Vec::with_capacity(orig_len); + for (i, (orig_qualifier, orig_field)) in original_schema.iter().enumerate() { + let (new_qualifier, new_field) = new_schema.qualified_field(i); + if orig_field.name() == new_field.name() && orig_qualifier == new_qualifier { + proj_exprs.push(Expr::from((orig_qualifier, orig_field))); + } else { + let new_col = Expr::Column(Column::from((new_qualifier, new_field))); + proj_exprs.push( + new_col.alias_qualified(orig_qualifier.cloned(), orig_field.name()), + ); + } + } + let projection = Projection::try_new(proj_exprs, Arc::new(input))?; + Ok(LogicalPlan::Projection(projection)) + } else { + // Schema-preserving nodes: new schema has extra extraction columns. + // Original columns still exist by name; select them to hide extras. + let col_exprs: Vec = original_schema.iter().map(Expr::from).collect(); + let projection = Projection::try_new(col_exprs, Arc::new(input))?; + Ok(LogicalPlan::Projection(projection)) + } +} + +/// Collects `MoveTowardsLeafNodes` sub-expressions found during expression +/// tree traversal and can build an extraction projection from them. +/// +/// # Example +/// +/// Given `Filter: user['status'] = 'active' AND user['name'] IS NOT NULL`: +/// - `add_extracted(user['status'])` → stores it, returns `col("__datafusion_extracted_1")` +/// - `add_extracted(user['name'])` → stores it, returns `col("__datafusion_extracted_2")` +/// - `build_extraction_projection()` produces: +/// `Projection: user['status'] AS __datafusion_extracted_1, user['name'] AS __datafusion_extracted_2, ` +struct LeafExpressionExtractor<'a> { + /// Extracted expressions: maps expression -> alias + extracted: IndexMap, + /// Columns referenced by extracted expressions or the parent node, + /// included as pass-through in the extraction projection. + columns_needed: IndexSet, + /// Input schema + input_schema: &'a DFSchema, + /// Alias generator + alias_generator: &'a Arc, +} + +impl<'a> LeafExpressionExtractor<'a> { + fn new(input_schema: &'a DFSchema, alias_generator: &'a Arc) -> Self { + Self { + extracted: IndexMap::new(), + columns_needed: IndexSet::new(), + input_schema, + alias_generator, + } + } + + /// Adds an expression to extracted set, returns column reference. + fn add_extracted(&mut self, expr: Expr) -> Result { + // Deduplication: reuse existing alias if same expression + if let Some(alias) = self.extracted.get(&expr) { + return Ok(Expr::Column(Column::new_unqualified(alias))); + } + + // Track columns referenced by this expression + for col in expr.column_refs() { + self.columns_needed.insert(col.clone()); + } + + // Generate unique alias + let alias = self.alias_generator.next(EXTRACTED_EXPR_PREFIX); + self.extracted.insert(expr, alias.clone()); + + Ok(Expr::Column(Column::new_unqualified(&alias))) + } + + /// Builds an extraction projection above the given input, or merges into + /// it if the input is already a projection. Delegates to + /// [`build_extraction_projection_impl`]. + /// + /// Returns `None` if there are no extractions. + fn build_extraction_projection( + &self, + input: &Arc, + ) -> Result> { + if self.extracted.is_empty() { + return Ok(None); + } + let pairs: Vec<(Expr, String)> = self + .extracted + .iter() + .map(|(e, a)| (e.clone(), a.clone())) + .collect(); + let proj = build_extraction_projection_impl( + &pairs, + &self.columns_needed, + input, + self.input_schema, + )?; + Ok(Some(LogicalPlan::Projection(proj))) + } +} + +/// Build an extraction projection above the target node (shared by both passes). +/// +/// If the target is an existing projection, merges into it. This requires +/// resolving column references through the projection's rename mapping: +/// if the projection has `user AS u`, and an extracted expression references +/// `u['name']`, we must rewrite it to `user['name']` since the merged +/// projection reads from the same input as the original. +/// +/// Deduplicates by resolved expression equality and adds pass-through +/// columns as needed. Otherwise builds a fresh projection with extracted +/// expressions + ALL input schema columns. +fn build_extraction_projection_impl( + extracted_exprs: &[(Expr, String)], + columns_needed: &IndexSet, + target: &Arc, + target_schema: &DFSchema, +) -> Result { + if let LogicalPlan::Projection(existing) = target.as_ref() { + // Merge into existing projection + let mut proj_exprs = existing.expr.clone(); + + // Build a map of existing expressions (by Expr equality) to their aliases + let existing_extractions: IndexMap = existing + .expr + .iter() + .filter_map(|e| { + if let Expr::Alias(alias) = e + && alias.name.starts_with(EXTRACTED_EXPR_PREFIX) + { + return Some((*alias.expr.clone(), alias.name.clone())); + } + None + }) + .collect(); + + // Resolve column references through the projection's rename mapping + let replace_map = build_projection_replace_map(existing); + + // Add new extracted expressions, resolving column refs through the projection + for (expr, alias) in extracted_exprs { + let resolved = replace_cols_by_name(expr.clone().alias(alias), &replace_map)?; + let resolved_inner = if let Expr::Alias(a) = &resolved { + a.expr.as_ref() + } else { + &resolved + }; + if let Some(existing_alias) = existing_extractions.get(resolved_inner) { + // Same expression already extracted under a different alias — + // add the expression with the new alias so both names are + // available in the output. We can't reference the existing alias + // as a column within the same projection, so we duplicate the + // computation. + if existing_alias != alias { + proj_exprs.push(resolved); + } + } else { + proj_exprs.push(resolved); + } + } + + // Add any new pass-through columns that aren't already in the projection. + // We check against existing.input.schema() (the projection's source) rather + // than target_schema (the projection's output) because columns produced + // by alias expressions (e.g., CSE's __common_expr_N) exist in the output but + // not the input, and cannot be added as pass-through Column references. + let existing_cols: IndexSet = existing + .expr + .iter() + .filter_map(|e| { + if let Expr::Column(c) = e { + Some(c.clone()) + } else { + None + } + }) + .collect(); + + let input_schema = existing.input.schema(); + for col in columns_needed { + let col_expr = Expr::Column(col.clone()); + let resolved = replace_cols_by_name(col_expr, &replace_map)?; + if let Expr::Column(resolved_col) = &resolved + && !existing_cols.contains(resolved_col) + && input_schema.has_column(resolved_col) + { + proj_exprs.push(Expr::Column(resolved_col.clone())); + } + // If resolved to non-column expr, it's already computed by existing projection + } + + Projection::try_new(proj_exprs, Arc::clone(&existing.input)) + } else { + // Build new projection with extracted expressions + all input columns + let mut proj_exprs = Vec::new(); + for (expr, alias) in extracted_exprs { + proj_exprs.push(expr.clone().alias(alias)); + } + for (qualifier, field) in target_schema.iter() { + proj_exprs.push(Expr::from((qualifier, field))); + } + Projection::try_new(proj_exprs, Arc::clone(target)) } } @@ -120,9 +656,9 @@ impl OptimizerRule for ExtractLeafExpressions { /// /// After pass 1, the extraction projection sits directly below the filter: /// ```text -/// Projection: id, user <-- recovery -/// Filter: __extracted_1 = 'active' -/// Projection: user['status'] AS __extracted_1, id, user <-- extraction +/// Projection: id, user <-- recovery +/// Filter: __datafusion_extracted_1 = 'active' +/// Projection: user['status'] AS __datafusion_extracted_1, id, user <-- extraction /// TableScan: t [id, user] /// ``` /// @@ -130,8 +666,8 @@ impl OptimizerRule for ExtractLeafExpressions { /// and a subsequent `OptimizeProjections` pass removes the (now-redundant) /// recovery projection: /// ```text -/// Filter: __extracted_1 = 'active' -/// Projection: user['status'] AS __extracted_1, id, user <-- extraction (pushed down) +/// Filter: __datafusion_extracted_1 = 'active' +/// Projection: user['status'] AS __datafusion_extracted_1, id, user <-- extraction (pushed down) /// TableScan: t [id, user] /// ``` #[derive(Default, Debug)] @@ -155,12 +691,494 @@ impl OptimizerRule for PushDownLeafProjections { fn rewrite( &self, plan: LogicalPlan, - _config: &dyn OptimizerConfig, + config: &dyn OptimizerConfig, ) -> Result> { - Ok(Transformed::no(plan)) + let alias_generator = config.alias_generator(); + match try_push_input(&plan, alias_generator)? { + Some(new_plan) => Ok(Transformed::yes(new_plan)), + None => Ok(Transformed::no(plan)), + } + } +} + +/// Attempts to push a projection's extractable expressions further down. +/// +/// Returns `Some(new_subtree)` if the projection was pushed down or merged, +/// `None` if there is nothing to push or the projection sits above a barrier. +fn try_push_input( + input: &LogicalPlan, + alias_generator: &Arc, +) -> Result> { + let LogicalPlan::Projection(proj) = input else { + return Ok(None); + }; + split_and_push_projection(proj, alias_generator) +} + +/// Splits a projection into extractable pieces, pushes them towards leaf +/// nodes, and adds a recovery projection if needed. +/// +/// Handles both: +/// - **Pure extraction projections** (all `__datafusion_extracted` aliases + columns) +/// - **Mixed projections** (containing `MoveTowardsLeafNodes` sub-expressions) +/// +/// Returns `Some(new_subtree)` if extractions were pushed down, +/// `None` if there is nothing to extract or push. +/// +/// # Example: Mixed Projection +/// +/// ```text +/// Input plan: +/// Projection: user['name'] IS NOT NULL AS has_name, id +/// Filter: ... +/// TableScan +/// +/// Phase 1 (Split): +/// extraction_pairs: [(user['name'], "__datafusion_extracted_1")] +/// recovery_exprs: [__datafusion_extracted_1 IS NOT NULL AS has_name, id] +/// +/// Phase 2 (Push): +/// Push extraction projection through Filter toward TableScan +/// +/// Phase 3 (Recovery): +/// Projection: __datafusion_extracted_1 IS NOT NULL AS has_name, id <-- recovery +/// Filter: ... +/// Projection: user['name'] AS __datafusion_extracted_1, id <-- extraction (pushed) +/// TableScan +/// ``` +fn split_and_push_projection( + proj: &Projection, + alias_generator: &Arc, +) -> Result> { + let input = &proj.input; + let input_schema = input.schema(); + + // ── Phase 1: Split ────────────────────────────────────────────────── + // For each projection expression, collect extraction pairs and build + // recovery expressions. + // + // Pre-existing `__datafusion_extracted` aliases are inserted into the + // extractor's `IndexMap` with the **full** `Expr::Alias(…)` as the key, + // so the alias name participates in equality. This prevents collisions + // when CSE rewrites produce the same inner expression under different + // alias names (e.g. `__common_expr_4 AS __datafusion_extracted_1` and + // `__common_expr_4 AS __datafusion_extracted_3`). New extractions from + // `routing_extract` use bare (non-Alias) keys and get normal dedup. + // + // When building the final `extraction_pairs`, the Alias wrapper is + // stripped so consumers see the usual `(inner_expr, alias_name)` tuples. + + let mut extractors = vec![LeafExpressionExtractor::new( + input_schema.as_ref(), + alias_generator, + )]; + let input_column_sets = vec![schema_columns(input_schema.as_ref())]; + + let original_schema = proj.schema.as_ref(); + let mut recovery_exprs: Vec = Vec::with_capacity(proj.expr.len()); + let mut needs_recovery = false; + let mut has_new_extractions = false; + let mut proj_exprs_captured: usize = 0; + // Track standalone column expressions (Case B) to detect column refs + // from extracted aliases (Case A) that aren't also standalone expressions. + let mut standalone_columns: IndexSet = IndexSet::new(); + + for (expr, (qualifier, field)) in proj.expr.iter().zip(original_schema.iter()) { + if let Expr::Alias(alias) = expr + && alias.name.starts_with(EXTRACTED_EXPR_PREFIX) + { + // Insert the full Alias expression as the key so that + // distinct alias names don't collide in the IndexMap. + let alias_name = alias.name.clone(); + + for col_ref in alias.expr.column_refs() { + extractors[0].columns_needed.insert(col_ref.clone()); + } + + extractors[0] + .extracted + .insert(expr.clone(), alias_name.clone()); + recovery_exprs.push(Expr::Column(Column::new_unqualified(&alias_name))); + proj_exprs_captured += 1; + } else if let Expr::Column(col) = expr { + // Plain column pass-through — track it in the extractor + extractors[0].columns_needed.insert(col.clone()); + standalone_columns.insert(col.clone()); + recovery_exprs.push(expr.clone()); + proj_exprs_captured += 1; + } else { + // Everything else: run through routing_extract + let transformed = + routing_extract(expr.clone(), &mut extractors, &input_column_sets)?; + if transformed.transformed { + has_new_extractions = true; + } + let transformed_expr = transformed.data; + + // Build recovery expression, aliasing back to original name if needed + let original_name = field.name(); + let needs_alias = if let Expr::Column(col) = &transformed_expr { + col.name.as_str() != original_name + } else { + let expr_name = transformed_expr.schema_name().to_string(); + original_name != &expr_name + }; + let recovery_expr = if needs_alias { + needs_recovery = true; + transformed_expr + .clone() + .alias_qualified(qualifier.cloned(), original_name) + } else { + transformed_expr.clone() + }; + + // If the expression was transformed (i.e., has extracted sub-parts), + // it differs from what the pushed projection outputs → needs recovery. + // Also, any non-column, non-__datafusion_extracted expression needs recovery + // because the pushed extraction projection won't output it directly. + if transformed.transformed || !matches!(expr, Expr::Column(_)) { + needs_recovery = true; + } + + recovery_exprs.push(recovery_expr); + } + } + + // Build extraction_pairs, stripping the Alias wrapper from pre-existing + // entries (they used the full Alias as the map key to avoid dedup). + let extractor = &extractors[0]; + let extraction_pairs: Vec<(Expr, String)> = extractor + .extracted + .iter() + .map(|(e, a)| match e { + Expr::Alias(alias) => (*alias.expr.clone(), a.clone()), + _ => (e.clone(), a.clone()), + }) + .collect(); + let columns_needed = &extractor.columns_needed; + + // If no extractions found, nothing to do + if extraction_pairs.is_empty() { + return Ok(None); + } + + // If columns_needed has entries that aren't standalone projection columns + // (i.e., they came from column refs inside extracted aliases), a merge + // into an inner projection will widen the schema with those extra columns, + // requiring a recovery projection to restore the original schema. + if columns_needed + .iter() + .any(|c| !standalone_columns.contains(c)) + { + needs_recovery = true; + } + + // ── Phase 2: Push down ────────────────────────────────────────────── + let proj_input = Arc::clone(&proj.input); + let pushed = push_extraction_pairs( + &extraction_pairs, + columns_needed, + proj, + &proj_input, + alias_generator, + proj_exprs_captured, + )?; + + // ── Phase 3: Recovery ─────────────────────────────────────────────── + // Determine the base plan: either the pushed result or an in-place extraction. + let base_plan = match pushed { + Some(plan) => plan, + None => { + if !has_new_extractions { + // Only pre-existing __datafusion_extracted aliases and columns, no new + // extractions from routing_extract. The original projection is + // already an extraction projection that couldn't be pushed + // further. Return None. + return Ok(None); + } + // Build extraction projection in-place (couldn't push down) + let input_arc = Arc::clone(input); + let extraction = build_extraction_projection_impl( + &extraction_pairs, + columns_needed, + &input_arc, + input_schema.as_ref(), + )?; + LogicalPlan::Projection(extraction) + } + }; + + // Wrap with recovery projection if the output schema changed + if needs_recovery { + let recovery = LogicalPlan::Projection(Projection::try_new( + recovery_exprs, + Arc::new(base_plan), + )?); + Ok(Some(recovery)) + } else { + Ok(Some(base_plan)) + } +} + +/// Returns true if the plan is a Projection where ALL expressions are either +/// `Alias(EXTRACTED_EXPR_PREFIX, ...)` or `Column`, with at least one extraction. +/// Such projections can safely be pushed further without re-extraction. +fn is_pure_extraction_projection(plan: &LogicalPlan) -> bool { + let LogicalPlan::Projection(proj) = plan else { + return false; + }; + let mut has_extraction = false; + for expr in &proj.expr { + match expr { + Expr::Alias(alias) if alias.name.starts_with(EXTRACTED_EXPR_PREFIX) => { + has_extraction = true; + } + Expr::Column(_) => {} + _ => return false, + } + } + has_extraction +} + +/// Pushes extraction pairs down through the projection's input node, +/// dispatching to the appropriate handler based on the input node type. +fn push_extraction_pairs( + pairs: &[(Expr, String)], + columns_needed: &IndexSet, + proj: &Projection, + proj_input: &Arc, + alias_generator: &Arc, + proj_exprs_captured: usize, +) -> Result> { + match proj_input.as_ref() { + // Merge into existing projection, then try to push the result further down. + // Only merge when every expression in the outer projection is fully + // captured as either an extraction pair (Case A: __datafusion_extracted + // alias) or a plain column (Case B). Uncaptured expressions (e.g. + // `col AS __common_expr_1` from CSE, or complex expressions with + // extracted sub-parts) would be lost during the merge. + LogicalPlan::Projection(_) if proj_exprs_captured == proj.expr.len() => { + let target_schema = Arc::clone(proj_input.schema()); + let merged = build_extraction_projection_impl( + pairs, + columns_needed, + proj_input, + target_schema.as_ref(), + )?; + let merged_plan = LogicalPlan::Projection(merged); + + // After merging, try to push the result further down, but ONLY + // if the merged result is still a pure extraction projection + // (all __datafusion_extracted aliases + columns). If the merge inherited + // bare MoveTowardsLeafNodes expressions from the inner projection, + // pushing would re-extract them into new aliases and fail when + // the (None, true) fallback can't find the original aliases. + // This handles: Extraction → Recovery(cols) → Filter → ... → TableScan + // by pushing through the recovery projection AND the filter in one pass. + if is_pure_extraction_projection(&merged_plan) + && let Some(pushed) = try_push_input(&merged_plan, alias_generator)? + { + return Ok(Some(pushed)); + } + Ok(Some(merged_plan)) + } + // Generic: handles Filter/Sort/Limit (via recursion), + // SubqueryAlias (with qualifier remap in try_push_into_inputs), + // Join, and anything else. + // Safely bails out for nodes that don't pass through extracted + // columns (Aggregate, Window) via the output schema check. + _ => try_push_into_inputs( + pairs, + columns_needed, + proj_input.as_ref(), + alias_generator, + ), } } +/// Routes extraction pairs and columns to the appropriate inputs. +/// +/// - **Union**: broadcasts to every input via [`remap_pairs_and_columns`]. +/// - **Other nodes**: routes each expression to the one input that owns +/// all of its column references (via [`find_owning_input`]). +/// +/// Returns `None` if any expression can't be routed or no input has pairs. +fn route_to_inputs( + pairs: &[(Expr, String)], + columns: &IndexSet, + node: &LogicalPlan, + input_column_sets: &[std::collections::HashSet], + input_schemas: &[Arc], +) -> Result>> { + let num_inputs = input_schemas.len(); + let mut per_input: Vec = (0..num_inputs) + .map(|_| ExtractionTarget { + pairs: vec![], + columns: IndexSet::new(), + }) + .collect(); + + if matches!(node, LogicalPlan::Union(_)) { + // Union output schema and each input schema have the same fields by + // index but may differ in qualifiers (e.g. output `s` vs input + // `simple_struct.s`). Remap pairs/columns to each input's space. + let union_schema = node.schema(); + for (idx, input_schema) in input_schemas.iter().enumerate() { + per_input[idx] = + remap_pairs_and_columns(pairs, columns, union_schema, input_schema)?; + } + } else { + for (expr, alias) in pairs { + match find_owning_input(expr, input_column_sets) { + Some(idx) => per_input[idx].pairs.push((expr.clone(), alias.clone())), + None => return Ok(None), // Cross-input expression — bail out + } + } + for col in columns { + let col_expr = Expr::Column(col.clone()); + match find_owning_input(&col_expr, input_column_sets) { + Some(idx) => { + per_input[idx].columns.insert(col.clone()); + } + None => return Ok(None), // Ambiguous column — bail out + } + } + } + + // Check at least one input has extractions to push + if per_input.iter().all(|t| t.pairs.is_empty()) { + return Ok(None); + } + + Ok(Some(per_input)) +} + +/// Pushes extraction expressions into a node's inputs by routing each +/// expression to the input that owns all of its column references. +/// +/// Works for any number of inputs (1, 2, …N). For single-input nodes, +/// all expressions trivially route to that input. For multi-input nodes +/// (Join, etc.), each expression is routed to the side that owns its columns. +/// +/// Returns `Some(new_node)` if all expressions could be routed AND the +/// rebuilt node's output schema contains all extracted aliases. +/// Returns `None` if any expression references columns from multiple inputs +/// or the node doesn't pass through the extracted columns. +/// +/// # Example: Join with expressions from both sides +/// +/// ```text +/// Extraction projection above a Join: +/// Projection: left.user['name'] AS __datafusion_extracted_1, right.order['total'] AS __datafusion_extracted_2, ... +/// Join: left.id = right.user_id +/// TableScan: left [id, user] +/// TableScan: right [user_id, order] +/// +/// After routing each expression to its owning input: +/// Join: left.id = right.user_id +/// Projection: user['name'] AS __datafusion_extracted_1, id, user <-- left-side extraction +/// TableScan: left [id, user] +/// Projection: order['total'] AS __datafusion_extracted_2, user_id, order <-- right-side extraction +/// TableScan: right [user_id, order] +/// ``` +fn try_push_into_inputs( + pairs: &[(Expr, String)], + columns_needed: &IndexSet, + node: &LogicalPlan, + alias_generator: &Arc, +) -> Result> { + let inputs = node.inputs(); + if inputs.is_empty() { + return Ok(None); + } + + // SubqueryAlias remaps qualifiers between input and output. + // Rewrite pairs/columns from alias-space to input-space before routing. + let remapped = if let LogicalPlan::SubqueryAlias(sa) = node { + remap_pairs_and_columns(pairs, columns_needed, &sa.schema, sa.input.schema())? + } else { + ExtractionTarget { + pairs: pairs.to_vec(), + columns: columns_needed.clone(), + } + }; + let pairs = &remapped.pairs[..]; + let columns_needed = &remapped.columns; + + // Build per-input schemas and column sets for routing + let input_schemas: Vec> = + inputs.iter().map(|i| Arc::clone(i.schema())).collect(); + let input_column_sets: Vec> = + input_schemas.iter().map(|s| schema_columns(s)).collect(); + + // Route pairs and columns to the appropriate inputs + let per_input = match route_to_inputs( + pairs, + columns_needed, + node, + &input_column_sets, + &input_schemas, + )? { + Some(routed) => routed, + None => return Ok(None), + }; + + let num_inputs = inputs.len(); + + // Build per-input extraction projections and push them as far as possible + // immediately. This is critical because map_children preserves cached schemas, + // so if the TopDown pass later pushes a child further (changing its output + // schema), the parent node's schema becomes stale. + let mut new_inputs: Vec = Vec::with_capacity(num_inputs); + for (idx, input) in inputs.into_iter().enumerate() { + if per_input[idx].pairs.is_empty() { + new_inputs.push(input.clone()); + } else { + let input_arc = Arc::new(input.clone()); + let target_schema = Arc::clone(input.schema()); + let proj = build_extraction_projection_impl( + &per_input[idx].pairs, + &per_input[idx].columns, + &input_arc, + target_schema.as_ref(), + )?; + // Verify all requested aliases appear in the projection's output. + // A merge may deduplicate if the same expression already exists + // under a different alias, leaving the requested alias missing. + let proj_schema = proj.schema.as_ref(); + for (_expr, alias) in &per_input[idx].pairs { + if !proj_schema.fields().iter().any(|f| f.name() == alias) { + return Ok(None); + } + } + let proj_plan = LogicalPlan::Projection(proj); + // Try to push the extraction projection further down within + // this input (e.g., through Filter → existing extraction projection). + // This ensures the input's output schema is stable and won't change + // when the TopDown pass later visits children. + match try_push_input(&proj_plan, alias_generator)? { + Some(pushed) => new_inputs.push(pushed), + None => new_inputs.push(proj_plan), + } + } + } + + // Rebuild the node with new inputs + let new_node = node.with_new_exprs(node.expressions(), new_inputs)?; + + // Safety check: verify all extracted aliases appear in the rebuilt + // node's output schema. Nodes like Aggregate define their own output + // and won't pass through extracted columns — bail out for those. + let output_schema = new_node.schema(); + for (_expr, alias) in pairs { + if !output_schema.fields().iter().any(|f| f.name() == alias) { + return Ok(None); + } + } + + Ok(Some(new_node)) +} + #[cfg(test)] mod tests { use std::sync::Arc; @@ -283,13 +1301,20 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id + Projection: test.id, test.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown (same as after extraction) ## Optimized - (same as after pushdown) + Projection: test.id + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id + TableScan: test projection=[id, user] "#) } @@ -332,10 +1357,13 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")) + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: leaf_udf(test.user, Utf8("name")) + TableScan: test projection=[user] "#) } @@ -359,10 +1387,13 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 IS NOT NULL AS has_name + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: leaf_udf(test.user, Utf8("name")) IS NOT NULL AS has_name + TableScan: test projection=[user] "#) } @@ -408,7 +1439,10 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 IS NOT NULL AND __datafusion_extracted_1 IS NULL + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown (same as after extraction) @@ -431,7 +1465,10 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 = Utf8("test") + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown (same as after extraction) @@ -456,13 +1493,19 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("status")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## After Pushdown (same as after extraction) ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("status")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1 + TableScan: test projection=[user] "#) } @@ -484,7 +1527,10 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: test.user, COUNT(__datafusion_extracted_1) AS COUNT(leaf_udf(test.user,Utf8("value"))) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_1)]] + Projection: leaf_udf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## After Pushdown (same as after extraction) @@ -509,13 +1555,23 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: leaf_udf(test.user, Utf8("name")) + Projection: test.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2 + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2 + TableScan: test projection=[user] "#) } @@ -535,10 +1591,13 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS username + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: leaf_udf(test.user, Utf8("name")) AS username + TableScan: test projection=[user] "#) } @@ -560,10 +1619,17 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: test.user, leaf_udf(test.user, Utf8("label")) + Projection: test.user + Filter: __datafusion_extracted_1 > Int32(150) + Projection: leaf_udf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + Projection: test.user, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("label")) + Filter: __datafusion_extracted_1 > Int32(150) + Projection: leaf_udf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user, leaf_udf(test.user, Utf8("label")) AS __datafusion_extracted_2 + TableScan: test projection=[user] ## Optimized (same as after pushdown) @@ -587,10 +1653,13 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")), __datafusion_extracted_1 AS name2 + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: leaf_udf(test.user, Utf8("name")), leaf_udf(test.user, Utf8("name")) AS name2 + TableScan: test projection=[user] "#) } @@ -617,7 +1686,10 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")) + Sort: test.user ASC NULLS FIRST + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized (same as after pushdown) @@ -643,10 +1715,16 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")) + Limit: skip=0, fetch=10 + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")) + Limit: skip=0, fetch=10 + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1 + TableScan: test projection=[user] "#) } @@ -669,7 +1747,9 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_1) AS cnt]] + Projection: leaf_udf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## After Pushdown (same as after extraction) @@ -748,13 +1828,28 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 IS NOT NULL + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user + Projection: test.id, test.user + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 IS NOT NULL + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1 + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 IS NOT NULL + Projection: test.id, test.user, __datafusion_extracted_1 + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1 + TableScan: test projection=[id, user] "#) } @@ -776,10 +1871,13 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")) + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: leaf_udf(test.user, Utf8("name")) + TableScan: test projection=[user] "#) } @@ -848,14 +1946,69 @@ mod tests { Filter: leaf_udf(test.user, Utf8("status")) = Utf8("active") TableScan: test projection=[user] + ## After Extraction + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + Projection: test.user + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user + TableScan: test projection=[user] + + ## After Pushdown + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1 + TableScan: test projection=[user] + + ## Optimized + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] + Projection: __datafusion_extracted_1 + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1 + TableScan: test projection=[user] + "#) + } + + /// Projection containing a MoveTowardsLeafNodes sub-expression above an + /// Aggregate. Aggregate blocks pushdown, so the (None, true) recovery + /// fallback path fires: in-place extraction + recovery projection. + #[test] + fn test_projection_with_leaf_expr_above_aggregate() -> Result<()> { + use datafusion_expr::test::function_stub::count; + + let table_scan = test_table_scan_with_struct()?; + let plan = LogicalPlanBuilder::from(table_scan) + .aggregate(vec![col("user")], vec![count(lit(1))])? + .project(vec![ + leaf_udf(col("user"), "name") + .is_not_null() + .alias("has_name"), + col("COUNT(Int32(1))"), + ])? + .build()?; + + assert_stages!(plan, @r#" + ## Original Plan + Projection: leaf_udf(test.user, Utf8("name")) IS NOT NULL AS has_name, COUNT(Int32(1)) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(Int32(1))]] + TableScan: test projection=[user] + ## After Extraction (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 IS NOT NULL AS has_name, COUNT(Int32(1)) + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user, COUNT(Int32(1)) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(Int32(1))]] + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: leaf_udf(test.user, Utf8("name")) IS NOT NULL AS has_name, COUNT(Int32(1)) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(Int32(1))]] + TableScan: test projection=[user] "#) } @@ -875,13 +2028,28 @@ mod tests { TableScan: test projection=[a, b, c] ## After Extraction - (same as original) + Projection: test.a, test.b, test.c + Filter: __datafusion_extracted_1 = Int32(2) + Projection: leaf_udf(test.b, Utf8("y")) AS __datafusion_extracted_1, test.a, test.b, test.c + Projection: test.a, test.b, test.c + Filter: __datafusion_extracted_2 = Int32(1) + Projection: leaf_udf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c + TableScan: test projection=[a, b, c] ## After Pushdown - (same as after extraction) + Projection: test.a, test.b, test.c + Filter: __datafusion_extracted_1 = Int32(2) + Filter: __datafusion_extracted_2 = Int32(1) + Projection: leaf_udf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c, leaf_udf(test.b, Utf8("y")) AS __datafusion_extracted_1 + TableScan: test projection=[a, b, c] ## Optimized - (same as after pushdown) + Projection: test.a, test.b, test.c + Filter: __datafusion_extracted_1 = Int32(2) + Projection: test.a, test.b, test.c, __datafusion_extracted_1 + Filter: __datafusion_extracted_2 = Int32(1) + Projection: leaf_udf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c, leaf_udf(test.b, Utf8("y")) AS __datafusion_extracted_1 + TableScan: test projection=[a, b, c] "#) } @@ -923,7 +2091,12 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user, right.id, right.user + Inner Join: __datafusion_extracted_1 = __datafusion_extracted_2 + Projection: leaf_udf(test.user, Utf8("id")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] + Projection: leaf_udf(right.user, Utf8("id")) AS __datafusion_extracted_2, right.id, right.user + TableScan: right projection=[id, user] ## After Pushdown (same as after extraction) @@ -959,7 +2132,11 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user, right.id, right.user + Inner Join: Filter: test.user = right.user AND __datafusion_extracted_1 = Utf8("active") + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] + TableScan: right projection=[id, user] ## After Pushdown (same as after extraction) @@ -996,7 +2173,12 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user, right.id, right.user + Inner Join: Filter: test.user = right.user AND __datafusion_extracted_1 = Utf8("active") AND __datafusion_extracted_2 = Utf8("admin") + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] + Projection: leaf_udf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id, right.user + TableScan: right projection=[id, user] ## After Pushdown (same as after extraction) @@ -1064,13 +2246,34 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user, right.id, right.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, right.id, right.user + Projection: test.id, test.user, right.id, right.user + Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3 + Projection: leaf_udf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, user] + Projection: leaf_udf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user + TableScan: right projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, test.user, right.id, right.user + Filter: __datafusion_extracted_1 = Utf8("active") + Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3 + Projection: leaf_udf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1 + TableScan: test projection=[id, user] + Projection: leaf_udf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user + TableScan: right projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, test.user, right.id, right.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: test.id, test.user, __datafusion_extracted_1, right.id, right.user + Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3 + Projection: leaf_udf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1 + TableScan: test projection=[id, user] + Projection: leaf_udf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user + TableScan: right projection=[id, user] "#) } @@ -1101,6 +2304,58 @@ mod tests { ## After Extraction (same as original) + ## After Pushdown + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("status")), __datafusion_extracted_2 AS leaf_udf(right.user,Utf8("role")) + Inner Join: test.id = right.id + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] + Projection: leaf_udf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id, right.user + TableScan: right projection=[id, user] + + ## Optimized + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("status")), __datafusion_extracted_2 AS leaf_udf(right.user,Utf8("role")) + Inner Join: test.id = right.id + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id + TableScan: test projection=[id, user] + Projection: leaf_udf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id + TableScan: right projection=[id, user] + "#) + } + + /// Join where both sides have same-named columns: a qualified reference + /// to the right side must be routed to the right input, not the left. + #[test] + fn test_extract_from_join_qualified_right_side() -> Result<()> { + use datafusion_expr::JoinType; + + let left = test_table_scan_with_struct()?; + let right = test_table_scan_with_struct_named("right")?; + + // Filter references right.user explicitly — must route to right side + let plan = LogicalPlanBuilder::from(left) + .join_on( + right, + JoinType::Inner, + vec![ + col("test.id").eq(col("right.id")), + leaf_udf(col("right.user"), "status").eq(lit("active")), + ], + )? + .build()?; + + assert_stages!(plan, @r#" + ## Original Plan + Inner Join: Filter: test.id = right.id AND leaf_udf(right.user, Utf8("status")) = Utf8("active") + TableScan: test projection=[id, user] + TableScan: right projection=[id, user] + + ## After Extraction + Projection: test.id, test.user, right.id, right.user + Inner Join: Filter: test.id = right.id AND __datafusion_extracted_1 = Utf8("active") + TableScan: test projection=[id, user] + Projection: leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, right.user + TableScan: right projection=[id, user] + ## After Pushdown (same as after extraction) @@ -1109,6 +2364,99 @@ mod tests { "#) } + /// When both inputs contain the same unqualified column, an unqualified + /// column reference is ambiguous and `find_owning_input` must return + /// `None` rather than always returning 0 (the left side). + #[test] + fn test_find_owning_input_ambiguous_unqualified_column() { + use std::collections::HashSet; + + // Simulate schema_columns output for two sides of a join where both + // have a "user" column — each set contains the qualified and + // unqualified form. + let left_cols: HashSet = [ + Column::new(Some("test"), "user"), + Column::new_unqualified("user"), + ] + .into_iter() + .collect(); + + let right_cols: HashSet = [ + Column::new(Some("right"), "user"), + Column::new_unqualified("user"), + ] + .into_iter() + .collect(); + + let input_column_sets = vec![left_cols, right_cols]; + + // Unqualified "user" matches both sets — must return None (ambiguous) + let unqualified = Expr::Column(Column::new_unqualified("user")); + assert_eq!(find_owning_input(&unqualified, &input_column_sets), None); + + // Qualified "right.user" matches only the right set — must return Some(1) + let qualified_right = Expr::Column(Column::new(Some("right"), "user")); + assert_eq!( + find_owning_input(&qualified_right, &input_column_sets), + Some(1) + ); + + // Qualified "test.user" matches only the left set — must return Some(0) + let qualified_left = Expr::Column(Column::new(Some("test"), "user")); + assert_eq!( + find_owning_input(&qualified_left, &input_column_sets), + Some(0) + ); + } + + /// Two leaf_udf expressions from different sides of a Join in a Filter. + /// Each is routed to its respective input side independently. + #[test] + fn test_extract_from_join_cross_input_expression() -> Result<()> { + let left = test_table_scan_with_struct()?; + let right = test_table_scan_with_struct_named("right")?; + + let plan = LogicalPlanBuilder::from(left) + .join_on( + right, + datafusion_expr::JoinType::Inner, + vec![col("test.id").eq(col("right.id"))], + )? + .filter( + leaf_udf(col("test.user"), "status") + .eq(leaf_udf(col("right.user"), "status")), + )? + .build()?; + + assert_stages!(plan, @r#" + ## Original Plan + Filter: leaf_udf(test.user, Utf8("status")) = leaf_udf(right.user, Utf8("status")) + Inner Join: Filter: test.id = right.id + TableScan: test projection=[id, user] + TableScan: right projection=[id, user] + + ## After Extraction + Projection: test.id, test.user, right.id, right.user + Filter: __datafusion_extracted_1 = __datafusion_extracted_2 + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, right.id, right.user + Inner Join: Filter: test.id = right.id + TableScan: test projection=[id, user] + TableScan: right projection=[id, user] + + ## After Pushdown + Projection: test.id, test.user, right.id, right.user + Filter: __datafusion_extracted_1 = __datafusion_extracted_2 + Inner Join: Filter: test.id = right.id + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] + Projection: leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_2, right.id, right.user + TableScan: right projection=[id, user] + + ## Optimized + (same as after pushdown) + "#) + } + // ========================================================================= // Column-rename through intermediate node tests // ========================================================================= @@ -1134,10 +2482,16 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS leaf_udf(x,Utf8("a")) + Filter: x IS NOT NULL + Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_1 AS leaf_udf(x,Utf8("a")) + Filter: x IS NOT NULL + Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1 + TableScan: test projection=[user] "#) } @@ -1162,10 +2516,16 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 IS NOT NULL AS leaf_udf(x,Utf8("a")) IS NOT NULL + Filter: x IS NOT NULL + Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_1 IS NOT NULL AS leaf_udf(x,Utf8("a")) IS NOT NULL + Filter: x IS NOT NULL + Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1 + TableScan: test projection=[user] "#) } @@ -1185,13 +2545,19 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: x + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## After Pushdown (same as after extraction) ## Optimized - (same as after pushdown) + Projection: x + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1 + TableScan: test projection=[user] "#) } @@ -1218,10 +2584,16 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS leaf_udf(sub.user,Utf8("name")) + SubqueryAlias: sub + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_1 AS leaf_udf(sub.user,Utf8("name")) + SubqueryAlias: sub + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1 + TableScan: test projection=[user] "#) } @@ -1243,13 +2615,26 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: leaf_udf(sub.user, Utf8("name")) + Projection: sub.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: leaf_udf(sub.user, Utf8("status")) AS __datafusion_extracted_1, sub.user + SubqueryAlias: sub + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_2 AS leaf_udf(sub.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") + SubqueryAlias: sub + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_2 AS leaf_udf(sub.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") + SubqueryAlias: sub + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2 + TableScan: test projection=[user] "#) } @@ -1274,10 +2659,18 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS leaf_udf(outer_sub.user,Utf8("name")) + SubqueryAlias: outer_sub + SubqueryAlias: inner_sub + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_1 AS leaf_udf(outer_sub.user,Utf8("name")) + SubqueryAlias: outer_sub + SubqueryAlias: inner_sub + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1 + TableScan: test projection=[user] "#) } @@ -1360,13 +2753,20 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id + Projection: test.id, test.user + Filter: __datafusion_extracted_1 = Utf8("a") AND __datafusion_extracted_2 = Utf8("b") + Projection: leaf_udf(test.user, Utf8("field")) AS __datafusion_extracted_1, leaf_udf(test.user, Utf8("field")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown (same as after extraction) ## Optimized - (same as after pushdown) + Projection: test.id + Filter: __datafusion_extracted_1 = Utf8("a") AND __datafusion_extracted_2 = Utf8("b") + Projection: leaf_udf(test.user, Utf8("field")) AS __datafusion_extracted_1, leaf_udf(test.user, Utf8("field")) AS __datafusion_extracted_2, test.id + TableScan: test projection=[id, user] "#) } @@ -1375,7 +2775,7 @@ mod tests { // ========================================================================= /// Extraction pushdown through a filter that already had its own - /// `mock_leaf` extracted. + /// `leaf_udf` extracted. #[test] fn test_extraction_pushdown_through_filter_with_extracted_predicate() -> Result<()> { let table_scan = test_table_scan_with_struct()?; @@ -1391,13 +2791,23 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, leaf_udf(test.user, Utf8("name")) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2 + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2 + TableScan: test projection=[id, user] "#) } @@ -1418,18 +2828,28 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, leaf_udf(test.user, Utf8("status")) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 > Int32(5) + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("status")) + Filter: __datafusion_extracted_1 > Int32(5) + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2 + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("status")) + Filter: __datafusion_extracted_1 > Int32(5) + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2 + TableScan: test projection=[id, user] "#) } - /// Left join with a `mock_leaf` filter on the right side AND - /// the projection also selects `mock_leaf` from the right side. + /// Left join with a `leaf_udf` filter on the right side AND + /// the projection also selects `leaf_udf` from the right side. #[test] fn test_left_join_with_filter_and_projection_extraction() -> Result<()> { use datafusion_expr::JoinType; @@ -1461,13 +2881,28 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, leaf_udf(test.user, Utf8("name")), leaf_udf(right.user, Utf8("status")) + Projection: test.id, test.user, right.id, right.user + Left Join: Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5) + TableScan: test projection=[id, user] + Projection: leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, right.user + TableScan: right projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")), __datafusion_extracted_3 AS leaf_udf(right.user,Utf8("status")) + Left Join: Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5) + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, user] + Projection: leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, right.user, leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_3 + TableScan: right projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")), __datafusion_extracted_3 AS leaf_udf(right.user,Utf8("status")) + Left Join: Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5) + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id + TableScan: test projection=[id, user] + Projection: leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_3 + TableScan: right projection=[id, user] "#) } @@ -1492,13 +2927,62 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, leaf_udf(test.user, Utf8("name")), leaf_udf(test.user, Utf8("status")) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 > Int32(5) + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")), __datafusion_extracted_3 AS leaf_udf(test.user,Utf8("status")) + Filter: __datafusion_extracted_1 > Int32(5) + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_3 + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")), __datafusion_extracted_3 AS leaf_udf(test.user,Utf8("status")) + Filter: __datafusion_extracted_1 > Int32(5) + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_3 + TableScan: test projection=[id, user] "#) } + + /// When an extraction projection's __extracted alias references a column + /// (e.g. `user`) that is NOT a standalone expression in the projection, + /// the merge into the inner projection should still succeed. + #[test] + fn test_merge_extraction_into_projection_with_column_ref_inflation() -> Result<()> { + let table_scan = test_table_scan_with_struct()?; + + // Inner projection (simulates a trimmed projection) + let inner = LogicalPlanBuilder::from(table_scan) + .project(vec![col("user"), col("id")])? + .build()?; + + // Outer projection: __extracted alias + id (but NOT user as standalone). + // The alias references `user` internally, inflating columns_needed. + let plan = LogicalPlanBuilder::from(inner) + .project(vec![ + leaf_udf(col("user"), "status") + .alias(format!("{EXTRACTED_EXPR_PREFIX}_1")), + col("id"), + ])? + .build()?; + + // Run only PushDownLeafProjections + let ctx = OptimizerContext::new().with_max_passes(1); + let optimizer = + Optimizer::with_rules(vec![Arc::new(PushDownLeafProjections::new())]); + let result = optimizer.optimize(plan, &ctx, |_, _| {})?; + + // With the fix: merge succeeds → extraction merged into inner projection. + // Without the fix: merge rejected → two separate projections remain. + insta::assert_snapshot!(format!("{result}"), @r#" + Projection: __datafusion_extracted_1, test.id + Projection: test.user, test.id, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1 + TableScan: test + "#); + + Ok(()) + } } diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index 9cccb20bcc45e..93df300bb50b4 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -559,7 +559,19 @@ fn merge_consecutive_projections(proj: Projection) -> Result rewrite_expr(*expr, &prev_projection).map(|result| { result.update_data(|expr| { - Expr::Alias(Alias::new(expr, relation, name).with_metadata(metadata)) + // After substitution, the inner expression may now have the + // same schema_name as the alias (e.g. when an extraction + // alias like `__extracted_1 AS f(x)` is resolved back to + // `f(x)`). Wrapping in a redundant self-alias causes a + // cosmetic `f(x) AS f(x)` due to Display vs schema_name + // formatting differences. Drop the alias when it matches. + if metadata.is_none() && expr.schema_name().to_string() == name { + expr + } else { + Expr::Alias( + Alias::new(expr, relation, name).with_metadata(metadata), + ) + } }) }), e => rewrite_expr(e, &prev_projection), diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index 877a84fe4dc14..118ddef49b7e7 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -43,6 +43,7 @@ use crate::eliminate_join::EliminateJoin; use crate::eliminate_limit::EliminateLimit; use crate::eliminate_outer_join::EliminateOuterJoin; use crate::extract_equijoin_predicate::ExtractEquijoinPredicate; +use crate::extract_leaf_expressions::{ExtractLeafExpressions, PushDownLeafProjections}; use crate::filter_null_join_keys::FilterNullJoinKeys; use crate::optimize_projections::OptimizeProjections; use crate::optimize_unions::OptimizeUnions; @@ -260,6 +261,8 @@ impl Optimizer { // that might benefit from the following rules Arc::new(EliminateGroupByConstant::new()), Arc::new(CommonSubexprEliminate::new()), + Arc::new(ExtractLeafExpressions::new()), + Arc::new(PushDownLeafProjections::new()), Arc::new(OptimizeProjections::new()), ]; diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt index 6f615ec391c9e..c5907d497500e 100644 --- a/datafusion/sqllogictest/test_files/explain.slt +++ b/datafusion/sqllogictest/test_files/explain.slt @@ -197,6 +197,8 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE +logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE +logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE logical_plan after optimize_projections TableScan: simple_explain_test projection=[a, b, c] logical_plan after rewrite_set_comparison SAME TEXT AS ABOVE logical_plan after optimize_unions SAME TEXT AS ABOVE @@ -219,6 +221,8 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE +logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE +logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE logical_plan after optimize_projections SAME TEXT AS ABOVE logical_plan TableScan: simple_explain_test projection=[a, b, c] initial_physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true @@ -558,6 +562,8 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE +logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE +logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE logical_plan after optimize_projections TableScan: simple_explain_test projection=[a, b, c] logical_plan after rewrite_set_comparison SAME TEXT AS ABOVE logical_plan after optimize_unions SAME TEXT AS ABOVE @@ -580,6 +586,8 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE +logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE +logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE logical_plan after optimize_projections SAME TEXT AS ABOVE logical_plan TableScan: simple_explain_test projection=[a, b, c] initial_physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true diff --git a/datafusion/sqllogictest/test_files/insert.slt b/datafusion/sqllogictest/test_files/insert.slt index 8ef2596f18e33..e7b9e77dfef58 100644 --- a/datafusion/sqllogictest/test_files/insert.slt +++ b/datafusion/sqllogictest/test_files/insert.slt @@ -165,7 +165,7 @@ ORDER BY c1 ---- logical_plan 01)Dml: op=[Insert Into] table=[table_without_values] -02)--Projection: a1 AS a1, a2 AS a2 +02)--Projection: a1, a2 03)----Sort: aggregate_test_100.c1 ASC NULLS LAST 04)------Projection: sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a1, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a2, aggregate_test_100.c1 05)--------WindowAggr: windowExpr=[[sum(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] diff --git a/datafusion/sqllogictest/test_files/projection_pushdown.slt b/datafusion/sqllogictest/test_files/projection_pushdown.slt index 50e26b2fb0b85..c25b80a0d7f20 100644 --- a/datafusion/sqllogictest/test_files/projection_pushdown.slt +++ b/datafusion/sqllogictest/test_files/projection_pushdown.slt @@ -235,13 +235,14 @@ query TT EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +01)Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value] 02)--Filter: simple_struct.id > Int64(2) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as simple_struct.s[value]] -02)--FilterExec: id@0 > 2 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]] +02)--FilterExec: id@1 > 2 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query II @@ -259,13 +260,14 @@ query TT EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +01)Projection: simple_struct.id, __datafusion_extracted_1 + Int64(1) AS simple_struct.s[value] + Int64(1) 02)--Filter: simple_struct.id > Int64(2) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)] -02)--FilterExec: id@0 > 2 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 + 1 as simple_struct.s[value] + Int64(1)] +02)--FilterExec: id@1 > 2 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query II @@ -283,13 +285,14 @@ query TT EXPLAIN SELECT id, s['label'] FROM simple_struct WHERE s['value'] > 150; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) -02)--Filter: get_field(simple_struct.s, Utf8("value")) > Int64(150) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] +01)Projection: simple_struct.id, __datafusion_extracted_2 AS simple_struct.s[label] +02)--Filter: __datafusion_extracted_1 > Int64(150) +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2 +04)------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as simple_struct.s[label]] -02)--FilterExec: get_field(s@1, value) > 150 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +01)ProjectionExec: expr=[id@0 as id, __datafusion_extracted_2@1 as simple_struct.s[label]] +02)--FilterExec: __datafusion_extracted_1@0 > 150, projection=[id@1, __datafusion_extracted_2@2] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id, get_field(s@1, label) as __datafusion_extracted_2], file_type=parquet # Verify correctness query IT @@ -556,14 +559,15 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value' ---- logical_plan 01)Sort: simple_struct.s[value] ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value] 03)----Filter: simple_struct.id > Int64(1) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan 01)SortExec: expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] -02)--ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as simple_struct.s[value]] -03)----FilterExec: id@0 > 1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +02)--ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]] +03)----FilterExec: id@1 > 1 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query II @@ -583,14 +587,15 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value' ---- logical_plan 01)Sort: simple_struct.s[value] ASC NULLS LAST, fetch=2 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value] 03)----Filter: simple_struct.id > Int64(1) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan 01)SortExec: TopK(fetch=2), expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] -02)--ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as simple_struct.s[value]] -03)----FilterExec: id@0 > 1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +02)--ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]] +03)----FilterExec: id@1 > 1 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query II @@ -608,14 +613,15 @@ EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 1 ORDER BY id LI ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=2 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +02)--Projection: simple_struct.id, __datafusion_extracted_1 + Int64(1) AS simple_struct.s[value] + Int64(1) 03)----Filter: simple_struct.id > Int64(1) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] -02)--ProjectionExec: expr=[id@0 as id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)] -03)----FilterExec: id@0 > 1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1 AND DynamicFilter [ empty ], pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +02)--ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 + 1 as simple_struct.s[value] + Int64(1)] +03)----FilterExec: id@1 > 1 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND DynamicFilter [ empty ], pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query II @@ -747,16 +753,17 @@ EXPLAIN SELECT id, s['value'] FROM multi_struct WHERE id > 2 ORDER BY id; ---- logical_plan 01)Sort: multi_struct.id ASC NULLS LAST -02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) +02)--Projection: multi_struct.id, __datafusion_extracted_1 AS multi_struct.s[value] 03)----Filter: multi_struct.id > Int64(2) -04)------TableScan: multi_struct projection=[id, s], partial_filters=[multi_struct.id > Int64(2)] +04)------Projection: get_field(multi_struct.s, Utf8("value")) AS __datafusion_extracted_1, multi_struct.id +05)--------TableScan: multi_struct projection=[id, s], partial_filters=[multi_struct.id > Int64(2)] physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST] 02)--SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as multi_struct.s[value]] -04)------FilterExec: id@0 > 2 +03)----ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as multi_struct.s[value]] +04)------FilterExec: id@1 > 2 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=3 -06)----------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +06)----------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query II @@ -774,13 +781,16 @@ query TT EXPLAIN SELECT s['label'], SUM(s['value']) FROM multi_struct GROUP BY s['label']; ---- logical_plan -01)Aggregate: groupBy=[[get_field(multi_struct.s, Utf8("label"))]], aggr=[[sum(get_field(multi_struct.s, Utf8("value")))]] -02)--TableScan: multi_struct projection=[s] +01)Projection: __datafusion_extracted_1 AS multi_struct.s[label], sum(__datafusion_extracted_2) AS sum(multi_struct.s[value]) +02)--Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[sum(__datafusion_extracted_2)]] +03)----Projection: get_field(multi_struct.s, Utf8("label")) AS __datafusion_extracted_1, get_field(multi_struct.s, Utf8("value")) AS __datafusion_extracted_2 +04)------TableScan: multi_struct projection=[s] physical_plan -01)AggregateExec: mode=FinalPartitioned, gby=[multi_struct.s[label]@0 as multi_struct.s[label]], aggr=[sum(multi_struct.s[value])] -02)--RepartitionExec: partitioning=Hash([multi_struct.s[label]@0], 4), input_partitions=3 -03)----AggregateExec: mode=Partial, gby=[get_field(s@0, label) as multi_struct.s[label]], aggr=[sum(multi_struct.s[value])] -04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[s], file_type=parquet +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as multi_struct.s[label], sum(__datafusion_extracted_2)@1 as sum(multi_struct.s[value])] +02)--AggregateExec: mode=FinalPartitioned, gby=[__datafusion_extracted_1@0 as __datafusion_extracted_1], aggr=[sum(__datafusion_extracted_2)] +03)----RepartitionExec: partitioning=Hash([__datafusion_extracted_1@0], 4), input_partitions=3 +04)------AggregateExec: mode=Partial, gby=[__datafusion_extracted_1@0 as __datafusion_extracted_1], aggr=[sum(__datafusion_extracted_2)] +05)--------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[get_field(s@1, label) as __datafusion_extracted_1, get_field(s@1, value) as __datafusion_extracted_2], file_type=parquet # Verify correctness query TI @@ -831,13 +841,14 @@ query TT EXPLAIN SELECT id, s['label'] FROM nullable_struct WHERE s['value'] IS NOT NULL; ---- logical_plan -01)Projection: nullable_struct.id, get_field(nullable_struct.s, Utf8("label")) -02)--Filter: get_field(nullable_struct.s, Utf8("value")) IS NOT NULL -03)----TableScan: nullable_struct projection=[id, s], partial_filters=[get_field(nullable_struct.s, Utf8("value")) IS NOT NULL] +01)Projection: nullable_struct.id, __datafusion_extracted_2 AS nullable_struct.s[label] +02)--Filter: __datafusion_extracted_1 IS NOT NULL +03)----Projection: get_field(nullable_struct.s, Utf8("value")) AS __datafusion_extracted_1, nullable_struct.id, get_field(nullable_struct.s, Utf8("label")) AS __datafusion_extracted_2 +04)------TableScan: nullable_struct projection=[id, s], partial_filters=[get_field(nullable_struct.s, Utf8("value")) IS NOT NULL] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as nullable_struct.s[label]] -02)--FilterExec: get_field(s@1, value) IS NOT NULL -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[id, s], file_type=parquet +01)ProjectionExec: expr=[id@0 as id, __datafusion_extracted_2@1 as nullable_struct.s[label]] +02)--FilterExec: __datafusion_extracted_1@0 IS NOT NULL, projection=[id@1, __datafusion_extracted_2@2] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id, get_field(s@1, label) as __datafusion_extracted_2], file_type=parquet # Verify correctness query IT @@ -954,27 +965,29 @@ EXPLAIN SELECT (id + s['value']) * (id + s['value']) as id_and_value FROM simple ---- logical_plan 01)Projection: __common_expr_1 * __common_expr_1 AS id_and_value -02)--Projection: simple_struct.id + get_field(simple_struct.s, Utf8("value")) AS __common_expr_1 +02)--Projection: simple_struct.id + __datafusion_extracted_2 AS __common_expr_1 03)----Filter: simple_struct.id > Int64(2) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_2, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan 01)ProjectionExec: expr=[__common_expr_1@0 * __common_expr_1@0 as id_and_value] -02)--ProjectionExec: expr=[id@0 + get_field(s@1, value) as __common_expr_1] -03)----FilterExec: id@0 > 2 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +02)--ProjectionExec: expr=[id@1 + __datafusion_extracted_2@0 as __common_expr_1] +03)----FilterExec: id@1 > 2 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] query TT EXPLAIN SELECT s['value'] + s['value'] as doubled FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) + get_field(simple_struct.s, Utf8("value")) AS doubled +01)Projection: __datafusion_extracted_1 + __datafusion_extracted_1 AS doubled 02)--Filter: simple_struct.id > Int64(2) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) + get_field(s@0, value) as doubled] -02)--FilterExec: id@0 > 2, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 + __datafusion_extracted_1@0 as doubled] +02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query I @@ -992,13 +1005,14 @@ query TT EXPLAIN SELECT s['value'], s['label'] FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) +01)Projection: __datafusion_extracted_1 AS simple_struct.s[value], __datafusion_extracted_2 AS simple_struct.s[label] 02)--Filter: simple_struct.id > Int64(2) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value], get_field(s@0, label) as simple_struct.s[label]] -02)--FilterExec: id@0 > 2, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value], __datafusion_extracted_2@1 as simple_struct.s[label]] +02)--FilterExec: id@2 > 2, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query IT @@ -1041,13 +1055,14 @@ query TT EXPLAIN SELECT s['value'] * 2 + length(s['label']) as score FROM simple_struct WHERE id > 1; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) * Int64(2) + CAST(character_length(get_field(simple_struct.s, Utf8("label"))) AS length(get_field(simple_struct.s, Utf8("label"))) AS Int64) AS score +01)Projection: __datafusion_extracted_1 * Int64(2) + CAST(character_length(__datafusion_extracted_2) AS Int64) AS score 02)--Filter: simple_struct.id > Int64(1) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) * 2 + CAST(character_length(get_field(s@0, label)) AS Int64) as score] -02)--FilterExec: id@0 > 1, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 * 2 + CAST(character_length(__datafusion_extracted_2@1) AS Int64) as score] +02)--FilterExec: id@2 > 1, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query I @@ -1117,13 +1132,14 @@ query TT EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +01)Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value] 02)--Filter: simple_struct.id > Int64(1) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as simple_struct.s[value]] -02)--FilterExec: id@0 > 1 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]] +02)--FilterExec: id@1 > 1 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query II @@ -1136,13 +1152,14 @@ query TT EXPLAIN SELECT s['value'] FROM simple_struct WHERE id > 1 AND (id < 4 OR id = 5); ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS simple_struct.s[value] 02)--Filter: simple_struct.id > Int64(1) AND (simple_struct.id < Int64(4) OR simple_struct.id = Int64(5)) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(4) OR simple_struct.id = Int64(5)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(4) OR simple_struct.id = Int64(5)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value]] -02)--FilterExec: id@0 > 1 AND (id@0 < 4 OR id@0 = 5), projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1 AND (id@0 < 4 OR id@0 = 5), pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND (id_null_count@1 != row_count@2 AND id_min@3 < 4 OR id_null_count@1 != row_count@2 AND id_min@3 <= 5 AND 5 <= id_max@0), required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value]] +02)--FilterExec: id@1 > 1 AND (id@1 < 4 OR id@1 = 5), projection=[__datafusion_extracted_1@0] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND (id@0 < 4 OR id@0 = 5), pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND (id_null_count@1 != row_count@2 AND id_min@3 < 4 OR id_null_count@1 != row_count@2 AND id_min@3 <= 5 AND 5 <= id_max@0), required_guarantees=[] # Verify correctness - should return rows where (id > 1) AND ((id < 4) OR (id = 5)) # That's: id=2,3 (1 1 AND id < 5; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS simple_struct.s[value] 02)--Filter: simple_struct.id > Int64(1) AND simple_struct.id < Int64(5) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(5)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(5)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value]] -02)--FilterExec: id@0 > 1 AND id@0 < 5, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1 AND id@0 < 5, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND id_null_count@1 != row_count@2 AND id_min@3 < 5, required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value]] +02)--FilterExec: id@1 > 1 AND id@1 < 5, projection=[__datafusion_extracted_1@0] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND id@0 < 5, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND id_null_count@1 != row_count@2 AND id_min@3 < 5, required_guarantees=[] # Verify correctness - should return rows where 1 < id < 5 (id=2,3,4) query I @@ -1177,13 +1195,14 @@ query TT EXPLAIN SELECT s['value'], s['label'], id FROM simple_struct WHERE id > 1; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")), simple_struct.id +01)Projection: __datafusion_extracted_1 AS simple_struct.s[value], __datafusion_extracted_2 AS simple_struct.s[label], simple_struct.id 02)--Filter: simple_struct.id > Int64(1) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan -01)ProjectionExec: expr=[get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label], id@0 as id] -02)--FilterExec: id@0 > 1 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value], __datafusion_extracted_2@1 as simple_struct.s[label], id@2 as id] +02)--FilterExec: id@2 > 1 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness - note that id is now at index 2 in the augmented projection query ITI @@ -1197,13 +1216,14 @@ query TT EXPLAIN SELECT s['value'] FROM simple_struct WHERE length(s['label']) > 4; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) -02)--Filter: character_length(get_field(simple_struct.s, Utf8("label"))) > Int32(4) -03)----TableScan: simple_struct projection=[s], partial_filters=[character_length(get_field(simple_struct.s, Utf8("label"))) > Int32(4)] +01)Projection: __datafusion_extracted_2 AS simple_struct.s[value] +02)--Filter: character_length(__datafusion_extracted_1) > Int32(4) +03)----Projection: get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_2 +04)------TableScan: simple_struct projection=[s], partial_filters=[character_length(get_field(simple_struct.s, Utf8("label"))) > Int32(4)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value]] -02)--FilterExec: character_length(get_field(s@0, label)) > 4 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[s], file_type=parquet +01)ProjectionExec: expr=[__datafusion_extracted_2@0 as simple_struct.s[value]] +02)--FilterExec: character_length(__datafusion_extracted_1@0) > 4, projection=[__datafusion_extracted_2@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, label) as __datafusion_extracted_1, get_field(s@1, value) as __datafusion_extracted_2], file_type=parquet # Verify correctness - filter on rows where label length > 4 (all have length 5, except 'one' has 3) # Wait, from the data: alpha(5), beta(4), gamma(5), delta(5), epsilon(7) @@ -1230,12 +1250,13 @@ EXPLAIN SELECT id FROM simple_struct ORDER BY s['value']; ---- logical_plan 01)Projection: simple_struct.id -02)--Sort: get_field(simple_struct.s, Utf8("value")) ASC NULLS LAST -03)----TableScan: simple_struct projection=[id, s] +02)--Sort: __datafusion_extracted_1 ASC NULLS LAST +03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1 +04)------TableScan: simple_struct projection=[id, s] physical_plan 01)ProjectionExec: expr=[id@0 as id] -02)--SortExec: expr=[get_field(s@1, value) ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +02)--SortExec: expr=[__datafusion_extracted_1@1 ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as __datafusion_extracted_1], file_type=parquet # Verify correctness query I @@ -1258,13 +1279,13 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id, s['label']; ---- logical_plan 01)Projection: simple_struct.id, simple_struct.s[value] -02)--Sort: simple_struct.id ASC NULLS LAST, get_field(simple_struct.s, Utf8("label")) ASC NULLS LAST -03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), simple_struct.s +02)--Sort: simple_struct.id ASC NULLS LAST, __datafusion_extracted_1 ASC NULLS LAST +03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_1 04)------TableScan: simple_struct projection=[id, s] physical_plan 01)ProjectionExec: expr=[id@0 as id, simple_struct.s[value]@1 as simple_struct.s[value]] -02)--SortExec: expr=[id@0 ASC NULLS LAST, get_field(s@2, label) ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], s], file_type=parquet +02)--SortExec: expr=[id@0 ASC NULLS LAST, __datafusion_extracted_1@2 ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as __datafusion_extracted_1], file_type=parquet # Verify correctness query II @@ -1287,12 +1308,13 @@ EXPLAIN SELECT id FROM simple_struct ORDER BY s['value'] LIMIT 2; ---- logical_plan 01)Projection: simple_struct.id -02)--Sort: get_field(simple_struct.s, Utf8("value")) ASC NULLS LAST, fetch=2 -03)----TableScan: simple_struct projection=[id, s] +02)--Sort: __datafusion_extracted_1 ASC NULLS LAST, fetch=2 +03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1 +04)------TableScan: simple_struct projection=[id, s] physical_plan 01)ProjectionExec: expr=[id@0 as id] -02)--SortExec: TopK(fetch=2), expr=[get_field(s@1, value) ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +02)--SortExec: TopK(fetch=2), expr=[__datafusion_extracted_1@1 ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as __datafusion_extracted_1], file_type=parquet # Verify correctness query I @@ -1312,12 +1334,13 @@ EXPLAIN SELECT id FROM simple_struct ORDER BY s['value'] * 2; ---- logical_plan 01)Projection: simple_struct.id -02)--Sort: get_field(simple_struct.s, Utf8("value")) * Int64(2) ASC NULLS LAST -03)----TableScan: simple_struct projection=[id, s] +02)--Sort: __datafusion_extracted_1 * Int64(2) ASC NULLS LAST +03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1 +04)------TableScan: simple_struct projection=[id, s] physical_plan 01)ProjectionExec: expr=[id@0 as id] -02)--SortExec: expr=[get_field(s@1, value) * 2 ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +02)--SortExec: expr=[__datafusion_extracted_1@1 * 2 ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as __datafusion_extracted_1], file_type=parquet # Verify correctness query I @@ -1390,13 +1413,15 @@ INNER JOIN join_right ON simple_struct.s['value'] = join_right.s['level'] * 10; ---- logical_plan 01)Projection: simple_struct.id, join_right.id -02)--Inner Join: get_field(simple_struct.s, Utf8("value")) = get_field(join_right.s, Utf8("level")) * Int64(10) -03)----TableScan: simple_struct projection=[id, s] -04)----TableScan: join_right projection=[id, s] +02)--Inner Join: __datafusion_extracted_1 = __datafusion_extracted_2 * Int64(10) +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s] +05)----Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_2, join_right.id +06)------TableScan: join_right projection=[id, s] physical_plan -01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(simple_struct.s[value]@2, join_right.s[level] * Int64(10)@2)], projection=[id@0, id@3] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet -03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id, s, get_field(s@1, level) * 10 as join_right.s[level] * Int64(10)], file_type=parquet +01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(__datafusion_extracted_1@0, __datafusion_extracted_2 * Int64(10)@2)], projection=[id@1, id@3] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id, get_field(s@1, level) * 10 as __datafusion_extracted_2 * Int64(10)], file_type=parquet # Verify correctness - value = level * 10 # simple_struct: (1,100), (2,200), (3,150), (4,300), (5,250) @@ -1424,13 +1449,14 @@ WHERE simple_struct.s['value'] > 150; logical_plan 01)Inner Join: simple_struct.id = join_right.id 02)--Projection: simple_struct.id -03)----Filter: get_field(simple_struct.s, Utf8("value")) > Int64(150) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] -05)--TableScan: join_right projection=[id] +03)----Filter: __datafusion_extracted_1 > Int64(150) +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] +06)--TableScan: join_right projection=[id] physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)] -02)--FilterExec: get_field(s@1, value) > 150, projection=[id@0] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +02)--FilterExec: __datafusion_extracted_1@0 > 150, projection=[id@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet 04)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ] # Verify correctness - id matches and value > 150 @@ -1459,17 +1485,19 @@ WHERE simple_struct.s['value'] > 100 AND join_right.s['level'] > 3; logical_plan 01)Inner Join: simple_struct.id = join_right.id 02)--Projection: simple_struct.id -03)----Filter: get_field(simple_struct.s, Utf8("value")) > Int64(100) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(100)] -05)--Projection: join_right.id -06)----Filter: get_field(join_right.s, Utf8("level")) > Int64(3) -07)------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(3)] +03)----Filter: __datafusion_extracted_1 > Int64(100) +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(100)] +06)--Projection: join_right.id +07)----Filter: __datafusion_extracted_2 > Int64(3) +08)------Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_2, join_right.id +09)--------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(3)] physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)] -02)--FilterExec: get_field(s@1, value) > 100, projection=[id@0] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet -04)--FilterExec: get_field(s@1, level) > 3, projection=[id@0] -05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id, s], file_type=parquet, predicate=DynamicFilter [ empty ] +02)--FilterExec: __datafusion_extracted_1@0 > 100, projection=[id@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet +04)--FilterExec: __datafusion_extracted_2@0 > 3, projection=[id@1] +05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id], file_type=parquet, predicate=DynamicFilter [ empty ] # Verify correctness - id matches, value > 100, and level > 3 # Matching ids where value > 100: 2(200), 3(150), 4(300), 5(250) @@ -1495,15 +1523,17 @@ FROM simple_struct INNER JOIN join_right ON simple_struct.id = join_right.id; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")), get_field(join_right.s, Utf8("role")) +01)Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[label], __datafusion_extracted_2 AS join_right.s[role] 02)--Inner Join: simple_struct.id = join_right.id -03)----TableScan: simple_struct projection=[id, s] -04)----TableScan: join_right projection=[id, s] +03)----Projection: get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s] +05)----Projection: get_field(join_right.s, Utf8("role")) AS __datafusion_extracted_2, join_right.id +06)------TableScan: join_right projection=[id, s] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as simple_struct.s[label], get_field(s@2, role) as join_right.s[role]] -02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[id@0, s@1, s@3] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet -04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id, s], file_type=parquet, predicate=DynamicFilter [ empty ] +01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[label], __datafusion_extracted_2@2 as join_right.s[role]] +02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@1, id@1)], projection=[__datafusion_extracted_1@0, id@1, __datafusion_extracted_2@2] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, label) as __datafusion_extracted_1, id], file_type=parquet +04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, role) as __datafusion_extracted_2, id], file_type=parquet, predicate=DynamicFilter [ empty ] # Verify correctness query ITT @@ -1561,17 +1591,20 @@ FROM simple_struct LEFT JOIN join_right ON simple_struct.id = join_right.id AND join_right.s['level'] > 5; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(join_right.s, Utf8("level")) +01)Projection: simple_struct.id, __datafusion_extracted_2 AS simple_struct.s[value], __datafusion_extracted_3 AS join_right.s[level] 02)--Left Join: simple_struct.id = join_right.id -03)----TableScan: simple_struct projection=[id, s] -04)----Filter: get_field(join_right.s, Utf8("level")) > Int64(5) -05)------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(5)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_2, simple_struct.id +04)------TableScan: simple_struct projection=[id, s] +05)----Projection: join_right.id, __datafusion_extracted_3 +06)------Filter: __datafusion_extracted_1 > Int64(5) +07)--------Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_1, join_right.id, get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_3 +08)----------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(5)] physical_plan -01)ProjectionExec: expr=[id@1 as id, get_field(s@2, value) as simple_struct.s[value], get_field(s@0, level) as join_right.s[level]] -02)--HashJoinExec: mode=CollectLeft, join_type=Right, on=[(id@0, id@0)], projection=[s@1, id@2, s@3] -03)----FilterExec: get_field(s@1, level) > 5 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id, s], file_type=parquet -05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_2@0 as simple_struct.s[value], __datafusion_extracted_3@2 as join_right.s[level]] +02)--HashJoinExec: mode=CollectLeft, join_type=Left, on=[(id@1, id@0)], projection=[__datafusion_extracted_2@0, id@1, __datafusion_extracted_3@3] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_2, id], file_type=parquet +04)----FilterExec: __datafusion_extracted_1@0 > 5, projection=[id@1, __datafusion_extracted_3@2] +05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_1, id, get_field(s@1, level) as __datafusion_extracted_3], file_type=parquet # Verify correctness - left join with level > 5 condition # Only join_right rows with level > 5 are matched: id=1 (level=10), id=4 (level=8) @@ -1599,14 +1632,15 @@ query TT EXPLAIN SELECT s['value'] FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS simple_struct.s[value] 02)--Filter: simple_struct.id > Int64(2) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value]] -02)--FilterExec: id@0 > 2, projection=[s@1] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value]] +02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0] 03)----RepartitionExec: partitioning=RoundRobinBatch(32), input_partitions=1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] ##################### # Section 14: SubqueryAlias tests @@ -1621,15 +1655,16 @@ query TT EXPLAIN SELECT t.s['value'] FROM (SELECT * FROM simple_struct) t WHERE t.id > 2; ---- logical_plan -01)Projection: get_field(t.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS t.s[value] 02)--SubqueryAlias: t -03)----Projection: simple_struct.s +03)----Projection: __datafusion_extracted_1 04)------Filter: simple_struct.id > Int64(2) -05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +05)--------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +06)----------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as t.s[value]] -02)--FilterExec: id@0 > 2, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as t.s[value]] +02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query I @@ -1645,9 +1680,10 @@ EXPLAIN SELECT t.s['value'], t.s['label'] FROM (SELECT * FROM simple_struct) t O ---- logical_plan 01)Sort: t.s[value] ASC NULLS LAST -02)--Projection: get_field(t.s, Utf8("value")), get_field(t.s, Utf8("label")) +02)--Projection: __datafusion_extracted_1 AS t.s[value], __datafusion_extracted_2 AS t.s[label] 03)----SubqueryAlias: t -04)------TableScan: simple_struct projection=[s] +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2 +05)--------TableScan: simple_struct projection=[s] physical_plan 01)SortExec: expr=[t.s[value]@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as t.s[value], get_field(s@1, label) as t.s[label]], file_type=parquet @@ -1667,16 +1703,17 @@ query TT EXPLAIN SELECT u.s['value'] FROM (SELECT * FROM (SELECT * FROM simple_struct) t) u WHERE u.id > 2; ---- logical_plan -01)Projection: get_field(u.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS u.s[value] 02)--SubqueryAlias: u 03)----SubqueryAlias: t -04)------Projection: simple_struct.s +04)------Projection: __datafusion_extracted_1 05)--------Filter: simple_struct.id > Int64(2) -06)----------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +06)----------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +07)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as u.s[value]] -02)--FilterExec: id@0 > 2, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as u.s[value]] +02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query I @@ -1693,11 +1730,12 @@ EXPLAIN SELECT t.id FROM (SELECT * FROM simple_struct) t WHERE t.s['value'] > 20 logical_plan 01)SubqueryAlias: t 02)--Projection: simple_struct.id -03)----Filter: get_field(simple_struct.s, Utf8("value")) > Int64(200) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(200)] +03)----Filter: __datafusion_extracted_1 > Int64(200) +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(200)] physical_plan -01)FilterExec: get_field(s@1, value) > 200, projection=[id@0] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +01)FilterExec: __datafusion_extracted_1@0 > 200, projection=[id@1] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet # Verify correctness query I @@ -1719,22 +1757,24 @@ EXPLAIN SELECT s['value'] FROM ( ) t; ---- logical_plan -01)Projection: get_field(t.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS t.s[value] 02)--SubqueryAlias: t 03)----Union -04)------Projection: simple_struct.s +04)------Projection: __datafusion_extracted_1 05)--------Filter: simple_struct.id <= Int64(3) -06)----------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)] -07)------Projection: simple_struct.s -08)--------Filter: simple_struct.id > Int64(3) -09)----------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)] +06)----------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +07)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)] +08)------Projection: __datafusion_extracted_1 +09)--------Filter: simple_struct.id > Int64(3) +10)----------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +11)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as t.s[value]] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as t.s[value]] 02)--UnionExec -03)----FilterExec: id@0 <= 3, projection=[s@1] -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[] -05)----FilterExec: id@0 > 3, projection=[s@1] -06)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[] +03)----FilterExec: id@1 <= 3, projection=[__datafusion_extracted_1@0] +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[] +05)----FilterExec: id@1 > 3, projection=[__datafusion_extracted_1@0] +06)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[] # Verify correctness query I @@ -1760,24 +1800,26 @@ EXPLAIN SELECT s['value'], s['label'] FROM ( ---- logical_plan 01)Sort: t.s[value] ASC NULLS LAST -02)--Projection: get_field(t.s, Utf8("value")), get_field(t.s, Utf8("label")) +02)--Projection: __datafusion_extracted_1 AS t.s[value], __datafusion_extracted_2 AS t.s[label] 03)----SubqueryAlias: t 04)------Union -05)--------Projection: simple_struct.s +05)--------Projection: __datafusion_extracted_1, __datafusion_extracted_2 06)----------Filter: simple_struct.id <= Int64(3) -07)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)] -08)--------Projection: simple_struct.s -09)----------Filter: simple_struct.id > Int64(3) -10)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)] +07)------------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id +08)--------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)] +09)--------Projection: __datafusion_extracted_1, __datafusion_extracted_2 +10)----------Filter: simple_struct.id > Int64(3) +11)------------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id +12)--------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)] physical_plan 01)SortPreservingMergeExec: [t.s[value]@0 ASC NULLS LAST] 02)--SortExec: expr=[t.s[value]@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----ProjectionExec: expr=[get_field(s@0, value) as t.s[value], get_field(s@0, label) as t.s[label]] +03)----ProjectionExec: expr=[__datafusion_extracted_1@0 as t.s[value], __datafusion_extracted_2@1 as t.s[label]] 04)------UnionExec -05)--------FilterExec: id@0 <= 3, projection=[s@1] -06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[] -07)--------FilterExec: id@0 > 3, projection=[s@1] -08)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[] +05)--------FilterExec: id@2 <= 3, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1] +06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[] +07)--------FilterExec: id@2 > 3, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1] +08)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[] # Verify correctness query IT @@ -1792,3 +1834,118 @@ SELECT s['value'], s['label'] FROM ( 200 beta 250 epsilon 300 delta + +##################### +# Section 16: Aggregate / Join edge-case tests +# Translated from unit tests in extract_leaf_expressions.rs +##################### + +### +# Test 16.1: Projection with get_field above Aggregate +# Aggregate blocks pushdown, so the get_field stays in the top projection. +# (mirrors test_projection_with_leaf_expr_above_aggregate) +### + +query TT +EXPLAIN SELECT s['label'] IS NOT NULL AS has_label, COUNT(1) +FROM simple_struct GROUP BY s; +---- +logical_plan +01)Projection: get_field(simple_struct.s, Utf8("label")) IS NOT NULL AS has_label, count(Int64(1)) +02)--Aggregate: groupBy=[[simple_struct.s]], aggr=[[count(Int64(1))]] +03)----TableScan: simple_struct projection=[s] +physical_plan +01)ProjectionExec: expr=[get_field(s@0, label) IS NOT NULL as has_label, count(Int64(1))@1 as count(Int64(1))] +02)--AggregateExec: mode=Single, gby=[s@0 as s], aggr=[count(Int64(1))] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[s], file_type=parquet + +# Verify correctness - all labels are non-null +query BI +SELECT s['label'] IS NOT NULL AS has_label, COUNT(1) +FROM simple_struct GROUP BY s ORDER BY COUNT(1); +---- +true 1 +true 1 +true 1 +true 1 +true 1 + +### +# Test 16.2: Join with get_field filter on qualified right side +# The get_field on join_right.s['role'] must be routed to the right input only. +# (mirrors test_extract_from_join_qualified_right_side) +### + +query TT +EXPLAIN +SELECT s.s['value'], j.s['role'] +FROM join_right j +INNER JOIN simple_struct s ON s.id = j.id +WHERE s.s['value'] > j.s['level']; +---- +logical_plan +01)Projection: __datafusion_extracted_3 AS s.s[value], __datafusion_extracted_4 AS j.s[role] +02)--Inner Join: j.id = s.id Filter: __datafusion_extracted_1 > __datafusion_extracted_2 +03)----SubqueryAlias: j +04)------Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_2, get_field(join_right.s, Utf8("role")) AS __datafusion_extracted_4, join_right.id +05)--------TableScan: join_right projection=[id, s] +06)----SubqueryAlias: s +07)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_3, simple_struct.id +08)--------TableScan: simple_struct projection=[id, s] +physical_plan +01)ProjectionExec: expr=[__datafusion_extracted_3@1 as s.s[value], __datafusion_extracted_4@0 as j.s[role]] +02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@2, id@2)], filter=__datafusion_extracted_1@1 > __datafusion_extracted_2@0, projection=[__datafusion_extracted_4@1, __datafusion_extracted_3@4] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, get_field(s@1, role) as __datafusion_extracted_4, id], file_type=parquet +04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, value) as __datafusion_extracted_3, id], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness - only admin roles match (ids 1 and 4) +query II +SELECT simple_struct.id, join_right.id +FROM simple_struct +INNER JOIN join_right + ON simple_struct.id = join_right.id + AND join_right.s['role'] = 'admin' +ORDER BY simple_struct.id; +---- +1 1 +4 4 + +### +# Test 16.3: Join with cross-input get_field comparison in WHERE +# get_field from each side is extracted and routed to its respective input independently. +# (mirrors test_extract_from_join_cross_input_expression) +### + +query TT +EXPLAIN SELECT simple_struct.id, join_right.id +FROM simple_struct +INNER JOIN join_right ON simple_struct.id = join_right.id +WHERE simple_struct.s['value'] > join_right.s['level']; +---- +logical_plan +01)Projection: simple_struct.id, join_right.id +02)--Inner Join: simple_struct.id = join_right.id Filter: __datafusion_extracted_1 > __datafusion_extracted_2 +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s] +05)----Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_2, join_right.id +06)------TableScan: join_right projection=[id, s] +physical_plan +01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@1, id@1)], filter=__datafusion_extracted_1@0 > __datafusion_extracted_2@1, projection=[id@1, id@3] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness - all rows match since value >> level for all ids +# simple_struct: (1,100), (2,200), (3,150), (4,300), (5,250) +# join_right: (1,10), (2,5), (3,1), (4,8), (5,3) +query II +SELECT simple_struct.id, join_right.id +FROM simple_struct +INNER JOIN join_right ON simple_struct.id = join_right.id +WHERE simple_struct.s['value'] > join_right.s['level'] +ORDER BY simple_struct.id; +---- +1 1 +2 2 +3 3 +4 4 +5 5 diff --git a/datafusion/sqllogictest/test_files/push_down_filter.slt b/datafusion/sqllogictest/test_files/push_down_filter.slt index b1cb354e053e4..edafcfaa543f2 100644 --- a/datafusion/sqllogictest/test_files/push_down_filter.slt +++ b/datafusion/sqllogictest/test_files/push_down_filter.slt @@ -116,11 +116,12 @@ explain select * from (select column1, unnest(column2) as o from d) where o['a'] ---- physical_plan 01)ProjectionExec: expr=[column1@0 as column1, __unnest_placeholder(d.column2,depth=1)@1 as o] -02)--FilterExec: get_field(__unnest_placeholder(d.column2,depth=1)@1, a) = 1 +02)--FilterExec: __datafusion_extracted_1@0 = 1, projection=[column1@1, __unnest_placeholder(d.column2,depth=1)@2] 03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -04)------UnnestExec -05)--------ProjectionExec: expr=[column1@0 as column1, column2@1 as __unnest_placeholder(d.column2)] -06)----------DataSourceExec: partitions=1, partition_sizes=[1] +04)------ProjectionExec: expr=[get_field(__unnest_placeholder(d.column2,depth=1)@1, a) as __datafusion_extracted_1, column1@0 as column1, __unnest_placeholder(d.column2,depth=1)@1 as __unnest_placeholder(d.column2,depth=1)] +05)--------UnnestExec +06)----------ProjectionExec: expr=[column1@0 as column1, column2@1 as __unnest_placeholder(d.column2)] +07)------------DataSourceExec: partitions=1, partition_sizes=[1] statement ok drop table d; diff --git a/datafusion/sqllogictest/test_files/struct.slt b/datafusion/sqllogictest/test_files/struct.slt index 53a1bb4ec6751..e20815a58c765 100644 --- a/datafusion/sqllogictest/test_files/struct.slt +++ b/datafusion/sqllogictest/test_files/struct.slt @@ -1666,4 +1666,4 @@ order by id; 3 2 150 statement ok -drop table t_agg_window; +drop table t_agg_window; \ No newline at end of file diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 1a6b82020c667..73aeb6c99d0db 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -666,7 +666,7 @@ explain select unnest(unnest(unnest(column3)['c1'])), column3 from recursive_unn logical_plan 01)Projection: __unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1],depth=2) AS UNNEST(UNNEST(UNNEST(recursive_unnest_table.column3)[c1])), recursive_unnest_table.column3 02)--Unnest: lists[__unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1])|depth=2] structs[] -03)----Projection: get_field(__unnest_placeholder(recursive_unnest_table.column3,depth=1) AS UNNEST(recursive_unnest_table.column3), Utf8("c1")) AS __unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1]), recursive_unnest_table.column3 +03)----Projection: get_field(__unnest_placeholder(recursive_unnest_table.column3,depth=1), Utf8("c1")) AS __unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1]), recursive_unnest_table.column3 04)------Unnest: lists[__unnest_placeholder(recursive_unnest_table.column3)|depth=1] structs[] 05)--------Projection: recursive_unnest_table.column3 AS __unnest_placeholder(recursive_unnest_table.column3), recursive_unnest_table.column3 06)----------TableScan: recursive_unnest_table projection=[column3]