From 7b55719d56b724bb07c014d0ed47cc49bc0adfc0 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 9 Feb 2026 22:40:59 -0500 Subject: [PATCH 01/18] feat: add ExtractLeafExpressions optimizer rule for get_field pushdown --- .../optimizer/src/extract_leaf_expressions.rs | 1464 +++++++++++++++-- datafusion/optimizer/src/optimizer.rs | 3 + .../sqllogictest/test_files/explain.slt | 8 + .../sqllogictest/test_files/projection.slt | 2 +- .../test_files/projection_pushdown.slt | 468 +++--- .../test_files/push_down_filter.slt | 9 +- datafusion/sqllogictest/test_files/struct.slt | 2 +- datafusion/sqllogictest/test_files/unnest.slt | 2 +- 8 files changed, 1636 insertions(+), 322 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index d04261456d600..46556c57523d2 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -15,23 +15,28 @@ // specific language governing permissions and limitations // under the License. -//! NB: This module is a work in progress. -//! We merged it early in -//! with the skeleton and snapshots matching the current state, -//! but the actual implementation is pending further development. -//! There may be comments or code that are incomplete or inaccurate. //! Two-pass optimizer pipeline that pushes cheap expressions (like struct field //! access `user['status']`) closer to data sources, enabling early data reduction //! and source-level optimizations (e.g., Parquet column pruning). See //! [`ExtractLeafExpressions`] (pass 1) and [`PushDownLeafProjections`] (pass 2). 
-use datafusion_common::Result; -use datafusion_common::tree_node::Transformed; +use indexmap::{IndexMap, IndexSet}; +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion_common::alias::AliasGenerator; +use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion}; +use datafusion_common::{Column, DFSchema, Result, qualified_name}; use datafusion_expr::logical_plan::LogicalPlan; +use datafusion_expr::{Expr, ExpressionPlacement, Projection}; use crate::optimizer::ApplyOrder; +use crate::push_down_filter::replace_cols_by_name; +use crate::utils::has_all_column_refs; use crate::{OptimizerConfig, OptimizerRule}; +const EXTRACTED_EXPR_PREFIX: &str = "__datafusion_extracted"; + /// Extracts `MoveTowardsLeafNodes` sub-expressions from non-projection nodes /// into **extraction projections** (pass 1 of 2). /// @@ -72,7 +77,8 @@ use crate::{OptimizerConfig, OptimizerRule}; /// ``` /// /// **Important:** The `PushDownFilter` rule is aware of projections created by this rule -/// and will not push filters through them. See `is_extracted_expr_projection` in utils.rs. +/// and will not push filters through them. It uses `ExpressionPlacement` to detect +/// `MoveTowardsLeafNodes` expressions and skip filter pushdown past them. #[derive(Default, Debug)] pub struct ExtractLeafExpressions {} @@ -95,9 +101,443 @@ impl OptimizerRule for ExtractLeafExpressions { fn rewrite( &self, plan: LogicalPlan, - _config: &dyn OptimizerConfig, + config: &dyn OptimizerConfig, ) -> Result> { - Ok(Transformed::no(plan)) + let alias_generator = config.alias_generator(); + extract_from_plan(plan, alias_generator) + } +} + +/// Extracts `MoveTowardsLeafNodes` sub-expressions from a plan node. +/// +/// Works for any number of inputs (0, 1, 2, …N). For multi-input nodes +/// like Join, each extracted sub-expression is routed to the correct input +/// by checking which input's schema contains all of the expression's column +/// references. 
+fn extract_from_plan( + plan: LogicalPlan, + alias_generator: &Arc, +) -> Result> { + // Only extract from plan types whose output schema is predictable after + // expression rewriting. Nodes like Window derive column names from + // their expressions, so rewriting `get_field` inside a window function + // changes the output schema and breaks the recovery projection. + if !matches!( + &plan, + LogicalPlan::Aggregate(_) + | LogicalPlan::Filter(_) + | LogicalPlan::Sort(_) + | LogicalPlan::Limit(_) + | LogicalPlan::Join(_) + ) { + return Ok(Transformed::no(plan)); + } + + let inputs = plan.inputs(); + if inputs.is_empty() { + return Ok(Transformed::no(plan)); + } + + // Save original output schema before any transformation + let original_schema = Arc::clone(plan.schema()); + + // Clone inputs upfront (before plan is consumed by map_expressions) + let owned_inputs: Vec = inputs.into_iter().cloned().collect(); + + // Build per-input schemas (kept alive for extractor borrows) + let input_schemas: Vec> = owned_inputs + .iter() + .map(|i| Arc::clone(i.schema())) + .collect(); + + // Build per-input extractors + let mut extractors: Vec = input_schemas + .iter() + .map(|schema| LeafExpressionExtractor::new(schema.as_ref(), alias_generator)) + .collect(); + + // Build per-input column sets for routing expressions to the correct input + let input_column_sets: Vec> = input_schemas + .iter() + .map(|schema| schema_columns(schema.as_ref())) + .collect(); + + // Transform expressions via map_expressions with routing + let transformed = plan.map_expressions(|expr| { + routing_extract(expr, &mut extractors, &input_column_sets) + })?; + + // If no expressions were rewritten, nothing was extracted + if !transformed.transformed { + return Ok(transformed); + } + + // Build per-input extraction projections (None means no extractions for that input) + let new_inputs: Vec = owned_inputs + .iter() + .zip(extractors.iter()) + .map(|(input, extractor)| { + let input_arc = 
Arc::new(input.clone()); + Ok(extractor + .build_extraction_projection(&input_arc)? + .unwrap_or_else(|| input.clone())) + }) + .collect::>>()?; + + // Rebuild and add recovery projection if schema changed + let new_plan = transformed + .data + .with_new_exprs(transformed.data.expressions(), new_inputs)?; + + // Add recovery projection if the output schema changed + let recovered = build_recovery_projection(original_schema.as_ref(), new_plan)?; + + Ok(Transformed::yes(recovered)) +} + +/// Given an expression, returns the index of the input whose columns fully +/// cover the expression's column references. +/// Returns `None` if the expression references columns from multiple inputs. +fn find_owning_input( + expr: &Expr, + input_column_sets: &[std::collections::HashSet], +) -> Option { + input_column_sets + .iter() + .position(|cols| has_all_column_refs(expr, cols)) +} + +/// Walks an expression tree top-down, extracting `MoveTowardsLeafNodes` +/// sub-expressions and routing each to the correct per-input extractor. 
+fn routing_extract( + expr: Expr, + extractors: &mut [LeafExpressionExtractor], + input_column_sets: &[std::collections::HashSet], +) -> Result> { + expr.transform_down(|e| { + // Skip expressions already aliased with extracted expression pattern + if let Expr::Alias(alias) = &e + && alias.name.starts_with(EXTRACTED_EXPR_PREFIX) + { + return Ok(Transformed { + data: e, + transformed: false, + tnr: TreeNodeRecursion::Jump, + }); + } + + // Don't extract Alias nodes directly — preserve the alias and let + // transform_down recurse into the inner expression + if matches!(&e, Expr::Alias(_)) { + return Ok(Transformed::no(e)); + } + + match e.placement() { + ExpressionPlacement::MoveTowardsLeafNodes => { + if let Some(idx) = find_owning_input(&e, input_column_sets) { + let col_ref = extractors[idx].add_extracted(e)?; + Ok(Transformed::yes(col_ref)) + } else { + // References columns from multiple inputs — cannot extract + Ok(Transformed::no(e)) + } + } + ExpressionPlacement::Column => { + // Track columns that the parent node references so the + // extraction projection includes them as pass-through. + // Without this, the extraction projection would only + // contain __extracted_N aliases, and the parent couldn't + // resolve its other column references. 
+ if let Expr::Column(col) = &e + && let Some(idx) = find_owning_input(&e, input_column_sets) + { + extractors[idx].columns_needed.insert(col.clone()); + } + Ok(Transformed::no(e)) + } + _ => Ok(Transformed::no(e)), + } + }) +} + +/// Returns all columns in the schema (both qualified and unqualified forms) +fn schema_columns(schema: &DFSchema) -> std::collections::HashSet { + schema + .iter() + .flat_map(|(qualifier, field)| { + [ + Column::new(qualifier.cloned(), field.name()), + Column::new_unqualified(field.name()), + ] + }) + .collect() +} + +// ============================================================================= +// Helper Functions for Extraction Targeting +// ============================================================================= + +/// Build a replacement map from a projection: output_column_name -> underlying_expr. +/// +/// This is used to resolve column references through a renaming projection. +/// For example, if a projection has `user AS x`, this maps `x` -> `col("user")`. +fn build_projection_replace_map(projection: &Projection) -> HashMap { + projection + .schema + .iter() + .zip(projection.expr.iter()) + .map(|((qualifier, field), expr)| { + let key = Column::from((qualifier, field)).flat_name(); + (key, expr.clone().unalias()) + }) + .collect() +} + +/// Build a recovery projection to restore the original output schema. +/// +/// After extraction, a node's output schema may differ from the original: +/// +/// - **Schema-preserving nodes** (Filter/Sort/Limit): the extraction projection +/// below adds extra `__extracted_N` columns that bubble up through the node. +/// Recovery selects only the original columns to hide the extras. +/// ```text +/// Original schema: [id, user] +/// After extraction: [__extracted_1, id, user] ← extra column leaked through +/// Recovery: SELECT id, user FROM ... 
← hides __extracted_1 +/// ``` +/// +/// - **Schema-defining nodes** (Aggregate): same number of columns but names +/// may differ because extracted aliases replaced the original expressions. +/// Recovery maps positionally, aliasing where names changed. +/// ```text +/// Original: [SUM(user['balance'])] +/// After: [SUM(__extracted_1)] ← name changed +/// Recovery: SUM(__extracted_1) AS "SUM(user['balance'])" +/// ``` +/// +/// - **Schemas identical** → no recovery projection needed. +fn build_recovery_projection( + original_schema: &DFSchema, + input: LogicalPlan, +) -> Result { + let new_schema = input.schema(); + let orig_len = original_schema.fields().len(); + let new_len = new_schema.fields().len(); + + if orig_len == new_len { + // Same number of fields — check if schemas are identical + let schemas_match = original_schema.iter().zip(new_schema.iter()).all( + |((orig_q, orig_f), (new_q, new_f))| { + orig_f.name() == new_f.name() && orig_q == new_q + }, + ); + if schemas_match { + return Ok(input); + } + + // Schema-defining nodes (Projection, Aggregate): names may differ at some positions. + // Map positionally, aliasing where the name changed. + let mut proj_exprs = Vec::with_capacity(orig_len); + for (i, (orig_qualifier, orig_field)) in original_schema.iter().enumerate() { + let (new_qualifier, new_field) = new_schema.qualified_field(i); + if orig_field.name() == new_field.name() && orig_qualifier == new_qualifier { + proj_exprs.push(Expr::from((orig_qualifier, orig_field))); + } else { + let new_col = Expr::Column(Column::from((new_qualifier, new_field))); + proj_exprs.push( + new_col.alias_qualified(orig_qualifier.cloned(), orig_field.name()), + ); + } + } + let projection = Projection::try_new(proj_exprs, Arc::new(input))?; + Ok(LogicalPlan::Projection(projection)) + } else { + // Schema-preserving nodes: new schema has extra extraction columns. + // Original columns still exist by name; select them to hide extras. 
+ let col_exprs: Vec = original_schema.iter().map(Expr::from).collect(); + let projection = Projection::try_new(col_exprs, Arc::new(input))?; + Ok(LogicalPlan::Projection(projection)) + } +} + +/// Collects `MoveTowardsLeafNodes` sub-expressions found during expression +/// tree traversal and can build an extraction projection from them. +/// +/// # Example +/// +/// Given `Filter: user['status'] = 'active' AND user['name'] IS NOT NULL`: +/// - `add_extracted(user['status'])` → stores it, returns `col("__extracted_1")` +/// - `add_extracted(user['name'])` → stores it, returns `col("__extracted_2")` +/// - `build_extraction_projection()` produces: +/// `Projection: user['status'] AS __extracted_1, user['name'] AS __extracted_2, ` +struct LeafExpressionExtractor<'a> { + /// Extracted expressions: maps expression -> alias + extracted: IndexMap, + /// Columns referenced by extracted expressions or the parent node, + /// included as pass-through in the extraction projection. + columns_needed: IndexSet, + /// Input schema + input_schema: &'a DFSchema, + /// Alias generator + alias_generator: &'a Arc, +} + +impl<'a> LeafExpressionExtractor<'a> { + fn new(input_schema: &'a DFSchema, alias_generator: &'a Arc) -> Self { + Self { + extracted: IndexMap::new(), + columns_needed: IndexSet::new(), + input_schema, + alias_generator, + } + } + + /// Adds an expression to extracted set, returns column reference. 
+ fn add_extracted(&mut self, expr: Expr) -> Result { + // Deduplication: reuse existing alias if same expression + if let Some(alias) = self.extracted.get(&expr) { + return Ok(Expr::Column(Column::new_unqualified(alias))); + } + + // Track columns referenced by this expression + for col in expr.column_refs() { + self.columns_needed.insert(col.clone()); + } + + // Generate unique alias + let alias = self.alias_generator.next(EXTRACTED_EXPR_PREFIX); + self.extracted.insert(expr, alias.clone()); + + Ok(Expr::Column(Column::new_unqualified(&alias))) + } + + /// Builds a fresh extraction projection above the given input. + /// + /// Returns `None` if there are no extractions. Otherwise creates a new + /// projection that includes extracted expressions (aliased) plus all + /// input schema columns for pass-through. + fn build_extraction_projection( + &self, + input: &Arc, + ) -> Result> { + if self.extracted.is_empty() { + return Ok(None); + } + let mut proj_exprs = Vec::new(); + for (expr, alias) in self.extracted.iter() { + proj_exprs.push(expr.clone().alias(alias)); + } + for (qualifier, field) in self.input_schema.iter() { + proj_exprs.push(Expr::from((qualifier, field))); + } + Ok(Some(LogicalPlan::Projection(Projection::try_new( + proj_exprs, + Arc::clone(input), + )?))) + } +} + +/// Build an extraction projection above the target node. +/// +/// If the target is an existing projection, merges into it. This requires +/// resolving column references through the projection's rename mapping: +/// if the projection has `user AS u`, and an extracted expression references +/// `u['name']`, we must rewrite it to `user['name']` since the merged +/// projection reads from the same input as the original. +/// +/// Deduplicates by resolved expression equality and adds pass-through +/// columns as needed. Otherwise builds a fresh projection with extracted +/// expressions + ALL input schema columns. 
+fn build_extraction_projection_impl( + extracted_exprs: &[(Expr, String)], + columns_needed: &IndexSet, + target: &Arc, + target_schema: &DFSchema, +) -> Result { + if let LogicalPlan::Projection(existing) = target.as_ref() { + // Merge into existing projection + let mut proj_exprs = existing.expr.clone(); + + // Build a map of existing expressions (by Expr equality) to their aliases + let existing_extractions: IndexMap = existing + .expr + .iter() + .filter_map(|e| { + if let Expr::Alias(alias) = e + && alias.name.starts_with(EXTRACTED_EXPR_PREFIX) + { + return Some((*alias.expr.clone(), alias.name.clone())); + } + None + }) + .collect(); + + // Resolve column references through the projection's rename mapping + let replace_map = build_projection_replace_map(existing); + + // Add new extracted expressions, resolving column refs through the projection + for (expr, alias) in extracted_exprs { + let resolved = replace_cols_by_name(expr.clone().alias(alias), &replace_map)?; + let resolved_inner = if let Expr::Alias(a) = &resolved { + a.expr.as_ref() + } else { + &resolved + }; + if let Some(existing_alias) = existing_extractions.get(resolved_inner) { + // Same expression already extracted under a different alias — + // add the expression with the new alias so both names are + // available in the output. We can't reference the existing alias + // as a column within the same projection, so we duplicate the + // computation. + if existing_alias != alias { + proj_exprs.push(resolved); + } + } else { + proj_exprs.push(resolved); + } + } + + // Add any new pass-through columns that aren't already in the projection. + // We check against existing.input.schema() (the projection's source) rather + // than target_schema (the projection's output) because columns produced + // by alias expressions (e.g., CSE's __common_expr_N) exist in the output but + // not the input, and cannot be added as pass-through Column references. 
+ let existing_cols: IndexSet = existing + .expr + .iter() + .filter_map(|e| { + if let Expr::Column(c) = e { + Some(c.clone()) + } else { + None + } + }) + .collect(); + + let input_schema = existing.input.schema(); + for col in columns_needed { + let col_expr = Expr::Column(col.clone()); + let resolved = replace_cols_by_name(col_expr, &replace_map)?; + if let Expr::Column(resolved_col) = &resolved + && !existing_cols.contains(resolved_col) + && input_schema.has_column(resolved_col) + { + proj_exprs.push(Expr::Column(resolved_col.clone())); + } + // If resolved to non-column expr, it's already computed by existing projection + } + + Projection::try_new(proj_exprs, Arc::clone(&existing.input)) + } else { + // Build new projection with extracted expressions + all input columns + let mut proj_exprs = Vec::new(); + for (expr, alias) in extracted_exprs { + proj_exprs.push(expr.clone().alias(alias)); + } + for (qualifier, field) in target_schema.iter() { + proj_exprs.push(Expr::from((qualifier, field))); + } + Projection::try_new(proj_exprs, Arc::clone(target)) } } @@ -155,10 +595,507 @@ impl OptimizerRule for PushDownLeafProjections { fn rewrite( &self, plan: LogicalPlan, - _config: &dyn OptimizerConfig, + config: &dyn OptimizerConfig, ) -> Result> { - Ok(Transformed::no(plan)) + let alias_generator = config.alias_generator(); + match try_push_input(&plan, alias_generator)? { + Some(new_plan) => Ok(Transformed::yes(new_plan)), + None => Ok(Transformed::no(plan)), + } + } +} + +/// Attempts to push a projection's extractable expressions further down. +/// +/// Returns `Some(new_subtree)` if the projection was pushed down or merged, +/// `None` if there is nothing to push or the projection sits above a barrier. 
+fn try_push_input( + input: &LogicalPlan, + alias_generator: &Arc, +) -> Result> { + let LogicalPlan::Projection(proj) = input else { + return Ok(None); + }; + split_and_push_projection(proj, alias_generator) +} + +/// Splits a projection into extractable pieces, pushes them towards leaf +/// nodes, and adds a recovery projection if needed. +/// +/// Handles both: +/// - **Pure extraction projections** (all `__extracted` aliases + columns) +/// - **Mixed projections** (containing `MoveTowardsLeafNodes` sub-expressions) +/// +/// Returns `Some(new_subtree)` if extractions were pushed down, +/// `None` if there is nothing to extract or push. +/// +/// # Example: Mixed Projection +/// +/// ```text +/// Input plan: +/// Projection: user['name'] IS NOT NULL AS has_name, id +/// Filter: ... +/// TableScan +/// +/// Phase 1 (Split): +/// extraction_pairs: [(user['name'], "__extracted_1")] +/// recovery_exprs: [__extracted_1 IS NOT NULL AS has_name, id] +/// +/// Phase 2 (Push): +/// Push extraction projection through Filter toward TableScan +/// +/// Phase 3 (Recovery): +/// Projection: __extracted_1 IS NOT NULL AS has_name, id <-- recovery +/// Filter: ... +/// Projection: user['name'] AS __extracted_1, id <-- extraction (pushed) +/// TableScan +/// ``` +fn split_and_push_projection( + proj: &Projection, + alias_generator: &Arc, +) -> Result> { + let input = &proj.input; + let input_schema = input.schema(); + + // ── Phase 1: Split ────────────────────────────────────────────────── + // For each projection expression, collect extraction pairs and build + // recovery expressions. + // + // Pre-existing `__extracted` aliases are inserted into the extractor's + // `IndexMap` with the **full** `Expr::Alias(…)` as the key, so the + // alias name participates in equality. This prevents collisions when + // CSE rewrites produce the same inner expression under different alias + // names (e.g. `__common_expr_4 AS __extracted_1` and + // `__common_expr_4 AS __extracted_3`). 
New extractions from + // `routing_extract` use bare (non-Alias) keys and get normal dedup. + // + // When building the final `extraction_pairs`, the Alias wrapper is + // stripped so consumers see the usual `(inner_expr, alias_name)` tuples. + + let mut extractors = vec![LeafExpressionExtractor::new( + input_schema.as_ref(), + alias_generator, + )]; + let input_column_sets = vec![schema_columns(input_schema.as_ref())]; + + let original_schema = proj.schema.as_ref(); + let mut recovery_exprs: Vec = Vec::with_capacity(proj.expr.len()); + let mut needs_recovery = false; + let mut has_new_extractions = false; + + for (expr, (qualifier, field)) in proj.expr.iter().zip(original_schema.iter()) { + if let Expr::Alias(alias) = expr + && alias.name.starts_with(EXTRACTED_EXPR_PREFIX) + { + // Insert the full Alias expression as the key so that + // distinct alias names don't collide in the IndexMap. + let alias_name = alias.name.clone(); + + for col_ref in alias.expr.column_refs() { + extractors[0].columns_needed.insert(col_ref.clone()); + } + + extractors[0] + .extracted + .insert(expr.clone(), alias_name.clone()); + recovery_exprs.push(Expr::Column(Column::new_unqualified(&alias_name))); + } else if let Expr::Column(col) = expr { + // Plain column pass-through — track it in the extractor + extractors[0].columns_needed.insert(col.clone()); + recovery_exprs.push(expr.clone()); + } else { + // Everything else: run through routing_extract + let transformed = + routing_extract(expr.clone(), &mut extractors, &input_column_sets)?; + if transformed.transformed { + has_new_extractions = true; + } + let transformed_expr = transformed.data; + + // Build recovery expression, aliasing back to original name if needed + let original_name = field.name(); + let needs_alias = if let Expr::Column(col) = &transformed_expr { + col.name.as_str() != original_name + } else { + let expr_name = transformed_expr.schema_name().to_string(); + original_name != &expr_name + }; + let recovery_expr = if 
needs_alias { + needs_recovery = true; + transformed_expr + .clone() + .alias_qualified(qualifier.cloned(), original_name) + } else { + transformed_expr.clone() + }; + + // If the expression was transformed (i.e., has extracted sub-parts), + // it differs from what the pushed projection outputs → needs recovery. + // Also, any non-column, non-__extracted expression needs recovery + // because the pushed extraction projection won't output it directly. + if transformed.transformed || !matches!(expr, Expr::Column(_)) { + needs_recovery = true; + } + + recovery_exprs.push(recovery_expr); + } + } + + // Build extraction_pairs, stripping the Alias wrapper from pre-existing + // entries (they used the full Alias as the map key to avoid dedup). + let extractor = &extractors[0]; + let extraction_pairs: Vec<(Expr, String)> = extractor + .extracted + .iter() + .map(|(e, a)| match e { + Expr::Alias(alias) => (*alias.expr.clone(), a.clone()), + _ => (e.clone(), a.clone()), + }) + .collect(); + let columns_needed = &extractor.columns_needed; + + // If no extractions found, nothing to do + if extraction_pairs.is_empty() { + return Ok(None); + } + + // ── Phase 2: Push down ────────────────────────────────────────────── + let proj_input = Arc::clone(&proj.input); + let pushed = push_extraction_pairs( + &extraction_pairs, + columns_needed, + proj, + &proj_input, + alias_generator, + )?; + + // ── Phase 3: Recovery ─────────────────────────────────────────────── + match (pushed, needs_recovery) { + (Some(pushed_plan), true) => { + // Wrap with recovery projection + let recovery = LogicalPlan::Projection(Projection::try_new( + recovery_exprs, + Arc::new(pushed_plan), + )?); + Ok(Some(recovery)) + } + (Some(pushed_plan), false) => { + // No recovery needed (pure extraction projection) + Ok(Some(pushed_plan)) + } + (None, true) => { + // Push returned None but we still have extractions to apply. 
+ // Build the extraction projection in-place (not pushed) so the + // recovery can resolve extracted expressions. + if !has_new_extractions { + // Only pre-existing __extracted aliases and columns, no new + // extractions from routing_extract. The original projection is + // already an extraction projection that couldn't be pushed + // further. Return None. + return Ok(None); + } + let input_arc = Arc::clone(input); + let extraction = build_extraction_projection_impl( + &extraction_pairs, + columns_needed, + &input_arc, + input_schema.as_ref(), + )?; + let extraction_plan = LogicalPlan::Projection(extraction); + let recovery = LogicalPlan::Projection(Projection::try_new( + recovery_exprs, + Arc::new(extraction_plan), + )?); + Ok(Some(recovery)) + } + (None, false) => { + // No extractions could be pushed and no recovery needed + Ok(None) + } + } +} + +/// Returns true if the plan is a Projection where ALL expressions are either +/// `Alias(EXTRACTED_EXPR_PREFIX, ...)` or `Column`, with at least one extraction. +/// Such projections can safely be pushed further without re-extraction. +fn is_pure_extraction_projection(plan: &LogicalPlan) -> bool { + let LogicalPlan::Projection(proj) = plan else { + return false; + }; + let mut has_extraction = false; + for expr in &proj.expr { + match expr { + Expr::Alias(alias) if alias.name.starts_with(EXTRACTED_EXPR_PREFIX) => { + has_extraction = true; + } + Expr::Column(_) => {} + _ => return false, + } + } + has_extraction +} + +/// Pushes extraction pairs down through the projection's input node, +/// dispatching to the appropriate handler based on the input node type. +fn push_extraction_pairs( + pairs: &[(Expr, String)], + columns_needed: &IndexSet, + proj: &Projection, + proj_input: &Arc, + alias_generator: &Arc, +) -> Result> { + match proj_input.as_ref() { + // Merge into existing projection, then try to push the result further down. + // Only merge when all outer expressions are captured (pairs + columns). 
+ // Uncaptured expressions (e.g. `col AS __common_expr_1`) would be lost + // during the merge since build_extraction_projection_impl only knows + // about the captured pairs and columns. + LogicalPlan::Projection(_) + if pairs.len() + columns_needed.len() == proj.expr.len() => + { + let target_schema = Arc::clone(proj_input.schema()); + let merged = build_extraction_projection_impl( + pairs, + columns_needed, + proj_input, + target_schema.as_ref(), + )?; + let merged_plan = LogicalPlan::Projection(merged); + + // After merging, try to push the result further down, but ONLY + // if the merged result is still a pure extraction projection + // (all __extracted aliases + columns). If the merge inherited + // bare MoveTowardsLeafNodes expressions from the inner projection, + // pushing would re-extract them into new aliases and fail when + // the (None, true) fallback can't find the original aliases. + // This handles: Extraction → Recovery(cols) → Filter → ... → TableScan + // by pushing through the recovery projection AND the filter in one pass. + if is_pure_extraction_projection(&merged_plan) + && let Some(pushed) = try_push_input(&merged_plan, alias_generator)? + { + return Ok(Some(pushed)); + } + Ok(Some(merged_plan)) + } + // Generic: handles Filter/Sort/Limit (via recursion), + // SubqueryAlias (with qualifier remap in try_push_into_inputs), + // Join, and anything else. + // Safely bails out for nodes that don't pass through extracted + // columns (Aggregate, Window) via the output schema check. + _ => try_push_into_inputs( + pairs, + columns_needed, + proj_input.as_ref(), + alias_generator, + ), + } +} + +/// Pushes extraction expressions into a node's inputs by routing each +/// expression to the input that owns all of its column references. +/// +/// Works for any number of inputs (1, 2, …N). For single-input nodes, +/// all expressions trivially route to that input. 
For multi-input nodes +/// (Join, etc.), each expression is routed to the side that owns its columns. +/// +/// Returns `Some(new_node)` if all expressions could be routed AND the +/// rebuilt node's output schema contains all extracted aliases. +/// Returns `None` if any expression references columns from multiple inputs +/// or the node doesn't pass through the extracted columns. +/// +/// # Example: Join with expressions from both sides +/// +/// ```text +/// Extraction projection above a Join: +/// Projection: left.user['name'] AS __extracted_1, right.order['total'] AS __extracted_2, ... +/// Join: left.id = right.user_id +/// TableScan: left [id, user] +/// TableScan: right [user_id, order] +/// +/// After routing each expression to its owning input: +/// Join: left.id = right.user_id +/// Projection: user['name'] AS __extracted_1, id, user <-- left-side extraction +/// TableScan: left [id, user] +/// Projection: order['total'] AS __extracted_2, user_id, order <-- right-side extraction +/// TableScan: right [user_id, order] +/// ``` +fn try_push_into_inputs( + pairs: &[(Expr, String)], + columns_needed: &IndexSet, + node: &LogicalPlan, + alias_generator: &Arc, +) -> Result> { + let inputs = node.inputs(); + if inputs.is_empty() { + return Ok(None); + } + + // SubqueryAlias remaps qualifiers between input and output. + // Rewrite pairs/columns from alias-space to input-space before routing. 
+ let (pairs, columns_needed) = if let LogicalPlan::SubqueryAlias(sa) = node { + let mut replace_map = HashMap::new(); + for ((input_q, input_f), (alias_q, alias_f)) in + sa.input.schema().iter().zip(sa.schema.iter()) + { + replace_map.insert( + qualified_name(alias_q, alias_f.name()), + Expr::Column(Column::new(input_q.cloned(), input_f.name())), + ); + } + let remapped_pairs: Vec<(Expr, String)> = pairs + .iter() + .map(|(expr, alias)| { + Ok(( + replace_cols_by_name(expr.clone(), &replace_map)?, + alias.clone(), + )) + }) + .collect::>()?; + let remapped_columns: IndexSet = columns_needed + .iter() + .filter_map(|col| { + let rewritten = + replace_cols_by_name(Expr::Column(col.clone()), &replace_map).ok()?; + if let Expr::Column(c) = rewritten { + Some(c) + } else { + Some(col.clone()) + } + }) + .collect(); + (remapped_pairs, remapped_columns) + } else { + (pairs.to_vec(), columns_needed.clone()) + }; + let pairs = &pairs[..]; + let columns_needed = &columns_needed; + + let num_inputs = inputs.len(); + + // Build per-input column sets using existing schema_columns() + let input_schemas: Vec> = + inputs.iter().map(|i| Arc::clone(i.schema())).collect(); + let input_column_sets: Vec> = + input_schemas.iter().map(|s| schema_columns(s)).collect(); + + // Route pairs and columns to inputs. + // Union: all inputs share the same schema, so broadcast to every branch. + // Everything else (Join, single-input nodes): columns are disjoint across + // inputs, so route each expression to its owning input. + let broadcast = matches!(node, LogicalPlan::Union(_)); + + let mut per_input_pairs: Vec> = vec![vec![]; num_inputs]; + let mut per_input_columns: Vec> = vec![IndexSet::new(); num_inputs]; + + if broadcast { + // Union output schema and each input schema have the same fields by + // index but may differ in qualifiers (e.g. output `s` vs input + // `simple_struct.s`). Remap pairs/columns to each input's space. 
+ let union_schema = node.schema(); + for (idx, input_schema) in input_schemas.iter().enumerate() { + let mut remap = HashMap::new(); + for ((out_q, out_f), (in_q, in_f)) in + union_schema.iter().zip(input_schema.iter()) + { + remap.insert( + qualified_name(out_q, out_f.name()), + Expr::Column(Column::new(in_q.cloned(), in_f.name())), + ); + } + per_input_pairs[idx] = pairs + .iter() + .map(|(expr, alias)| { + Ok((replace_cols_by_name(expr.clone(), &remap)?, alias.clone())) + }) + .collect::>()?; + per_input_columns[idx] = columns_needed + .iter() + .filter_map(|col| { + let rewritten = + replace_cols_by_name(Expr::Column(col.clone()), &remap).ok()?; + if let Expr::Column(c) = rewritten { + Some(c) + } else { + Some(col.clone()) + } + }) + .collect(); + } + } else { + for (expr, alias) in pairs { + match find_owning_input(expr, &input_column_sets) { + Some(idx) => per_input_pairs[idx].push((expr.clone(), alias.clone())), + None => return Ok(None), // Cross-input expression — bail out + } + } + for col in columns_needed { + let col_expr = Expr::Column(col.clone()); + match find_owning_input(&col_expr, &input_column_sets) { + Some(idx) => { + per_input_columns[idx].insert(col.clone()); + } + None => return Ok(None), // Ambiguous column — bail out + } + } } + + // Check at least one input has extractions to push + if per_input_pairs.iter().all(|p| p.is_empty()) { + return Ok(None); + } + + // Build per-input extraction projections and push them as far as possible + // immediately. This is critical because map_children preserves cached schemas, + // so if the TopDown pass later pushes a child further (changing its output + // schema), the parent node's schema becomes stale. 
+ let mut new_inputs: Vec = Vec::with_capacity(num_inputs); + for (idx, input) in inputs.into_iter().enumerate() { + if per_input_pairs[idx].is_empty() { + new_inputs.push(input.clone()); + } else { + let input_arc = Arc::new(input.clone()); + let target_schema = Arc::clone(input.schema()); + let proj = build_extraction_projection_impl( + &per_input_pairs[idx], + &per_input_columns[idx], + &input_arc, + target_schema.as_ref(), + )?; + // Verify all requested aliases appear in the projection's output. + // A merge may deduplicate if the same expression already exists + // under a different alias, leaving the requested alias missing. + let proj_schema = proj.schema.as_ref(); + for (_expr, alias) in &per_input_pairs[idx] { + if !proj_schema.fields().iter().any(|f| f.name() == alias) { + return Ok(None); + } + } + let proj_plan = LogicalPlan::Projection(proj); + // Try to push the extraction projection further down within + // this input (e.g., through Filter → existing extraction projection). + // This ensures the input's output schema is stable and won't change + // when the TopDown pass later visits children. + match try_push_input(&proj_plan, alias_generator)? { + Some(pushed) => new_inputs.push(pushed), + None => new_inputs.push(proj_plan), + } + } + } + + // Rebuild the node with new inputs + let new_node = node.with_new_exprs(node.expressions(), new_inputs)?; + + // Safety check: verify all extracted aliases appear in the rebuilt + // node's output schema. Nodes like Aggregate define their own output + // and won't pass through extracted columns — bail out for those. 
+ let output_schema = new_node.schema(); + for (_expr, alias) in pairs { + if !output_schema.fields().iter().any(|f| f.name() == alias) { + return Ok(None); + } + } + + Ok(Some(new_node)) } #[cfg(test)] @@ -283,13 +1220,24 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id + Projection: test.id, test.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id + Projection: test.id, test.user + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id + Filter: __datafusion_extracted_3 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3, test.id + TableScan: test projection=[id, user] "#) } @@ -332,10 +1280,13 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: mock_leaf(test.user, Utf8("name")) AS mock_leaf(test.user,Utf8("name")) + TableScan: test projection=[user] "#) } @@ -359,10 +1310,13 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 IS NOT NULL AS has_name + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: mock_leaf(test.user, Utf8("name")) IS NOT NULL AS has_name + TableScan: test 
projection=[user] "#) } @@ -408,13 +1362,22 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 IS NOT NULL AND __datafusion_extracted_1 IS NULL + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, test.user + Filter: __datafusion_extracted_2 IS NOT NULL AND __datafusion_extracted_2 IS NULL + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, test.user + Filter: __datafusion_extracted_3 IS NOT NULL AND __datafusion_extracted_3 IS NULL + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, test.id, test.user + TableScan: test projection=[id, user] "#) } @@ -431,13 +1394,22 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 = Utf8("test") + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, test.user + Filter: __datafusion_extracted_2 = Utf8("test") + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, test.user + Filter: __datafusion_extracted_3 = Utf8("test") + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, test.id, test.user + TableScan: test projection=[id, user] "#) } @@ -456,13 +1428,22 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: 
__datafusion_extracted_1 AS mock_leaf(test.user,Utf8("status")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("status")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_2]], aggr=[[COUNT(Int32(1))]] + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_3]], aggr=[[COUNT(Int32(1))]] + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3 + TableScan: test projection=[user] "#) } @@ -484,13 +1465,22 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: test.user, COUNT(__datafusion_extracted_1) AS COUNT(mock_leaf(test.user,Utf8("value"))) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_1)]] + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + Projection: test.user, COUNT(__datafusion_extracted_2) AS COUNT(mock_leaf(test.user,Utf8("value"))) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_2)]] + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_2, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: test.user, COUNT(__datafusion_extracted_3) AS COUNT(mock_leaf(test.user,Utf8("value"))) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_3)]] + Projection: mock_leaf(test.user, Utf8("value")) AS 
__datafusion_extracted_3, test.user + TableScan: test projection=[user] "#) } @@ -509,13 +1499,23 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: mock_leaf(test.user, Utf8("name")) + Projection: test.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")) + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3 + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("name")) + Filter: __datafusion_extracted_4 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + TableScan: test projection=[user] "#) } @@ -535,10 +1535,13 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS username + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: mock_leaf(test.user, Utf8("name")) AS username + TableScan: test projection=[user] "#) } @@ -560,13 +1563,23 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: test.user, mock_leaf(test.user, Utf8("label")) + Projection: test.user + Filter: __datafusion_extracted_1 > Int32(150) + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + 
Projection: test.user, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("label")) + Filter: __datafusion_extracted_2 > Int32(150) + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_2, test.user, mock_leaf(test.user, Utf8("label")) AS __datafusion_extracted_3 + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: test.user, __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("label")) + Filter: __datafusion_extracted_4 > Int32(150) + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_4, test.user, mock_leaf(test.user, Utf8("label")) AS __datafusion_extracted_5 + TableScan: test projection=[user] "#) } @@ -587,10 +1600,13 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_1 AS name2 + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: mock_leaf(test.user, Utf8("name")) AS mock_leaf(test.user,Utf8("name")), mock_leaf(test.user, Utf8("name")) AS name2 + TableScan: test projection=[user] "#) } @@ -617,10 +1633,16 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) + Sort: test.user ASC NULLS FIRST + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Sort: test.user ASC NULLS FIRST + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.user + TableScan: test projection=[user] "#) } @@ -643,10 +1665,16 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 
AS mock_leaf(test.user,Utf8("name")) + Limit: skip=0, fetch=10 + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Limit: skip=0, fetch=10 + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 + TableScan: test projection=[user] "#) } @@ -669,13 +1697,19 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_1) AS cnt]] + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_2) AS cnt]] + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_2, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_3) AS cnt]] + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_3, test.user + TableScan: test projection=[user] "#) } @@ -748,13 +1782,28 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 IS NOT NULL + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user + Projection: test.id, test.user + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, test.user + Filter: __datafusion_extracted_3 IS NOT NULL + Filter: __datafusion_extracted_4 = Utf8("active") + Projection: 
mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3 + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, test.user + Filter: __datafusion_extracted_5 IS NOT NULL + Projection: test.id, test.user, __datafusion_extracted_5 + Filter: __datafusion_extracted_6 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_6, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + TableScan: test projection=[id, user] "#) } @@ -776,10 +1825,13 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: mock_leaf(test.user, Utf8("name")) AS mock_leaf(test.user,Utf8("name")) + TableScan: test projection=[user] "#) } @@ -849,13 +1901,28 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + Projection: test.user + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_3]], aggr=[[COUNT(Int32(1))]] + Filter: __datafusion_extracted_4 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS 
__datafusion_extracted_4, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3 + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_5]], aggr=[[COUNT(Int32(1))]] + Projection: __datafusion_extracted_5 + Filter: __datafusion_extracted_6 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_6, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + TableScan: test projection=[user] "#) } @@ -875,13 +1942,28 @@ mod tests { TableScan: test projection=[a, b, c] ## After Extraction - (same as original) + Projection: test.a, test.b, test.c + Filter: __datafusion_extracted_1 = Int32(2) + Projection: mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_1, test.a, test.b, test.c + Projection: test.a, test.b, test.c + Filter: __datafusion_extracted_2 = Int32(1) + Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c + TableScan: test projection=[a, b, c] ## After Pushdown - (same as after extraction) + Projection: test.a, test.b, test.c + Filter: __datafusion_extracted_3 = Int32(2) + Filter: __datafusion_extracted_4 = Int32(1) + Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_4, test.a, test.b, test.c, mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_3 + TableScan: test projection=[a, b, c] ## Optimized - (same as after pushdown) + Projection: test.a, test.b, test.c + Filter: __datafusion_extracted_5 = Int32(2) + Projection: test.a, test.b, test.c, __datafusion_extracted_5 + Filter: __datafusion_extracted_6 = Int32(1) + Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_6, test.a, test.b, test.c, mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_5 + TableScan: test projection=[a, b, c] "#) } @@ -923,13 +2005,28 @@ mod tests { TableScan: right projection=[id, 
user] ## After Extraction - (same as original) + Projection: test.id, test.user, right.id, right.user + Inner Join: __datafusion_extracted_1 = __datafusion_extracted_2 + Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_2, right.id, right.user + TableScan: right projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, test.user, right.id, right.user + Inner Join: __datafusion_extracted_3 = __datafusion_extracted_4 + Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_3, test.id, test.user + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_4, right.id, right.user + TableScan: right projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, test.user, right.id, right.user + Inner Join: __datafusion_extracted_5 = __datafusion_extracted_6 + Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_5, test.id, test.user + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_6, right.id, right.user + TableScan: right projection=[id, user] "#) } @@ -959,13 +2056,25 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user, right.id, right.user + Inner Join: Filter: test.user = right.user AND __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] + TableScan: right projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, test.user, right.id, right.user + Inner Join: Filter: test.user = right.user AND __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, 
Utf8("status")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, user] + TableScan: right projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, test.user, right.id, right.user + Inner Join: Filter: test.user = right.user AND __datafusion_extracted_3 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3, test.id, test.user + TableScan: test projection=[id, user] + TableScan: right projection=[id, user] "#) } @@ -996,13 +2105,28 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user, right.id, right.user + Inner Join: Filter: test.user = right.user AND __datafusion_extracted_1 = Utf8("active") AND __datafusion_extracted_2 = Utf8("admin") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id, right.user + TableScan: right projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, test.user, right.id, right.user + Inner Join: Filter: test.user = right.user AND __datafusion_extracted_3 = Utf8("active") AND __datafusion_extracted_4 = Utf8("admin") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3, test.id, test.user + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_4, right.id, right.user + TableScan: right projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, test.user, right.id, right.user + Inner Join: Filter: test.user = right.user AND __datafusion_extracted_5 = Utf8("active") AND __datafusion_extracted_6 = Utf8("admin") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_5, test.id, test.user + TableScan: test projection=[id, user] + 
Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_6, right.id, right.user + TableScan: right projection=[id, user] "#) } @@ -1064,13 +2188,34 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, test.user, right.id, right.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, right.id, right.user + Projection: test.id, test.user, right.id, right.user + Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3 + Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user + TableScan: right projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, test.user, right.id, right.user + Filter: __datafusion_extracted_4 = Utf8("active") + Inner Join: __datafusion_extracted_5 = __datafusion_extracted_6 + Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_5, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4 + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_6, right.id, right.user + TableScan: right projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, test.user, right.id, right.user + Filter: __datafusion_extracted_7 = Utf8("active") + Projection: test.id, test.user, __datafusion_extracted_7, right.id, right.user + Inner Join: __datafusion_extracted_8 = __datafusion_extracted_9 + Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_8, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_7 + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("id")) AS 
__datafusion_extracted_9, right.id, right.user + TableScan: right projection=[id, user] "#) } @@ -1102,10 +2247,20 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("status")), __datafusion_extracted_2 AS mock_leaf(right.user,Utf8("role")) + Inner Join: test.id = right.id + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id, right.user + TableScan: right projection=[id, user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")), __datafusion_extracted_4 AS mock_leaf(right.user,Utf8("role")) + Inner Join: test.id = right.id + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3, test.id + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_4, right.id + TableScan: right projection=[id, user] "#) } @@ -1134,10 +2289,16 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS mock_leaf(x,Utf8("a")) + Filter: x IS NOT NULL + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_2 AS mock_leaf(x,Utf8("a")) + Filter: x IS NOT NULL + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_2 + TableScan: test projection=[user] "#) } @@ -1162,10 +2323,16 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 IS NOT NULL AS mock_leaf(x,Utf8("a")) IS NOT NULL + Filter: x IS NOT NULL + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS 
__datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_2 IS NOT NULL AS mock_leaf(x,Utf8("a")) IS NOT NULL + Filter: x IS NOT NULL + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_2 + TableScan: test projection=[user] "#) } @@ -1185,13 +2352,23 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: x + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(x, Utf8("a")) AS __datafusion_extracted_1, x + Projection: test.user AS x + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + Projection: x + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_2, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: x + Filter: __datafusion_extracted_3 = Utf8("active") + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_3 + TableScan: test projection=[user] "#) } @@ -1218,10 +2395,16 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS mock_leaf(sub.user,Utf8("name")) + SubqueryAlias: sub + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_2 AS mock_leaf(sub.user,Utf8("name")) + SubqueryAlias: sub + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 + TableScan: test projection=[user] "#) } @@ -1243,13 +2426,26 @@ mod tests { TableScan: test projection=[user] ## After Extraction - (same as original) + Projection: mock_leaf(sub.user, Utf8("name")) + Projection: sub.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: 
mock_leaf(sub.user, Utf8("status")) AS __datafusion_extracted_1, sub.user + SubqueryAlias: sub + TableScan: test projection=[user] ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_3 AS mock_leaf(sub.user,Utf8("name")) + Filter: __datafusion_extracted_2 = Utf8("active") + SubqueryAlias: sub + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_5 AS mock_leaf(sub.user,Utf8("name")) + Filter: __datafusion_extracted_4 = Utf8("active") + SubqueryAlias: sub + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + TableScan: test projection=[user] "#) } @@ -1274,10 +2470,18 @@ mod tests { (same as original) ## After Pushdown - (same as after extraction) + Projection: __datafusion_extracted_1 AS mock_leaf(outer_sub.user,Utf8("name")) + SubqueryAlias: outer_sub + SubqueryAlias: inner_sub + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + TableScan: test projection=[user] ## Optimized - (same as after pushdown) + Projection: __datafusion_extracted_2 AS mock_leaf(outer_sub.user,Utf8("name")) + SubqueryAlias: outer_sub + SubqueryAlias: inner_sub + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 + TableScan: test projection=[user] "#) } @@ -1360,13 +2564,24 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id + Projection: test.id, test.user + Filter: __datafusion_extracted_1 = Utf8("a") AND __datafusion_extracted_2 = Utf8("b") + Projection: mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_2, test.id, test.user + TableScan: test projection=[id, 
user] ## After Pushdown - (same as after extraction) + Projection: test.id + Projection: test.id, test.user + Filter: __datafusion_extracted_3 = Utf8("a") AND __datafusion_extracted_4 = Utf8("b") + Projection: mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_3, mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_4, test.id, test.user + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id + Filter: __datafusion_extracted_5 = Utf8("a") AND __datafusion_extracted_6 = Utf8("b") + Projection: mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_5, mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_6, test.id + TableScan: test projection=[id, user] "#) } @@ -1391,13 +2606,23 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, mock_leaf(test.user, Utf8("name")) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")) + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3 + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("name")) + Filter: __datafusion_extracted_4 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, test.id, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + TableScan: test projection=[id, user] "#) } @@ -1418,13 +2643,23 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction 
- (same as original) + Projection: test.id, mock_leaf(test.user, Utf8("status")) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")) + Filter: __datafusion_extracted_2 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3 + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("status")) + Filter: __datafusion_extracted_4 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, test.id, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_5 + TableScan: test projection=[id, user] "#) } @@ -1461,13 +2696,28 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, mock_leaf(test.user, Utf8("name")), mock_leaf(right.user, Utf8("status")) + Projection: test.id, test.user, right.id, right.user + Left Join: Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5) + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, right.user + TableScan: right projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_4 AS mock_leaf(right.user,Utf8("status")) + Left Join: Filter: test.id = right.id AND __datafusion_extracted_2 > Int32(5) + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, test.id, test.user + TableScan: test projection=[id, 
user] + Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_2, right.id, right.user, mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_4 + TableScan: right projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, __datafusion_extracted_6 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_7 AS mock_leaf(right.user,Utf8("status")) + Left Join: Filter: test.id = right.id AND __datafusion_extracted_5 > Int32(5) + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_6, test.id + TableScan: test projection=[id, user] + Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_5, right.id, mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_7 + TableScan: right projection=[id, user] "#) } @@ -1492,13 +2742,23 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - (same as original) + Projection: test.id, mock_leaf(test.user, Utf8("name")), mock_leaf(test.user, Utf8("status")) + Projection: test.id, test.user + Filter: __datafusion_extracted_1 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] ## After Pushdown - (same as after extraction) + Projection: test.id, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_4 AS mock_leaf(test.user,Utf8("status")) + Filter: __datafusion_extracted_2 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4 + TableScan: test projection=[id, user] ## Optimized - (same as after pushdown) + Projection: test.id, __datafusion_extracted_6 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_7 AS mock_leaf(test.user,Utf8("status")) + Filter: __datafusion_extracted_5 > Int32(5) + 
Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_5, test.id, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_6, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_7 + TableScan: test projection=[id, user] "#) } } diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index 877a84fe4dc14..118ddef49b7e7 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -43,6 +43,7 @@ use crate::eliminate_join::EliminateJoin; use crate::eliminate_limit::EliminateLimit; use crate::eliminate_outer_join::EliminateOuterJoin; use crate::extract_equijoin_predicate::ExtractEquijoinPredicate; +use crate::extract_leaf_expressions::{ExtractLeafExpressions, PushDownLeafProjections}; use crate::filter_null_join_keys::FilterNullJoinKeys; use crate::optimize_projections::OptimizeProjections; use crate::optimize_unions::OptimizeUnions; @@ -260,6 +261,8 @@ impl Optimizer { // that might benefit from the following rules Arc::new(EliminateGroupByConstant::new()), Arc::new(CommonSubexprEliminate::new()), + Arc::new(ExtractLeafExpressions::new()), + Arc::new(PushDownLeafProjections::new()), Arc::new(OptimizeProjections::new()), ]; diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt index 6f615ec391c9e..c5907d497500e 100644 --- a/datafusion/sqllogictest/test_files/explain.slt +++ b/datafusion/sqllogictest/test_files/explain.slt @@ -197,6 +197,8 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE +logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE +logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE logical_plan after optimize_projections TableScan: simple_explain_test 
projection=[a, b, c] logical_plan after rewrite_set_comparison SAME TEXT AS ABOVE logical_plan after optimize_unions SAME TEXT AS ABOVE @@ -219,6 +221,8 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE +logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE +logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE logical_plan after optimize_projections SAME TEXT AS ABOVE logical_plan TableScan: simple_explain_test projection=[a, b, c] initial_physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true @@ -558,6 +562,8 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE +logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE +logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE logical_plan after optimize_projections TableScan: simple_explain_test projection=[a, b, c] logical_plan after rewrite_set_comparison SAME TEXT AS ABOVE logical_plan after optimize_unions SAME TEXT AS ABOVE @@ -580,6 +586,8 @@ logical_plan after push_down_filter SAME TEXT AS ABOVE logical_plan after single_distinct_aggregation_to_group_by SAME TEXT AS ABOVE logical_plan after eliminate_group_by_constant SAME TEXT AS ABOVE logical_plan after common_sub_expression_eliminate SAME TEXT AS ABOVE +logical_plan after extract_leaf_expressions SAME TEXT AS ABOVE +logical_plan after push_down_leaf_projections SAME TEXT AS ABOVE logical_plan after optimize_projections SAME TEXT AS ABOVE logical_plan TableScan: simple_explain_test 
projection=[a, b, c] initial_physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/example.csv]]}, projection=[a, b, c], file_type=csv, has_header=true diff --git a/datafusion/sqllogictest/test_files/projection.slt b/datafusion/sqllogictest/test_files/projection.slt index e18114bc51ca8..8b3bd4d12c6a2 100644 --- a/datafusion/sqllogictest/test_files/projection.slt +++ b/datafusion/sqllogictest/test_files/projection.slt @@ -244,7 +244,7 @@ query TT explain select column1.c0 from t; ---- logical_plan -01)Projection: get_field(t.column1, Utf8("c0")) +01)Projection: get_field(t.column1, Utf8("c0")) AS t.column1[c0] 02)--TableScan: t projection=[column1] physical_plan 01)ProjectionExec: expr=[get_field(column1@0, c0) as t.column1[c0]] diff --git a/datafusion/sqllogictest/test_files/projection_pushdown.slt b/datafusion/sqllogictest/test_files/projection_pushdown.slt index 50e26b2fb0b85..6dfa66cda51c9 100644 --- a/datafusion/sqllogictest/test_files/projection_pushdown.slt +++ b/datafusion/sqllogictest/test_files/projection_pushdown.slt @@ -104,7 +104,7 @@ query TT EXPLAIN SELECT id, s['value'] FROM simple_struct; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] 02)--TableScan: simple_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet @@ -122,7 +122,7 @@ query TT EXPLAIN SELECT s['label'] FROM simple_struct; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("label")) +01)Projection: get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] 02)--TableScan: simple_struct projection=[s] physical_plan DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, label) as simple_struct.s[label]], file_type=parquet @@ -144,7 +144,7 @@ query TT EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value], get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] 02)--TableScan: simple_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label]], file_type=parquet @@ -166,7 +166,7 @@ query TT EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct; ---- logical_plan -01)Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) +01)Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) AS nested_struct.nested[outer][inner] 02)--TableScan: nested_struct projection=[id, nested] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nested.parquet]]}, projection=[id, get_field(nested@1, outer, inner) as nested_struct.nested[outer][inner]], file_type=parquet @@ -186,7 +186,7 @@ query TT EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) AS simple_struct.s[value] + Int64(1) 02)--TableScan: simple_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet @@ -208,7 +208,7 @@ query TT EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") AS simple_struct.s[label] || Utf8("_suffix") 02)--TableScan: simple_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, label) || _suffix as simple_struct.s[label] || Utf8("_suffix")], file_type=parquet @@ -235,13 +235,14 @@ query TT EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +01)Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value] 02)--Filter: simple_struct.id > Int64(2) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as simple_struct.s[value]] -02)--FilterExec: id@0 > 2 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]] 
+02)--FilterExec: id@1 > 2 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query II @@ -259,13 +260,14 @@ query TT EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +01)Projection: simple_struct.id, __datafusion_extracted_1 + Int64(1) AS simple_struct.s[value] + Int64(1) 02)--Filter: simple_struct.id > Int64(2) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)] -02)--FilterExec: id@0 > 2 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 + 1 as simple_struct.s[value] + Int64(1)] +02)--FilterExec: id@1 > 2 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query 
II @@ -283,13 +285,14 @@ query TT EXPLAIN SELECT id, s['label'] FROM simple_struct WHERE s['value'] > 150; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) -02)--Filter: get_field(simple_struct.s, Utf8("value")) > Int64(150) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] +01)Projection: simple_struct.id, __datafusion_extracted_2 AS simple_struct.s[label] +02)--Filter: __datafusion_extracted_1 > Int64(150) +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2 +04)------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as simple_struct.s[label]] -02)--FilterExec: get_field(s@1, value) > 150 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +01)ProjectionExec: expr=[id@0 as id, __datafusion_extracted_2@1 as simple_struct.s[label]] +02)--FilterExec: __datafusion_extracted_1@0 > 150, projection=[id@1, __datafusion_extracted_2@2] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id, get_field(s@1, label) as __datafusion_extracted_2], file_type=parquet # Verify correctness query IT @@ -313,7 +316,7 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] 
03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -338,7 +341,7 @@ EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) AS simple_struct.s[value] + Int64(1) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -363,7 +366,7 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY s['value']; ---- logical_plan 01)Sort: simple_struct.s[value] ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] @@ -437,7 +440,7 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id LIMIT 3; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -460,7 +463,7 @@ EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id LIMIT 3; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) AS simple_struct.s[value] + Int64(1) 
03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -483,7 +486,7 @@ EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id LIMIT 3 ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value], get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -506,7 +509,7 @@ EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id LIMIT ---- logical_plan 01)Sort: nested_struct.id ASC NULLS LAST, fetch=2 -02)--Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) +02)--Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) AS nested_struct.nested[outer][inner] 03)----TableScan: nested_struct projection=[id, nested] physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -528,7 +531,7 @@ EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct ORDER BY id LIMIT ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") AS simple_struct.s[label] || Utf8("_suffix") 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -556,14 +559,15 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value' ---- 
logical_plan 01)Sort: simple_struct.s[value] ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value] 03)----Filter: simple_struct.id > Int64(1) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan 01)SortExec: expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] -02)--ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as simple_struct.s[value]] -03)----FilterExec: id@0 > 1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +02)--ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]] +03)----FilterExec: id@1 > 1 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query II @@ -583,14 +587,15 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1 ORDER BY s['value' ---- logical_plan 01)Sort: simple_struct.s[value] ASC NULLS LAST, fetch=2 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[value] 03)----Filter: simple_struct.id > Int64(1) 
-04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan 01)SortExec: TopK(fetch=2), expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] -02)--ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as simple_struct.s[value]] -03)----FilterExec: id@0 > 1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +02)--ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]] +03)----FilterExec: id@1 > 1 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query II @@ -608,14 +613,15 @@ EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct WHERE id > 1 ORDER BY id LI ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=2 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) +02)--Projection: simple_struct.id, __datafusion_extracted_1 + Int64(1) AS simple_struct.s[value] + Int64(1) 03)----Filter: simple_struct.id > Int64(1) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: 
simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] -02)--ProjectionExec: expr=[id@0 as id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)] -03)----FilterExec: id@0 > 1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1 AND DynamicFilter [ empty ], pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +02)--ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 + 1 as simple_struct.s[value] + Int64(1)] +03)----FilterExec: id@1 > 1 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND DynamicFilter [ empty ], pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query II @@ -673,7 +679,7 @@ EXPLAIN SELECT id, s['value'] FROM multi_struct ORDER BY id; ---- logical_plan 01)Sort: multi_struct.id ASC NULLS LAST -02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) AS multi_struct.s[value] 03)----TableScan: multi_struct projection=[id, s] physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST] @@ -699,7 +705,7 @@ EXPLAIN SELECT id, s['value'] FROM multi_struct ORDER BY id LIMIT 3; ---- logical_plan 01)Sort: multi_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) AS multi_struct.s[value] 03)----TableScan: multi_struct projection=[id, s] 
physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=3 @@ -723,7 +729,7 @@ EXPLAIN SELECT id, s['value'] + 1 FROM multi_struct ORDER BY id LIMIT 3; ---- logical_plan 01)Sort: multi_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) + Int64(1) +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) + Int64(1) AS multi_struct.s[value] + Int64(1) 03)----TableScan: multi_struct projection=[id, s] physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=3 @@ -747,16 +753,17 @@ EXPLAIN SELECT id, s['value'] FROM multi_struct WHERE id > 2 ORDER BY id; ---- logical_plan 01)Sort: multi_struct.id ASC NULLS LAST -02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) +02)--Projection: multi_struct.id, __datafusion_extracted_1 AS multi_struct.s[value] 03)----Filter: multi_struct.id > Int64(2) -04)------TableScan: multi_struct projection=[id, s], partial_filters=[multi_struct.id > Int64(2)] +04)------Projection: get_field(multi_struct.s, Utf8("value")) AS __datafusion_extracted_1, multi_struct.id +05)--------TableScan: multi_struct projection=[id, s], partial_filters=[multi_struct.id > Int64(2)] physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST] 02)--SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as multi_struct.s[value]] -04)------FilterExec: id@0 > 2 +03)----ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as multi_struct.s[value]] +04)------FilterExec: id@1 > 2 05)--------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=3 -06)----------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +06)----------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query II @@ -774,13 +781,16 @@ query TT EXPLAIN SELECT s['label'], SUM(s['value']) FROM multi_struct GROUP BY s['label']; ---- logical_plan -01)Aggregate: groupBy=[[get_field(multi_struct.s, Utf8("label"))]], aggr=[[sum(get_field(multi_struct.s, Utf8("value")))]] -02)--TableScan: multi_struct projection=[s] +01)Projection: __datafusion_extracted_1 AS multi_struct.s[label], sum(__datafusion_extracted_2) AS sum(multi_struct.s[value]) +02)--Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[sum(__datafusion_extracted_2)]] +03)----Projection: get_field(multi_struct.s, Utf8("label")) AS __datafusion_extracted_1, get_field(multi_struct.s, Utf8("value")) AS __datafusion_extracted_2 +04)------TableScan: multi_struct projection=[s] physical_plan -01)AggregateExec: 
mode=FinalPartitioned, gby=[multi_struct.s[label]@0 as multi_struct.s[label]], aggr=[sum(multi_struct.s[value])] -02)--RepartitionExec: partitioning=Hash([multi_struct.s[label]@0], 4), input_partitions=3 -03)----AggregateExec: mode=Partial, gby=[get_field(s@0, label) as multi_struct.s[label]], aggr=[sum(multi_struct.s[value])] -04)------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, projection=[s], file_type=parquet +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as multi_struct.s[label], sum(__datafusion_extracted_2)@1 as sum(multi_struct.s[value])] +02)--AggregateExec: mode=FinalPartitioned, gby=[__datafusion_extracted_1@0 as __datafusion_extracted_1], aggr=[sum(__datafusion_extracted_2)] +03)----RepartitionExec: partitioning=Hash([__datafusion_extracted_1@0], 4), input_partitions=3 +04)------AggregateExec: mode=Partial, gby=[__datafusion_extracted_1@0 as __datafusion_extracted_1], aggr=[sum(__datafusion_extracted_2)] +05)--------DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part1.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part3.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part4.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/multi/part5.parquet]]}, 
projection=[get_field(s@1, label) as __datafusion_extracted_1, get_field(s@1, value) as __datafusion_extracted_2], file_type=parquet # Verify correctness query TI @@ -809,7 +819,7 @@ query TT EXPLAIN SELECT id, s['value'] FROM nullable_struct; ---- logical_plan -01)Projection: nullable_struct.id, get_field(nullable_struct.s, Utf8("value")) +01)Projection: nullable_struct.id, get_field(nullable_struct.s, Utf8("value")) AS nullable_struct.s[value] 02)--TableScan: nullable_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[id, get_field(s@1, value) as nullable_struct.s[value]], file_type=parquet @@ -831,13 +841,14 @@ query TT EXPLAIN SELECT id, s['label'] FROM nullable_struct WHERE s['value'] IS NOT NULL; ---- logical_plan -01)Projection: nullable_struct.id, get_field(nullable_struct.s, Utf8("label")) -02)--Filter: get_field(nullable_struct.s, Utf8("value")) IS NOT NULL -03)----TableScan: nullable_struct projection=[id, s], partial_filters=[get_field(nullable_struct.s, Utf8("value")) IS NOT NULL] +01)Projection: nullable_struct.id, __datafusion_extracted_2 AS nullable_struct.s[label] +02)--Filter: __datafusion_extracted_1 IS NOT NULL +03)----Projection: get_field(nullable_struct.s, Utf8("value")) AS __datafusion_extracted_1, nullable_struct.id, get_field(nullable_struct.s, Utf8("label")) AS __datafusion_extracted_2 +04)------TableScan: nullable_struct projection=[id, s], partial_filters=[get_field(nullable_struct.s, Utf8("value")) IS NOT NULL] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as nullable_struct.s[label]] -02)--FilterExec: get_field(s@1, value) IS NOT NULL -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[id, s], file_type=parquet +01)ProjectionExec: expr=[id@0 as id, 
__datafusion_extracted_2@1 as nullable_struct.s[label]] +02)--FilterExec: __datafusion_extracted_1@0 IS NOT NULL, projection=[id@1, __datafusion_extracted_2@2] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id, get_field(s@1, label) as __datafusion_extracted_2], file_type=parquet # Verify correctness query IT @@ -856,7 +867,7 @@ EXPLAIN SELECT id, s['value'], s['value'] + 10, s['label'] FROM simple_struct OR ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("value")) + Int64(10), get_field(simple_struct.s, Utf8("label")) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value], get_field(simple_struct.s, Utf8("value")) + Int64(10) AS simple_struct.s[value] + Int64(10), get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -954,27 +965,29 @@ EXPLAIN SELECT (id + s['value']) * (id + s['value']) as id_and_value FROM simple ---- logical_plan 01)Projection: __common_expr_1 * __common_expr_1 AS id_and_value -02)--Projection: simple_struct.id + get_field(simple_struct.s, Utf8("value")) AS __common_expr_1 +02)--Projection: simple_struct.id + __datafusion_extracted_2 AS __common_expr_1 03)----Filter: simple_struct.id > Int64(2) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_2, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan 01)ProjectionExec: 
expr=[__common_expr_1@0 * __common_expr_1@0 as id_and_value] -02)--ProjectionExec: expr=[id@0 + get_field(s@1, value) as __common_expr_1] -03)----FilterExec: id@0 > 2 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +02)--ProjectionExec: expr=[id@1 + __datafusion_extracted_2@0 as __common_expr_1] +03)----FilterExec: id@1 > 2 +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] query TT EXPLAIN SELECT s['value'] + s['value'] as doubled FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) + get_field(simple_struct.s, Utf8("value")) AS doubled +01)Projection: __datafusion_extracted_1 + __datafusion_extracted_1 AS doubled 02)--Filter: simple_struct.id > Int64(2) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) + get_field(s@0, value) as doubled] -02)--FilterExec: id@0 > 2, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, 
required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 + __datafusion_extracted_1@0 as doubled] +02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query I @@ -992,13 +1005,14 @@ query TT EXPLAIN SELECT s['value'], s['label'] FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) +01)Projection: __datafusion_extracted_1 AS simple_struct.s[value], __datafusion_extracted_2 AS simple_struct.s[label] 02)--Filter: simple_struct.id > Int64(2) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value], get_field(s@0, label) as simple_struct.s[label]] -02)--FilterExec: id@0 > 2, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value], __datafusion_extracted_2@1 as simple_struct.s[label]] +02)--FilterExec: id@2 > 2, projection=[__datafusion_extracted_1@0, 
__datafusion_extracted_2@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query IT @@ -1041,13 +1055,14 @@ query TT EXPLAIN SELECT s['value'] * 2 + length(s['label']) as score FROM simple_struct WHERE id > 1; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) * Int64(2) + CAST(character_length(get_field(simple_struct.s, Utf8("label"))) AS length(get_field(simple_struct.s, Utf8("label"))) AS Int64) AS score +01)Projection: __datafusion_extracted_1 * Int64(2) + CAST(character_length(__datafusion_extracted_2) AS length(get_field(simple_struct.s, Utf8("label"))) AS Int64) AS score 02)--Filter: simple_struct.id > Int64(1) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) * 2 + CAST(character_length(get_field(s@0, label)) AS Int64) as score] -02)--FilterExec: id@0 > 1, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 * 2 + CAST(character_length(__datafusion_extracted_2@1) AS Int64) as score] 
+02)--FilterExec: id@2 > 1, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query I @@ -1073,7 +1088,7 @@ EXPLAIN SELECT id, 42 as answer, s['label'] FROM simple_struct ORDER BY id LIMIT ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=2 -02)--Projection: simple_struct.id, Int64(42) AS answer, get_field(simple_struct.s, Utf8("label")) +02)--Projection: simple_struct.id, Int64(42) AS answer, get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -1096,7 +1111,7 @@ EXPLAIN SELECT id, s['value'] + 100, s['label'] || '_test' FROM simple_struct OR ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=2 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(100), get_field(simple_struct.s, Utf8("label")) || Utf8("_test") +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(100) AS simple_struct.s[value] + Int64(100), get_field(simple_struct.s, Utf8("label")) || Utf8("_test") AS simple_struct.s[label] || Utf8("_test") 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -1117,13 +1132,14 @@ query TT EXPLAIN SELECT id, s['value'] FROM simple_struct WHERE id > 1; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +01)Projection: simple_struct.id, 
__datafusion_extracted_1 AS simple_struct.s[value] 02)--Filter: simple_struct.id > Int64(1) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, value) as simple_struct.s[value]] -02)--FilterExec: id@0 > 1 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[value]] +02)--FilterExec: id@1 > 1 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness query II @@ -1136,13 +1152,14 @@ query TT EXPLAIN SELECT s['value'] FROM simple_struct WHERE id > 1 AND (id < 4 OR id = 5); ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS simple_struct.s[value] 02)--Filter: simple_struct.id > Int64(1) AND (simple_struct.id < Int64(4) OR simple_struct.id = Int64(5)) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(4) OR simple_struct.id = Int64(5)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], 
partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(4) OR simple_struct.id = Int64(5)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value]] -02)--FilterExec: id@0 > 1 AND (id@0 < 4 OR id@0 = 5), projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1 AND (id@0 < 4 OR id@0 = 5), pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND (id_null_count@1 != row_count@2 AND id_min@3 < 4 OR id_null_count@1 != row_count@2 AND id_min@3 <= 5 AND 5 <= id_max@0), required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value]] +02)--FilterExec: id@1 > 1 AND (id@1 < 4 OR id@1 = 5), projection=[__datafusion_extracted_1@0] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND (id@0 < 4 OR id@0 = 5), pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND (id_null_count@1 != row_count@2 AND id_min@3 < 4 OR id_null_count@1 != row_count@2 AND id_min@3 <= 5 AND 5 <= id_max@0), required_guarantees=[] # Verify correctness - should return rows where (id > 1) AND ((id < 4) OR (id = 5)) # That's: id=2,3 (1 1 AND id < 5; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS simple_struct.s[value] 02)--Filter: simple_struct.id > Int64(1) AND simple_struct.id < Int64(5) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(5)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct 
projection=[id, s], partial_filters=[simple_struct.id > Int64(1), simple_struct.id < Int64(5)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value]] -02)--FilterExec: id@0 > 1 AND id@0 < 5, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1 AND id@0 < 5, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND id_null_count@1 != row_count@2 AND id_min@3 < 5, required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value]] +02)--FilterExec: id@1 > 1 AND id@1 < 5, projection=[__datafusion_extracted_1@0] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 1 AND id@0 < 5, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1 AND id_null_count@1 != row_count@2 AND id_min@3 < 5, required_guarantees=[] # Verify correctness - should return rows where 1 < id < 5 (id=2,3,4) query I @@ -1177,13 +1195,14 @@ query TT EXPLAIN SELECT s['value'], s['label'], id FROM simple_struct WHERE id > 1; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")), simple_struct.id +01)Projection: __datafusion_extracted_1 AS simple_struct.s[value], __datafusion_extracted_2 AS simple_struct.s[label], simple_struct.id 02)--Filter: simple_struct.id > Int64(1) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id +04)------TableScan: simple_struct projection=[id, 
s], partial_filters=[simple_struct.id > Int64(1)] physical_plan -01)ProjectionExec: expr=[get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label], id@0 as id] -02)--FilterExec: id@0 > 1 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value], __datafusion_extracted_2@1 as simple_struct.s[label], id@2 as id] +02)--FilterExec: id@2 > 1 +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 1, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 1, required_guarantees=[] # Verify correctness - note that id is now at index 2 in the augmented projection query ITI @@ -1197,13 +1216,14 @@ query TT EXPLAIN SELECT s['value'] FROM simple_struct WHERE length(s['label']) > 4; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) -02)--Filter: character_length(get_field(simple_struct.s, Utf8("label"))) > Int32(4) -03)----TableScan: simple_struct projection=[s], partial_filters=[character_length(get_field(simple_struct.s, Utf8("label"))) > Int32(4)] +01)Projection: __datafusion_extracted_2 AS simple_struct.s[value] +02)--Filter: character_length(__datafusion_extracted_1) > Int32(4) +03)----Projection: get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_2 +04)------TableScan: simple_struct projection=[s], partial_filters=[character_length(get_field(simple_struct.s, 
Utf8("label"))) > Int32(4)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value]] -02)--FilterExec: character_length(get_field(s@0, label)) > 4 -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[s], file_type=parquet +01)ProjectionExec: expr=[__datafusion_extracted_2@0 as simple_struct.s[value]] +02)--FilterExec: character_length(__datafusion_extracted_1@0) > 4, projection=[__datafusion_extracted_2@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, label) as __datafusion_extracted_1, get_field(s@1, value) as __datafusion_extracted_2], file_type=parquet # Verify correctness - filter on rows where label length > 4 (all have length 5, except 'one' has 3) # Wait, from the data: alpha(5), beta(4), gamma(5), delta(5), epsilon(7) @@ -1230,12 +1250,13 @@ EXPLAIN SELECT id FROM simple_struct ORDER BY s['value']; ---- logical_plan 01)Projection: simple_struct.id -02)--Sort: get_field(simple_struct.s, Utf8("value")) ASC NULLS LAST -03)----TableScan: simple_struct projection=[id, s] +02)--Sort: __datafusion_extracted_1 ASC NULLS LAST +03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1 +04)------TableScan: simple_struct projection=[id, s] physical_plan 01)ProjectionExec: expr=[id@0 as id] -02)--SortExec: expr=[get_field(s@1, value) ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +02)--SortExec: expr=[__datafusion_extracted_1@1 ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as __datafusion_extracted_1], file_type=parquet # Verify correctness query I @@ -1258,13 +1279,13 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id, s['label']; ---- logical_plan 01)Projection: simple_struct.id, simple_struct.s[value] -02)--Sort: simple_struct.id ASC NULLS LAST, get_field(simple_struct.s, Utf8("label")) ASC NULLS LAST -03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), simple_struct.s +02)--Sort: simple_struct.id ASC NULLS LAST, __datafusion_extracted_1 ASC NULLS LAST +03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value], get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_1 04)------TableScan: simple_struct projection=[id, s] physical_plan 01)ProjectionExec: expr=[id@0 as id, simple_struct.s[value]@1 as simple_struct.s[value]] -02)--SortExec: expr=[id@0 ASC NULLS LAST, get_field(s@2, label) ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], s], file_type=parquet +02)--SortExec: expr=[id@0 ASC NULLS LAST, __datafusion_extracted_1@2 ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as __datafusion_extracted_1], file_type=parquet # Verify correctness query II @@ -1287,12 +1308,13 @@ EXPLAIN SELECT id FROM simple_struct ORDER BY s['value'] LIMIT 2; ---- logical_plan 01)Projection: simple_struct.id -02)--Sort: get_field(simple_struct.s, Utf8("value")) ASC NULLS LAST, fetch=2 
-03)----TableScan: simple_struct projection=[id, s] +02)--Sort: __datafusion_extracted_1 ASC NULLS LAST, fetch=2 +03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1 +04)------TableScan: simple_struct projection=[id, s] physical_plan 01)ProjectionExec: expr=[id@0 as id] -02)--SortExec: TopK(fetch=2), expr=[get_field(s@1, value) ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +02)--SortExec: TopK(fetch=2), expr=[__datafusion_extracted_1@1 ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as __datafusion_extracted_1], file_type=parquet # Verify correctness query I @@ -1312,12 +1334,13 @@ EXPLAIN SELECT id FROM simple_struct ORDER BY s['value'] * 2; ---- logical_plan 01)Projection: simple_struct.id -02)--Sort: get_field(simple_struct.s, Utf8("value")) * Int64(2) ASC NULLS LAST -03)----TableScan: simple_struct projection=[id, s] +02)--Sort: __datafusion_extracted_1 * Int64(2) ASC NULLS LAST +03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1 +04)------TableScan: simple_struct projection=[id, s] physical_plan 01)ProjectionExec: expr=[id@0 as id] -02)--SortExec: expr=[get_field(s@1, value) * 2 ASC NULLS LAST], preserve_partitioning=[false] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +02)--SortExec: expr=[__datafusion_extracted_1@1 * 2 ASC NULLS LAST], preserve_partitioning=[false] +03)----DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as __datafusion_extracted_1], file_type=parquet # Verify correctness query I @@ -1339,7 +1362,7 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id, s['value']; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, simple_struct.s[value] ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: expr=[id@0 ASC NULLS LAST, simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] @@ -1390,13 +1413,15 @@ INNER JOIN join_right ON simple_struct.s['value'] = join_right.s['level'] * 10; ---- logical_plan 01)Projection: simple_struct.id, join_right.id -02)--Inner Join: get_field(simple_struct.s, Utf8("value")) = get_field(join_right.s, Utf8("level")) * Int64(10) -03)----TableScan: simple_struct projection=[id, s] -04)----TableScan: join_right projection=[id, s] +02)--Inner Join: __datafusion_extracted_1 = __datafusion_extracted_2 * Int64(10) +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s] +05)----Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_2, join_right.id +06)------TableScan: join_right projection=[id, s] physical_plan -01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(simple_struct.s[value]@2, join_right.s[level] * Int64(10)@2)], projection=[id@0, id@3] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet -03)--DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id, s, get_field(s@1, level) * 10 as join_right.s[level] * Int64(10)], file_type=parquet +01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(__datafusion_extracted_1@0, __datafusion_extracted_2 * Int64(10)@2)], projection=[id@1, id@3] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id, get_field(s@1, level) * 10 as __datafusion_extracted_2 * Int64(10)], file_type=parquet # Verify correctness - value = level * 10 # simple_struct: (1,100), (2,200), (3,150), (4,300), (5,250) @@ -1424,13 +1449,14 @@ WHERE simple_struct.s['value'] > 150; logical_plan 01)Inner Join: simple_struct.id = join_right.id 02)--Projection: simple_struct.id -03)----Filter: get_field(simple_struct.s, Utf8("value")) > Int64(150) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] -05)--TableScan: join_right projection=[id] +03)----Filter: __datafusion_extracted_1 > Int64(150) +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(150)] +06)--TableScan: join_right projection=[id] physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)] -02)--FilterExec: get_field(s@1, value) > 150, projection=[id@0] -03)----DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +02)--FilterExec: __datafusion_extracted_1@0 > 150, projection=[id@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet 04)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ] # Verify correctness - id matches and value > 150 @@ -1459,17 +1485,19 @@ WHERE simple_struct.s['value'] > 100 AND join_right.s['level'] > 3; logical_plan 01)Inner Join: simple_struct.id = join_right.id 02)--Projection: simple_struct.id -03)----Filter: get_field(simple_struct.s, Utf8("value")) > Int64(100) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(100)] -05)--Projection: join_right.id -06)----Filter: get_field(join_right.s, Utf8("level")) > Int64(3) -07)------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(3)] +03)----Filter: __datafusion_extracted_1 > Int64(100) +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(100)] +06)--Projection: join_right.id +07)----Filter: __datafusion_extracted_2 > Int64(3) +08)------Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_2, join_right.id +09)--------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(3)] physical_plan 01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)] 
-02)--FilterExec: get_field(s@1, value) > 100, projection=[id@0] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet -04)--FilterExec: get_field(s@1, level) > 3, projection=[id@0] -05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id, s], file_type=parquet, predicate=DynamicFilter [ empty ] +02)--FilterExec: __datafusion_extracted_1@0 > 100, projection=[id@1] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet +04)--FilterExec: __datafusion_extracted_2@0 > 3, projection=[id@1] +05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id], file_type=parquet # Verify correctness - id matches, value > 100, and level > 3 # Matching ids where value > 100: 2(200), 3(150), 4(300), 5(250) @@ -1495,15 +1523,17 @@ FROM simple_struct INNER JOIN join_right ON simple_struct.id = join_right.id; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")), get_field(join_right.s, Utf8("role")) +01)Projection: simple_struct.id, __datafusion_extracted_1 AS simple_struct.s[label], __datafusion_extracted_2 AS join_right.s[role] 02)--Inner Join: simple_struct.id = join_right.id -03)----TableScan: simple_struct projection=[id, s] -04)----TableScan: join_right projection=[id, s] +03)----Projection: get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s] +05)----Projection: get_field(join_right.s, 
Utf8("role")) AS __datafusion_extracted_2, join_right.id +06)------TableScan: join_right projection=[id, s] physical_plan -01)ProjectionExec: expr=[id@0 as id, get_field(s@1, label) as simple_struct.s[label], get_field(s@2, role) as join_right.s[role]] -02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)], projection=[id@0, s@1, s@3] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet -04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id, s], file_type=parquet, predicate=DynamicFilter [ empty ] +01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_1@0 as simple_struct.s[label], __datafusion_extracted_2@2 as join_right.s[role]] +02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@1, id@1)], projection=[__datafusion_extracted_1@0, id@1, __datafusion_extracted_2@2] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, label) as __datafusion_extracted_1, id], file_type=parquet +04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, role) as __datafusion_extracted_2, id], file_type=parquet, predicate=DynamicFilter [ empty ] # Verify correctness query ITT @@ -1561,17 +1591,20 @@ FROM simple_struct LEFT JOIN join_right ON simple_struct.id = join_right.id AND join_right.s['level'] > 5; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(join_right.s, Utf8("level")) +01)Projection: simple_struct.id, __datafusion_extracted_2 AS simple_struct.s[value], __datafusion_extracted_3 AS join_right.s[level] 02)--Left Join: 
simple_struct.id = join_right.id -03)----TableScan: simple_struct projection=[id, s] -04)----Filter: get_field(join_right.s, Utf8("level")) > Int64(5) -05)------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(5)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_2, simple_struct.id +04)------TableScan: simple_struct projection=[id, s] +05)----Projection: join_right.id, __datafusion_extracted_3 +06)------Filter: __datafusion_extracted_1 > Int64(5) +07)--------Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_1, join_right.id, get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_3 +08)----------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("level")) > Int64(5)] physical_plan -01)ProjectionExec: expr=[id@1 as id, get_field(s@2, value) as simple_struct.s[value], get_field(s@0, level) as join_right.s[level]] -02)--HashJoinExec: mode=CollectLeft, join_type=Right, on=[(id@0, id@0)], projection=[s@1, id@2, s@3] -03)----FilterExec: get_field(s@1, level) > 5 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[id, s], file_type=parquet -05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +01)ProjectionExec: expr=[id@1 as id, __datafusion_extracted_2@0 as simple_struct.s[value], __datafusion_extracted_3@2 as join_right.s[level]] +02)--HashJoinExec: mode=CollectLeft, join_type=Left, on=[(id@1, id@0)], projection=[__datafusion_extracted_2@0, id@1, __datafusion_extracted_3@3] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as 
__datafusion_extracted_2, id], file_type=parquet +04)----FilterExec: __datafusion_extracted_1@0 > 5, projection=[id@1, __datafusion_extracted_3@2] +05)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_1, id, get_field(s@1, level) as __datafusion_extracted_3], file_type=parquet # Verify correctness - left join with level > 5 condition # Only join_right rows with level > 5 are matched: id=1 (level=10), id=4 (level=8) @@ -1599,14 +1632,15 @@ query TT EXPLAIN SELECT s['value'] FROM simple_struct WHERE id > 2; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS simple_struct.s[value] 02)--Filter: simple_struct.id > Int64(2) -03)----TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as simple_struct.s[value]] -02)--FilterExec: id@0 > 2, projection=[s@1] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as simple_struct.s[value]] +02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0] 03)----RepartitionExec: partitioning=RoundRobinBatch(32), input_partitions=1 -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as 
__datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] ##################### # Section 14: SubqueryAlias tests @@ -1621,15 +1655,16 @@ query TT EXPLAIN SELECT t.s['value'] FROM (SELECT * FROM simple_struct) t WHERE t.id > 2; ---- logical_plan -01)Projection: get_field(t.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS t.s[value] 02)--SubqueryAlias: t -03)----Projection: simple_struct.s +03)----Projection: __datafusion_extracted_1 04)------Filter: simple_struct.id > Int64(2) -05)--------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +05)--------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +06)----------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as t.s[value]] -02)--FilterExec: id@0 > 2, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as t.s[value]] +02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query I @@ -1645,9 +1680,10 @@ EXPLAIN SELECT t.s['value'], t.s['label'] FROM (SELECT * FROM simple_struct) t O ---- logical_plan 01)Sort: t.s[value] ASC NULLS LAST 
-02)--Projection: get_field(t.s, Utf8("value")), get_field(t.s, Utf8("label")) +02)--Projection: __datafusion_extracted_1 AS t.s[value], __datafusion_extracted_2 AS t.s[label] 03)----SubqueryAlias: t -04)------TableScan: simple_struct projection=[s] +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2 +05)--------TableScan: simple_struct projection=[s] physical_plan 01)SortExec: expr=[t.s[value]@0 ASC NULLS LAST], preserve_partitioning=[false] 02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as t.s[value], get_field(s@1, label) as t.s[label]], file_type=parquet @@ -1667,16 +1703,17 @@ query TT EXPLAIN SELECT u.s['value'] FROM (SELECT * FROM (SELECT * FROM simple_struct) t) u WHERE u.id > 2; ---- logical_plan -01)Projection: get_field(u.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS u.s[value] 02)--SubqueryAlias: u 03)----SubqueryAlias: t -04)------Projection: simple_struct.s +04)------Projection: __datafusion_extracted_1 05)--------Filter: simple_struct.id > Int64(2) -06)----------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] +06)----------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +07)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(2)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as u.s[value]] -02)--FilterExec: id@0 > 2, projection=[s@1] -03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] +01)ProjectionExec: 
expr=[__datafusion_extracted_1@0 as u.s[value]] +02)--FilterExec: id@1 > 2, projection=[__datafusion_extracted_1@0] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 2, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 2, required_guarantees=[] # Verify correctness query I @@ -1693,11 +1730,12 @@ EXPLAIN SELECT t.id FROM (SELECT * FROM simple_struct) t WHERE t.s['value'] > 20 logical_plan 01)SubqueryAlias: t 02)--Projection: simple_struct.id -03)----Filter: get_field(simple_struct.s, Utf8("value")) > Int64(200) -04)------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(200)] +03)----Filter: __datafusion_extracted_1 > Int64(200) +04)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +05)--------TableScan: simple_struct projection=[id, s], partial_filters=[get_field(simple_struct.s, Utf8("value")) > Int64(200)] physical_plan -01)FilterExec: get_field(s@1, value) > 200, projection=[id@0] -02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet +01)FilterExec: __datafusion_extracted_1@0 > 200, projection=[id@1] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet # Verify correctness query I @@ -1719,22 +1757,24 @@ EXPLAIN SELECT s['value'] FROM ( ) t; ---- logical_plan -01)Projection: get_field(t.s, Utf8("value")) +01)Projection: __datafusion_extracted_1 AS t.s[value] 02)--SubqueryAlias: t 03)----Union -04)------Projection: simple_struct.s 
+04)------Projection: __datafusion_extracted_1 05)--------Filter: simple_struct.id <= Int64(3) -06)----------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)] -07)------Projection: simple_struct.s -08)--------Filter: simple_struct.id > Int64(3) -09)----------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)] +06)----------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +07)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)] +08)------Projection: __datafusion_extracted_1 +09)--------Filter: simple_struct.id > Int64(3) +10)----------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +11)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)] physical_plan -01)ProjectionExec: expr=[get_field(s@0, value) as t.s[value]] +01)ProjectionExec: expr=[__datafusion_extracted_1@0 as t.s[value]] 02)--UnionExec -03)----FilterExec: id@0 <= 3, projection=[s@1] -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[] -05)----FilterExec: id@0 > 3, projection=[s@1] -06)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[] +03)----FilterExec: id@1 <= 3, projection=[__datafusion_extracted_1@0] +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, 
projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[] +05)----FilterExec: id@1 > 3, projection=[__datafusion_extracted_1@0] +06)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[] # Verify correctness query I @@ -1760,24 +1800,26 @@ EXPLAIN SELECT s['value'], s['label'] FROM ( ---- logical_plan 01)Sort: t.s[value] ASC NULLS LAST -02)--Projection: get_field(t.s, Utf8("value")), get_field(t.s, Utf8("label")) +02)--Projection: __datafusion_extracted_1 AS t.s[value], __datafusion_extracted_2 AS t.s[label] 03)----SubqueryAlias: t 04)------Union -05)--------Projection: simple_struct.s +05)--------Projection: __datafusion_extracted_1, __datafusion_extracted_2 06)----------Filter: simple_struct.id <= Int64(3) -07)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)] -08)--------Projection: simple_struct.s -09)----------Filter: simple_struct.id > Int64(3) -10)------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)] +07)------------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id +08)--------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id <= Int64(3)] +09)--------Projection: __datafusion_extracted_1, __datafusion_extracted_2 +10)----------Filter: simple_struct.id > Int64(3) +11)------------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, 
Utf8("label")) AS __datafusion_extracted_2, simple_struct.id +12)--------------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(3)] physical_plan 01)SortPreservingMergeExec: [t.s[value]@0 ASC NULLS LAST] 02)--SortExec: expr=[t.s[value]@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----ProjectionExec: expr=[get_field(s@0, value) as t.s[value], get_field(s@0, label) as t.s[label]] +03)----ProjectionExec: expr=[__datafusion_extracted_1@0 as t.s[value], __datafusion_extracted_2@1 as t.s[label]] 04)------UnionExec -05)--------FilterExec: id@0 <= 3, projection=[s@1] -06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[] -07)--------FilterExec: id@0 > 3, projection=[s@1] -08)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, s], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[] +05)--------FilterExec: id@2 <= 3, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1] +06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 <= 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_min@0 <= 3, required_guarantees=[] +07)--------FilterExec: id@2 > 3, projection=[__datafusion_extracted_1@0, __datafusion_extracted_2@1] +08)----------DataSourceExec: file_groups={1 group: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, label) as __datafusion_extracted_2, id], file_type=parquet, predicate=id@0 > 3, pruning_predicate=id_null_count@1 != row_count@2 AND id_max@0 > 3, required_guarantees=[] # Verify correctness query IT diff --git a/datafusion/sqllogictest/test_files/push_down_filter.slt b/datafusion/sqllogictest/test_files/push_down_filter.slt index b1cb354e053e4..edafcfaa543f2 100644 --- a/datafusion/sqllogictest/test_files/push_down_filter.slt +++ b/datafusion/sqllogictest/test_files/push_down_filter.slt @@ -116,11 +116,12 @@ explain select * from (select column1, unnest(column2) as o from d) where o['a'] ---- physical_plan 01)ProjectionExec: expr=[column1@0 as column1, __unnest_placeholder(d.column2,depth=1)@1 as o] -02)--FilterExec: get_field(__unnest_placeholder(d.column2,depth=1)@1, a) = 1 +02)--FilterExec: __datafusion_extracted_1@0 = 1, projection=[column1@1, __unnest_placeholder(d.column2,depth=1)@2] 03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -04)------UnnestExec -05)--------ProjectionExec: expr=[column1@0 as column1, column2@1 as __unnest_placeholder(d.column2)] -06)----------DataSourceExec: partitions=1, partition_sizes=[1] +04)------ProjectionExec: expr=[get_field(__unnest_placeholder(d.column2,depth=1)@1, a) as __datafusion_extracted_1, column1@0 as column1, __unnest_placeholder(d.column2,depth=1)@1 as __unnest_placeholder(d.column2,depth=1)] +05)--------UnnestExec +06)----------ProjectionExec: expr=[column1@0 as column1, column2@1 as __unnest_placeholder(d.column2)] +07)------------DataSourceExec: partitions=1, partition_sizes=[1] statement ok drop table d; diff --git a/datafusion/sqllogictest/test_files/struct.slt b/datafusion/sqllogictest/test_files/struct.slt index 53a1bb4ec6751..09dd98a50b579 100644 --- a/datafusion/sqllogictest/test_files/struct.slt +++ 
b/datafusion/sqllogictest/test_files/struct.slt @@ -661,7 +661,7 @@ query TT explain select s['a']['b'] from explain_test; ---- logical_plan -01)Projection: get_field(explain_test.s, Utf8("a"), Utf8("b")) +01)Projection: get_field(explain_test.s, Utf8("a"), Utf8("b")) AS explain_test.s[a][b] 02)--TableScan: explain_test projection=[s] physical_plan 01)ProjectionExec: expr=[get_field(s@0, a, b) as explain_test.s[a][b]] diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 1a6b82020c667..73aeb6c99d0db 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -666,7 +666,7 @@ explain select unnest(unnest(unnest(column3)['c1'])), column3 from recursive_unn logical_plan 01)Projection: __unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1],depth=2) AS UNNEST(UNNEST(UNNEST(recursive_unnest_table.column3)[c1])), recursive_unnest_table.column3 02)--Unnest: lists[__unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1])|depth=2] structs[] -03)----Projection: get_field(__unnest_placeholder(recursive_unnest_table.column3,depth=1) AS UNNEST(recursive_unnest_table.column3), Utf8("c1")) AS __unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1]), recursive_unnest_table.column3 +03)----Projection: get_field(__unnest_placeholder(recursive_unnest_table.column3,depth=1), Utf8("c1")) AS __unnest_placeholder(UNNEST(recursive_unnest_table.column3)[c1]), recursive_unnest_table.column3 04)------Unnest: lists[__unnest_placeholder(recursive_unnest_table.column3)|depth=1] structs[] 05)--------Projection: recursive_unnest_table.column3 AS __unnest_placeholder(recursive_unnest_table.column3), recursive_unnest_table.column3 06)----------TableScan: recursive_unnest_table projection=[column3] From 78f44fa24c4048d7450f295c108ef4c92736c694 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 9 Feb 2026 
23:35:21 -0500 Subject: [PATCH 02/18] avoid unstable test snaps --- .../optimizer/src/extract_leaf_expressions.rs | 299 +++++++----------- 1 file changed, 116 insertions(+), 183 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index 46556c57523d2..ee984d4e32ebd 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -1227,16 +1227,12 @@ mod tests { TableScan: test projection=[id, user] ## After Pushdown - Projection: test.id - Projection: test.id, test.user - Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user - TableScan: test projection=[id, user] + (same as after extraction) ## Optimized Projection: test.id - Filter: __datafusion_extracted_3 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3, test.id + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id TableScan: test projection=[id, user] "#) } @@ -1368,16 +1364,10 @@ mod tests { TableScan: test projection=[id, user] ## After Pushdown - Projection: test.id, test.user - Filter: __datafusion_extracted_2 IS NOT NULL AND __datafusion_extracted_2 IS NULL - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id, test.user - TableScan: test projection=[id, user] + (same as after extraction) ## Optimized - Projection: test.id, test.user - Filter: __datafusion_extracted_3 IS NOT NULL AND __datafusion_extracted_3 IS NULL - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, test.id, test.user - TableScan: test projection=[id, user] + (same as after pushdown) "#) } @@ -1400,16 +1390,10 @@ mod tests { TableScan: test projection=[id, user] ## After Pushdown - Projection: test.id, test.user - 
Filter: __datafusion_extracted_2 = Utf8("test") - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id, test.user - TableScan: test projection=[id, user] + (same as after extraction) ## Optimized - Projection: test.id, test.user - Filter: __datafusion_extracted_3 = Utf8("test") - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, test.id, test.user - TableScan: test projection=[id, user] + (same as after pushdown) "#) } @@ -1434,15 +1418,12 @@ mod tests { TableScan: test projection=[user] ## After Pushdown - Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("status")), COUNT(Int32(1)) - Aggregate: groupBy=[[__datafusion_extracted_2]], aggr=[[COUNT(Int32(1))]] - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")), COUNT(Int32(1)) - Aggregate: groupBy=[[__datafusion_extracted_3]], aggr=[[COUNT(Int32(1))]] - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3 + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("status")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -1471,16 +1452,10 @@ mod tests { TableScan: test projection=[user] ## After Pushdown - Projection: test.user, COUNT(__datafusion_extracted_2) AS COUNT(mock_leaf(test.user,Utf8("value"))) - Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_2)]] - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_2, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Projection: test.user, COUNT(__datafusion_extracted_3) AS COUNT(mock_leaf(test.user,Utf8("value"))) - Aggregate: 
groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_3)]] - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_3, test.user - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -1506,15 +1481,15 @@ mod tests { TableScan: test projection=[user] ## After Pushdown - Projection: __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")) - Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3 + Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("name")) - Filter: __datafusion_extracted_4 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 TableScan: test projection=[user] "#) } @@ -1570,16 +1545,13 @@ mod tests { TableScan: test projection=[user] ## After Pushdown - Projection: test.user, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("label")) - Filter: __datafusion_extracted_2 > Int32(150) - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_2, test.user, mock_leaf(test.user, Utf8("label")) AS __datafusion_extracted_3 + Projection: test.user, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("label")) + Filter: 
__datafusion_extracted_1 > Int32(150) + Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user, mock_leaf(test.user, Utf8("label")) AS __datafusion_extracted_2 TableScan: test projection=[user] ## Optimized - Projection: test.user, __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("label")) - Filter: __datafusion_extracted_4 > Int32(150) - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_4, test.user, mock_leaf(test.user, Utf8("label")) AS __datafusion_extracted_5 - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -1639,10 +1611,7 @@ mod tests { TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) - Sort: test.user ASC NULLS FIRST - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.user - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -1671,9 +1640,9 @@ mod tests { TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) Limit: skip=0, fetch=10 - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -1702,14 +1671,10 @@ mod tests { TableScan: test projection=[user] ## After Pushdown - Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_2) AS cnt]] - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_2, test.user - TableScan: test projection=[user] + (same as after extraction) ## Optimized - Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_3) AS cnt]] - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_3, test.user - TableScan: test projection=[user] + (same as after pushdown) "#) } @@ -1792,17 +1757,17 
@@ mod tests { ## After Pushdown Projection: test.id, test.user - Filter: __datafusion_extracted_3 IS NOT NULL - Filter: __datafusion_extracted_4 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3 + Filter: __datafusion_extracted_1 IS NOT NULL + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[id, user] ## Optimized Projection: test.id, test.user - Filter: __datafusion_extracted_5 IS NOT NULL - Projection: test.id, test.user, __datafusion_extracted_5 - Filter: __datafusion_extracted_6 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_6, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + Filter: __datafusion_extracted_1 IS NOT NULL + Projection: test.id, test.user, __datafusion_extracted_1 + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[id, user] "#) } @@ -1910,18 +1875,18 @@ mod tests { TableScan: test projection=[user] ## After Pushdown - Projection: __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) - Aggregate: groupBy=[[__datafusion_extracted_3]], aggr=[[COUNT(Int32(1))]] - Filter: __datafusion_extracted_4 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3 + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] + Filter: 
__datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) - Aggregate: groupBy=[[__datafusion_extracted_5]], aggr=[[COUNT(Int32(1))]] - Projection: __datafusion_extracted_5 - Filter: __datafusion_extracted_6 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_6, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) + Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] + Projection: __datafusion_extracted_1 + Filter: __datafusion_extracted_2 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -1952,17 +1917,17 @@ mod tests { ## After Pushdown Projection: test.a, test.b, test.c - Filter: __datafusion_extracted_3 = Int32(2) - Filter: __datafusion_extracted_4 = Int32(1) - Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_4, test.a, test.b, test.c, mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_3 + Filter: __datafusion_extracted_1 = Int32(2) + Filter: __datafusion_extracted_2 = Int32(1) + Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c, mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_1 TableScan: test projection=[a, b, c] ## Optimized Projection: test.a, test.b, test.c - Filter: __datafusion_extracted_5 = Int32(2) - Projection: test.a, test.b, test.c, __datafusion_extracted_5 - Filter: __datafusion_extracted_6 = Int32(1) - Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_6, test.a, test.b, 
test.c, mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_5 + Filter: __datafusion_extracted_1 = Int32(2) + Projection: test.a, test.b, test.c, __datafusion_extracted_1 + Filter: __datafusion_extracted_2 = Int32(1) + Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c, mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_1 TableScan: test projection=[a, b, c] "#) } @@ -2013,20 +1978,10 @@ mod tests { TableScan: right projection=[id, user] ## After Pushdown - Projection: test.id, test.user, right.id, right.user - Inner Join: __datafusion_extracted_3 = __datafusion_extracted_4 - Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_3, test.id, test.user - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_4, right.id, right.user - TableScan: right projection=[id, user] + (same as after extraction) ## Optimized - Projection: test.id, test.user, right.id, right.user - Inner Join: __datafusion_extracted_5 = __datafusion_extracted_6 - Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_5, test.id, test.user - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_6, right.id, right.user - TableScan: right projection=[id, user] + (same as after pushdown) "#) } @@ -2063,18 +2018,10 @@ mod tests { TableScan: right projection=[id, user] ## After Pushdown - Projection: test.id, test.user, right.id, right.user - Inner Join: Filter: test.user = right.user AND __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user - TableScan: test projection=[id, user] - TableScan: right projection=[id, user] + (same as after extraction) ## Optimized - Projection: test.id, test.user, right.id, right.user - Inner Join: Filter: test.user = right.user AND __datafusion_extracted_3 = Utf8("active") - Projection: 
mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3, test.id, test.user - TableScan: test projection=[id, user] - TableScan: right projection=[id, user] + (same as after pushdown) "#) } @@ -2113,20 +2060,10 @@ mod tests { TableScan: right projection=[id, user] ## After Pushdown - Projection: test.id, test.user, right.id, right.user - Inner Join: Filter: test.user = right.user AND __datafusion_extracted_3 = Utf8("active") AND __datafusion_extracted_4 = Utf8("admin") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3, test.id, test.user - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_4, right.id, right.user - TableScan: right projection=[id, user] + (same as after extraction) ## Optimized - Projection: test.id, test.user, right.id, right.user - Inner Join: Filter: test.user = right.user AND __datafusion_extracted_5 = Utf8("active") AND __datafusion_extracted_6 = Utf8("admin") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_5, test.id, test.user - TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_6, right.id, right.user - TableScan: right projection=[id, user] + (same as after pushdown) "#) } @@ -2200,21 +2137,21 @@ mod tests { ## After Pushdown Projection: test.id, test.user, right.id, right.user - Filter: __datafusion_extracted_4 = Utf8("active") - Inner Join: __datafusion_extracted_5 = __datafusion_extracted_6 - Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_5, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4 + Filter: __datafusion_extracted_1 = Utf8("active") + Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3 + Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1 TableScan: test 
projection=[id, user] - Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_6, right.id, right.user + Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user TableScan: right projection=[id, user] ## Optimized Projection: test.id, test.user, right.id, right.user - Filter: __datafusion_extracted_7 = Utf8("active") - Projection: test.id, test.user, __datafusion_extracted_7, right.id, right.user - Inner Join: __datafusion_extracted_8 = __datafusion_extracted_9 - Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_8, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_7 + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: test.id, test.user, __datafusion_extracted_1, right.id, right.user + Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3 + Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1 TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_9, right.id, right.user + Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user TableScan: right projection=[id, user] "#) } @@ -2255,11 +2192,11 @@ mod tests { TableScan: right projection=[id, user] ## Optimized - Projection: __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")), __datafusion_extracted_4 AS mock_leaf(right.user,Utf8("role")) + Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("status")), __datafusion_extracted_2 AS mock_leaf(right.user,Utf8("role")) Inner Join: test.id = right.id - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3, test.id + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("role")) AS 
__datafusion_extracted_4, right.id + Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id TableScan: right projection=[id, user] "#) } @@ -2295,9 +2232,9 @@ mod tests { TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_2 AS mock_leaf(x,Utf8("a")) + Projection: __datafusion_extracted_1 AS mock_leaf(x,Utf8("a")) Filter: x IS NOT NULL - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_2 + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -2329,9 +2266,9 @@ mod tests { TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_2 IS NOT NULL AS mock_leaf(x,Utf8("a")) IS NOT NULL + Projection: __datafusion_extracted_1 IS NOT NULL AS mock_leaf(x,Utf8("a")) IS NOT NULL Filter: x IS NOT NULL - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_2 + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -2360,14 +2297,14 @@ mod tests { ## After Pushdown Projection: x - Filter: __datafusion_extracted_2 = Utf8("active") - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_2, test.user + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## Optimized Projection: x - Filter: __datafusion_extracted_3 = Utf8("active") - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_3 + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -2401,9 +2338,9 @@ mod tests { TableScan: test projection=[user] ## Optimized - Projection: 
__datafusion_extracted_2 AS mock_leaf(sub.user,Utf8("name")) + Projection: __datafusion_extracted_1 AS mock_leaf(sub.user,Utf8("name")) SubqueryAlias: sub - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -2434,17 +2371,17 @@ mod tests { TableScan: test projection=[user] ## After Pushdown - Projection: __datafusion_extracted_3 AS mock_leaf(sub.user,Utf8("name")) - Filter: __datafusion_extracted_2 = Utf8("active") + Projection: __datafusion_extracted_2 AS mock_leaf(sub.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") SubqueryAlias: sub - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, test.user + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.user TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_5 AS mock_leaf(sub.user,Utf8("name")) - Filter: __datafusion_extracted_4 = Utf8("active") + Projection: __datafusion_extracted_2 AS mock_leaf(sub.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") SubqueryAlias: sub - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 TableScan: test projection=[user] "#) } @@ -2477,10 +2414,10 @@ mod tests { TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_2 AS mock_leaf(outer_sub.user,Utf8("name")) + Projection: __datafusion_extracted_1 AS mock_leaf(outer_sub.user,Utf8("name")) SubqueryAlias: outer_sub SubqueryAlias: inner_sub - Projection: mock_leaf(test.user, 
Utf8("name")) AS __datafusion_extracted_2 + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -2571,16 +2508,12 @@ mod tests { TableScan: test projection=[id, user] ## After Pushdown - Projection: test.id - Projection: test.id, test.user - Filter: __datafusion_extracted_3 = Utf8("a") AND __datafusion_extracted_4 = Utf8("b") - Projection: mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_3, mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_4, test.id, test.user - TableScan: test projection=[id, user] + (same as after extraction) ## Optimized Projection: test.id - Filter: __datafusion_extracted_5 = Utf8("a") AND __datafusion_extracted_6 = Utf8("b") - Projection: mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_5, mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_6, test.id + Filter: __datafusion_extracted_1 = Utf8("a") AND __datafusion_extracted_2 = Utf8("b") + Projection: mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_2, test.id TableScan: test projection=[id, user] "#) } @@ -2613,15 +2546,15 @@ mod tests { TableScan: test projection=[id, user] ## After Pushdown - Projection: test.id, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")) - Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3 + Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 TableScan: test projection=[id, user] ## Optimized - Projection: test.id, __datafusion_extracted_5 AS 
mock_leaf(test.user,Utf8("name")) - Filter: __datafusion_extracted_4 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, test.id, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_5 + Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Filter: __datafusion_extracted_1 = Utf8("active") + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 TableScan: test projection=[id, user] "#) } @@ -2650,15 +2583,15 @@ mod tests { TableScan: test projection=[id, user] ## After Pushdown - Projection: test.id, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")) - Filter: __datafusion_extracted_2 > Int32(5) - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3 + Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("status")) + Filter: __datafusion_extracted_1 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2 TableScan: test projection=[id, user] ## Optimized - Projection: test.id, __datafusion_extracted_5 AS mock_leaf(test.user,Utf8("status")) - Filter: __datafusion_extracted_4 > Int32(5) - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4, test.id, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_5 + Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("status")) + Filter: __datafusion_extracted_1 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2 TableScan: test projection=[id, user] "#) } @@ -2704,19 +2637,19 @@ mod tests { TableScan: right projection=[id, 
user] ## After Pushdown - Projection: test.id, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_4 AS mock_leaf(right.user,Utf8("status")) - Left Join: Filter: test.id = right.id AND __datafusion_extracted_2 > Int32(5) - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, test.id, test.user + Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_3 AS mock_leaf(right.user,Utf8("status")) + Left Join: Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5) + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id, test.user TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_2, right.id, right.user, mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_4 + Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, right.user, mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_3 TableScan: right projection=[id, user] ## Optimized - Projection: test.id, __datafusion_extracted_6 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_7 AS mock_leaf(right.user,Utf8("status")) - Left Join: Filter: test.id = right.id AND __datafusion_extracted_5 > Int32(5) - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_6, test.id + Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_3 AS mock_leaf(right.user,Utf8("status")) + Left Join: Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5) + Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_5, right.id, mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_7 + Projection: mock_leaf(right.user, Utf8("status")) AS 
__datafusion_extracted_1, right.id, mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_3 TableScan: right projection=[id, user] "#) } @@ -2749,15 +2682,15 @@ mod tests { TableScan: test projection=[id, user] ## After Pushdown - Projection: test.id, __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_4 AS mock_leaf(test.user,Utf8("status")) - Filter: __datafusion_extracted_2 > Int32(5) - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_3, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_4 + Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")) + Filter: __datafusion_extracted_1 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3 TableScan: test projection=[id, user] ## Optimized - Projection: test.id, __datafusion_extracted_6 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_7 AS mock_leaf(test.user,Utf8("status")) - Filter: __datafusion_extracted_5 > Int32(5) - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_5, test.id, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_6, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_7 + Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")) + Filter: __datafusion_extracted_1 > Int32(5) + Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3 TableScan: test projection=[id, 
user] "#) } From 3ecad7df548568e6b2867725ba968ff5af918c92 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Mon, 9 Feb 2026 23:52:16 -0500 Subject: [PATCH 03/18] use common function --- .../optimizer/src/extract_leaf_expressions.rs | 258 +++++++++--------- datafusion/optimizer/src/test/udfs.rs | 22 +- 2 files changed, 145 insertions(+), 135 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index ee984d4e32ebd..1923aec848acf 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -1223,7 +1223,7 @@ mod tests { Projection: test.id Projection: test.id, test.user Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user TableScan: test projection=[id, user] ## After Pushdown @@ -1232,7 +1232,7 @@ mod tests { ## Optimized Projection: test.id Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id TableScan: test projection=[id, user] "#) } @@ -1276,12 +1276,12 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")) + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## Optimized - Projection: mock_leaf(test.user, Utf8("name")) AS mock_leaf(test.user,Utf8("name")) + Projection: leaf_udf(test.user, Utf8("name")) AS 
leaf_udf(test.user,Utf8("name")) TableScan: test projection=[user] "#) } @@ -1307,11 +1307,11 @@ mod tests { ## After Pushdown Projection: __datafusion_extracted_1 IS NOT NULL AS has_name - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## Optimized - Projection: mock_leaf(test.user, Utf8("name")) IS NOT NULL AS has_name + Projection: leaf_udf(test.user, Utf8("name")) IS NOT NULL AS has_name TableScan: test projection=[user] "#) } @@ -1360,7 +1360,7 @@ mod tests { ## After Extraction Projection: test.id, test.user Filter: __datafusion_extracted_1 IS NOT NULL AND __datafusion_extracted_1 IS NULL - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user TableScan: test projection=[id, user] ## After Pushdown @@ -1386,7 +1386,7 @@ mod tests { ## After Extraction Projection: test.id, test.user Filter: __datafusion_extracted_1 = Utf8("test") - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user TableScan: test projection=[id, user] ## After Pushdown @@ -1412,18 +1412,18 @@ mod tests { TableScan: test projection=[user] ## After Extraction - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("status")), COUNT(Int32(1)) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("status")), COUNT(Int32(1)) Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## After Pushdown (same as after 
extraction) ## Optimized - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("status")), COUNT(Int32(1)) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("status")), COUNT(Int32(1)) Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1 + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -1446,9 +1446,9 @@ mod tests { TableScan: test projection=[user] ## After Extraction - Projection: test.user, COUNT(__datafusion_extracted_1) AS COUNT(mock_leaf(test.user,Utf8("value"))) + Projection: test.user, COUNT(__datafusion_extracted_1) AS COUNT(leaf_udf(test.user,Utf8("value"))) Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_1)]] - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user + Projection: leaf_udf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## After Pushdown @@ -1474,22 +1474,22 @@ mod tests { TableScan: test projection=[user] ## After Extraction - Projection: mock_leaf(test.user, Utf8("name")) + Projection: leaf_udf(test.user, Utf8("name")) Projection: test.user Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## After Pushdown - Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Projection: __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")) Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 + Projection: leaf_udf(test.user, Utf8("status")) AS 
__datafusion_extracted_1, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2 TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Projection: __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")) Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2 TableScan: test projection=[user] "#) } @@ -1511,11 +1511,11 @@ mod tests { ## After Pushdown Projection: __datafusion_extracted_1 AS username - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## Optimized - Projection: mock_leaf(test.user, Utf8("name")) AS username + Projection: leaf_udf(test.user, Utf8("name")) AS username TableScan: test projection=[user] "#) } @@ -1538,16 +1538,16 @@ mod tests { TableScan: test projection=[user] ## After Extraction - Projection: test.user, mock_leaf(test.user, Utf8("label")) + Projection: test.user, leaf_udf(test.user, Utf8("label")) Projection: test.user Filter: __datafusion_extracted_1 > Int32(150) - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user + Projection: leaf_udf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## After Pushdown - Projection: test.user, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("label")) + Projection: test.user, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("label")) Filter: __datafusion_extracted_1 > Int32(150) - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user, mock_leaf(test.user, 
Utf8("label")) AS __datafusion_extracted_2 + Projection: leaf_udf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user, leaf_udf(test.user, Utf8("label")) AS __datafusion_extracted_2 TableScan: test projection=[user] ## Optimized @@ -1572,12 +1572,12 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_1 AS name2 - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")), __datafusion_extracted_1 AS name2 + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## Optimized - Projection: mock_leaf(test.user, Utf8("name")) AS mock_leaf(test.user,Utf8("name")), mock_leaf(test.user, Utf8("name")) AS name2 + Projection: leaf_udf(test.user, Utf8("name")) AS leaf_udf(test.user,Utf8("name")), leaf_udf(test.user, Utf8("name")) AS name2 TableScan: test projection=[user] "#) } @@ -1605,9 +1605,9 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")) Sort: test.user ASC NULLS FIRST - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## Optimized @@ -1634,15 +1634,15 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")) Limit: skip=0, fetch=10 - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## 
Optimized - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")) Limit: skip=0, fetch=10 - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -1667,7 +1667,7 @@ mod tests { ## After Extraction Aggregate: groupBy=[[test.user]], aggr=[[COUNT(__datafusion_extracted_1) AS cnt]] - Projection: mock_leaf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user + Projection: leaf_udf(test.user, Utf8("value")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## After Pushdown @@ -1749,17 +1749,17 @@ mod tests { ## After Extraction Projection: test.id, test.user Filter: __datafusion_extracted_1 IS NOT NULL - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.id, test.user Projection: test.id, test.user Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user TableScan: test projection=[id, user] ## After Pushdown Projection: test.id, test.user Filter: __datafusion_extracted_1 IS NOT NULL Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[id, user] ## Optimized @@ -1767,7 +1767,7 @@ mod tests { Filter: __datafusion_extracted_1 IS NOT NULL Projection: 
test.id, test.user, __datafusion_extracted_1 Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[id, user] "#) } @@ -1790,12 +1790,12 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")) - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")) + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## Optimized - Projection: mock_leaf(test.user, Utf8("name")) AS mock_leaf(test.user,Utf8("name")) + Projection: leaf_udf(test.user, Utf8("name")) AS leaf_udf(test.user,Utf8("name")) TableScan: test projection=[user] "#) } @@ -1866,27 +1866,27 @@ mod tests { TableScan: test projection=[user] ## After Extraction - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")), COUNT(Int32(1)) Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user Projection: test.user Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user TableScan: test projection=[user] ## After Pushdown - Projection: __datafusion_extracted_1 AS 
mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")), COUNT(Int32(1)) Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("name")), COUNT(Int32(1)) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("name")), COUNT(Int32(1)) Aggregate: groupBy=[[__datafusion_extracted_1]], aggr=[[COUNT(Int32(1))]] Projection: __datafusion_extracted_1 Filter: __datafusion_extracted_2 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -1909,17 +1909,17 @@ mod tests { ## After Extraction Projection: test.a, test.b, test.c Filter: __datafusion_extracted_1 = Int32(2) - Projection: mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_1, test.a, test.b, test.c + Projection: leaf_udf(test.b, Utf8("y")) AS __datafusion_extracted_1, test.a, test.b, test.c Projection: test.a, test.b, test.c Filter: __datafusion_extracted_2 = Int32(1) - Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c + Projection: leaf_udf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c TableScan: test projection=[a, b, c] ## After Pushdown Projection: test.a, test.b, test.c Filter: 
__datafusion_extracted_1 = Int32(2) Filter: __datafusion_extracted_2 = Int32(1) - Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c, mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_1 + Projection: leaf_udf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c, leaf_udf(test.b, Utf8("y")) AS __datafusion_extracted_1 TableScan: test projection=[a, b, c] ## Optimized @@ -1927,7 +1927,7 @@ mod tests { Filter: __datafusion_extracted_1 = Int32(2) Projection: test.a, test.b, test.c, __datafusion_extracted_1 Filter: __datafusion_extracted_2 = Int32(1) - Projection: mock_leaf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c, mock_leaf(test.b, Utf8("y")) AS __datafusion_extracted_1 + Projection: leaf_udf(test.a, Utf8("x")) AS __datafusion_extracted_2, test.a, test.b, test.c, leaf_udf(test.b, Utf8("y")) AS __datafusion_extracted_1 TableScan: test projection=[a, b, c] "#) } @@ -1972,9 +1972,9 @@ mod tests { ## After Extraction Projection: test.id, test.user, right.id, right.user Inner Join: __datafusion_extracted_1 = __datafusion_extracted_2 - Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_1, test.id, test.user + Projection: leaf_udf(test.user, Utf8("id")) AS __datafusion_extracted_1, test.id, test.user TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_2, right.id, right.user + Projection: leaf_udf(right.user, Utf8("id")) AS __datafusion_extracted_2, right.id, right.user TableScan: right projection=[id, user] ## After Pushdown @@ -2013,7 +2013,7 @@ mod tests { ## After Extraction Projection: test.id, test.user, right.id, right.user Inner Join: Filter: test.user = right.user AND __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, 
test.user TableScan: test projection=[id, user] TableScan: right projection=[id, user] @@ -2054,9 +2054,9 @@ mod tests { ## After Extraction Projection: test.id, test.user, right.id, right.user Inner Join: Filter: test.user = right.user AND __datafusion_extracted_1 = Utf8("active") AND __datafusion_extracted_2 = Utf8("admin") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id, right.user + Projection: leaf_udf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id, right.user TableScan: right projection=[id, user] ## After Pushdown @@ -2127,21 +2127,21 @@ mod tests { ## After Extraction Projection: test.id, test.user, right.id, right.user Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, right.id, right.user + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, right.id, right.user Projection: test.id, test.user, right.id, right.user Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3 - Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user + Projection: leaf_udf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user + Projection: leaf_udf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user TableScan: right projection=[id, user] ## After Pushdown Projection: test.id, test.user, right.id, right.user Filter: __datafusion_extracted_1 = Utf8("active") Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3 - Projection: 
mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1 + Projection: leaf_udf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1 TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user + Projection: leaf_udf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user TableScan: right projection=[id, user] ## Optimized @@ -2149,9 +2149,9 @@ mod tests { Filter: __datafusion_extracted_1 = Utf8("active") Projection: test.id, test.user, __datafusion_extracted_1, right.id, right.user Inner Join: __datafusion_extracted_2 = __datafusion_extracted_3 - Projection: mock_leaf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1 + Projection: leaf_udf(test.user, Utf8("id")) AS __datafusion_extracted_2, test.id, test.user, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1 TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user + Projection: leaf_udf(right.user, Utf8("id")) AS __datafusion_extracted_3, right.id, right.user TableScan: right projection=[id, user] "#) } @@ -2184,19 +2184,19 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("status")), __datafusion_extracted_2 AS mock_leaf(right.user,Utf8("role")) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("status")), __datafusion_extracted_2 AS leaf_udf(right.user,Utf8("role")) Inner Join: test.id = right.id - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user 
TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id, right.user + Projection: leaf_udf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id, right.user TableScan: right projection=[id, user] ## Optimized - Projection: __datafusion_extracted_1 AS mock_leaf(test.user,Utf8("status")), __datafusion_extracted_2 AS mock_leaf(right.user,Utf8("role")) + Projection: __datafusion_extracted_1 AS leaf_udf(test.user,Utf8("status")), __datafusion_extracted_2 AS leaf_udf(right.user,Utf8("role")) Inner Join: test.id = right.id - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id + Projection: leaf_udf(right.user, Utf8("role")) AS __datafusion_extracted_2, right.id TableScan: right projection=[id, user] "#) } @@ -2226,15 +2226,15 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(x,Utf8("a")) + Projection: __datafusion_extracted_1 AS leaf_udf(x,Utf8("a")) Filter: x IS NOT NULL - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user + Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_1 AS mock_leaf(x,Utf8("a")) + Projection: __datafusion_extracted_1 AS leaf_udf(x,Utf8("a")) Filter: x IS NOT NULL - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1 + Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -2260,15 +2260,15 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 IS NOT 
NULL AS mock_leaf(x,Utf8("a")) IS NOT NULL + Projection: __datafusion_extracted_1 IS NOT NULL AS leaf_udf(x,Utf8("a")) IS NOT NULL Filter: x IS NOT NULL - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user + Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_1 IS NOT NULL AS mock_leaf(x,Utf8("a")) IS NOT NULL + Projection: __datafusion_extracted_1 IS NOT NULL AS leaf_udf(x,Utf8("a")) IS NOT NULL Filter: x IS NOT NULL - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1 + Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -2291,20 +2291,20 @@ mod tests { ## After Extraction Projection: x Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(x, Utf8("a")) AS __datafusion_extracted_1, x + Projection: leaf_udf(x, Utf8("a")) AS __datafusion_extracted_1, x Projection: test.user AS x TableScan: test projection=[user] ## After Pushdown Projection: x Filter: __datafusion_extracted_1 = Utf8("active") - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user + Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## Optimized Projection: x Filter: __datafusion_extracted_1 = Utf8("active") - Projection: test.user AS x, mock_leaf(test.user, Utf8("a")) AS __datafusion_extracted_1 + Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -2332,15 +2332,15 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(sub.user,Utf8("name")) + Projection: __datafusion_extracted_1 AS leaf_udf(sub.user,Utf8("name")) SubqueryAlias: sub - 
Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_1 AS mock_leaf(sub.user,Utf8("name")) + Projection: __datafusion_extracted_1 AS leaf_udf(sub.user,Utf8("name")) SubqueryAlias: sub - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -2363,25 +2363,25 @@ mod tests { TableScan: test projection=[user] ## After Extraction - Projection: mock_leaf(sub.user, Utf8("name")) + Projection: leaf_udf(sub.user, Utf8("name")) Projection: sub.user Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(sub.user, Utf8("status")) AS __datafusion_extracted_1, sub.user + Projection: leaf_udf(sub.user, Utf8("status")) AS __datafusion_extracted_1, sub.user SubqueryAlias: sub TableScan: test projection=[user] ## After Pushdown - Projection: __datafusion_extracted_2 AS mock_leaf(sub.user,Utf8("name")) + Projection: __datafusion_extracted_2 AS leaf_udf(sub.user,Utf8("name")) Filter: __datafusion_extracted_1 = Utf8("active") SubqueryAlias: sub - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.user + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.user TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_2 AS mock_leaf(sub.user,Utf8("name")) + Projection: __datafusion_extracted_2 AS leaf_udf(sub.user,Utf8("name")) Filter: __datafusion_extracted_1 = Utf8("active") SubqueryAlias: sub - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("name")) AS 
__datafusion_extracted_2 + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2 TableScan: test projection=[user] "#) } @@ -2407,17 +2407,17 @@ mod tests { (same as original) ## After Pushdown - Projection: __datafusion_extracted_1 AS mock_leaf(outer_sub.user,Utf8("name")) + Projection: __datafusion_extracted_1 AS leaf_udf(outer_sub.user,Utf8("name")) SubqueryAlias: outer_sub SubqueryAlias: inner_sub - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] ## Optimized - Projection: __datafusion_extracted_1 AS mock_leaf(outer_sub.user,Utf8("name")) + Projection: __datafusion_extracted_1 AS leaf_udf(outer_sub.user,Utf8("name")) SubqueryAlias: outer_sub SubqueryAlias: inner_sub - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_1 + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1 TableScan: test projection=[user] "#) } @@ -2504,7 +2504,7 @@ mod tests { Projection: test.id Projection: test.id, test.user Filter: __datafusion_extracted_1 = Utf8("a") AND __datafusion_extracted_2 = Utf8("b") - Projection: mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_2, test.id, test.user + Projection: leaf_udf(test.user, Utf8("field")) AS __datafusion_extracted_1, leaf_udf(test.user, Utf8("field")) AS __datafusion_extracted_2, test.id, test.user TableScan: test projection=[id, user] ## After Pushdown @@ -2513,7 +2513,7 @@ mod tests { ## Optimized Projection: test.id Filter: __datafusion_extracted_1 = Utf8("a") AND __datafusion_extracted_2 = Utf8("b") - Projection: mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_1, mock_leaf(test.user, Utf8("field")) AS __datafusion_extracted_2, test.id + Projection: 
leaf_udf(test.user, Utf8("field")) AS __datafusion_extracted_1, leaf_udf(test.user, Utf8("field")) AS __datafusion_extracted_2, test.id TableScan: test projection=[id, user] "#) } @@ -2523,7 +2523,7 @@ mod tests { // ========================================================================= /// Extraction pushdown through a filter that already had its own - /// `mock_leaf` extracted. + /// `leaf_udf` extracted. #[test] fn test_extraction_pushdown_through_filter_with_extracted_predicate() -> Result<()> { let table_scan = test_table_scan_with_struct()?; @@ -2539,22 +2539,22 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - Projection: test.id, mock_leaf(test.user, Utf8("name")) + Projection: test.id, leaf_udf(test.user, Utf8("name")) Projection: test.id, test.user Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user TableScan: test projection=[id, user] ## After Pushdown - Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")) Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2 TableScan: test projection=[id, user] ## Optimized - Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")) + Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")) Filter: __datafusion_extracted_1 = Utf8("active") - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, 
test.id, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2 + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2 TableScan: test projection=[id, user] "#) } @@ -2576,28 +2576,28 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - Projection: test.id, mock_leaf(test.user, Utf8("status")) + Projection: test.id, leaf_udf(test.user, Utf8("status")) Projection: test.id, test.user Filter: __datafusion_extracted_1 > Int32(5) - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user TableScan: test projection=[id, user] ## After Pushdown - Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("status")) + Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("status")) Filter: __datafusion_extracted_1 > Int32(5) - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2 + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2 TableScan: test projection=[id, user] ## Optimized - Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("status")) + Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("status")) Filter: __datafusion_extracted_1 > Int32(5) - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_2 + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_2 TableScan: test projection=[id, user] "#) } - /// Left join with a 
`mock_leaf` filter on the right side AND - /// the projection also selects `mock_leaf` from the right side. + /// Left join with a `leaf_udf` filter on the right side AND + /// the projection also selects `leaf_udf` from the right side. #[test] fn test_left_join_with_filter_and_projection_extraction() -> Result<()> { use datafusion_expr::JoinType; @@ -2629,27 +2629,27 @@ mod tests { TableScan: right projection=[id, user] ## After Extraction - Projection: test.id, mock_leaf(test.user, Utf8("name")), mock_leaf(right.user, Utf8("status")) + Projection: test.id, leaf_udf(test.user, Utf8("name")), leaf_udf(right.user, Utf8("status")) Projection: test.id, test.user, right.id, right.user Left Join: Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5) TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, right.user + Projection: leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, right.user TableScan: right projection=[id, user] ## After Pushdown - Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_3 AS mock_leaf(right.user,Utf8("status")) + Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")), __datafusion_extracted_3 AS leaf_udf(right.user,Utf8("status")) Left Join: Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5) - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id, test.user + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id, test.user TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, right.user, mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_3 + Projection: leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, right.user, leaf_udf(right.user, Utf8("status")) AS 
__datafusion_extracted_3 TableScan: right projection=[id, user] ## Optimized - Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_3 AS mock_leaf(right.user,Utf8("status")) + Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")), __datafusion_extracted_3 AS leaf_udf(right.user,Utf8("status")) Left Join: Filter: test.id = right.id AND __datafusion_extracted_1 > Int32(5) - Projection: mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2, test.id TableScan: test projection=[id, user] - Projection: mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, mock_leaf(right.user, Utf8("status")) AS __datafusion_extracted_3 + Projection: leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_3 TableScan: right projection=[id, user] "#) } @@ -2675,22 +2675,22 @@ mod tests { TableScan: test projection=[id, user] ## After Extraction - Projection: test.id, mock_leaf(test.user, Utf8("name")), mock_leaf(test.user, Utf8("status")) + Projection: test.id, leaf_udf(test.user, Utf8("name")), leaf_udf(test.user, Utf8("status")) Projection: test.id, test.user Filter: __datafusion_extracted_1 > Int32(5) - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user TableScan: test projection=[id, user] ## After Pushdown - Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")) + Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")), __datafusion_extracted_3 AS leaf_udf(test.user,Utf8("status")) Filter: __datafusion_extracted_1 > Int32(5) - Projection: 
mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3 + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_3 TableScan: test projection=[id, user] ## Optimized - Projection: test.id, __datafusion_extracted_2 AS mock_leaf(test.user,Utf8("name")), __datafusion_extracted_3 AS mock_leaf(test.user,Utf8("status")) + Projection: test.id, __datafusion_extracted_2 AS leaf_udf(test.user,Utf8("name")), __datafusion_extracted_3 AS leaf_udf(test.user,Utf8("status")) Filter: __datafusion_extracted_1 > Int32(5) - Projection: mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, mock_leaf(test.user, Utf8("name")) AS __datafusion_extracted_2, mock_leaf(test.user, Utf8("status")) AS __datafusion_extracted_3 + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_2, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_3 TableScan: test projection=[id, user] "#) } diff --git a/datafusion/optimizer/src/test/udfs.rs b/datafusion/optimizer/src/test/udfs.rs index 9164603dba3d5..35ea3e44d3e72 100644 --- a/datafusion/optimizer/src/test/udfs.rs +++ b/datafusion/optimizer/src/test/udfs.rs @@ -30,6 +30,7 @@ use datafusion_expr::{ pub struct PlacementTestUDF { signature: Signature, placement: ExpressionPlacement, + name: String, id: usize, } @@ -42,6 +43,7 @@ impl Default for PlacementTestUDF { impl PlacementTestUDF { pub fn new() -> Self { Self { + name: "leaf_udf".to_string(), // Accept any one or two arguments and return UInt32 for testing purposes. // The actual types don't matter since this UDF is not intended for execution. 
signature: Signature::new( @@ -57,6 +59,19 @@ impl PlacementTestUDF { /// This also resets the name of the UDF to a default based on the placement. pub fn with_placement(mut self, placement: ExpressionPlacement) -> Self { self.placement = placement; + self.name = match self.placement { + ExpressionPlacement::MoveTowardsLeafNodes => "leaf_udf", + ExpressionPlacement::KeepInPlace => "keep_in_place_udf", + ExpressionPlacement::Column => "column_udf", + ExpressionPlacement::Literal => "literal_udf", + } + .to_string(); + self + } + + /// Set the name of the UDF, which is used in the expression and thus in optimizer rules. + pub fn with_name(mut self, name: &str) -> Self { + self.name = name.to_string(); self } @@ -73,12 +88,7 @@ impl ScalarUDFImpl for PlacementTestUDF { self } fn name(&self) -> &str { - match self.placement { - ExpressionPlacement::MoveTowardsLeafNodes => "leaf_udf", - ExpressionPlacement::KeepInPlace => "keep_in_place_udf", - ExpressionPlacement::Column => "column_udf", - ExpressionPlacement::Literal => "literal_udf", - } + &self.name } fn signature(&self) -> &Signature { &self.signature From 3cdec93c9c782b84eb1442128a4442f9b73bb604 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 11 Feb 2026 09:23:53 -0500 Subject: [PATCH 04/18] update comments, reduce clones --- .../optimizer/src/extract_leaf_expressions.rs | 56 ++++++++++--------- .../test_files/projection_pushdown.slt | 2 +- 2 files changed, 31 insertions(+), 27 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index 1923aec848acf..10a2ae6611b0b 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -141,14 +141,11 @@ fn extract_from_plan( // Save original output schema before any transformation let original_schema = Arc::clone(plan.schema()); - // Clone inputs upfront (before plan 
is consumed by map_expressions) - let owned_inputs: Vec = inputs.into_iter().cloned().collect(); - - // Build per-input schemas (kept alive for extractor borrows) - let input_schemas: Vec> = owned_inputs - .iter() - .map(|i| Arc::clone(i.schema())) - .collect(); + // Build per-input schemas from borrowed inputs (before plan is consumed + // by map_expressions). We only need schemas and column sets for routing; + // the actual inputs are cloned later only if extraction succeeds. + let input_schemas: Vec> = + inputs.iter().map(|i| Arc::clone(i.schema())).collect(); // Build per-input extractors let mut extractors: Vec = input_schemas @@ -172,6 +169,11 @@ fn extract_from_plan( return Ok(transformed); } + // Clone inputs now that we know extraction succeeded. We need owned + // copies to wrap in extraction projections below each input. + let owned_inputs: Vec = + transformed.data.inputs().into_iter().cloned().collect(); + // Build per-input extraction projections (None means no extractions for that input) let new_inputs: Vec = owned_inputs .iter() @@ -184,7 +186,8 @@ fn extract_from_plan( }) .collect::>>()?; - // Rebuild and add recovery projection if schema changed + // Rebuild the plan keeping its rewritten expressions but replacing + // inputs with the new extraction projections. 
let new_plan = transformed .data .with_new_exprs(transformed.data.expressions(), new_inputs)?; @@ -298,12 +301,12 @@ fn build_projection_replace_map(projection: &Projection) -> HashMap HashMap` +/// `Projection: user['status'] AS __datafusion_extracted_1, user['name'] AS __datafusion_extracted_2, ` struct LeafExpressionExtractor<'a> { /// Extracted expressions: maps expression -> alias extracted: IndexMap, @@ -560,9 +563,9 @@ fn build_extraction_projection_impl( /// /// After pass 1, the extraction projection sits directly below the filter: /// ```text -/// Projection: id, user <-- recovery -/// Filter: __extracted_1 = 'active' -/// Projection: user['status'] AS __extracted_1, id, user <-- extraction +/// Projection: id, user <-- recovery +/// Filter: __datafusion_extracted_1 = 'active' +/// Projection: user['status'] AS __datafusion_extracted_1, id, user <-- extraction /// TableScan: t [id, user] /// ``` /// @@ -570,8 +573,8 @@ fn build_extraction_projection_impl( /// and a subsequent `OptimizeProjections` pass removes the (now-redundant) /// recovery projection: /// ```text -/// Filter: __extracted_1 = 'active' -/// Projection: user['status'] AS __extracted_1, id, user <-- extraction (pushed down) +/// Filter: __datafusion_extracted_1 = 'active' +/// Projection: user['status'] AS __datafusion_extracted_1, id, user <-- extraction (pushed down) /// TableScan: t [id, user] /// ``` #[derive(Default, Debug)] @@ -846,8 +849,9 @@ fn push_extraction_pairs( ) -> Result> { match proj_input.as_ref() { // Merge into existing projection, then try to push the result further down. - // Only merge when all outer expressions are captured (pairs + columns). - // Uncaptured expressions (e.g. `col AS __common_expr_1`) would be lost + // Only merge when every expression in the outer projection is accounted + // for as either an extraction pair or a needed column. Uncaptured + // expressions (e.g. 
`col AS __common_expr_1` from CSE) would be lost // during the merge since build_extraction_projection_impl only knows // about the captured pairs and columns. LogicalPlan::Projection(_) @@ -864,7 +868,7 @@ fn push_extraction_pairs( // After merging, try to push the result further down, but ONLY // if the merged result is still a pure extraction projection - // (all __extracted aliases + columns). If the merge inherited + // (all __datafusion_extracted aliases + columns). If the merge inherited // bare MoveTowardsLeafNodes expressions from the inner projection, // pushing would re-extract them into new aliases and fail when // the (None, true) fallback can't find the original aliases. diff --git a/datafusion/sqllogictest/test_files/projection_pushdown.slt b/datafusion/sqllogictest/test_files/projection_pushdown.slt index 6dfa66cda51c9..2ff46f3b38b9b 100644 --- a/datafusion/sqllogictest/test_files/projection_pushdown.slt +++ b/datafusion/sqllogictest/test_files/projection_pushdown.slt @@ -1497,7 +1497,7 @@ physical_plan 02)--FilterExec: __datafusion_extracted_1@0 > 100, projection=[id@1] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet 04)--FilterExec: __datafusion_extracted_2@0 > 3, projection=[id@1] -05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id], file_type=parquet +05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id], file_type=parquet, predicate=DynamicFilter [ empty ] # Verify correctness - id matches, value > 100, and level > 3 # Matching ids where value > 
100: 2(200), 3(150), 4(300), 5(250) From cc443e09146a60658d3c54edbb9c6fd32cd746a8 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 11 Feb 2026 09:28:09 -0500 Subject: [PATCH 05/18] clean up build_extraction_projection --- .../optimizer/src/extract_leaf_expressions.rs | 43 +++++++++---------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index 10a2ae6611b0b..cb3e9dedd8d21 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -414,11 +414,11 @@ impl<'a> LeafExpressionExtractor<'a> { Ok(Expr::Column(Column::new_unqualified(&alias))) } - /// Builds a fresh extraction projection above the given input. + /// Builds an extraction projection above the given input, or merges into + /// it if the input is already a projection. Delegates to + /// [`build_extraction_projection_impl`]. /// - /// Returns `None` if there are no extractions. Otherwise creates a new - /// projection that includes extracted expressions (aliased) plus all - /// input schema columns for pass-through. + /// Returns `None` if there are no extractions. 
fn build_extraction_projection( &self, input: &Arc, @@ -426,21 +426,22 @@ impl<'a> LeafExpressionExtractor<'a> { if self.extracted.is_empty() { return Ok(None); } - let mut proj_exprs = Vec::new(); - for (expr, alias) in self.extracted.iter() { - proj_exprs.push(expr.clone().alias(alias)); - } - for (qualifier, field) in self.input_schema.iter() { - proj_exprs.push(Expr::from((qualifier, field))); - } - Ok(Some(LogicalPlan::Projection(Projection::try_new( - proj_exprs, - Arc::clone(input), - )?))) + let pairs: Vec<(Expr, String)> = self + .extracted + .iter() + .map(|(e, a)| (e.clone(), a.clone())) + .collect(); + let proj = build_extraction_projection_impl( + &pairs, + &self.columns_needed, + input, + self.input_schema, + )?; + Ok(Some(LogicalPlan::Projection(proj))) } } -/// Build an extraction projection above the target node. +/// Build an extraction projection above the target node (shared by both passes). /// /// If the target is an existing projection, merges into it. This requires /// resolving column references through the projection's rename mapping: @@ -2293,18 +2294,14 @@ mod tests { TableScan: test projection=[user] ## After Extraction - Projection: x - Filter: __datafusion_extracted_1 = Utf8("active") - Projection: leaf_udf(x, Utf8("a")) AS __datafusion_extracted_1, x - Projection: test.user AS x - TableScan: test projection=[user] - - ## After Pushdown Projection: x Filter: __datafusion_extracted_1 = Utf8("active") Projection: test.user AS x, leaf_udf(test.user, Utf8("a")) AS __datafusion_extracted_1, test.user TableScan: test projection=[user] + ## After Pushdown + (same as after extraction) + ## Optimized Projection: x Filter: __datafusion_extracted_1 = Utf8("active") From 33649707d0b67011f8633430cd1881596e95b616 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 11 Feb 2026 09:33:22 -0500 Subject: [PATCH 06/18] clean up duplication MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit 1. Added remap_pairs_and_columns helper (after schema_columns) — consolidates the duplicated qualifier-remapping pattern that was copy-pasted in the SubqueryAlias block and the Union broadcast block. Both callers now go through this single function. 2. Added route_to_inputs helper (above try_push_into_inputs) — encapsulates the routing decision: Union broadcasting via remap_pairs_and_columns vs route-by-ownership via find_owning_input, plus the "at least one input has pairs" check. 3. Simplified try_push_into_inputs — replaced ~80 lines of inline SubqueryAlias remapping + routing logic with two clean calls: - remap_pairs_and_columns(...) for SubqueryAlias - route_to_inputs(...) with a simple match for early return --- .../optimizer/src/extract_leaf_expressions.rs | 213 ++++++++++-------- 1 file changed, 115 insertions(+), 98 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index cb3e9dedd8d21..c61766bd196af 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -276,6 +276,52 @@ fn schema_columns(schema: &DFSchema) -> std::collections::HashSet { .collect() } +/// Rewrites extraction pairs and column references from one qualifier +/// space to another. +/// +/// Builds a replacement map by zipping `from_schema` (whose qualifiers +/// currently appear in `pairs` / `columns`) with `to_schema` (the +/// qualifiers we want), then applies `replace_cols_by_name`. +/// +/// Used for SubqueryAlias (alias-space -> input-space) and Union +/// (union output-space -> per-branch input-space). 
+fn remap_pairs_and_columns( + pairs: &[(Expr, String)], + columns: &IndexSet, + from_schema: &DFSchema, + to_schema: &DFSchema, +) -> Result<(Vec<(Expr, String)>, IndexSet)> { + let mut replace_map = HashMap::new(); + for ((from_q, from_f), (to_q, to_f)) in from_schema.iter().zip(to_schema.iter()) { + replace_map.insert( + qualified_name(from_q, from_f.name()), + Expr::Column(Column::new(to_q.cloned(), to_f.name())), + ); + } + let remapped_pairs: Vec<(Expr, String)> = pairs + .iter() + .map(|(expr, alias)| { + Ok(( + replace_cols_by_name(expr.clone(), &replace_map)?, + alias.clone(), + )) + }) + .collect::>()?; + let remapped_columns: IndexSet = columns + .iter() + .filter_map(|col| { + let rewritten = + replace_cols_by_name(Expr::Column(col.clone()), &replace_map).ok()?; + if let Expr::Column(c) = rewritten { + Some(c) + } else { + Some(col.clone()) + } + }) + .collect(); + Ok((remapped_pairs, remapped_columns)) +} + // ============================================================================= // Helper Functions for Extraction Targeting // ============================================================================= @@ -896,6 +942,61 @@ fn push_extraction_pairs( } } +/// Routes extraction pairs and columns to the appropriate inputs. +/// +/// - **Union**: broadcasts to every input via [`remap_pairs_and_columns`]. +/// - **Other nodes**: routes each expression to the one input that owns +/// all of its column references (via [`find_owning_input`]). +/// +/// Returns `None` if any expression can't be routed or no input has pairs. 
+fn route_to_inputs( + pairs: &[(Expr, String)], + columns: &IndexSet, + node: &LogicalPlan, + input_column_sets: &[std::collections::HashSet], + input_schemas: &[Arc], +) -> Result>, Vec>)>> { + let num_inputs = input_schemas.len(); + let mut per_input_pairs: Vec> = vec![vec![]; num_inputs]; + let mut per_input_columns: Vec> = vec![IndexSet::new(); num_inputs]; + + if matches!(node, LogicalPlan::Union(_)) { + // Union output schema and each input schema have the same fields by + // index but may differ in qualifiers (e.g. output `s` vs input + // `simple_struct.s`). Remap pairs/columns to each input's space. + let union_schema = node.schema(); + for (idx, input_schema) in input_schemas.iter().enumerate() { + let (remapped_pairs, remapped_columns) = + remap_pairs_and_columns(pairs, columns, union_schema, input_schema)?; + per_input_pairs[idx] = remapped_pairs; + per_input_columns[idx] = remapped_columns; + } + } else { + for (expr, alias) in pairs { + match find_owning_input(expr, input_column_sets) { + Some(idx) => per_input_pairs[idx].push((expr.clone(), alias.clone())), + None => return Ok(None), // Cross-input expression — bail out + } + } + for col in columns { + let col_expr = Expr::Column(col.clone()); + match find_owning_input(&col_expr, input_column_sets) { + Some(idx) => { + per_input_columns[idx].insert(col.clone()); + } + None => return Ok(None), // Ambiguous column — bail out + } + } + } + + // Check at least one input has extractions to push + if per_input_pairs.iter().all(|p| p.is_empty()) { + return Ok(None); + } + + Ok(Some((per_input_pairs, per_input_columns))) +} + /// Pushes extraction expressions into a node's inputs by routing each /// expression to the input that owns all of its column references. /// @@ -938,116 +1039,32 @@ fn try_push_into_inputs( // SubqueryAlias remaps qualifiers between input and output. // Rewrite pairs/columns from alias-space to input-space before routing. 
let (pairs, columns_needed) = if let LogicalPlan::SubqueryAlias(sa) = node { - let mut replace_map = HashMap::new(); - for ((input_q, input_f), (alias_q, alias_f)) in - sa.input.schema().iter().zip(sa.schema.iter()) - { - replace_map.insert( - qualified_name(alias_q, alias_f.name()), - Expr::Column(Column::new(input_q.cloned(), input_f.name())), - ); - } - let remapped_pairs: Vec<(Expr, String)> = pairs - .iter() - .map(|(expr, alias)| { - Ok(( - replace_cols_by_name(expr.clone(), &replace_map)?, - alias.clone(), - )) - }) - .collect::>()?; - let remapped_columns: IndexSet = columns_needed - .iter() - .filter_map(|col| { - let rewritten = - replace_cols_by_name(Expr::Column(col.clone()), &replace_map).ok()?; - if let Expr::Column(c) = rewritten { - Some(c) - } else { - Some(col.clone()) - } - }) - .collect(); - (remapped_pairs, remapped_columns) + remap_pairs_and_columns(pairs, columns_needed, &sa.schema, sa.input.schema())? } else { (pairs.to_vec(), columns_needed.clone()) }; let pairs = &pairs[..]; let columns_needed = &columns_needed; - let num_inputs = inputs.len(); - - // Build per-input column sets using existing schema_columns() + // Build per-input schemas and column sets for routing let input_schemas: Vec> = inputs.iter().map(|i| Arc::clone(i.schema())).collect(); let input_column_sets: Vec> = input_schemas.iter().map(|s| schema_columns(s)).collect(); - // Route pairs and columns to inputs. - // Union: all inputs share the same schema, so broadcast to every branch. - // Everything else (Join, single-input nodes): columns are disjoint across - // inputs, so route each expression to its owning input. - let broadcast = matches!(node, LogicalPlan::Union(_)); - - let mut per_input_pairs: Vec> = vec![vec![]; num_inputs]; - let mut per_input_columns: Vec> = vec![IndexSet::new(); num_inputs]; - - if broadcast { - // Union output schema and each input schema have the same fields by - // index but may differ in qualifiers (e.g. 
output `s` vs input - // `simple_struct.s`). Remap pairs/columns to each input's space. - let union_schema = node.schema(); - for (idx, input_schema) in input_schemas.iter().enumerate() { - let mut remap = HashMap::new(); - for ((out_q, out_f), (in_q, in_f)) in - union_schema.iter().zip(input_schema.iter()) - { - remap.insert( - qualified_name(out_q, out_f.name()), - Expr::Column(Column::new(in_q.cloned(), in_f.name())), - ); - } - per_input_pairs[idx] = pairs - .iter() - .map(|(expr, alias)| { - Ok((replace_cols_by_name(expr.clone(), &remap)?, alias.clone())) - }) - .collect::>()?; - per_input_columns[idx] = columns_needed - .iter() - .filter_map(|col| { - let rewritten = - replace_cols_by_name(Expr::Column(col.clone()), &remap).ok()?; - if let Expr::Column(c) = rewritten { - Some(c) - } else { - Some(col.clone()) - } - }) - .collect(); - } - } else { - for (expr, alias) in pairs { - match find_owning_input(expr, &input_column_sets) { - Some(idx) => per_input_pairs[idx].push((expr.clone(), alias.clone())), - None => return Ok(None), // Cross-input expression — bail out - } - } - for col in columns_needed { - let col_expr = Expr::Column(col.clone()); - match find_owning_input(&col_expr, &input_column_sets) { - Some(idx) => { - per_input_columns[idx].insert(col.clone()); - } - None => return Ok(None), // Ambiguous column — bail out - } - } - } + // Route pairs and columns to the appropriate inputs + let (per_input_pairs, per_input_columns) = match route_to_inputs( + pairs, + columns_needed, + node, + &input_column_sets, + &input_schemas, + )? { + Some(routed) => routed, + None => return Ok(None), + }; - // Check at least one input has extractions to push - if per_input_pairs.iter().all(|p| p.is_empty()) { - return Ok(None); - } + let num_inputs = inputs.len(); // Build per-input extraction projections and push them as far as possible // immediately. 
This is critical because map_children preserves cached schemas, From 39d4cca08b89aafa9a33f53fc96f21cef7c9b076 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 11 Feb 2026 10:26:47 -0500 Subject: [PATCH 07/18] simplify 4-way match in split_and_push_projection Replace the (pushed, needs_recovery) 4-way match with a two-step flow: first resolve the base plan, then conditionally wrap with recovery. This deduplicates the recovery-wrapping logic and flattens the has_new_extractions bail-out. Also update comments to use the full __datafusion_extracted prefix. Co-Authored-By: Claude Opus 4.6 --- .../optimizer/src/extract_leaf_expressions.rs | 78 ++++++++----------- 1 file changed, 34 insertions(+), 44 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index c61766bd196af..f2a0fbcfb57b5 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -249,7 +249,7 @@ fn routing_extract( // Track columns that the parent node references so the // extraction projection includes them as pass-through. // Without this, the extraction projection would only - // contain __extracted_N aliases, and the parent couldn't + // contain __datafusion_extracted_N aliases, and the parent couldn't // resolve its other column references. if let Expr::Column(col) = &e && let Some(idx) = find_owning_input(&e, input_column_sets) @@ -673,7 +673,7 @@ fn try_push_input( /// nodes, and adds a recovery projection if needed. 
/// /// Handles both: -/// - **Pure extraction projections** (all `__extracted` aliases + columns) +/// - **Pure extraction projections** (all `__datafusion_extracted` aliases + columns) /// - **Mixed projections** (containing `MoveTowardsLeafNodes` sub-expressions) /// /// Returns `Some(new_subtree)` if extractions were pushed down, @@ -688,16 +688,16 @@ fn try_push_input( /// TableScan /// /// Phase 1 (Split): -/// extraction_pairs: [(user['name'], "__extracted_1")] -/// recovery_exprs: [__extracted_1 IS NOT NULL AS has_name, id] +/// extraction_pairs: [(user['name'], "__datafusion_extracted_1")] +/// recovery_exprs: [__datafusion_extracted_1 IS NOT NULL AS has_name, id] /// /// Phase 2 (Push): /// Push extraction projection through Filter toward TableScan /// /// Phase 3 (Recovery): -/// Projection: __extracted_1 IS NOT NULL AS has_name, id <-- recovery +/// Projection: __datafusion_extracted_1 IS NOT NULL AS has_name, id <-- recovery /// Filter: ... -/// Projection: user['name'] AS __extracted_1, id <-- extraction (pushed) +/// Projection: user['name'] AS __datafusion_extracted_1, id <-- extraction (pushed) /// TableScan /// ``` fn split_and_push_projection( @@ -711,12 +711,12 @@ fn split_and_push_projection( // For each projection expression, collect extraction pairs and build // recovery expressions. // - // Pre-existing `__extracted` aliases are inserted into the extractor's - // `IndexMap` with the **full** `Expr::Alias(…)` as the key, so the - // alias name participates in equality. This prevents collisions when - // CSE rewrites produce the same inner expression under different alias - // names (e.g. `__common_expr_4 AS __extracted_1` and - // `__common_expr_4 AS __extracted_3`). New extractions from + // Pre-existing `__datafusion_extracted` aliases are inserted into the + // extractor's `IndexMap` with the **full** `Expr::Alias(…)` as the key, + // so the alias name participates in equality. 
This prevents collisions + // when CSE rewrites produce the same inner expression under different + // alias names (e.g. `__common_expr_4 AS __datafusion_extracted_1` and + // `__common_expr_4 AS __datafusion_extracted_3`). New extractions from // `routing_extract` use bare (non-Alias) keys and get normal dedup. // // When building the final `extraction_pairs`, the Alias wrapper is @@ -781,7 +781,7 @@ fn split_and_push_projection( // If the expression was transformed (i.e., has extracted sub-parts), // it differs from what the pushed projection outputs → needs recovery. - // Also, any non-column, non-__extracted expression needs recovery + // Also, any non-column, non-__datafusion_extracted expression needs recovery // because the pushed extraction projection won't output it directly. if transformed.transformed || !matches!(expr, Expr::Column(_)) { needs_recovery = true; @@ -820,30 +820,18 @@ fn split_and_push_projection( )?; // ── Phase 3: Recovery ─────────────────────────────────────────────── - match (pushed, needs_recovery) { - (Some(pushed_plan), true) => { - // Wrap with recovery projection - let recovery = LogicalPlan::Projection(Projection::try_new( - recovery_exprs, - Arc::new(pushed_plan), - )?); - Ok(Some(recovery)) - } - (Some(pushed_plan), false) => { - // No recovery needed (pure extraction projection) - Ok(Some(pushed_plan)) - } - (None, true) => { - // Push returned None but we still have extractions to apply. - // Build the extraction projection in-place (not pushed) so the - // recovery can resolve extracted expressions. + // Determine the base plan: either the pushed result or an in-place extraction. + let base_plan = match pushed { + Some(plan) => plan, + None => { if !has_new_extractions { - // Only pre-existing __extracted aliases and columns, no new + // Only pre-existing __datafusion_extracted aliases and columns, no new // extractions from routing_extract. 
The original projection is // already an extraction projection that couldn't be pushed // further. Return None. return Ok(None); } + // Build extraction projection in-place (couldn't push down) let input_arc = Arc::clone(input); let extraction = build_extraction_projection_impl( &extraction_pairs, @@ -851,17 +839,19 @@ fn split_and_push_projection( &input_arc, input_schema.as_ref(), )?; - let extraction_plan = LogicalPlan::Projection(extraction); - let recovery = LogicalPlan::Projection(Projection::try_new( - recovery_exprs, - Arc::new(extraction_plan), - )?); - Ok(Some(recovery)) - } - (None, false) => { - // No extractions could be pushed and no recovery needed - Ok(None) + LogicalPlan::Projection(extraction) } + }; + + // Wrap with recovery projection if the output schema changed + if needs_recovery { + let recovery = LogicalPlan::Projection(Projection::try_new( + recovery_exprs, + Arc::new(base_plan), + )?); + Ok(Some(recovery)) + } else { + Ok(Some(base_plan)) } } @@ -1013,16 +1003,16 @@ fn route_to_inputs( /// /// ```text /// Extraction projection above a Join: -/// Projection: left.user['name'] AS __extracted_1, right.order['total'] AS __extracted_2, ... +/// Projection: left.user['name'] AS __datafusion_extracted_1, right.order['total'] AS __datafusion_extracted_2, ... 
/// Join: left.id = right.user_id /// TableScan: left [id, user] /// TableScan: right [user_id, order] /// /// After routing each expression to its owning input: /// Join: left.id = right.user_id -/// Projection: user['name'] AS __extracted_1, id, user <-- left-side extraction +/// Projection: user['name'] AS __datafusion_extracted_1, id, user <-- left-side extraction /// TableScan: left [id, user] -/// Projection: order['total'] AS __extracted_2, user_id, order <-- right-side extraction +/// Projection: order['total'] AS __datafusion_extracted_2, user_id, order <-- right-side extraction /// TableScan: right [user_id, order] /// ``` fn try_push_into_inputs( From 12dace7fac9e2d4a5906ea0c24fe38af342b4f53 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 11 Feb 2026 10:29:02 -0500 Subject: [PATCH 08/18] fix find_owning_input ambiguity with unqualified columns find_owning_input used .position() which returns the first matching input. Combined with schema_columns generating both qualified and unqualified column forms, an unqualified column present in both sides of a Join would always be attributed to index 0 (left side). Replace with logic that counts matches and returns None when ambiguous, causing the optimizer to safely skip extraction rather than mis-route. Co-Authored-By: Claude Opus 4.6 --- .../optimizer/src/extract_leaf_expressions.rs | 107 +++++++++++++++++- 1 file changed, 103 insertions(+), 4 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index f2a0fbcfb57b5..1c5c06e3b046c 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -200,14 +200,24 @@ fn extract_from_plan( /// Given an expression, returns the index of the input whose columns fully /// cover the expression's column references. 
-/// Returns `None` if the expression references columns from multiple inputs. +/// Returns `None` if the expression references columns from multiple inputs +/// or if multiple inputs match (ambiguous, e.g. unqualified columns present +/// in both sides of a join). fn find_owning_input( expr: &Expr, input_column_sets: &[std::collections::HashSet], ) -> Option { - input_column_sets - .iter() - .position(|cols| has_all_column_refs(expr, cols)) + let mut found = None; + for (idx, cols) in input_column_sets.iter().enumerate() { + if has_all_column_refs(expr, cols) { + if found.is_some() { + // Ambiguous — multiple inputs match + return None; + } + found = Some(idx); + } + } + found } /// Walks an expression tree top-down, extracting `MoveTowardsLeafNodes` @@ -2213,6 +2223,95 @@ mod tests { "#) } + /// Join where both sides have same-named columns: a qualified reference + /// to the right side must be routed to the right input, not the left. + #[test] + fn test_extract_from_join_qualified_right_side() -> Result<()> { + use datafusion_expr::JoinType; + + let left = test_table_scan_with_struct()?; + let right = test_table_scan_with_struct_named("right")?; + + // Filter references right.user explicitly — must route to right side + let plan = LogicalPlanBuilder::from(left) + .join_on( + right, + JoinType::Inner, + vec![ + col("test.id").eq(col("right.id")), + leaf_udf(col("right.user"), "status").eq(lit("active")), + ], + )? 
+ .build()?; + + assert_stages!(plan, @r#" + ## Original Plan + Inner Join: Filter: test.id = right.id AND leaf_udf(right.user, Utf8("status")) = Utf8("active") + TableScan: test projection=[id, user] + TableScan: right projection=[id, user] + + ## After Extraction + Projection: test.id, test.user, right.id, right.user + Inner Join: Filter: test.id = right.id AND __datafusion_extracted_1 = Utf8("active") + TableScan: test projection=[id, user] + Projection: leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_1, right.id, right.user + TableScan: right projection=[id, user] + + ## After Pushdown + (same as after extraction) + + ## Optimized + (same as after pushdown) + "#) + } + + /// When both inputs contain the same unqualified column, an unqualified + /// column reference is ambiguous and `find_owning_input` must return + /// `None` rather than always returning 0 (the left side). + #[test] + fn test_find_owning_input_ambiguous_unqualified_column() { + use std::collections::HashSet; + + // Simulate schema_columns output for two sides of a join where both + // have a "user" column — each set contains the qualified and + // unqualified form. 
+ let left_cols: HashSet = [ + Column::new(Some("test"), "user"), + Column::new_unqualified("user"), + ] + .into_iter() + .collect(); + + let right_cols: HashSet = [ + Column::new(Some("right"), "user"), + Column::new_unqualified("user"), + ] + .into_iter() + .collect(); + + let input_column_sets = vec![left_cols, right_cols]; + + // Unqualified "user" matches both sets — must return None (ambiguous) + let unqualified = Expr::Column(Column::new_unqualified("user")); + assert_eq!(find_owning_input(&unqualified, &input_column_sets), None); + + // Qualified "right.user" matches only the right set — must return Some(1) + let qualified_right = + Expr::Column(Column::new(Some("right"), "user")); + assert_eq!( + find_owning_input(&qualified_right, &input_column_sets), + Some(1) + ); + + // Qualified "test.user" matches only the left set — must return Some(0) + let qualified_left = + Expr::Column(Column::new(Some("test"), "user")); + assert_eq!( + find_owning_input(&qualified_left, &input_column_sets), + Some(0) + ); + } + // ========================================================================= // Column-rename through intermediate node tests // ========================================================================= From 2120c2eec152005baa18326f0c3f81cf67fb791b Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 11 Feb 2026 10:31:25 -0500 Subject: [PATCH 09/18] document positional-mapping invariant in build_recovery_projection Add a comment explaining why positional mapping is safe (all supported node types preserve column order in with_new_exprs) and a debug_assert to catch violations early. 
Co-Authored-By: Claude Opus 4.6 --- datafusion/optimizer/src/extract_leaf_expressions.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index 1c5c06e3b046c..45c81720f96b7 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -394,8 +394,18 @@ fn build_recovery_projection( return Ok(input); } - // Schema-defining nodes (Projection, Aggregate): names may differ at some positions. + // Schema-defining nodes (Aggregate, Join): names may differ at some + // positions because extracted aliases replaced the original expressions. // Map positionally, aliasing where the name changed. + // + // Invariant: `with_new_exprs` on all supported node types (Aggregate, + // Filter, Sort, Limit, Join) preserves column order, so positional + // mapping is safe here. + debug_assert!( + orig_len == new_len, + "build_recovery_projection: positional mapping requires same field count, \ + got original={orig_len} vs new={new_len}" + ); let mut proj_exprs = Vec::with_capacity(orig_len); for (i, (orig_qualifier, orig_field)) in original_schema.iter().enumerate() { let (new_qualifier, new_field) = new_schema.qualified_field(i); From b992006f029bb5fccd208dec121b9c905f36f615 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 11 Feb 2026 10:44:35 -0500 Subject: [PATCH 10/18] strip redundant self-aliases when merging consecutive projections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When OptimizeProjections merges two stacked projections (e.g. a recovery projection on top of an extraction projection), column references in the outer projection get substituted with the inner expression. 
If the outer expression was an alias like `__extracted_1 AS f(x)`, the substitution produces `f(x) AS f(x)` — a self-alias caused by Display vs schema_name formatting differences (datafusion#10364). Fix this by checking, after substitution, whether the inner expression's schema_name matches the alias name. If so, drop the redundant alias wrapper instead of preserving it. Co-Authored-By: Claude Opus 4.6 --- .../optimizer/src/extract_leaf_expressions.rs | 6 +-- .../optimizer/src/optimize_projections/mod.rs | 14 +++++- datafusion/sqllogictest/test_files/insert.slt | 2 +- .../sqllogictest/test_files/projection.slt | 2 +- .../test_files/projection_pushdown.slt | 46 +++++++++---------- datafusion/sqllogictest/test_files/struct.slt | 2 +- 6 files changed, 42 insertions(+), 30 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index 45c81720f96b7..84ee45a2222cb 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -1313,7 +1313,7 @@ mod tests { TableScan: test projection=[user] ## Optimized - Projection: leaf_udf(test.user, Utf8("name")) AS leaf_udf(test.user,Utf8("name")) + Projection: leaf_udf(test.user, Utf8("name")) TableScan: test projection=[user] "#) } @@ -1609,7 +1609,7 @@ mod tests { TableScan: test projection=[user] ## Optimized - Projection: leaf_udf(test.user, Utf8("name")) AS leaf_udf(test.user,Utf8("name")), leaf_udf(test.user, Utf8("name")) AS name2 + Projection: leaf_udf(test.user, Utf8("name")), leaf_udf(test.user, Utf8("name")) AS name2 TableScan: test projection=[user] "#) } @@ -1827,7 +1827,7 @@ mod tests { TableScan: test projection=[user] ## Optimized - Projection: leaf_udf(test.user, Utf8("name")) AS leaf_udf(test.user,Utf8("name")) + Projection: leaf_udf(test.user, Utf8("name")) TableScan: test projection=[user] "#) } diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs 
b/datafusion/optimizer/src/optimize_projections/mod.rs index 9cccb20bcc45e..d2a85f5009787 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -559,7 +559,19 @@ fn merge_consecutive_projections(proj: Projection) -> Result rewrite_expr(*expr, &prev_projection).map(|result| { result.update_data(|expr| { - Expr::Alias(Alias::new(expr, relation, name).with_metadata(metadata)) + // After substitution, the inner expression may now have the + // same schema_name as the alias (e.g. when an extraction + // alias like `__extracted_1 AS f(x)` is resolved back to + // `f(x)`). Wrapping in a redundant self-alias causes a + // cosmetic `f(x) AS f(x)` due to Display vs schema_name + // formatting differences. Drop the alias when it matches. + if expr.schema_name().to_string() == name { + expr + } else { + Expr::Alias( + Alias::new(expr, relation, name).with_metadata(metadata), + ) + } }) }), e => rewrite_expr(e, &prev_projection), diff --git a/datafusion/sqllogictest/test_files/insert.slt b/datafusion/sqllogictest/test_files/insert.slt index 8ef2596f18e33..e7b9e77dfef58 100644 --- a/datafusion/sqllogictest/test_files/insert.slt +++ b/datafusion/sqllogictest/test_files/insert.slt @@ -165,7 +165,7 @@ ORDER BY c1 ---- logical_plan 01)Dml: op=[Insert Into] table=[table_without_values] -02)--Projection: a1 AS a1, a2 AS a2 +02)--Projection: a1, a2 03)----Sort: aggregate_test_100.c1 ASC NULLS LAST 04)------Projection: sum(aggregate_test_100.c4) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a1, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS count(*) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING AS a2, aggregate_test_100.c1 05)--------WindowAggr: 
windowExpr=[[sum(CAST(aggregate_test_100.c4 AS Int64)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING, count(Int64(1)) PARTITION BY [aggregate_test_100.c1] ORDER BY [aggregate_test_100.c9 ASC NULLS LAST] ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING]] diff --git a/datafusion/sqllogictest/test_files/projection.slt b/datafusion/sqllogictest/test_files/projection.slt index 8b3bd4d12c6a2..e18114bc51ca8 100644 --- a/datafusion/sqllogictest/test_files/projection.slt +++ b/datafusion/sqllogictest/test_files/projection.slt @@ -244,7 +244,7 @@ query TT explain select column1.c0 from t; ---- logical_plan -01)Projection: get_field(t.column1, Utf8("c0")) AS t.column1[c0] +01)Projection: get_field(t.column1, Utf8("c0")) 02)--TableScan: t projection=[column1] physical_plan 01)ProjectionExec: expr=[get_field(column1@0, c0) as t.column1[c0]] diff --git a/datafusion/sqllogictest/test_files/projection_pushdown.slt b/datafusion/sqllogictest/test_files/projection_pushdown.slt index 2ff46f3b38b9b..ab93eebf3244a 100644 --- a/datafusion/sqllogictest/test_files/projection_pushdown.slt +++ b/datafusion/sqllogictest/test_files/projection_pushdown.slt @@ -104,7 +104,7 @@ query TT EXPLAIN SELECT id, s['value'] FROM simple_struct; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) 02)--TableScan: simple_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value]], file_type=parquet @@ -122,7 +122,7 @@ query TT EXPLAIN SELECT s['label'] FROM simple_struct; ---- logical_plan -01)Projection: get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] +01)Projection: get_field(simple_struct.s, 
Utf8("label")) 02)--TableScan: simple_struct projection=[s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, label) as simple_struct.s[label]], file_type=parquet @@ -144,7 +144,7 @@ query TT EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value], get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) 02)--TableScan: simple_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) as simple_struct.s[value], get_field(s@1, label) as simple_struct.s[label]], file_type=parquet @@ -166,7 +166,7 @@ query TT EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct; ---- logical_plan -01)Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) AS nested_struct.nested[outer][inner] +01)Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) 02)--TableScan: nested_struct projection=[id, nested] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nested.parquet]]}, projection=[id, get_field(nested@1, outer, inner) as nested_struct.nested[outer][inner]], file_type=parquet @@ -186,7 +186,7 @@ query TT EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) AS simple_struct.s[value] + Int64(1) +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) 
02)--TableScan: simple_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, value) + 1 as simple_struct.s[value] + Int64(1)], file_type=parquet @@ -208,7 +208,7 @@ query TT EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct; ---- logical_plan -01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") AS simple_struct.s[label] || Utf8("_suffix") +01)Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") 02)--TableScan: simple_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id, get_field(s@1, label) || _suffix as simple_struct.s[label] || Utf8("_suffix")], file_type=parquet @@ -316,7 +316,7 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -341,7 +341,7 @@ EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) AS simple_struct.s[value] + Int64(1) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -366,7 +366,7 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY 
s['value']; ---- logical_plan 01)Sort: simple_struct.s[value] ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: expr=[simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] @@ -440,7 +440,7 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id LIMIT 3; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -463,7 +463,7 @@ EXPLAIN SELECT id, s['value'] + 1 FROM simple_struct ORDER BY id LIMIT 3; ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) AS simple_struct.s[value] + Int64(1) +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(1) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -486,7 +486,7 @@ EXPLAIN SELECT id, s['value'], s['label'] FROM simple_struct ORDER BY id LIMIT 3 ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value], get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), 
expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -509,7 +509,7 @@ EXPLAIN SELECT id, nested['outer']['inner'] FROM nested_struct ORDER BY id LIMIT ---- logical_plan 01)Sort: nested_struct.id ASC NULLS LAST, fetch=2 -02)--Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) AS nested_struct.nested[outer][inner] +02)--Projection: nested_struct.id, get_field(nested_struct.nested, Utf8("outer"), Utf8("inner")) 03)----TableScan: nested_struct projection=[id, nested] physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -531,7 +531,7 @@ EXPLAIN SELECT id, s['label'] || '_suffix' FROM simple_struct ORDER BY id LIMIT ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") AS simple_struct.s[label] || Utf8("_suffix") +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("label")) || Utf8("_suffix") 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -679,7 +679,7 @@ EXPLAIN SELECT id, s['value'] FROM multi_struct ORDER BY id; ---- logical_plan 01)Sort: multi_struct.id ASC NULLS LAST -02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) AS multi_struct.s[value] +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) 03)----TableScan: multi_struct projection=[id, s] physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST] @@ -705,7 +705,7 @@ EXPLAIN SELECT id, s['value'] FROM multi_struct ORDER BY id LIMIT 3; ---- logical_plan 01)Sort: multi_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) AS multi_struct.s[value] +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) 03)----TableScan: multi_struct projection=[id, s] 
physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=3 @@ -729,7 +729,7 @@ EXPLAIN SELECT id, s['value'] + 1 FROM multi_struct ORDER BY id LIMIT 3; ---- logical_plan 01)Sort: multi_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) + Int64(1) AS multi_struct.s[value] + Int64(1) +02)--Projection: multi_struct.id, get_field(multi_struct.s, Utf8("value")) + Int64(1) 03)----TableScan: multi_struct projection=[id, s] physical_plan 01)SortPreservingMergeExec: [id@0 ASC NULLS LAST], fetch=3 @@ -819,7 +819,7 @@ query TT EXPLAIN SELECT id, s['value'] FROM nullable_struct; ---- logical_plan -01)Projection: nullable_struct.id, get_field(nullable_struct.s, Utf8("value")) AS nullable_struct.s[value] +01)Projection: nullable_struct.id, get_field(nullable_struct.s, Utf8("value")) 02)--TableScan: nullable_struct projection=[id, s] physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/nullable.parquet]]}, projection=[id, get_field(s@1, value) as nullable_struct.s[value]], file_type=parquet @@ -867,7 +867,7 @@ EXPLAIN SELECT id, s['value'], s['value'] + 10, s['label'] FROM simple_struct OR ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=3 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value], get_field(simple_struct.s, Utf8("value")) + Int64(10) AS simple_struct.s[value] + Int64(10), get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("value")) + Int64(10), get_field(simple_struct.s, Utf8("label")) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=3), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -1088,7 +1088,7 @@ EXPLAIN SELECT id, 42 as answer, s['label'] FROM simple_struct ORDER BY 
id LIMIT ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=2 -02)--Projection: simple_struct.id, Int64(42) AS answer, get_field(simple_struct.s, Utf8("label")) AS simple_struct.s[label] +02)--Projection: simple_struct.id, Int64(42) AS answer, get_field(simple_struct.s, Utf8("label")) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -1111,7 +1111,7 @@ EXPLAIN SELECT id, s['value'] + 100, s['label'] || '_test' FROM simple_struct OR ---- logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, fetch=2 -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(100) AS simple_struct.s[value] + Int64(100), get_field(simple_struct.s, Utf8("label")) || Utf8("_test") AS simple_struct.s[label] || Utf8("_test") +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) + Int64(100), get_field(simple_struct.s, Utf8("label")) || Utf8("_test") 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: TopK(fetch=2), expr=[id@0 ASC NULLS LAST], preserve_partitioning=[false] @@ -1280,7 +1280,7 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id, s['label']; logical_plan 01)Projection: simple_struct.id, simple_struct.s[value] 02)--Sort: simple_struct.id ASC NULLS LAST, __datafusion_extracted_1 ASC NULLS LAST -03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value], get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_1 +03)----Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")), get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_1 04)------TableScan: simple_struct projection=[id, s] physical_plan 01)ProjectionExec: expr=[id@0 as id, simple_struct.s[value]@1 as simple_struct.s[value]] @@ -1362,7 +1362,7 @@ EXPLAIN SELECT id, s['value'] FROM simple_struct ORDER BY id, s['value']; ---- 
logical_plan 01)Sort: simple_struct.id ASC NULLS LAST, simple_struct.s[value] ASC NULLS LAST -02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) AS simple_struct.s[value] +02)--Projection: simple_struct.id, get_field(simple_struct.s, Utf8("value")) 03)----TableScan: simple_struct projection=[id, s] physical_plan 01)SortExec: expr=[id@0 ASC NULLS LAST, simple_struct.s[value]@1 ASC NULLS LAST], preserve_partitioning=[false] diff --git a/datafusion/sqllogictest/test_files/struct.slt b/datafusion/sqllogictest/test_files/struct.slt index 09dd98a50b579..53a1bb4ec6751 100644 --- a/datafusion/sqllogictest/test_files/struct.slt +++ b/datafusion/sqllogictest/test_files/struct.slt @@ -661,7 +661,7 @@ query TT explain select s['a']['b'] from explain_test; ---- logical_plan -01)Projection: get_field(explain_test.s, Utf8("a"), Utf8("b")) AS explain_test.s[a][b] +01)Projection: get_field(explain_test.s, Utf8("a"), Utf8("b")) 02)--TableScan: explain_test projection=[s] physical_plan 01)ProjectionExec: expr=[get_field(s@0, a, b) as explain_test.s[a][b]] From ee557b7e9b3030f314487e52836779e1e7189f6c Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 11 Feb 2026 10:47:20 -0500 Subject: [PATCH 11/18] add tests for aggregate-blocked pushdown fallback and cross-input join extraction Co-Authored-By: Claude Opus 4.6 --- .../optimizer/src/extract_leaf_expressions.rs | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index 84ee45a2222cb..e3a253f180901 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -1923,6 +1923,44 @@ mod tests { "#) } + /// Projection containing a MoveTowardsLeafNodes sub-expression above an + /// Aggregate. 
Aggregate blocks pushdown, so the (None, true) recovery + /// fallback path fires: in-place extraction + recovery projection. + #[test] + fn test_projection_with_leaf_expr_above_aggregate() -> Result<()> { + use datafusion_expr::test::function_stub::count; + + let table_scan = test_table_scan_with_struct()?; + let plan = LogicalPlanBuilder::from(table_scan) + .aggregate(vec![col("user")], vec![count(lit(1))])? + .project(vec![ + leaf_udf(col("user"), "name").is_not_null().alias("has_name"), + col("COUNT(Int32(1))"), + ])? + .build()?; + + assert_stages!(plan, @r#" + ## Original Plan + Projection: leaf_udf(test.user, Utf8("name")) IS NOT NULL AS has_name, COUNT(Int32(1)) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(Int32(1))]] + TableScan: test projection=[user] + + ## After Extraction + (same as original) + + ## After Pushdown + Projection: __datafusion_extracted_1 IS NOT NULL AS has_name, COUNT(Int32(1)) + Projection: leaf_udf(test.user, Utf8("name")) AS __datafusion_extracted_1, test.user, COUNT(Int32(1)) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(Int32(1))]] + TableScan: test projection=[user] + + ## Optimized + Projection: leaf_udf(test.user, Utf8("name")) IS NOT NULL AS has_name, COUNT(Int32(1)) + Aggregate: groupBy=[[test.user]], aggr=[[COUNT(Int32(1))]] + TableScan: test projection=[user] + "#) + } + /// Merging adds new pass-through columns not in the existing extracted projection. #[test] fn test_merge_with_new_columns() -> Result<()> { @@ -2322,6 +2360,54 @@ mod tests { ); } + /// Two leaf_udf expressions from different sides of a Join in a Filter. + /// Each is routed to its respective input side independently. + #[test] + fn test_extract_from_join_cross_input_expression() -> Result<()> { + let left = test_table_scan_with_struct()?; + let right = test_table_scan_with_struct_named("right")?; + + let plan = LogicalPlanBuilder::from(left) + .join_on( + right, + datafusion_expr::JoinType::Inner, + vec![col("test.id").eq(col("right.id"))], + )? 
+ .filter( + leaf_udf(col("test.user"), "status") + .eq(leaf_udf(col("right.user"), "status")), + )? + .build()?; + + assert_stages!(plan, @r#" + ## Original Plan + Filter: leaf_udf(test.user, Utf8("status")) = leaf_udf(right.user, Utf8("status")) + Inner Join: Filter: test.id = right.id + TableScan: test projection=[id, user] + TableScan: right projection=[id, user] + + ## After Extraction + Projection: test.id, test.user, right.id, right.user + Filter: __datafusion_extracted_1 = __datafusion_extracted_2 + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_2, test.id, test.user, right.id, right.user + Inner Join: Filter: test.id = right.id + TableScan: test projection=[id, user] + TableScan: right projection=[id, user] + + ## After Pushdown + Projection: test.id, test.user, right.id, right.user + Filter: __datafusion_extracted_1 = __datafusion_extracted_2 + Inner Join: Filter: test.id = right.id + Projection: leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1, test.id, test.user + TableScan: test projection=[id, user] + Projection: leaf_udf(right.user, Utf8("status")) AS __datafusion_extracted_2, right.id, right.user + TableScan: right projection=[id, user] + + ## Optimized + (same as after pushdown) + "#) + } + // ========================================================================= // Column-rename through intermediate node tests // ========================================================================= From 6b228a6302e9b4ab269b2000cab4b95a94eb81ea Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 11 Feb 2026 11:10:33 -0500 Subject: [PATCH 12/18] fix merge guard inflation and eliminate double clone in extract_leaf_expressions The projection merge guard used `pairs.len() + columns_needed.len()` which over-counted when extracted aliases referenced columns not present as standalone expressions, 
rejecting valid merges. Replace with a direct `proj_exprs_captured` counter that tracks only Case A/B expressions. Also detect when the merge will widen the schema (column refs from extracted aliases that aren't standalone) to trigger recovery. In extract_from_plan, store owned_inputs as Vec<Arc<LogicalPlan>> and use Arc::try_unwrap to avoid cloning each input twice. Co-Authored-By: Claude Opus 4.6 --- .../optimizer/src/extract_leaf_expressions.rs | 96 +++++++++++++++---- 1 file changed, 80 insertions(+), 16 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index e3a253f180901..47500a0dc36cb 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -169,20 +169,27 @@ fn extract_from_plan( return Ok(transformed); } - // Clone inputs now that we know extraction succeeded. We need owned - // copies to wrap in extraction projections below each input. - let owned_inputs: Vec = - transformed.data.inputs().into_iter().cloned().collect(); + // Clone inputs now that we know extraction succeeded. Wrap in Arc + // upfront since build_extraction_projection expects &Arc. + let owned_inputs: Vec> = transformed + .data + .inputs() + .into_iter() + .map(|i| Arc::new(i.clone())) + .collect(); // Build per-input extraction projections (None means no extractions for that input) let new_inputs: Vec = owned_inputs - .iter() + .into_iter() .zip(extractors.iter()) - .map(|(input, extractor)| { - let input_arc = Arc::new(input.clone()); - Ok(extractor - .build_extraction_projection(&input_arc)? - .unwrap_or_else(|| input.clone())) + .map(|(input_arc, extractor)| { + match extractor.build_extraction_projection(&input_arc)? { + Some(plan) => Ok(plan), + // No extractions for this input — recover the LogicalPlan + // without cloning (refcount is 1 since build returned None).
+ None => Ok(Arc::try_unwrap(input_arc) + .unwrap_or_else(|arc| (*arc).clone())), + } }) .collect::>>()?; @@ -752,6 +759,10 @@ fn split_and_push_projection( let mut recovery_exprs: Vec = Vec::with_capacity(proj.expr.len()); let mut needs_recovery = false; let mut has_new_extractions = false; + let mut proj_exprs_captured: usize = 0; + // Track standalone column expressions (Case B) to detect column refs + // from extracted aliases (Case A) that aren't also standalone expressions. + let mut standalone_columns: IndexSet = IndexSet::new(); for (expr, (qualifier, field)) in proj.expr.iter().zip(original_schema.iter()) { if let Expr::Alias(alias) = expr @@ -769,10 +780,13 @@ fn split_and_push_projection( .extracted .insert(expr.clone(), alias_name.clone()); recovery_exprs.push(Expr::Column(Column::new_unqualified(&alias_name))); + proj_exprs_captured += 1; } else if let Expr::Column(col) = expr { // Plain column pass-through — track it in the extractor extractors[0].columns_needed.insert(col.clone()); + standalone_columns.insert(col.clone()); recovery_exprs.push(expr.clone()); + proj_exprs_captured += 1; } else { // Everything else: run through routing_extract let transformed = @@ -829,6 +843,14 @@ fn split_and_push_projection( return Ok(None); } + // If columns_needed has entries that aren't standalone projection columns + // (i.e., they came from column refs inside extracted aliases), a merge + // into an inner projection will widen the schema with those extra columns, + // requiring a recovery projection to restore the original schema. 
+ if columns_needed.iter().any(|c| !standalone_columns.contains(c)) { + needs_recovery = true; + } + // ── Phase 2: Push down ────────────────────────────────────────────── let proj_input = Arc::clone(&proj.input); let pushed = push_extraction_pairs( @@ -837,6 +859,7 @@ fn split_and_push_projection( proj, &proj_input, alias_generator, + proj_exprs_captured, )?; // ── Phase 3: Recovery ─────────────────────────────────────────────── @@ -903,16 +926,17 @@ fn push_extraction_pairs( proj: &Projection, proj_input: &Arc, alias_generator: &Arc, + proj_exprs_captured: usize, ) -> Result> { match proj_input.as_ref() { // Merge into existing projection, then try to push the result further down. - // Only merge when every expression in the outer projection is accounted - // for as either an extraction pair or a needed column. Uncaptured - // expressions (e.g. `col AS __common_expr_1` from CSE) would be lost - // during the merge since build_extraction_projection_impl only knows - // about the captured pairs and columns. + // Only merge when every expression in the outer projection is fully + // captured as either an extraction pair (Case A: __datafusion_extracted + // alias) or a plain column (Case B). Uncaptured expressions (e.g. + // `col AS __common_expr_1` from CSE, or complex expressions with + // extracted sub-parts) would be lost during the merge. LogicalPlan::Projection(_) - if pairs.len() + columns_needed.len() == proj.expr.len() => + if proj_exprs_captured == proj.expr.len() => { let target_schema = Arc::clone(proj_input.schema()); let merged = build_extraction_projection_impl( @@ -2897,4 +2921,44 @@ mod tests { TableScan: test projection=[id, user] "#) } + + /// When an extraction projection's __extracted alias references a column + /// (e.g. `user`) that is NOT a standalone expression in the projection, + /// the merge into the inner projection should still succeed. 
+ #[test] + fn test_merge_extraction_into_projection_with_column_ref_inflation() -> Result<()> + { + let table_scan = test_table_scan_with_struct()?; + + // Inner projection (simulates a trimmed projection) + let inner = LogicalPlanBuilder::from(table_scan) + .project(vec![col("user"), col("id")])? + .build()?; + + // Outer projection: __extracted alias + id (but NOT user as standalone). + // The alias references `user` internally, inflating columns_needed. + let plan = LogicalPlanBuilder::from(inner) + .project(vec![ + leaf_udf(col("user"), "status") + .alias(format!("{EXTRACTED_EXPR_PREFIX}_1")), + col("id"), + ])? + .build()?; + + // Run only PushDownLeafProjections + let ctx = OptimizerContext::new().with_max_passes(1); + let optimizer = + Optimizer::with_rules(vec![Arc::new(PushDownLeafProjections::new())]); + let result = optimizer.optimize(plan, &ctx, |_, _| {})?; + + // With the fix: merge succeeds → extraction merged into inner projection. + // Without the fix: merge rejected → two separate projections remain. 
+ insta::assert_snapshot!(format!("{result}"), @r#" + Projection: __datafusion_extracted_1, test.id + Projection: test.user, test.id, leaf_udf(test.user, Utf8("status")) AS __datafusion_extracted_1 + TableScan: test + "#); + + Ok(()) + } } From eef3928460cb22e1b85aefa54a5cb51d85572f12 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 11 Feb 2026 11:53:18 -0500 Subject: [PATCH 13/18] apply rustfmt and fix redundant CAST alias in projection_pushdown test Co-Authored-By: Claude Opus 4.6 --- .../optimizer/src/extract_leaf_expressions.rs | 27 ++++++++++--------- .../test_files/projection_pushdown.slt | 2 +- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index 47500a0dc36cb..4a959f8230b1c 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -187,8 +187,9 @@ fn extract_from_plan( Some(plan) => Ok(plan), // No extractions for this input — recover the LogicalPlan // without cloning (refcount is 1 since build returned None). - None => Ok(Arc::try_unwrap(input_arc) - .unwrap_or_else(|arc| (*arc).clone())), + None => { + Ok(Arc::try_unwrap(input_arc).unwrap_or_else(|arc| (*arc).clone())) + } } }) .collect::>>()?; @@ -847,7 +848,10 @@ fn split_and_push_projection( // (i.e., they came from column refs inside extracted aliases), a merge // into an inner projection will widen the schema with those extra columns, // requiring a recovery projection to restore the original schema. - if columns_needed.iter().any(|c| !standalone_columns.contains(c)) { + if columns_needed + .iter() + .any(|c| !standalone_columns.contains(c)) + { needs_recovery = true; } @@ -935,9 +939,7 @@ fn push_extraction_pairs( // alias) or a plain column (Case B). Uncaptured expressions (e.g. 
// `col AS __common_expr_1` from CSE, or complex expressions with // extracted sub-parts) would be lost during the merge. - LogicalPlan::Projection(_) - if proj_exprs_captured == proj.expr.len() => - { + LogicalPlan::Projection(_) if proj_exprs_captured == proj.expr.len() => { let target_schema = Arc::clone(proj_input.schema()); let merged = build_extraction_projection_impl( pairs, @@ -1958,7 +1960,9 @@ mod tests { let plan = LogicalPlanBuilder::from(table_scan) .aggregate(vec![col("user")], vec![count(lit(1))])? .project(vec![ - leaf_udf(col("user"), "name").is_not_null().alias("has_name"), + leaf_udf(col("user"), "name") + .is_not_null() + .alias("has_name"), col("COUNT(Int32(1))"), ])? .build()?; @@ -2368,16 +2372,14 @@ mod tests { assert_eq!(find_owning_input(&unqualified, &input_column_sets), None); // Qualified "right.user" matches only the right set — must return Some(1) - let qualified_right = - Expr::Column(Column::new(Some("right"), "user")); + let qualified_right = Expr::Column(Column::new(Some("right"), "user")); assert_eq!( find_owning_input(&qualified_right, &input_column_sets), Some(1) ); // Qualified "test.user" matches only the left set — must return Some(0) - let qualified_left = - Expr::Column(Column::new(Some("test"), "user")); + let qualified_left = Expr::Column(Column::new(Some("test"), "user")); assert_eq!( find_owning_input(&qualified_left, &input_column_sets), Some(0) @@ -2926,8 +2928,7 @@ mod tests { /// (e.g. `user`) that is NOT a standalone expression in the projection, /// the merge into the inner projection should still succeed. 
#[test] - fn test_merge_extraction_into_projection_with_column_ref_inflation() -> Result<()> - { + fn test_merge_extraction_into_projection_with_column_ref_inflation() -> Result<()> { let table_scan = test_table_scan_with_struct()?; // Inner projection (simulates a trimmed projection) diff --git a/datafusion/sqllogictest/test_files/projection_pushdown.slt b/datafusion/sqllogictest/test_files/projection_pushdown.slt index ab93eebf3244a..11dff581da1a0 100644 --- a/datafusion/sqllogictest/test_files/projection_pushdown.slt +++ b/datafusion/sqllogictest/test_files/projection_pushdown.slt @@ -1055,7 +1055,7 @@ query TT EXPLAIN SELECT s['value'] * 2 + length(s['label']) as score FROM simple_struct WHERE id > 1; ---- logical_plan -01)Projection: __datafusion_extracted_1 * Int64(2) + CAST(character_length(__datafusion_extracted_2) AS length(get_field(simple_struct.s, Utf8("label"))) AS Int64) AS score +01)Projection: __datafusion_extracted_1 * Int64(2) + CAST(character_length(__datafusion_extracted_2) AS Int64) AS score 02)--Filter: simple_struct.id > Int64(1) 03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("label")) AS __datafusion_extracted_2, simple_struct.id 04)------TableScan: simple_struct projection=[id, s], partial_filters=[simple_struct.id > Int64(1)] From a32783f757f8f76b6f09ed7c9fe95e156b3b572d Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 11 Feb 2026 11:56:42 -0500 Subject: [PATCH 14/18] refactor ExtractionTarget struct to fix clippy::type_complexity Introduce an `ExtractionTarget` struct with named `pairs` and `columns` fields to replace complex tuple return types in `remap_pairs_and_columns` and `route_to_inputs`. 
Co-Authored-By: Claude Opus 4.6 --- .../optimizer/src/extract_leaf_expressions.rs | 61 ++++++++++++------- datafusion/optimizer/src/test/udfs.rs | 22 ++----- 2 files changed, 45 insertions(+), 38 deletions(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index 4a959f8230b1c..75fe08c3a0147 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -308,7 +308,7 @@ fn remap_pairs_and_columns( columns: &IndexSet, from_schema: &DFSchema, to_schema: &DFSchema, -) -> Result<(Vec<(Expr, String)>, IndexSet)> { +) -> Result { let mut replace_map = HashMap::new(); for ((from_q, from_f), (to_q, to_f)) in from_schema.iter().zip(to_schema.iter()) { replace_map.insert( @@ -337,13 +337,25 @@ fn remap_pairs_and_columns( } }) .collect(); - Ok((remapped_pairs, remapped_columns)) + Ok(ExtractionTarget { + pairs: remapped_pairs, + columns: remapped_columns, + }) } // ============================================================================= -// Helper Functions for Extraction Targeting +// Helper Types & Functions for Extraction Targeting // ============================================================================= +/// A bundle of extraction pairs (expression + alias) and standalone columns +/// that need to be pushed through a plan node. +struct ExtractionTarget { + /// Extracted expressions paired with their generated aliases. + pairs: Vec<(Expr, String)>, + /// Standalone column references needed by the parent node. + columns: IndexSet, +} + /// Build a replacement map from a projection: output_column_name -> underlying_expr. /// /// This is used to resolve column references through a renaming projection. 
@@ -991,10 +1003,14 @@ fn route_to_inputs( node: &LogicalPlan, input_column_sets: &[std::collections::HashSet], input_schemas: &[Arc], -) -> Result>, Vec>)>> { +) -> Result>> { let num_inputs = input_schemas.len(); - let mut per_input_pairs: Vec> = vec![vec![]; num_inputs]; - let mut per_input_columns: Vec> = vec![IndexSet::new(); num_inputs]; + let mut per_input: Vec = (0..num_inputs) + .map(|_| ExtractionTarget { + pairs: vec![], + columns: IndexSet::new(), + }) + .collect(); if matches!(node, LogicalPlan::Union(_)) { // Union output schema and each input schema have the same fields by @@ -1002,15 +1018,13 @@ fn route_to_inputs( // `simple_struct.s`). Remap pairs/columns to each input's space. let union_schema = node.schema(); for (idx, input_schema) in input_schemas.iter().enumerate() { - let (remapped_pairs, remapped_columns) = + per_input[idx] = remap_pairs_and_columns(pairs, columns, union_schema, input_schema)?; - per_input_pairs[idx] = remapped_pairs; - per_input_columns[idx] = remapped_columns; } } else { for (expr, alias) in pairs { match find_owning_input(expr, input_column_sets) { - Some(idx) => per_input_pairs[idx].push((expr.clone(), alias.clone())), + Some(idx) => per_input[idx].pairs.push((expr.clone(), alias.clone())), None => return Ok(None), // Cross-input expression — bail out } } @@ -1018,7 +1032,7 @@ fn route_to_inputs( let col_expr = Expr::Column(col.clone()); match find_owning_input(&col_expr, input_column_sets) { Some(idx) => { - per_input_columns[idx].insert(col.clone()); + per_input[idx].columns.insert(col.clone()); } None => return Ok(None), // Ambiguous column — bail out } @@ -1026,11 +1040,11 @@ fn route_to_inputs( } // Check at least one input has extractions to push - if per_input_pairs.iter().all(|p| p.is_empty()) { + if per_input.iter().all(|t| t.pairs.is_empty()) { return Ok(None); } - Ok(Some((per_input_pairs, per_input_columns))) + Ok(Some(per_input)) } /// Pushes extraction expressions into a node's inputs by routing each @@ 
-1074,13 +1088,16 @@ fn try_push_into_inputs( // SubqueryAlias remaps qualifiers between input and output. // Rewrite pairs/columns from alias-space to input-space before routing. - let (pairs, columns_needed) = if let LogicalPlan::SubqueryAlias(sa) = node { + let remapped = if let LogicalPlan::SubqueryAlias(sa) = node { remap_pairs_and_columns(pairs, columns_needed, &sa.schema, sa.input.schema())? } else { - (pairs.to_vec(), columns_needed.clone()) + ExtractionTarget { + pairs: pairs.to_vec(), + columns: columns_needed.clone(), + } }; - let pairs = &pairs[..]; - let columns_needed = &columns_needed; + let pairs = &remapped.pairs[..]; + let columns_needed = &remapped.columns; // Build per-input schemas and column sets for routing let input_schemas: Vec> = @@ -1089,7 +1106,7 @@ fn try_push_into_inputs( input_schemas.iter().map(|s| schema_columns(s)).collect(); // Route pairs and columns to the appropriate inputs - let (per_input_pairs, per_input_columns) = match route_to_inputs( + let per_input = match route_to_inputs( pairs, columns_needed, node, @@ -1108,14 +1125,14 @@ fn try_push_into_inputs( // schema), the parent node's schema becomes stale. let mut new_inputs: Vec = Vec::with_capacity(num_inputs); for (idx, input) in inputs.into_iter().enumerate() { - if per_input_pairs[idx].is_empty() { + if per_input[idx].pairs.is_empty() { new_inputs.push(input.clone()); } else { let input_arc = Arc::new(input.clone()); let target_schema = Arc::clone(input.schema()); let proj = build_extraction_projection_impl( - &per_input_pairs[idx], - &per_input_columns[idx], + &per_input[idx].pairs, + &per_input[idx].columns, &input_arc, target_schema.as_ref(), )?; @@ -1123,7 +1140,7 @@ fn try_push_into_inputs( // A merge may deduplicate if the same expression already exists // under a different alias, leaving the requested alias missing. 
let proj_schema = proj.schema.as_ref(); - for (_expr, alias) in &per_input_pairs[idx] { + for (_expr, alias) in &per_input[idx].pairs { if !proj_schema.fields().iter().any(|f| f.name() == alias) { return Ok(None); } diff --git a/datafusion/optimizer/src/test/udfs.rs b/datafusion/optimizer/src/test/udfs.rs index 35ea3e44d3e72..9164603dba3d5 100644 --- a/datafusion/optimizer/src/test/udfs.rs +++ b/datafusion/optimizer/src/test/udfs.rs @@ -30,7 +30,6 @@ use datafusion_expr::{ pub struct PlacementTestUDF { signature: Signature, placement: ExpressionPlacement, - name: String, id: usize, } @@ -43,7 +42,6 @@ impl Default for PlacementTestUDF { impl PlacementTestUDF { pub fn new() -> Self { Self { - name: "leaf_udf".to_string(), // Accept any one or two arguments and return UInt32 for testing purposes. // The actual types don't matter since this UDF is not intended for execution. signature: Signature::new( @@ -59,19 +57,6 @@ impl PlacementTestUDF { /// This also resets the name of the UDF to a default based on the placement. pub fn with_placement(mut self, placement: ExpressionPlacement) -> Self { self.placement = placement; - self.name = match self.placement { - ExpressionPlacement::MoveTowardsLeafNodes => "leaf_udf", - ExpressionPlacement::KeepInPlace => "keep_in_place_udf", - ExpressionPlacement::Column => "column_udf", - ExpressionPlacement::Literal => "literal_udf", - } - .to_string(); - self - } - - /// Set the name of the UDF, which is used in the expression and thus in optimizer rules. 
- pub fn with_name(mut self, name: &str) -> Self { - self.name = name.to_string(); self } @@ -88,7 +73,12 @@ impl ScalarUDFImpl for PlacementTestUDF { self } fn name(&self) -> &str { - &self.name + match self.placement { + ExpressionPlacement::MoveTowardsLeafNodes => "leaf_udf", + ExpressionPlacement::KeepInPlace => "keep_in_place_udf", + ExpressionPlacement::Column => "column_udf", + ExpressionPlacement::Literal => "literal_udf", + } } fn signature(&self) -> &Signature { &self.signature From 00498f857cf260ae0572b807c79c409c6392058e Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 11 Feb 2026 11:58:46 -0500 Subject: [PATCH 15/18] Revert --- datafusion/sqllogictest/test_files/struct.slt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/struct.slt b/datafusion/sqllogictest/test_files/struct.slt index 53a1bb4ec6751..e20815a58c765 100644 --- a/datafusion/sqllogictest/test_files/struct.slt +++ b/datafusion/sqllogictest/test_files/struct.slt @@ -1666,4 +1666,4 @@ order by id; 3 2 150 statement ok -drop table t_agg_window; +drop table t_agg_window; \ No newline at end of file From 53730613c367761409e7e97ca46a1d1d2a940096 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 11 Feb 2026 13:27:17 -0500 Subject: [PATCH 16/18] add SLT tests for aggregate-blocked and cross-input join get_field pushdown Translates three unit tests from extract_leaf_expressions.rs into SQL-level SLT tests covering get_field above aggregate, qualified right-side join extraction, and cross-input join comparison. 
Co-Authored-By: Claude Opus 4.6 --- .../test_files/projection_pushdown.slt | 114 ++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/datafusion/sqllogictest/test_files/projection_pushdown.slt b/datafusion/sqllogictest/test_files/projection_pushdown.slt index 11dff581da1a0..15806edcd542f 100644 --- a/datafusion/sqllogictest/test_files/projection_pushdown.slt +++ b/datafusion/sqllogictest/test_files/projection_pushdown.slt @@ -1834,3 +1834,117 @@ SELECT s['value'], s['label'] FROM ( 200 beta 250 epsilon 300 delta + +##################### +# Section 16: Aggregate / Join edge-case tests +# Translated from unit tests in extract_leaf_expressions.rs +##################### + +### +# Test 16.1: Projection with get_field above Aggregate +# Aggregate blocks pushdown, so the get_field stays in the top projection. +# (mirrors test_projection_with_leaf_expr_above_aggregate) +### + +query TT +EXPLAIN SELECT s['label'] IS NOT NULL AS has_label, COUNT(1) +FROM simple_struct GROUP BY s; +---- +logical_plan +01)Projection: get_field(simple_struct.s, Utf8("label")) IS NOT NULL AS has_label, count(Int64(1)) +02)--Aggregate: groupBy=[[simple_struct.s]], aggr=[[count(Int64(1))]] +03)----TableScan: simple_struct projection=[s] +physical_plan +01)ProjectionExec: expr=[get_field(s@0, label) IS NOT NULL as has_label, count(Int64(1))@1 as count(Int64(1))] +02)--AggregateExec: mode=Single, gby=[s@0 as s], aggr=[count(Int64(1))] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[s], file_type=parquet + +# Verify correctness - all labels are non-null +query BI +SELECT s['label'] IS NOT NULL AS has_label, COUNT(1) +FROM simple_struct GROUP BY s ORDER BY COUNT(1); +---- +true 1 +true 1 +true 1 +true 1 +true 1 + +### +# Test 16.2: Join with get_field filter on qualified right side +# The get_field on join_right.s['role'] must be routed to the right input only. 
+# (mirrors test_extract_from_join_qualified_right_side) +### + +query TT +EXPLAIN SELECT simple_struct.id, join_right.id +FROM simple_struct +INNER JOIN join_right + ON simple_struct.id = join_right.id + AND join_right.s['role'] = 'admin'; +---- +logical_plan +01)Inner Join: simple_struct.id = join_right.id +02)--TableScan: simple_struct projection=[id] +03)--Projection: join_right.id +04)----Filter: __datafusion_extracted_1 = Utf8("admin") +05)------Projection: get_field(join_right.s, Utf8("role")) AS __datafusion_extracted_1, join_right.id +06)--------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("role")) = Utf8("admin")] +physical_plan +01)ProjectionExec: expr=[id@1 as id, id@0 as id] +02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)] +03)----FilterExec: __datafusion_extracted_1@0 = admin, projection=[id@1] +04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, role) as __datafusion_extracted_1, id], file_type=parquet +05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness - only admin roles match (ids 1 and 4) +query II +SELECT simple_struct.id, join_right.id +FROM simple_struct +INNER JOIN join_right + ON simple_struct.id = join_right.id + AND join_right.s['role'] = 'admin' +ORDER BY simple_struct.id; +---- +1 1 +4 4 + +### +# Test 16.3: Join with cross-input get_field comparison in WHERE +# get_field from each side is extracted and routed to its respective input independently. 
+# (mirrors test_extract_from_join_cross_input_expression) +### + +query TT +EXPLAIN SELECT simple_struct.id, join_right.id +FROM simple_struct +INNER JOIN join_right ON simple_struct.id = join_right.id +WHERE simple_struct.s['value'] > join_right.s['level']; +---- +logical_plan +01)Projection: simple_struct.id, join_right.id +02)--Inner Join: simple_struct.id = join_right.id Filter: __datafusion_extracted_1 > __datafusion_extracted_2 +03)----Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, simple_struct.id +04)------TableScan: simple_struct projection=[id, s] +05)----Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_2, join_right.id +06)------TableScan: join_right projection=[id, s] +physical_plan +01)HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@1, id@1)], filter=__datafusion_extracted_1@0 > __datafusion_extracted_2@1, projection=[id@1, id@3] +02)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, id], file_type=parquet +03)--DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, id], file_type=parquet, predicate=DynamicFilter [ empty ] + +# Verify correctness - all rows match since value >> level for all ids +# simple_struct: (1,100), (2,200), (3,150), (4,300), (5,250) +# join_right: (1,10), (2,5), (3,1), (4,8), (5,3) +query II +SELECT simple_struct.id, join_right.id +FROM simple_struct +INNER JOIN join_right ON simple_struct.id = join_right.id +WHERE simple_struct.s['value'] > join_right.s['level'] +ORDER BY simple_struct.id; +---- +1 1 +2 2 +3 3 +4 4 +5 5 From 4ec0e6ccd7cc483d2d7d13a83e9cca624166aeb3 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco 
<1755071+adriangb@users.noreply.github.com> Date: Wed, 11 Feb 2026 16:17:49 -0500 Subject: [PATCH 17/18] update test --- .../test_files/projection_pushdown.slt | 37 ++++++++++--------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/datafusion/sqllogictest/test_files/projection_pushdown.slt b/datafusion/sqllogictest/test_files/projection_pushdown.slt index 15806edcd542f..c25b80a0d7f20 100644 --- a/datafusion/sqllogictest/test_files/projection_pushdown.slt +++ b/datafusion/sqllogictest/test_files/projection_pushdown.slt @@ -1877,25 +1877,26 @@ true 1 ### query TT -EXPLAIN SELECT simple_struct.id, join_right.id -FROM simple_struct -INNER JOIN join_right - ON simple_struct.id = join_right.id - AND join_right.s['role'] = 'admin'; ----- -logical_plan -01)Inner Join: simple_struct.id = join_right.id -02)--TableScan: simple_struct projection=[id] -03)--Projection: join_right.id -04)----Filter: __datafusion_extracted_1 = Utf8("admin") -05)------Projection: get_field(join_right.s, Utf8("role")) AS __datafusion_extracted_1, join_right.id -06)--------TableScan: join_right projection=[id, s], partial_filters=[get_field(join_right.s, Utf8("role")) = Utf8("admin")] +EXPLAIN +SELECT s.s['value'], j.s['role'] +FROM join_right j +INNER JOIN simple_struct s ON s.id = j.id +WHERE s.s['value'] > j.s['level']; +---- +logical_plan +01)Projection: __datafusion_extracted_3 AS s.s[value], __datafusion_extracted_4 AS j.s[role] +02)--Inner Join: j.id = s.id Filter: __datafusion_extracted_1 > __datafusion_extracted_2 +03)----SubqueryAlias: j +04)------Projection: get_field(join_right.s, Utf8("level")) AS __datafusion_extracted_2, get_field(join_right.s, Utf8("role")) AS __datafusion_extracted_4, join_right.id +05)--------TableScan: join_right projection=[id, s] +06)----SubqueryAlias: s +07)------Projection: get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_1, get_field(simple_struct.s, Utf8("value")) AS __datafusion_extracted_3, simple_struct.id 
+08)--------TableScan: simple_struct projection=[id, s] physical_plan -01)ProjectionExec: expr=[id@1 as id, id@0 as id] -02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@0, id@0)] -03)----FilterExec: __datafusion_extracted_1@0 = admin, projection=[id@1] -04)------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, role) as __datafusion_extracted_1, id], file_type=parquet -05)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[id], file_type=parquet, predicate=DynamicFilter [ empty ] +01)ProjectionExec: expr=[__datafusion_extracted_3@1 as s.s[value], __datafusion_extracted_4@0 as j.s[role]] +02)--HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(id@2, id@2)], filter=__datafusion_extracted_1@1 > __datafusion_extracted_2@0, projection=[__datafusion_extracted_4@1, __datafusion_extracted_3@4] +03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/join_right.parquet]]}, projection=[get_field(s@1, level) as __datafusion_extracted_2, get_field(s@1, role) as __datafusion_extracted_4, id], file_type=parquet +04)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/projection_pushdown/simple.parquet]]}, projection=[get_field(s@1, value) as __datafusion_extracted_1, get_field(s@1, value) as __datafusion_extracted_3, id], file_type=parquet, predicate=DynamicFilter [ empty ] # Verify correctness - only admin roles match (ids 1 and 4) query II From 40b6c5388e13045955101e5a5b5fb76aecfe5bb0 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Thu, 12 Feb 2026 17:12:40 -0500 Subject: [PATCH 18/18] address pr feedback --- datafusion/optimizer/src/extract_leaf_expressions.rs | 6 ++++++ 
datafusion/optimizer/src/optimize_projections/mod.rs | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/datafusion/optimizer/src/extract_leaf_expressions.rs b/datafusion/optimizer/src/extract_leaf_expressions.rs index 75fe08c3a0147..de558331e5b26 100644 --- a/datafusion/optimizer/src/extract_leaf_expressions.rs +++ b/datafusion/optimizer/src/extract_leaf_expressions.rs @@ -35,6 +35,12 @@ use crate::push_down_filter::replace_cols_by_name; use crate::utils::has_all_column_refs; use crate::{OptimizerConfig, OptimizerRule}; +/// Prefix for aliases generated by the extraction optimizer passes. +/// +/// This prefix is **reserved for internal optimizer use**. User-defined aliases +/// starting with this prefix may be misidentified as optimizer-generated +/// extraction aliases, leading to unexpected behavior. Do not use this prefix +/// in user queries. const EXTRACTED_EXPR_PREFIX: &str = "__datafusion_extracted"; /// Extracts `MoveTowardsLeafNodes` sub-expressions from non-projection nodes diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index d2a85f5009787..93df300bb50b4 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -565,7 +565,7 @@ fn merge_consecutive_projections(proj: Projection) -> Result