Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
235 changes: 234 additions & 1 deletion datafusion/optimizer/src/eliminate_outer_join.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use datafusion_expr::{Expr, Filter, Operator};

use crate::optimizer::ApplyOrder;
use datafusion_common::tree_node::Transformed;
use datafusion_expr::expr::{BinaryExpr, Cast, TryCast};
use datafusion_expr::expr::{BinaryExpr, Cast, InList, TryCast};
use std::sync::Arc;

///
Expand Down Expand Up @@ -298,6 +298,23 @@ fn extract_non_nullable_columns(
right_schema,
false,
),
// IN list and BETWEEN are null-rejecting on the input expression:
// if the input column is NULL, the result is NULL (filtered out),
// regardless of whether the list/range contains NULLs.
Expr::InList(InList { expr, .. }) => extract_non_nullable_columns(
expr,
non_nullable_cols,
left_schema,
right_schema,
false,
),
Expr::Between(between) => extract_non_nullable_columns(
&between.expr,
non_nullable_cols,
left_schema,
right_schema,
false,
),
_ => {}
}
}
Expand All @@ -309,6 +326,7 @@ mod tests {
use crate::assert_optimized_plan_eq_snapshot;
use crate::test::*;
use arrow::datatypes::DataType;
use datafusion_common::ScalarValue;
use datafusion_expr::{
Operator::{And, Or},
binary_expr, cast, col, lit,
Expand Down Expand Up @@ -436,6 +454,221 @@ mod tests {
")
}

/// A plain `IN` list on the nullable side of a left join lets the join be
/// rewritten as an inner join.
#[test]
fn eliminate_left_with_in_list() -> Result<()> {
    let left_scan = test_table_scan_with_name("t1")?;
    let right_scan = test_table_scan_with_name("t2")?;
    let join_keys = (vec![Column::from_name("a")], vec![Column::from_name("a")]);

    // `t2.b IN (1, 2, 3)` yields NULL when `t2.b` is NULL, and the filter
    // drops NULL results, so the null-padded rows of the left join never
    // survive the filter.
    let predicate = col("t2.b").in_list(vec![lit(1u32), lit(2u32), lit(3u32)], false);

    let plan = LogicalPlanBuilder::from(left_scan)
        .join(right_scan, JoinType::Left, join_keys, None)?
        .filter(predicate)?
        .build()?;

    assert_optimized_plan_equal!(plan, @r"
Filter: t2.b IN ([UInt32(1), UInt32(2), UInt32(3)])
Inner Join: t1.a = t2.a
TableScan: t1
TableScan: t2
")
}

/// An `IN` list that itself contains a NULL literal still allows the left
/// join to be rewritten as an inner join.
#[test]
fn eliminate_left_with_in_list_containing_null() -> Result<()> {
    let left_scan = test_table_scan_with_name("t1")?;
    let right_scan = test_table_scan_with_name("t2")?;
    let join_keys = (vec![Column::from_name("a")], vec![Column::from_name("a")]);

    // A NULL list element does not make the predicate accept NULL input:
    // `NULL IN (1, NULL)` evaluates to NULL, which the filter discards.
    let list_with_null = vec![lit(1u32), lit(ScalarValue::UInt32(None))];
    let predicate = col("t2.b").in_list(list_with_null, false);

    let plan = LogicalPlanBuilder::from(left_scan)
        .join(right_scan, JoinType::Left, join_keys, None)?
        .filter(predicate)?
        .build()?;

    assert_optimized_plan_equal!(plan, @r"
Filter: t2.b IN ([UInt32(1), UInt32(NULL)])
Inner Join: t1.a = t2.a
TableScan: t1
TableScan: t2
")
}

/// A negated `IN` list on the nullable side of a left join also lets the
/// join be rewritten as an inner join.
#[test]
fn eliminate_left_with_not_in_list() -> Result<()> {
    let left_scan = test_table_scan_with_name("t1")?;
    let right_scan = test_table_scan_with_name("t2")?;
    let join_keys = (vec![Column::from_name("a")], vec![Column::from_name("a")]);

    // Negation does not change the NULL behaviour: when `t2.b` is NULL,
    // `NOT (NULL IN (...))` is still NULL and the row is filtered out.
    let negated = true;
    let predicate = col("t2.b").in_list(vec![lit(1u32), lit(2u32)], negated);

    let plan = LogicalPlanBuilder::from(left_scan)
        .join(right_scan, JoinType::Left, join_keys, None)?
        .filter(predicate)?
        .build()?;

    assert_optimized_plan_equal!(plan, @r"
Filter: t2.b NOT IN ([UInt32(1), UInt32(2)])
Inner Join: t1.a = t2.a
TableScan: t1
TableScan: t2
")
}

/// A `BETWEEN` predicate on the nullable side of a left join lets the join
/// be rewritten as an inner join.
#[test]
fn eliminate_left_with_between() -> Result<()> {
    let left_scan = test_table_scan_with_name("t1")?;
    let right_scan = test_table_scan_with_name("t2")?;
    let join_keys = (vec![Column::from_name("a")], vec![Column::from_name("a")]);

    // `NULL BETWEEN 1 AND 10` evaluates to NULL, which the filter discards,
    // so rows where `t2.b` is NULL cannot pass.
    let predicate = col("t2.b").between(lit(1u32), lit(10u32));

    let plan = LogicalPlanBuilder::from(left_scan)
        .join(right_scan, JoinType::Left, join_keys, None)?
        .filter(predicate)?
        .build()?;

    assert_optimized_plan_equal!(plan, @r"
Filter: t2.b BETWEEN UInt32(1) AND UInt32(10)
Inner Join: t1.a = t2.a
TableScan: t1
TableScan: t2
")
}

/// Mirror of the left-join `BETWEEN` case: for a right join the nullable
/// side is `t1`, and a `BETWEEN` on `t1.b` turns the join into an inner join.
#[test]
fn eliminate_right_with_between() -> Result<()> {
    let left_scan = test_table_scan_with_name("t1")?;
    let right_scan = test_table_scan_with_name("t2")?;
    let join_keys = (vec![Column::from_name("a")], vec![Column::from_name("a")]);

    // The filter is on the left (nullable) input of the right join.
    let predicate = col("t1.b").between(lit(1u32), lit(10u32));

    let plan = LogicalPlanBuilder::from(left_scan)
        .join(right_scan, JoinType::Right, join_keys, None)?
        .filter(predicate)?
        .build()?;

    assert_optimized_plan_equal!(plan, @r"
Filter: t1.b BETWEEN UInt32(1) AND UInt32(10)
Inner Join: t1.a = t2.a
TableScan: t1
TableScan: t2
")
}

/// A full join filtered by `BETWEEN` predicates on columns from both inputs,
/// combined with `AND`, is rewritten as an inner join.
#[test]
fn eliminate_full_with_between() -> Result<()> {
    let left_scan = test_table_scan_with_name("t1")?;
    let right_scan = test_table_scan_with_name("t2")?;
    let join_keys = (vec![Column::from_name("a")], vec![Column::from_name("a")]);

    // One range check per side; both must hold, so NULLs from either input
    // are filtered out.
    let left_range = col("t1.b").between(lit(1u32), lit(10u32));
    let right_range = col("t2.b").between(lit(5u32), lit(20u32));
    let predicate = binary_expr(left_range, And, right_range);

    let plan = LogicalPlanBuilder::from(left_scan)
        .join(right_scan, JoinType::Full, join_keys, None)?
        .filter(predicate)?
        .build()?;

    assert_optimized_plan_equal!(plan, @r"
Filter: t1.b BETWEEN UInt32(1) AND UInt32(10) AND t2.b BETWEEN UInt32(5) AND UInt32(20)
Inner Join: t1.a = t2.a
TableScan: t1
TableScan: t2
")
}

/// A full join filtered by `IN` lists on columns from both inputs, combined
/// with `AND`, is rewritten as an inner join.
#[test]
fn eliminate_full_with_in_list() -> Result<()> {
    let left_scan = test_table_scan_with_name("t1")?;
    let right_scan = test_table_scan_with_name("t2")?;
    let join_keys = (vec![Column::from_name("a")], vec![Column::from_name("a")]);

    // One membership check per side; both must hold, so NULLs from either
    // input are filtered out.
    let left_membership = col("t1.b").in_list(vec![lit(1u32), lit(2u32)], false);
    let right_membership = col("t2.b").in_list(vec![lit(3u32), lit(4u32)], false);
    let predicate = binary_expr(left_membership, And, right_membership);

    let plan = LogicalPlanBuilder::from(left_scan)
        .join(right_scan, JoinType::Full, join_keys, None)?
        .filter(predicate)?
        .build()?;

    assert_optimized_plan_equal!(plan, @r"
Filter: t1.b IN ([UInt32(1), UInt32(2)]) AND t2.b IN ([UInt32(3), UInt32(4)])
Inner Join: t1.a = t2.a
TableScan: t1
TableScan: t2
")
}

/// Negative case: an `IN` list OR-ed with `IS NULL` on the same column must
/// leave the left join untouched.
#[test]
fn no_eliminate_left_with_in_list_or_is_null() -> Result<()> {
Comment on lines +640 to +641
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be possible to add some SLT tests?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added slt tests

let t1 = test_table_scan_with_name("t1")?;
let t2 = test_table_scan_with_name("t2")?;

// Predicate under test: WHERE (t2.b IN (1, 2)) OR (t2.b IS NULL)
// The IS NULL branch makes the predicate null-tolerant: when t2.b is NULL,
// IS NULL returns true, so the whole OR is true and the null-padded rows
// produced by the left join pass the filter.
// The outer join must therefore be preserved.
let plan = LogicalPlanBuilder::from(t1)
.join(
t2,
JoinType::Left,
(vec![Column::from_name("a")], vec![Column::from_name("a")]),
None,
)?
.filter(binary_expr(
col("t2.b").in_list(vec![lit(1u32), lit(2u32)], false),
Or,
col("t2.b").is_null(),
))?
.build()?;

// Expected plan still shows `Left Join`, not `Inner Join`: the OR with
// IS NULL preserves the null rows.
assert_optimized_plan_equal!(plan, @r"
Filter: t2.b IN ([UInt32(1), UInt32(2)]) OR t2.b IS NULL
Left Join: t1.a = t2.a
TableScan: t1
TableScan: t2
")
}

#[test]
fn eliminate_full_with_type_cast() -> Result<()> {
let t1 = test_table_scan_with_name("t1")?;
Expand Down
Loading
Loading