From 433b1e5e482c9182b0f8a0d8ad78b7c345bf16e9 Mon Sep 17 00:00:00 2001 From: comphead Date: Tue, 31 Mar 2026 12:14:52 -0700 Subject: [PATCH 1/2] chore: Fix clippy --- Cargo.toml | 3 +++ datafusion-cli/tests/cli_integration.rs | 6 ++---- datafusion/common/src/error.rs | 1 - datafusion/common/src/hash_utils.rs | 6 ++++++ datafusion/execution/src/memory_pool/arrow.rs | 2 +- .../aggregates/group_values/single_group_by/primitive.rs | 1 + 6 files changed, 13 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index ffdc14cc514dd..5438e99bbe2b1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -212,6 +212,9 @@ needless_pass_by_value = "warn" # https://github.com/apache/datafusion/issues/18881 allow_attributes = "warn" assigning_clones = "warn" +# ScalarValue and Expr contain Arc<..> with interior mutability but are +# intentionally used as hash keys throughout the codebase. +mutable_key_type = "allow" [workspace.lints.rust] unexpected_cfgs = { level = "warn", check-cfg = [ diff --git a/datafusion-cli/tests/cli_integration.rs b/datafusion-cli/tests/cli_integration.rs index 7bc45693a8b0d..7aa405338bffe 100644 --- a/datafusion-cli/tests/cli_integration.rs +++ b/datafusion-cli/tests/cli_integration.rs @@ -414,14 +414,12 @@ fn test_backtrace_output(#[case] query: &str) { let output = cmd.output().expect("Failed to execute command"); let stdout = String::from_utf8_lossy(&output.stdout); let stderr = String::from_utf8_lossy(&output.stderr); - let combined_output = format!("{}{}", stdout, stderr); + let combined_output = format!("{stdout}{stderr}"); // Assert that the output includes literal 'backtrace' assert!( combined_output.to_lowercase().contains("backtrace"), - "Expected output to contain 'backtrace', but got stdout: '{}' stderr: '{}'", - stdout, - stderr + "Expected output to contain 'backtrace', but got stdout: '{stdout}' stderr: '{stderr}'" ); } diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index fbc1421727a7f..c6c50371c26c1 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -1274,7 +1274,6 @@ mod test { // To pass the test the environment variable RUST_BACKTRACE should be set to 1 to enforce backtrace #[cfg(feature = "backtrace")] #[test] - #[expect(clippy::unnecessary_literal_unwrap)] fn test_enabled_backtrace() { match std::env::var("RUST_BACKTRACE") { Ok(val) if val == "1" => {} diff --git a/datafusion/common/src/hash_utils.rs b/datafusion/common/src/hash_utils.rs index 255525b92e0c0..fcc2e919b6cc2 100644 --- a/datafusion/common/src/hash_utils.rs +++ b/datafusion/common/src/hash_utils.rs @@ -19,12 +19,15 @@ use arrow::array::types::{IntervalDayTime, IntervalMonthDayNano}; use arrow::array::*; +#[cfg(not(feature = "force_hash_collisions"))] use arrow::compute::take; use arrow::datatypes::*; #[cfg(not(feature = "force_hash_collisions"))] use arrow::{downcast_dictionary_array, downcast_primitive_array}; use foldhash::fast::FixedState; +#[cfg(not(feature = "force_hash_collisions"))] use itertools::Itertools; +#[cfg(not(feature = "force_hash_collisions"))] use std::collections::HashMap; use std::hash::{BuildHasher, Hash, Hasher}; @@ -198,6 +201,7 @@ hash_float_value!((half::f16, u16), (f32, u32), (f64, u64)); /// Create a `SeedableRandomState` whose per-hasher seed incorporates `seed`. /// This folds the previous hash into the hasher's initial state so only the /// new value needs to pass through the hash function — same cost as `hash_one`. +#[cfg(not(feature = "force_hash_collisions"))] #[inline] fn seeded_state(seed: u64) -> foldhash::fast::SeedableRandomState { foldhash::fast::SeedableRandomState::with_seed( @@ -303,6 +307,7 @@ fn hash_array( /// HAS_NULLS: do we have to check null in the inner loop /// HAS_BUFFERS: if true, array has external buffers; if false, all strings are inlined/ less then 12 bytes /// REHASH: if true, combining with existing hash, otherwise initializing +#[cfg(not(feature = "force_hash_collisions"))] #[inline(never)] fn hash_string_view_array_inner< T: ByteViewType, @@ -429,6 +434,7 @@ fn hash_generic_byte_view_array( /// - `HAS_NULL_KEYS`: Whether to check for null dictionary keys /// - `HAS_NULL_VALUES`: Whether to check for null dictionary values /// - `MULTI_COL`: Whether to combine with existing hash (true) or initialize (false) +#[cfg(not(feature = "force_hash_collisions"))] #[inline(never)] fn hash_dictionary_inner< K: ArrowDictionaryKeyType, diff --git a/datafusion/execution/src/memory_pool/arrow.rs b/datafusion/execution/src/memory_pool/arrow.rs index 4e8d986f1f5e3..929e3b7bd27e5 100644 --- a/datafusion/execution/src/memory_pool/arrow.rs +++ b/datafusion/execution/src/memory_pool/arrow.rs @@ -59,7 +59,7 @@ impl arrow_buffer::MemoryReservation for MemoryReservation { impl arrow_buffer::MemoryPool for ArrowMemoryPool { fn reserve(&self, size: usize) -> Box { let consumer = self.consumer.clone_with_new_id(); - let mut reservation = consumer.register(&self.inner); + let reservation = consumer.register(&self.inner); reservation.grow(size); Box::new(reservation) diff --git a/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs index 4686648fb1e3d..efaf7eba0f1b5 100644 --- a/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs +++ b/datafusion/physical-plan/src/aggregates/group_values/single_group_by/primitive.rs @@ -28,6 +28,7 @@ use datafusion_execution::memory_pool::proxy::VecAllocExt; use datafusion_expr::EmitTo; use half::f16; use hashbrown::hash_table::HashTable; +#[cfg(not(feature = "force_hash_collisions"))] use std::hash::BuildHasher; use std::mem::size_of; use std::sync::Arc; From ca4c08e33358091299ced39041c2715c1418449b Mon Sep 17 00:00:00 2001 From: comphead Date: Fri, 3 Apr 2026 12:18:41 -0700 Subject: [PATCH 2/2] chore: Fix clippy --- Cargo.toml | 3 --- datafusion/common/src/pruning.rs | 2 ++ datafusion/common/src/scalar/mod.rs | 3 +++ datafusion/core/src/physical_planner.rs | 1 + datafusion/datasource/src/file_groups.rs | 1 + datafusion/expr/src/logical_plan/builder.rs | 2 ++ datafusion/functions-aggregate/src/median.rs | 1 + datafusion/functions-aggregate/src/percentile_cont.rs | 1 + .../optimizer/src/analyzer/resolve_grouping_function.rs | 3 +++ datafusion/optimizer/src/scalar_subquery_to_join.rs | 4 ++++ .../optimizer/src/simplify_expressions/expr_simplifier.rs | 6 ++++++ datafusion/physical-expr/src/utils/guarantee.rs | 2 ++ datafusion/pruning/src/pruning_predicate.rs | 2 ++ datafusion/spark/src/function/map/utils.rs | 1 + datafusion/sql/src/select.rs | 3 +++ .../substrait/src/logical_plan/consumer/rel/project_rel.rs | 2 ++ 16 files changed, 34 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 5438e99bbe2b1..ffdc14cc514dd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -212,9 +212,6 @@ needless_pass_by_value = "warn" # https://github.com/apache/datafusion/issues/18881 allow_attributes = "warn" assigning_clones = "warn" -# ScalarValue and Expr contain Arc<..> with interior mutability but are -# intentionally used as hash keys throughout the codebase. -mutable_key_type = "allow" [workspace.lints.rust] unexpected_cfgs = { level = "warn", check-cfg = [ diff --git a/datafusion/common/src/pruning.rs b/datafusion/common/src/pruning.rs index 5a7598ea1f299..27148de59a544 100644 --- a/datafusion/common/src/pruning.rs +++ b/datafusion/common/src/pruning.rs @@ -121,6 +121,7 @@ pub trait PruningStatistics { /// container, return `None` (the default). /// /// Note: the returned array must contain [`Self::num_containers`] rows + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key fn contained( &self, column: &Column, @@ -526,6 +527,7 @@ impl PruningStatistics for CompositePruningStatistics { #[cfg(test)] #[expect(deprecated)] +#[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key mod tests { use crate::{ ColumnStatistics, diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index ad76442562985..4511d8db90075 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -4601,6 +4601,7 @@ impl ScalarValue { /// Estimates [size](Self::size) of [`HashSet`] in bytes. /// /// Includes the size of the [`HashSet`] container itself. + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key pub fn size_of_hashset(set: &HashSet) -> usize { size_of_val(set) + (size_of::() * set.capacity()) @@ -7263,6 +7264,8 @@ mod tests { size_of::>() + (9 * size_of::()) + sv_size, ); + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] + // ScalarValue has interior mutability but is intentionally used as hash key let mut s = HashSet::with_capacity(0); // do NOT clone `sv` here because this may shrink the vector capacity s.insert(v.pop().unwrap()); diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index e25969903521c..8173e3b7a44f4 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -2086,6 +2086,7 @@ fn get_physical_expr_pair( /// A vector of unqualified filter expressions that can be passed to the TableProvider for execution. /// Returns an empty vector if no applicable filters are found. /// +#[allow(clippy::allow_attributes, clippy::mutable_key_type)] // Expr contains Arc with interior mutability but is intentionally used as hash key fn extract_dml_filters( input: &Arc, target: &TableReference, diff --git a/datafusion/datasource/src/file_groups.rs b/datafusion/datasource/src/file_groups.rs index 28a403ab92ad8..84594be54b504 100644 --- a/datafusion/datasource/src/file_groups.rs +++ b/datafusion/datasource/src/file_groups.rs @@ -488,6 +488,7 @@ impl FileGroup { /// /// Note: May return fewer groups than `max_target_partitions` when the /// number of unique partition values is less than the target. + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key pub fn group_by_partition_values( self, max_target_partitions: usize, diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 95c6010a34c56..9e639057607f0 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -2123,6 +2123,8 @@ pub fn wrap_projection_for_join_if_necessary( .into_iter() .map(Expr::Column) .collect::>(); + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] + // Expr contains Arc with interior mutability but is intentionally used as hash key let join_key_items = alias_join_keys .iter() .flat_map(|expr| expr.try_as_col().is_none().then_some(expr)) diff --git a/datafusion/functions-aggregate/src/median.rs b/datafusion/functions-aggregate/src/median.rs index 322932dcbfb8c..00c0ee3cd47b0 100644 --- a/datafusion/functions-aggregate/src/median.rs +++ b/datafusion/functions-aggregate/src/median.rs @@ -285,6 +285,7 @@ impl Accumulator for MedianAccumulator { size_of_val(self) + self.all_values.capacity() * size_of::() } + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> { let mut to_remove: HashMap = HashMap::new(); diff --git a/datafusion/functions-aggregate/src/percentile_cont.rs b/datafusion/functions-aggregate/src/percentile_cont.rs index 8a37ceef511b2..bbf6821ba09dd 100644 --- a/datafusion/functions-aggregate/src/percentile_cont.rs +++ b/datafusion/functions-aggregate/src/percentile_cont.rs @@ -440,6 +440,7 @@ where size_of_val(self) + self.all_values.capacity() * size_of::() } + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key fn retract_batch(&mut self, values: &[ArrayRef]) -> Result<()> { let mut to_remove: HashMap = HashMap::new(); for i in 0..values[0].len() { diff --git a/datafusion/optimizer/src/analyzer/resolve_grouping_function.rs b/datafusion/optimizer/src/analyzer/resolve_grouping_function.rs index 747c54e2cd26d..6b8ae3e8531bc 100644 --- a/datafusion/optimizer/src/analyzer/resolve_grouping_function.rs +++ b/datafusion/optimizer/src/analyzer/resolve_grouping_function.rs @@ -72,6 +72,7 @@ fn group_expr_to_bitmap_index(group_expr: &[Expr]) -> Result>()) } +#[allow(clippy::allow_attributes, clippy::mutable_key_type)] // Expr contains Arc with interior mutability but is intentionally used as hash key fn replace_grouping_exprs( input: Arc, schema: &DFSchema, @@ -158,6 +159,7 @@ fn contains_grouping_function(exprs: &[Expr]) -> bool { } /// Validate that the arguments to the grouping function are in the group by clause. +#[allow(clippy::allow_attributes, clippy::mutable_key_type)] // Expr contains Arc with interior mutability but is intentionally used as hash key fn validate_args( function: &AggregateFunction, group_by_expr: &HashMap<&Expr, usize>, @@ -178,6 +180,7 @@ fn validate_args( } } +#[allow(clippy::allow_attributes, clippy::mutable_key_type)] // Expr contains Arc with interior mutability but is intentionally used as hash key fn grouping_function_on_id( function: &AggregateFunction, group_by_expr: &HashMap<&Expr, usize>, diff --git a/datafusion/optimizer/src/scalar_subquery_to_join.rs b/datafusion/optimizer/src/scalar_subquery_to_join.rs index 590b00098bd46..c54cd287dbb46 100644 --- a/datafusion/optimizer/src/scalar_subquery_to_join.rs +++ b/datafusion/optimizer/src/scalar_subquery_to_join.rs @@ -144,7 +144,11 @@ impl OptimizerRule for ScalarSubqueryToJoin { } let mut all_subqueries = vec![]; + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] + // Expr contains Arc with interior mutability but is intentionally used as hash key let mut expr_to_rewrite_expr_map = HashMap::new(); + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] + // Expr contains Arc with interior mutability but is intentionally used as hash key let mut subquery_to_expr_map = HashMap::new(); for expr in projection.expr.iter() { let (subqueries, rewrite_exprs) = diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index e4455b8c82fc1..1af08c91c1109 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1784,6 +1784,8 @@ impl TreeNodeRewriter for Simplifier<'_> { }) if are_inlist_and_eq(left.as_ref(), right.as_ref()) => { let lhs = to_inlist(*left).unwrap(); let rhs = to_inlist(*right).unwrap(); + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] + // Expr contains Arc with interior mutability but is intentionally used as hash key let mut seen: HashSet = HashSet::new(); let list = lhs .list @@ -2174,6 +2176,7 @@ impl<'a> StringScalar<'a> { } } +#[allow(clippy::allow_attributes, clippy::mutable_key_type)] // Expr contains Arc with interior mutability but is intentionally used as hash key fn has_common_conjunction(lhs: &Expr, rhs: &Expr) -> bool { let lhs_set: HashSet<&Expr> = iter_conjunction(lhs).collect(); iter_conjunction(rhs).any(|e| lhs_set.contains(&e) && !e.is_volatile()) @@ -2258,6 +2261,7 @@ fn to_inlist(expr: Expr) -> Option { /// Return the union of two inlist expressions /// maintaining the order of the elements in the two lists +#[allow(clippy::allow_attributes, clippy::mutable_key_type)] // Expr contains Arc with interior mutability but is intentionally used as hash key fn inlist_union(mut l1: InList, l2: InList, negated: bool) -> Result { // extend the list in l1 with the elements in l2 that are not already in l1 let l1_items: HashSet<_> = l1.list.iter().collect(); @@ -2276,6 +2280,7 @@ fn inlist_union(mut l1: InList, l2: InList, negated: bool) -> Result { /// Return the intersection of two inlist expressions /// maintaining the order of the elements in the two lists +#[allow(clippy::allow_attributes, clippy::mutable_key_type)] // Expr contains Arc with interior mutability but is intentionally used as hash key fn inlist_intersection(mut l1: InList, l2: &InList, negated: bool) -> Result { let l2_items = l2.list.iter().collect::>(); @@ -2292,6 +2297,7 @@ fn inlist_intersection(mut l1: InList, l2: &InList, negated: bool) -> Result Result { let l2_items = l2.list.iter().collect::>(); diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index c4ce74fd3a573..00195a4df6307 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -93,6 +93,7 @@ impl LiteralGuarantee { /// Create a new instance of the guarantee if the provided operator is /// supported. Returns None otherwise. See [`LiteralGuarantee::analyze`] to /// create these structures from an predicate (boolean expression). + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key fn new<'a>( column_name: impl Into, guarantee: Guarantee, @@ -309,6 +310,7 @@ impl<'a> GuaranteeBuilder<'a> { /// * `AND (a IN (1,2,3))`: a is in (1, 2, or 3) /// * `AND (a != 1 OR a != 2 OR a != 3)`: a is not in (1, 2, or 3) /// * `AND (a NOT IN (1,2,3))`: a is not in (1, 2, or 3) + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key fn aggregate_multi_conjunct( mut self, col: &'a crate::expressions::Column, diff --git a/datafusion/pruning/src/pruning_predicate.rs b/datafusion/pruning/src/pruning_predicate.rs index 6f6b00e80abc2..978d79b1f2fb0 100644 --- a/datafusion/pruning/src/pruning_predicate.rs +++ b/datafusion/pruning/src/pruning_predicate.rs @@ -2158,6 +2158,7 @@ mod tests { } /// Add contained information. + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key pub fn with_contained( mut self, values: impl IntoIterator, @@ -2172,6 +2173,7 @@ mod tests { } /// get any contained information for the specified values + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key fn contained(&self, find_values: &HashSet) -> Option { // find the one with the matching values self.contained diff --git a/datafusion/spark/src/function/map/utils.rs b/datafusion/spark/src/function/map/utils.rs index 28fa3227fd628..f5fff0c4b4c46 100644 --- a/datafusion/spark/src/function/map/utils.rs +++ b/datafusion/spark/src/function/map/utils.rs @@ -147,6 +147,7 @@ pub fn map_from_keys_values_offsets_nulls( )?)) } +#[allow(clippy::allow_attributes, clippy::mutable_key_type)] // ScalarValue has interior mutability but is intentionally used as hash key fn map_deduplicate_keys( flat_keys: &ArrayRef, flat_values: &ArrayRef, diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 6aa7ff599d0c5..1b35070b29f44 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -592,9 +592,12 @@ impl SqlToRel<'_, S> { } else { let mut unnest_options = UnnestOptions::new().with_preserve_nulls(false); + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] + // Expr contains Arc with interior mutability but is intentionally used as hash key let mut projection_exprs = match &aggr_expr_using_columns { Some(exprs) => (*exprs).clone(), None => { + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] let mut columns = HashSet::new(); for expr in &aggr_expr { expr.apply(|expr| { diff --git a/datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs b/datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs index d216d4ecf3188..0a4048650fa2b 100644 --- a/datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs +++ b/datafusion/substrait/src/logical_plan/consumer/rel/project_rel.rs @@ -50,6 +50,8 @@ pub async fn from_project_rel( // For WindowFunctions, we need to wrap them in a Window relation. If there are duplicates, // we can do the window'ing only once, then the project will duplicate the result. // Order here doesn't matter since LPB::window_plan sorts the expressions. + #[allow(clippy::allow_attributes, clippy::mutable_key_type)] + // Expr contains Arc with interior mutability but is intentionally used as hash key let mut window_exprs: HashSet = HashSet::new(); for expr in &p.expressions { let e = consumer