From 3f28e44b61b175a5a7f58e01d4fd2e36fe437e55 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 31 Mar 2026 17:22:24 +0800 Subject: [PATCH 1/7] Add FixedSizeList support and regression tests Implement FixedSizeList casting and validation logic. Add 5 regression tests for schema evolution scenarios. Ensure full compatibility checks with existing functionality. All tests pass with no regressions. --- datafusion/common/src/nested_struct.rs | 222 ++++++++++++++++++++++++- 1 file changed, 220 insertions(+), 2 deletions(-) diff --git a/datafusion/common/src/nested_struct.rs b/datafusion/common/src/nested_struct.rs index cdd6215d08e2f..1dfeb6baad808 100644 --- a/datafusion/common/src/nested_struct.rs +++ b/datafusion/common/src/nested_struct.rs @@ -189,6 +189,15 @@ pub fn cast_column( (DataType::LargeListView(_), DataType::LargeListView(target_inner)) => { cast_list_view_column::(source_col, target_inner, cast_options) } + ( + DataType::FixedSizeList(_, _), + DataType::FixedSizeList(target_inner, target_size), + ) => cast_fixed_size_list_column( + source_col, + target_inner, + *target_size, + cast_options, + ), ( DataType::Dictionary(source_key_type, _), DataType::Dictionary(target_key_type, target_value_type), @@ -264,6 +273,39 @@ fn cast_list_view_column( Ok(Arc::new(result)) } +fn cast_fixed_size_list_column( + source_col: &ArrayRef, + target_inner_field: &FieldRef, + target_size: i32, + cast_options: &CastOptions, +) -> Result { + use arrow::array::FixedSizeListArray; + + let source_list = source_col + .as_any() + .downcast_ref::() + .ok_or_else(|| { + crate::error::DataFusionError::Plan(format!( + "Expected fixed size list array but got {}", + source_col.data_type() + )) + })?; + + let cast_values = cast_column( + source_list.values(), + target_inner_field.data_type(), + cast_options, + )?; + + let result = FixedSizeListArray::new( + Arc::clone(target_inner_field), + target_size, + cast_values, + source_list.nulls().cloned(), + ); + Ok(Arc::new(result)) +} + fn cast_dictionary_column( source_col: &ArrayRef, source_key_type: &DataType, @@ -431,6 +473,20 @@ pub fn validate_data_type_compatibility( | (DataType::LargeListView(s), DataType::LargeListView(t)) => { validate_field_compatibility(s, t)?; } + ( + DataType::FixedSizeList(s, source_size), + DataType::FixedSizeList(t, target_size), + ) => { + if source_size != target_size { + return _plan_err!( + "Cannot cast FixedSizeList field '{}' with size {} to size {}", + field_name, + source_size, + target_size + ); + } + validate_field_compatibility(s, t)?; + } (DataType::Dictionary(s_key, s_val), DataType::Dictionary(t_key, t_val)) => { if !can_cast_types(s_key, t_key) { return _plan_err!( @@ -460,7 +516,7 @@ pub fn validate_data_type_compatibility( /// name-based nested struct casting logic, rather than Arrow's standard cast. /// /// This is the case when both types are struct types, or both are the same -/// container type (List, LargeList, ListView, LargeListView, Dictionary) wrapping +/// container type (List, LargeList, ListView, LargeListView, FixedSizeList, Dictionary) wrapping /// types that recursively contain structs. /// /// Use this predicate at both planning time (to decide whether to apply struct @@ -475,7 +531,8 @@ pub fn requires_nested_struct_cast( (DataType::List(s), DataType::List(t)) | (DataType::LargeList(s), DataType::LargeList(t)) | (DataType::ListView(s), DataType::ListView(t)) - | (DataType::LargeListView(s), DataType::LargeListView(t)) => { + | (DataType::LargeListView(s), DataType::LargeListView(t)) + | (DataType::FixedSizeList(s, _), DataType::FixedSizeList(t, _)) => { requires_nested_struct_cast(s.data_type(), t.data_type()) } (DataType::Dictionary(_, s_val), DataType::Dictionary(_, t_val)) => { @@ -1336,4 +1393,165 @@ mod tests { &DataType::List(arc_field("item", DataType::Int64)), )); } + + #[test] + fn test_cast_fixed_size_list_struct_additive_nullable_field() { + // Build a FixedSizeList and cast to + // FixedSizeList (additive nullable field). + let struct_arr = StructArray::from(vec![( + arc_field("a", DataType::Int32), + Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as ArrayRef, + )]); + + let source_field = + arc_field("item", struct_type(vec![field("a", DataType::Int32)])); + let target_field = arc_field( + "item", + struct_type(vec![ + field("a", DataType::Int32), + field("b", DataType::Utf8), + ]), + ); + + // Create FixedSizeList with list_size=2 + use arrow::array::FixedSizeListArray; + let source_list = + FixedSizeListArray::new(source_field, 2, Arc::new(struct_arr), None); + let source_col: ArrayRef = Arc::new(source_list); + + let target_type = DataType::FixedSizeList(target_field, 2); + + let result = + cast_column(&source_col, &target_type, &DEFAULT_CAST_OPTIONS).unwrap(); + let result_fsl = result + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(result_fsl.len(), 2); + + let struct_values = result_fsl + .values() + .as_any() + .downcast_ref::() + .unwrap(); + let a_col = get_column_as!(&struct_values, "a", Int32Array); + assert_eq!(a_col.values(), &[1, 2, 3, 4]); + let b_col = get_column_as!(&struct_values, "b", StringArray); + assert!(b_col.iter().all(|v| v.is_none())); + } + + #[test] + fn test_cast_fixed_size_list_struct_null_column() { + // Build a FixedSizeList with null inner struct and cast to + // FixedSizeList. + let null_struct = Arc::new(NullArray::new(2)) as ArrayRef; + let source_field = arc_field("item", DataType::Null); + use arrow::array::FixedSizeListArray; + let source_list = FixedSizeListArray::new(source_field, 2, null_struct, None); + let source_col: ArrayRef = Arc::new(source_list); + + let target_field = arc_field( + "item", + struct_type(vec![ + field("a", DataType::Int32), + field("b", DataType::Utf8), + ]), + ); + let target_type = DataType::FixedSizeList(target_field, 2); + + let result = + cast_column(&source_col, &target_type, &DEFAULT_CAST_OPTIONS).unwrap(); + let result_fsl = result + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(result_fsl.len(), 1); + + let struct_values = result_fsl + .values() + .as_any() + .downcast_ref::() + .unwrap(); + // Verify struct has 2 fields (a and b) + assert_eq!(struct_values.fields().len(), 2); + // Should have 2 entries (1 fixed-size list entry with size 2) + assert_eq!(struct_values.len(), 2); + } + + #[test] + fn test_cast_fixed_size_list_struct_incompatible_type_fails() { + // Build a FixedSizeList and try to cast to + // FixedSizeList (incompatible types). + let struct_arr = StructArray::from(vec![( + arc_field("a", DataType::Utf8), + Arc::new(StringArray::from(vec!["x", "y"])) as ArrayRef, + )]); + + let source_field = + arc_field("item", struct_type(vec![field("a", DataType::Utf8)])); + use arrow::array::FixedSizeListArray; + let source_list = + FixedSizeListArray::new(source_field, 1, Arc::new(struct_arr), None); + let source_col: ArrayRef = Arc::new(source_list); + + let target_field = + arc_field("item", struct_type(vec![field("a", DataType::Int32)])); + let target_type = DataType::FixedSizeList(target_field, 1); + + let result = cast_column(&source_col, &target_type, &DEFAULT_CAST_OPTIONS); + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert!(error_msg.contains("Cannot cast")); + } + + #[test] + fn test_cast_fixed_size_list_struct_non_nullable_field_fails() { + // Build a FixedSizeList and try to cast to + // FixedSizeList (should fail). + let struct_arr = StructArray::from(vec![( + arc_field("a", DataType::Int32), + Arc::new(Int32Array::from(vec![1])) as ArrayRef, + )]); + + let source_field = + arc_field("item", struct_type(vec![field("a", DataType::Int32)])); + use arrow::array::FixedSizeListArray; + let source_list = + FixedSizeListArray::new(source_field, 1, Arc::new(struct_arr), None); + let source_col: ArrayRef = Arc::new(source_list); + + let target_field = arc_field( + "item", + struct_type(vec![ + field("a", DataType::Int32), + non_null_field("b", DataType::Int32), + ]), + ); + let target_type = DataType::FixedSizeList(target_field, 1); + + let result = cast_column(&source_col, &target_type, &DEFAULT_CAST_OPTIONS); + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert!( + error_msg.contains("cannot fill with NULL") + || error_msg.contains("non-nullable") + ); + } + + #[test] + fn test_requires_nested_struct_cast_fixed_size_list() { + let s1 = struct_type(vec![field("a", DataType::Int32)]); + let s2 = struct_type(vec![field("a", DataType::Int64)]); + + assert!(requires_nested_struct_cast( + &DataType::FixedSizeList(arc_field("item", s1.clone()), 2), + &DataType::FixedSizeList(arc_field("item", s2.clone()), 2), + )); + + // FixedSizeList with non-struct inner types should return false + assert!(!requires_nested_struct_cast( + &DataType::FixedSizeList(arc_field("item", DataType::Int32), 2), + &DataType::FixedSizeList(arc_field("item", DataType::Int64), 2), + )); + } } From dd80a4054413552a0749d797a17201964c10ef0c Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 31 Mar 2026 17:31:02 +0800 Subject: [PATCH 2/7] Create SLT for fixed-size list schema evolution Add SLT tests for fixed-size list schema evolution. Converted tests include additive nullable fields, NULL value handling, and field reordering for comprehensive coverage of schema changes. --- datafusion/common/src/nested_struct.rs | 84 ---------- .../schema_evolution_fixed_size_list.slt | 158 ++++++++++++++++++ 2 files changed, 158 insertions(+), 84 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt diff --git a/datafusion/common/src/nested_struct.rs b/datafusion/common/src/nested_struct.rs index 1dfeb6baad808..04831c69a2458 100644 --- a/datafusion/common/src/nested_struct.rs +++ b/datafusion/common/src/nested_struct.rs @@ -1394,90 +1394,6 @@ mod tests { )); } - #[test] - fn test_cast_fixed_size_list_struct_additive_nullable_field() { - // Build a FixedSizeList and cast to - // FixedSizeList (additive nullable field). - let struct_arr = StructArray::from(vec![( - arc_field("a", DataType::Int32), - Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as ArrayRef, - )]); - - let source_field = - arc_field("item", struct_type(vec![field("a", DataType::Int32)])); - let target_field = arc_field( - "item", - struct_type(vec![ - field("a", DataType::Int32), - field("b", DataType::Utf8), - ]), - ); - - // Create FixedSizeList with list_size=2 - use arrow::array::FixedSizeListArray; - let source_list = - FixedSizeListArray::new(source_field, 2, Arc::new(struct_arr), None); - let source_col: ArrayRef = Arc::new(source_list); - - let target_type = DataType::FixedSizeList(target_field, 2); - - let result = - cast_column(&source_col, &target_type, &DEFAULT_CAST_OPTIONS).unwrap(); - let result_fsl = result - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(result_fsl.len(), 2); - - let struct_values = result_fsl - .values() - .as_any() - .downcast_ref::() - .unwrap(); - let a_col = get_column_as!(&struct_values, "a", Int32Array); - assert_eq!(a_col.values(), &[1, 2, 3, 4]); - let b_col = get_column_as!(&struct_values, "b", StringArray); - assert!(b_col.iter().all(|v| v.is_none())); - } - - #[test] - fn test_cast_fixed_size_list_struct_null_column() { - // Build a FixedSizeList with null inner struct and cast to - // FixedSizeList. - let null_struct = Arc::new(NullArray::new(2)) as ArrayRef; - let source_field = arc_field("item", DataType::Null); - use arrow::array::FixedSizeListArray; - let source_list = FixedSizeListArray::new(source_field, 2, null_struct, None); - let source_col: ArrayRef = Arc::new(source_list); - - let target_field = arc_field( - "item", - struct_type(vec![ - field("a", DataType::Int32), - field("b", DataType::Utf8), - ]), - ); - let target_type = DataType::FixedSizeList(target_field, 2); - - let result = - cast_column(&source_col, &target_type, &DEFAULT_CAST_OPTIONS).unwrap(); - let result_fsl = result - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(result_fsl.len(), 1); - - let struct_values = result_fsl - .values() - .as_any() - .downcast_ref::() - .unwrap(); - // Verify struct has 2 fields (a and b) - assert_eq!(struct_values.fields().len(), 2); - // Should have 2 entries (1 fixed-size list entry with size 2) - assert_eq!(struct_values.len(), 2); - } - #[test] fn test_cast_fixed_size_list_struct_incompatible_type_fails() { // Build a FixedSizeList and try to cast to diff --git a/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt b/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt new file mode 100644 index 0000000000000..768034f4c7fab --- /dev/null +++ b/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt @@ -0,0 +1,158 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +########## +# End-user-facing coverage for FixedSizeList Parquet schema evolution. +# Tests additive schema evolution (adding nullable fields) to FixedSizeList +# containing struct values. +########## + +# Test 1: FixedSizeList struct with additive nullable field +statement ok +CREATE EXTERNAL TABLE fsl_messages ( + row_id INT, + messages ARRAY> +) +STORED AS PARQUET +LOCATION 'test_files/scratch/schema_evolution_fixed_size_list/fsl_additive/'; + +statement ok +COPY ( + SELECT + 1 AS row_id, + arrow_cast( + [ + named_struct('id', 10, 'name', 'alpha'), + named_struct('id', 20, 'name', 'beta') + ], + 'FixedSizeList(2, Struct("id": Int64, "name": Utf8View))' + ) AS messages +) TO 'test_files/scratch/schema_evolution_fixed_size_list/fsl_additive/old.parquet' +STORED AS PARQUET; + +statement ok +COPY ( + SELECT + 2 AS row_id, + arrow_cast( + [ + named_struct('id', 30, 'name', 'gamma', 'chain', 'eth'), + named_struct('id', 40, 'name', 'delta', 'chain', 'doge') + ], + 'FixedSizeList(2, Struct("id": Int64, "name": Utf8View, "chain": Utf8View))' + ) AS messages +) TO 'test_files/scratch/schema_evolution_fixed_size_list/fsl_additive/new.parquet' +STORED AS PARQUET; + +# Query evolved schema - old rows should have NULL for new field, new rows should have values +query I? +SELECT row_id, messages FROM fsl_messages ORDER BY row_id; +---- +1 [{id: 10, name: alpha, chain: NULL}, {id: 20, name: beta, chain: NULL}] +2 [{id: 30, name: gamma, chain: eth}, {id: 40, name: delta, chain: doge}] + +# Access specific fields +query IIT rowsort +SELECT + row_id, + get_field(messages[1], 'id') AS msg_id, + get_field(messages[1], 'chain') AS chain +FROM fsl_messages; +---- +1 10 NULL +2 30 eth + +# Test 2: FixedSizeList struct with null values +statement ok +CREATE EXTERNAL TABLE fsl_null_test ( + row_id INT, + items ARRAY> +) +STORED AS PARQUET +LOCATION 'test_files/scratch/schema_evolution_fixed_size_list/fsl_null/'; + +statement ok +COPY ( + SELECT + 1 AS row_id, + arrow_cast( + [ + named_struct('val', 100, 'tag', 'x'), + NULL + ], + 'FixedSizeList(2, Struct("val": Int64, "tag": Utf8View))' + ) AS items +) TO 'test_files/scratch/schema_evolution_fixed_size_list/fsl_null/old.parquet' +STORED AS PARQUET; + +query I? +SELECT row_id, items FROM fsl_null_test ORDER BY row_id; +---- +1 [{val: 100, tag: x}, NULL] + +# Test 3: FixedSizeList struct - field reordering with evolution +statement ok +CREATE EXTERNAL TABLE fsl_reorder ( + row_id INT, + data ARRAY> +) +STORED AS PARQUET +LOCATION 'test_files/scratch/schema_evolution_fixed_size_list/fsl_reorder/'; + +statement ok +COPY ( + SELECT + 1 AS row_id, + arrow_cast( + [ + named_struct('id', 1, 'name', 'first'), + named_struct('id', 2, 'name', 'second') + ], + 'FixedSizeList(2, Struct("id": Int64, "name": Utf8View))' + ) AS data +) TO 'test_files/scratch/schema_evolution_fixed_size_list/fsl_reorder/old.parquet' +STORED AS PARQUET; + +statement ok +COPY ( + SELECT + 2 AS row_id, + arrow_cast( + [ + named_struct('status', 'active', 'id', 3, 'name', 'third'), + named_struct('status', 'inactive', 'id', 4, 'name', 'fourth') + ], + 'FixedSizeList(2, Struct("status": Utf8View, "id": Int64, "name": Utf8View))' + ) AS data +) TO 'test_files/scratch/schema_evolution_fixed_size_list/fsl_reorder/new.parquet' +STORED AS PARQUET; + +query I? +SELECT row_id, data FROM fsl_reorder ORDER BY row_id; +---- +1 [{id: 1, name: first, status: NULL}, {id: 2, name: second, status: NULL}] +2 [{status: active, id: 3, name: third}, {status: inactive, id: 4, name: fourth}] + +# Verify field access works after reordering +query IT rowsort +SELECT + row_id, + get_field(data[1], 'status') AS status +FROM fsl_reorder; +---- +1 NULL +2 active From 14e88ac3008aae3ef6b942db2e79ee2056c42eac Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 31 Mar 2026 18:19:38 +0800 Subject: [PATCH 3/7] Refactor FixedSizeList handling and SQL tests Extract shared recursive child-cast for list-like containers. Add runtime size check for FixedSizeList to enforce consistency with planning. Update SQL tests to use proper DDL syntax, add DESCRIBE assertions, and document the fixed-size assumptions. Note that no SET cleanup is required. --- datafusion/common/src/nested_struct.rs | 68 ++++++++++++++----- datafusion/expr-common/src/columnar_value.rs | 2 + .../physical-expr/src/expressions/cast.rs | 5 +- .../schema_evolution_fixed_size_list.slt | 25 +++++-- 4 files changed, 77 insertions(+), 23 deletions(-) diff --git a/datafusion/common/src/nested_struct.rs b/datafusion/common/src/nested_struct.rs index 04831c69a2458..52409c155d2ea 100644 --- a/datafusion/common/src/nested_struct.rs +++ b/datafusion/common/src/nested_struct.rs @@ -227,11 +227,8 @@ fn cast_list_column( )) })?; - let cast_values = cast_column( - source_list.values(), - target_inner_field.data_type(), - cast_options, - )?; + let cast_values = + cast_nested_list_values(source_list.values(), target_inner_field, cast_options)?; let result = GenericListArray::::new( Arc::clone(target_inner_field), @@ -257,11 +254,8 @@ fn cast_list_view_column( )) })?; - let cast_values = cast_column( - source_list.values(), - target_inner_field.data_type(), - cast_options, - )?; + let cast_values = + cast_nested_list_values(source_list.values(), target_inner_field, cast_options)?; let result = GenericListViewArray::::try_new( Arc::clone(target_inner_field), @@ -291,11 +285,17 @@ fn cast_fixed_size_list_column( )) })?; - let cast_values = cast_column( - source_list.values(), - target_inner_field.data_type(), - cast_options, - )?; + let source_size = source_list.value_length(); + if source_size != target_size { + return _plan_err!( + "Cannot cast FixedSizeList column with size {} to size {}", + source_size, + target_size + ); + } + + let cast_values = + cast_nested_list_values(source_list.values(), target_inner_field, cast_options)?; let result = FixedSizeListArray::new( Arc::clone(target_inner_field), @@ -306,6 +306,14 @@ fn cast_fixed_size_list_column( Ok(Arc::new(result)) } +fn cast_nested_list_values( + source_values: &ArrayRef, + target_inner_field: &FieldRef, + cast_options: &CastOptions, +) -> Result { + cast_column(source_values, target_inner_field.data_type(), cast_options) +} + fn cast_dictionary_column( source_col: &ArrayRef, source_key_type: &DataType, @@ -477,6 +485,8 @@ pub fn validate_data_type_compatibility( DataType::FixedSizeList(s, source_size), DataType::FixedSizeList(t, target_size), ) => { + // FixedSizeList shape must match before validating child compatibility, + // as a size mismatch is not recoverable by any nested field adaptation. if source_size != target_size { return _plan_err!( "Cannot cast FixedSizeList field '{}' with size {} to size {}", @@ -531,8 +541,12 @@ pub fn requires_nested_struct_cast( (DataType::List(s), DataType::List(t)) | (DataType::LargeList(s), DataType::LargeList(t)) | (DataType::ListView(s), DataType::ListView(t)) - | (DataType::LargeListView(s), DataType::LargeListView(t)) - | (DataType::FixedSizeList(s, _), DataType::FixedSizeList(t, _)) => { + | (DataType::LargeListView(s), DataType::LargeListView(t)) => { + requires_nested_struct_cast(s.data_type(), t.data_type()) + } + // FixedSizeList length does not affect whether name-based nested struct + // adaptation is needed, only whether the runtime cast is valid. + (DataType::FixedSizeList(s, _), DataType::FixedSizeList(t, _)) => { requires_nested_struct_cast(s.data_type(), t.data_type()) } (DataType::Dictionary(_, s_val), DataType::Dictionary(_, t_val)) => { @@ -1454,6 +1468,26 @@ mod tests { ); } + #[test] + fn test_cast_fixed_size_list_size_mismatch_fails() { + use arrow::array::FixedSizeListArray; + + let values = Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef; + let source_list = + FixedSizeListArray::new(arc_field("item", DataType::Int32), 1, values, None); + let source_col: ArrayRef = Arc::new(source_list); + + let target_type = DataType::FixedSizeList(arc_field("item", DataType::Int32), 2); + + let result = cast_column(&source_col, &target_type, &DEFAULT_CAST_OPTIONS); + assert!(result.is_err()); + let error_msg = result.unwrap_err().to_string(); + assert_contains!( + error_msg, + "Cannot cast FixedSizeList column with size 1 to size 2" + ); + } + #[test] fn test_requires_nested_struct_cast_fixed_size_list() { let s1 = struct_type(vec![field("a", DataType::Int32)]); diff --git a/datafusion/expr-common/src/columnar_value.rs b/datafusion/expr-common/src/columnar_value.rs index bc6b8177ab3cf..a225f27854875 100644 --- a/datafusion/expr-common/src/columnar_value.rs +++ b/datafusion/expr-common/src/columnar_value.rs @@ -317,6 +317,8 @@ fn cast_array_by_name( array.data_type(), cast_type, ) { + // Planning uses the same predicate before building the physical cast, + // so this branch must remain the runtime mirror of that validation. datafusion_common::nested_struct::cast_column(array, cast_type, cast_options) } else { ensure_date_array_timestamp_bounds(array, cast_type)?; diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index 24e486f8050fe..9075a5a8ebcfe 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -324,8 +324,9 @@ pub fn cast_with_options( // Allow casts involving structs (including nested inside Lists, Dictionaries, // etc.) that pass name-based compatibility validation. This validation is // applied at planning time (now) to fail fast, rather than deferring errors - // to execution time. The name-based casting logic will be executed at runtime - // via ColumnarValue::cast_to. + // to execution time. Keep this predicate in sync with the runtime check in + // ColumnarValue::cast_to so planning only accepts casts that nested_struct:: + // cast_column can actually adapt. Ok(Arc::new(CastExpr::new(expr, cast_type, cast_options))) } else { not_impl_err!("Unsupported CAST from {expr_type} to {cast_type}") diff --git a/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt b/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt index 768034f4c7fab..1aa6736a9f63f 100644 --- a/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt +++ b/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt @@ -19,13 +19,15 @@ # End-user-facing coverage for FixedSizeList Parquet schema evolution. # Tests additive schema evolution (adding nullable fields) to FixedSizeList # containing struct values. +# Each FixedSizeList(2, ...) literal below intentionally matches exactly two +# array elements so the test exercises schema evolution, not list-size mismatch. ########## # Test 1: FixedSizeList struct with additive nullable field statement ok CREATE EXTERNAL TABLE fsl_messages ( row_id INT, - messages ARRAY> + messages STRUCT[2] ) STORED AS PARQUET LOCATION 'test_files/scratch/schema_evolution_fixed_size_list/fsl_additive/'; @@ -58,6 +60,12 @@ COPY ( ) TO 'test_files/scratch/schema_evolution_fixed_size_list/fsl_additive/new.parquet' STORED AS PARQUET; +query TTT +DESCRIBE fsl_messages; +---- +row_id Int32 YES +messages FixedSizeList(2 x Struct("id": Int64, "name": Utf8View, "chain": Utf8View)) YES + # Query evolved schema - old rows should have NULL for new field, new rows should have values query I? SELECT row_id, messages FROM fsl_messages ORDER BY row_id; @@ -80,7 +88,7 @@ FROM fsl_messages; statement ok CREATE EXTERNAL TABLE fsl_null_test ( row_id INT, - items ARRAY> + items STRUCT[2] ) STORED AS PARQUET LOCATION 'test_files/scratch/schema_evolution_fixed_size_list/fsl_null/'; @@ -108,7 +116,7 @@ SELECT row_id, items FROM fsl_null_test ORDER BY row_id; statement ok CREATE EXTERNAL TABLE fsl_reorder ( row_id INT, - data ARRAY> + data STRUCT[2] ) STORED AS PARQUET LOCATION 'test_files/scratch/schema_evolution_fixed_size_list/fsl_reorder/'; @@ -141,11 +149,17 @@ COPY ( ) TO 'test_files/scratch/schema_evolution_fixed_size_list/fsl_reorder/new.parquet' STORED AS PARQUET; +query TTT +DESCRIBE fsl_reorder; +---- +row_id Int32 YES +data FixedSizeList(2 x Struct("id": Int64, "name": Utf8View, "status": Utf8View)) YES + query I? SELECT row_id, data FROM fsl_reorder ORDER BY row_id; ---- 1 [{id: 1, name: first, status: NULL}, {id: 2, name: second, status: NULL}] -2 [{status: active, id: 3, name: third}, {status: inactive, id: 4, name: fourth}] +2 [{id: 3, name: third, status: active}, {id: 4, name: fourth, status: inactive}] # Verify field access works after reordering query IT rowsort @@ -156,3 +170,6 @@ FROM fsl_reorder; ---- 1 NULL 2 active + +# This file does not mutate DataFusion config with SET statements, so there is +# no configuration state to restore at teardown. From 72fab43002a476df93cce36bb83d5c04e96c0635 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 31 Mar 2026 18:38:09 +0800 Subject: [PATCH 4/7] Refactor and expand tests for FixedSizeList Add reusable downcast helper in nested_struct.rs to reduce casting boilerplate. Implement planning-side parity test in cast.rs for FixedSizeList size mismatch rejection. Enhance SQL coverage in schema_evolution_fixed_size_list.slt with additional all-null FixedSizeList rows and a negative case for schema compatibility checks. --- datafusion/common/src/nested_struct.rs | 45 ++++++++----------- .../physical-expr/src/expressions/cast.rs | 28 ++++++++++++ .../schema_evolution_fixed_size_list.slt | 41 +++++++++++++++++ 3 files changed, 87 insertions(+), 27 deletions(-) diff --git a/datafusion/common/src/nested_struct.rs b/datafusion/common/src/nested_struct.rs index 52409c155d2ea..df655ba6479b2 100644 --- a/datafusion/common/src/nested_struct.rs +++ b/datafusion/common/src/nested_struct.rs @@ -217,15 +217,8 @@ fn cast_list_column( target_inner_field: &FieldRef, cast_options: &CastOptions, ) -> Result { - let source_list = source_col - .as_any() - .downcast_ref::>() - .ok_or_else(|| { - crate::error::DataFusionError::Plan(format!( - "Expected list array but got {}", - source_col.data_type() - )) - })?; + let source_list = + downcast_list_array::>(source_col, "list array")?; let cast_values = cast_nested_list_values(source_list.values(), target_inner_field, cast_options)?; @@ -244,15 +237,8 @@ fn cast_list_view_column( target_inner_field: &FieldRef, cast_options: &CastOptions, ) -> Result { - let source_list = source_col - .as_any() - .downcast_ref::>() - .ok_or_else(|| { - crate::error::DataFusionError::Plan(format!( - "Expected list view array but got {}", - source_col.data_type() - )) - })?; + let source_list = + downcast_list_array::>(source_col, "list view array")?; let cast_values = cast_nested_list_values(source_list.values(), target_inner_field, cast_options)?; @@ -275,15 +261,8 @@ fn cast_fixed_size_list_column( ) -> Result { use arrow::array::FixedSizeListArray; - let source_list = source_col - .as_any() - .downcast_ref::() - .ok_or_else(|| { - crate::error::DataFusionError::Plan(format!( - "Expected fixed size list array but got {}", - source_col.data_type() - )) - })?; + let source_list = + downcast_list_array::(source_col, "fixed size list array")?; let source_size = source_list.value_length(); if source_size != target_size { @@ -314,6 +293,18 @@ fn cast_nested_list_values( cast_column(source_values, target_inner_field.data_type(), cast_options) } +fn downcast_list_array<'a, A: Array + 'static>( + source_col: &'a ArrayRef, + expected: &str, +) -> Result<&'a A> { + source_col.as_any().downcast_ref::().ok_or_else(|| { + crate::error::DataFusionError::Plan(format!( + "Expected {expected} but got {}", + source_col.data_type() + )) + }) +} + fn cast_dictionary_column( source_col: &ArrayRef, source_key_type: &DataType, diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index 9075a5a8ebcfe..245bbcd78dc38 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -974,6 +974,34 @@ mod tests { Ok(()) } + #[test] + fn fixed_size_list_struct_size_mismatch_fails_at_planning() -> Result<()> { + let source_type = FixedSizeList( + Arc::new(Field::new( + "item", + Struct(Fields::from(vec![Arc::new(Field::new("x", Int32, true))])), + true, + )), + 1, + ); + let schema = Schema::new(vec![Field::new("a", source_type, true)]); + + let invalid_target = FixedSizeList( + Arc::new(Field::new( + "item", + Struct(Fields::from(vec![Arc::new(Field::new("x", Int64, true))])), + true, + )), + 2, + ); + + let err = cast_with_options(col("a", &schema)?, &schema, invalid_target, None) + .expect_err("fixed-size-list size mismatch should fail during planning"); + assert!(err.to_string().contains("Unsupported CAST")); + + Ok(()) + } + #[test] #[ignore] // TODO: https://github.com/apache/datafusion/issues/5396 fn test_cast_decimal() -> Result<()> { diff --git a/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt b/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt index 1aa6736a9f63f..feb33cc091a9a 100644 --- a/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt +++ b/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt @@ -107,10 +107,25 @@ COPY ( ) TO 'test_files/scratch/schema_evolution_fixed_size_list/fsl_null/old.parquet' STORED AS PARQUET; +statement ok +COPY ( + SELECT + 2 AS row_id, + arrow_cast( + [ + NULL, + NULL + ], + 'FixedSizeList(2, Struct("val": Int64, "tag": Utf8View))' + ) AS items +) TO 'test_files/scratch/schema_evolution_fixed_size_list/fsl_null/all_null.parquet' +STORED AS PARQUET; + query I? SELECT row_id, items FROM fsl_null_test ORDER BY row_id; ---- 1 [{val: 100, tag: x}, NULL] +2 [NULL, NULL] # Test 3: FixedSizeList struct - field reordering with evolution statement ok @@ -171,5 +186,31 @@ FROM fsl_reorder; 1 NULL 2 active +# Test 4: FixedSizeList struct with incompatible nested type change should fail +statement ok +CREATE EXTERNAL TABLE fsl_incompatible ( + row_id INT, + items STRUCT[2] +) +STORED AS PARQUET +LOCATION 'test_files/scratch/schema_evolution_fixed_size_list/fsl_incompatible/'; + +statement ok +COPY ( + SELECT + 1 AS row_id, + arrow_cast( + [ + named_struct('id', X'01', 'name', 'alpha'), + named_struct('id', X'02', 'name', 'beta') + ], + 'FixedSizeList(2, Struct("id": BinaryView, "name": Utf8View))' + ) AS items +) TO 'test_files/scratch/schema_evolution_fixed_size_list/fsl_incompatible/new.parquet' +STORED AS PARQUET; + +query error DataFusion error: Execution error:[\s\S]*Cannot cast struct field 'id' from type BinaryView to type Int64 +SELECT row_id, items FROM fsl_incompatible; + # This file does not mutate DataFusion config with SET statements, so there is # no configuration state to restore at teardown. From a8e747a5d99c4a6efd09baea8cd9fb33fb438531 Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 31 Mar 2026 19:30:39 +0800 Subject: [PATCH 5/7] Refactor nested_struct, simplify tests and comments Inline redundant casts, merge identical logic for casting. Shorten comments and reduce test duplication by using local closures for FixedSizeList setup and simplifying error assertions. Clean up repetitive commentary in SLT file while retaining SQL statements and assertions. --- datafusion/common/src/nested_struct.rs | 51 ++++++++----------- .../physical-expr/src/expressions/cast.rs | 27 ++++------ .../schema_evolution_fixed_size_list.slt | 7 +-- 3 files changed, 33 insertions(+), 52 deletions(-) diff --git a/datafusion/common/src/nested_struct.rs b/datafusion/common/src/nested_struct.rs index df655ba6479b2..56b6ac02fe7e2 100644 --- a/datafusion/common/src/nested_struct.rs +++ b/datafusion/common/src/nested_struct.rs @@ -220,8 +220,11 @@ fn cast_list_column( let source_list = downcast_list_array::>(source_col, "list array")?; - let cast_values = - cast_nested_list_values(source_list.values(), target_inner_field, cast_options)?; + let cast_values = cast_column( + source_list.values(), + target_inner_field.data_type(), + cast_options, + )?; let result = GenericListArray::::new( Arc::clone(target_inner_field), @@ -240,8 +243,11 @@ fn cast_list_view_column( let source_list = downcast_list_array::>(source_col, "list view array")?; - let cast_values = - cast_nested_list_values(source_list.values(), target_inner_field, cast_options)?; + let cast_values = cast_column( + source_list.values(), + target_inner_field.data_type(), + cast_options, + )?; let result = GenericListViewArray::::try_new( Arc::clone(target_inner_field), @@ -273,8 +279,11 @@ fn cast_fixed_size_list_column( ); } - let cast_values = - cast_nested_list_values(source_list.values(), target_inner_field, cast_options)?; + let cast_values = cast_column( + source_list.values(), + target_inner_field.data_type(), + cast_options, + )?; let result = FixedSizeListArray::new( Arc::clone(target_inner_field), @@ -285,14 +294,6 @@ fn cast_fixed_size_list_column( Ok(Arc::new(result)) } -fn cast_nested_list_values( - source_values: &ArrayRef, - target_inner_field: &FieldRef, - cast_options: &CastOptions, -) -> Result { - cast_column(source_values, target_inner_field.data_type(), cast_options) -} - fn downcast_list_array<'a, A: Array + 'static>( source_col: &'a ArrayRef, expected: &str, @@ -476,8 +477,7 @@ pub fn validate_data_type_compatibility( DataType::FixedSizeList(s, source_size), DataType::FixedSizeList(t, target_size), ) => { - // FixedSizeList shape must match before validating child compatibility, - // as a size mismatch is not recoverable by any nested field adaptation. + // FixedSizeList sizes must match before nested field checks. if source_size != target_size { return _plan_err!( "Cannot cast FixedSizeList field '{}' with size {} to size {}", @@ -532,12 +532,8 @@ pub fn requires_nested_struct_cast( (DataType::List(s), DataType::List(t)) | (DataType::LargeList(s), DataType::LargeList(t)) | (DataType::ListView(s), DataType::ListView(t)) - | (DataType::LargeListView(s), DataType::LargeListView(t)) => { - requires_nested_struct_cast(s.data_type(), t.data_type()) - } - // FixedSizeList length does not affect whether name-based nested struct - // adaptation is needed, only whether the runtime cast is valid. - (DataType::FixedSizeList(s, _), DataType::FixedSizeList(t, _)) => { + | (DataType::LargeListView(s), DataType::LargeListView(t)) + | (DataType::FixedSizeList(s, _), DataType::FixedSizeList(t, _)) => { requires_nested_struct_cast(s.data_type(), t.data_type()) } (DataType::Dictionary(_, s_val), DataType::Dictionary(_, t_val)) => { @@ -1421,8 +1417,7 @@ mod tests { let result = cast_column(&source_col, &target_type, &DEFAULT_CAST_OPTIONS); assert!(result.is_err()); - let error_msg = result.unwrap_err().to_string(); - assert!(error_msg.contains("Cannot cast")); + assert!(result.unwrap_err().to_string().contains("Cannot cast")); } #[test] @@ -1452,10 +1447,9 @@ mod tests { let result = cast_column(&source_col, &target_type, &DEFAULT_CAST_OPTIONS); assert!(result.is_err()); - let error_msg = result.unwrap_err().to_string(); + let error = result.unwrap_err().to_string(); assert!( - error_msg.contains("cannot fill with NULL") - || error_msg.contains("non-nullable") + error.contains("cannot fill with NULL") || error.contains("non-nullable") ); } @@ -1472,9 +1466,8 @@ mod tests { let result = cast_column(&source_col, &target_type, &DEFAULT_CAST_OPTIONS); assert!(result.is_err()); - let error_msg = result.unwrap_err().to_string(); assert_contains!( - error_msg, + result.unwrap_err().to_string(), "Cannot cast FixedSizeList column with size 1 to size 2" ); } diff --git a/datafusion/physical-expr/src/expressions/cast.rs b/datafusion/physical-expr/src/expressions/cast.rs index 245bbcd78dc38..82a4a9de87823 100644 --- a/datafusion/physical-expr/src/expressions/cast.rs +++ b/datafusion/physical-expr/src/expressions/cast.rs @@ -321,12 +321,7 @@ pub fn cast_with_options( Ok(Arc::clone(&expr)) } else if requires_nested_struct_cast(&expr_type, &cast_type) { if can_cast_named_struct_types(&expr_type, &cast_type) { - // Allow casts involving structs (including nested inside Lists, Dictionaries, - // etc.) that pass name-based compatibility validation. This validation is - // applied at planning time (now) to fail fast, rather than deferring errors - // to execution time. Keep this predicate in sync with the runtime check in - // ColumnarValue::cast_to so planning only accepts casts that nested_struct:: - // cast_column can actually adapt. + // Keep this planner check in sync with ColumnarValue::cast_to runtime routing. Ok(Arc::new(CastExpr::new(expr, cast_type, cast_options))) } else { not_impl_err!("Unsupported CAST from {expr_type} to {cast_type}") @@ -976,22 +971,18 @@ mod tests { #[test] fn fixed_size_list_struct_size_mismatch_fails_at_planning() -> Result<()> { - let source_type = FixedSizeList( - Arc::new(Field::new( - "item", - Struct(Fields::from(vec![Arc::new(Field::new("x", Int32, true))])), - true, - )), + let fixed_size_list = |inner_type, size| { + FixedSizeList(Arc::new(Field::new("item", inner_type, true)), size) + }; + + let source_type = fixed_size_list( + Struct(Fields::from(vec![Arc::new(Field::new("x", Int32, true))])), 1, ); let schema = Schema::new(vec![Field::new("a", source_type, true)]); - let invalid_target = FixedSizeList( - Arc::new(Field::new( - "item", - Struct(Fields::from(vec![Arc::new(Field::new("x", Int64, true))])), - true, - )), + let invalid_target = fixed_size_list( + Struct(Fields::from(vec![Arc::new(Field::new("x", Int64, true))])), 2, ); diff --git a/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt b/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt index feb33cc091a9a..29bfbba96200d 100644 --- a/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt +++ b/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt @@ -17,8 +17,7 @@ ########## # End-user-facing coverage for FixedSizeList Parquet schema evolution. -# Tests additive schema evolution (adding nullable fields) to FixedSizeList -# containing struct values. +# Covers additive fields, nulls, field reordering, and incompatible nested type changes. # Each FixedSizeList(2, ...) literal below intentionally matches exactly two # array elements so the test exercises schema evolution, not list-size mismatch. ########## @@ -66,14 +65,13 @@ DESCRIBE fsl_messages; row_id Int32 YES messages FixedSizeList(2 x Struct("id": Int64, "name": Utf8View, "chain": Utf8View)) YES -# Query evolved schema - old rows should have NULL for new field, new rows should have values +# Old rows have NULL for the added field; new rows have values. query I? SELECT row_id, messages FROM fsl_messages ORDER BY row_id; ---- 1 [{id: 10, name: alpha, chain: NULL}, {id: 20, name: beta, chain: NULL}] 2 [{id: 30, name: gamma, chain: eth}, {id: 40, name: delta, chain: doge}] -# Access specific fields query IIT rowsort SELECT row_id, @@ -176,7 +174,6 @@ SELECT row_id, data FROM fsl_reorder ORDER BY row_id; 1 [{id: 1, name: first, status: NULL}, {id: 2, name: second, status: NULL}] 2 [{id: 3, name: third, status: active}, {id: 4, name: fourth, status: inactive}] -# Verify field access works after reordering query IT rowsort SELECT row_id, From 203d5294819f14c855202ed76cf908c4285d8fce Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 31 Mar 2026 20:33:09 +0800 Subject: [PATCH 6/7] Remove unnecessary comments and fix query error in FixedSizeList schema evolution tests --- .../test_files/schema_evolution_fixed_size_list.slt | 3 --- 1 file changed, 3 deletions(-) diff --git a/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt b/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt index 29bfbba96200d..30286befc6ce0 100644 --- a/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt +++ b/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt @@ -208,6 +208,3 @@ STORED AS PARQUET; query error DataFusion error: Execution error:[\s\S]*Cannot cast struct field 'id' from type BinaryView to type Int64 SELECT row_id, items FROM fsl_incompatible; - -# This file does not mutate DataFusion config with SET statements, so there is -# no configuration state to restore at teardown. From 834f37b96bd3c882f8073351e1c68bc9a81c862c Mon Sep 17 00:00:00 2001 From: Siew Kam Onn Date: Tue, 31 Mar 2026 21:12:37 +0800 Subject: [PATCH 7/7] Support FixedSizeList in NestedListKind harness Extend the expr_adapter.rs to accommodate FixedSizeList alongside List and LargeList. Update shared helpers and generated test macro for these list types. Integrate the FixedSizeList happy-path case into the existing schema evolution tests and remove the duplicate test file. --- datafusion/core/tests/parquet/expr_adapter.rs | 119 ++++++++-- .../schema_evolution_fixed_size_list.slt | 210 ------------------ .../test_files/schema_evolution_nested.slt | 58 +++++ 3 files changed, 155 insertions(+), 232 deletions(-) delete mode 100644 datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt diff --git a/datafusion/core/tests/parquet/expr_adapter.rs b/datafusion/core/tests/parquet/expr_adapter.rs index cf32efbd702fd..82d73d2531be0 100644 --- a/datafusion/core/tests/parquet/expr_adapter.rs +++ b/datafusion/core/tests/parquet/expr_adapter.rs @@ -18,8 +18,8 @@ use std::sync::Arc; use arrow::array::{ - Array, ArrayRef, BooleanArray, Int32Array, Int64Array, LargeListArray, ListArray, - RecordBatch, StringArray, StructArray, record_batch, + Array, ArrayRef, BooleanArray, FixedSizeListArray, Int32Array, Int64Array, + LargeListArray, ListArray, RecordBatch, StringArray, StructArray, record_batch, }; use arrow::buffer::OffsetBuffer; use arrow::compute::concat_batches; @@ -60,13 +60,19 @@ async fn write_parquet(batch: RecordBatch, store: Arc, path: &s enum NestedListKind { List, LargeList, + FixedSizeList, } +const FIXED_SIZE_LIST_LEN: usize = 2; + impl NestedListKind { fn field_data_type(self, item_field: Arc) -> DataType { match self { Self::List => DataType::List(item_field), Self::LargeList => DataType::LargeList(item_field), + Self::FixedSizeList => { + DataType::FixedSizeList(item_field, FIXED_SIZE_LIST_LEN as i32) + } } } @@ -89,6 +95,19 @@ impl NestedListKind { values, None, )), + Self::FixedSizeList => { + assert_eq!( + lengths.as_slice(), + &[FIXED_SIZE_LIST_LEN], + "FixedSizeList fixtures must contain exactly {FIXED_SIZE_LIST_LEN} elements per row" + ); + Arc::new(FixedSizeListArray::new( + item_field, + FIXED_SIZE_LIST_LEN as i32, + values, + None, + )) + } } } @@ -96,6 +115,7 @@ impl NestedListKind { match self { Self::List => "list", Self::LargeList => "large_list", + Self::FixedSizeList => "fixed_size_list", } } } @@ -277,7 +297,8 @@ fn nested_list_table_schema( } // Helper to extract message values from a nested list column. -// Returns the values at indices 0 and 1 from either a ListArray or LargeListArray. +// Returns the values at indices 0 and 1 from either a ListArray, LargeListArray, +// or FixedSizeListArray. fn extract_nested_list_values( kind: NestedListKind, column: &ArrayRef, @@ -297,7 +318,50 @@ fn extract_nested_list_values( .expect("messages should be a LargeListArray"); (list.value(0), list.value(1)) } + NestedListKind::FixedSizeList => { + let list = column + .as_any() + .downcast_ref::() + .expect("messages should be a FixedSizeListArray"); + (list.value(0), list.value(1)) + } + } +} + +fn evolved_messages(kind: NestedListKind) -> Vec> { + let mut messages = vec![NestedMessageRow { + id: 30, + name: "gamma", + chain: Some("eth"), + ignored: Some(99), + }]; + if matches!(kind, NestedListKind::FixedSizeList) { + messages.push(NestedMessageRow { + id: 40, + name: "delta", + chain: Some("doge"), + ignored: Some(100), + }); + } + messages +} + +fn error_messages(kind: NestedListKind) -> Vec> { + let mut messages = vec![NestedMessageRow { + id: 10, + name: "alpha", + chain: Some("eth"), + ignored: None, + }]; + if matches!(kind, NestedListKind::FixedSizeList) { + messages.push(NestedMessageRow { + id: 20, + name: "beta", + chain: Some("doge"), + ignored: None, + }); } + messages } // Helper to set up a nested list test fixture. @@ -352,15 +416,11 @@ async fn assert_nested_list_struct_schema_evolution(kind: NestedListKind) -> Res ); // new.parquet shape: messages item struct adds nullable `chain` and extra `ignored`. + let new_messages = evolved_messages(kind); let new_batch = nested_messages_batch( kind, 2, - &[NestedMessageRow { - id: 30, - name: "gamma", - chain: Some("eth"), - ignored: Some(99), - }], + &new_messages, &message_fields(DataType::Utf8, true, true, true), ); @@ -429,7 +489,12 @@ async fn assert_nested_list_struct_schema_evolution(kind: NestedListKind) -> Res .as_any() .downcast_ref::() .unwrap(); - assert_eq!(new_chain.iter().collect::>(), vec![Some("eth")]); + let expected_new_chain = if matches!(kind, NestedListKind::FixedSizeList) { + vec![Some("eth"), Some("doge")] + } else { + vec![Some("eth")] + }; + assert_eq!(new_chain.iter().collect::>(), expected_new_chain); let projected = ctx .sql( @@ -863,12 +928,12 @@ async fn test_struct_schema_evolution_projection_and_filter() -> Result<()> { Ok(()) } -/// Macro to generate paired test functions for List and LargeList variants. -/// Expands to two `#[tokio::test]` functions with the specified names. -macro_rules! test_struct_schema_evolution_pair { +/// Macro to generate schema evolution tests for list-like variants. +macro_rules! test_struct_schema_evolution_variants { ( list: $list_test:ident, large_list: $large_list_test:ident, + fixed_size_list: $fixed_size_list_test:ident, fn: $assertion_fn:path $(, args: $($arg:expr),+)? ) => { #[tokio::test] @@ -880,10 +945,16 @@ macro_rules! test_struct_schema_evolution_pair { async fn $large_list_test() { $assertion_fn(NestedListKind::LargeList $(, $($arg),+)?).await; } + + #[tokio::test] + async fn $fixed_size_list_test() { + $assertion_fn(NestedListKind::FixedSizeList $(, $($arg),+)?).await; + } }; ( list: $list_test:ident, large_list: $large_list_test:ident, + fixed_size_list: $fixed_size_list_test:ident, fn_result: $assertion_fn:path ) => { #[tokio::test] @@ -895,12 +966,18 @@ macro_rules! test_struct_schema_evolution_pair { async fn $large_list_test() -> Result<()> { $assertion_fn(NestedListKind::LargeList).await } + + #[tokio::test] + async fn $fixed_size_list_test() -> Result<()> { + $assertion_fn(NestedListKind::FixedSizeList).await + } }; } -test_struct_schema_evolution_pair!( +test_struct_schema_evolution_variants!( list: test_list_struct_schema_evolution_end_to_end, large_list: test_large_list_struct_schema_evolution_end_to_end, + fixed_size_list: test_fixed_size_list_struct_schema_evolution_end_to_end, fn_result: assert_nested_list_struct_schema_evolution ); @@ -910,15 +987,11 @@ async fn assert_nested_list_struct_schema_evolution_errors( chain_nullable: bool, expected_error: &str, ) { + let messages = error_messages(kind); let batch = nested_messages_batch( kind, 1, - &[NestedMessageRow { - id: 10, - name: "alpha", - chain: Some("eth"), - ignored: None, - }], + &messages, &message_fields(DataType::Utf8, true, true, false), ); @@ -970,15 +1043,17 @@ fn incompatible_chain_type() -> DataType { DataType::Struct(vec![Arc::new(Field::new("value", DataType::Utf8, true))].into()) } -test_struct_schema_evolution_pair!( +test_struct_schema_evolution_variants!( list: test_list_struct_schema_evolution_non_nullable_missing_field_fails, large_list: test_large_list_struct_schema_evolution_non_nullable_missing_field_fails, + fixed_size_list: test_fixed_size_list_struct_schema_evolution_non_nullable_missing_field_fails, fn: assert_non_nullable_missing_chain_field_fails ); -test_struct_schema_evolution_pair!( +test_struct_schema_evolution_variants!( list: test_list_struct_schema_evolution_incompatible_field_fails, large_list: test_large_list_struct_schema_evolution_incompatible_field_fails, + fixed_size_list: test_fixed_size_list_struct_schema_evolution_incompatible_field_fails, fn: assert_incompatible_chain_field_fails ); diff --git a/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt b/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt deleted file mode 100644 index 30286befc6ce0..0000000000000 --- a/datafusion/sqllogictest/test_files/schema_evolution_fixed_size_list.slt +++ /dev/null @@ -1,210 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -########## -# End-user-facing coverage for FixedSizeList Parquet schema evolution. -# Covers additive fields, nulls, field reordering, and incompatible nested type changes. -# Each FixedSizeList(2, ...) literal below intentionally matches exactly two -# array elements so the test exercises schema evolution, not list-size mismatch. -########## - -# Test 1: FixedSizeList struct with additive nullable field -statement ok -CREATE EXTERNAL TABLE fsl_messages ( - row_id INT, - messages STRUCT[2] -) -STORED AS PARQUET -LOCATION 'test_files/scratch/schema_evolution_fixed_size_list/fsl_additive/'; - -statement ok -COPY ( - SELECT - 1 AS row_id, - arrow_cast( - [ - named_struct('id', 10, 'name', 'alpha'), - named_struct('id', 20, 'name', 'beta') - ], - 'FixedSizeList(2, Struct("id": Int64, "name": Utf8View))' - ) AS messages -) TO 'test_files/scratch/schema_evolution_fixed_size_list/fsl_additive/old.parquet' -STORED AS PARQUET; - -statement ok -COPY ( - SELECT - 2 AS row_id, - arrow_cast( - [ - named_struct('id', 30, 'name', 'gamma', 'chain', 'eth'), - named_struct('id', 40, 'name', 'delta', 'chain', 'doge') - ], - 'FixedSizeList(2, Struct("id": Int64, "name": Utf8View, "chain": Utf8View))' - ) AS messages -) TO 'test_files/scratch/schema_evolution_fixed_size_list/fsl_additive/new.parquet' -STORED AS PARQUET; - -query TTT -DESCRIBE fsl_messages; ----- -row_id Int32 YES -messages FixedSizeList(2 x Struct("id": Int64, "name": Utf8View, "chain": Utf8View)) YES - -# Old rows have NULL for the added field; new rows have values. -query I? -SELECT row_id, messages FROM fsl_messages ORDER BY row_id; ----- -1 [{id: 10, name: alpha, chain: NULL}, {id: 20, name: beta, chain: NULL}] -2 [{id: 30, name: gamma, chain: eth}, {id: 40, name: delta, chain: doge}] - -query IIT rowsort -SELECT - row_id, - get_field(messages[1], 'id') AS msg_id, - get_field(messages[1], 'chain') AS chain -FROM fsl_messages; ----- -1 10 NULL -2 30 eth - -# Test 2: FixedSizeList struct with null values -statement ok -CREATE EXTERNAL TABLE fsl_null_test ( - row_id INT, - items STRUCT[2] -) -STORED AS PARQUET -LOCATION 'test_files/scratch/schema_evolution_fixed_size_list/fsl_null/'; - -statement ok -COPY ( - SELECT - 1 AS row_id, - arrow_cast( - [ - named_struct('val', 100, 'tag', 'x'), - NULL - ], - 'FixedSizeList(2, Struct("val": Int64, "tag": Utf8View))' - ) AS items -) TO 'test_files/scratch/schema_evolution_fixed_size_list/fsl_null/old.parquet' -STORED AS PARQUET; - -statement ok -COPY ( - SELECT - 2 AS row_id, - arrow_cast( - [ - NULL, - NULL - ], - 'FixedSizeList(2, Struct("val": Int64, "tag": Utf8View))' - ) AS items -) TO 'test_files/scratch/schema_evolution_fixed_size_list/fsl_null/all_null.parquet' -STORED AS PARQUET; - -query I? -SELECT row_id, items FROM fsl_null_test ORDER BY row_id; ----- -1 [{val: 100, tag: x}, NULL] -2 [NULL, NULL] - -# Test 3: FixedSizeList struct - field reordering with evolution -statement ok -CREATE EXTERNAL TABLE fsl_reorder ( - row_id INT, - data STRUCT[2] -) -STORED AS PARQUET -LOCATION 'test_files/scratch/schema_evolution_fixed_size_list/fsl_reorder/'; - -statement ok -COPY ( - SELECT - 1 AS row_id, - arrow_cast( - [ - named_struct('id', 1, 'name', 'first'), - named_struct('id', 2, 'name', 'second') - ], - 'FixedSizeList(2, Struct("id": Int64, "name": Utf8View))' - ) AS data -) TO 'test_files/scratch/schema_evolution_fixed_size_list/fsl_reorder/old.parquet' -STORED AS PARQUET; - -statement ok -COPY ( - SELECT - 2 AS row_id, - arrow_cast( - [ - named_struct('status', 'active', 'id', 3, 'name', 'third'), - named_struct('status', 'inactive', 'id', 4, 'name', 'fourth') - ], - 'FixedSizeList(2, Struct("status": Utf8View, "id": Int64, "name": Utf8View))' - ) AS data -) TO 'test_files/scratch/schema_evolution_fixed_size_list/fsl_reorder/new.parquet' -STORED AS PARQUET; - -query TTT -DESCRIBE fsl_reorder; ----- -row_id Int32 YES -data FixedSizeList(2 x Struct("id": Int64, "name": Utf8View, "status": Utf8View)) YES - -query I? -SELECT row_id, data FROM fsl_reorder ORDER BY row_id; ----- -1 [{id: 1, name: first, status: NULL}, {id: 2, name: second, status: NULL}] -2 [{id: 3, name: third, status: active}, {id: 4, name: fourth, status: inactive}] - -query IT rowsort -SELECT - row_id, - get_field(data[1], 'status') AS status -FROM fsl_reorder; ----- -1 NULL -2 active - -# Test 4: FixedSizeList struct with incompatible nested type change should fail -statement ok -CREATE EXTERNAL TABLE fsl_incompatible ( - row_id INT, - items STRUCT[2] -) -STORED AS PARQUET -LOCATION 'test_files/scratch/schema_evolution_fixed_size_list/fsl_incompatible/'; - -statement ok -COPY ( - SELECT - 1 AS row_id, - arrow_cast( - [ - named_struct('id', X'01', 'name', 'alpha'), - named_struct('id', X'02', 'name', 'beta') - ], - 'FixedSizeList(2, Struct("id": BinaryView, "name": Utf8View))' - ) AS items -) TO 'test_files/scratch/schema_evolution_fixed_size_list/fsl_incompatible/new.parquet' -STORED AS PARQUET; - -query error DataFusion error: Execution error:[\s\S]*Cannot cast struct field 'id' from type BinaryView to type Int64 -SELECT row_id, items FROM fsl_incompatible; diff --git a/datafusion/sqllogictest/test_files/schema_evolution_nested.slt b/datafusion/sqllogictest/test_files/schema_evolution_nested.slt index 53bc16fe51508..d19aa1948fa0c 100644 --- a/datafusion/sqllogictest/test_files/schema_evolution_nested.slt +++ b/datafusion/sqllogictest/test_files/schema_evolution_nested.slt @@ -122,3 +122,61 @@ FROM large_list_messages; ---- 1 10 NULL 2 30 eth + +statement ok +CREATE EXTERNAL TABLE fixed_size_list_messages ( + row_id INT, + messages STRUCT[2] +) +STORED AS PARQUET +LOCATION 'test_files/scratch/schema_evolution_nested/fixed_size_list_messages/'; + +statement ok +COPY ( + SELECT + 1 AS row_id, + arrow_cast( + [ + named_struct('id', 10, 'name', 'alpha'), + named_struct('id', 20, 'name', 'beta') + ], + 'FixedSizeList(2, Struct("id": Int64, "name": Utf8View))' + ) AS messages +) TO 'test_files/scratch/schema_evolution_nested/fixed_size_list_messages/old.parquet' +STORED AS PARQUET; + +statement ok +COPY ( + SELECT + 2 AS row_id, + arrow_cast( + [ + named_struct('id', 30, 'name', 'gamma', 'chain', 'eth'), + named_struct('id', 40, 'name', 'delta', 'chain', 'doge') + ], + 'FixedSizeList(2, Struct("id": Int64, "name": Utf8View, "chain": Utf8View))' + ) AS messages +) TO 'test_files/scratch/schema_evolution_nested/fixed_size_list_messages/new.parquet' +STORED AS PARQUET; + +query TTT +DESCRIBE fixed_size_list_messages; +---- +row_id Int32 YES +messages FixedSizeList(2 x Struct("id": Int64, "name": Utf8View, "chain": Utf8View)) YES + +query I? +SELECT row_id, messages FROM fixed_size_list_messages ORDER BY row_id; +---- +1 [{id: 10, name: alpha, chain: NULL}, {id: 20, name: beta, chain: NULL}] +2 [{id: 30, name: gamma, chain: eth}, {id: 40, name: delta, chain: doge}] + +query IIT rowsort +SELECT + row_id, + get_field(messages[1], 'id') AS msg_id, + get_field(messages[1], 'chain') AS chain +FROM fixed_size_list_messages; +---- +1 10 NULL +2 30 eth