From 244537b10ed2f3af4194fa6876b152a2a8a24c50 Mon Sep 17 00:00:00 2001 From: Zehua Zou Date: Fri, 12 Jun 2026 17:27:49 +0800 Subject: [PATCH 1/5] Add support for list view and large list view --- .../parquet/arrow/arrow_reader_writer_test.cc | 69 ++++++++++ cpp/src/parquet/arrow/arrow_schema_test.cc | 53 ++++--- cpp/src/parquet/arrow/path_internal.cc | 130 +++++++++++------- cpp/src/parquet/arrow/path_internal_test.cc | 25 ++++ cpp/src/parquet/arrow/reader.cc | 46 ++++++- cpp/src/parquet/arrow/schema.cc | 52 ++++--- cpp/src/parquet/arrow/writer.cc | 20 ++- 7 files changed, 289 insertions(+), 106 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index d29458bf226b..42b79c993eca 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -31,6 +31,7 @@ #include #include +#include "arrow/array/array_nested.h" #include "arrow/array/builder_binary.h" #include "arrow/array/builder_decimal.h" #include "arrow/array/builder_dict.h" @@ -3324,6 +3325,74 @@ TEST(ArrowReadWrite, LargeList) { } } +TEST(ArrowReadWrite, ListView) { + auto values = ArrayFromJSON(::arrow::int32(), "[1, 2, 3, 4, 5]"); + auto offsets = ArrayFromJSON(::arrow::int32(), "[3, 0, 5, 1]"); + auto sizes = ArrayFromJSON(::arrow::int32(), "[2, 1, 0, 2]"); + ASSERT_OK_AND_ASSIGN(auto array, ::arrow::ListViewArray::FromArrays( + ::arrow::list_view(::arrow::int32()), *offsets, + *sizes, *values, default_memory_pool())); + auto table = Table::Make( + ::arrow::schema({::arrow::field("root", array->type(), false)}), {array}); + + auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build(); + CheckSimpleRoundtrip(table, 2, props_store_schema); + + ASSERT_OK_AND_ASSIGN(auto expected_array, + ::arrow::ListArray::FromListView(*array, default_memory_pool())); + auto expected = Table::Make( + ::arrow::schema({::arrow::field("root", ::arrow::list(::arrow::int32()), false)}), + {expected_array}); + CheckConfiguredRoundtrip(table, expected); +} + +TEST(ArrowReadWrite, EmptyListView) { + auto type = ::arrow::list_view(::arrow::int32()); + auto array = ArrayFromJSON(type, "[]"); + auto table = Table::Make(::arrow::schema({::arrow::field("root", type)}), {array}); + + auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build(); + std::shared_ptr result; + ASSERT_NO_FATAL_FAILURE( + DoRoundtrip(table, 1, &result, default_writer_properties(), props_store_schema)); + ASSERT_OK(result->ValidateFull()); + + ASSERT_EQ(1, result->column(0)->num_chunks()); + const auto& list_view = + checked_cast(*result->column(0)->chunk(0)); + ASSERT_EQ(0, list_view.length()); + ASSERT_NE(nullptr, list_view.value_offsets()); + ASSERT_EQ(0, list_view.value_offsets()->size()); + ASSERT_NE(nullptr, list_view.value_sizes()); + ASSERT_EQ(0, list_view.value_sizes()->size()); +} + +TEST(ArrowReadWrite, LargeListView) { + auto values = ArrayFromJSON(::arrow::int32(), "[1, 2, 3, 4, 5]"); + auto offsets = ArrayFromJSON(::arrow::int64(), "[3, 0, 5, 1]"); + auto sizes = ArrayFromJSON(::arrow::int64(), "[2, 1, 0, 2]"); + ASSERT_OK_AND_ASSIGN(auto array, ::arrow::LargeListViewArray::FromArrays( + ::arrow::large_list_view(::arrow::int32()), + *offsets, *sizes, *values, default_memory_pool())); + auto table = Table::Make( + ::arrow::schema({::arrow::field("root", array->type(), false)}), {array}); + + auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build(); + CheckSimpleRoundtrip(table, 2, props_store_schema); + + ASSERT_OK_AND_ASSIGN(auto expected_array, + ::arrow::LargeListArray::FromListView( + checked_cast(*array), + default_memory_pool())); + auto expected = Table::Make( + ::arrow::schema({::arrow::field("root", ::arrow::large_list(::arrow::int32()))}), + {expected_array}); + ArrowReaderProperties reader_props; + reader_props.set_list_type(::arrow::Type::LARGE_LIST); + CheckConfiguredRoundtrip(table, expected, ::parquet::default_writer_properties(), + default_arrow_writer_properties(), reader_props); +} + TEST(ArrowReadWrite, FixedSizeList) { using ::arrow::field; using ::arrow::fixed_size_list; diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc index 7d9ecb5e6449..7a7b5a336939 100644 --- a/cpp/src/parquet/arrow/arrow_schema_test.cc +++ b/cpp/src/parquet/arrow/arrow_schema_test.cc @@ -1728,40 +1728,35 @@ TEST_F(TestConvertArrowSchema, ParquetOtherLists) { std::vector parquet_fields; std::vector> arrow_fields; - // parquet_arrow will always generate 3-level LIST encodings - - // // LargeList (list-like non-null, elements nullable) - // required group my_list (LIST) { - // repeated group list { - // optional binary element (UTF8); - // } - // } - { - auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL, - ParquetType::BYTE_ARRAY, ConvertedType::UTF8); - auto list = GroupNode::Make("list", Repetition::REPEATED, {element}); - parquet_fields.push_back( - GroupNode::Make("my_list", Repetition::REQUIRED, {list}, ConvertedType::LIST)); - auto arrow_element = ::arrow::field("string", UTF8, true); - auto arrow_list = ::arrow::large_list(arrow_element); - arrow_fields.push_back(::arrow::field("my_list", arrow_list, false)); - } - // // FixedSizeList[10] (list-like non-null, elements nullable) + // parquet_arrow will always generate this 3-level LIST encoding for list-like + // non-null Arrow arrays with nullable string elements: + // // required group my_list (LIST) { // repeated group list { // optional binary element (UTF8); // } // } - { - auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL, - ParquetType::BYTE_ARRAY, ConvertedType::UTF8); - auto list = GroupNode::Make("list", Repetition::REPEATED, {element}); - parquet_fields.push_back( - GroupNode::Make("my_list", Repetition::REQUIRED, {list}, ConvertedType::LIST)); - auto arrow_element = ::arrow::field("string", UTF8, true); - auto arrow_list = ::arrow::fixed_size_list(arrow_element, 10); - arrow_fields.push_back(::arrow::field("my_list", arrow_list, false)); - } + auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL, + ParquetType::BYTE_ARRAY, ConvertedType::UTF8); + auto list = GroupNode::Make("list", Repetition::REPEATED, {element}); + auto parquet_field = + GroupNode::Make("my_list", Repetition::REQUIRED, {list}, ConvertedType::LIST); + + auto AddListLikeField = [&](std::shared_ptr<::arrow::DataType> arrow_list) { + parquet_fields.push_back(parquet_field); + arrow_fields.push_back(::arrow::field("my_list", std::move(arrow_list), false)); + }; + + auto arrow_element = ::arrow::field("string", UTF8, true); + + // LargeList (list-like non-null, elements nullable) + AddListLikeField(::arrow::large_list(arrow_element)); + // ListView (list-like non-null, elements nullable) + AddListLikeField(::arrow::list_view(arrow_element)); + // LargeListView (list-like non-null, elements nullable) + AddListLikeField(::arrow::large_list_view(arrow_element)); + // FixedSizeList[10] (list-like non-null, elements nullable) + AddListLikeField(::arrow::fixed_size_list(arrow_element, 10)); ASSERT_OK(ConvertSchema(arrow_fields)); diff --git a/cpp/src/parquet/arrow/path_internal.cc b/cpp/src/parquet/arrow/path_internal.cc index 002859a5e7d5..eb45d0411c21 100644 --- a/cpp/src/parquet/arrow/path_internal.cc +++ b/cpp/src/parquet/arrow/path_internal.cc @@ -387,17 +387,16 @@ class ListPathNode { PathWriteContext* context) { // First fill int the remainder of the list. RETURN_IF_ERROR(FillRepLevels(child_range->Size(), rep_level_, context)); + // Once we've reached this point the following preconditions should hold: // 1. There are no more repeated path nodes to deal with. - // 2. All elements in |range| represent contiguous elements in the - // child array (Null values would have shortened the range to ensure - // all remaining list elements are present (though they may be empty lists)). + // 2. Null values would have shortened the range to ensure all remaining + // list elements are present (though they may be empty lists). // 3. No element of range spans a parent list (intermediate // list nodes only handle one list entry at a time). // - // Given these preconditions it should be safe to fill runs on non-empty + // Given these preconditions it is safe to fill runs on contiguous non-empty // lists here and expand the range in the child node accordingly. - while (!range->Empty()) { ElementRange size_check = selector_.GetRange(range->start); if (size_check.Empty()) { @@ -406,14 +405,20 @@ class ListPathNode { // def_levels entered first. break; } + if constexpr (RangeSelector::kCheckContiguous) { + if (size_check.start != child_range->end) { + break; + } + } else { + DCHECK_EQ(size_check.start, child_range->end) + << size_check.start << " != " << child_range->end; + } // This is the start of a new list. We can be sure it only applies // to the previous list (and doesn't jump to the start of any list // further up in nesting due to the constraints mentioned at the start // of the function). RETURN_IF_ERROR(context->AppendRepLevel(prev_rep_level_)); RETURN_IF_ERROR(context->AppendRepLevels(size_check.Size() - 1, rep_level_)); - DCHECK_EQ(size_check.start, child_range->end) - << size_check.start << " != " << child_range->end; child_range->end = size_check.end; ++range->start; } @@ -434,19 +439,37 @@ class ListPathNode { template struct VarRangeSelector { + static constexpr bool kCheckContiguous = false; + ElementRange GetRange(int64_t index) const { - return ElementRange{offsets[index], offsets[index + 1]}; + return ElementRange{.start = offsets[index], .end = offsets[index + 1]}; } // Either int32_t* or int64_t*. const OffsetType* offsets; }; +template +struct ListViewRangeSelector { + static constexpr bool kCheckContiguous = true; + + ElementRange GetRange(int64_t index) const { + const int64_t start = offsets[index]; + return ElementRange{.start = start, .end = start + sizes[index]}; + } + + const OffsetType* offsets; + const OffsetType* sizes; +}; + struct FixedSizedRangeSelector { + static constexpr bool kCheckContiguous = false; + ElementRange GetRange(int64_t index) const { int64_t start = index * list_size; - return ElementRange{start, start + list_size}; + return ElementRange{.start = start, .end = start + list_size}; } + int list_size; }; @@ -510,6 +533,8 @@ class NullableNode { using ListNode = ListPathNode>; using LargeListNode = ListPathNode>; +using ListViewNode = ListPathNode>; +using LargeListViewNode = ListPathNode>; using FixedSizeListNode = ListPathNode; // Contains static information derived from traversing the schema. @@ -517,9 +542,9 @@ struct PathInfo { // The vectors are expected to the same length info. // Note index order matters here. - using Node = - std::variant; + using Node = std::variant; std::vector path; std::shared_ptr primitive_array; @@ -529,6 +554,28 @@ struct PathInfo { bool leaf_is_nullable = false; }; +struct WritePathVisitor { + IterationResult operator()(NullableNode& node) { + return node.Run(stack_position, stack_position + 1, context); + } + IterationResult operator()(NullableTerminalNode& node) { + return node.Run(*stack_position, context); + } + IterationResult operator()(AllPresentTerminalNode& node) { + return node.Run(*stack_position, context); + } + IterationResult operator()(AllNullsTerminalNode& node) { + return node.Run(*stack_position, context); + } + template + IterationResult operator()(ListPathNode& node) { + return node.Run(stack_position, stack_position + 1, context); + } + + ElementRange* stack_position; + PathWriteContext* context; +}; + /// Contains logic for writing a single leaf node to parquet. /// This tracks the path from root to leaf. /// @@ -575,32 +622,7 @@ Status WritePath(ElementRange root_range, PathInfo* path_info, // |root_range| are processed. while (stack_position >= stack_base) { PathInfo::Node& node = path_info->path[stack_position - stack_base]; - struct { - IterationResult operator()(NullableNode& node) { - return node.Run(stack_position, stack_position + 1, context); - } - IterationResult operator()(ListNode& node) { - return node.Run(stack_position, stack_position + 1, context); - } - IterationResult operator()(NullableTerminalNode& node) { - return node.Run(*stack_position, context); - } - IterationResult operator()(FixedSizeListNode& node) { - return node.Run(stack_position, stack_position + 1, context); - } - IterationResult operator()(AllPresentTerminalNode& node) { - return node.Run(*stack_position, context); - } - IterationResult operator()(AllNullsTerminalNode& node) { - return node.Run(*stack_position, context); - } - IterationResult operator()(LargeListNode& node) { - return node.Run(stack_position, stack_position + 1, context); - } - ElementRange* stack_position; - PathWriteContext* context; - } visitor = {stack_position, &context}; - + WritePathVisitor visitor = {.stack_position = stack_position, .context = &context}; IterationResult result = std::visit(visitor, node); if (ARROW_PREDICT_FALSE(result == kError)) { @@ -637,20 +659,17 @@ struct FixupVisitor { int max_rep_level = -1; int16_t rep_level_if_null = kLevelNotSet; - template - void HandleListNode(T& arg) { - if (arg.rep_level() == max_rep_level) { - arg.SetLast(); + template + void operator()(ListPathNode& node) { + if (node.rep_level() == max_rep_level) { + node.SetLast(); // after the last list node we don't need to fill // rep levels on null. rep_level_if_null = kLevelNotSet; } else { - rep_level_if_null = arg.rep_level(); + rep_level_if_null = node.rep_level(); } } - void operator()(ListNode& node) { HandleListNode(node); } - void operator()(LargeListNode& node) { HandleListNode(node); } - void operator()(FixedSizeListNode& node) { HandleListNode(node); } // For non-list intermediate nodes. template @@ -740,6 +759,23 @@ class PathBuilder { return VisitInline(*array.values()); } + template + requires ::arrow::is_list_view_type::value + Status Visit(const T& array) { + MaybeAddNullable(array); + // Increment necessary due to empty lists. + info_.max_def_level++; + info_.max_rep_level++; + // raw_value_offsets() and raw_value_sizes() account for any slice offset. + ListPathNode> node( + ListViewRangeSelector{array.raw_value_offsets(), + array.raw_value_sizes()}, + info_.max_rep_level, info_.max_def_level - 1); + info_.path.emplace_back(std::move(node)); + nullable_in_parent_ = array.list_view_type()->value_field()->nullable(); + return VisitInline(*array.values()); + } + Status Visit(const ::arrow::DictionaryArray& array) { // Only currently handle DictionaryArray where the dictionary is a // primitive type @@ -830,8 +866,6 @@ class PathBuilder { // Types not yet supported in Parquet. NOT_IMPLEMENTED_VISIT(Union) NOT_IMPLEMENTED_VISIT(RunEndEncoded); - NOT_IMPLEMENTED_VISIT(ListView); - NOT_IMPLEMENTED_VISIT(LargeListView); #undef NOT_IMPLEMENTED_VISIT std::vector& paths() { return paths_; } diff --git a/cpp/src/parquet/arrow/path_internal_test.cc b/cpp/src/parquet/arrow/path_internal_test.cc index 0145e889ddaf..4222bfc55124 100644 --- a/cpp/src/parquet/arrow/path_internal_test.cc +++ b/cpp/src/parquet/arrow/path_internal_test.cc @@ -24,6 +24,7 @@ #include #include +#include "arrow/array/array_nested.h" #include "arrow/testing/gtest_util.h" #include "arrow/type.h" @@ -296,6 +297,30 @@ TEST_F(MultipathLevelBuilderTest, NullableSingleListWithAllPresentEntries) { EXPECT_THAT(result.post_list_elements[0].end, Eq(3)); } +TEST_F(MultipathLevelBuilderTest, ListViewOutOfOrder) { + auto values = ::arrow::ArrayFromJSON(::arrow::int64(), "[1, 2, 3, 4, 5]"); + auto offsets = ::arrow::ArrayFromJSON(::arrow::int32(), "[3, 0, 5, 1]"); + auto sizes = ::arrow::ArrayFromJSON(::arrow::int32(), "[2, 1, 0, 2]"); + ASSERT_OK_AND_ASSIGN( + auto array, ::arrow::ListViewArray::FromArrays( + ::arrow::list_view( + field("Entries", ::arrow::int64(), /*nullable=*/false)), + *offsets, *sizes, *values, default_memory_pool())); + + ASSERT_OK(MultipathLevelBuilder::Write(*array, /*nullable=*/false, &context_, + callback_)); + + ASSERT_THAT(results_, SizeIs(1)); + const CapturedResult& result = results_[0]; + result.CheckLevels(/*def_levels=*/{1, 1, 1, 0, 1, 1}, + /*rep_levels=*/{0, 1, 0, 0, 0, 1}); + ASSERT_THAT(result.post_list_elements, SizeIs(2)); + EXPECT_THAT(result.post_list_elements[0].start, Eq(3)); + EXPECT_THAT(result.post_list_elements[0].end, Eq(5)); + EXPECT_THAT(result.post_list_elements[1].start, Eq(0)); + EXPECT_THAT(result.post_list_elements[1].end, Eq(3)); +} + TEST_F(MultipathLevelBuilderTest, NullableSingleListWithAllEmptyEntries) { auto entries = field("Entries", ::arrow::int64(), /*nullable=*/true); auto list_type = list(entries); diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index a60af69aec9f..02ef6dd127bb 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -673,13 +673,39 @@ class ListReader : public ColumnReaderImpl { const std::shared_ptr field() override { return field_; } - private: + protected: std::shared_ptr ctx_; + + private: std::shared_ptr field_; ::parquet::internal::LevelInfo level_info_; std::unique_ptr item_reader_; }; +template +class PARQUET_NO_EXPORT ListViewReader : public ListReader { + public: + using ListReader::ListReader; + + ::arrow::Result> AssembleArray( + std::shared_ptr data) final { + DCHECK_EQ(data->buffers.size(), 2); + const auto* offsets = reinterpret_cast(data->buffers[1]->data()); + ARROW_ASSIGN_OR_RAISE( + auto sizes_buffer, + AllocateResizableBuffer(sizeof(IndexType) * data->length, this->ctx_->pool)); + auto* sizes = reinterpret_cast(sizes_buffer->mutable_data()); + for (int64_t i = 0; i < data->length; ++i) { + sizes[i] = offsets[i + 1] - offsets[i]; + } + data->buffers[1] = ::arrow::SliceBuffer(std::move(data->buffers[1]), 0, + sizeof(IndexType) * data->length); + data->buffers.push_back(std::move(sizes_buffer)); + std::shared_ptr result = ::arrow::MakeArray(data); + return std::make_shared(std::move(result)); + } +}; + class PARQUET_NO_EXPORT FixedSizeListReader : public ListReader { public: FixedSizeListReader(std::shared_ptr ctx, std::shared_ptr field, @@ -888,7 +914,9 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& arrow_f field.level_info); } else if (type_id == ::arrow::Type::LIST || type_id == ::arrow::Type::MAP || type_id == ::arrow::Type::FIXED_SIZE_LIST || - type_id == ::arrow::Type::LARGE_LIST) { + type_id == ::arrow::Type::LARGE_LIST || + type_id == ::arrow::Type::LIST_VIEW || + type_id == ::arrow::Type::LARGE_LIST_VIEW) { auto list_field = arrow_field; auto child = &field.children[0]; std::unique_ptr child_reader; @@ -941,6 +969,20 @@ Status GetReader(const SchemaField& field, const std::shared_ptr& arrow_f *out = std::make_unique>(ctx, list_field, field.level_info, std::move(child_reader)); + } else if (type_id == ::arrow::Type::LIST_VIEW) { + if (!reader_child_type->Equals(schema_child_type)) { + list_field = list_field->WithType(::arrow::list_view(reader_child_type)); + } + + *out = std::make_unique>(ctx, list_field, field.level_info, + std::move(child_reader)); + } else if (type_id == ::arrow::Type::LARGE_LIST_VIEW) { + if (!reader_child_type->Equals(schema_child_type)) { + list_field = list_field->WithType(::arrow::large_list_view(reader_child_type)); + } + + *out = std::make_unique>(ctx, list_field, field.level_info, + std::move(child_reader)); } else if (type_id == ::arrow::Type::FIXED_SIZE_LIST) { if (!reader_child_type->Equals(schema_child_type)) { auto& fixed_list_type = diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 9c4c462c6b8c..bc4de6c39b5e 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -19,6 +19,7 @@ #include #include +#include #include #include "arrow/extension/json.h" @@ -455,7 +456,9 @@ Status FieldToNode(const std::string& name, const std::shared_ptr& field, } case ArrowTypeId::FIXED_SIZE_LIST: case ArrowTypeId::LARGE_LIST: - case ArrowTypeId::LIST: { + case ArrowTypeId::LIST: + case ArrowTypeId::LARGE_LIST_VIEW: + case ArrowTypeId::LIST_VIEW: { auto list_type = std::static_pointer_cast<::arrow::BaseListType>(field->type()); return ListToNode(list_type, name, field->nullable(), field_id, properties, arrow_properties, out); @@ -984,6 +987,15 @@ Status GetOriginSchema(const std::shared_ptr& metadata, Result ApplyOriginalMetadata(const Field& origin_field, SchemaField* inferred); +template +auto GetListFactory(Args&&... args) { + return [... args = std::forward(args)](FieldVector fields) mutable { + DCHECK_EQ(fields.size(), 1); + return std::make_shared(std::move(fields[0]), + std::forward(args)...); + }; +} + std::function(FieldVector)> GetNestedFactory( const ArrowType& origin_type, const ArrowType& inferred_type) { switch (inferred_type.id()) { @@ -993,28 +1005,26 @@ std::function(FieldVector)> GetNestedFactory( } break; case ::arrow::Type::LIST: - case ::arrow::Type::LARGE_LIST: - if (origin_type.id() == ::arrow::Type::LIST) { - return [](FieldVector fields) { - DCHECK_EQ(fields.size(), 1); - return ::arrow::list(std::move(fields[0])); - }; - } - if (origin_type.id() == ::arrow::Type::LARGE_LIST) { - return [](FieldVector fields) { - DCHECK_EQ(fields.size(), 1); - return ::arrow::large_list(std::move(fields[0])); - }; - } - if (origin_type.id() == ::arrow::Type::FIXED_SIZE_LIST) { - const auto list_size = - checked_cast(origin_type).list_size(); - return [list_size](FieldVector fields) { - DCHECK_EQ(fields.size(), 1); - return ::arrow::fixed_size_list(std::move(fields[0]), list_size); - }; + case ::arrow::Type::LARGE_LIST: { + switch (origin_type.id()) { + case ::arrow::Type::LIST: + return GetListFactory<::arrow::ListType>(); + case ::arrow::Type::LARGE_LIST: + return GetListFactory<::arrow::LargeListType>(); + case ::arrow::Type::LIST_VIEW: + return GetListFactory<::arrow::ListViewType>(); + case ::arrow::Type::LARGE_LIST_VIEW: + return GetListFactory<::arrow::LargeListViewType>(); + case ::arrow::Type::FIXED_SIZE_LIST: { + const auto list_size = + checked_cast(origin_type).list_size(); + return GetListFactory<::arrow::FixedSizeListType>(list_size); + } + default: + break; } break; + } case ::arrow::Type::MAP: if (origin_type.id() == ::arrow::Type::MAP) { const bool keys_sorted = diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index 4b2b06e5e097..ea4d58f329f7 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -26,6 +26,7 @@ #include #include "arrow/array.h" +#include "arrow/array/concatenate.h" #include "arrow/extension_type.h" #include "arrow/ipc/writer.h" #include "arrow/record_batch.h" @@ -169,13 +170,20 @@ class ArrowColumnWriterV2 { leaf_idx, ctx, [&](const MultipathLevelBuilderResult& result) { size_t visited_component_size = result.post_list_visited_elements.size(); DCHECK_GT(visited_component_size, 0); - if (visited_component_size != 1) { - return Status::NotImplemented( - "Lists with non-zero length null components are not supported"); + std::shared_ptr values_array; + if (visited_component_size == 1) { + const ElementRange& range = result.post_list_visited_elements[0]; + values_array = result.leaf_array->Slice(range.start, range.Size()); + } else { + ::arrow::ArrayVector arrays; + arrays.reserve(visited_component_size); + for (const auto& range : result.post_list_visited_elements) { + DCHECK(!range.Empty()); + arrays.push_back(result.leaf_array->Slice(range.start, range.Size())); + } + ARROW_ASSIGN_OR_RAISE(values_array, + ::arrow::Concatenate(arrays, ctx->memory_pool)); } - const ElementRange& range = result.post_list_visited_elements[0]; - std::shared_ptr values_array = - result.leaf_array->Slice(range.start, range.Size()); return column_writer->WriteArrow(result.def_levels, result.rep_levels, result.def_rep_level_count, *values_array, From e707c7883c37390f1430d87291344ca4e0c71851 Mon Sep 17 00:00:00 2001 From: Zehua Zou Date: Fri, 12 Jun 2026 17:36:22 +0800 Subject: [PATCH 2/5] clang-format --- cpp/src/parquet/arrow/path_internal_test.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/arrow/path_internal_test.cc b/cpp/src/parquet/arrow/path_internal_test.cc index 4222bfc55124..2daa94e6226c 100644 --- a/cpp/src/parquet/arrow/path_internal_test.cc +++ b/cpp/src/parquet/arrow/path_internal_test.cc @@ -302,13 +302,13 @@ TEST_F(MultipathLevelBuilderTest, ListViewOutOfOrder) { auto offsets = ::arrow::ArrayFromJSON(::arrow::int32(), "[3, 0, 5, 1]"); auto sizes = ::arrow::ArrayFromJSON(::arrow::int32(), "[2, 1, 0, 2]"); ASSERT_OK_AND_ASSIGN( - auto array, ::arrow::ListViewArray::FromArrays( - ::arrow::list_view( - field("Entries", ::arrow::int64(), /*nullable=*/false)), - *offsets, *sizes, *values, default_memory_pool())); + auto array, + ::arrow::ListViewArray::FromArrays( + ::arrow::list_view(field("Entries", ::arrow::int64(), /*nullable=*/false)), + *offsets, *sizes, *values, default_memory_pool())); - ASSERT_OK(MultipathLevelBuilder::Write(*array, /*nullable=*/false, &context_, - callback_)); + ASSERT_OK( + MultipathLevelBuilder::Write(*array, /*nullable=*/false, &context_, callback_)); ASSERT_THAT(results_, SizeIs(1)); const CapturedResult& result = results_[0]; From 0683e1fdab4e00baf7b50223c5c13348ceb00652 Mon Sep 17 00:00:00 2001 From: Zehua Zou Date: Fri, 12 Jun 2026 18:05:44 +0800 Subject: [PATCH 3/5] fix large list roundtrip --- cpp/src/parquet/arrow/arrow_reader_writer_test.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index 42b79c993eca..505a350f801b 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -3371,9 +3371,10 @@ TEST(ArrowReadWrite, LargeListView) { auto values = ArrayFromJSON(::arrow::int32(), "[1, 2, 3, 4, 5]"); auto offsets = ArrayFromJSON(::arrow::int64(), "[3, 0, 5, 1]"); auto sizes = ArrayFromJSON(::arrow::int64(), "[2, 1, 0, 2]"); + auto element = ::arrow::field("element", ::arrow::int32()); ASSERT_OK_AND_ASSIGN(auto array, ::arrow::LargeListViewArray::FromArrays( - ::arrow::large_list_view(::arrow::int32()), - *offsets, *sizes, *values, default_memory_pool())); + ::arrow::large_list_view(element), *offsets, + *sizes, *values, default_memory_pool())); auto table = Table::Make( ::arrow::schema({::arrow::field("root", array->type(), false)}), {array}); @@ -3385,7 +3386,7 @@ TEST(ArrowReadWrite, LargeListView) { checked_cast(*array), default_memory_pool())); auto expected = Table::Make( - ::arrow::schema({::arrow::field("root", ::arrow::large_list(::arrow::int32()))}), + ::arrow::schema({::arrow::field("root", ::arrow::large_list(element), false)}), {expected_array}); ArrowReaderProperties reader_props; reader_props.set_list_type(::arrow::Type::LARGE_LIST); From 576195367635e8c710f86fbe7b21e4be03abd6f6 Mon Sep 17 00:00:00 2001 From: Zehua Zou Date: Mon, 15 Jun 2026 16:41:40 +0800 Subject: [PATCH 4/5] address review --- cpp/src/parquet/arrow/reader.cc | 9 ++++++++- cpp/src/parquet/arrow/writer.cc | 4 ++++ cpp/src/parquet/properties.h | 6 +++++- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 02ef6dd127bb..cc107c1802e3 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -42,6 +42,7 @@ #include "arrow/util/parallel.h" #include "arrow/util/range.h" #include "arrow/util/tracing_internal.h" +#include "arrow/util/type_traits.h" #include "parquet/arrow/reader_internal.h" #include "parquet/bloom_filter.h" @@ -689,6 +690,11 @@ class PARQUET_NO_EXPORT ListViewReader : public ListReader { ::arrow::Result> AssembleArray( std::shared_ptr data) final { + static_assert(::arrow::internal::IsOneOf::value); + constexpr auto expected_type_id = std::is_same_v + ? ::arrow::Type::LIST_VIEW + : ::arrow::Type::LARGE_LIST_VIEW; + DCHECK_EQ(this->field()->type()->id(), expected_type_id); DCHECK_EQ(data->buffers.size(), 2); const auto* offsets = reinterpret_cast(data->buffers[1]->data()); ARROW_ASSIGN_OR_RAISE( @@ -698,7 +704,8 @@ class PARQUET_NO_EXPORT ListViewReader : public ListReader { for (int64_t i = 0; i < data->length; ++i) { sizes[i] = offsets[i + 1] - offsets[i]; } - data->buffers[1] = ::arrow::SliceBuffer(std::move(data->buffers[1]), 0, + // ListReader produces length + 1 offsets; ListView stores one offset per slot. + data->buffers[1] = ::arrow::SliceBuffer(std::move(data->buffers[1]), /*offset=*/0, sizeof(IndexType) * data->length); data->buffers.push_back(std::move(sizes_buffer)); std::shared_ptr result = ::arrow::MakeArray(data); diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index ea4d58f329f7..e0fbe308219c 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -175,6 +175,10 @@ class ArrowColumnWriterV2 { const ElementRange& range = result.post_list_visited_elements[0]; values_array = result.leaf_array->Slice(range.start, range.Size()); } else { + // Multiple leaf ranges can be produced when child values are + // skipped, such as null fixed-size-list slots, or when + // list-view ranges are non-contiguous. Concatenate the slices + // in logical write order. ::arrow::ArrayVector arrays; arrays.reserve(visited_component_size); for (const auto& range : result.post_list_visited_elements) { diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 7eb37c3d52e3..905389d23a8a 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -1374,7 +1374,11 @@ class PARQUET_EXPORT ArrowWriterProperties { /// \brief EXPERIMENTAL: Write binary serialized Arrow schema to the file, /// to enable certain read options (like "read_dictionary") to be set - /// automatically + /// automatically. + /// + /// This is required to read back Arrow types that are stored using a + /// different Parquet physical representation, such as ListView and + /// LargeListView, which are stored as regular Parquet LIST columns. Builder* store_schema() { store_schema_ = true; return this; From 786157dfebd6a24cd9fe3453d48f45d97dc8bdee Mon Sep 17 00:00:00 2001 From: Zehua Zou Date: Mon, 15 Jun 2026 16:55:21 +0800 Subject: [PATCH 5/5] address review --- cpp/src/parquet/properties.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 905389d23a8a..e2244a1176e3 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -1374,11 +1374,8 @@ class PARQUET_EXPORT ArrowWriterProperties { /// \brief EXPERIMENTAL: Write binary serialized Arrow schema to the file, /// to enable certain read options (like "read_dictionary") to be set - /// automatically. - /// - /// This is required to read back Arrow types that are stored using a - /// different Parquet physical representation, such as ListView and - /// LargeListView, which are stored as regular Parquet LIST columns. + /// automatically or read back non-default Arrow types like ListView, + /// LargeListView, LargeList, and Arrow extension types. Builder* store_schema() { store_schema_ = true; return this;