-
Notifications
You must be signed in to change notification settings - Fork 4.1k
GH-38849: [C++][Parquet] Add support for list view and large list view #50160
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -31,6 +31,7 @@ | |
| #include <utility> | ||
| #include <vector> | ||
|
|
||
| #include "arrow/array/array_nested.h" | ||
| #include "arrow/array/builder_binary.h" | ||
| #include "arrow/array/builder_decimal.h" | ||
| #include "arrow/array/builder_dict.h" | ||
|
|
@@ -3324,6 +3325,75 @@ TEST(ArrowReadWrite, LargeList) { | |
| } | ||
| } | ||
|
|
||
| TEST(ArrowReadWrite, ListView) { | ||
| auto values = ArrayFromJSON(::arrow::int32(), "[1, 2, 3, 4, 5]"); | ||
| auto offsets = ArrayFromJSON(::arrow::int32(), "[3, 0, 5, 1]"); | ||
| auto sizes = ArrayFromJSON(::arrow::int32(), "[2, 1, 0, 2]"); | ||
| ASSERT_OK_AND_ASSIGN(auto array, ::arrow::ListViewArray::FromArrays( | ||
| ::arrow::list_view(::arrow::int32()), *offsets, | ||
| *sizes, *values, default_memory_pool())); | ||
| auto table = Table::Make( | ||
| ::arrow::schema({::arrow::field("root", array->type(), false)}), {array}); | ||
|
|
||
| auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build(); | ||
| CheckSimpleRoundtrip(table, 2, props_store_schema); | ||
|
|
||
| ASSERT_OK_AND_ASSIGN(auto expected_array, | ||
| ::arrow::ListArray::FromListView(*array, default_memory_pool())); | ||
| auto expected = Table::Make( | ||
| ::arrow::schema({::arrow::field("root", ::arrow::list(::arrow::int32()), false)}), | ||
| {expected_array}); | ||
| CheckConfiguredRoundtrip(table, expected); | ||
| } | ||
|
|
||
| TEST(ArrowReadWrite, EmptyListView) { | ||
| auto type = ::arrow::list_view(::arrow::int32()); | ||
| auto array = ArrayFromJSON(type, "[]"); | ||
| auto table = Table::Make(::arrow::schema({::arrow::field("root", type)}), {array}); | ||
|
|
||
| auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build(); | ||
| std::shared_ptr<Table> result; | ||
| ASSERT_NO_FATAL_FAILURE( | ||
| DoRoundtrip(table, 1, &result, default_writer_properties(), props_store_schema)); | ||
| ASSERT_OK(result->ValidateFull()); | ||
|
|
||
| ASSERT_EQ(1, result->column(0)->num_chunks()); | ||
| const auto& list_view = | ||
| checked_cast<const ::arrow::ListViewArray&>(*result->column(0)->chunk(0)); | ||
| ASSERT_EQ(0, list_view.length()); | ||
| ASSERT_NE(nullptr, list_view.value_offsets()); | ||
| ASSERT_EQ(0, list_view.value_offsets()->size()); | ||
| ASSERT_NE(nullptr, list_view.value_sizes()); | ||
| ASSERT_EQ(0, list_view.value_sizes()->size()); | ||
| } | ||
|
|
||
| TEST(ArrowReadWrite, LargeListView) { | ||
| auto values = ArrayFromJSON(::arrow::int32(), "[1, 2, 3, 4, 5]"); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we use a single test by parameterizing
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think there's much duplicate code here, and it's not easy to merge. Because without enabling
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure why the child array would have a different name vs. the regular ListView case, but in any case you can have a helper function that also takes the inner element's name as argument. |
||
| auto offsets = ArrayFromJSON(::arrow::int64(), "[3, 0, 5, 1]"); | ||
| auto sizes = ArrayFromJSON(::arrow::int64(), "[2, 1, 0, 2]"); | ||
| auto element = ::arrow::field("element", ::arrow::int32()); | ||
| ASSERT_OK_AND_ASSIGN(auto array, ::arrow::LargeListViewArray::FromArrays( | ||
| ::arrow::large_list_view(element), *offsets, | ||
| *sizes, *values, default_memory_pool())); | ||
| auto table = Table::Make( | ||
| ::arrow::schema({::arrow::field("root", array->type(), false)}), {array}); | ||
|
|
||
| auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build(); | ||
|
HuaHuaY marked this conversation as resolved.
|
||
| CheckSimpleRoundtrip(table, 2, props_store_schema); | ||
|
|
||
| ASSERT_OK_AND_ASSIGN(auto expected_array, | ||
| ::arrow::LargeListArray::FromListView( | ||
| checked_cast<const ::arrow::LargeListViewArray&>(*array), | ||
| default_memory_pool())); | ||
| auto expected = Table::Make( | ||
| ::arrow::schema({::arrow::field("root", ::arrow::large_list(element), false)}), | ||
| {expected_array}); | ||
| ArrowReaderProperties reader_props; | ||
| reader_props.set_list_type(::arrow::Type::LARGE_LIST); | ||
| CheckConfiguredRoundtrip(table, expected, ::parquet::default_writer_properties(), | ||
| default_arrow_writer_properties(), reader_props); | ||
| } | ||
|
|
||
| TEST(ArrowReadWrite, FixedSizeList) { | ||
| using ::arrow::field; | ||
| using ::arrow::fixed_size_list; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -387,17 +387,16 @@ class ListPathNode { | |
| PathWriteContext* context) { | ||
| // First fill int the remainder of the list. | ||
| RETURN_IF_ERROR(FillRepLevels(child_range->Size(), rep_level_, context)); | ||
|
|
||
| // Once we've reached this point the following preconditions should hold: | ||
| // 1. There are no more repeated path nodes to deal with. | ||
| // 2. All elements in |range| represent contiguous elements in the | ||
| // child array (Null values would have shortened the range to ensure | ||
| // all remaining list elements are present (though they may be empty lists)). | ||
| // 2. Null values would have shortened the range to ensure all remaining | ||
| // list elements are present (though they may be empty lists). | ||
| // 3. No element of range spans a parent list (intermediate | ||
| // list nodes only handle one list entry at a time). | ||
| // | ||
| // Given these preconditions it should be safe to fill runs on non-empty | ||
| // Given these preconditions it is safe to fill runs on contiguous non-empty | ||
| // lists here and expand the range in the child node accordingly. | ||
|
|
||
| while (!range->Empty()) { | ||
| ElementRange size_check = selector_.GetRange(range->start); | ||
| if (size_check.Empty()) { | ||
|
|
@@ -406,14 +405,20 @@ class ListPathNode { | |
| // def_levels entered first. | ||
| break; | ||
| } | ||
| if constexpr (RangeSelector::kCheckContiguous) { | ||
| if (size_check.start != child_range->end) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you comment on why we're doing this? This code is complicated and this will help the reader. |
||
| break; | ||
| } | ||
| } else { | ||
| DCHECK_EQ(size_check.start, child_range->end) | ||
| << size_check.start << " != " << child_range->end; | ||
| } | ||
| // This is the start of a new list. We can be sure it only applies | ||
| // to the previous list (and doesn't jump to the start of any list | ||
| // further up in nesting due to the constraints mentioned at the start | ||
| // of the function). | ||
| RETURN_IF_ERROR(context->AppendRepLevel(prev_rep_level_)); | ||
| RETURN_IF_ERROR(context->AppendRepLevels(size_check.Size() - 1, rep_level_)); | ||
| DCHECK_EQ(size_check.start, child_range->end) | ||
| << size_check.start << " != " << child_range->end; | ||
| child_range->end = size_check.end; | ||
| ++range->start; | ||
| } | ||
|
|
@@ -434,19 +439,37 @@ class ListPathNode { | |
|
|
||
| template <typename OffsetType> | ||
| struct VarRangeSelector { | ||
| static constexpr bool kCheckContiguous = false; | ||
|
|
||
| ElementRange GetRange(int64_t index) const { | ||
| return ElementRange{offsets[index], offsets[index + 1]}; | ||
| return ElementRange{.start = offsets[index], .end = offsets[index + 1]}; | ||
| } | ||
|
|
||
| // Either int32_t* or int64_t*. | ||
| const OffsetType* offsets; | ||
| }; | ||
|
|
||
| template <typename OffsetType> | ||
| struct ListViewRangeSelector { | ||
| static constexpr bool kCheckContiguous = true; | ||
|
|
||
| ElementRange GetRange(int64_t index) const { | ||
| const int64_t start = offsets[index]; | ||
| return ElementRange{.start = start, .end = start + sizes[index]}; | ||
| } | ||
|
|
||
| const OffsetType* offsets; | ||
| const OffsetType* sizes; | ||
| }; | ||
|
|
||
| struct FixedSizedRangeSelector { | ||
| static constexpr bool kCheckContiguous = false; | ||
|
|
||
| ElementRange GetRange(int64_t index) const { | ||
| int64_t start = index * list_size; | ||
| return ElementRange{start, start + list_size}; | ||
| return ElementRange{.start = start, .end = start + list_size}; | ||
| } | ||
|
|
||
| int list_size; | ||
| }; | ||
|
|
||
|
|
@@ -510,16 +533,18 @@ class NullableNode { | |
|
|
||
| using ListNode = ListPathNode<VarRangeSelector<int32_t>>; | ||
| using LargeListNode = ListPathNode<VarRangeSelector<int64_t>>; | ||
| using ListViewNode = ListPathNode<ListViewRangeSelector<int32_t>>; | ||
| using LargeListViewNode = ListPathNode<ListViewRangeSelector<int64_t>>; | ||
| using FixedSizeListNode = ListPathNode<FixedSizedRangeSelector>; | ||
|
|
||
| // Contains static information derived from traversing the schema. | ||
| struct PathInfo { | ||
| // The vectors are expected to the same length info. | ||
|
|
||
| // Note index order matters here. | ||
| using Node = | ||
| std::variant<NullableTerminalNode, ListNode, LargeListNode, FixedSizeListNode, | ||
| NullableNode, AllPresentTerminalNode, AllNullsTerminalNode>; | ||
| using Node = std::variant<NullableTerminalNode, ListNode, LargeListNode, ListViewNode, | ||
| LargeListViewNode, FixedSizeListNode, NullableNode, | ||
| AllPresentTerminalNode, AllNullsTerminalNode>; | ||
|
|
||
| std::vector<Node> path; | ||
| std::shared_ptr<Array> primitive_array; | ||
|
|
@@ -529,6 +554,28 @@ struct PathInfo { | |
| bool leaf_is_nullable = false; | ||
| }; | ||
|
|
||
| struct WritePathVisitor { | ||
| IterationResult operator()(NullableNode& node) { | ||
| return node.Run(stack_position, stack_position + 1, context); | ||
| } | ||
| IterationResult operator()(NullableTerminalNode& node) { | ||
| return node.Run(*stack_position, context); | ||
| } | ||
| IterationResult operator()(AllPresentTerminalNode& node) { | ||
| return node.Run(*stack_position, context); | ||
| } | ||
| IterationResult operator()(AllNullsTerminalNode& node) { | ||
| return node.Run(*stack_position, context); | ||
| } | ||
| template <typename RangeSelector> | ||
| IterationResult operator()(ListPathNode<RangeSelector>& node) { | ||
| return node.Run(stack_position, stack_position + 1, context); | ||
| } | ||
|
|
||
| ElementRange* stack_position; | ||
| PathWriteContext* context; | ||
| }; | ||
|
|
||
| /// Contains logic for writing a single leaf node to parquet. | ||
| /// This tracks the path from root to leaf. | ||
| /// | ||
|
|
@@ -575,32 +622,7 @@ Status WritePath(ElementRange root_range, PathInfo* path_info, | |
| // |root_range| are processed. | ||
| while (stack_position >= stack_base) { | ||
| PathInfo::Node& node = path_info->path[stack_position - stack_base]; | ||
| struct { | ||
| IterationResult operator()(NullableNode& node) { | ||
| return node.Run(stack_position, stack_position + 1, context); | ||
| } | ||
| IterationResult operator()(ListNode& node) { | ||
| return node.Run(stack_position, stack_position + 1, context); | ||
| } | ||
| IterationResult operator()(NullableTerminalNode& node) { | ||
| return node.Run(*stack_position, context); | ||
| } | ||
| IterationResult operator()(FixedSizeListNode& node) { | ||
| return node.Run(stack_position, stack_position + 1, context); | ||
| } | ||
| IterationResult operator()(AllPresentTerminalNode& node) { | ||
| return node.Run(*stack_position, context); | ||
| } | ||
| IterationResult operator()(AllNullsTerminalNode& node) { | ||
| return node.Run(*stack_position, context); | ||
| } | ||
| IterationResult operator()(LargeListNode& node) { | ||
| return node.Run(stack_position, stack_position + 1, context); | ||
| } | ||
| ElementRange* stack_position; | ||
| PathWriteContext* context; | ||
| } visitor = {stack_position, &context}; | ||
|
|
||
| WritePathVisitor visitor = {.stack_position = stack_position, .context = &context}; | ||
| IterationResult result = std::visit(visitor, node); | ||
|
|
||
| if (ARROW_PREDICT_FALSE(result == kError)) { | ||
|
|
@@ -637,20 +659,17 @@ struct FixupVisitor { | |
| int max_rep_level = -1; | ||
| int16_t rep_level_if_null = kLevelNotSet; | ||
|
|
||
| template <typename T> | ||
| void HandleListNode(T& arg) { | ||
| if (arg.rep_level() == max_rep_level) { | ||
| arg.SetLast(); | ||
| template <typename RangeSelector> | ||
| void operator()(ListPathNode<RangeSelector>& node) { | ||
| if (node.rep_level() == max_rep_level) { | ||
| node.SetLast(); | ||
| // after the last list node we don't need to fill | ||
| // rep levels on null. | ||
| rep_level_if_null = kLevelNotSet; | ||
| } else { | ||
| rep_level_if_null = arg.rep_level(); | ||
| rep_level_if_null = node.rep_level(); | ||
| } | ||
| } | ||
| void operator()(ListNode& node) { HandleListNode(node); } | ||
| void operator()(LargeListNode& node) { HandleListNode(node); } | ||
| void operator()(FixedSizeListNode& node) { HandleListNode(node); } | ||
|
|
||
| // For non-list intermediate nodes. | ||
| template <typename T> | ||
|
|
@@ -740,6 +759,23 @@ class PathBuilder { | |
| return VisitInline(*array.values()); | ||
| } | ||
|
|
||
| template <typename T> | ||
| requires ::arrow::is_list_view_type<typename T::TypeClass>::value | ||
| Status Visit(const T& array) { | ||
| MaybeAddNullable(array); | ||
| // Increment necessary due to empty lists. | ||
| info_.max_def_level++; | ||
| info_.max_rep_level++; | ||
| // raw_value_offsets() and raw_value_sizes() account for any slice offset. | ||
| ListPathNode<ListViewRangeSelector<typename T::offset_type>> node( | ||
| ListViewRangeSelector<typename T::offset_type>{array.raw_value_offsets(), | ||
| array.raw_value_sizes()}, | ||
| info_.max_rep_level, info_.max_def_level - 1); | ||
| info_.path.emplace_back(std::move(node)); | ||
| nullable_in_parent_ = array.list_view_type()->value_field()->nullable(); | ||
| return VisitInline(*array.values()); | ||
| } | ||
|
|
||
| Status Visit(const ::arrow::DictionaryArray& array) { | ||
| // Only currently handle DictionaryArray where the dictionary is a | ||
| // primitive type | ||
|
|
@@ -830,8 +866,6 @@ class PathBuilder { | |
| // Types not yet supported in Parquet. | ||
| NOT_IMPLEMENTED_VISIT(Union) | ||
| NOT_IMPLEMENTED_VISIT(RunEndEncoded); | ||
| NOT_IMPLEMENTED_VISIT(ListView); | ||
| NOT_IMPLEMENTED_VISIT(LargeListView); | ||
|
|
||
| #undef NOT_IMPLEMENTED_VISIT | ||
| std::vector<PathInfo>& paths() { return paths_; } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we also check the expect List result when
store_schemais false? This would further validate that the actual Parquet contents are ok.