Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <utility>
#include <vector>

#include "arrow/array/array_nested.h"
#include "arrow/array/builder_binary.h"
#include "arrow/array/builder_decimal.h"
#include "arrow/array/builder_dict.h"
Expand Down Expand Up @@ -3324,6 +3325,75 @@ TEST(ArrowReadWrite, LargeList) {
}
}

TEST(ArrowReadWrite, ListView) {
auto values = ArrayFromJSON(::arrow::int32(), "[1, 2, 3, 4, 5]");
auto offsets = ArrayFromJSON(::arrow::int32(), "[3, 0, 5, 1]");
auto sizes = ArrayFromJSON(::arrow::int32(), "[2, 1, 0, 2]");
ASSERT_OK_AND_ASSIGN(auto array, ::arrow::ListViewArray::FromArrays(
::arrow::list_view(::arrow::int32()), *offsets,
*sizes, *values, default_memory_pool()));
auto table = Table::Make(
::arrow::schema({::arrow::field("root", array->type(), false)}), {array});

auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build();

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we also check the expect List result when store_schema is false? This would further validate that the actual Parquet contents are ok.

CheckSimpleRoundtrip(table, 2, props_store_schema);

ASSERT_OK_AND_ASSIGN(auto expected_array,
::arrow::ListArray::FromListView(*array, default_memory_pool()));
auto expected = Table::Make(
::arrow::schema({::arrow::field("root", ::arrow::list(::arrow::int32()), false)}),
{expected_array});
CheckConfiguredRoundtrip(table, expected);
}

TEST(ArrowReadWrite, EmptyListView) {
auto type = ::arrow::list_view(::arrow::int32());
auto array = ArrayFromJSON(type, "[]");
auto table = Table::Make(::arrow::schema({::arrow::field("root", type)}), {array});

auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build();
std::shared_ptr<Table> result;
ASSERT_NO_FATAL_FAILURE(
DoRoundtrip(table, 1, &result, default_writer_properties(), props_store_schema));
ASSERT_OK(result->ValidateFull());

ASSERT_EQ(1, result->column(0)->num_chunks());
const auto& list_view =
checked_cast<const ::arrow::ListViewArray&>(*result->column(0)->chunk(0));
ASSERT_EQ(0, list_view.length());
ASSERT_NE(nullptr, list_view.value_offsets());
ASSERT_EQ(0, list_view.value_offsets()->size());
ASSERT_NE(nullptr, list_view.value_sizes());
ASSERT_EQ(0, list_view.value_sizes()->size());
}

TEST(ArrowReadWrite, LargeListView) {
auto values = ArrayFromJSON(::arrow::int32(), "[1, 2, 3, 4, 5]");

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use a single test by parameterizing schema and table to reduce duplicated lines? I don't remember if RecordBatchFromJson supports list view natively.

@HuaHuaY HuaHuaY Jun 15, 2026

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think there's much duplicate code here, and it's not easy to merge. Because without enabling store_schema, large list type requires manually changing the inner array name from item to element.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure why the child array would have a different name vs. the regular ListView case, but in any case you can have a helper function that also takes the inner element's name as argument.

auto offsets = ArrayFromJSON(::arrow::int64(), "[3, 0, 5, 1]");
auto sizes = ArrayFromJSON(::arrow::int64(), "[2, 1, 0, 2]");
auto element = ::arrow::field("element", ::arrow::int32());
ASSERT_OK_AND_ASSIGN(auto array, ::arrow::LargeListViewArray::FromArrays(
::arrow::large_list_view(element), *offsets,
*sizes, *values, default_memory_pool()));
auto table = Table::Make(
::arrow::schema({::arrow::field("root", array->type(), false)}), {array});

auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build();
Comment thread
HuaHuaY marked this conversation as resolved.
CheckSimpleRoundtrip(table, 2, props_store_schema);

ASSERT_OK_AND_ASSIGN(auto expected_array,
::arrow::LargeListArray::FromListView(
checked_cast<const ::arrow::LargeListViewArray&>(*array),
default_memory_pool()));
auto expected = Table::Make(
::arrow::schema({::arrow::field("root", ::arrow::large_list(element), false)}),
{expected_array});
ArrowReaderProperties reader_props;
reader_props.set_list_type(::arrow::Type::LARGE_LIST);
CheckConfiguredRoundtrip(table, expected, ::parquet::default_writer_properties(),
default_arrow_writer_properties(), reader_props);
}

TEST(ArrowReadWrite, FixedSizeList) {
using ::arrow::field;
using ::arrow::fixed_size_list;
Expand Down
53 changes: 24 additions & 29 deletions cpp/src/parquet/arrow/arrow_schema_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1728,40 +1728,35 @@ TEST_F(TestConvertArrowSchema, ParquetOtherLists) {
std::vector<NodePtr> parquet_fields;
std::vector<std::shared_ptr<Field>> arrow_fields;

// parquet_arrow will always generate 3-level LIST encodings

// // LargeList<String> (list-like non-null, elements nullable)
// required group my_list (LIST) {
// repeated group list {
// optional binary element (UTF8);
// }
// }
{
auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
parquet_fields.push_back(
GroupNode::Make("my_list", Repetition::REQUIRED, {list}, ConvertedType::LIST));
auto arrow_element = ::arrow::field("string", UTF8, true);
auto arrow_list = ::arrow::large_list(arrow_element);
arrow_fields.push_back(::arrow::field("my_list", arrow_list, false));
}
// // FixedSizeList[10]<String> (list-like non-null, elements nullable)
// parquet_arrow will always generate this 3-level LIST encoding for list-like
// non-null Arrow arrays with nullable string elements:
//
// required group my_list (LIST) {
// repeated group list {
// optional binary element (UTF8);
// }
// }
{
auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
parquet_fields.push_back(
GroupNode::Make("my_list", Repetition::REQUIRED, {list}, ConvertedType::LIST));
auto arrow_element = ::arrow::field("string", UTF8, true);
auto arrow_list = ::arrow::fixed_size_list(arrow_element, 10);
arrow_fields.push_back(::arrow::field("my_list", arrow_list, false));
}
auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
auto parquet_field =
GroupNode::Make("my_list", Repetition::REQUIRED, {list}, ConvertedType::LIST);

auto AddListLikeField = [&](std::shared_ptr<::arrow::DataType> arrow_list) {
parquet_fields.push_back(parquet_field);
arrow_fields.push_back(::arrow::field("my_list", std::move(arrow_list), false));
};

auto arrow_element = ::arrow::field("string", UTF8, true);

// LargeList<String> (list-like non-null, elements nullable)
AddListLikeField(::arrow::large_list(arrow_element));
// ListView<String> (list-like non-null, elements nullable)
AddListLikeField(::arrow::list_view(arrow_element));
// LargeListView<String> (list-like non-null, elements nullable)
AddListLikeField(::arrow::large_list_view(arrow_element));
// FixedSizeList[10]<String> (list-like non-null, elements nullable)
AddListLikeField(::arrow::fixed_size_list(arrow_element, 10));

ASSERT_OK(ConvertSchema(arrow_fields));

Expand Down
130 changes: 82 additions & 48 deletions cpp/src/parquet/arrow/path_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -387,17 +387,16 @@ class ListPathNode {
PathWriteContext* context) {
// First fill int the remainder of the list.
RETURN_IF_ERROR(FillRepLevels(child_range->Size(), rep_level_, context));

// Once we've reached this point the following preconditions should hold:
// 1. There are no more repeated path nodes to deal with.
// 2. All elements in |range| represent contiguous elements in the
// child array (Null values would have shortened the range to ensure
// all remaining list elements are present (though they may be empty lists)).
// 2. Null values would have shortened the range to ensure all remaining
// list elements are present (though they may be empty lists).
// 3. No element of range spans a parent list (intermediate
// list nodes only handle one list entry at a time).
//
// Given these preconditions it should be safe to fill runs on non-empty
// Given these preconditions it is safe to fill runs on contiguous non-empty
// lists here and expand the range in the child node accordingly.

while (!range->Empty()) {
ElementRange size_check = selector_.GetRange(range->start);
if (size_check.Empty()) {
Expand All @@ -406,14 +405,20 @@ class ListPathNode {
// def_levels entered first.
break;
}
if constexpr (RangeSelector::kCheckContiguous) {
if (size_check.start != child_range->end) {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you comment on why we're doing this? This code is complicated and this will help the reader.

break;
}
} else {
DCHECK_EQ(size_check.start, child_range->end)
<< size_check.start << " != " << child_range->end;
}
// This is the start of a new list. We can be sure it only applies
// to the previous list (and doesn't jump to the start of any list
// further up in nesting due to the constraints mentioned at the start
// of the function).
RETURN_IF_ERROR(context->AppendRepLevel(prev_rep_level_));
RETURN_IF_ERROR(context->AppendRepLevels(size_check.Size() - 1, rep_level_));
DCHECK_EQ(size_check.start, child_range->end)
<< size_check.start << " != " << child_range->end;
child_range->end = size_check.end;
++range->start;
}
Expand All @@ -434,19 +439,37 @@ class ListPathNode {

template <typename OffsetType>
struct VarRangeSelector {
static constexpr bool kCheckContiguous = false;

ElementRange GetRange(int64_t index) const {
return ElementRange{offsets[index], offsets[index + 1]};
return ElementRange{.start = offsets[index], .end = offsets[index + 1]};
}

// Either int32_t* or int64_t*.
const OffsetType* offsets;
};

template <typename OffsetType>
struct ListViewRangeSelector {
static constexpr bool kCheckContiguous = true;

ElementRange GetRange(int64_t index) const {
const int64_t start = offsets[index];
return ElementRange{.start = start, .end = start + sizes[index]};
}

const OffsetType* offsets;
const OffsetType* sizes;
};

struct FixedSizedRangeSelector {
static constexpr bool kCheckContiguous = false;

ElementRange GetRange(int64_t index) const {
int64_t start = index * list_size;
return ElementRange{start, start + list_size};
return ElementRange{.start = start, .end = start + list_size};
}

int list_size;
};

Expand Down Expand Up @@ -510,16 +533,18 @@ class NullableNode {

using ListNode = ListPathNode<VarRangeSelector<int32_t>>;
using LargeListNode = ListPathNode<VarRangeSelector<int64_t>>;
using ListViewNode = ListPathNode<ListViewRangeSelector<int32_t>>;
using LargeListViewNode = ListPathNode<ListViewRangeSelector<int64_t>>;
using FixedSizeListNode = ListPathNode<FixedSizedRangeSelector>;

// Contains static information derived from traversing the schema.
struct PathInfo {
// The vectors are expected to the same length info.

// Note index order matters here.
using Node =
std::variant<NullableTerminalNode, ListNode, LargeListNode, FixedSizeListNode,
NullableNode, AllPresentTerminalNode, AllNullsTerminalNode>;
using Node = std::variant<NullableTerminalNode, ListNode, LargeListNode, ListViewNode,
LargeListViewNode, FixedSizeListNode, NullableNode,
AllPresentTerminalNode, AllNullsTerminalNode>;

std::vector<Node> path;
std::shared_ptr<Array> primitive_array;
Expand All @@ -529,6 +554,28 @@ struct PathInfo {
bool leaf_is_nullable = false;
};

struct WritePathVisitor {
IterationResult operator()(NullableNode& node) {
return node.Run(stack_position, stack_position + 1, context);
}
IterationResult operator()(NullableTerminalNode& node) {
return node.Run(*stack_position, context);
}
IterationResult operator()(AllPresentTerminalNode& node) {
return node.Run(*stack_position, context);
}
IterationResult operator()(AllNullsTerminalNode& node) {
return node.Run(*stack_position, context);
}
template <typename RangeSelector>
IterationResult operator()(ListPathNode<RangeSelector>& node) {
return node.Run(stack_position, stack_position + 1, context);
}

ElementRange* stack_position;
PathWriteContext* context;
};

/// Contains logic for writing a single leaf node to parquet.
/// This tracks the path from root to leaf.
///
Expand Down Expand Up @@ -575,32 +622,7 @@ Status WritePath(ElementRange root_range, PathInfo* path_info,
// |root_range| are processed.
while (stack_position >= stack_base) {
PathInfo::Node& node = path_info->path[stack_position - stack_base];
struct {
IterationResult operator()(NullableNode& node) {
return node.Run(stack_position, stack_position + 1, context);
}
IterationResult operator()(ListNode& node) {
return node.Run(stack_position, stack_position + 1, context);
}
IterationResult operator()(NullableTerminalNode& node) {
return node.Run(*stack_position, context);
}
IterationResult operator()(FixedSizeListNode& node) {
return node.Run(stack_position, stack_position + 1, context);
}
IterationResult operator()(AllPresentTerminalNode& node) {
return node.Run(*stack_position, context);
}
IterationResult operator()(AllNullsTerminalNode& node) {
return node.Run(*stack_position, context);
}
IterationResult operator()(LargeListNode& node) {
return node.Run(stack_position, stack_position + 1, context);
}
ElementRange* stack_position;
PathWriteContext* context;
} visitor = {stack_position, &context};

WritePathVisitor visitor = {.stack_position = stack_position, .context = &context};
IterationResult result = std::visit(visitor, node);

if (ARROW_PREDICT_FALSE(result == kError)) {
Expand Down Expand Up @@ -637,20 +659,17 @@ struct FixupVisitor {
int max_rep_level = -1;
int16_t rep_level_if_null = kLevelNotSet;

template <typename T>
void HandleListNode(T& arg) {
if (arg.rep_level() == max_rep_level) {
arg.SetLast();
template <typename RangeSelector>
void operator()(ListPathNode<RangeSelector>& node) {
if (node.rep_level() == max_rep_level) {
node.SetLast();
// after the last list node we don't need to fill
// rep levels on null.
rep_level_if_null = kLevelNotSet;
} else {
rep_level_if_null = arg.rep_level();
rep_level_if_null = node.rep_level();
}
}
void operator()(ListNode& node) { HandleListNode(node); }
void operator()(LargeListNode& node) { HandleListNode(node); }
void operator()(FixedSizeListNode& node) { HandleListNode(node); }

// For non-list intermediate nodes.
template <typename T>
Expand Down Expand Up @@ -740,6 +759,23 @@ class PathBuilder {
return VisitInline(*array.values());
}

template <typename T>
requires ::arrow::is_list_view_type<typename T::TypeClass>::value
Status Visit(const T& array) {
MaybeAddNullable(array);
// Increment necessary due to empty lists.
info_.max_def_level++;
info_.max_rep_level++;
// raw_value_offsets() and raw_value_sizes() account for any slice offset.
ListPathNode<ListViewRangeSelector<typename T::offset_type>> node(
ListViewRangeSelector<typename T::offset_type>{array.raw_value_offsets(),
array.raw_value_sizes()},
info_.max_rep_level, info_.max_def_level - 1);
info_.path.emplace_back(std::move(node));
nullable_in_parent_ = array.list_view_type()->value_field()->nullable();
return VisitInline(*array.values());
}

Status Visit(const ::arrow::DictionaryArray& array) {
// Only currently handle DictionaryArray where the dictionary is a
// primitive type
Expand Down Expand Up @@ -830,8 +866,6 @@ class PathBuilder {
// Types not yet supported in Parquet.
NOT_IMPLEMENTED_VISIT(Union)
NOT_IMPLEMENTED_VISIT(RunEndEncoded);
NOT_IMPLEMENTED_VISIT(ListView);
NOT_IMPLEMENTED_VISIT(LargeListView);

#undef NOT_IMPLEMENTED_VISIT
std::vector<PathInfo>& paths() { return paths_; }
Expand Down
Loading
Loading