From 8fe0435daacdf659d5052a8be710c8f08d6bf4bd Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Mon, 15 Jun 2026 17:36:04 +0000 Subject: [PATCH 1/6] GH-50182: [C++][Parquet] Fix truncated min/max statistics for all-infinity float columns Seed the floating-point and Float16 min/max accumulation with +/-infinity instead of the largest/smallest finite value. A finite seed is not an identity for min/max once the data contains infinities, so an all-+Inf column previously reported min = FLT_MAX/DBL_MAX (and an all--Inf column reported max = -FLT_MAX/-DBL_MAX) rather than the infinity actually present -- a value written with is_min_value_exact = true. Update the empty/all-NaN detection in CleanStatistic and CleanFloat16Statistic to match the new seeds; an all-NaN or empty column still leaves min = +Inf, max = -Inf (min > max, impossible for real data) and so emits no statistics. Add a TestFloatStatistics.Infinities typed test covering FLOAT, DOUBLE and FLOAT16. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cpp/src/parquet/statistics.cc | 43 +++++++++++++++++++++++++----- cpp/src/parquet/statistics_test.cc | 40 +++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 09c48ef0ff69..b471d48eb1d8 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -62,6 +62,12 @@ struct Float16Constants { static constexpr const uint8_t* max() { return max_.data(); } static constexpr const uint8_t* positive_zero() { return positive_zero_.data(); } static constexpr const uint8_t* negative_zero() { return negative_zero_.data(); } + static constexpr const uint8_t* positive_infinity() { + return positive_infinity_.data(); + } + static constexpr const uint8_t* negative_infinity() { + return negative_infinity_.data(); + } private: using Bytes = std::array; @@ -70,6 +76,10 @@ struct Float16Constants { static constexpr Bytes max_ = std::numeric_limits::max().ToLittleEndian(); static constexpr Bytes positive_zero_ = (+Float16::FromBits(0)).ToLittleEndian(); static constexpr Bytes negative_zero_ = (-Float16::FromBits(0)).ToLittleEndian(); + static constexpr Bytes positive_infinity_ = + std::numeric_limits::infinity().ToLittleEndian(); + static constexpr Bytes negative_infinity_ = + (-std::numeric_limits::infinity()).ToLittleEndian(); }; template @@ -79,8 +89,24 @@ struct CompareHelper { static_assert(!std::is_unsigned::value || std::is_same::value, "T is an unsigned numeric"); - constexpr static T DefaultMin() { return std::numeric_limits::max(); } - constexpr static T DefaultMax() { return std::numeric_limits::lowest(); } + // For floating point, seed the running min/max with +/-infinity rather than + // the largest/smallest finite value: a finite seed is not an identity for + // min/max once the data contains infinities, so an all-+Inf (resp. all--Inf) + // column would otherwise never displace the seed and report a finite bound. + constexpr static T DefaultMin() { + if constexpr (std::is_floating_point_v) { + return std::numeric_limits::infinity(); + } else { + return std::numeric_limits::max(); + } + } + constexpr static T DefaultMax() { + if constexpr (std::is_floating_point_v) { + return -std::numeric_limits::infinity(); + } else { + return std::numeric_limits::lowest(); + } + } // MSVC17 fix, isnan is not overloaded for IntegralType as per C++11 // standard requirements. @@ -300,8 +326,8 @@ template <> struct CompareHelper { using T = FLBA; - static T DefaultMin() { return T{Float16Constants::max()}; } - static T DefaultMax() { return T{Float16Constants::lowest()}; } + static T DefaultMin() { return T{Float16Constants::positive_infinity()}; } + static T DefaultMax() { return T{Float16Constants::negative_infinity()}; } static T Coalesce(T val, T fallback) { return (val.ptr == nullptr || Float16::FromLittleEndian(val.ptr).is_nan()) ? fallback @@ -357,7 +383,10 @@ CleanStatistic(std::pair min_max, LogicalType::Type::type) { return ::std::nullopt; } - if (min == std::numeric_limits::max() && max == std::numeric_limits::lowest()) { + // No valid (non-NaN) value was seen: the running min/max still hold the + // +/-infinity seeds (note min > max here, which real data can never produce). + if (min == std::numeric_limits::infinity() && + max == -std::numeric_limits::infinity()) { return ::std::nullopt; } @@ -384,8 +413,8 @@ optional> CleanFloat16Statistic(std::pair min_ return ::std::nullopt; } - if (min == std::numeric_limits::max() && - max == std::numeric_limits::lowest()) { + if (min == std::numeric_limits::infinity() && + max == -std::numeric_limits::infinity()) { return ::std::nullopt; } diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 905502cb0a57..5f6ff09cff0c 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -1485,6 +1485,7 @@ class TestFloatStatistics : public ::testing::Test { } void TestNaNs(); + void TestInfinities(); protected: std::vector data_buf_; @@ -1559,6 +1560,24 @@ void TestFloatStatistics::TestNaNs() { valid_bitmap_no_nans); } +// GH-50182: an all-infinity column must report that infinity as its min/max, +// rather than the largest finite value (a finite seed is never displaced by an +// infinity of the same sign). +template +void TestFloatStatistics::TestInfinities() { + NodePtr node = this->MakeNode("f", Repetition::REQUIRED); + ColumnDescriptor descr(node, 0, 0); + + constexpr c_type inf = std::numeric_limits::infinity(); + std::array all_pos_inf{inf, inf, inf}; + std::array all_neg_inf{-inf, -inf, -inf}; + std::array mixed_inf{inf, -inf}; + + AssertMinMaxAre(MakeStatistics(&descr), all_pos_inf, inf, inf); + AssertMinMaxAre(MakeStatistics(&descr), all_neg_inf, -inf, -inf); + AssertMinMaxAre(MakeStatistics(&descr), mixed_inf, -inf, inf); +} + struct BufferedFloat16 { explicit BufferedFloat16(Float16 f16) : f16(f16) { this->f16.ToLittleEndian(bytes_.data()); @@ -1611,12 +1630,33 @@ void TestFloatStatistics::TestNaNs() { valid_bitmap_no_nans); } +template <> +void TestFloatStatistics::TestInfinities() { + NodePtr node = this->MakeNode("f", Repetition::REQUIRED); + ColumnDescriptor descr(node, 0, 0); + + using F16 = BufferedFloat16; + const auto pos_inf = F16(std::numeric_limits::infinity()); + const auto neg_inf = F16(-std::numeric_limits::infinity()); + const auto pinf = FLBA{pos_inf.bytes()}; + const auto ninf = FLBA{neg_inf.bytes()}; + + std::array all_pos_inf{pinf, pinf, pinf}; + std::array all_neg_inf{ninf, ninf, ninf}; + std::array mixed_inf{pinf, ninf}; + + AssertMinMaxAre(MakeStatistics(&descr), all_pos_inf, pinf, pinf); + AssertMinMaxAre(MakeStatistics(&descr), all_neg_inf, ninf, ninf); + AssertMinMaxAre(MakeStatistics(&descr), mixed_inf, ninf, pinf); +} + using FloatingPointTypes = ::testing::Types; TYPED_TEST_SUITE(TestFloatStatistics, FloatingPointTypes); TYPED_TEST(TestFloatStatistics, NegativeZeros) { this->TestNegativeZeroes(); } TYPED_TEST(TestFloatStatistics, NaNs) { this->TestNaNs(); } +TYPED_TEST(TestFloatStatistics, Infinities) { this->TestInfinities(); } // ARROW-7376 TEST(TestStatisticsSortOrderFloatNaN, NaNAndNullsInfiniteLoop) { From 7a4ea74541b18d5ec0fd41905684d9d7d277b7f3 Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Mon, 15 Jun 2026 17:49:52 +0000 Subject: [PATCH 2/6] Address review: replace hardcoded infinity sentinel with IsUninitializedMinMax The empty/all-NaN detection in CleanStatistic / CleanFloat16Statistic now uses a single IsUninitializedMinMax(min, max) helper (max < min) instead of comparing against the +/-infinity seeds directly. It relies only on the invariant DefaultMin > DefaultMax, so the seed and its detection cannot drift. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cpp/src/parquet/statistics.cc | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index b471d48eb1d8..fb8f1582e6f9 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -356,6 +356,15 @@ struct CompareHelper { using ::std::optional; +// True when min/max still hold the DefaultMin()/DefaultMax() seeds, i.e. no +// valid (non-NaN) value was observed. The seeds satisfy DefaultMin > DefaultMax +// -- an ordering real data can never produce -- so this does not depend on the +// specific seed values and stays in sync with DefaultMin()/DefaultMax(). +template +bool IsUninitializedMinMax(const T& min, const T& max) { + return max < min; +} + template ::arrow::enable_if_t::value, optional>> CleanStatistic(std::pair min_max, LogicalType::Type::type) { @@ -383,10 +392,8 @@ CleanStatistic(std::pair min_max, LogicalType::Type::type) { return ::std::nullopt; } - // No valid (non-NaN) value was seen: the running min/max still hold the - // +/-infinity seeds (note min > max here, which real data can never produce). - if (min == std::numeric_limits::infinity() && - max == -std::numeric_limits::infinity()) { + // No valid (non-NaN) value was seen: the running min/max still hold the seeds. + if (IsUninitializedMinMax(min, max)) { return ::std::nullopt; } @@ -413,8 +420,7 @@ optional> CleanFloat16Statistic(std::pair min_ return ::std::nullopt; } - if (min == std::numeric_limits::infinity() && - max == -std::numeric_limits::infinity()) { + if (IsUninitializedMinMax(min, max)) { return ::std::nullopt; } From f29d14a23fda98214ca5682b429aa7b291409681 Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Mon, 15 Jun 2026 18:18:19 +0000 Subject: [PATCH 3/6] Remove now-unused Float16Constants::max()/lowest() Switching the Float16 DefaultMin()/DefaultMax() seeds to the +/-infinity constants left Float16Constants::max()/lowest() (and their backing members) without any caller. Drop the dead accessors and members. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cpp/src/parquet/statistics.cc | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index fb8f1582e6f9..b865f5f1ebd8 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -58,8 +58,6 @@ constexpr int value_length(int type_length, const FLBA& value) { return type_len // Static "constants" for normalizing float16 min/max values. These need to be expressed // as pointers because `Float16LogicalType` represents an FLBA. struct Float16Constants { - static constexpr const uint8_t* lowest() { return lowest_.data(); } - static constexpr const uint8_t* max() { return max_.data(); } static constexpr const uint8_t* positive_zero() { return positive_zero_.data(); } static constexpr const uint8_t* negative_zero() { return negative_zero_.data(); } static constexpr const uint8_t* positive_infinity() { @@ -71,9 +69,6 @@ struct Float16Constants { private: using Bytes = std::array; - static constexpr Bytes lowest_ = - std::numeric_limits::lowest().ToLittleEndian(); - static constexpr Bytes max_ = std::numeric_limits::max().ToLittleEndian(); static constexpr Bytes positive_zero_ = (+Float16::FromBits(0)).ToLittleEndian(); static constexpr Bytes negative_zero_ = (-Float16::FromBits(0)).ToLittleEndian(); static constexpr Bytes positive_infinity_ = From 460995f3432b265b4b2857faaeeebdaba0a5e395 Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Mon, 15 Jun 2026 18:54:28 +0000 Subject: [PATCH 4/6] Rename IsUninitializedMinMax to IsInvalidMinMax The helper tests max < min. "Uninitialized" overstated what it checks: a valid min/max pair always has min <= max, so max < min is simply an invalid pair -- either the inverted DefaultMin()/DefaultMax() seeds (no value observed) or an inverted caller-supplied range. Rename to IsInvalidMinMax and reword the comments so the predicate is definitional rather than implying an "iff uninitialized" claim. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cpp/src/parquet/statistics.cc | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index b865f5f1ebd8..d43998ef78f2 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -351,12 +351,14 @@ struct CompareHelper { using ::std::optional; -// True when min/max still hold the DefaultMin()/DefaultMax() seeds, i.e. no -// valid (non-NaN) value was observed. The seeds satisfy DefaultMin > DefaultMax -// -- an ordering real data can never produce -- so this does not depend on the -// specific seed values and stays in sync with DefaultMin()/DefaultMax(). +// A usable min/max pair always satisfies min <= max. The reverse ordering +// (min > max) is produced only by the inverted DefaultMin()/DefaultMax() seeds +// -- left in place when no valid, non-NaN value was observed -- or by an +// inverted caller-supplied range; in either case there is no statistic to emit. +// Testing the ordering keeps this independent of the specific seed values, so it +// cannot drift from DefaultMin()/DefaultMax(). template -bool IsUninitializedMinMax(const T& min, const T& max) { +bool IsInvalidMinMax(const T& min, const T& max) { return max < min; } @@ -387,8 +389,9 @@ CleanStatistic(std::pair min_max, LogicalType::Type::type) { return ::std::nullopt; } - // No valid (non-NaN) value was seen: the running min/max still hold the seeds. - if (IsUninitializedMinMax(min, max)) { + // Discard an inverted min/max: either an empty/all-NaN input left the seeds in + // place, or the supplied range is invalid. A real value always has min <= max. + if (IsInvalidMinMax(min, max)) { return ::std::nullopt; } @@ -415,7 +418,7 @@ optional> CleanFloat16Statistic(std::pair min_ return ::std::nullopt; } - if (IsUninitializedMinMax(min, max)) { + if (IsInvalidMinMax(min, max)) { return ::std::nullopt; } From d9e8e39531599fe1eb395b1bda60397e60146945 Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Mon, 15 Jun 2026 19:14:05 +0000 Subject: [PATCH 5/6] Test infinity columns with an interspersed NaN Extend TestFloatStatistics.Infinities with [+Inf, NaN, +Inf] and [-Inf, NaN, -Inf] cases (float/double/Float16). This is the only case that intersects the new +/-infinity seeds with NaN coalescing on a column whose true min/max is infinite, and it confirms a NaN does not corrupt the infinite min/max. Verified to fail on the pre-fix code (min/max came back as the largest finite value). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cpp/src/parquet/statistics_test.cc | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 5f6ff09cff0c..2e163a564e0a 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -1562,20 +1562,26 @@ void TestFloatStatistics::TestNaNs() { // GH-50182: an all-infinity column must report that infinity as its min/max, // rather than the largest finite value (a finite seed is never displaced by an -// infinity of the same sign). +// infinity of the same sign). A NaN interspersed in such a column must be +// ignored without corrupting the infinite min/max. template void TestFloatStatistics::TestInfinities() { NodePtr node = this->MakeNode("f", Repetition::REQUIRED); ColumnDescriptor descr(node, 0, 0); constexpr c_type inf = std::numeric_limits::infinity(); + constexpr c_type nan = std::numeric_limits::quiet_NaN(); std::array all_pos_inf{inf, inf, inf}; std::array all_neg_inf{-inf, -inf, -inf}; std::array mixed_inf{inf, -inf}; + std::array pos_inf_with_nan{inf, nan, inf}; + std::array neg_inf_with_nan{-inf, nan, -inf}; AssertMinMaxAre(MakeStatistics(&descr), all_pos_inf, inf, inf); AssertMinMaxAre(MakeStatistics(&descr), all_neg_inf, -inf, -inf); AssertMinMaxAre(MakeStatistics(&descr), mixed_inf, -inf, inf); + AssertMinMaxAre(MakeStatistics(&descr), pos_inf_with_nan, inf, inf); + AssertMinMaxAre(MakeStatistics(&descr), neg_inf_with_nan, -inf, -inf); } struct BufferedFloat16 { @@ -1638,16 +1644,22 @@ void TestFloatStatistics::TestInfinities() { using F16 = BufferedFloat16; const auto pos_inf = F16(std::numeric_limits::infinity()); const auto neg_inf = F16(-std::numeric_limits::infinity()); + const auto nan = F16(std::numeric_limits::quiet_NaN()); const auto pinf = FLBA{pos_inf.bytes()}; const auto ninf = FLBA{neg_inf.bytes()}; + const auto fnan = FLBA{nan.bytes()}; std::array all_pos_inf{pinf, pinf, pinf}; std::array all_neg_inf{ninf, ninf, ninf}; std::array mixed_inf{pinf, ninf}; + std::array pos_inf_with_nan{pinf, fnan, pinf}; + std::array neg_inf_with_nan{ninf, fnan, ninf}; AssertMinMaxAre(MakeStatistics(&descr), all_pos_inf, pinf, pinf); AssertMinMaxAre(MakeStatistics(&descr), all_neg_inf, ninf, ninf); AssertMinMaxAre(MakeStatistics(&descr), mixed_inf, ninf, pinf); + AssertMinMaxAre(MakeStatistics(&descr), pos_inf_with_nan, pinf, pinf); + AssertMinMaxAre(MakeStatistics(&descr), neg_inf_with_nan, ninf, ninf); } using FloatingPointTypes = ::testing::Types; From cae7c27f3117dc2f6c2d5a3cc9e01804eb9956bd Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Tue, 16 Jun 2026 17:43:31 +0000 Subject: [PATCH 6/6] Use std::vector instead of fixed-size std::array in TestInfinities Per review feedback, the infinity test inputs don't need the explicit array sizing; std::vector is simpler and reads more clearly in test code. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cpp/src/parquet/statistics_test.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 2e163a564e0a..ba66b8e77c50 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -1571,11 +1571,11 @@ void TestFloatStatistics::TestInfinities() { constexpr c_type inf = std::numeric_limits::infinity(); constexpr c_type nan = std::numeric_limits::quiet_NaN(); - std::array all_pos_inf{inf, inf, inf}; - std::array all_neg_inf{-inf, -inf, -inf}; - std::array mixed_inf{inf, -inf}; - std::array pos_inf_with_nan{inf, nan, inf}; - std::array neg_inf_with_nan{-inf, nan, -inf}; + std::vector all_pos_inf{inf, inf, inf}; + std::vector all_neg_inf{-inf, -inf, -inf}; + std::vector mixed_inf{inf, -inf}; + std::vector pos_inf_with_nan{inf, nan, inf}; + std::vector neg_inf_with_nan{-inf, nan, -inf}; AssertMinMaxAre(MakeStatistics(&descr), all_pos_inf, inf, inf); AssertMinMaxAre(MakeStatistics(&descr), all_neg_inf, -inf, -inf); @@ -1649,11 +1649,11 @@ void TestFloatStatistics::TestInfinities() { const auto ninf = FLBA{neg_inf.bytes()}; const auto fnan = FLBA{nan.bytes()}; - std::array all_pos_inf{pinf, pinf, pinf}; - std::array all_neg_inf{ninf, ninf, ninf}; - std::array mixed_inf{pinf, ninf}; - std::array pos_inf_with_nan{pinf, fnan, pinf}; - std::array neg_inf_with_nan{ninf, fnan, ninf}; + std::vector all_pos_inf{pinf, pinf, pinf}; + std::vector all_neg_inf{ninf, ninf, ninf}; + std::vector mixed_inf{pinf, ninf}; + std::vector pos_inf_with_nan{pinf, fnan, pinf}; + std::vector neg_inf_with_nan{ninf, fnan, ninf}; AssertMinMaxAre(MakeStatistics(&descr), all_pos_inf, pinf, pinf); AssertMinMaxAre(MakeStatistics(&descr), all_neg_inf, ninf, ninf);