diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index 09c48ef0ff6..d43998ef78f 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -58,18 +58,23 @@ constexpr int value_length(int type_length, const FLBA& value) { return type_len // Static "constants" for normalizing float16 min/max values. These need to be expressed // as pointers because `Float16LogicalType` represents an FLBA. struct Float16Constants { - static constexpr const uint8_t* lowest() { return lowest_.data(); } - static constexpr const uint8_t* max() { return max_.data(); } static constexpr const uint8_t* positive_zero() { return positive_zero_.data(); } static constexpr const uint8_t* negative_zero() { return negative_zero_.data(); } + static constexpr const uint8_t* positive_infinity() { + return positive_infinity_.data(); + } + static constexpr const uint8_t* negative_infinity() { + return negative_infinity_.data(); + } private: using Bytes = std::array; - static constexpr Bytes lowest_ = - std::numeric_limits::lowest().ToLittleEndian(); - static constexpr Bytes max_ = std::numeric_limits::max().ToLittleEndian(); static constexpr Bytes positive_zero_ = (+Float16::FromBits(0)).ToLittleEndian(); static constexpr Bytes negative_zero_ = (-Float16::FromBits(0)).ToLittleEndian(); + static constexpr Bytes positive_infinity_ = + std::numeric_limits::infinity().ToLittleEndian(); + static constexpr Bytes negative_infinity_ = + (-std::numeric_limits::infinity()).ToLittleEndian(); }; template @@ -79,8 +84,24 @@ struct CompareHelper { static_assert(!std::is_unsigned::value || std::is_same::value, "T is an unsigned numeric"); - constexpr static T DefaultMin() { return std::numeric_limits::max(); } - constexpr static T DefaultMax() { return std::numeric_limits::lowest(); } + // For floating point, seed the running min/max with +/-infinity rather than + // the largest/smallest finite value: a finite seed is not an identity for + // min/max once the data contains infinities, so an all-+Inf (resp. all--Inf) + // column would otherwise never displace the seed and report a finite bound. + constexpr static T DefaultMin() { + if constexpr (std::is_floating_point_v) { + return std::numeric_limits::infinity(); + } else { + return std::numeric_limits::max(); + } + } + constexpr static T DefaultMax() { + if constexpr (std::is_floating_point_v) { + return -std::numeric_limits::infinity(); + } else { + return std::numeric_limits::lowest(); + } + } // MSVC17 fix, isnan is not overloaded for IntegralType as per C++11 // standard requirements. @@ -300,8 +321,8 @@ template <> struct CompareHelper { using T = FLBA; - static T DefaultMin() { return T{Float16Constants::max()}; } - static T DefaultMax() { return T{Float16Constants::lowest()}; } + static T DefaultMin() { return T{Float16Constants::positive_infinity()}; } + static T DefaultMax() { return T{Float16Constants::negative_infinity()}; } static T Coalesce(T val, T fallback) { return (val.ptr == nullptr || Float16::FromLittleEndian(val.ptr).is_nan()) ? fallback @@ -330,6 +351,17 @@ struct CompareHelper { using ::std::optional; +// A usable min/max pair always satisfies min <= max. The reverse ordering +// (min > max) is produced only by the inverted DefaultMin()/DefaultMax() seeds +// -- left in place when no valid, non-NaN value was observed -- or by an +// inverted caller-supplied range; in either case there is no statistic to emit. +// Testing the ordering keeps this independent of the specific seed values, so it +// cannot drift from DefaultMin()/DefaultMax(). +template +bool IsInvalidMinMax(const T& min, const T& max) { + return max < min; +} + template ::arrow::enable_if_t::value, optional>> CleanStatistic(std::pair min_max, LogicalType::Type::type) { @@ -357,7 +389,9 @@ CleanStatistic(std::pair min_max, LogicalType::Type::type) { return ::std::nullopt; } - if (min == std::numeric_limits::max() && max == std::numeric_limits::lowest()) { + // Discard an inverted min/max: either an empty/all-NaN input left the seeds in + // place, or the supplied range is invalid. A real value always has min <= max. + if (IsInvalidMinMax(min, max)) { return ::std::nullopt; } @@ -384,8 +418,7 @@ optional> CleanFloat16Statistic(std::pair min_ return ::std::nullopt; } - if (min == std::numeric_limits::max() && - max == std::numeric_limits::lowest()) { + if (IsInvalidMinMax(min, max)) { return ::std::nullopt; } diff --git a/cpp/src/parquet/statistics_test.cc b/cpp/src/parquet/statistics_test.cc index 905502cb0a5..ba66b8e77c5 100644 --- a/cpp/src/parquet/statistics_test.cc +++ b/cpp/src/parquet/statistics_test.cc @@ -1485,6 +1485,7 @@ class TestFloatStatistics : public ::testing::Test { } void TestNaNs(); + void TestInfinities(); protected: std::vector data_buf_; @@ -1559,6 +1560,30 @@ void TestFloatStatistics::TestNaNs() { valid_bitmap_no_nans); } +// GH-50182: an all-infinity column must report that infinity as its min/max, +// rather than the largest finite value (a finite seed is never displaced by an +// infinity of the same sign). A NaN interspersed in such a column must be +// ignored without corrupting the infinite min/max. +template +void TestFloatStatistics::TestInfinities() { + NodePtr node = this->MakeNode("f", Repetition::REQUIRED); + ColumnDescriptor descr(node, 0, 0); + + constexpr c_type inf = std::numeric_limits::infinity(); + constexpr c_type nan = std::numeric_limits::quiet_NaN(); + std::vector all_pos_inf{inf, inf, inf}; + std::vector all_neg_inf{-inf, -inf, -inf}; + std::vector mixed_inf{inf, -inf}; + std::vector pos_inf_with_nan{inf, nan, inf}; + std::vector neg_inf_with_nan{-inf, nan, -inf}; + + AssertMinMaxAre(MakeStatistics(&descr), all_pos_inf, inf, inf); + AssertMinMaxAre(MakeStatistics(&descr), all_neg_inf, -inf, -inf); + AssertMinMaxAre(MakeStatistics(&descr), mixed_inf, -inf, inf); + AssertMinMaxAre(MakeStatistics(&descr), pos_inf_with_nan, inf, inf); + AssertMinMaxAre(MakeStatistics(&descr), neg_inf_with_nan, -inf, -inf); +} + struct BufferedFloat16 { explicit BufferedFloat16(Float16 f16) : f16(f16) { this->f16.ToLittleEndian(bytes_.data()); @@ -1611,12 +1636,39 @@ void TestFloatStatistics::TestNaNs() { valid_bitmap_no_nans); } +template <> +void TestFloatStatistics::TestInfinities() { + NodePtr node = this->MakeNode("f", Repetition::REQUIRED); + ColumnDescriptor descr(node, 0, 0); + + using F16 = BufferedFloat16; + const auto pos_inf = F16(std::numeric_limits::infinity()); + const auto neg_inf = F16(-std::numeric_limits::infinity()); + const auto nan = F16(std::numeric_limits::quiet_NaN()); + const auto pinf = FLBA{pos_inf.bytes()}; + const auto ninf = FLBA{neg_inf.bytes()}; + const auto fnan = FLBA{nan.bytes()}; + + std::vector all_pos_inf{pinf, pinf, pinf}; + std::vector all_neg_inf{ninf, ninf, ninf}; + std::vector mixed_inf{pinf, ninf}; + std::vector pos_inf_with_nan{pinf, fnan, pinf}; + std::vector neg_inf_with_nan{ninf, fnan, ninf}; + + AssertMinMaxAre(MakeStatistics(&descr), all_pos_inf, pinf, pinf); + AssertMinMaxAre(MakeStatistics(&descr), all_neg_inf, ninf, ninf); + AssertMinMaxAre(MakeStatistics(&descr), mixed_inf, ninf, pinf); + AssertMinMaxAre(MakeStatistics(&descr), pos_inf_with_nan, pinf, pinf); + AssertMinMaxAre(MakeStatistics(&descr), neg_inf_with_nan, ninf, ninf); +} + using FloatingPointTypes = ::testing::Types; TYPED_TEST_SUITE(TestFloatStatistics, FloatingPointTypes); TYPED_TEST(TestFloatStatistics, NegativeZeros) { this->TestNegativeZeroes(); } TYPED_TEST(TestFloatStatistics, NaNs) { this->TestNaNs(); } +TYPED_TEST(TestFloatStatistics, Infinities) { this->TestInfinities(); } // ARROW-7376 TEST(TestStatisticsSortOrderFloatNaN, NaNAndNullsInfiniteLoop) {