-
Notifications
You must be signed in to change notification settings - Fork 4.1k
GH-50182: [C++][Parquet] Fix truncated min/max statistics for all-infinity floating-point columns #50183
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
GH-50182: [C++][Parquet] Fix truncated min/max statistics for all-infinity floating-point columns #50183
Changes from all commits
8fe0435
7a4ea74
f29d14a
460995f
d9e8e39
cae7c27
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -58,18 +58,23 @@ constexpr int value_length(int type_length, const FLBA& value) { return type_len | |
| // Static "constants" for normalizing float16 min/max values. These need to be expressed | ||
| // as pointers because `Float16LogicalType` represents an FLBA. | ||
| struct Float16Constants { | ||
| static constexpr const uint8_t* lowest() { return lowest_.data(); } | ||
| static constexpr const uint8_t* max() { return max_.data(); } | ||
| static constexpr const uint8_t* positive_zero() { return positive_zero_.data(); } | ||
| static constexpr const uint8_t* negative_zero() { return negative_zero_.data(); } | ||
| static constexpr const uint8_t* positive_infinity() { | ||
| return positive_infinity_.data(); | ||
| } | ||
| static constexpr const uint8_t* negative_infinity() { | ||
| return negative_infinity_.data(); | ||
| } | ||
|
|
||
| private: | ||
| using Bytes = std::array<uint8_t, 2>; | ||
| static constexpr Bytes lowest_ = | ||
| std::numeric_limits<Float16>::lowest().ToLittleEndian(); | ||
| static constexpr Bytes max_ = std::numeric_limits<Float16>::max().ToLittleEndian(); | ||
| static constexpr Bytes positive_zero_ = (+Float16::FromBits(0)).ToLittleEndian(); | ||
| static constexpr Bytes negative_zero_ = (-Float16::FromBits(0)).ToLittleEndian(); | ||
| static constexpr Bytes positive_infinity_ = | ||
| std::numeric_limits<Float16>::infinity().ToLittleEndian(); | ||
| static constexpr Bytes negative_infinity_ = | ||
| (-std::numeric_limits<Float16>::infinity()).ToLittleEndian(); | ||
| }; | ||
|
|
||
| template <typename DType, bool is_signed> | ||
|
|
@@ -79,8 +84,24 @@ struct CompareHelper { | |
| static_assert(!std::is_unsigned<T>::value || std::is_same<T, bool>::value, | ||
| "T is an unsigned numeric"); | ||
|
|
||
| constexpr static T DefaultMin() { return std::numeric_limits<T>::max(); } | ||
| constexpr static T DefaultMax() { return std::numeric_limits<T>::lowest(); } | ||
| // For floating point, seed the running min/max with +/-infinity rather than | ||
| // the largest/smallest finite value: a finite seed is not an identity for | ||
| // min/max once the data contains infinities, so an all-+Inf (resp. all--Inf) | ||
| // column would otherwise never displace the seed and report a finite bound. | ||
| constexpr static T DefaultMin() { | ||
| if constexpr (std::is_floating_point_v<T>) { | ||
| return std::numeric_limits<T>::infinity(); | ||
| } else { | ||
| return std::numeric_limits<T>::max(); | ||
| } | ||
| } | ||
| constexpr static T DefaultMax() { | ||
| if constexpr (std::is_floating_point_v<T>) { | ||
| return -std::numeric_limits<T>::infinity(); | ||
| } else { | ||
| return std::numeric_limits<T>::lowest(); | ||
| } | ||
| } | ||
|
|
||
| // MSVC17 fix, isnan is not overloaded for IntegralType as per C++11 | ||
| // standard requirements. | ||
|
|
@@ -300,8 +321,8 @@ template <> | |
| struct CompareHelper<Float16LogicalType, /*is_signed=*/true> { | ||
| using T = FLBA; | ||
|
|
||
| static T DefaultMin() { return T{Float16Constants::max()}; } | ||
| static T DefaultMax() { return T{Float16Constants::lowest()}; } | ||
| static T DefaultMin() { return T{Float16Constants::positive_infinity()}; } | ||
| static T DefaultMax() { return T{Float16Constants::negative_infinity()}; } | ||
|
|
||
| static T Coalesce(T val, T fallback) { | ||
| return (val.ptr == nullptr || Float16::FromLittleEndian(val.ptr).is_nan()) ? fallback | ||
|
|
@@ -330,6 +351,17 @@ struct CompareHelper<Float16LogicalType, /*is_signed=*/true> { | |
|
|
||
| using ::std::optional; | ||
|
|
||
| // A usable min/max pair always satisfies min <= max. The reverse ordering | ||
| // (min > max) is produced only by the inverted DefaultMin()/DefaultMax() seeds | ||
| // -- left in place when no valid, non-NaN value was observed -- or by an | ||
| // inverted caller-supplied range; in either case there is no statistic to emit. | ||
| // Testing the ordering keeps this independent of the specific seed values, so it | ||
| // cannot drift from DefaultMin()/DefaultMax(). | ||
| template <typename T> | ||
| bool IsInvalidMinMax(const T& min, const T& max) { | ||
| return max < min; | ||
| } | ||
|
clee704 marked this conversation as resolved.
|
||
|
|
||
| template <typename T> | ||
| ::arrow::enable_if_t<std::is_integral<T>::value, optional<std::pair<T, T>>> | ||
| CleanStatistic(std::pair<T, T> min_max, LogicalType::Type::type) { | ||
|
|
@@ -357,7 +389,9 @@ CleanStatistic(std::pair<T, T> min_max, LogicalType::Type::type) { | |
| return ::std::nullopt; | ||
| } | ||
|
|
||
| if (min == std::numeric_limits<T>::max() && max == std::numeric_limits<T>::lowest()) { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was a logical AND, but
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In practice there's no change or this is better. When min/max is set, both are set always and it's always min <= max. This change also catches an invalid SetMinMax call that sets min > max. So it's a better guard than before. |
||
| // Discard an inverted min/max: either an empty/all-NaN input left the seeds in | ||
| // place, or the supplied range is invalid. A real value always has min <= max. | ||
| if (IsInvalidMinMax(min, max)) { | ||
| return ::std::nullopt; | ||
| } | ||
|
|
||
|
|
@@ -384,8 +418,7 @@ optional<std::pair<FLBA, FLBA>> CleanFloat16Statistic(std::pair<FLBA, FLBA> min_ | |
| return ::std::nullopt; | ||
| } | ||
|
|
||
| if (min == std::numeric_limits<Float16>::max() && | ||
| max == std::numeric_limits<Float16>::lowest()) { | ||
| if (IsInvalidMinMax(min, max)) { | ||
| return ::std::nullopt; | ||
| } | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.