Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 45 additions & 12 deletions cpp/src/parquet/statistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -58,18 +58,23 @@ constexpr int value_length(int type_length, const FLBA& value) { return type_len
// Static "constants" for normalizing float16 min/max values. These need to be expressed
// as pointers because `Float16LogicalType` represents an FLBA.
struct Float16Constants {
static constexpr const uint8_t* lowest() { return lowest_.data(); }
static constexpr const uint8_t* max() { return max_.data(); }
static constexpr const uint8_t* positive_zero() { return positive_zero_.data(); }
static constexpr const uint8_t* negative_zero() { return negative_zero_.data(); }
static constexpr const uint8_t* positive_infinity() {
return positive_infinity_.data();
}
static constexpr const uint8_t* negative_infinity() {
return negative_infinity_.data();
}

private:
using Bytes = std::array<uint8_t, 2>;
static constexpr Bytes lowest_ =
std::numeric_limits<Float16>::lowest().ToLittleEndian();
static constexpr Bytes max_ = std::numeric_limits<Float16>::max().ToLittleEndian();
static constexpr Bytes positive_zero_ = (+Float16::FromBits(0)).ToLittleEndian();
static constexpr Bytes negative_zero_ = (-Float16::FromBits(0)).ToLittleEndian();
static constexpr Bytes positive_infinity_ =
std::numeric_limits<Float16>::infinity().ToLittleEndian();
static constexpr Bytes negative_infinity_ =
(-std::numeric_limits<Float16>::infinity()).ToLittleEndian();
};

template <typename DType, bool is_signed>
Expand All @@ -79,8 +84,24 @@ struct CompareHelper {
static_assert(!std::is_unsigned<T>::value || std::is_same<T, bool>::value,
"T is an unsigned numeric");

constexpr static T DefaultMin() { return std::numeric_limits<T>::max(); }
constexpr static T DefaultMax() { return std::numeric_limits<T>::lowest(); }
// For floating point, seed the running min/max with +/-infinity rather than
// the largest/smallest finite value: a finite seed is not an identity for
// min/max once the data contains infinities, so an all-+Inf (resp. all--Inf)
// column would otherwise never displace the seed and report a finite bound.
constexpr static T DefaultMin() {
if constexpr (std::is_floating_point_v<T>) {
return std::numeric_limits<T>::infinity();
} else {
return std::numeric_limits<T>::max();
}
}
constexpr static T DefaultMax() {
if constexpr (std::is_floating_point_v<T>) {
return -std::numeric_limits<T>::infinity();
} else {
return std::numeric_limits<T>::lowest();
}
}
Comment thread
clee704 marked this conversation as resolved.

// MSVC17 fix, isnan is not overloaded for IntegralType as per C++11
// standard requirements.
Expand Down Expand Up @@ -300,8 +321,8 @@ template <>
struct CompareHelper<Float16LogicalType, /*is_signed=*/true> {
using T = FLBA;

static T DefaultMin() { return T{Float16Constants::max()}; }
static T DefaultMax() { return T{Float16Constants::lowest()}; }
static T DefaultMin() { return T{Float16Constants::positive_infinity()}; }
static T DefaultMax() { return T{Float16Constants::negative_infinity()}; }

static T Coalesce(T val, T fallback) {
return (val.ptr == nullptr || Float16::FromLittleEndian(val.ptr).is_nan()) ? fallback
Expand Down Expand Up @@ -330,6 +351,17 @@ struct CompareHelper<Float16LogicalType, /*is_signed=*/true> {

using ::std::optional;

// A usable min/max pair always satisfies min <= max. The reverse ordering
// (min > max) is produced only by the inverted DefaultMin()/DefaultMax() seeds
// -- left in place when no valid, non-NaN value was observed -- or by an
// inverted caller-supplied range; in either case there is no statistic to emit.
// Testing the ordering keeps this independent of the specific seed values, so it
// cannot drift from DefaultMin()/DefaultMax().
template <typename T>
bool IsInvalidMinMax(const T& min, const T& max) {
return max < min;
}
Comment thread
clee704 marked this conversation as resolved.

template <typename T>
::arrow::enable_if_t<std::is_integral<T>::value, optional<std::pair<T, T>>>
CleanStatistic(std::pair<T, T> min_max, LogicalType::Type::type) {
Expand Down Expand Up @@ -357,7 +389,9 @@ CleanStatistic(std::pair<T, T> min_max, LogicalType::Type::type) {
return ::std::nullopt;
}

if (min == std::numeric_limits<T>::max() && max == std::numeric_limits<T>::lowest()) {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was a logical AND, but IsInvalidMinMax will return true if either condition holds. This looks like a behavior change, does it matter in practice?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In practice there's no change or this is better. When min/max is set, both are set always and it's always min <= max. This change also catches an invalid SetMinMax call that sets min > max. So it's a better guard than before.

// Discard an inverted min/max: either an empty/all-NaN input left the seeds in
// place, or the supplied range is invalid. A real value always has min <= max.
if (IsInvalidMinMax(min, max)) {
return ::std::nullopt;
}

Expand All @@ -384,8 +418,7 @@ optional<std::pair<FLBA, FLBA>> CleanFloat16Statistic(std::pair<FLBA, FLBA> min_
return ::std::nullopt;
}

if (min == std::numeric_limits<Float16>::max() &&
max == std::numeric_limits<Float16>::lowest()) {
if (IsInvalidMinMax(min, max)) {
return ::std::nullopt;
}

Expand Down
52 changes: 52 additions & 0 deletions cpp/src/parquet/statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1485,6 +1485,7 @@ class TestFloatStatistics : public ::testing::Test {
}

void TestNaNs();
void TestInfinities();

protected:
std::vector<uint8_t> data_buf_;
Expand Down Expand Up @@ -1559,6 +1560,30 @@ void TestFloatStatistics<T>::TestNaNs() {
valid_bitmap_no_nans);
}

// GH-50182: an all-infinity column must report that infinity as its min/max,
// rather than the largest finite value (a finite seed is never displaced by an
// infinity of the same sign). A NaN interspersed in such a column must be
// ignored without corrupting the infinite min/max.
template <typename T>
void TestFloatStatistics<T>::TestInfinities() {
NodePtr node = this->MakeNode("f", Repetition::REQUIRED);
ColumnDescriptor descr(node, 0, 0);

constexpr c_type inf = std::numeric_limits<c_type>::infinity();
constexpr c_type nan = std::numeric_limits<c_type>::quiet_NaN();
std::vector<c_type> all_pos_inf{inf, inf, inf};
std::vector<c_type> all_neg_inf{-inf, -inf, -inf};
std::vector<c_type> mixed_inf{inf, -inf};
std::vector<c_type> pos_inf_with_nan{inf, nan, inf};
std::vector<c_type> neg_inf_with_nan{-inf, nan, -inf};

AssertMinMaxAre(MakeStatistics<ParquetType>(&descr), all_pos_inf, inf, inf);
AssertMinMaxAre(MakeStatistics<ParquetType>(&descr), all_neg_inf, -inf, -inf);
AssertMinMaxAre(MakeStatistics<ParquetType>(&descr), mixed_inf, -inf, inf);
AssertMinMaxAre(MakeStatistics<ParquetType>(&descr), pos_inf_with_nan, inf, inf);
AssertMinMaxAre(MakeStatistics<ParquetType>(&descr), neg_inf_with_nan, -inf, -inf);
}

struct BufferedFloat16 {
explicit BufferedFloat16(Float16 f16) : f16(f16) {
this->f16.ToLittleEndian(bytes_.data());
Expand Down Expand Up @@ -1611,12 +1636,39 @@ void TestFloatStatistics<Float16LogicalType>::TestNaNs() {
valid_bitmap_no_nans);
}

template <>
void TestFloatStatistics<Float16LogicalType>::TestInfinities() {
NodePtr node = this->MakeNode("f", Repetition::REQUIRED);
ColumnDescriptor descr(node, 0, 0);

using F16 = BufferedFloat16;
const auto pos_inf = F16(std::numeric_limits<Float16>::infinity());
const auto neg_inf = F16(-std::numeric_limits<Float16>::infinity());
const auto nan = F16(std::numeric_limits<Float16>::quiet_NaN());
const auto pinf = FLBA{pos_inf.bytes()};
const auto ninf = FLBA{neg_inf.bytes()};
const auto fnan = FLBA{nan.bytes()};

std::vector<FLBA> all_pos_inf{pinf, pinf, pinf};
std::vector<FLBA> all_neg_inf{ninf, ninf, ninf};
std::vector<FLBA> mixed_inf{pinf, ninf};
std::vector<FLBA> pos_inf_with_nan{pinf, fnan, pinf};
std::vector<FLBA> neg_inf_with_nan{ninf, fnan, ninf};

AssertMinMaxAre(MakeStatistics<ParquetType>(&descr), all_pos_inf, pinf, pinf);
AssertMinMaxAre(MakeStatistics<ParquetType>(&descr), all_neg_inf, ninf, ninf);
AssertMinMaxAre(MakeStatistics<ParquetType>(&descr), mixed_inf, ninf, pinf);
AssertMinMaxAre(MakeStatistics<ParquetType>(&descr), pos_inf_with_nan, pinf, pinf);
AssertMinMaxAre(MakeStatistics<ParquetType>(&descr), neg_inf_with_nan, ninf, ninf);
}

using FloatingPointTypes = ::testing::Types<FloatType, DoubleType, Float16LogicalType>;

TYPED_TEST_SUITE(TestFloatStatistics, FloatingPointTypes);

TYPED_TEST(TestFloatStatistics, NegativeZeros) { this->TestNegativeZeroes(); }
TYPED_TEST(TestFloatStatistics, NaNs) { this->TestNaNs(); }
TYPED_TEST(TestFloatStatistics, Infinities) { this->TestInfinities(); }

// ARROW-7376
TEST(TestStatisticsSortOrderFloatNaN, NaNAndNullsInfiniteLoop) {
Expand Down
Loading