From 1ab923c16b46fade36fb52497e34f7615340fa02 Mon Sep 17 00:00:00 2001 From: Norbert Orzechowicz Date: Thu, 5 Mar 2026 09:47:28 -0600 Subject: [PATCH] fix: parquet statistics don't save string length into min/max statistics --- .../ParquetFile/RowGroup/StatisticsReader.php | 8 ++-- .../Flow/Parquet/Writer/StatisticsCounter.php | 22 +++++++-- .../Unit/Writer/StatisticsCounterTest.php | 47 +++++++++++++++++++ 3 files changed, 70 insertions(+), 7 deletions(-) diff --git a/src/lib/parquet/src/Flow/Parquet/ParquetFile/RowGroup/StatisticsReader.php b/src/lib/parquet/src/Flow/Parquet/ParquetFile/RowGroup/StatisticsReader.php index 59ccc932f5..65a408d70b 100644 --- a/src/lib/parquet/src/Flow/Parquet/ParquetFile/RowGroup/StatisticsReader.php +++ b/src/lib/parquet/src/Flow/Parquet/ParquetFile/RowGroup/StatisticsReader.php @@ -27,7 +27,7 @@ public function max(FlatColumn $column) : mixed return null; } - if (ColumnPrimitiveType::isString($column) && \mb_check_encoding($this->statistics->max, 'UTF-8')) { + if (ColumnPrimitiveType::isString($column)) { return $this->statistics->max; } @@ -40,7 +40,7 @@ public function maxValue(FlatColumn $column) : mixed return null; } - if (ColumnPrimitiveType::isString($column) && \mb_check_encoding($this->statistics->maxValue, 'UTF-8')) { + if (ColumnPrimitiveType::isString($column)) { return $this->statistics->maxValue; } @@ -53,7 +53,7 @@ public function min(FlatColumn $column) : mixed return null; } - if (ColumnPrimitiveType::isString($column) && \mb_check_encoding($this->statistics->min, 'UTF-8')) { + if (ColumnPrimitiveType::isString($column)) { return $this->statistics->min; } @@ -66,7 +66,7 @@ public function minValue(FlatColumn $column) : mixed return null; } - if (ColumnPrimitiveType::isString($column) && \mb_check_encoding($this->statistics->minValue, 'UTF-8')) { + if (ColumnPrimitiveType::isString($column)) { return $this->statistics->minValue; } diff --git a/src/lib/parquet/src/Flow/Parquet/Writer/StatisticsCounter.php 
b/src/lib/parquet/src/Flow/Parquet/Writer/StatisticsCounter.php index 58ded18d02..4867d13e4b 100644 --- a/src/lib/parquet/src/Flow/Parquet/Writer/StatisticsCounter.php +++ b/src/lib/parquet/src/Flow/Parquet/Writer/StatisticsCounter.php @@ -9,7 +9,7 @@ use Flow\Parquet\Data\PlainValuesPacker; use Flow\Parquet\Dremel\Statistics\Comparator; use Flow\Parquet\Exception\InvalidArgumentException; -use Flow\Parquet\ParquetFile\Schema\FlatColumn; +use Flow\Parquet\ParquetFile\Schema\{FlatColumn, PhysicalType}; use Flow\Parquet\ParquetFile\Statistics; final class StatisticsCounter @@ -126,8 +126,24 @@ public function toStatistics() : Statistics $minBuffer = ''; $maxBuffer = ''; - (new PlainValuesPacker(new BinaryBufferWriter($minBuffer)))->packValues($this->column, [$this->min()]); - (new PlainValuesPacker(new BinaryBufferWriter($maxBuffer)))->packValues($this->column, [$this->max()]); + $min = $this->min(); + $max = $this->max(); + + if ($min !== null) { + if ($this->column->type() === PhysicalType::BYTE_ARRAY && \is_string($min)) { + (new BinaryBufferWriter($minBuffer))->append($min); + } else { + (new PlainValuesPacker(new BinaryBufferWriter($minBuffer)))->packValues($this->column, [$min]); + } + } + + if ($max !== null) { + if ($this->column->type() === PhysicalType::BYTE_ARRAY && \is_string($max)) { + (new BinaryBufferWriter($maxBuffer))->append($max); + } else { + (new PlainValuesPacker(new BinaryBufferWriter($maxBuffer)))->packValues($this->column, [$max]); + } + } return new Statistics( max: $maxBuffer !== '' ? 
$maxBuffer : null, diff --git a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php index 988e85f926..5873cdf639 100644 --- a/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php +++ b/src/lib/parquet/tests/Flow/Parquet/Tests/Unit/Writer/StatisticsCounterTest.php @@ -463,6 +463,53 @@ public function test_reset() : void self::assertNull($statistics->max()); } + public function test_to_statistics_encodes_byte_array_without_length_prefix() : void + { + $column = FlatColumn::string('test_column'); + $statistics = new StatisticsCounter($column); + + $statistics->add('hello'); + $statistics->add('world'); + + $result = $statistics->toStatistics(); + + self::assertSame('hello', $result->min); + self::assertSame('world', $result->max); + self::assertSame('hello', $result->minValue); + self::assertSame('world', $result->maxValue); + } + + public function test_to_statistics_with_int32_encodes_with_packer() : void + { + $column = FlatColumn::int32('test_column'); + $statistics = new StatisticsCounter($column); + + $statistics->add(5); + $statistics->add(10); + + $result = $statistics->toStatistics(); + + self::assertSame(\pack('l', 5), $result->min); + self::assertSame(\pack('l', 10), $result->max); + } + + public function test_to_statistics_with_null_values_does_not_encode() : void + { + $column = FlatColumn::string('test_column'); + $statistics = new StatisticsCounter($column); + + $statistics->add(null); + $statistics->add(null); + + $result = $statistics->toStatistics(); + + self::assertNull($result->min); + self::assertNull($result->max); + self::assertNull($result->minValue); + self::assertNull($result->maxValue); + self::assertSame(2, $result->nullCount); + } + public function test_values_count_calculation_with_arrays() : void { $column = FlatColumn::string('test_column');