Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ public function max(FlatColumn $column) : mixed
return null;
}

if (ColumnPrimitiveType::isString($column) && \mb_check_encoding($this->statistics->max, 'UTF-8')) {
if (ColumnPrimitiveType::isString($column)) {
return $this->statistics->max;
}

Expand All @@ -40,7 +40,7 @@ public function maxValue(FlatColumn $column) : mixed
return null;
}

if (ColumnPrimitiveType::isString($column) && \mb_check_encoding($this->statistics->maxValue, 'UTF-8')) {
if (ColumnPrimitiveType::isString($column)) {
return $this->statistics->maxValue;
}

Expand All @@ -53,7 +53,7 @@ public function min(FlatColumn $column) : mixed
return null;
}

if (ColumnPrimitiveType::isString($column) && \mb_check_encoding($this->statistics->min, 'UTF-8')) {
if (ColumnPrimitiveType::isString($column)) {
return $this->statistics->min;
}

Expand All @@ -66,7 +66,7 @@ public function minValue(FlatColumn $column) : mixed
return null;
}

if (ColumnPrimitiveType::isString($column) && \mb_check_encoding($this->statistics->minValue, 'UTF-8')) {
if (ColumnPrimitiveType::isString($column)) {
return $this->statistics->minValue;
}

Expand Down
22 changes: 19 additions & 3 deletions src/lib/parquet/src/Flow/Parquet/Writer/StatisticsCounter.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
use Flow\Parquet\Data\PlainValuesPacker;
use Flow\Parquet\Dremel\Statistics\Comparator;
use Flow\Parquet\Exception\InvalidArgumentException;
use Flow\Parquet\ParquetFile\Schema\FlatColumn;
use Flow\Parquet\ParquetFile\Schema\{FlatColumn, PhysicalType};
use Flow\Parquet\ParquetFile\Statistics;

final class StatisticsCounter
Expand Down Expand Up @@ -126,8 +126,24 @@ public function toStatistics() : Statistics
$minBuffer = '';
$maxBuffer = '';

(new PlainValuesPacker(new BinaryBufferWriter($minBuffer)))->packValues($this->column, [$this->min()]);
(new PlainValuesPacker(new BinaryBufferWriter($maxBuffer)))->packValues($this->column, [$this->max()]);
$min = $this->min();
$max = $this->max();

if ($min !== null) {
if ($this->column->type() === PhysicalType::BYTE_ARRAY && \is_string($min)) {
(new BinaryBufferWriter($minBuffer))->append($min);
} else {
(new PlainValuesPacker(new BinaryBufferWriter($minBuffer)))->packValues($this->column, [$min]);
}
}

if ($max !== null) {
if ($this->column->type() === PhysicalType::BYTE_ARRAY && \is_string($max)) {
(new BinaryBufferWriter($maxBuffer))->append($max);
} else {
(new PlainValuesPacker(new BinaryBufferWriter($maxBuffer)))->packValues($this->column, [$max]);
}
}

return new Statistics(
max: $maxBuffer !== '' ? $maxBuffer : null,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,53 @@ public function test_reset() : void
self::assertNull($statistics->max());
}

/**
 * Feeds two strings into a counter built for a string column and checks
 * that every min/max field of the resulting Statistics is exactly the raw
 * string, i.e. no extra bytes (such as a length prefix) are added.
 */
public function test_to_statistics_encodes_byte_array_without_length_prefix() : void
{
    $counter = new StatisticsCounter(FlatColumn::string('test_column'));

    foreach (['hello', 'world'] as $value) {
        $counter->add($value);
    }

    $encoded = $counter->toStatistics();

    // Legacy (min/max) and current (minValue/maxValue) fields must agree.
    self::assertSame('hello', $encoded->min);
    self::assertSame('world', $encoded->max);
    self::assertSame('hello', $encoded->minValue);
    self::assertSame('world', $encoded->maxValue);
}

/**
 * Checks that min/max of an int32 column come out as 4-byte packed
 * integers rather than being passed through as-is.
 */
public function test_to_statistics_with_int32_encodes_with_packer() : void
{
    $counter = new StatisticsCounter(FlatColumn::int32('test_column'));

    foreach ([5, 10] as $value) {
        $counter->add($value);
    }

    $encoded = $counter->toStatistics();

    // NOTE(review): pack('l') uses the machine's byte order; confirm this
    // matches the byte order the packer writes on every supported platform.
    $expectedMin = \pack('l', 5);
    $expectedMax = \pack('l', 10);

    self::assertSame($expectedMin, $encoded->min);
    self::assertSame($expectedMax, $encoded->max);
}

/**
 * When only nulls were added, the resulting Statistics must carry no
 * min/max values at all, while still reporting how many nulls were seen.
 */
public function test_to_statistics_with_null_values_does_not_encode() : void
{
    $counter = new StatisticsCounter(FlatColumn::string('test_column'));

    foreach ([null, null] as $value) {
        $counter->add($value);
    }

    $encoded = $counter->toStatistics();

    self::assertNull($encoded->min);
    self::assertNull($encoded->max);
    self::assertNull($encoded->minValue);
    self::assertNull($encoded->maxValue);
    self::assertSame(2, $encoded->nullCount);
}

public function test_values_count_calculation_with_arrays() : void
{
$column = FlatColumn::string('test_column');
Expand Down
Loading