From 0bac3885cf0f14a8b496f8066dab4ef570635755 Mon Sep 17 00:00:00 2001 From: Alex Stephen Date: Mon, 27 Apr 2026 12:57:26 -0700 Subject: [PATCH 1/5] Type promotion - metadata file reading --- pyiceberg/conversions.py | 4 +++ tests/expressions/test_evaluator.py | 44 +++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/pyiceberg/conversions.py b/pyiceberg/conversions.py index 0311e76d89..d8a280dc9f 100644 --- a/pyiceberg/conversions.py +++ b/pyiceberg/conversions.py @@ -350,6 +350,8 @@ def _(_: PrimitiveType, b: bytes) -> int: @from_bytes.register(TimestampNanoType) @from_bytes.register(TimestamptzNanoType) def _(_: PrimitiveType, b: bytes) -> int: + if len(b) == 4: + return _INT_STRUCT.unpack(b)[0] return _LONG_STRUCT.unpack(b)[0] @@ -360,6 +362,8 @@ def _(_: FloatType, b: bytes) -> float: @from_bytes.register(DoubleType) def _(_: DoubleType, b: bytes) -> float: + if len(b) == 4: + return _FLOAT_STRUCT.unpack(b)[0] return _DOUBLE_STRUCT.unpack(b)[0] diff --git a/tests/expressions/test_evaluator.py b/tests/expressions/test_evaluator.py index 03a33e4c78..4b4629ea21 100644 --- a/tests/expressions/test_evaluator.py +++ b/tests/expressions/test_evaluator.py @@ -50,6 +50,7 @@ FloatType, IcebergType, IntegerType, + LongType, NestedField, PrimitiveType, StringType, @@ -1463,3 +1464,46 @@ def test_strict_integer_not_in(strict_data_file_schema: Schema, strict_data_file should_read = _StrictMetricsEvaluator(strict_data_file_schema, NotIn("no_nulls", {"abc", "def"})).eval(strict_data_file_1) assert not should_read, "Should not match: no_nulls field does not have bounds" + + +def test_inclusive_metrics_evaluator_with_type_promotion_crash() -> None: + # Schema defines 'id' as LongType (evolved state) + schema = Schema(NestedField(1, "id", LongType(), required=True)) + + # Historical manifest contains 4-byte integer bounds + data_file = DataFile.from_args( + file_path="file_1.parquet", + file_format=FileFormat.PARQUET, + partition={}, + record_count=100, + file_size_in_bytes=1024, + lower_bounds={1: to_bytes(IntegerType(), 30)}, + upper_bounds={1: to_bytes(IntegerType(), 79)}, + ) + + # Predicate: id > 100 + # Decodes 4-byte bounds correctly and skips the file + evaluator_pruning = _InclusiveMetricsEvaluator(schema, GreaterThan("id", 100)) + assert not evaluator_pruning.eval(data_file) + + +def test_inclusive_metrics_evaluator_with_float_to_double_promotion() -> None: + # Schema defines 'val' as DoubleType (evolved state) + schema = Schema(NestedField(1, "val", DoubleType(), required=True)) + + # Historical manifest contains 4-byte float bounds + data_file = DataFile.from_args( + file_path="file_1.parquet", + file_format=FileFormat.PARQUET, + partition={}, + record_count=100, + file_size_in_bytes=1024, + lower_bounds={1: to_bytes(FloatType(), 30.0)}, + upper_bounds={1: to_bytes(FloatType(), 79.0)}, + ) + + # Predicate: val < 50.0 + evaluator = _InclusiveMetricsEvaluator(schema, LessThan("val", 50.0)) + + # Should not crash and should correctly identify that the file might match + assert evaluator.eval(data_file) From 095fd76f98879be1becef9d315f57a33b893e923 Mon Sep 17 00:00:00 2001 From: Alex Stephen Date: Mon, 27 Apr 2026 13:45:49 -0700 Subject: [PATCH 2/5] linter fix --- pyiceberg/conversions.py | 4 --- pyiceberg/expressions/visitors.py | 43 +++++++++++++++++++++---------- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/pyiceberg/conversions.py b/pyiceberg/conversions.py index d8a280dc9f..0311e76d89 100644 --- a/pyiceberg/conversions.py +++ b/pyiceberg/conversions.py @@ -350,8 +350,6 @@ def _(_: PrimitiveType, b: bytes) -> int: @from_bytes.register(TimestampNanoType) @from_bytes.register(TimestamptzNanoType) def _(_: PrimitiveType, b: bytes) -> int: - if len(b) == 4: - return _INT_STRUCT.unpack(b)[0] return _LONG_STRUCT.unpack(b)[0] @@ -362,8 +360,6 @@ def _(_: FloatType, b: bytes) -> float: @from_bytes.register(DoubleType) def _(_: DoubleType, b: bytes) -> float: - if len(b) == 4: - return _FLOAT_STRUCT.unpack(b)[0] return _DOUBLE_STRUCT.unpack(b)[0] diff --git a/pyiceberg/expressions/visitors.py b/pyiceberg/expressions/visitors.py index 0beb0f3df0..04f0c3f971 100644 --- a/pyiceberg/expressions/visitors.py +++ b/pyiceberg/expressions/visitors.py @@ -59,12 +59,16 @@ from pyiceberg.schema import Schema from pyiceberg.typedef import EMPTY_DICT, L, LiteralValue, Record, StructProtocol from pyiceberg.types import ( + DateType, DoubleType, FloatType, IcebergType, + IntegerType, + LongType, NestedField, PrimitiveType, StructType, + TimestampNanoType, TimestampType, TimestamptzType, ) @@ -73,6 +77,17 @@ T = TypeVar("T") +def _from_bytes_with_promotion(field_type: PrimitiveType, b: bytes) -> Any: + if len(b) == 4: + if isinstance(field_type, LongType): + return from_bytes(IntegerType(), b) + elif isinstance(field_type, DoubleType): + return from_bytes(FloatType(), b) + elif isinstance(field_type, (TimestampType, TimestampNanoType)): + return from_bytes(DateType(), b) + return from_bytes(field_type, b) + + class BooleanExpressionVisitor(Generic[T], ABC): @abstractmethod def visit_true(self) -> T: @@ -1242,7 +1257,7 @@ def visit_less_than(self, term: BoundTerm, literal: LiteralValue) -> bool: raise ValueError(f"Expected PrimitiveType: {field.field_type}") if lower_bound_bytes := self.lower_bounds.get(field_id): - lower_bound = from_bytes(field.field_type, lower_bound_bytes) + lower_bound = _from_bytes_with_promotion(field.field_type, lower_bound_bytes) if self._is_nan(lower_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. @@ -1264,7 +1279,7 @@ def visit_less_than_or_equal(self, term: BoundTerm, literal: LiteralValue) -> bo raise ValueError(f"Expected PrimitiveType: {field.field_type}") if lower_bound_bytes := self.lower_bounds.get(field_id): - lower_bound = from_bytes(field.field_type, lower_bound_bytes) + lower_bound = _from_bytes_with_promotion(field.field_type, lower_bound_bytes) if self._is_nan(lower_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH @@ -1285,7 +1300,7 @@ def visit_greater_than(self, term: BoundTerm, literal: LiteralValue) -> bool: raise ValueError(f"Expected PrimitiveType: {field.field_type}") if upper_bound_bytes := self.upper_bounds.get(field_id): - upper_bound = from_bytes(field.field_type, upper_bound_bytes) + upper_bound = _from_bytes_with_promotion(field.field_type, upper_bound_bytes) if upper_bound <= literal.value: if self._is_nan(upper_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. @@ -1306,7 +1321,7 @@ def visit_greater_than_or_equal(self, term: BoundTerm, literal: LiteralValue) -> raise ValueError(f"Expected PrimitiveType: {field.field_type}") if upper_bound_bytes := self.upper_bounds.get(field_id): - upper_bound = from_bytes(field.field_type, upper_bound_bytes) + upper_bound = _from_bytes_with_promotion(field.field_type, upper_bound_bytes) if upper_bound < literal.value: if self._is_nan(upper_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. @@ -1327,7 +1342,7 @@ def visit_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: raise ValueError(f"Expected PrimitiveType: {field.field_type}") if lower_bound_bytes := self.lower_bounds.get(field_id): - lower_bound = from_bytes(field.field_type, lower_bound_bytes) + lower_bound = _from_bytes_with_promotion(field.field_type, lower_bound_bytes) if self._is_nan(lower_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH @@ -1336,7 +1351,7 @@ def visit_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: return ROWS_CANNOT_MATCH if upper_bound_bytes := self.upper_bounds.get(field_id): - upper_bound = from_bytes(field.field_type, upper_bound_bytes) + upper_bound = _from_bytes_with_promotion(field.field_type, upper_bound_bytes) if self._is_nan(upper_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH @@ -1364,22 +1379,22 @@ def visit_in(self, term: BoundTerm, literals: set[L]) -> bool: raise ValueError(f"Expected PrimitiveType: {field.field_type}") if lower_bound_bytes := self.lower_bounds.get(field_id): - lower_bound = from_bytes(field.field_type, lower_bound_bytes) + lower_bound = _from_bytes_with_promotion(field.field_type, lower_bound_bytes) if self._is_nan(lower_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH - literals = {lit for lit in literals if lower_bound <= lit} # type: ignore[operator] + literals = {lit for lit in literals if lower_bound <= lit} if len(literals) == 0: return ROWS_CANNOT_MATCH if upper_bound_bytes := self.upper_bounds.get(field_id): - upper_bound = from_bytes(field.field_type, upper_bound_bytes) + upper_bound = _from_bytes_with_promotion(field.field_type, upper_bound_bytes) # this is different from Java, here NaN is always larger if self._is_nan(upper_bound): return ROWS_MIGHT_MATCH - literals = {lit for lit in literals if upper_bound >= lit} # type: ignore[operator] + literals = {lit for lit in literals if upper_bound >= lit} if len(literals) == 0: return ROWS_CANNOT_MATCH @@ -1404,14 +1419,14 @@ def visit_starts_with(self, term: BoundTerm, literal: LiteralValue) -> bool: len_prefix = len(prefix) if lower_bound_bytes := self.lower_bounds.get(field_id): - lower_bound = str(from_bytes(field.field_type, lower_bound_bytes)) + lower_bound = str(_from_bytes_with_promotion(field.field_type, lower_bound_bytes)) # truncate lower bound so that its length is not greater than the length of prefix if lower_bound and lower_bound[:len_prefix] > prefix: return ROWS_CANNOT_MATCH if upper_bound_bytes := self.upper_bounds.get(field_id): - upper_bound = str(from_bytes(field.field_type, upper_bound_bytes)) + upper_bound = str(_from_bytes_with_promotion(field.field_type, upper_bound_bytes)) # truncate upper bound so that its length is not greater than the length of prefix if upper_bound is not None and upper_bound[:len_prefix] < prefix: @@ -1435,8 +1450,8 @@ def visit_not_starts_with(self, term: BoundTerm, literal: LiteralValue) -> bool: # not_starts_with will match unless all values must start with the prefix. This happens when # the lower and upper bounds both start with the prefix. if (lower_bound_bytes := self.lower_bounds.get(field_id)) and (upper_bound_bytes := self.upper_bounds.get(field_id)): - lower_bound = str(from_bytes(field.field_type, lower_bound_bytes)) - upper_bound = str(from_bytes(field.field_type, upper_bound_bytes)) + lower_bound = str(_from_bytes_with_promotion(field.field_type, lower_bound_bytes)) + upper_bound = str(_from_bytes_with_promotion(field.field_type, upper_bound_bytes)) # if lower is shorter than the prefix then lower doesn't start with the prefix if len(lower_bound) < len_prefix: From 803ac61d3f6e31c352f4030fbc531da679a70a44 Mon Sep 17 00:00:00 2001 From: Alex Stephen Date: Wed, 29 Apr 2026 13:06:36 -0700 Subject: [PATCH 3/5] PR comments --- pyiceberg/expressions/visitors.py | 9 ++- tests/expressions/test_evaluator.py | 96 +++++++++++++++++++++-------- 2 files changed, 76 insertions(+), 29 deletions(-) diff --git a/pyiceberg/expressions/visitors.py b/pyiceberg/expressions/visitors.py index 04f0c3f971..9dbe04b564 100644 --- a/pyiceberg/expressions/visitors.py +++ b/pyiceberg/expressions/visitors.py @@ -59,7 +59,6 @@ from pyiceberg.schema import Schema from pyiceberg.typedef import EMPTY_DICT, L, LiteralValue, Record, StructProtocol from pyiceberg.types import ( - DateType, DoubleType, FloatType, IcebergType, @@ -68,7 +67,6 @@ NestedField, PrimitiveType, StructType, - TimestampNanoType, TimestampType, TimestamptzType, ) @@ -78,13 +76,14 @@ def _from_bytes_with_promotion(field_type: PrimitiveType, b: bytes) -> Any: + # Integer, Float, Date are 4 bytes + # Long, Double, Timestamps are 8 bytes + # If we have 4 bytes, we may have to handle type promotion. if len(b) == 4: if isinstance(field_type, LongType): return from_bytes(IntegerType(), b) elif isinstance(field_type, DoubleType): return from_bytes(FloatType(), b) - elif isinstance(field_type, (TimestampType, TimestampNanoType)): - return from_bytes(DateType(), b) return from_bytes(field_type, b) @@ -555,7 +554,7 @@ def visit_or(self, left_result: bool, right_result: bool) -> bool: def _from_byte_buffer(field_type: IcebergType, val: bytes) -> Any: if not isinstance(field_type, PrimitiveType): raise ValueError(f"Expected a PrimitiveType, got: {type(field_type)}") - return from_bytes(field_type, val) + return _from_bytes_with_promotion(field_type, val) class _ManifestEvalVisitor(BoundBooleanExpressionVisitor[bool]): diff --git a/tests/expressions/test_evaluator.py b/tests/expressions/test_evaluator.py index 4b4629ea21..5245c8a27a 100644 --- a/tests/expressions/test_evaluator.py +++ b/tests/expressions/test_evaluator.py @@ -41,7 +41,14 @@ Or, StartsWith, ) -from pyiceberg.expressions.visitors import _InclusiveMetricsEvaluator, _StrictMetricsEvaluator +from pyiceberg.expressions.visitors import ( + ROWS_CANNOT_MATCH, + ROWS_MIGHT_MATCH, + ROWS_MIGHT_NOT_MATCH, + ROWS_MUST_MATCH, + _InclusiveMetricsEvaluator, + _StrictMetricsEvaluator, +) from pyiceberg.manifest import DataFile, FileFormat from pyiceberg.schema import Schema from pyiceberg.typedef import Record @@ -1466,44 +1473,85 @@ def test_strict_integer_not_in(strict_data_file_schema: Schema, strict_data_file assert not should_read, "Should not match: no_nulls field does not have bounds" -def test_inclusive_metrics_evaluator_with_type_promotion_crash() -> None: - # Schema defines 'id' as LongType (evolved state) - schema = Schema(NestedField(1, "id", LongType(), required=True)) +@pytest.mark.parametrize( + "file_type, evolved_type, lower_bound, upper_bound, op, lit, expected", + [ + # Int -> Long + (IntegerType(), LongType(), 30, 79, GreaterThan, 100, ROWS_CANNOT_MATCH), + (IntegerType(), LongType(), 30, 79, LessThan, 50, ROWS_MIGHT_MATCH), + # Float -> Double + (FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 100.0, ROWS_CANNOT_MATCH), + (FloatType(), DoubleType(), 30.0, 79.0, LessThan, 50.0, ROWS_MIGHT_MATCH), + ], +) +def test_inclusive_metrics_evaluator_with_type_promotion( + file_type: PrimitiveType, + evolved_type: PrimitiveType, + lower_bound: Any, + upper_bound: Any, + op: Any, + lit: Any, + expected: bool, +) -> None: + # Schema defines 'col' with evolved state + schema = Schema(NestedField(1, "col", evolved_type, required=True)) - # Historical manifest contains 4-byte integer bounds + # Historical manifest contains file_type bounds data_file = DataFile.from_args( file_path="file_1.parquet", file_format=FileFormat.PARQUET, partition={}, record_count=100, file_size_in_bytes=1024, - lower_bounds={1: to_bytes(IntegerType(), 30)}, - upper_bounds={1: to_bytes(IntegerType(), 79)}, + lower_bounds={1: to_bytes(file_type, lower_bound)}, + upper_bounds={1: to_bytes(file_type, upper_bound)}, ) - # Predicate: id > 100 - # Decodes 4-byte bounds correctly and skips the file - evaluator_pruning = _InclusiveMetricsEvaluator(schema, GreaterThan("id", 100)) - assert not evaluator_pruning.eval(data_file) - - -def test_inclusive_metrics_evaluator_with_float_to_double_promotion() -> None: - # Schema defines 'val' as DoubleType (evolved state) - schema = Schema(NestedField(1, "val", DoubleType(), required=True)) + # Predicate refers to 'col' + evaluator = _InclusiveMetricsEvaluator(schema, op("col", lit)) + assert evaluator.eval(data_file) == expected + + +@pytest.mark.parametrize( + "file_type, evolved_type, lower_bound, upper_bound, op, lit, expected", + [ + # Int -> Long + (IntegerType(), LongType(), 30, 79, GreaterThan, 20, ROWS_MUST_MATCH), + (IntegerType(), LongType(), 30, 79, GreaterThan, 100, ROWS_MIGHT_NOT_MATCH), + (IntegerType(), LongType(), 30, 79, LessThan, 100, ROWS_MUST_MATCH), + (IntegerType(), LongType(), 30, 79, LessThan, 20, ROWS_MIGHT_NOT_MATCH), + # Float -> Double + (FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 20.0, ROWS_MUST_MATCH), + (FloatType(), DoubleType(), 30.0, 79.0, GreaterThan, 100.0, ROWS_MIGHT_NOT_MATCH), + (FloatType(), DoubleType(), 30.0, 79.0, LessThan, 100.0, ROWS_MUST_MATCH), + (FloatType(), DoubleType(), 30.0, 79.0, LessThan, 20.0, ROWS_MIGHT_NOT_MATCH), + ], +) +def test_strict_metrics_evaluator_with_type_promotion( + file_type: PrimitiveType, + evolved_type: PrimitiveType, + lower_bound: Any, + upper_bound: Any, + op: Any, + lit: Any, + expected: bool, +) -> None: + # Schema defines 'col' with evolved state + schema = Schema(NestedField(1, "col", evolved_type, required=True)) - # Historical manifest contains 4-byte float bounds + # Historical manifest contains file_type bounds data_file = DataFile.from_args( file_path="file_1.parquet", file_format=FileFormat.PARQUET, partition={}, record_count=100, file_size_in_bytes=1024, - lower_bounds={1: to_bytes(FloatType(), 30.0)}, - upper_bounds={1: to_bytes(FloatType(), 79.0)}, + lower_bounds={1: to_bytes(file_type, lower_bound)}, + upper_bounds={1: to_bytes(file_type, upper_bound)}, + null_value_counts={1: 0}, + nan_value_counts={1: 0}, ) - # Predicate: val < 50.0 - evaluator = _InclusiveMetricsEvaluator(schema, LessThan("val", 50.0)) - - # Should not crash and should correctly identify that the file might match - assert evaluator.eval(data_file) + # Predicate refers to 'col' + evaluator = _StrictMetricsEvaluator(schema, op("col", lit)) + assert evaluator.eval(data_file) == expected From ef8e8c9dfe8268951a2c74b0a3c6d443952dec57 Mon Sep 17 00:00:00 2001 From: Alex Stephen Date: Thu, 30 Apr 2026 14:15:43 -0700 Subject: [PATCH 4/5] PR comments --- pyiceberg/conversions.py | 12 ++++++++- pyiceberg/expressions/visitors.py | 44 +++++++++++-------------------- 2 files changed, 26 insertions(+), 30 deletions(-) diff --git a/pyiceberg/conversions.py b/pyiceberg/conversions.py index 0311e76d89..a071a3c884 100644 --- a/pyiceberg/conversions.py +++ b/pyiceberg/conversions.py @@ -343,7 +343,6 @@ def _(_: PrimitiveType, b: bytes) -> int: return _INT_STRUCT.unpack(b)[0] -@from_bytes.register(LongType) @from_bytes.register(TimeType) @from_bytes.register(TimestampType) @from_bytes.register(TimestamptzType) @@ -353,6 +352,14 @@ def _(_: PrimitiveType, b: bytes) -> int: return _LONG_STRUCT.unpack(b)[0] +@from_bytes.register(LongType) +def _(_: PrimitiveType, b: bytes) -> int: + if len(b) == 4: + # If the length is 4 bytes, it is a promoted IntegerType + return _INT_STRUCT.unpack(b)[0] + return _LONG_STRUCT.unpack(b)[0] + + @from_bytes.register(FloatType) def _(_: FloatType, b: bytes) -> float: return _FLOAT_STRUCT.unpack(b)[0] @@ -360,6 +367,9 @@ def _(_: FloatType, b: bytes) -> float: @from_bytes.register(DoubleType) def _(_: DoubleType, b: bytes) -> float: + if len(b) == 4: + # If the length is 4 bytes, it is a promoted FloatType + return _FLOAT_STRUCT.unpack(b)[0] return _DOUBLE_STRUCT.unpack(b)[0] diff --git a/pyiceberg/expressions/visitors.py b/pyiceberg/expressions/visitors.py index 9dbe04b564..0beb0f3df0 100644 --- a/pyiceberg/expressions/visitors.py +++ b/pyiceberg/expressions/visitors.py @@ -62,8 +62,6 @@ DoubleType, FloatType, IcebergType, - IntegerType, - LongType, NestedField, PrimitiveType, StructType, @@ -75,18 +73,6 @@ T = TypeVar("T") -def _from_bytes_with_promotion(field_type: PrimitiveType, b: bytes) -> Any: - # Integer, Float, Date are 4 bytes - # Long, Double, Timestamps are 8 bytes - # If we have 4 bytes, we may have to handle type promotion. - if len(b) == 4: - if isinstance(field_type, LongType): - return from_bytes(IntegerType(), b) - elif isinstance(field_type, DoubleType): - return from_bytes(FloatType(), b) - return from_bytes(field_type, b) - - class BooleanExpressionVisitor(Generic[T], ABC): @abstractmethod def visit_true(self) -> T: @@ -554,7 +540,7 @@ def visit_or(self, left_result: bool, right_result: bool) -> bool: def _from_byte_buffer(field_type: IcebergType, val: bytes) -> Any: if not isinstance(field_type, PrimitiveType): raise ValueError(f"Expected a PrimitiveType, got: {type(field_type)}") - return _from_bytes_with_promotion(field_type, val) + return from_bytes(field_type, val) class _ManifestEvalVisitor(BoundBooleanExpressionVisitor[bool]): @@ -1256,7 +1242,7 @@ def visit_less_than(self, term: BoundTerm, literal: LiteralValue) -> bool: raise ValueError(f"Expected PrimitiveType: {field.field_type}") if lower_bound_bytes := self.lower_bounds.get(field_id): - lower_bound = _from_bytes_with_promotion(field.field_type, lower_bound_bytes) + lower_bound = from_bytes(field.field_type, lower_bound_bytes) if self._is_nan(lower_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. @@ -1278,7 +1264,7 @@ def visit_less_than_or_equal(self, term: BoundTerm, literal: LiteralValue) -> bo raise ValueError(f"Expected PrimitiveType: {field.field_type}") if lower_bound_bytes := self.lower_bounds.get(field_id): - lower_bound = _from_bytes_with_promotion(field.field_type, lower_bound_bytes) + lower_bound = from_bytes(field.field_type, lower_bound_bytes) if self._is_nan(lower_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH @@ -1299,7 +1285,7 @@ def visit_greater_than(self, term: BoundTerm, literal: LiteralValue) -> bool: raise ValueError(f"Expected PrimitiveType: {field.field_type}") if upper_bound_bytes := self.upper_bounds.get(field_id): - upper_bound = _from_bytes_with_promotion(field.field_type, upper_bound_bytes) + upper_bound = from_bytes(field.field_type, upper_bound_bytes) if upper_bound <= literal.value: if self._is_nan(upper_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. @@ -1320,7 +1306,7 @@ def visit_greater_than_or_equal(self, term: BoundTerm, literal: LiteralValue) -> raise ValueError(f"Expected PrimitiveType: {field.field_type}") if upper_bound_bytes := self.upper_bounds.get(field_id): - upper_bound = _from_bytes_with_promotion(field.field_type, upper_bound_bytes) + upper_bound = from_bytes(field.field_type, upper_bound_bytes) if upper_bound < literal.value: if self._is_nan(upper_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. @@ -1341,7 +1327,7 @@ def visit_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: raise ValueError(f"Expected PrimitiveType: {field.field_type}") if lower_bound_bytes := self.lower_bounds.get(field_id): - lower_bound = _from_bytes_with_promotion(field.field_type, lower_bound_bytes) + lower_bound = from_bytes(field.field_type, lower_bound_bytes) if self._is_nan(lower_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH @@ -1350,7 +1336,7 @@ def visit_equal(self, term: BoundTerm, literal: LiteralValue) -> bool: return ROWS_CANNOT_MATCH if upper_bound_bytes := self.upper_bounds.get(field_id): - upper_bound = _from_bytes_with_promotion(field.field_type, upper_bound_bytes) + upper_bound = from_bytes(field.field_type, upper_bound_bytes) if self._is_nan(upper_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH @@ -1378,22 +1364,22 @@ def visit_in(self, term: BoundTerm, literals: set[L]) -> bool: raise ValueError(f"Expected PrimitiveType: {field.field_type}") if lower_bound_bytes := self.lower_bounds.get(field_id): - lower_bound = _from_bytes_with_promotion(field.field_type, lower_bound_bytes) + lower_bound = from_bytes(field.field_type, lower_bound_bytes) if self._is_nan(lower_bound): # NaN indicates unreliable bounds. See the InclusiveMetricsEvaluator docs for more. return ROWS_MIGHT_MATCH - literals = {lit for lit in literals if lower_bound <= lit} + literals = {lit for lit in literals if lower_bound <= lit} # type: ignore[operator] if len(literals) == 0: return ROWS_CANNOT_MATCH if upper_bound_bytes := self.upper_bounds.get(field_id): - upper_bound = _from_bytes_with_promotion(field.field_type, upper_bound_bytes) + upper_bound = from_bytes(field.field_type, upper_bound_bytes) # this is different from Java, here NaN is always larger if self._is_nan(upper_bound): return ROWS_MIGHT_MATCH - literals = {lit for lit in literals if upper_bound >= lit} + literals = {lit for lit in literals if upper_bound >= lit} # type: ignore[operator] if len(literals) == 0: return ROWS_CANNOT_MATCH @@ -1418,14 +1404,14 @@ def visit_starts_with(self, term: BoundTerm, literal: LiteralValue) -> bool: len_prefix = len(prefix) if lower_bound_bytes := self.lower_bounds.get(field_id): - lower_bound = str(_from_bytes_with_promotion(field.field_type, lower_bound_bytes)) + lower_bound = str(from_bytes(field.field_type, lower_bound_bytes)) # truncate lower bound so that its length is not greater than the length of prefix if lower_bound and lower_bound[:len_prefix] > prefix: return ROWS_CANNOT_MATCH if upper_bound_bytes := self.upper_bounds.get(field_id): - upper_bound = str(_from_bytes_with_promotion(field.field_type, upper_bound_bytes)) + upper_bound = str(from_bytes(field.field_type, upper_bound_bytes)) # truncate upper bound so that its length is not greater than the length of prefix if upper_bound is not None and upper_bound[:len_prefix] < prefix: @@ -1449,8 +1435,8 @@ def visit_not_starts_with(self, term: BoundTerm, literal: LiteralValue) -> bool: # not_starts_with will match unless all values must start with the prefix. This happens when # the lower and upper bounds both start with the prefix. if (lower_bound_bytes := self.lower_bounds.get(field_id)) and (upper_bound_bytes := self.upper_bounds.get(field_id)): - lower_bound = str(_from_bytes_with_promotion(field.field_type, lower_bound_bytes)) - upper_bound = str(_from_bytes_with_promotion(field.field_type, upper_bound_bytes)) + lower_bound = str(from_bytes(field.field_type, lower_bound_bytes)) + upper_bound = str(from_bytes(field.field_type, upper_bound_bytes)) # if lower is shorter than the prefix then lower doesn't start with the prefix if len(lower_bound) < len_prefix: From 7467c0a8132591150f2803eb3bff4af901995274 Mon Sep 17 00:00:00 2001 From: Alex Stephen Date: Thu, 30 Apr 2026 17:40:08 -0700 Subject: [PATCH 5/5] PR comments --- pyiceberg/conversions.py | 4 ++-- tests/expressions/test_evaluator.py | 10 ++-------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/pyiceberg/conversions.py b/pyiceberg/conversions.py index a071a3c884..42d996f756 100644 --- a/pyiceberg/conversions.py +++ b/pyiceberg/conversions.py @@ -354,7 +354,7 @@ def _(_: PrimitiveType, b: bytes) -> int: @from_bytes.register(LongType) def _(_: PrimitiveType, b: bytes) -> int: - if len(b) == 4: + if len(b) < 8: # If the length is 4 bytes, it is a promoted IntegerType return _INT_STRUCT.unpack(b)[0] return _LONG_STRUCT.unpack(b)[0] @@ -367,7 +367,7 @@ def _(_: FloatType, b: bytes) -> float: @from_bytes.register(DoubleType) def _(_: DoubleType, b: bytes) -> float: - if len(b) == 4: + if len(b) < 8: # If the length is 4 bytes, it is a promoted FloatType return _FLOAT_STRUCT.unpack(b)[0] return _DOUBLE_STRUCT.unpack(b)[0] diff --git a/tests/expressions/test_evaluator.py b/tests/expressions/test_evaluator.py index 5245c8a27a..b8e4d87044 100644 --- a/tests/expressions/test_evaluator.py +++ b/tests/expressions/test_evaluator.py @@ -1484,7 +1484,7 @@ def test_strict_integer_not_in(strict_data_file_schema: Schema, strict_data_file (FloatType(), DoubleType(), 30.0, 79.0, LessThan, 50.0, ROWS_MIGHT_MATCH), ], ) -def test_inclusive_metrics_evaluator_with_type_promotion( +def test_inclusive_metrics_eval_bounds_after_promotion( file_type: PrimitiveType, evolved_type: PrimitiveType, lower_bound: Any, @@ -1493,10 +1493,8 @@ def test_inclusive_metrics_evaluator_with_type_promotion( lit: Any, expected: bool, ) -> None: - # Schema defines 'col' with evolved state schema = Schema(NestedField(1, "col", evolved_type, required=True)) - # Historical manifest contains file_type bounds data_file = DataFile.from_args( file_path="file_1.parquet", file_format=FileFormat.PARQUET, @@ -1507,7 +1505,6 @@ def test_inclusive_metrics_evaluator_with_type_promotion( upper_bounds={1: to_bytes(file_type, upper_bound)}, ) - # Predicate refers to 'col' evaluator = _InclusiveMetricsEvaluator(schema, op("col", lit)) assert evaluator.eval(data_file) == expected @@ -1527,7 +1524,7 @@ def test_inclusive_metrics_evaluator_with_type_promotion( (FloatType(), DoubleType(), 30.0, 79.0, LessThan, 20.0, ROWS_MIGHT_NOT_MATCH), ], ) -def test_strict_metrics_evaluator_with_type_promotion( +def test_strict_metrics_eval_bounds_after_promotion( file_type: PrimitiveType, evolved_type: PrimitiveType, lower_bound: Any, @@ -1536,10 +1533,8 @@ def test_strict_metrics_evaluator_with_type_promotion( lit: Any, expected: bool, ) -> None: - # Schema defines 'col' with evolved state schema = Schema(NestedField(1, "col", evolved_type, required=True)) - # Historical manifest contains file_type bounds data_file = DataFile.from_args( file_path="file_1.parquet", file_format=FileFormat.PARQUET, @@ -1552,6 +1547,5 @@ def test_strict_metrics_evaluator_with_type_promotion( nan_value_counts={1: 0}, ) - # Predicate refers to 'col' evaluator = _StrictMetricsEvaluator(schema, op("col", lit)) assert evaluator.eval(data_file) == expected