diff --git a/pyiceberg/table/deletion_vector.py b/pyiceberg/table/deletion_vector.py index f337c758a7..88fb3daf73 100644 --- a/pyiceberg/table/deletion_vector.py +++ b/pyiceberg/table/deletion_vector.py @@ -77,11 +77,17 @@ def to_vector(self) -> "pa.ChunkedArray": return self._bitmaps_to_chunked_array(self._bitmaps) +def _extract_vector_payload(blob_payload: bytes) -> bytes: + """Strip deletion-vector-v1 blob framing: length(4 big-endian) + DV magic(4) ... CRC(4 big-endian).""" + length_prefix = int.from_bytes(blob_payload[0:4], "big") + return blob_payload[8 : 4 + length_prefix] + + def deletion_vectors_from_puffin_file(puffin_file: PuffinFile) -> list[DeletionVector]: return [ DeletionVector( referenced_data_file=blob.properties[PROPERTY_REFERENCED_DATA_FILE], - bitmaps=DeletionVector._deserialize_bitmap(puffin_file.get_blob_payload(blob)), + bitmaps=DeletionVector._deserialize_bitmap(_extract_vector_payload(puffin_file.get_blob_payload(blob))), ) for blob in puffin_file.footer.blobs ] diff --git a/pyiceberg/table/puffin.py b/pyiceberg/table/puffin.py index 571687bb3f..07874176c2 100644 --- a/pyiceberg/table/puffin.py +++ b/pyiceberg/table/puffin.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -from typing import TYPE_CHECKING, Literal +from typing import TYPE_CHECKING from pydantic import Field @@ -29,7 +29,7 @@ class PuffinBlobMetadata(IcebergBaseModel): - type: Literal["deletion-vector-v1"] = Field() + type: str = Field() fields: list[int] = Field() snapshot_id: int = Field(alias="snapshot-id") sequence_number: int = Field(alias="sequence-number") @@ -46,7 +46,7 @@ class Footer(IcebergBaseModel): class PuffinFile: footer: Footer - _payload: bytes + _file_bytes: bytes def __init__(self, puffin: bytes) -> None: for magic_bytes in [puffin[:4], puffin[-4:]]: @@ -65,10 +65,10 @@ def __init__(self, puffin: bytes) -> None: footer_payload_size_int = int.from_bytes(puffin[-12:-8], byteorder="little") self.footer = Footer.model_validate_json(puffin[-(footer_payload_size_int + 12) : -12]) - self._payload = puffin[8:] + self._file_bytes = puffin def get_blob_payload(self, blob: PuffinBlobMetadata) -> bytes: - return self._payload[blob.offset : blob.offset + blob.length] + return self._file_bytes[blob.offset : blob.offset + blob.length] @deprecated(deprecated_in="0.12.0", removed_in="0.13.0", help_message="Use deletion_vectors_from_puffin_file(...) instead") def to_vector(self) -> dict[str, "pa.ChunkedArray"]: diff --git a/tests/table/puffin/v1/empty-puffin-uncompressed.bin b/tests/table/puffin/v1/empty-puffin-uncompressed.bin new file mode 100644 index 0000000000..142b45bd4e Binary files /dev/null and b/tests/table/puffin/v1/empty-puffin-uncompressed.bin differ diff --git a/tests/table/puffin/v1/sample-metric-data-uncompressed.bin b/tests/table/puffin/v1/sample-metric-data-uncompressed.bin new file mode 100644 index 0000000000..ab8da13822 Binary files /dev/null and b/tests/table/puffin/v1/sample-metric-data-uncompressed.bin differ diff --git a/tests/table/test_puffin.py b/tests/table/test_puffin.py new file mode 100644 index 0000000000..bc7d56a6b8 --- /dev/null +++ b/tests/table/test_puffin.py @@ -0,0 +1,57 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +from os import path + +from pyiceberg.table.puffin import PuffinFile + + +def _open_file(file: str) -> bytes: + cur_dir = path.dirname(path.realpath(__file__)) + with open(f"{cur_dir}/puffin/{file}", "rb") as f: + return f.read() + + +def test_read_empty_uncompressed() -> None: + puffin_bytes = _open_file("v1/empty-puffin-uncompressed.bin") + pf = PuffinFile(puffin_bytes) + + assert pf.footer.blobs == [] + assert pf.footer.properties == {} + + +def test_read_two_blobs_uncompressed() -> None: + puffin_bytes = _open_file("v1/sample-metric-data-uncompressed.bin") + pf = PuffinFile(puffin_bytes) + + assert pf.footer.properties == {"created-by": "Test 1234"} + assert len(pf.footer.blobs) == 2 + + blob1 = pf.footer.blobs[0] + assert blob1.type == "some-blob" + assert blob1.fields == [1] + assert blob1.snapshot_id == 2 + assert blob1.sequence_number == 1 + assert blob1.compression_codec is None + assert pf.get_blob_payload(blob1) == b"abcdefghi" + + blob2 = pf.footer.blobs[1] + assert blob2.type == "some-other-blob" + assert blob2.fields == [2] + assert blob2.compression_codec is None + assert pf.get_blob_payload(blob2) == ( + b"some blob \x00 binary data \xf0\x9f\xa4\xaf that is not very very very very very very long, is it?" + )