diff --git a/pyiceberg/table/puffin.py b/pyiceberg/table/puffin.py index 07874176c2..e74749305b 100644 --- a/pyiceberg/table/puffin.py +++ b/pyiceberg/table/puffin.py @@ -16,6 +16,7 @@ # under the License. from typing import TYPE_CHECKING +import zstandard from pydantic import Field from pyiceberg.typedef import IcebergBaseModel @@ -68,7 +69,12 @@ def __init__(self, puffin: bytes) -> None: self._file_bytes = puffin def get_blob_payload(self, blob: PuffinBlobMetadata) -> bytes: - return self._file_bytes[blob.offset : blob.offset + blob.length] + raw = self._file_bytes[blob.offset : blob.offset + blob.length] + if blob.compression_codec is None: + return raw + if blob.compression_codec == "zstd": + return zstandard.ZstdDecompressor().decompress(raw) + raise ValueError(f"Unsupported compression codec: {blob.compression_codec!r}") @deprecated(deprecated_in="0.12.0", removed_in="0.13.0", help_message="Use deletion_vectors_from_puffin_file(...) instead") def to_vector(self) -> dict[str, "pa.ChunkedArray"]: diff --git a/tests/table/puffin/v1/sample-metric-data-compressed-zstd.bin b/tests/table/puffin/v1/sample-metric-data-compressed-zstd.bin new file mode 100644 index 0000000000..ac8b69c76e Binary files /dev/null and b/tests/table/puffin/v1/sample-metric-data-compressed-zstd.bin differ diff --git a/tests/table/test_puffin.py b/tests/table/test_puffin.py index bc7d56a6b8..93b16158bb 100644 --- a/tests/table/test_puffin.py +++ b/tests/table/test_puffin.py @@ -33,6 +33,30 @@ def test_read_empty_uncompressed() -> None: assert pf.footer.properties == {} +def test_read_compressed_zstd() -> None: + puffin_bytes = _open_file("v1/sample-metric-data-compressed-zstd.bin") + pf = PuffinFile(puffin_bytes) + + assert pf.footer.properties == {"created-by": "Test 1234"} + assert len(pf.footer.blobs) == 2 + + blob1 = pf.footer.blobs[0] + assert blob1.type == "some-blob" + assert blob1.fields == [1] + assert blob1.snapshot_id == 2 + assert blob1.sequence_number == 1 + assert blob1.compression_codec == "zstd" + assert pf.get_blob_payload(blob1) == b"abcdefghi" + + blob2 = pf.footer.blobs[1] + assert blob2.type == "some-other-blob" + assert blob2.fields == [2] + assert blob2.compression_codec == "zstd" + assert pf.get_blob_payload(blob2) == ( + b"some blob \x00 binary data \xf0\x9f\xa4\xaf that is not very very very very very very long, is it?" + ) + + def test_read_two_blobs_uncompressed() -> None: puffin_bytes = _open_file("v1/sample-metric-data-uncompressed.bin") pf = PuffinFile(puffin_bytes)