Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,6 @@ tests/.hypothesis

zarr/version.py
zarr.egg-info/

# zarr-metadata package lockfile (a library, not an app)
packages/zarr-metadata/uv.lock
15 changes: 15 additions & 0 deletions packages/zarr-metadata/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# zarr-metadata

Spec-defined metadata types for Zarr v2 and v3, distributed as pure-typing
artifacts (TypedDicts, type aliases, unions). No runtime logic, no numpy,
no storage backends.

`zarr-metadata` is developed in the [zarr-python](https://github.com/zarr-developers/zarr-python)
repository at `packages/zarr-metadata/`.

## Principle

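Every type that models a spec artifact (v2 or v3 array/group metadata,
chunk grids, codec metadata, dtype shapes) belongs in `zarr-metadata`.
Consolidated metadata is modeled here as well, even though the v2
`.zmetadata` layout is a de-facto convention of the reference Python
implementation rather than a spec artifact. Zarr-python implementation
details (runtime codecs, config dataclasses, numcodecs-derived helpers)
stay in `zarr`.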
51 changes: 51 additions & 0 deletions packages/zarr-metadata/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
[build-system]
requires = ["hatchling>=1.29.0"]
build-backend = "hatchling.build"

[project]
name = "zarr-metadata"
version = "0.1.0"
description = "Spec-defined metadata types for Zarr v2 and v3."
readme = "README.md"
requires-python = ">=3.11"
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will this cause any issues with zarr-python being 3.12+? Should they be consistent?

license = "MIT"
authors = [
{ name = "Davis Bennett", email = "davis.v.bennett@gmail.com" },
]
classifiers = [
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Programming Language :: Python :: 3.14",
"Typing :: Typed",
]
dependencies = [
"typing_extensions>=4.13",
]

[project.optional-dependencies]
test = ["pytest"]

[tool.hatch.build.targets.wheel]
packages = ["src/zarr_metadata"]

[tool.numpydoc_validation]
checks = [
"GL10",
"SS04",
"PR02",
"PR03",
"PR05",
"PR06",
]

[tool.pyright]
include = ["src"]
enableExperimentalFeatures = true
typeCheckingMode = "strict"
pythonVersion = "3.11"
24 changes: 24 additions & 0 deletions packages/zarr-metadata/src/zarr_metadata/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from zarr_metadata.common import JSON, NamedConfig, NamedRequiredConfig
from zarr_metadata.v2.array import ArrayMetadataV2
from zarr_metadata.v2.group import GroupMetadataV2
from zarr_metadata.v3.array import ArrayMetadataV3
from zarr_metadata.v3.group import GroupMetadataV3

# Cross-version unions: each alias accepts either the v2 or the v3 form.
ArrayMetadata = ArrayMetadataV2 | ArrayMetadataV3
"""Any Zarr array metadata document (v2 or v3)."""

GroupMetadata = GroupMetadataV2 | GroupMetadataV3
"""Any Zarr group metadata document (v2 or v3)."""


__all__ = [
"JSON",
"ArrayMetadata",
"ArrayMetadataV2",
"ArrayMetadataV3",
"GroupMetadata",
"GroupMetadataV2",
"GroupMetadataV3",
"NamedConfig",
"NamedRequiredConfig",
]
47 changes: 47 additions & 0 deletions packages/zarr-metadata/src/zarr_metadata/common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Top-level cross-version primitives for Zarr metadata.
Version-specific types live under `zarr_metadata.v2` and `zarr_metadata.v3`.
Codec and dtype spec types live under `zarr_metadata.v3.codec` and
`zarr_metadata.v3.data_type`.
"""

from collections.abc import Mapping, Sequence
from typing import Generic, NotRequired, TypedDict, TypeVar

from typing_extensions import ReadOnly

JSON = str | int | float | bool | Mapping[str, "JSON"] | Sequence["JSON"] | None
"""Any valid JSON value."""


TName = TypeVar("TName", bound=str)
TConfig = TypeVar("TConfig", bound=Mapping[str, object])


class NamedConfig(TypedDict, Generic[TName, TConfig]):  # noqa: UP046
    """
    Envelope pairing a ``name`` with an optional ``configuration`` mapping.

    The class takes two type parameters: ``TName`` for the ``name`` value
    and ``TConfig`` for the ``configuration`` mapping.

    It is spelled with ``Generic[...]`` rather than the PEP 695
    ``class C[T]`` syntax so that the package keeps working on Python 3.11.
    """

    # Always present; read-only.
    name: ReadOnly[TName]
    # May be omitted; read-only when present.
    configuration: NotRequired[ReadOnly[TConfig]]


class NamedRequiredConfig(TypedDict, Generic[TName, TConfig]):  # noqa: UP046
    """
    Envelope pairing a ``name`` with a mandatory ``configuration`` mapping.

    The class takes two type parameters: ``TName`` for the ``name`` value
    and ``TConfig`` for the ``configuration`` mapping.

    It is spelled with ``Generic[...]`` rather than the PEP 695
    ``class C[T]`` syntax so that the package keeps working on Python 3.11.
    """

    # Always present; read-only.
    name: ReadOnly[TName]
    # Always present; read-only.
    configuration: ReadOnly[TConfig]
Comment on lines +36 to +47
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why include this?

Empty file.
15 changes: 15 additions & 0 deletions packages/zarr-metadata/src/zarr_metadata/v2/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""Zarr v2 metadata types."""

from zarr_metadata.v2.array import ArrayMetadataV2, DataTypeV2, DataTypeV2Structured
from zarr_metadata.v2.codec import NumcodecsConfig
from zarr_metadata.v2.consolidated import ConsolidatedMetadataV2
from zarr_metadata.v2.group import GroupMetadataV2

__all__ = [
"ArrayMetadataV2",
"ConsolidatedMetadataV2",
"DataTypeV2",
"DataTypeV2Structured",
"GroupMetadataV2",
"NumcodecsConfig",
]
59 changes: 59 additions & 0 deletions packages/zarr-metadata/src/zarr_metadata/v2/array.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
"""Zarr v2 array metadata types."""

from __future__ import annotations

from typing import TYPE_CHECKING, Literal, NotRequired, TypedDict

if TYPE_CHECKING:
from zarr_metadata.common import JSON
from zarr_metadata.v2.codec import NumcodecsConfig


class DataTypeV2Structured(TypedDict):
"""
A single field entry inside a structured v2 dtype.

Spec-faithful: `datatype` is a numpy-style dtype string; `shape` is
present only when the field is a subarray field.

See https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html#data-type-encoding
"""

fieldname: str
datatype: str
shape: NotRequired[tuple[int, ...]]
Comment on lines +19 to +24
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wouldn't the spec-faithful representation just have a list of lists? the V2 spec doesn't have JSON keys for fieldname or datatype



# NOTE(review): the alias spells the structured case as a tuple, while the
# text below calls it "lists of field records" -- confirm which wording is
# intended.
DataTypeV2 = str | tuple[DataTypeV2Structured, ...]
"""The v2 dtype representation.

Simple dtypes are numpy-style strings (e.g. `"<f8"`, `"|S10"`).
Structured dtypes are lists of field records. Endianness is encoded in the
prefix character of the string; parsing it out is a caller concern, not
part of this type.
"""


class ArrayMetadataV2(TypedDict):
"""
Zarr v2 array metadata document (the `.zarray` content).

See https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html
"""

zarr_format: Literal[2]
shape: tuple[int, ...]
chunks: tuple[int, ...]
dtype: DataTypeV2
compressor: NumcodecsConfig | None
fill_value: JSON
order: Literal["C", "F"]
filters: tuple[NumcodecsConfig, ...] | None
dimension_separator: NotRequired[Literal[".", "/"]]


__all__ = [
"ArrayMetadataV2",
"DataTypeV2",
"DataTypeV2Structured",
]
29 changes: 29 additions & 0 deletions packages/zarr-metadata/src/zarr_metadata/v2/codec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""
Zarr v2 codec configuration shape.

V2 compressors and filters are numcodecs configuration dicts: a required
`id` field naming the codec, plus arbitrary codec-specific extra fields.
"""

from typing_extensions import ReadOnly, TypedDict

from zarr_metadata.common import JSON


class NumcodecsConfig(TypedDict, extra_items=JSON):  # type: ignore[call-arg]
    """
    Configuration dict for a numcodecs codec, as used for a v2 compressor
    or filter.

    Only `id`, which names the codec, is declared explicitly. Parameters
    specific to a codec (for blosc, e.g. `cname` and `clevel`) are carried
    as extra keys, which are typed as `JSON`.

    See the "compressor" and "filters" sections of
    https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html
    """

    # Names the codec; read-only.
    id: ReadOnly[str]


__all__ = [
"NumcodecsConfig",
]
37 changes: 37 additions & 0 deletions packages/zarr-metadata/src/zarr_metadata/v2/consolidated.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Zarr v2 consolidated metadata (`.zmetadata` file).

This module models the de-facto `.zmetadata` file used by the reference
Python implementation of Zarr v2. **This is NOT a spec artifact.** There
is no Zarr v2 specification that defines `.zmetadata`; it is a
canonical-implementation convention.
"""

from __future__ import annotations

from typing import TYPE_CHECKING, TypedDict

if TYPE_CHECKING:
from collections.abc import Mapping

from zarr_metadata.common import JSON

from .array import ArrayMetadataV2
from .group import GroupMetadataV2


class ConsolidatedMetadataV2(TypedDict):
    """
    Contents of a `.zmetadata` file.

    `metadata` is a flat mapping keyed by file path -- for example
    `"foo/bar/.zarray"` or `"foo/.zattrs"` -- so each key carries the
    filename suffix rather than only the node path. Each value is the JSON
    content of the file stored at that path.
    """

    zarr_consolidated_format: int
    metadata: Mapping[str, GroupMetadataV2 | ArrayMetadataV2 | JSON]
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why allow arbitrary JSON as the values?



__all__ = [
"ConsolidatedMetadataV2",
]
19 changes: 19 additions & 0 deletions packages/zarr-metadata/src/zarr_metadata/v2/group.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""Zarr v2 group metadata types."""

from typing import Literal, TypedDict


class GroupMetadataV2(TypedDict):
    """
    Contents of a Zarr v2 `.zgroup` document.

    User attributes are stored separately in a sibling `.zattrs` file and
    therefore have no key in this dict.
    """

    zarr_format: Literal[2]


__all__ = [
"GroupMetadataV2",
]
13 changes: 13 additions & 0 deletions packages/zarr-metadata/src/zarr_metadata/v3/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""Zarr v3 metadata types."""

from zarr_metadata.v3.array import AllowedExtraField, ArrayMetadataV3, MetadataField
from zarr_metadata.v3.consolidated import ConsolidatedMetadataV3
from zarr_metadata.v3.group import GroupMetadataV3

__all__ = [
"AllowedExtraField",
"ArrayMetadataV3",
"ConsolidatedMetadataV3",
"GroupMetadataV3",
"MetadataField",
]
52 changes: 52 additions & 0 deletions packages/zarr-metadata/src/zarr_metadata/v3/array.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""Zarr v3 array metadata types."""

from collections.abc import Mapping
from typing import Literal, NotRequired

from typing_extensions import TypedDict

from zarr_metadata.common import JSON, NamedConfig


class AllowedExtraField(TypedDict, extra_items=JSON): # type: ignore[call-arg]
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd slightly prefer OptionalExtension as the name here

"""
Extra field on a v3 array metadata document.
Extras must include `must_understand: false` and may carry arbitrary
additional JSON data.
"""

must_understand: Literal[False]


# Either a bare string, or a `NamedConfig` envelope whose `name` is a `str`
# and whose optional `configuration` is a `Mapping[str, JSON]`.
MetadataField = str | NamedConfig[str, Mapping[str, JSON]]
"""A string or a {name: str, configuration: {...}} key value pair."""


class ArrayMetadataV3(TypedDict, extra_items=AllowedExtraField): # type: ignore[call-arg]
"""
Zarr v3 array metadata document (the `zarr.json` content for an array).
Extra keys are permitted if they conform to `AllowedExtraField`.
See https://zarr-specs.readthedocs.io/en/latest/v3/core/index.html#array-metadata
"""

zarr_format: Literal[3]
node_type: Literal["array"]
data_type: MetadataField
shape: tuple[int, ...]
chunk_grid: MetadataField
chunk_key_encoding: MetadataField
fill_value: JSON
codecs: tuple[MetadataField, ...]
attributes: NotRequired[Mapping[str, JSON]]
storage_transformers: NotRequired[tuple[MetadataField, ...]]
Comment on lines +37 to +44
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there are more restrictions on data_type, chunk_grid, chunk_key_encoding, and codecs than implied by MetadataField. This would be more useful if it were more strictly typed. In addition, MetadataField allowing a string shorthand is wrong in the chunk grid case.

Comment on lines +37 to +44
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

storage_transformers should be empty array until one is defined as an extension

dimension_names: NotRequired[tuple[str | None, ...]]


__all__ = [
"AllowedExtraField",
"ArrayMetadataV3",
"MetadataField",
]
10 changes: 10 additions & 0 deletions packages/zarr-metadata/src/zarr_metadata/v3/chunk_grid/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""
Zarr v3 chunk grid metadata types.
Each chunk grid lives in its own submodule:
- `regular` -- core v3 spec
- `rectilinear` -- zarr-extensions
See https://zarr-specs.readthedocs.io/en/latest/v3/core/index.html#chunk-grids
"""
Loading
Loading