-
-
Notifications
You must be signed in to change notification settings - Fork 290
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add
string
and bytes
dtypes plus vlen-utf8
and vlen-bytes
codecs (#2036) * add legacy vlen-utf8 codec * got it working again * got strings working; broke everything else * change v3.metadata.data_type type * fixed tests * satisfy mypy for tests * make strings work * add missing module * store -> storage * rename module * add vlen bytes * fix type assertions in test * much better validation of fill value * retype parse_fill_value * tests pass but not mypy * attempted to change parse_fill_value typing * restore DEFAULT_DTYPE * fixup * docstring * update test * add better DataType tests * more progress on typing; still not passing mypy * fix typing yay! * make types work with numpy <, 2 * Apply suggestions from code review Co-authored-by: Joe Hamman <[email protected]> * Apply suggestions from code review Co-authored-by: Joe Hamman <[email protected]> * apply Joe's suggestions * add missing module * make _STRING_DTYPE private to try to make sphinx happy --------- Co-authored-by: Davis Bennett <[email protected]> Co-authored-by: Tom Augspurger <[email protected]> Co-authored-by: Joe Hamman <[email protected]>
- Loading branch information
1 parent
c258b27
commit 7e2be57
Showing
12 changed files
with
584 additions
and
94 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
from __future__ import annotations | ||
|
||
from dataclasses import dataclass | ||
from typing import TYPE_CHECKING | ||
|
||
import numpy as np | ||
from numcodecs.vlen import VLenBytes, VLenUTF8 | ||
|
||
from zarr.abc.codec import ArrayBytesCodec | ||
from zarr.core.buffer import Buffer, NDBuffer | ||
from zarr.core.common import JSON, parse_named_configuration | ||
from zarr.core.strings import cast_to_string_dtype | ||
from zarr.registry import register_codec | ||
|
||
if TYPE_CHECKING: | ||
from typing import Self | ||
|
||
from zarr.core.array_spec import ArraySpec | ||
|
||
|
||
# Module-level singletons: both numcodecs codecs are stateless (they take no
# configuration parameters), so a single shared instance of each is safe to
# reuse across every encode/decode call in this module.
_vlen_utf8_codec = VLenUTF8()
_vlen_bytes_codec = VLenBytes()
|
||
|
||
@dataclass(frozen=True)
class VLenUTF8Codec(ArrayBytesCodec):
    """Array-to-bytes codec for variable-length UTF-8 strings.

    Delegates the actual encoding/decoding to numcodecs' ``VLenUTF8`` codec
    (via the module-level ``_vlen_utf8_codec`` singleton). The codec has no
    configuration parameters, so the class is a stateless frozen dataclass.
    """

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> Self:
        """Build the codec from v3 metadata.

        The ``configuration`` key is optional for this codec; a missing or
        empty configuration yields a default instance.
        """
        _, parsed_config = parse_named_configuration(
            data, "vlen-utf8", require_configuration=False
        )
        if not parsed_config:
            parsed_config = {}
        return cls(**parsed_config)

    def to_dict(self) -> dict[str, JSON]:
        """Serialize to the v3 metadata form (configuration is always empty)."""
        return {"name": "vlen-utf8", "configuration": {}}

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        # Nothing to adapt: the codec carries no parameters that depend on
        # the array spec.
        return self

    async def _decode_single(
        self,
        chunk_bytes: Buffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer:
        """Decode one chunk of vlen-utf8 bytes into an NDBuffer of strings."""
        assert isinstance(chunk_bytes, Buffer)

        encoded = chunk_bytes.as_array_like()
        objects = _vlen_utf8_codec.decode(encoded)
        assert objects.dtype == np.object_
        # In-place shape assignment: the decoded object array is flat and
        # must take on the chunk's logical shape.
        objects.shape = chunk_spec.shape
        # The codec guarantees valid UTF-8 on decode, so the cast to the
        # string dtype is known-safe and should not emit a warning.
        strings = cast_to_string_dtype(objects, safe=True)
        return chunk_spec.prototype.nd_buffer.from_numpy_array(strings)

    async def _encode_single(
        self,
        chunk_array: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> Buffer | None:
        """Encode one chunk of strings into a vlen-utf8 byte stream."""
        assert isinstance(chunk_array, NDBuffer)
        payload = _vlen_utf8_codec.encode(chunk_array.as_numpy_array())
        return chunk_spec.prototype.buffer.from_bytes(payload)

    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
        # Variable-length data has no fixed encoded size; the byte length of
        # an object-dtype chunk cannot be predicted up front.
        raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
|
||
|
||
@dataclass(frozen=True)
class VLenBytesCodec(ArrayBytesCodec):
    """Array-to-bytes codec for variable-length byte strings.

    Delegates encoding/decoding to numcodecs' ``VLenBytes`` codec (via the
    module-level ``_vlen_bytes_codec`` singleton). Unlike the UTF-8 variant,
    decoded object arrays are returned as-is, with no string-dtype cast.
    """

    @classmethod
    def from_dict(cls, data: dict[str, JSON]) -> Self:
        """Build the codec from v3 metadata.

        The ``configuration`` key is optional for this codec; a missing or
        empty configuration yields a default instance.
        """
        _, parsed_config = parse_named_configuration(
            data, "vlen-bytes", require_configuration=False
        )
        if not parsed_config:
            parsed_config = {}
        return cls(**parsed_config)

    def to_dict(self) -> dict[str, JSON]:
        """Serialize to the v3 metadata form (configuration is always empty)."""
        return {"name": "vlen-bytes", "configuration": {}}

    def evolve_from_array_spec(self, array_spec: ArraySpec) -> Self:
        # Nothing to adapt: the codec carries no parameters that depend on
        # the array spec.
        return self

    async def _decode_single(
        self,
        chunk_bytes: Buffer,
        chunk_spec: ArraySpec,
    ) -> NDBuffer:
        """Decode one chunk of vlen-bytes data into an object-dtype NDBuffer."""
        assert isinstance(chunk_bytes, Buffer)

        encoded = chunk_bytes.as_array_like()
        objects = _vlen_bytes_codec.decode(encoded)
        assert objects.dtype == np.object_
        # In-place shape assignment: the decoded object array is flat and
        # must take on the chunk's logical shape.
        objects.shape = chunk_spec.shape
        return chunk_spec.prototype.nd_buffer.from_numpy_array(objects)

    async def _encode_single(
        self,
        chunk_array: NDBuffer,
        chunk_spec: ArraySpec,
    ) -> Buffer | None:
        """Encode one chunk of byte strings into a vlen-bytes byte stream."""
        assert isinstance(chunk_array, NDBuffer)
        payload = _vlen_bytes_codec.encode(chunk_array.as_numpy_array())
        return chunk_spec.prototype.buffer.from_bytes(payload)

    def compute_encoded_size(self, input_byte_length: int, _chunk_spec: ArraySpec) -> int:
        # Variable-length data has no fixed encoded size; the byte length of
        # an object-dtype chunk cannot be predicted up front.
        raise NotImplementedError("compute_encoded_size is not implemented for VLen codecs")
|
||
|
||
# Register both codecs under their v3 metadata names so that arrays whose
# metadata references "vlen-utf8" / "vlen-bytes" resolve to these classes.
register_codec("vlen-utf8", VLenUTF8Codec)
register_codec("vlen-bytes", VLenBytesCodec)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.