Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve support for legacy vlen codecs and compat with zarr-python #100

Merged
merged 6 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Relax `output_subset` requirements on `ArrayToBytesCodecTraits::decode_into` and `ArrayPartialDecoderTraits::partial_decode_into`
- The subset shape and dimensionality no longer has to match, only the number of elements
- Bump `pco` (pcodec) to 0.4
- **Breaking**: Change `experimental_codec_names` config hashmap to `HashMap<String, String>` from `HashMap<&'static str, String>`
- **Breaking**: Add `name` parameter to `vlen_v2` codec constructors
- Register `vlen-array`, `vlen-bytes`, and `vlen-utf8` codecs

### Removed
- Remove `async-recursion` dependency
- Remove `Default` implementation for `VlenV2Codec`

### Fixed
- Fix panics that could occur with with empty byte ranges / empty array subsets in `Array`, `ByteRange` and codec methods
Expand Down
4 changes: 0 additions & 4 deletions doc/TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@
- Remove most/all `_opt` methods when Rust [`import-trait-associated-functions`](https://github.com/rust-lang/rfcs/pull/3591) stabilises
- Use lending iterators where/if possible to avoid `Vec` allocations in iterators?

### Ecosystem Compatibility
- Support `vlen-utf8`/`vlen-bytes`/`vlen-array` for `zarr-python` V3 compatibility?
- My thoughts on variable-length data type standardisation: https://github.com/zarr-developers/zeps/pull/47#issuecomment-2238480835

### Codecs
- Implement codecs for compatibility with virtual NetCDF/HDF5 data with compression?

Expand Down
26 changes: 25 additions & 1 deletion zarrs/src/array/codec/array_to_bytes/vlen_v2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,15 @@ use crate::{
inventory::submit! {
CodecPlugin::new(IDENTIFIER, is_name_vlen_v2, create_codec_vlen_v2)
}
inventory::submit! {
CodecPlugin::new(crate::metadata::v2::array::codec::vlen_array::IDENTIFIER, is_name_vlen_array, create_codec_vlen_v2)
}
inventory::submit! {
CodecPlugin::new(crate::metadata::v2::array::codec::vlen_bytes::IDENTIFIER, is_name_vlen_bytes, create_codec_vlen_v2)
}
inventory::submit! {
CodecPlugin::new(crate::metadata::v2::array::codec::vlen_utf8::IDENTIFIER, is_name_vlen_utf8, create_codec_vlen_v2)
}

fn is_name_vlen_v2(name: &str) -> bool {
name.eq(IDENTIFIER)
Expand All @@ -38,11 +47,26 @@ fn is_name_vlen_v2(name: &str) -> bool {
.expect("experimental codec identifier in global map")
}

fn is_name_vlen_array(name: &str) -> bool {
name.eq(crate::metadata::v2::array::codec::vlen_array::IDENTIFIER)
}

fn is_name_vlen_bytes(name: &str) -> bool {
name.eq(crate::metadata::v2::array::codec::vlen_bytes::IDENTIFIER)
}

fn is_name_vlen_utf8(name: &str) -> bool {
name.eq(crate::metadata::v2::array::codec::vlen_utf8::IDENTIFIER)
}

pub(crate) fn create_codec_vlen_v2(metadata: &MetadataV3) -> Result<Codec, PluginCreateError> {
let configuration: VlenV2CodecConfiguration = metadata
.to_configuration()
.map_err(|_| PluginMetadataInvalidError::new(IDENTIFIER, "codec", metadata.clone()))?;
let codec = Arc::new(VlenV2Codec::new_with_configuration(&configuration));
let codec = Arc::new(VlenV2Codec::new_with_name_configuration(
metadata.name().to_string(),
&configuration,
));
Ok(Codec::ArrayToBytes(codec))
}

Expand Down
37 changes: 19 additions & 18 deletions zarrs/src/array/codec/array_to_bytes/vlen_v2/vlen_v2_codec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,37 +23,38 @@ use crate::array::codec::{AsyncArrayPartialDecoderTraits, AsyncBytesPartialDecod
use super::{VlenV2CodecConfiguration, VlenV2CodecConfigurationV1};

/// The `vlen_v2` codec implementation.
#[derive(Debug, Clone, Default)]
pub struct VlenV2Codec {}
#[derive(Debug, Clone)]
pub struct VlenV2Codec {
name: String,
}

impl VlenV2Codec {
/// Create a new `vlen` codec.
/// Create a new `vlen_v2` codec.
#[must_use]
pub fn new() -> Self {
Self {}
pub fn new(name: String) -> Self {
Self { name }
}

/// Create a new `vlen` codec from configuration.
/// Create a new `vlen_v2` codec from configuration.
#[must_use]
pub fn new_with_configuration(_configuration: &VlenV2CodecConfiguration) -> Self {
pub fn new_with_name_configuration(
name: String,
_configuration: &VlenV2CodecConfiguration,
) -> Self {
// let VlenV2CodecConfiguration::V1(configuration) = configuration;
Self {}
Self { name }
}
}

impl CodecTraits for VlenV2Codec {
fn create_metadata_opt(&self, _options: &ArrayMetadataOptions) -> Option<MetadataV3> {
let config = global_config();
let name = config
.experimental_codec_names()
.get(&self.name)
.unwrap_or(&self.name);
let configuration = VlenV2CodecConfigurationV1 {};
Some(
MetadataV3::new_with_serializable_configuration(
global_config()
.experimental_codec_names()
.get(super::IDENTIFIER)
.expect("experimental codec identifier in global map"),
&configuration,
)
.unwrap(),
)
Some(MetadataV3::new_with_serializable_configuration(name, &configuration).unwrap())
}

fn partial_decoder_should_cache_input(&self) -> bool {
Expand Down
18 changes: 9 additions & 9 deletions zarrs/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@
metadata_convert_version: MetadataConvertVersion,
metadata_erase_version: MetadataEraseVersion,
include_zarrs_metadata: bool,
experimental_codec_names: HashMap<&'static str, String>,
experimental_codec_names: HashMap<String, String>,
experimental_partial_encoding: bool,
}

Expand All @@ -125,17 +125,17 @@
let experimental_codec_names = HashMap::from([
// Array to array
#[cfg(feature = "bitround")]
(codec::bitround::IDENTIFIER, "https://codec.zarrs.dev/array_to_array/bitround".to_string()),
(codec::bitround::IDENTIFIER.to_string(), "https://codec.zarrs.dev/array_to_array/bitround".to_string()),
// Array to bytes
#[cfg(feature = "zfp")]
(codec::zfp::IDENTIFIER, "https://codec.zarrs.dev/array_to_bytes/zfp".to_string()),
(codec::zfp::IDENTIFIER.to_string(), "https://codec.zarrs.dev/array_to_bytes/zfp".to_string()),
#[cfg(feature = "pcodec")]
(codec::pcodec::IDENTIFIER, "https://codec.zarrs.dev/array_to_bytes/pcodec".to_string()),
(codec::vlen::IDENTIFIER, "https://codec.zarrs.dev/array_to_bytes/vlen".to_string()),
(codec::vlen_v2::IDENTIFIER, "https://codec.zarrs.dev/array_to_bytes/vlen_v2".to_string()),
(codec::pcodec::IDENTIFIER.to_string(), "https://codec.zarrs.dev/array_to_bytes/pcodec".to_string()),
(codec::vlen::IDENTIFIER.to_string(), "https://codec.zarrs.dev/array_to_bytes/vlen".to_string()),
(codec::vlen_v2::IDENTIFIER.to_string(), "https://codec.zarrs.dev/array_to_bytes/vlen_v2".to_string()),
// Bytes to bytes
#[cfg(feature = "bz2")]
(codec::bz2::IDENTIFIER, "https://codec.zarrs.dev/bytes_to_bytes/bz2".to_string()),
(codec::bz2::IDENTIFIER.to_string(), "https://codec.zarrs.dev/bytes_to_bytes/bz2".to_string()),
]);

let concurrency_multiply = 1;
Expand Down Expand Up @@ -259,12 +259,12 @@

/// Get the [experimental codec names](#experimental-codec-names) configuration.
#[must_use]
pub fn experimental_codec_names(&self) -> &HashMap<&'static str, String> {
pub fn experimental_codec_names(&self) -> &HashMap<String, String> {
&self.experimental_codec_names
}

/// Get a mutable reference to the [experimental codec names](#experimental-codec-names) configuration.
pub fn experimental_codec_names_mut(&mut self) -> &mut HashMap<&'static str, String> {
pub fn experimental_codec_names_mut(&mut self) -> &mut HashMap<String, String> {

Check warning on line 267 in zarrs/src/config.rs

View check run for this annotation

Codecov / codecov/patch

zarrs/src/config.rs#L267

Added line #L267 was not covered by tests
&mut self.experimental_codec_names
}

Expand Down
32 changes: 31 additions & 1 deletion zarrs/tests/cities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ fn cities() -> Result<(), Box<dyn Error>> {
assert_eq!(cities[47862], "Sariwŏn-si");
assert_eq!(cities[47867], "Charlotte Amalie");

let vlen_v2 = Arc::new(VlenV2Codec::default());
let vlen_v2 = Arc::new(VlenV2Codec::new("vlen-utf8".to_string()));

// let vlen = Arc::new(VlenCodec::default());
let vlen_configuration: VlenCodecConfiguration = serde_json::from_str(r#"{
Expand Down Expand Up @@ -132,3 +132,33 @@ fn cities() -> Result<(), Box<dyn Error>> {

Ok(())
}

#[test]
fn cities_zarr_python_v2_compat() -> Result<(), Box<dyn Error>> {
let store = Arc::new(FilesystemStore::new(
"tests/data/zarr_python_compat/cities_v2.zarr",
)?);
let array = zarrs::array::Array::open(store.clone(), "/")?;
let subset_all = array.subset_all();
let cities_out = array.retrieve_array_subset_elements::<String>(&subset_all)?;

let cities = read_cities()?;
assert_eq!(cities, cities_out);

Ok(())
}

#[test]
fn cities_zarr_python_v3_compat() -> Result<(), Box<dyn Error>> {
let store = Arc::new(FilesystemStore::new(
"tests/data/zarr_python_compat/cities_v3.zarr",
)?);
let array = zarrs::array::Array::open(store.clone(), "/")?;
let subset_all = array.subset_all();
let cities_out = array.retrieve_array_subset_elements::<String>(&subset_all)?;

let cities = read_cities()?;
assert_eq!(cities, cities_out);

Ok(())
}
13 changes: 3 additions & 10 deletions zarrs/tests/data/v2_cities.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,12 @@
import zarr # v2
import zarr
import pandas as pd

print(zarr.__version__)
print(zarr.__version__) # This was generated with zarr==2.18

df = pd.read_csv("tests/data/cities.csv", header=None)
cities = df[0]

path_out = 'tests/data/v2/cities.zarr'
path_out = 'tests/data/zarr_python_compat/cities_v2.zarr'
array = zarr.open(path_out, mode='w', dtype=str, shape=(len(cities),), chunks=(1000,), compressor = None, fill_value='')
array[:] = cities.values
print(array.info)

for i in range(48):
v2 = open(f'tests/data/v2/cities.zarr/{i}', 'rb').read()
v3 = open(f'tests/data/v3/cities.zarr/c/{i}', 'rb').read()
assert v2 == v3

print("V2 and V3 chunks are identical!")
2 changes: 1 addition & 1 deletion zarrs/tests/data/v3/cities.zarr/zarr.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
"fill_value": "",
"codecs": [
{
"name": "https://codec.zarrs.dev/array_to_bytes/vlen_v2"
"name": "vlen-utf8"
}
]
}
23 changes: 23 additions & 0 deletions zarrs/tests/data/v3_cities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import zarr
import pandas as pd

print(zarr.__version__) # This was generate with zarr==2.18

df = pd.read_csv("tests/data/cities.csv", header=None)
cities = df[0]

path_out = 'tests/data/zarr_python_compat/cities_v3.zarr'
array = zarr.open(path_out, mode='w', dtype=str, shape=(len(cities),), chunks=(1000,))
array[:] = cities.values
print(array.info)

array_v2 = zarr.open('tests/data/zarr_python_compat/cities_v2.zarr', dtype=str, shape=(len(cities),), chunks=(1000,))

assert((array[:] == array_v2[:]).all())

# for i in range(48):
# v2 = open(f'tests/data/v2/cities.zarr/{i}', 'rb').read()
# v3 = open(f'tests/data/v3/cities.zarr/c/{i}', 'rb').read()
# assert v2 == v3

print("V2 and V3 chunks are identical!")
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"shape": [47868], "data_type": "string", "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": [1000]}}, "chunk_key_encoding": {"name": "default", "configuration": {"separator": "/"}}, "fill_value": "0", "codecs": [{"name": "vlen-utf8", "configuration": {}}], "attributes": {}, "zarr_format": 3, "node_type": "array", "storage_transformers": []}
2 changes: 2 additions & 0 deletions zarrs_metadata/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add `GroupMetadataV2` constructors
- Add `ArrayMetadataV2` constructors
- Implement `From<{&str,String}>` for `DataTypeMetadataV2`
- Add `v2::array::codec::vlen_{array,bytes,utf8}` modules
- Add support for Zarr V2 string fill values

### Changed
- **Breaking**: Mark `GroupMetadataV3` and `ArrayMetadataV3` as non-exhaustive
Expand Down
11 changes: 10 additions & 1 deletion zarrs_metadata/src/v2/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@
pub mod bz2;
/// `gzip` codec metadata.
pub mod gzip;
/// `vlen-array` codec metadata.
pub mod vlen_array;
/// `vlen-bytes` codec metadata.
pub mod vlen_bytes;
/// `vlen-utf8` codec metadata.
pub mod vlen_utf8;
/// `zfpy` codec metadata.
pub mod zfpy;
/// `zstd` codec metadata.
Expand Down Expand Up @@ -244,6 +250,8 @@
NegInfinity,
/// A number.
Number(serde_json::Number),
/// A string.
String(String),
}

impl<'de> serde::Deserialize<'de> for FillValueMetadataV2 {
Expand All @@ -261,7 +269,7 @@
"NaN" => Ok(Self::NaN),
"Infinity" => Ok(Self::Infinity),
"-Infinity" => Ok(Self::NegInfinity),
_ => Err(serde::de::Error::custom("unsupported fill value")),
_ => Ok(Self::String(string)),
},
FillValueMetadataV2Type::Number(number) => Ok(Self::Number(number)),
FillValueMetadataV2Type::Null => Ok(Self::Null),
Expand All @@ -280,6 +288,7 @@
Self::Infinity => serializer.serialize_str("Infinity"),
Self::NegInfinity => serializer.serialize_str("-Infinity"),
Self::Number(number) => number.serialize(serializer),
Self::String(string) => string.serialize(serializer),

Check warning on line 291 in zarrs_metadata/src/v2/array.rs

View check run for this annotation

Codecov / codecov/patch

zarrs_metadata/src/v2/array.rs#L291

Added line #L291 was not covered by tests
}
}
}
Expand Down
4 changes: 4 additions & 0 deletions zarrs_metadata/src/v2/array/codec/vlen_array.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/// The identifier for the `vlen-array` codec.
pub const IDENTIFIER: &str = "vlen-array";

pub use crate::v3::array::codec::vlen_v2::VlenV2CodecConfigurationV1;
4 changes: 4 additions & 0 deletions zarrs_metadata/src/v2/array/codec/vlen_bytes.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/// The identifier for the `vlen-bytes` codec.
pub const IDENTIFIER: &str = "vlen-bytes";

pub use crate::v3::array::codec::vlen_v2::VlenV2CodecConfigurationV1;
4 changes: 4 additions & 0 deletions zarrs_metadata/src/v2/array/codec/vlen_utf8.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/// The identifier for the `vlen-utf8` codec.
pub const IDENTIFIER: &str = "vlen-utf8";

pub use crate::v3::array::codec::vlen_v2::VlenV2CodecConfigurationV1;
5 changes: 4 additions & 1 deletion zarrs_metadata/src/v2_to_v3.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,9 @@ pub fn array_metadata_v2_to_v3(
for filter in filters {
// TODO: Add a V2 registry with V2 to V3 conversion functions
match filter.id() {
"vlen-utf8" | "vlen-bytes" | "vlen-array" => {
crate::v2::array::codec::vlen_array::IDENTIFIER
| crate::v2::array::codec::vlen_bytes::IDENTIFIER
| crate::v2::array::codec::vlen_utf8::IDENTIFIER => {
has_array_to_bytes = true;
let vlen_v2_metadata = MetadataV3::new_with_serializable_configuration(
crate::v3::array::codec::vlen_v2::IDENTIFIER,
Expand Down Expand Up @@ -314,5 +316,6 @@ pub fn array_metadata_fill_value_v2_to_v3(
unreachable!("number must be convertible to u64, i64 or f64")
}
}
FillValueMetadataV2::String(string) => Some(FillValueMetadataV3::String(string.clone())),
}
}