From a5e7671df03b6551661adedfe69f34138053f63b Mon Sep 17 00:00:00 2001 From: Ben Mares Date: Sat, 21 Sep 2024 14:54:06 +0200 Subject: [PATCH 1/5] Allow disabling inlining for MultiZarrToZarr --- kerchunk/combine.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/kerchunk/combine.py b/kerchunk/combine.py index 6dc71be7..11f8160e 100644 --- a/kerchunk/combine.py +++ b/kerchunk/combine.py @@ -1,7 +1,7 @@ import collections.abc import logging import re -from typing import List +from typing import List, Optional import warnings import fsspec @@ -78,8 +78,9 @@ class MultiZarrToZarr: :param remote_protocol: str The protocol of the original data :param remote_options: dict - :param inline_threshold: int - Size below which binary blocks are included directly in the output + :param inline_threshold: int | None + Size below which binary blocks are included directly in the output. If None, no + inlining is done. :param preprocess: callable Acts on the references dict of all inputs before processing. See ``drop()`` for an example. @@ -106,7 +107,7 @@ def __init__( target_options=None, remote_protocol=None, remote_options=None, - inline_threshold=500, + inline_threshold: Optional[int] = 500, preprocess=None, postprocess=None, out=None, @@ -584,9 +585,13 @@ def second_pass(self): key = key.rstrip(".") ref = fs.references.get(fn) - if isinstance(ref, list) and ( - (len(ref) > 1 and ref[2] < self.inline) - or fs.info(fn)["size"] < self.inline + if ( + self.inline is not None + and isinstance(ref, list) + and ( + (len(ref) > 1 and ref[2] < self.inline) + or fs.info(fn)["size"] < self.inline + ) ): to_download[key] = fn else: From 12c8e0f1c5d1108860302cdbcae779c315b681d3 Mon Sep 17 00:00:00 2001 From: Ben Mares Date: Sat, 21 Sep 2024 16:50:25 +0200 Subject: [PATCH 2/5] Add a test that MultiZarrToZarr doesn't access files when inlining disabled --- kerchunk/tests/test_combine.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/kerchunk/tests/test_combine.py b/kerchunk/tests/test_combine.py index efd7e051..e22e505c 100644 --- a/kerchunk/tests/test_combine.py +++ b/kerchunk/tests/test_combine.py @@ -768,6 +768,29 @@ def test_inline(refs): assert ref.references["data/0.0.0"].startswith("base64:") +def test_no_inline(refs): + """Ensure that inline_threshold=None disables MultiZarrToZarr checking file size.""" + ds = xr.Dataset(dict(x=[1, 2, 3])) + ds["y"] = 3 + ds["x"] + store = fsspec.get_mapper("memory://zarr_store") + ds.to_zarr(store, mode="w", consolidated=False) + ref = kerchunk.utils.consolidate(store) + # This type of reference with no offset or total size is produced by + # kerchunk.zarr.single_zarr or equivalently ZarrToZarr.translate. + ref["refs"]["y/0"] = ["file:///tmp/some/data-that-shouldnt-be-accessed"] + + mzz_no_inline = MultiZarrToZarr([ref], concat_dims=["x"], inline_threshold=None) + # Should be okay because inline_threshold=None so we don't check the file size + # in order to see if it should be inlined + mzz_no_inline.translate() + + mzz_inline = MultiZarrToZarr([ref], concat_dims=["x"], inline_threshold=1) + with pytest.raises(FileNotFoundError): + # Should raise because we check the file size to see if it should be inlined, + # and the example was engineered so that the file doesn't exist. + mzz_inline.translate() + + def test_merge_vars(): a = dict({"version": 1, "refs": dict({"item1": 1})}) b = dict({"version": 1, "refs": dict({"item2": 2})}) From 1730d47ff0e3ae9ff518f0a86e4403c838989761 Mon Sep 17 00:00:00 2001 From: Ben Mares Date: Wed, 25 Sep 2024 17:43:09 +0200 Subject: [PATCH 3/5] Use 0 instead of None for disabling MultiZarrToZarr inlining --- kerchunk/combine.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kerchunk/combine.py b/kerchunk/combine.py index 11f8160e..0f5ccb9b 100644 --- a/kerchunk/combine.py +++ b/kerchunk/combine.py @@ -1,7 +1,7 @@ import collections.abc import logging import re -from typing import List, Optional +from typing import List import warnings import fsspec @@ -78,9 +78,8 @@ class MultiZarrToZarr: :param remote_protocol: str The protocol of the original data :param remote_options: dict - :param inline_threshold: int | None - Size below which binary blocks are included directly in the output. If None, no - inlining is done. + :param inline_threshold: int + Size below which binary blocks are included directly in the output :param preprocess: callable Acts on the references dict of all inputs before processing. See ``drop()`` for an example. @@ -95,6 +94,8 @@ class MultiZarrToZarr: If True, will load the references specified by out and add to them rather than starting from scratch. Assumes the same coordinates are being concatenated. """ + + inline: int def __init__( self, @@ -107,7 +108,7 @@ def __init__( target_options=None, remote_protocol=None, remote_options=None, - inline_threshold: Optional[int] = 500, + inline_threshold: int = 500, preprocess=None, postprocess=None, out=None, @@ -586,7 +587,7 @@ def second_pass(self): ref = fs.references.get(fn) if ( - self.inline is not None + self.inline > 0 and isinstance(ref, list) and ( (len(ref) > 1 and ref[2] < self.inline) From 0eae84c6f16e8a77e274937f779993a7fcc4d160 Mon Sep 17 00:00:00 2001 From: Ben Mares Date: Wed, 25 Sep 2024 17:44:08 +0200 Subject: [PATCH 4/5] Update test to use 0 instead of None --- kerchunk/tests/test_combine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kerchunk/tests/test_combine.py b/kerchunk/tests/test_combine.py index e22e505c..13994921 100644 --- a/kerchunk/tests/test_combine.py +++ b/kerchunk/tests/test_combine.py @@ -769,7 +769,7 @@ def test_inline(refs): def test_no_inline(refs): - """Ensure that inline_threshold=None disables MultiZarrToZarr checking file size.""" + """Ensure that inline_threshold=0 disables MultiZarrToZarr checking file size.""" ds = xr.Dataset(dict(x=[1, 2, 3])) ds["y"] = 3 + ds["x"] store = fsspec.get_mapper("memory://zarr_store") @@ -779,7 +779,7 @@ def test_no_inline(refs): # kerchunk.zarr.single_zarr or equivalently ZarrToZarr.translate. ref["refs"]["y/0"] = ["file:///tmp/some/data-that-shouldnt-be-accessed"] - mzz_no_inline = MultiZarrToZarr([ref], concat_dims=["x"], inline_threshold=None) + mzz_no_inline = MultiZarrToZarr([ref], concat_dims=["x"], inline_threshold=0) # Should be okay because inline_threshold=None so we don't check the file size # in order to see if it should be inlined mzz_no_inline.translate() From 0a778603cc7e6c087b382d6c7137eb1fd6182f74 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 25 Sep 2024 12:21:08 -0400 Subject: [PATCH 5/5] Update kerchunk/combine.py --- kerchunk/combine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kerchunk/combine.py b/kerchunk/combine.py index 0f5ccb9b..eb891de1 100644 --- a/kerchunk/combine.py +++ b/kerchunk/combine.py @@ -94,7 +94,7 @@ class MultiZarrToZarr: If True, will load the references specified by out and add to them rather than starting from scratch. Assumes the same coordinates are being concatenated. """ - + inline: int def __init__(