-
Notifications
You must be signed in to change notification settings - Fork 25
/
test_integration.py
177 lines (141 loc) · 7.05 KB
/
test_integration.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import numpy as np
import pytest
import xarray as xr
import xarray.testing as xrt
from virtualizarr import open_virtual_dataset
@pytest.mark.parametrize(
    "inline_threshold, vars_to_inline",
    [
        (5e2, ["lat", "lon"]),
        (5e4, ["lat", "lon", "time"]),
        pytest.param(
            5e7,
            ["lat", "lon", "time", "air"],
            marks=pytest.mark.xfail(reason="scale factor encoding"),
        ),
    ],
)
def test_numpy_arrays_to_inlined_kerchunk_refs(
    netcdf4_file, inline_threshold, vars_to_inline
):
    """Eagerly loading variables should match kerchunk inlining those same variables."""
    from kerchunk.hdf import SingleHdf5ToZarr

    # each inline_threshold is chosen so that kerchunk inlines exactly the
    # variables listed in vars_to_inline
    expected = SingleHdf5ToZarr(
        netcdf4_file, inline_threshold=int(inline_threshold)
    ).translate()

    # loading the variables should produce same result as inlining them using kerchunk
    vds = open_virtual_dataset(
        netcdf4_file, loadable_variables=vars_to_inline, indexes={}
    )
    actual = vds.virtualize.to_kerchunk(format="dict")

    # TODO I would just compare the entire dicts but kerchunk returns inconsistent results - see https://github.com/TomNicholas/VirtualiZarr/pull/73#issuecomment-2040931202
    # assert actual == expected
    for key in ("air/0.0.0", "lon/0", "lat/0", "time/0"):
        assert actual["refs"][key] == expected["refs"][key]
@pytest.mark.parametrize("format", ["dict", "json", "parquet"])
class TestKerchunkRoundtrip:
    """Round-trip tests: virtual dataset -> kerchunk references -> re-opened dataset."""

    def _roundtrip(self, vds, tmpdir, format, decode_times):
        """Serialize ``vds``'s references as kerchunk ``format`` and re-open them.

        For ``"dict"`` the references stay in memory; for ``"json"``/``"parquet"``
        they are written to ``tmpdir`` first and read back via the kerchunk engine.
        Returns the re-opened ``xr.Dataset``.
        """
        if format == "dict":
            # write those references to an in-memory kerchunk-formatted references dictionary
            ds_refs = vds.virtualize.to_kerchunk(format=format)
            # use fsspec to read the dataset from the kerchunk references dict
            return xr.open_dataset(ds_refs, engine="kerchunk", decode_times=decode_times)
        else:
            # write those references to disk as kerchunk references format
            vds.virtualize.to_kerchunk(f"{tmpdir}/refs.{format}", format=format)
            # use fsspec to read the dataset from disk via the kerchunk references
            return xr.open_dataset(
                f"{tmpdir}/refs.{format}", engine="kerchunk", decode_times=decode_times
            )

    def test_kerchunk_roundtrip_no_concat(self, tmpdir, format):
        # set up example xarray dataset
        ds = xr.tutorial.open_dataset("air_temperature", decode_times=False)

        # save it to disk as netCDF (in temporary directory)
        ds.to_netcdf(f"{tmpdir}/air.nc")

        # use open_virtual_dataset to read it as references
        vds = open_virtual_dataset(f"{tmpdir}/air.nc", indexes={})

        roundtrip = self._roundtrip(vds, tmpdir, format, decode_times=False)

        # assert identical to original dataset
        xrt.assert_identical(roundtrip, ds)

    @pytest.mark.parametrize("decode_times,time_vars", [(False, []), (True, ["time"])])
    def test_kerchunk_roundtrip_concat(self, tmpdir, format, decode_times, time_vars):
        # set up example xarray dataset
        ds = xr.tutorial.open_dataset("air_temperature", decode_times=decode_times)

        # split into two datasets
        ds1, ds2 = ds.isel(time=slice(None, 1460)), ds.isel(time=slice(1460, None))

        # save it to disk as netCDF (in temporary directory)
        ds1.to_netcdf(f"{tmpdir}/air1.nc")
        ds2.to_netcdf(f"{tmpdir}/air2.nc")

        # use open_virtual_dataset to read each half as references
        vds1 = open_virtual_dataset(
            f"{tmpdir}/air1.nc",
            indexes={},
            loadable_variables=time_vars,
            cftime_variables=time_vars,
        )
        vds2 = open_virtual_dataset(
            f"{tmpdir}/air2.nc",
            indexes={},
            loadable_variables=time_vars,
            cftime_variables=time_vars,
        )

        if decode_times is False:
            assert vds1.time.dtype == np.dtype("float32")
        else:
            # decoded times carry datetime64 dtype plus CF encoding metadata
            assert vds1.time.dtype == np.dtype("<M8[ns]")
            assert "units" in vds1.time.encoding
            assert "calendar" in vds1.time.encoding

        # concatenate virtually along time
        vds = xr.concat([vds1, vds2], dim="time", coords="minimal", compat="override")

        roundtrip = self._roundtrip(vds, tmpdir, format, decode_times=decode_times)

        if decode_times is False:
            # assert identical to original dataset
            xrt.assert_identical(roundtrip, ds)
        else:
            # they are very very close! But assert_allclose doesn't seem to work on datetimes
            assert (roundtrip.time - ds.time).sum() == 0
            assert roundtrip.time.dtype == ds.time.dtype
            assert roundtrip.time.encoding["units"] == ds.time.encoding["units"]
            assert roundtrip.time.encoding["calendar"] == ds.time.encoding["calendar"]

    def test_non_dimension_coordinates(self, tmpdir, format):
        # regression test for GH issue #105

        # set up example xarray dataset containing non-dimension coordinate variables
        ds = xr.Dataset(coords={"lat": (["x", "y"], np.arange(6.0).reshape(2, 3))})

        # save it to disk as netCDF (in temporary directory)
        ds.to_netcdf(f"{tmpdir}/non_dim_coords.nc")

        vds = open_virtual_dataset(f"{tmpdir}/non_dim_coords.nc", indexes={})

        assert "lat" in vds.coords
        assert "coordinates" not in vds.attrs

        roundtrip = self._roundtrip(vds, tmpdir, format, decode_times=False)

        # assert equal to original dataset
        xrt.assert_identical(roundtrip, ds)
def test_open_scalar_variable(tmpdir):
    """Regression test for GH issue #100: scalar variables keep an empty shape."""
    ds = xr.Dataset(data_vars={"a": 0})
    path = f"{tmpdir}/scalar.nc"
    ds.to_netcdf(path)

    vds = open_virtual_dataset(path, indexes={})
    assert vds["a"].shape == ()