Skip to content

Commit

Permalink
Use real array for data of small netCDF variables. (#5229)
Browse files Browse the repository at this point in the history
* Small netCDF variable data is real.

* Various test fixes.

* More test fixing.

* Fix printout in Mesh documentation.

* Whatsnew + doctests fix.

* Tweak whatsnew.
  • Loading branch information
pp-mo authored Apr 6, 2023
1 parent b2b4e3b commit c74d783
Show file tree
Hide file tree
Showing 17 changed files with 142 additions and 55 deletions.
16 changes: 8 additions & 8 deletions docs/src/further_topics/ugrid/operations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -430,20 +430,20 @@ creating any associated :class:`~iris.cube.Cube`\s:
node
node_dimension: 'Mesh2d_node'
node coordinates
<AuxCoord: longitude / (degrees) <lazy> shape(5,)>
<AuxCoord: latitude / (unknown) <lazy> shape(5,)>
<AuxCoord: longitude / (degrees) [...] shape(5,)>
<AuxCoord: latitude / (unknown) [...] shape(5,)>
edge
edge_dimension: 'Mesh2d_edge'
edge_node_connectivity: <Connectivity: mesh2d_edge / (unknown) <lazy> shape(6, 2)>
edge_node_connectivity: <Connectivity: mesh2d_edge / (unknown) [...] shape(6, 2)>
edge coordinates
<AuxCoord: longitude / (unknown) <lazy> shape(6,)>
<AuxCoord: latitude / (unknown) <lazy> shape(6,)>
<AuxCoord: longitude / (unknown) [...] shape(6,)>
<AuxCoord: latitude / (unknown) [...] shape(6,)>
face
face_dimension: 'Mesh2d_face'
face_node_connectivity: <Connectivity: mesh2d_face / (unknown) <lazy> shape(2, 4)>
face_node_connectivity: <Connectivity: mesh2d_face / (unknown) [...] shape(2, 4)>
face coordinates
<AuxCoord: longitude / (unknown) <lazy> shape(2,)>
<AuxCoord: latitude / (unknown) <lazy> shape(2,)>
<AuxCoord: longitude / (unknown) [...] shape(2,)>
<AuxCoord: latitude / (unknown) [...] shape(2,)>
long_name: 'my_mesh'
var_name: 'my_mesh'

Expand Down
15 changes: 7 additions & 8 deletions docs/src/userguide/real_and_lazy_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -188,17 +188,17 @@ coordinates' lazy points and bounds:

.. doctest::

>>> cube = iris.load_cube(iris.sample_data_path('hybrid_height.nc'), 'air_potential_temperature')
>>> cube = iris.load_cube(iris.sample_data_path('orca2_votemper.nc'),'votemper')

>>> dim_coord = cube.coord('model_level_number')
>>> dim_coord = cube.coord('depth')
>>> print(dim_coord.has_lazy_points())
False
>>> print(dim_coord.has_bounds())
False
True
>>> print(dim_coord.has_lazy_bounds())
False

>>> aux_coord = cube.coord('sigma')
>>> aux_coord = cube.coord('longitude')
>>> print(aux_coord.has_lazy_points())
True
>>> print(aux_coord.has_bounds())
Expand All @@ -213,17 +213,16 @@ coordinates' lazy points and bounds:
>>> print(aux_coord.has_lazy_bounds())
True

>>> derived_coord = cube.coord('altitude')
# Fetch a derived coordinate, from a different file: These can also have lazy data.
>>> cube2 = iris.load_cube(iris.sample_data_path('hybrid_height.nc'), 'air_potential_temperature')
>>> derived_coord = cube2.coord('altitude')
>>> print(derived_coord.has_lazy_points())
True
>>> print(derived_coord.has_bounds())
True
>>> print(derived_coord.has_lazy_bounds())
True

.. note::
Printing a lazy :class:`~iris.coords.AuxCoord` will realise its points and bounds arrays!


Dask Processing Options
-----------------------
Expand Down
5 changes: 4 additions & 1 deletion docs/src/whatsnew/latest.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,10 @@ This document explains the changes made to Iris for this release
🚀 Performance Enhancements
===========================

#. N/A
#. `@pp-mo`_ changed the netCDF loader to fetch data immediately from small netCDF
variables, instead of creating a dask array. This saves both time and memory.
Note that some cubes, coordinates, etc. loaded from netCDF will now have real data
where previously it was lazy. (:pull:`5229`)


🔥 Deprecations
Expand Down
65 changes: 46 additions & 19 deletions lib/iris/fileformats/netcdf/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,26 +173,53 @@ def _get_actual_dtype(cf_var):
return dummy_data.dtype


# An arbitrary variable array size, below which we will fetch real data from a variable
# rather than making a lazy array for deferred access.
# Set by experiment at roughly the point where it begins to save us memory, but actually
# mostly done for speed improvement. See https://github.com/SciTools/iris/pull/5069
_LAZYVAR_MIN_BYTES = 5000


def _get_cf_var_data(cf_var, filename):
# Get lazy chunked data out of a cf variable.
dtype = _get_actual_dtype(cf_var)

# Create cube with deferred data, but no metadata
fill_value = getattr(
cf_var.cf_data,
"_FillValue",
_thread_safe_nc.default_fillvals[cf_var.dtype.str[1:]],
)
proxy = NetCDFDataProxy(
cf_var.shape, dtype, filename, cf_var.cf_name, fill_value
)
# Get the chunking specified for the variable : this is either a shape, or
# maybe the string "contiguous".
chunks = cf_var.cf_data.chunking()
# In the "contiguous" case, pass chunks=None to 'as_lazy_data'.
if chunks == "contiguous":
chunks = None
return as_lazy_data(proxy, chunks=chunks)
"""
Get an array representing the data of a CF variable.
This is typically a lazy array based around a NetCDFDataProxy, but if the variable
is "sufficiently small", we instead fetch the data as a real (numpy) array.
The latter is especially valuable for scalar coordinates, which are otherwise
unnecessarily slow + wasteful of memory.
"""
total_bytes = cf_var.size * cf_var.dtype.itemsize
if total_bytes < _LAZYVAR_MIN_BYTES:
# Don't make a lazy array, as it will cost more memory AND more time to access.
# Instead fetch the data immediately, as a real array, and return that.
result = cf_var[:]

else:
# Get lazy chunked data out of a cf variable.
dtype = _get_actual_dtype(cf_var)

# Make a data-proxy that mimics array access and can fetch from the file.
fill_value = getattr(
cf_var.cf_data,
"_FillValue",
_thread_safe_nc.default_fillvals[cf_var.dtype.str[1:]],
)
proxy = NetCDFDataProxy(
cf_var.shape, dtype, filename, cf_var.cf_name, fill_value
)
# Get the chunking specified for the variable : this is either a shape, or
# maybe the string "contiguous".
chunks = cf_var.cf_data.chunking()
# In the "contiguous" case, pass chunks=None to 'as_lazy_data'.
if chunks == "contiguous":
chunks = None

# Return a dask array providing deferred access.
result = as_lazy_data(proxy, chunks=chunks)

return result


class _OrderedAddableList(list):
Expand Down
7 changes: 6 additions & 1 deletion lib/iris/tests/integration/netcdf/test_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import os.path
import shutil
import tempfile
from unittest import mock
import warnings

import numpy as np
Expand All @@ -34,7 +35,11 @@ def test_lazy_preserved_save(self):
fpath = tests.get_data_path(
("NetCDF", "label_and_climate", "small_FC_167_mon_19601101.nc")
)
acube = iris.load_cube(fpath, "air_temperature")
# While loading, "turn off" loading small variables as real data.
with mock.patch(
"iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES", 0
):
acube = iris.load_cube(fpath, "air_temperature")
self.assertTrue(acube.has_lazy_data())
# Also check a coord with lazy points + bounds.
self.assertTrue(acube.coord("forecast_period").has_lazy_points())
Expand Down
10 changes: 9 additions & 1 deletion lib/iris/tests/integration/test_cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
# importing anything else.
import iris.tests as tests # isort:skip

from unittest import mock

import numpy as np

import iris
Expand All @@ -23,7 +25,13 @@ def test_agg_by_aux_coord(self):
problem_test_file = tests.get_data_path(
("NetCDF", "testing", "small_theta_colpex.nc")
)
cube = iris.load_cube(problem_test_file, "air_potential_temperature")
# While loading, "turn off" loading small variables as real data.
with mock.patch(
"iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES", 0
):
cube = iris.load_cube(
problem_test_file, "air_potential_temperature"
)

# Test aggregating by aux coord, notably the `forecast_period` aux
# coord on `cube`, whose `_points` attribute is a lazy array.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@
</coord>
</coords>
<cellMethods/>
<data dtype="float64" shape="(2, 2)" state="deferred"/>
<data dtype="float64" shape="(2, 2)" state="loaded"/>
</cube>
</cubes>
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@
</coord>
</coords>
<cellMethods/>
<data dtype="float64" shape="(2, 2)" state="deferred"/>
<data dtype="float64" shape="(2, 2)" state="loaded"/>
</cube>
</cubes>
2 changes: 1 addition & 1 deletion lib/iris/tests/results/netcdf/save_load_traj.cml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,6 @@
<coord name="time"/>
</cellMethod>
</cellMethods>
<data dtype="float32" shape="(10,)" state="deferred"/>
<data dtype="float32" shape="(10,)" state="loaded"/>
</cube>
</cubes>
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@
</coord>
</coords>
<cellMethods/>
<data dtype="float64" shape="(2, 2)" state="deferred"/>
<data dtype="float64" shape="(2, 2)" state="loaded"/>
</cube>
</cubes>
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@
</coord>
</coords>
<cellMethods/>
<data dtype="float64" shape="(2, 2)" state="deferred"/>
<data dtype="float64" shape="(2, 2)" state="loaded"/>
</cube>
</cubes>
8 changes: 7 additions & 1 deletion lib/iris/tests/unit/aux_factory/test_AuxCoordFactory.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# importing anything else.
import iris.tests as tests # isort:skip

from unittest import mock

import numpy as np

import iris
Expand Down Expand Up @@ -143,7 +145,11 @@ def setUp(self):
path = tests.get_data_path(
["NetCDF", "testing", "small_theta_colpex.nc"]
)
self.cube = iris.load_cube(path, "air_potential_temperature")
# While loading, "turn off" loading small variables as real data.
with mock.patch(
"iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES", 0
):
self.cube = iris.load_cube(path, "air_potential_temperature")

def _check_lazy(self):
coords = self.cube.aux_coords + self.cube.derived_coords
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def mock_cf_av_var(monkeypatch):
long_name="wibble",
units="m2",
shape=data.shape,
size=np.prod(data.shape),
dtype=data.dtype,
__getitem__=lambda self, key: data[key],
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
build_auxiliary_coordinate`.
"""

# import iris tests first so that some things can be initialised before
# importing anything else
import iris.tests as tests # isort:skip

import contextlib
from unittest import mock

import numpy as np
Expand Down Expand Up @@ -48,6 +48,7 @@ def setUp(self):
long_name="wibble",
units="m",
shape=points.shape,
size=np.prod(points.shape),
dtype=points.dtype,
__getitem__=lambda self, key: points[key],
)
Expand Down Expand Up @@ -111,6 +112,7 @@ def _make_cf_bounds_var(self, dimension_names):
cf_name="wibble_bnds",
cf_data=cf_data,
shape=bounds.shape,
size=np.prod(bounds.shape),
dtype=bounds.dtype,
__getitem__=lambda self, key: bounds[key],
)
Expand Down Expand Up @@ -165,6 +167,7 @@ def setUp(self):
long_name="wibble",
units="m",
shape=points.shape,
size=np.prod(points.shape),
dtype=points.dtype,
__getitem__=lambda self, key: points[key],
)
Expand All @@ -176,21 +179,29 @@ def setUp(self):
cube_parts=dict(coordinates=[]),
)

@contextlib.contextmanager
def deferred_load_patch(self):
def patched__getitem__(proxy_self, keys):
if proxy_self.variable_name == self.cf_coord_var.cf_name:
return self.cf_coord_var[keys]
raise RuntimeError()

self.deferred_load_patch = mock.patch(
# Fix for deferred load, *AND* avoid loading small variable data in real arrays.
with mock.patch(
"iris.fileformats.netcdf.NetCDFDataProxy.__getitem__",
new=patched__getitem__,
)
):
# While loading, "turn off" loading small variables as real data.
with mock.patch(
"iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES", 0
):
yield

def test_scale_factor_add_offset_int(self):
self.cf_coord_var.scale_factor = 3
self.cf_coord_var.add_offset = 5

with self.deferred_load_patch:
with self.deferred_load_patch():
build_auxiliary_coordinate(self.engine, self.cf_coord_var)

coord, _ = self.engine.cube_parts["coordinates"][0]
Expand All @@ -199,7 +210,7 @@ def test_scale_factor_add_offset_int(self):
def test_scale_factor_float(self):
self.cf_coord_var.scale_factor = 3.0

with self.deferred_load_patch:
with self.deferred_load_patch():
build_auxiliary_coordinate(self.engine, self.cf_coord_var)

coord, _ = self.engine.cube_parts["coordinates"][0]
Expand All @@ -208,7 +219,7 @@ def test_scale_factor_float(self):
def test_add_offset_float(self):
self.cf_coord_var.add_offset = 5.0

with self.deferred_load_patch:
with self.deferred_load_patch():
build_auxiliary_coordinate(self.engine, self.cf_coord_var)

coord, _ = self.engine.cube_parts["coordinates"][0]
Expand Down Expand Up @@ -239,6 +250,7 @@ def setUp(self):
units="days since 1970-01-01",
calendar=None,
shape=points.shape,
size=np.prod(points.shape),
dtype=points.dtype,
__getitem__=lambda self, key: points[key],
)
Expand All @@ -251,6 +263,7 @@ def setUp(self):
cf_name="wibble_bnds",
cf_data=mock.MagicMock(chunking=mock.Mock(return_value=None)),
shape=bounds.shape,
size=np.prod(bounds.shape),
dtype=bounds.dtype,
__getitem__=lambda self, key: bounds[key],
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def mock_cf_cm_var(monkeypatch):
long_name="wibble",
units="m2",
shape=data.shape,
size=np.prod(data.shape),
dtype=data.dtype,
__getitem__=lambda self, key: data[key],
cf_measure="area",
Expand Down
Loading

0 comments on commit c74d783

Please sign in to comment.