Avoid loading entire dataset by getting the nbytes in an array (#7356)

* Avoid instantiating entire dataset by getting the nbytes in an array

  Using `.data` accidentally tries to load the whole lazy arrays into memory. Sad.

* DOC: Add release note for bugfix.
* Add test to ensure that number of bytes of sparse array is correctly reported
* Add suggested test using InaccessibleArray
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

* Remove duplicate test

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Deepak Cherian <[email protected]>
pydata · Dec 12, 2022 · 021c73e · 021c73e
1 parent db68db6
commit 021c73e
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 2 deletions.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -34,6 +34,9 @@ Deprecations
 
 Bug fixes
 ~~~~~~~~~
+
+- Accessing the property ``.nbytes`` of a DataArray or Variable no longer
+  accidentally triggers loading the variable into memory (:pull:`7356`).
 - Allow numpy-only objects in :py:func:`where` when ``keep_attrs=True`` (:issue:`7362`, :pull:`7364`).
   By `Sam Levang <https://github.com/slevang>`_.
 - add a ``keep_attrs`` parameter to :py:meth:`Dataset.pad`, :py:meth:`DataArray.pad`,

diff --git a/xarray/core/variable.py b/xarray/core/variable.py
@@ -402,8 +402,8 @@ def nbytes(self) -> int:
         If the underlying data array does not include ``nbytes``, estimates
         the bytes consumed based on the ``size`` and ``dtype``.
         """
-        if hasattr(self.data, "nbytes"):
-            return self.data.nbytes
+        if hasattr(self._data, "nbytes"):
+            return self._data.nbytes
         else:
             return self.size * self.dtype.itemsize
 

diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py
@@ -30,6 +30,7 @@
 from xarray.core.types import QueryEngineOptions, QueryParserOptions
 from xarray.core.utils import is_scalar
 from xarray.tests import (
+    InaccessibleArray,
     ReturnItem,
     assert_allclose,
     assert_array_equal,
@@ -3277,6 +3278,21 @@ def test_from_multiindex_series_sparse(self) -> None:
 
         np.testing.assert_equal(actual_coords, expected_coords)
 
+    def test_nbytes_does_not_load_data(self) -> None:
+        array = InaccessibleArray(np.zeros((3, 3), dtype="uint8"))
+        da = xr.DataArray(array, dims=["x", "y"])
+
+        # If xarray tried to instantiate the InaccessibleArray in order to
+        # compute nbytes, the assertion below would raise an error.
+        # Instead, the byte count should come from metadata alone:
+        # 3 * 3 elements of dtype uint8 (one byte each) gives 9 bytes.
+        assert da.nbytes == 9
+        # Confirm that the result did not depend on the wrapped array
+        # exposing its own ``nbytes`` attribute, which the array interface
+        # does not require; it is mostly found on arrays that have already
+        # been converted to numpy arrays.
+        assert not hasattr(array, "nbytes")
+
     def test_to_and_from_empty_series(self) -> None:
         # GH697
         expected = pd.Series([], dtype=np.float64)