Read small integers as float32, not float64

AKA the "I just wasted 4.6 TB of memory" patch.
pydata · Jan 19, 2018 · c4fbcea · c4fbcea
1 parent f3deb2f
commit c4fbcea
Show file tree

Hide file tree

Showing 2 changed files with 19 additions and 5 deletions.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -50,6 +50,9 @@ Enhancements
 - :py:func:`~plot.line()` learned to draw multiple lines if provided with a
   2D variable.
   By `Deepak Cherian <https://github.com/dcherian>`_.
+- Reduce memory usage when decoding a variable with a scale_factor, by
+  converting 8-bit and 16-bit integers to float32 instead of float64.
+  By `Zac Hatfield-Dodds <https://github.com/Zac-HD>`_.
 
 .. _Zarr: http://zarr.readthedocs.io/
 
@@ -66,11 +69,9 @@ Bug fixes
 - Fixed encoding of multi-dimensional coordinates in
   :py:meth:`~Dataset.to_netcdf` (:issue:`1763`).
   By `Mike Neish <https://github.com/neishm>`_.
-
 - Bug fix in open_dataset(engine='pydap') (:issue:`1775`)
   By `Keisuke Fujii <https://github.com/fujiisoup>`_.
-
-- Bug fix in vectorized assignment  (:issue:`1743`, `1744`).
+- Bug fix in vectorized assignment  (:issue:`1743`, :issue:`1744`).
   Now item assignment to :py:meth:`~DataArray.__setitem__` checks
 - Bug fix in vectorized assignment  (:issue:`1743`, :issue:`1744`).
   Now item assignment to :py:meth:`DataArray.__setitem__` checks

diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py
@@ -212,11 +212,24 @@ class CFScaleOffsetCoder(VariableCoder):
         decode_values = encoded_values * scale_factor + add_offset
     """
 
+    @staticmethod
+    def _choose_float_dtype(data, attributes):
+        # We default to en/decoding as float64, but for small integer types
+        # with no offset it's usually safe to use float32, which saves a lot
+        # of memory, e.g. for TB-scale satellite imagery collections.
+        if data.dtype.itemsize <= 2 and \
+                np.issubdtype(data.dtype, np.integer) and \
+                'add_offset' not in attributes and \
+                2 ** -23 < float(attributes.get('scale_factor', 1)) < 2 ** 8:
+            return np.float32
+        return np.float64
+
     def encode(self, variable, name=None):
         dims, data, attrs, encoding = unpack_for_encoding(variable)
 
         if 'scale_factor' in encoding or 'add_offset' in encoding:
-            data = data.astype(dtype=np.float64, copy=True)
+            dtype = self._choose_float_dtype(data, encoding)
+            data = data.astype(dtype=dtype, copy=True)
             if 'add_offset' in encoding:
                 data -= pop_to(encoding, attrs, 'add_offset', name=name)
             if 'scale_factor' in encoding:
@@ -230,7 +243,7 @@ def decode(self, variable, name=None):
         if 'scale_factor' in attrs or 'add_offset' in attrs:
             scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name)
             add_offset = pop_to(attrs, encoding, 'add_offset', name=name)
-            dtype = np.float64
+            dtype = self._choose_float_dtype(data, attrs)
             transform = partial(_scale_offset_decoding,
                                 scale_factor=scale_factor,
                                 add_offset=add_offset,