From 60d5717ba5b9a51cb031b506885a656e50199d22 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 21 May 2024 03:59:27 -1000 Subject: [PATCH] Improve performance of Series.to_numpy/to_cupy (#15792) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xref https://github.com/rapidsai/cudf/issues/11648 Essentially refactors `Frame._to_array` to short circuit some checks for a `Frame` with 1 column or `ndim == 1` ```python In [1]: import cudf In [2]: s = cudf.Series(range(10000)) In [3]: %timeit s.to_cupy() 252 µs ± 3.47 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each) # PR 419 µs ± 2.21 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each) # branch 24.06 ``` I needed to add `Frame.ndim` which will raise a `NotImplementedError` (until Frame actually becomes an ABC) Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/15792 --- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/frame.py | 85 +++++++++++--------- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/single_column_frame.py | 2 +- 5 files changed, 53 insertions(+), 40 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 6c116e740ff..e6868ae3431 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -145,7 +145,7 @@ def name(self): raise NotImplementedError @property # type: ignore - def ndim(self): # noqa: D401 + def ndim(self) -> int: # noqa: D401 """Number of dimensions of the underlying data, by definition 1.""" return 1 diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8442cf05f01..88b1ae2ea22 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1234,7 +1234,7 @@ def dtypes(self): return pd.Series(self._dtypes, dtype="object") @property - def ndim(self): + def ndim(self) -> int: """Dimension of the data. DataFrame ndim is always 2.""" return 2 diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 58932db2bda..92ca76d6ceb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6,6 +6,7 @@ import itertools import operator import pickle +import types import warnings from collections import abc from typing import ( @@ -91,6 +92,10 @@ def _dtypes(self): zip(self._data.names, (col.dtype for col in self._data.columns)) ) + @property + def ndim(self) -> int: + raise NotImplementedError() + @_cudf_nvtx_annotate def serialize(self): # TODO: See if self._data can be serialized outright @@ -417,51 +422,60 @@ def __arrow_array__(self, type=None): @_cudf_nvtx_annotate def _to_array( self, - get_column_values: Callable, - make_empty_matrix: Callable, + get_array: Callable, + module: types.ModuleType, + copy: bool, dtype: Union[Dtype, None] = None, na_value=None, - ) -> Union[cupy.ndarray, np.ndarray]: + ) -> Union[cupy.ndarray, numpy.ndarray]: # Internal function to implement to_cupy and to_numpy, which are nearly # identical except for the attribute they access to generate values. - def get_column_values_na(col): + def to_array( + col: ColumnBase, dtype: np.dtype + ) -> Union[cupy.ndarray, numpy.ndarray]: if na_value is not None: col = col.fillna(na_value) - return get_column_values(col) + array = get_array(col) + casted_array = module.asarray(array, dtype=dtype) + if copy and casted_array is array: + # Don't double copy after asarray + casted_array = casted_array.copy() + return casted_array - # Early exit for an empty Frame. ncol = self._num_columns if ncol == 0: - return make_empty_matrix( - shape=(len(self), ncol), dtype=np.dtype("float64"), order="F" + return module.empty( + shape=(len(self), ncol), + dtype=numpy.dtype("float64"), + order="F", ) if dtype is None: - dtypes = [col.dtype for col in self._data.values()] - for dtype in dtypes: - if isinstance( - dtype, - ( - cudf.ListDtype, - cudf.core.dtypes.DecimalDtype, - cudf.StructDtype, - ), - ): - raise NotImplementedError( - f"{dtype} cannot be exposed as a cupy array" - ) - dtype = find_common_type(dtypes) + if ncol == 1: + dtype = next(iter(self._data.values())).dtype + else: + dtype = find_common_type( + [col.dtype for col in self._data.values()] + ) - matrix = make_empty_matrix( - shape=(len(self), ncol), dtype=dtype, order="F" - ) - for i, col in enumerate(self._data.values()): - # TODO: col.values may fail if there is nullable data or an - # unsupported dtype. We may want to catch and provide a more - # suitable error. - matrix[:, i] = get_column_values_na(col) - return matrix + if not isinstance(dtype, numpy.dtype): + raise NotImplementedError( + f"{dtype} cannot be exposed as an array" + ) + + if self.ndim == 1: + return to_array(self._data.columns[0], dtype) + else: + matrix = module.empty( + shape=(len(self), ncol), dtype=dtype, order="F" + ) + for i, col in enumerate(self._data.values()): + # TODO: col.values may fail if there is nullable data or an + # unsupported dtype. We may want to catch and provide a more + # suitable error. + matrix[:, i] = to_array(col, dtype) + return matrix # TODO: As of now, calling cupy.asarray is _much_ faster than calling # to_cupy. We should investigate the reasons why and whether we can provide @@ -496,10 +510,9 @@ def to_cupy( cupy.ndarray """ return self._to_array( - (lambda col: col.values.copy()) - if copy - else (lambda col: col.values), - cupy.empty, + lambda col: col.values, + cupy, + copy, dtype, na_value, ) @@ -536,7 +549,7 @@ def to_numpy( ) return self._to_array( - (lambda col: col.values_host), np.empty, dtype, na_value + lambda col: col.values_host, numpy, copy, dtype, na_value ) @_cudf_nvtx_annotate diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 58a2846bf43..c149a1028a0 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -563,7 +563,7 @@ def levels(self): @property # type: ignore @_cudf_nvtx_annotate - def ndim(self): + def ndim(self) -> int: """Dimension of the data. For MultiIndex ndim is always 2.""" return 2 diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 829790007c9..d864b563208 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -77,7 +77,7 @@ def name(self, value): @property # type: ignore @_cudf_nvtx_annotate - def ndim(self): # noqa: D401 + def ndim(self) -> int: # noqa: D401 """Number of dimensions of the underlying data, by definition 1.""" return 1