From 70406dedf93c8034ed9c17b2ebbf77629315c030 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 9 Mar 2022 11:06:07 -0600 Subject: [PATCH] Refactor `nvtx` annotations in `cudf` & `dask-cudf` (#10396) This PR consolidates all `nvtx.annotate` calls using common decorators `cudf_nvtx_annotate` & `dask_cudf_nvtx_annotate` that makes it easier to maintain and annotate APIs. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Benjamin Zaitlen (https://github.com/quasiben) URL: https://github.com/rapidsai/cudf/pull/10396 --- python/cudf/cudf/core/dataframe.py | 198 ++++++++-------- python/cudf/cudf/core/frame.py | 209 ++++++++--------- python/cudf/cudf/core/groupby/groupby.py | 5 +- python/cudf/cudf/core/index.py | 227 ++++++++----------- python/cudf/cudf/core/indexed_frame.py | 14 +- python/cudf/cudf/core/multiindex.py | 144 +++++------- python/cudf/cudf/core/series.py | 227 +++++++++---------- python/cudf/cudf/core/single_column_frame.py | 89 ++------ python/cudf/cudf/core/udf/utils.py | 8 +- python/cudf/cudf/io/csv.py | 8 +- python/cudf/cudf/io/parquet.py | 30 ++- python/cudf/cudf/io/text.py | 5 +- python/cudf/cudf/utils/utils.py | 26 +++ python/dask_cudf/dask_cudf/backends.py | 44 ++-- python/dask_cudf/dask_cudf/core.py | 74 +++--- python/dask_cudf/dask_cudf/groupby.py | 66 ++---- python/dask_cudf/dask_cudf/sorting.py | 16 +- 17 files changed, 625 insertions(+), 765 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index e687c274d2f..9d32f95cba3 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -87,7 +87,11 @@ min_scalar_type, numeric_normalize_types, ) -from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api +from cudf.utils.utils import ( + GetAttrGetItemMixin, + _cudf_nvtx_annotate, + _external_only_api, +) T = TypeVar("T", bound="DataFrame") @@ -126,7 +130,7 @@ def __setitem__(self, key, value): key = (key, slice(None)) return self._setitem_tuple_arg(key, value) - @annotate("_CAN_DOWNCAST_TO_SERIES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _can_downcast_to_series(self, df, arg): """ This method encapsulates the logic used @@ -167,7 +171,7 @@ def _can_downcast_to_series(self, df, arg): return True return False - @annotate("_DOWNCAST_TO_SERIES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _downcast_to_series(self, df, arg): """ "Downcast" from a DataFrame to a Series @@ -209,11 +213,11 @@ class _DataFrameLocIndexer(_DataFrameIndexer): For selection by label. """ - @annotate("_GETITEM_SCALAR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _getitem_scalar(self, arg): return self._frame[arg[1]].loc[arg[0]] - @annotate("LOC_GETITEM", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _getitem_tuple_arg(self, arg): from uuid import uuid4 @@ -296,7 +300,7 @@ def _getitem_tuple_arg(self, arg): return self._downcast_to_series(df, arg) return df - @annotate("LOC_SETITEM", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _setitem_tuple_arg(self, key, value): if ( isinstance(self._frame.index, MultiIndex) @@ -363,7 +367,7 @@ class _DataFrameIlocIndexer(_DataFrameIndexer): For selection by index. """ - @annotate("ILOC_GETITEM", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _getitem_tuple_arg(self, arg): # Iloc Step 1: # Gather the columns specified by the second tuple arg @@ -415,7 +419,7 @@ def _getitem_tuple_arg(self, arg): df._index = as_index(self._frame.index[arg[0]]) return df - @annotate("ILOC_SETITEM", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _setitem_tuple_arg(self, key, value): columns = self._frame._get_columns_by_index(key[1]) @@ -528,7 +532,7 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): _loc_indexer_type = _DataFrameLocIndexer _iloc_indexer_type = _DataFrameIlocIndexer - @annotate("DATAFRAME_INIT", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def __init__( self, data=None, index=None, columns=None, dtype=None, nan_as_null=True ): @@ -648,9 +652,7 @@ def __init__( if dtype: self._data = self.astype(dtype)._data - @annotate( - "DATAFRAME_INIT_FROM_SERIES_LIST", color="blue", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _init_from_series_list(self, data, columns, index): if index is None: # When `index` is `None`, the final index of @@ -750,9 +752,7 @@ def _init_from_series_list(self, data, columns, index): ) self._data = self._data.select_by_label(columns) - @annotate( - "DATAFRAME_INIT_FROM_LIST_LIKE", color="blue", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _init_from_list_like(self, data, index=None, columns=None): if index is None: index = RangeIndex(start=0, stop=len(data)) @@ -789,9 +789,7 @@ def _init_from_list_like(self, data, index=None, columns=None): self.columns = columns - @annotate( - "DATAFRAME_INIT_FROM_DICT_LIKE", color="blue", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _init_from_dict_like( self, data, index=None, columns=None, nan_as_null=None ): @@ -865,11 +863,7 @@ def _from_data( return out @staticmethod - @annotate( - "DATAFRAME_ALIGN_INPUT_SERIES_INDICES", - color="blue", - domain="cudf_python", - ) + @_cudf_nvtx_annotate def _align_input_series_indices(data, index): data = data.copy() @@ -1021,7 +1015,7 @@ def __setattr__(self, key, col): else: super().__setattr__(key, col) - @annotate("DATAFRAME_GETITEM", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def __getitem__(self, arg): """ If *arg* is a ``str`` or ``int`` type, return the column Series. @@ -1105,7 +1099,7 @@ def __getitem__(self, arg): f"__getitem__ on type {type(arg)} is not supported" ) - @annotate("DATAFRAME_SETITEM", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def __setitem__(self, arg, value): """Add/set column by *arg or DataFrame*""" if isinstance(arg, DataFrame): @@ -1226,7 +1220,7 @@ def __setitem__(self, arg, value): def __delitem__(self, name): self._drop_column(name) - @annotate("DATAFRAME_SLICE", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _slice(self: T, arg: slice) -> T: """ _slice : slice the frame as per the arg @@ -1328,13 +1322,13 @@ def _slice(self: T, arg: slice) -> T: result._set_column_names_like(self) return result - @annotate("DATAFRAME_MEMORY_USAGE", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def memory_usage(self, index=True, deep=False): return Series( {str(k): v for k, v in super().memory_usage(index, deep).items()} ) - @annotate("DATAFRAME_ARRAY_FUNCTION", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def __array_function__(self, func, types, args, kwargs): if "out" in kwargs or not all( issubclass(t, (Series, DataFrame)) for t in types @@ -1363,7 +1357,7 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented # The _get_numeric_data method is necessary for dask compatibility. - @annotate("DATAFRAME_GET_NUMERIC_DATA", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _get_numeric_data(self): """Return a dataframe with only numeric data types""" columns = [ @@ -1373,7 +1367,7 @@ def _get_numeric_data(self): ] return self[columns] - @annotate("DATAFRAME_ASSIGN", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def assign(self, **kwargs): """ Assign columns to DataFrame from keyword arguments. @@ -1401,7 +1395,7 @@ def assign(self, **kwargs): return new_df @classmethod - @annotate("CONCAT", color="orange", domain="cudf_python") + @_cudf_nvtx_annotate def _concat( cls, objs, axis=0, join="outer", ignore_index=False, sort=False ): @@ -1774,12 +1768,12 @@ def _get_renderable_dataframe(self): return output - @annotate("DATAFRAME_REPR", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def __repr__(self): output = self._get_renderable_dataframe() return self._clean_renderable_dataframe(output) - @annotate("DATAFRAME_REPR_HTML", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _repr_html_(self): lines = ( self._get_renderable_dataframe() @@ -1796,13 +1790,11 @@ def _repr_html_(self): lines.append("") return "\n".join(lines) - @annotate("DATAFRAME_REPR_LATEX", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _repr_latex_(self): return self._get_renderable_dataframe().to_pandas()._repr_latex_() - @annotate( - "DATAFRAME_GET_COLUMNS_BY_LABEL", color="blue", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _get_columns_by_label(self, labels, downcast=False): """ Return columns of dataframe by `labels` @@ -1921,7 +1913,7 @@ def _make_operands_and_index_for_binop( return operands, lhs._index - @annotate("DATAFRAME_UPDATE", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def update( self, other, @@ -2016,11 +2008,11 @@ def update( self._mimic_inplace(source_df, inplace=True) - @annotate("DATAFRAME_ITER", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def __iter__(self): return iter(self._column_names) - @annotate("DATAFRAME_ITERITEMS", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def iteritems(self): """Iterate over column names and series pairs""" warnings.warn( @@ -2030,13 +2022,13 @@ def iteritems(self): ) return self.items() - @annotate("DATAFRAME_ITEMS", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def items(self): """Iterate over column names and series pairs""" for k in self: yield (k, self[k]) - @annotate("DATAFRAME_EQUALS", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def equals(self, other, **kwargs): ret = super().equals(other) # If all other checks matched, validate names. @@ -2069,13 +2061,13 @@ def at(self): "index is absolutely necessary. For checking if the columns are a " "MultiIndex, use _data.multiindex." ) - @annotate("DATAFRAME_COLUMNS_GETTER", color="yellow", domain="cudf_python") + @_cudf_nvtx_annotate def columns(self): """Returns a tuple of columns""" return self._data.to_pandas_index() @columns.setter # type: ignore - @annotate("DATAFRAME_COLUMNS_SETTER", color="yellow", domain="cudf_python") + @_cudf_nvtx_annotate def columns(self, columns): if isinstance(columns, cudf.BaseIndex): columns = columns.to_pandas() @@ -2110,7 +2102,7 @@ def _set_column_names_like(self, other): other._data.names, other._data.multiindex, other._data.level_names ) - @annotate("DATAFRAME_REINDEX_INTERNAL", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def _reindex( self, columns, dtypes=None, deep=False, index=None, inplace=False ): @@ -2187,7 +2179,7 @@ def _reindex( return self._mimic_inplace(result, inplace=inplace) - @annotate("DATAFRAME_REINDEX", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def reindex( self, labels=None, axis=None, index=None, columns=None, copy=True ): @@ -2266,7 +2258,7 @@ def reindex( inplace=False, ) - @annotate("DATAFRAME_SET_INDEX", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def set_index( self, keys, @@ -2527,7 +2519,7 @@ def reset_index( inplace=inplace, ) - @annotate("DATAFRAME_INSERT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def insert(self, loc, name, value, nan_as_null=None): """Add a column to DataFrame at the index specified by loc. @@ -2551,7 +2543,7 @@ def insert(self, loc, name, value, nan_as_null=None): ignore_index=False, ) - @annotate("DATAFRAME__INSERT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): """ Same as `insert`, with additional `ignore_index` param. @@ -2679,7 +2671,7 @@ def diff(self, periods=1, axis=0): return self - self.shift(periods=periods) - @annotate("DATAFRAME_DROP_DUPLICATES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def drop_duplicates( self, subset=None, keep="first", inplace=False, ignore_index=False ): @@ -2757,14 +2749,14 @@ def drop_duplicates( return self._mimic_inplace(outdf, inplace=inplace) - @annotate("DATAFRAME_POP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def pop(self, item): """Return a column and drop it from the DataFrame.""" popped = self[item] del self[item] return popped - @annotate("DATAFRAME_RENAME", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rename( self, mapper=None, @@ -2908,7 +2900,7 @@ def rename( else: return out.copy(deep=copy) - @annotate("DATAFRAME_ADD_PREFIX", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def add_prefix(self, prefix): out = self.copy(deep=True) out.columns = [ @@ -2916,7 +2908,7 @@ def add_prefix(self, prefix): ] return out - @annotate("DATAFRAME_ADD_SUFFIX", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def add_suffix(self, suffix): out = self.copy(deep=True) out.columns = [ @@ -2924,7 +2916,7 @@ def add_suffix(self, suffix): ] return out - @annotate("DATAFRAME_AGG", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def agg(self, aggs, axis=None): """ Aggregate using one or more operations over the specified axis. @@ -3056,7 +3048,7 @@ def agg(self, aggs, axis=None): else: raise ValueError("argument must be a string, list or dict") - @annotate("DATAFRAME_NLARGEST", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def nlargest(self, n, columns, keep="first"): """Get the rows of the DataFrame sorted by the n largest value of *columns* @@ -3188,7 +3180,7 @@ def nsmallest(self, n, columns, keep="first"): """ return self._n_largest_or_smallest(False, n, columns, keep) - @annotate("DATAFRAME_TRANSPOSE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def transpose(self): """Transpose index and columns. @@ -3219,7 +3211,7 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) - @annotate("DATAFRAME_MELT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def melt(self, **kwargs): """Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -3249,7 +3241,7 @@ def melt(self, **kwargs): return melt(self, **kwargs) - @annotate("DATAFRAME_JOIN", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def merge( self, right, @@ -3389,7 +3381,7 @@ def merge( ) return gdf_result - @annotate("JOIN", color="blue", domain="cudf_python") + @_cudf_nvtx_annotate def join( self, other, on=None, how="left", lsuffix="", rsuffix="", sort=False, ): @@ -3431,7 +3423,7 @@ def join( ) return df - @annotate("DATAFRAME_GROUPBY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @copy_docstring(DataFrameGroupBy) def groupby( self, @@ -3571,7 +3563,7 @@ def query(self, expr, local_dict=None): boolmask = queryutils.query_execute(self, expr, callenv) return self._apply_boolean_mask(boolmask) - @annotate("DATAFRAME_APPLY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def apply( self, func, axis=1, raw=False, result_type=None, args=(), **kwargs ): @@ -3720,7 +3712,7 @@ def apply( return self._apply(func, _get_row_kernel, *args, **kwargs) - @annotate("DATAFRAME_APPLY_ROWS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @applyutils.doc_apply() def apply_rows( self, @@ -3799,7 +3791,7 @@ def apply_rows( cache_key=cache_key, ) - @annotate("DATAFRAME_APPLY_CHUNKS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @applyutils.doc_applychunks() def apply_chunks( self, @@ -3867,9 +3859,7 @@ def apply_chunks( tpb=tpb, ) - @annotate( - "DATAFRAME_PARTITION_BY_HASH", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def partition_by_hash(self, columns, nparts, keep_index=True): """Partition the dataframe by the hashed value of data in *columns*. @@ -4205,7 +4195,7 @@ def _sizeof_fmt(num, size_qualifier): cudf.utils.ioutils.buffer_write_lines(buf, lines) - @annotate("DATAFRAME_DESCRIBE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @docutils.doc_describe() def describe( self, @@ -4265,7 +4255,7 @@ def describe( sort=False, ) - @annotate("DATAFRAME_TO_PANDAS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_pandas(self, nullable=False, **kwargs): """ Convert to a Pandas DataFrame. @@ -4340,7 +4330,7 @@ def to_pandas(self, nullable=False, **kwargs): return out_df @classmethod - @annotate("DATAFRAME_FROM_PANDAS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def from_pandas(cls, dataframe, nan_as_null=None): """ Convert from a Pandas DataFrame. @@ -4410,7 +4400,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): return result @classmethod - @annotate("DATAFRAME_FROM_ARROW", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def from_arrow(cls, table): """ Convert from PyArrow Table to DataFrame. @@ -4466,7 +4456,7 @@ def from_arrow(cls, table): return out - @annotate("DATAFRAME_TO_ARROW", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_arrow(self, preserve_index=True): """ Convert to a PyArrow Table. @@ -4548,7 +4538,7 @@ def to_arrow(self, preserve_index=True): return out.replace_schema_metadata(metadata) - @annotate("DATAFRAME_TO_RECORDS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_records(self, index=True): """Convert to a numpy recarray @@ -4572,7 +4562,7 @@ def to_records(self, index=True): return ret @classmethod - @annotate("DATAFRAME_FROM_RECORDS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def from_records(cls, data, index=None, columns=None, nan_as_null=False): """ Convert structured or record ndarray to DataFrame. @@ -4634,9 +4624,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): return df @classmethod - @annotate( - "DATAFRAME_FROM_ARRAYS_INTERNAL", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): """Convert a numpy/cupy array to DataFrame. @@ -4696,7 +4684,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): df._index = as_index(index) return df - @annotate("DATAFRAME_INTERPOLATE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def interpolate( self, method="linear", @@ -4727,7 +4715,7 @@ def interpolate( **kwargs, ) - @annotate("DATAFRAME_QUANTILE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def quantile( self, q=0.5, @@ -4843,7 +4831,7 @@ def quantile( result.index = q return result - @annotate("DATAFRAME_QUANTILES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def quantiles(self, q=0.5, interpolation="nearest"): """ Return values at the given quantile. @@ -4883,7 +4871,7 @@ def quantiles(self, q=0.5, interpolation="nearest"): result.index = as_index(q) return result - @annotate("DATAFRAME_ISIN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def isin(self, values): """ Whether each element in the DataFrame is contained in values. @@ -5021,9 +5009,7 @@ def make_false_column_like_self(): # # Stats # - @annotate( - "DATAFRAME_PREPARE_FOR_ROWWISE_OP", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _prepare_for_rowwise_op(self, method, skipna): """Prepare a DataFrame for CuPy-based row-wise operations.""" @@ -5073,7 +5059,7 @@ def _prepare_for_rowwise_op(self, method, skipna): coerced = coerced.astype("int64", copy=False) return coerced, mask, common_dtype - @annotate("DATAFRAME_COUNT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def count(self, axis=0, level=None, numeric_only=False, **kwargs): """ Count ``non-NA`` cells for each column or row. @@ -5120,7 +5106,7 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): "columns": 1, } - @annotate("DATAFRAME_REDUCE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _reduce( self, op, axis=None, level=None, numeric_only=None, **kwargs, ): @@ -5148,7 +5134,7 @@ def _reduce( elif axis == 1: return self._apply_cupy_method_axis_1(op, **kwargs) - @annotate("DATAFRAME_SCAN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _scan( self, op, axis=None, *args, **kwargs, ): @@ -5159,7 +5145,7 @@ def _scan( elif axis == 1: return self._apply_cupy_method_axis_1(op, **kwargs) - @annotate("DATAFRAME_MODE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def mode(self, axis=0, numeric_only=False, dropna=True): """ Get the mode(s) of each element along the selected axis. @@ -5259,7 +5245,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True): return df - @annotate("DATAFRAME_KURTOSIS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -5268,7 +5254,7 @@ def kurtosis( axis, skipna, level, numeric_only, **kwargs ) - @annotate("DATAFRAME_SKEW", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def skew( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -5277,17 +5263,17 @@ def skew( axis, skipna, level, numeric_only, **kwargs ) - @annotate("DATAFRAME_ALL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).all(axis, skipna, level, **kwargs) - @annotate("DATAFRAME_ANY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).any(axis, skipna, level, **kwargs) - @annotate("DATAFRAME_APPLY_CUPY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _apply_cupy_method_axis_1(self, method, *args, **kwargs): # This method uses cupy to perform scans and reductions along rows of a # DataFrame. Since cuDF is designed around columnar storage and @@ -5388,7 +5374,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result_df._set_column_names_like(prepared) return result_df - @annotate("DATAFRAME_COLUMNS_VIEW", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _columns_view(self, columns): """ Return a subset of the DataFrame's columns as a view. @@ -5397,7 +5383,7 @@ def _columns_view(self, columns): {col: self._data[col] for col in columns}, index=self.index ) - @annotate("DATAFRAME_SELECT_DTYPES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def select_dtypes(self, include=None, exclude=None): """Return a subset of the DataFrame’s columns based on the column dtypes. @@ -5584,7 +5570,7 @@ def to_orc(self, fname, compression=None, *args, **kwargs): orc.to_orc(self, fname, compression, *args, **kwargs) - @annotate("DATAFRAME_STACK", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def stack(self, level=-1, dropna=True): """Stack the prescribed level(s) from columns to index @@ -5646,7 +5632,7 @@ def stack(self, level=-1, dropna=True): else: return result - @annotate("DATAFRAME_COV", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def cov(self, **kwargs): """Compute the covariance matrix of a DataFrame. @@ -5665,7 +5651,7 @@ def cov(self, **kwargs): df._set_column_names_like(self) return df - @annotate("DATAFRAME_CORR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def corr(self): """Compute the correlation matrix of a DataFrame.""" corr = cupy.corrcoef(self.values, rowvar=False) @@ -5674,7 +5660,7 @@ def corr(self): df._set_column_names_like(self) return df - @annotate("DATAFRAME_TO_STRUCT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_struct(self, name=None): """ Return a struct Series composed of the columns of the DataFrame. @@ -5707,7 +5693,7 @@ def to_struct(self, name=None): name=name, ) - @annotate("DATAFRAME_KEYS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def keys(self): """ Get the columns. @@ -5755,7 +5741,7 @@ def iterrows(self): "if you wish to iterate over each row." ) - @annotate("DATAFRAME_APPEND", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def append( self, other, ignore_index=False, verify_integrity=False, sort=False ): @@ -5898,7 +5884,7 @@ def append( other, ignore_index, verify_integrity, sort ) - @annotate("DATAFRAME_PIVOT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @copy_docstring(reshape.pivot) def pivot(self, index, columns, values=None): @@ -5906,14 +5892,14 @@ def pivot(self, index, columns, values=None): self, index=index, columns=columns, values=values ) - @annotate("DATAFRAME_UNSTACK", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @copy_docstring(reshape.unstack) def unstack(self, level=-1, fill_value=None): return cudf.core.reshape.unstack( self, level=level, fill_value=fill_value ) - @annotate("DATAFRAME_EXPLODE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def explode(self, column, ignore_index=False): """ Transform each element of a list-like to a row, replicating index @@ -6180,7 +6166,7 @@ def func(left, right, output): ) -@annotate("CUDF_FROM_PANDAS", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def from_pandas(obj, nan_as_null=None): """ Convert certain Pandas objects into the cudf equivalent. @@ -6301,7 +6287,7 @@ def from_pandas(obj, nan_as_null=None): ) -@annotate("CUDF_MERGE", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def merge(left, right, *args, **kwargs): return left.merge(right, *args, **kwargs) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index aba2b6d1a11..6d5a3c14674 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -24,7 +24,6 @@ import numpy as np import pandas as pd import pyarrow as pa -from nvtx import annotate import cudf from cudf import _lib as libcudf @@ -52,6 +51,7 @@ from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import find_common_type +from cudf.utils.utils import _cudf_nvtx_annotate T = TypeVar("T", bound="Frame") @@ -185,7 +185,7 @@ def deserialize(cls, header, frames): return cls_deserialize._from_data(dict(zip(column_names, columns))) @classmethod - @annotate("FRAME_FROM_DATA", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _from_data( cls, data: MutableMapping, @@ -196,7 +196,7 @@ def _from_data( return obj @classmethod - @annotate("FRAME_FROM_COLUMNS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _from_columns( cls, columns: List[ColumnBase], @@ -227,9 +227,7 @@ def _from_columns( return cls._from_data(data, index) - @annotate( - "FRAME_FROM_COLUMNS_LIKE_SELF", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _from_columns_like_self( self, columns: List[ColumnBase], @@ -427,7 +425,7 @@ def memory_usage(self, deep=False): def __len__(self): return self._num_rows - @annotate("FRAME_COPY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def copy(self: T, deep: bool = True) -> T: """ Make a copy of this object's indices and data. @@ -513,6 +511,7 @@ def copy(self: T, deep: bool = True) -> T: return new_frame + @_cudf_nvtx_annotate def astype(self, dtype, copy=False, **kwargs): result = {} for col_name, col in self._data.items(): @@ -524,7 +523,7 @@ def astype(self, dtype, copy=False, **kwargs): return result - @annotate("FRAME_EQUALS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def equals(self, other, **kwargs): """ Test whether two objects contain the same elements. @@ -607,9 +606,7 @@ def equals(self, other, **kwargs): else: return self._index.equals(other._index) - @annotate( - "FRAME_GET_COLUMNS_BY_LABEL", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _get_columns_by_label(self, labels, downcast=False): """ Returns columns of the Frame specified by `labels` @@ -617,9 +614,7 @@ def _get_columns_by_label(self, labels, downcast=False): """ return self._data.select_by_label(labels) - @annotate( - "FRAME_GET_COLUMNS_BY_INDEX", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _get_columns_by_index(self, indices): """ Returns columns of the Frame specified by `labels` @@ -730,7 +725,7 @@ def get_column_values_na(col): # particular, we need to benchmark how much of the overhead is coming from # (potentially unavoidable) local copies in to_cupy and how much comes from # inefficiencies in the implementation. - @annotate("FRAME_TO_CUPY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_cupy( self, dtype: Union[Dtype, None] = None, @@ -765,7 +760,7 @@ def to_cupy( na_value, ) - @annotate("FRAME_TO_NUMPY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_numpy( self, dtype: Union[Dtype, None] = None, @@ -800,7 +795,7 @@ def to_numpy( (lambda col: col.values_host), np.empty, dtype, na_value ) - @annotate("FRAME_CLIP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def clip(self, lower=None, upper=None, inplace=False, axis=1): """ Trim values at input threshold(s). @@ -928,7 +923,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): return self._mimic_inplace(output, inplace=inplace) - @annotate("FRAME_WHERE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def where(self, cond, other=None, inplace=False): """ Replace values where the condition is False. @@ -987,7 +982,7 @@ def where(self, cond, other=None, inplace=False): frame=self, cond=cond, other=other, inplace=inplace ) - @annotate("FRAME_MASK", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def mask(self, cond, other=None, inplace=False): """ Replace values where the condition is True. @@ -1049,7 +1044,7 @@ def mask(self, cond, other=None, inplace=False): return self.where(cond=~cond, other=other, inplace=inplace) - @annotate("FRAME_PIPE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def pipe(self, func, *args, **kwargs): """ Apply ``func(self, *args, **kwargs)``. @@ -1097,7 +1092,7 @@ def pipe(self, func, *args, **kwargs): """ return cudf.core.common.pipe(self, func, *args, **kwargs) - @annotate("SCATTER_BY_MAP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def scatter_by_map( self, map_index, map_size=None, keep_index=True, **kwargs ): @@ -1180,7 +1175,7 @@ def scatter_by_map( return result - @annotate("FRAME_FILLNA", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): @@ -1335,14 +1330,14 @@ def fillna( inplace=inplace, ) - @annotate("FRAME_DROP_COLUMN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _drop_column(self, name): """Drop a column by *name*""" if name not in self._data: raise KeyError(f"column '{name}' does not exist") del self._data[name] - @annotate("FRAME_DROPNA_COLUMNS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _drop_na_columns(self, how="any", subset=None, thresh=None): """ Drop columns containing nulls @@ -1374,7 +1369,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): return self[out_cols] - @annotate("FRAME_INTERPOLATE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def interpolate( self, method="linear", @@ -1444,7 +1439,7 @@ def interpolate( else result._gather(perm_sort.argsort()) ) - @annotate("FRAME_QUANTILES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _quantiles( self, q, @@ -1477,7 +1472,7 @@ def _quantiles( result._copy_type_metadata(self) return result - @annotate("FRAME_RANK", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rank( self, axis=0, @@ -1554,7 +1549,7 @@ def rank( return self._from_data(data, index).astype(np.float64) - @annotate("FRAME_REPEAT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def repeat(self, repeats, axis=None): """Repeats elements consecutively. @@ -1644,7 +1639,7 @@ def repeat(self, repeats, axis=None): result._copy_type_metadata(self) return result - @annotate("FRAME_SHIFT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def shift(self, periods=1, freq=None, axis=0, fill_value=None): """Shift values by `periods` positions.""" axis = self._get_axis_from_axis_arg(axis) @@ -1661,7 +1656,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): ) @classmethod - @annotate("FRAME_FROM_ARROW", color="orange", domain="cudf_python") + @_cudf_nvtx_annotate def from_arrow(cls, data): """Convert from PyArrow Table to Frame @@ -1801,7 +1796,7 @@ def from_arrow(cls, data): return cls._from_data({name: result[name] for name in column_names}) - @annotate("FRAME_TO_ARROW", color="orange", domain="cudf_python") + @_cudf_nvtx_annotate def to_arrow(self): """ Convert to arrow Table @@ -1837,7 +1832,7 @@ def _positions_from_column_names(self, column_names): if name in set(column_names) ] - @annotate("FRAME_REPLACE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def replace( self, to_replace=None, @@ -2124,7 +2119,7 @@ def _copy_type_metadata( return self - @annotate("FRAME_ISNULL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def isnull(self): """ Identify missing values. @@ -2206,7 +2201,7 @@ def isnull(self): # Alias for isnull isna = isnull - @annotate("FRAME_NOTNULL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def notnull(self): """ Identify non-missing values. @@ -2288,7 +2283,7 @@ def notnull(self): # Alias for notnull notna = notnull - @annotate("FRAME_INTERLEAVE_COLUMNS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def interleave_columns(self): """ Interleave Series columns of a table into a single column. @@ -2328,7 +2323,7 @@ def interleave_columns(self): return result - @annotate("FRAME_TILE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def tile(self, count): """ Repeats the rows from `self` DataFrame `count` times to form a @@ -2358,7 +2353,7 @@ def tile(self, count): result._copy_type_metadata(self) return result - @annotate("FRAME_SEARCHSORTED", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def searchsorted( self, values, side="left", ascending=True, na_position="last" ): @@ -2443,7 +2438,7 @@ def searchsorted( else: return result - @annotate("FRAME_ARGSORT", color="yellow", domain="cudf_python") + @_cudf_nvtx_annotate def argsort( self, by=None, @@ -2546,7 +2541,7 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): return libcudf.sort.order_by(to_sort, ascending, na_position) - @annotate("FRAME_SIN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def sin(self): """ Get Trigonometric sine, element-wise. @@ -2613,7 +2608,7 @@ def sin(self): return self._unaryop("sin") - @annotate("FRAME_COS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def cos(self): """ Get Trigonometric cosine, element-wise. @@ -2680,7 +2675,7 @@ def cos(self): return self._unaryop("cos") - @annotate("FRAME_TAN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def tan(self): """ Get Trigonometric tangent, element-wise. @@ -2747,7 +2742,7 @@ def tan(self): return self._unaryop("tan") - @annotate("FRAME_ASIN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def asin(self): """ Get Trigonometric inverse sine, element-wise. @@ -2803,7 +2798,7 @@ def asin(self): return self._unaryop("asin") - @annotate("FRAME_ACOS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def acos(self): """ Get Trigonometric inverse cosine, element-wise. @@ -2867,7 +2862,7 @@ def acos(self): result = result.mask((result < 0) | (result > np.pi + 1)) return result - @annotate("FRAME_ATAN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def atan(self): """ Get Trigonometric inverse tangent, element-wise. @@ -2933,7 +2928,7 @@ def atan(self): return self._unaryop("atan") - @annotate("FRAME_EXP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def exp(self): """ Get the exponential of all elements, element-wise. @@ -3001,7 +2996,7 @@ def exp(self): return self._unaryop("exp") - @annotate("FRAME_LOG", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def log(self): """ Get the natural logarithm of all elements, element-wise. @@ -3068,7 +3063,7 @@ def log(self): return self._unaryop("log") - @annotate("FRAME_SQRT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def sqrt(self): """ Get the non-negative square-root of all elements, element-wise. @@ -3129,7 +3124,7 @@ def sqrt(self): return self._unaryop("sqrt") - @annotate("FRAME_ABS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def abs(self): """ Return a Series/DataFrame with absolute numeric value of each element. @@ -3156,7 +3151,7 @@ def abs(self): return self._unaryop("abs") # Rounding - @annotate("FRAME_CEIL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def ceil(self): """ Rounds each value upward to the smallest integral value not less @@ -3193,7 +3188,7 @@ def ceil(self): return self._unaryop("ceil") - @annotate("FRAME_FLOOR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def floor(self): """Rounds each value downward to the largest integral value not greater than the original. @@ -3233,7 +3228,7 @@ def floor(self): return self._unaryop("floor") - @annotate("FRAME_SCALE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def scale(self): """ Scale values to [0, 1] in float64 @@ -3268,7 +3263,7 @@ def scale(self): scaled._index = self._index.copy(deep=False) return scaled - @annotate("FRAME_INTERNAL_MERGE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _merge( self, right, @@ -3312,7 +3307,7 @@ def _merge( suffixes=suffixes, ).perform_merge() - @annotate("FRAME_IS_SORTED", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _is_sorted(self, ascending=None, null_position=None): """ Returns a boolean indicating whether the data of the Frame are sorted @@ -3343,7 +3338,7 @@ def _is_sorted(self, ascending=None, null_position=None): self, ascending=ascending, null_position=null_position ) - @annotate("FRAME_SPLIT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _split(self, splits): """Split a frame with split points in ``splits``. Returns a list of Frames of length `len(splits) + 1`. @@ -3358,7 +3353,7 @@ def _split(self, splits): for split_idx in range(len(splits) + 1) ] - @annotate("FRAME_ENCODE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _encode(self): data, index, indices = libcudf.transform.table_encode(self) for name, col in data.items(): @@ -3366,7 +3361,7 @@ def _encode(self): keys = self.__class__._from_data(data, index) return keys, indices - @annotate("FRAME_UNARYOP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _unaryop(self, op): data_columns = (col.unary_operator(op) for col in self._columns) return self.__class__._from_data( @@ -3398,7 +3393,7 @@ def _binaryop( ) @classmethod - @annotate("FRAME_COLWISE_BINOP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _colwise_binop( cls, operands: Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], @@ -3636,7 +3631,7 @@ def _apply_cupy_ufunc_to_operands( data[i][name] = as_column(out).set_mask(mask) return data - @annotate("FRAME_DOT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def dot(self, other, reflect=False): """ Get dot product of frame and other, (binary operator `dot`). @@ -3738,7 +3733,7 @@ def _reduce(self, *args, **kwargs): f"Reductions are not supported for objects of type {type(self)}." ) - @annotate("FRAME_MIN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def min( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): @@ -3784,7 +3779,7 @@ def min( **kwargs, ) - @annotate("FRAME_MAX", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def max( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): @@ -3830,7 +3825,7 @@ def max( **kwargs, ) - @annotate("FRAME_SUM", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def sum( self, axis=None, @@ -3889,7 +3884,7 @@ def sum( **kwargs, ) - @annotate("FRAME_PRODUCT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def product( self, axis=None, @@ -3954,7 +3949,7 @@ def product( # Alias for pandas compatibility. prod = product - @annotate("FRAME_MEAN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def mean( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -3999,7 +3994,7 @@ def mean( **kwargs, ) - @annotate("FRAME_STD", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def std( self, axis=None, @@ -4056,7 +4051,7 @@ def std( **kwargs, ) - @annotate("FRAME_VAR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def var( self, axis=None, @@ -4112,7 +4107,7 @@ def var( **kwargs, ) - @annotate("FRAME_KURTOSIS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4181,7 +4176,7 @@ def kurt( **kwargs, ) - @annotate("FRAME_SKEW", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def skew( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4239,7 +4234,7 @@ def skew( **kwargs, ) - @annotate("FRAME_ALL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def all(self, axis=0, skipna=True, level=None, **kwargs): """ Return whether all elements are True in DataFrame. @@ -4275,7 +4270,7 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): "all", axis=axis, skipna=skipna, level=level, **kwargs, ) - @annotate("FRAME_ANY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def any(self, axis=0, skipna=True, level=None, **kwargs): """ Return whether any elements is True in DataFrame. @@ -4311,7 +4306,7 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): "any", axis=axis, skipna=skipna, level=level, **kwargs, ) - @annotate("FRAME_SUM_OF_SQUARES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def sum_of_squares(self, dtype=None): """Return the sum of squares of values. @@ -4335,7 +4330,7 @@ def sum_of_squares(self, dtype=None): """ return self._reduce("sum_of_squares", dtype=dtype) - @annotate("FRAME_MEDIAN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def median( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4381,7 +4376,7 @@ def median( ) # Scans - @annotate("FRAME_SCAN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _scan(self, op, axis=None, skipna=True): """ Return {op_name} of the {cls}. @@ -4463,7 +4458,7 @@ def _scan(self, op, axis=None, skipna=True): # for Index._from_data and simplify. return self._from_data(results, index=self._index) - @annotate("FRAME_TO_JSON", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @ioutils.doc_to_json() def to_json(self, path_or_buf=None, *args, **kwargs): """{docstring}""" @@ -4472,21 +4467,21 @@ def to_json(self, path_or_buf=None, *args, **kwargs): self, path_or_buf=path_or_buf, *args, **kwargs ) - @annotate("FRAME_TO_HDF", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @ioutils.doc_to_hdf() def to_hdf(self, path_or_buf, key, *args, **kwargs): """{docstring}""" cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) - @annotate("FRAME_TO_DLPACK", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @ioutils.doc_to_dlpack() def to_dlpack(self): """{docstring}""" return cudf.io.dlpack.to_dlpack(self) - @annotate("FRAME_TO_STRING", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_string(self): """ Convert to string @@ -4512,15 +4507,15 @@ def to_string(self): def __str__(self): return self.to_string() - @annotate("FRAME_DEEP_COPY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __deepcopy__(self, memo): return self.copy(deep=True) - @annotate("FRAME_COPY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __copy__(self): return self.copy(deep=False) - @annotate("FRAME_HEAD", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def head(self, n=5): """ Return the first `n` rows. @@ -4604,7 +4599,7 @@ def head(self, n=5): """ return self.iloc[:n] - @annotate("FRAME_TAIL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def tail(self, n=5): """ Returns the last n rows as a new DataFrame or Series @@ -4636,7 +4631,7 @@ def tail(self, n=5): return self.iloc[-n:] - @annotate("FRAME_ROLLING", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @copy_docstring(Rolling) def rolling( self, window, min_periods=None, center=False, axis=0, win_type=None @@ -4650,7 +4645,7 @@ def rolling( win_type=win_type, ) - @annotate("FRAME_NANS_TO_NULLS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def nans_to_nulls(self): """ Convert nans (if any) to nulls @@ -4705,7 +4700,7 @@ def nans_to_nulls(self): result_data[name] = col.copy() return self._from_data(result_data, self._index) - @annotate("FRAME_INVERT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data( @@ -4716,7 +4711,7 @@ def __invert__(self): self._index, ) - @annotate("FRAME_ADD", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def add(self, other, axis, level=None, fill_value=None): """ Get Addition of dataframe or series and other, element-wise (binary @@ -4787,7 +4782,7 @@ def add(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "__add__", fill_value) - @annotate("FRAME_RADD", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def radd(self, other, axis, level=None, fill_value=None): """ Get Addition of dataframe or series and other, element-wise (binary @@ -4867,7 +4862,7 @@ def radd(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "__radd__", fill_value) - @annotate("FRAME_SUBTRACT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def subtract(self, other, axis, level=None, fill_value=None): """ Get Subtraction of dataframe or series and other, element-wise (binary @@ -4950,7 +4945,7 @@ def subtract(self, other, axis, level=None, fill_value=None): sub = subtract - @annotate("FRAME_RSUB", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rsub(self, other, axis, level=None, fill_value=None): """ Get Subtraction of dataframe or series and other, element-wise (binary @@ -5034,7 +5029,7 @@ def rsub(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "__rsub__", fill_value) - @annotate("FRAME_MULTIPLY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def multiply(self, other, axis, level=None, fill_value=None): """ Get Multiplication of dataframe or series and other, element-wise @@ -5119,7 +5114,7 @@ def multiply(self, other, axis, level=None, fill_value=None): mul = multiply - @annotate("FRAME_RMUL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rmul(self, other, axis, level=None, fill_value=None): """ Get Multiplication of dataframe or series and other, element-wise @@ -5204,7 +5199,7 @@ def rmul(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "__rmul__", fill_value) - @annotate("FRAME_MOD", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def mod(self, other, axis, level=None, fill_value=None): """ Get Modulo division of dataframe or series and other, element-wise @@ -5275,7 +5270,7 @@ def mod(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "__mod__", fill_value) - @annotate("FRAME_RMOD", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rmod(self, other, axis, level=None, fill_value=None): """ Get Modulo division of dataframe or series and other, element-wise @@ -5358,7 +5353,7 @@ def rmod(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "__rmod__", fill_value) - @annotate("FRAME_POW", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def pow(self, other, axis, level=None, fill_value=None): """ Get Exponential power of dataframe series and other, element-wise @@ -5438,7 +5433,7 @@ def pow(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "__pow__", fill_value) - @annotate("FRAME_RPOW", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rpow(self, other, axis, level=None, fill_value=None): """ Get Exponential power of dataframe or series and other, element-wise @@ -5518,7 +5513,7 @@ def rpow(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "__rpow__", fill_value) - @annotate("FRAME_FLOORDIV", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def floordiv(self, other, axis, level=None, fill_value=None): """ Get Integer division of dataframe or series and other, element-wise @@ -5598,7 +5593,7 @@ def floordiv(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "__floordiv__", fill_value) - @annotate("FRAME_RFLOORDIV", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rfloordiv(self, other, axis, level=None, fill_value=None): """ Get Integer division of dataframe or series and other, element-wise @@ -5695,7 +5690,7 @@ def rfloordiv(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "__rfloordiv__", fill_value) - @annotate("FRAME_TRUEDIV", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def truediv(self, other, axis, level=None, fill_value=None): """ Get Floating division of dataframe or series and other, element-wise @@ -5784,7 +5779,7 @@ def truediv(self, other, axis, level=None, fill_value=None): div = truediv divide = truediv - @annotate("FRAME_RTRUEDIV", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rtruediv(self, other, axis, level=None, fill_value=None): """ Get Floating division of dataframe or series and other, element-wise @@ -5877,7 +5872,7 @@ def rtruediv(self, other, axis, level=None, fill_value=None): # Alias for rtruediv rdiv = rtruediv - @annotate("FRAME_EQ", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def eq(self, other, axis="columns", level=None, fill_value=None): """Equal to, element-wise (binary operator eq). @@ -5953,7 +5948,7 @@ def eq(self, other, axis="columns", level=None, fill_value=None): other=other, op="__eq__", fill_value=fill_value, can_reindex=True ) - @annotate("FRAME_NE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def ne(self, other, axis="columns", level=None, fill_value=None): """Not equal to, element-wise (binary operator ne). @@ -6029,7 +6024,7 @@ def ne(self, other, axis="columns", level=None, fill_value=None): other=other, op="__ne__", fill_value=fill_value, can_reindex=True ) - @annotate("FRAME_LT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def lt(self, other, axis="columns", level=None, fill_value=None): """Less than, element-wise (binary operator lt). @@ -6105,7 +6100,7 @@ def lt(self, other, axis="columns", level=None, fill_value=None): other=other, op="__lt__", fill_value=fill_value, can_reindex=True ) - @annotate("FRAME_LE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def le(self, other, axis="columns", level=None, fill_value=None): """Less than or equal, element-wise (binary operator le). @@ -6181,7 +6176,7 @@ def le(self, other, axis="columns", level=None, fill_value=None): other=other, op="__le__", fill_value=fill_value, can_reindex=True ) - @annotate("FRAME_GT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def gt(self, other, axis="columns", level=None, fill_value=None): """Greater than, element-wise (binary operator gt). @@ -6257,7 +6252,7 @@ def gt(self, other, axis="columns", level=None, fill_value=None): other=other, op="__gt__", fill_value=fill_value, can_reindex=True ) - @annotate("FRAME_GE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def ge(self, other, axis="columns", level=None, fill_value=None): """Greater than or equal, element-wise (binary operator ge). @@ -6356,11 +6351,7 @@ def nunique(self, method: builtins.str = "sort", dropna: bool = True): } -@annotate( - "FRAME_GET_REPLACEMENT_VALUES_FOR_COLUMNS", - color="green", - domain="cudf_python", -) +@_cudf_nvtx_annotate def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] ) -> Tuple[Dict[Any, bool], Dict[Any, Any], Dict[Any, Any]]: @@ -6525,7 +6516,7 @@ def _is_series(obj): return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None -@annotate("FRAME_DROP_ROWS_BY_LABELS", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def _drop_rows_by_labels( obj: DataFrameOrSeries, labels: Union[ColumnLike, abc.Iterable, str], diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 8af73a5175b..264f0ea5df6 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -7,7 +7,6 @@ from functools import cached_property import numpy as np -from nvtx import annotate import cudf from cudf._lib import groupby as libgroupby @@ -18,7 +17,7 @@ from cudf.core.column.column import arange, as_column from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex -from cudf.utils.utils import GetAttrGetItemMixin +from cudf.utils.utils import GetAttrGetItemMixin, _cudf_nvtx_annotate # The three functions below return the quantiles [25%, 50%, 75%] @@ -207,7 +206,7 @@ def cumcount(self): def _groupby(self): return libgroupby.GroupBy(self.grouping.keys, dropna=self._dropna) - @annotate("GROUPBY_AGG", domain="cudf_python") + @_cudf_nvtx_annotate def agg(self, func): """ Apply aggregation(s) to the groups. diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index d418ffc0394..1c68289898f 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -22,7 +22,6 @@ import cupy import numpy as np import pandas as pd -from nvtx import annotate from pandas._config import get_option import cudf @@ -57,7 +56,7 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring, doc_apply from cudf.utils.dtypes import find_common_type -from cudf.utils.utils import search_range +from cudf.utils.utils import _cudf_nvtx_annotate, search_range T = TypeVar("T", bound="Frame") @@ -161,7 +160,7 @@ class RangeIndex(BaseIndex, BinaryOperand): _range: range - @annotate("RANGEINDEX_INIT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __init__( self, start, stop=None, step=1, dtype=None, copy=False, name=None ): @@ -193,7 +192,7 @@ def _copy_type_metadata( return self @property # type: ignore - @annotate("RANGEINDEX_NAME", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def name(self): """ Returns the name of the Index. @@ -201,12 +200,12 @@ def name(self): return self._name @name.setter # type: ignore - @annotate("RANGEINDEX_INIT_SETTER", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def name(self, value): self._name = value @property # type: ignore - @annotate("RANGEINDEX_START", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def start(self): """ The value of the `start` parameter (0 if this was not supplied). @@ -214,7 +213,7 @@ def start(self): return self._start @property # type: ignore - @annotate("RANGEINDEX_STOP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def stop(self): """ The value of the stop parameter. @@ -222,7 +221,7 @@ def stop(self): return self._stop @property # type: ignore - @annotate("RANGEINDEX_STEP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def step(self): """ The value of the step parameter. @@ -230,12 +229,12 @@ def step(self): return self._step @property # type: ignore - @annotate("RANGEINDEX_NUM_ROWS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _num_rows(self): return len(self) @cached_property - @annotate("RANGEINDEX_VALUES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _values(self): if len(self) > 0: return column.arange( @@ -266,13 +265,13 @@ def is_interval(self): return False @property # type: ignore - @annotate("RANGEINDEX_DATA", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _data(self): return cudf.core.column_accessor.ColumnAccessor( {self.name: self._values} ) - @annotate("RANGEINDEX_CONTAINS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __contains__(self, item): if not isinstance( item, tuple(np.sctypes["int"] + np.sctypes["float"] + [int, float]) @@ -282,7 +281,7 @@ def __contains__(self, item): return False return item in range(self._start, self._stop, self._step) - @annotate("RANGEINDEX_COPY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def copy(self, name=None, deep=False, dtype=None, names=None): """ Make a copy of this object. @@ -313,19 +312,18 @@ def copy(self, name=None, deep=False, dtype=None, names=None): start=self._start, stop=self._stop, step=self._step, name=name ) + @_cudf_nvtx_annotate @doc_apply(_index_astype_docstring) def astype(self, dtype, copy: bool = True): if is_dtype_equal(dtype, np.int64): return self return self._as_int64().astype(dtype, copy=copy) - @annotate( - "RANGEINDEX_DROP_DUPLICATES", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def drop_duplicates(self, keep="first"): return self - @annotate("RANGEINDEX_REPR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __repr__(self): return ( f"{self.__class__.__name__}(start={self._start}, stop={self._stop}" @@ -338,11 +336,11 @@ def __repr__(self): + ")" ) - @annotate("RANGEINDEX_LEN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __len__(self): return len(range(self._start, self._stop, self._step)) - @annotate("RANGEINDEX_GETITEM", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __getitem__(self, index): len_self = len(self) if isinstance(index, slice): @@ -368,7 +366,7 @@ def __getitem__(self, index): return as_index(self._values[index], name=self.name) - @annotate("RangeIndex_EQUALS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def equals(self, other): if isinstance(other, RangeIndex): if (self._start, self._stop, self._step) == ( @@ -379,7 +377,7 @@ def equals(self, other): return True return Int64Index._from_data(self._data).equals(other) - @annotate("RANGEINDEX_SERIALIZE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def serialize(self): header = {} header["index_column"] = {} @@ -400,7 +398,7 @@ def serialize(self): return header, frames @classmethod - @annotate("RANGEINDEX_DESERIALIZE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def deserialize(cls, header, frames): h = header["index_column"] name = pickle.loads(header["name"]) @@ -410,16 +408,14 @@ def deserialize(cls, header, frames): return RangeIndex(start=start, stop=stop, step=step, name=name) @property # type: ignore - @annotate("RANGEINDEX_DTYPE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def dtype(self): """ `dtype` of the range of values in RangeIndex. """ return cudf.dtype(np.int64) - @annotate( - "RANGEINDEX_FIND_LABEL_RANGE", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def find_label_range(self, first=None, last=None): """Find subrange in the ``RangeIndex``, marked by their positions, that starts greater or equal to ``first`` and ends less or equal to ``last`` @@ -459,7 +455,7 @@ def find_label_range(self, first=None, last=None): return begin, end - @annotate("RANGEINDEX_TO_PANDAS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_pandas(self): return pd.RangeIndex( start=self._start, @@ -477,26 +473,16 @@ def is_unique(self): return True @property # type: ignore - @annotate( - "RANGEINDEX_IS_MONOTONIC_INCREASING", - color="green", - domain="cudf_python", - ) + @_cudf_nvtx_annotate def is_monotonic_increasing(self): return self._step > 0 or len(self) <= 1 @property # type: ignore - @annotate( - "RANGEINDEX_IS_MONOTONIC_DECREASING", - color="green", - domain="cudf_python", - ) + @_cudf_nvtx_annotate def is_monotonic_decreasing(self): return self._step < 0 or len(self) <= 1 - @annotate( - "RANGEINDEX_GET_SLICE_BOUND", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def get_slice_bound(self, label, side, kind=None): """ Calculate slice bound that corresponds to given label. @@ -531,7 +517,7 @@ def get_slice_bound(self, label, side, kind=None): pos = search_range(start, stop, label, step, side=side) return pos - @annotate("RANGEINDEX_MEMORY_USAGE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def memory_usage(self, deep=False): if deep: warnings.warn( @@ -544,7 +530,7 @@ def unique(self): # RangeIndex always has unique values return self - @annotate("RANGEINDEX_MUL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __mul__(self, other): # Multiplication by raw ints must return a RangeIndex to match pandas. if isinstance(other, cudf.Scalar) and other.dtype.kind in "iu": @@ -561,24 +547,24 @@ def __mul__(self, other): ) return self._as_int64().__mul__(other) - @annotate("RANGEINDEX_RMUL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __rmul__(self, other): # Multiplication is commutative. return self.__mul__(other) - @annotate("RANGEINDEX_AS_INT64", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _as_int64(self): # Convert self to an Int64Index. This method is used to perform ops # that are not defined directly on RangeIndex. return Int64Index._from_data(self._data) - @annotate("RANGEINDEX_ARRAY_UFUNC", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return self._as_int64().__array_ufunc__( ufunc, method, *inputs, **kwargs ) - @annotate("RANGEINDEX_GETATTR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __getattr__(self, key): # For methods that are not defined for RangeIndex we attempt to operate # on the corresponding integer index if possible. @@ -589,7 +575,7 @@ def __getattr__(self, key): f"'{type(self)}' object has no attribute {key}" ) - @annotate("RANGEINDEX_GET_LOC", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def get_loc(self, key, method=None, tolerance=None): # Given an actual integer, idx = (key - self._start) / self._step @@ -623,7 +609,7 @@ def get_loc(self, key, method=None, tolerance=None): raise KeyError(key) return np.clip(round_method(idx), 0, idx_int_upper_bound, dtype=int) - @annotate("RANGEINDEX_UNION_INTERNAL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _union(self, other, sort=None): if isinstance(other, RangeIndex): # Variable suffixes are of the @@ -698,9 +684,7 @@ def _union(self, other, sort=None): # then perform `union`. return Int64Index(self._values)._union(other, sort=sort) - @annotate( - "RANGEINDEX_INTERSECTION_INTERNAL", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _intersection(self, other, sort=False): if not isinstance(other, RangeIndex): return super()._intersection(other, sort=sort) @@ -746,18 +730,14 @@ def _intersection(self, other, sort=False): return new_index - @annotate( - "RANGEINDEX_GATHER_INTERNAL", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _gather(self, gather_map, nullify=False, check_bounds=True): gather_map = cudf.core.column.as_column(gather_map) return Int64Index._from_columns( [self._values.take(gather_map, nullify, check_bounds)], [self.name] ) - @annotate( - "RANGEINDEX_APPLY_BOOLEAN_MASK", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _apply_boolean_mask(self, boolean_mask): return Int64Index._from_columns( [self._values.apply_boolean_mask(boolean_mask)], [self.name] @@ -801,7 +781,7 @@ class GenericIndex(SingleColumnFrame, BaseIndex): Column's, the data Column will be cloned to adopt this name. """ - @annotate("GENERICINDEX_INIT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __init__(self, data, **kwargs): kwargs = _setdefault_name(data, **kwargs) @@ -822,7 +802,7 @@ def __init__(self, data, **kwargs): name = kwargs.get("name") super().__init__({name: data}) - @annotate("GENERICINDEX_ARRAY_UFUNC", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): ret = super().__array_ufunc__(ufunc, method, *inputs, **kwargs) @@ -858,7 +838,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): return NotImplemented - @annotate("GENERICINDEX_BINARYOP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _binaryop( self, other: T, op: str, fill_value: Any = None, *args, **kwargs, ) -> SingleColumnFrame: @@ -877,9 +857,7 @@ def _binaryop( return ret.values return ret - @annotate( - "GENERICINDEX_COPY_TYPE_METADATA", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _copy_type_metadata( self, other: Frame, include_index: bool = True ) -> GenericIndex: @@ -897,12 +875,12 @@ def _copy_type_metadata( return self @property # type: ignore - @annotate("GENERICINDEX_VALUES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _values(self): return self._column @classmethod - @annotate("GENERICINDEX_CONCAT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _concat(cls, objs): if all(isinstance(obj, RangeIndex) for obj in objs): result = _concat_range_index(objs) @@ -919,11 +897,11 @@ def _concat(cls, objs): result.name = name return result - @annotate("GENERICINDEX_MEMORY_USAGE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def memory_usage(self, deep=False): return sum(super().memory_usage(deep=deep).values()) - @annotate("INDEX_EQUALS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def equals(self, other, **kwargs): """ Determine if two Index objects contain the same elements. @@ -953,7 +931,7 @@ def equals(self, other, **kwargs): except TypeError: return False - @annotate("GENERICINDEX_COPY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def copy(self, name=None, deep=False, dtype=None, names=None): """ Make a copy of this object. @@ -981,11 +959,12 @@ def copy(self, name=None, deep=False, dtype=None, names=None): col = self._values.astype(dtype) return _index_from_data({name: col.copy(True) if deep else col}) + @_cudf_nvtx_annotate @doc_apply(_index_astype_docstring) def astype(self, dtype, copy: bool = True): return _index_from_data(super().astype({self.name: dtype}, copy)) - @annotate("GENERICINDEX_GET_LOC", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def get_loc(self, key, method=None, tolerance=None): """Get integer location, slice or boolean mask for requested label. @@ -1104,7 +1083,7 @@ def get_loc(self, key, method=None, tolerance=None): mask[true_inds] = True return mask - @annotate("GENERICINDEX_REPR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __repr__(self): max_seq_items = get_option("max_seq_items") or len(self) mr = 0 @@ -1181,7 +1160,7 @@ def __repr__(self): return "\n".join(lines) - @annotate("GENERICINDEX_GETITEM", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __getitem__(self, index): if type(self) == IntervalIndex: raise NotImplementedError( @@ -1194,16 +1173,14 @@ def __getitem__(self, index): return res @property # type: ignore - @annotate("GENERICINDEX_DTYPE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def dtype(self): """ `dtype` of the underlying values in GenericIndex. """ return self._values.dtype - @annotate( - "GENERICINDEX_FIND_LABEL_RANGE", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def find_label_range(self, first, last): """Find range that starts with *first* and ends with *last*, inclusively. @@ -1223,9 +1200,7 @@ def find_label_range(self, first, last): end += 1 return begin, end - @annotate( - "GENERICINDEX_GET_SLICE_BOUND", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def get_slice_bound(self, label, side, kind=None): return self._values.get_slice_bound(label, side, kind) @@ -1250,7 +1225,7 @@ def is_categorical(self): def is_interval(self): return False - @annotate("GENERICINDEX_ARGSORT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def argsort( self, axis=0, @@ -1312,7 +1287,7 @@ class NumericIndex(GenericIndex): # Subclasses must define the dtype they are associated with. _dtype: Union[None, Type[np.number]] = None - @annotate("NUMERICINDEX_INIT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __init__(self, data=None, dtype=None, copy=False, name=None): dtype = type(self)._dtype @@ -1650,7 +1625,7 @@ class DatetimeIndex(GenericIndex): dtype='datetime64[ns]', name='a') """ - @annotate("DATETIMEINDEX_INIT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __init__( self, data=None, @@ -1706,7 +1681,7 @@ def __init__( super().__init__(data, **kwargs) @property # type: ignore - @annotate("DATETIMEINDEX_YEAR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def year(self): """ The year of the datetime. @@ -1725,7 +1700,7 @@ def year(self): return self._get_dt_field("year") @property # type: ignore - @annotate("DATETIMEINDEX_MONTH", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def month(self): """ The month as January=1, December=12. @@ -1744,7 +1719,7 @@ def month(self): return self._get_dt_field("month") @property # type: ignore - @annotate("DATETIMEINDEX_DAY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def day(self): """ The day of the datetime. @@ -1763,7 +1738,7 @@ def day(self): return self._get_dt_field("day") @property # type: ignore - @annotate("DATETIMEINDEX_HOUR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def hour(self): """ The hours of the datetime. @@ -1784,7 +1759,7 @@ def hour(self): return self._get_dt_field("hour") @property # type: ignore - @annotate("DATETIMEINDEX_MINUTE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def minute(self): """ The minutes of the datetime. @@ -1805,7 +1780,7 @@ def minute(self): return self._get_dt_field("minute") @property # type: ignore - @annotate("DATETIMEINDEX_SECOND", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def second(self): """ The seconds of the datetime. @@ -1826,7 +1801,7 @@ def second(self): return self._get_dt_field("second") @property # type: ignore - @annotate("DATETIMEINDEX_WEEKDAY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def weekday(self): """ The day of the week with Monday=0, Sunday=6. @@ -1848,7 +1823,7 @@ def weekday(self): return self._get_dt_field("weekday") @property # type: ignore - @annotate("DATETIMEINDEX_DAYOFWEEK", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def dayofweek(self): """ The day of the week with Monday=0, Sunday=6. @@ -1870,7 +1845,7 @@ def dayofweek(self): return self._get_dt_field("weekday") @property # type: ignore - @annotate("DATETIMEINDEX_DAYOFYEAR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def dayofyear(self): """ The day of the year, from 1-365 in non-leap years and @@ -1893,7 +1868,7 @@ def dayofyear(self): return self._get_dt_field("day_of_year") @property # type: ignore - @annotate("DATETIMEINDEX_DAY_OF_YEAR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def day_of_year(self): """ The day of the year, from 1-365 in non-leap years and @@ -1916,9 +1891,7 @@ def day_of_year(self): return self._get_dt_field("day_of_year") @property # type: ignore - @annotate( - "DATETIMEINDEX_IS_LEAP_YEAR", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def is_leap_year(self): """ Boolean indicator if the date belongs to a leap year. @@ -1937,7 +1910,7 @@ def is_leap_year(self): return cupy.asarray(res) @property # type: ignore - @annotate("DATETIMEINDEX_QUARTER", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def quarter(self): """ Integer indicator for which quarter of the year the date belongs in. @@ -1962,7 +1935,7 @@ def quarter(self): res = extract_quarter(self._values) return Int8Index(res, dtype="int8") - @annotate("DATETIMEINDEX_ISOCALENDAR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def isocalendar(self): """ Returns a DataFrame with the year, week, and day @@ -1984,14 +1957,12 @@ def isocalendar(self): """ return cudf.core.tools.datetimes._to_iso_calendar(self) - @annotate("DATETIMEINDEX_TO_PANDAS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_pandas(self): nanos = self._values.astype("datetime64[ns]") return pd.DatetimeIndex(nanos.to_pandas(), name=self.name) - @annotate( - "DATETIMEINDEX_GET_DT_FIELD", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _get_dt_field(self, field): out_column = self._values.get_dt_field(field) # column.column_empty_like always returns a Column object @@ -2008,7 +1979,7 @@ def _get_dt_field(self, field): def is_boolean(self): return False - @annotate("DATETIMEINDEX_CEIL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def ceil(self, freq): """ Perform ceil operation on the data to the specified freq. @@ -2041,7 +2012,7 @@ def ceil(self, freq): return self.__class__._from_data({self.name: out_column}) - @annotate("DATETIMEINDEX_FLOOR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def floor(self, freq): """ Perform floor operation on the data to the specified freq. @@ -2074,7 +2045,7 @@ def floor(self, freq): return self.__class__._from_data({self.name: out_column}) - @annotate("DATETIMEINDEX_ROUND", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def round(self, freq): """ Perform round operation on the data to the specified freq. @@ -2157,7 +2128,7 @@ class TimedeltaIndex(GenericIndex): dtype='timedelta64[s]', name='delta-index') """ - @annotate("TIMEDELTAINDEX_INIT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __init__( self, data=None, @@ -2189,7 +2160,7 @@ def __init__( data = column.as_column(np.array(data, dtype=dtype)) super().__init__(data, **kwargs) - @annotate("TIMEDELTAINDEX_TO_PANDAS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_pandas(self): return pd.TimedeltaIndex( self._values.to_pandas(), @@ -2198,7 +2169,7 @@ def to_pandas(self): ) @property # type: ignore - @annotate("TIMEDELTAINDEX_INIT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def days(self): """ Number of days for each element. @@ -2206,7 +2177,7 @@ def days(self): return as_index(arbitrary=self._values.days, name=self.name) @property # type: ignore - @annotate("TIMEDELTAINDEX_SECONDS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def seconds(self): """ Number of seconds (>= 0 and less than 1 day) for each element. @@ -2214,9 +2185,7 @@ def seconds(self): return as_index(arbitrary=self._values.seconds, name=self.name) @property # type: ignore - @annotate( - "TIMEDELTAINDEX_MICROSECONDS", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def microseconds(self): """ Number of microseconds (>= 0 and less than 1 second) for each element. @@ -2224,9 +2193,7 @@ def microseconds(self): return as_index(arbitrary=self._values.microseconds, name=self.name) @property # type: ignore - @annotate( - "TIMEDELTAINDEX_NANOSECONDS", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def nanoseconds(self): """ Number of nanoseconds (>= 0 and less than 1 microsecond) for each @@ -2235,7 +2202,7 @@ def nanoseconds(self): return as_index(arbitrary=self._values.nanoseconds, name=self.name) @property # type: ignore - @annotate("TIMEDELTAINDEX_COMPONENTS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def components(self): """ Return a dataframe of the components (days, hours, minutes, @@ -2301,7 +2268,7 @@ class CategoricalIndex(GenericIndex): CategoricalIndex([1, 2, 3, ], categories=[1, 2, 3], ordered=False, dtype='category', name='a') """ # noqa: E501 - @annotate("CATEGORICALINDEX_INIT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __init__( self, data=None, @@ -2357,7 +2324,7 @@ def __init__( super().__init__(data, **kwargs) @property # type: ignore - @annotate("CATEGORICALINDEX_CODES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def codes(self): """ The category codes of this categorical. @@ -2365,9 +2332,7 @@ def codes(self): return as_index(self._values.codes) @property # type: ignore - @annotate( - "CATEGORICALINDEX_CATEGORIES", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def categories(self): """ The categories of this categorical. @@ -2381,7 +2346,7 @@ def is_categorical(self): return True -@annotate("INDEX_INTERVAL_RANGE", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def interval_range( start=None, end=None, periods=None, freq=None, name=None, closed="right", ) -> "IntervalIndex": @@ -2544,7 +2509,7 @@ class IntervalIndex(GenericIndex): IntervalIndex """ - @annotate("INTERVALINDEX_INIT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __init__( self, data, closed=None, dtype=None, copy=False, name=None, ): @@ -2569,7 +2534,7 @@ def __init__( self.closed = closed super().__init__(data, **kwargs) - @annotate("INTERVALINDEX_FROM_BREAKS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): """ Construct an IntervalIndex from an array of splits. @@ -2626,7 +2591,7 @@ class StringIndex(GenericIndex): name: A string """ - @annotate("STRINGINDEX_INIT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __init__(self, values, copy=False, **kwargs): kwargs = _setdefault_name(values, **kwargs) if isinstance(values, StringColumn): @@ -2642,13 +2607,13 @@ def __init__(self, values, copy=False, **kwargs): super().__init__(values, **kwargs) - @annotate("STRINGINDEX_TO_PANDAS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_pandas(self): return pd.Index( self.to_numpy(na_value=None), name=self.name, dtype="object" ) - @annotate("STRINGINDEX_REPR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __repr__(self): return ( f"{self.__class__.__name__}({self._values.values_host}," @@ -2663,7 +2628,7 @@ def __repr__(self): @copy_docstring(StringMethods) # type: ignore @property - @annotate("STRINGINDEX_STR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def str(self): return StringMethods(parent=self) @@ -2684,7 +2649,7 @@ def is_object(self): return True -@annotate("INDEX_AS_INDEX", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex: """Create an Index from an arbitrary object @@ -2813,7 +2778,7 @@ class Index(BaseIndex, metaclass=IndexMeta): names=['a', 'b']) """ - @annotate("INDEX_INIT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __new__( cls, data=None, @@ -2842,7 +2807,7 @@ def __new__( ) @classmethod - @annotate("INDEX_FROM_ARROW", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def from_arrow(cls, obj): try: return cls(ColumnBase.from_arrow(obj)) @@ -2851,7 +2816,7 @@ def from_arrow(cls, obj): return cudf.MultiIndex.from_arrow(obj) -@annotate("INDEX_CONCAT_RANGE_INDEX", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: """ An internal Utility function to concat RangeIndex objects. @@ -2892,7 +2857,7 @@ def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: return RangeIndex(start, stop, step) -@annotate("INDEX_EXTENDEX_GCD", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]: """ Extended Euclidean algorithms to solve Bezout's identity: diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 82b7645b138..3fa951241f7 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -14,7 +14,6 @@ import cupy as cp import numpy as np import pandas as pd -from nvtx import annotate import cudf import cudf._lib as libcudf @@ -33,6 +32,7 @@ from cudf.core.index import Index, RangeIndex, _index_from_columns from cudf.core.multiindex import MultiIndex from cudf.core.udf.utils import _compile_or_get, _supported_cols_from_frame +from cudf.utils.utils import _cudf_nvtx_annotate doc_reset_index_template = """ Reset the index of the {klass}, or a level of it. @@ -340,7 +340,7 @@ def iloc(self): """ return self._iloc_indexer_type(self) - @annotate("SORT_INDEX", color="red", domain="cudf_python") + @_cudf_nvtx_annotate def sort_index( self, axis=0, @@ -722,7 +722,7 @@ def drop_duplicates( self._index.names if not ignore_index else None, ) - @annotate("FRAME_EMPTY_LIKE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _empty_like(self, keep_index=True): return self._from_columns_like_self( libcudf.copying.columns_empty_like( @@ -871,7 +871,7 @@ def add_suffix(self, suffix): Use `Series.add_suffix` or `DataFrame.add_suffix`" ) - @annotate("APPLY", color="purple", domain="cudf_python") + @_cudf_nvtx_annotate def _apply(self, func, kernel_getter, *args, **kwargs): """Apply `func` across the rows of the frame.""" if kwargs: @@ -1748,7 +1748,7 @@ def last(self, offset): slice_func=lambda i: self.iloc[i:], ) - @annotate("SAMPLE", color="orange", domain="cudf_python") + @_cudf_nvtx_annotate def sample( self, n=None, @@ -2180,7 +2180,7 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): return self._from_data(data, index=self._index) - @annotate("INDEXED_FRAME_DROP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def drop( self, labels=None, @@ -2393,7 +2393,7 @@ def drop( if not inplace: return out - @annotate("INDEXED_FRAME_EXPLODE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _explode(self, explode_column: Any, ignore_index: bool): # Helper function for `explode` in `Series` and `Dataframe`, explodes a # specified nested column. Other columns' corresponding rows are diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index efb714e89c2..c9036db05fa 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -13,7 +13,6 @@ import cupy import numpy as np import pandas as pd -from nvtx import annotate from pandas._config import get_option import cudf @@ -30,7 +29,11 @@ as_index, ) from cudf.utils.docutils import doc_apply -from cudf.utils.utils import NotIterable, _maybe_indices_to_slice +from cudf.utils.utils import ( + NotIterable, + _cudf_nvtx_annotate, + _maybe_indices_to_slice, +) class MultiIndex(Frame, BaseIndex, NotIterable): @@ -70,7 +73,7 @@ class MultiIndex(Frame, BaseIndex, NotIterable): ) """ - @annotate("MULTIINDEX_INIT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __init__( self, levels=None, @@ -156,12 +159,12 @@ def __init__( self.names = names @property # type: ignore - @annotate("MULTIINDEX_NAMES_GETTER", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def names(self): return self._names @names.setter # type: ignore - @annotate("MULTIINDEX_NAMES_SETTER", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def names(self, value): value = [None] * self.nlevels if value is None else value @@ -179,6 +182,7 @@ def names(self, value): ) self._names = pd.core.indexes.frozen.FrozenList(value) + @_cudf_nvtx_annotate @doc_apply(_index_astype_docstring) def astype(self, dtype, copy: bool = True): if not is_object_dtype(dtype): @@ -188,7 +192,7 @@ def astype(self, dtype, copy: bool = True): ) return self - @annotate("MULTIINDEX_RENAME", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rename(self, names, inplace=False): """ Alter MultiIndex level names @@ -235,7 +239,7 @@ def rename(self, names, inplace=False): """ return self.set_names(names, level=None, inplace=inplace) - @annotate("MULTIINDEX_SET_NAMES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def set_names(self, names, level=None, inplace=False): names_is_list_like = is_list_like(names) level_is_list_like = is_list_like(level) @@ -273,7 +277,7 @@ def set_names(self, names, level=None, inplace=False): return self._set_names(names=names, inplace=inplace) @classmethod - @annotate("MULTIINDEX_FROM_DATA", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _from_data( cls, data: MutableMapping, @@ -287,16 +291,16 @@ def _from_data( return obj @property # type: ignore - @annotate("MULTIINDEX_NAME_GETTER", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def name(self): return self._name @name.setter # type: ignore - @annotate("MULTIINDEX_NAME_GETTER", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def name(self, value): self._name = value - @annotate("MULTIINDEX_COPY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def copy( self, names=None, @@ -392,7 +396,7 @@ def copy( return mi - @annotate("MULTIINDEX_REPR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __repr__(self): max_seq_items = get_option("display.max_seq_items") or len(self) @@ -470,7 +474,7 @@ def __repr__(self): return output_prefix + data_output @property # type: ignore - @annotate("MULTIINDEX_CODES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def codes(self): """ Returns the codes of the underlying MultiIndex. @@ -501,13 +505,13 @@ def codes(self): return self._codes @property # type: ignore - @annotate("MULTIINDEX_NLEVELS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def nlevels(self): """Integer number of levels in this MultiIndex.""" return len(self._data) @property # type: ignore - @annotate("MULTIINDEX_LEVELS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def levels(self): """ Returns list of levels in the MultiIndex @@ -545,14 +549,12 @@ def levels(self): return self._levels @property # type: ignore - @annotate("MULTIINDEX_NDIM", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def ndim(self): """Dimension of the data. For MultiIndex ndim is always 2.""" return 2 - @annotate( - "MULTIINDEX_GET_LEVEL_LABEL", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _get_level_label(self, level): """Get name of the level. @@ -569,7 +571,7 @@ def _get_level_label(self, level): else: return self._data.names[level] - @annotate("MULTIINDEX_ISIN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def isin(self, values, level=None): """Return a boolean array where the index values are in values. @@ -674,11 +676,7 @@ def where(self, cond, other=None, inplace=False): ".where is not supported for MultiIndex operations" ) - @annotate( - "MULTIINDEX_COMPUTE_LEVELS_AND_CODES", - color="green", - domain="cudf_python", - ) + @_cudf_nvtx_annotate def _compute_levels_and_codes(self): levels = [] @@ -691,9 +689,7 @@ def _compute_levels_and_codes(self): self._levels = levels self._codes = cudf.DataFrame._from_data(codes) - @annotate( - "MULTIINDEX_COMPUTE_VALIDITY_MASK", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _compute_validity_mask(self, index, row_tuple, max_length): """Computes the valid set of indices of values in the lookup""" lookup = cudf.DataFrame() @@ -722,11 +718,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length): raise KeyError(row) return result - @annotate( - "MULTIINDEX_GET_VALID_INDICES_BY_TUPLE", - color="green", - domain="cudf_python", - ) + @_cudf_nvtx_annotate def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): # Instructions for Slicing # if tuple, get first and last elements of tuple @@ -754,9 +746,7 @@ def _get_valid_indices_by_tuple(self, index, row_tuple, max_length): return row_tuple return self._compute_validity_mask(index, row_tuple, max_length) - @annotate( - "MULTIINDEX_INDEX_AND_DOWNCAST", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _index_and_downcast(self, result, index, index_key): if isinstance(index_key, (numbers.Number, slice)): @@ -825,7 +815,7 @@ def _index_and_downcast(self, result, index, index_key): result.index = index return result - @annotate("MULTIINDEX_GET_ROW_MAJOR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _get_row_major( self, df: DataFrameOrSeries, @@ -851,9 +841,7 @@ def _get_row_major( final = self._index_and_downcast(result, result.index, row_tuple) return final - @annotate( - "MULTIINDEX_VALIDATE_INDEXER", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _validate_indexer( self, indexer: Union[ @@ -880,7 +868,7 @@ def _validate_indexer( for i in indexer: self._validate_indexer(i) - @annotate("MULTIINDEX_EQ", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __eq__(self, other): if isinstance(other, MultiIndex): for self_col, other_col in zip( @@ -892,12 +880,12 @@ def __eq__(self, other): return NotImplemented @property # type: ignore - @annotate("MULTIINDEX_SIZE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def size(self): # The size of a MultiIndex is only dependent on the number of rows. return self._num_rows - @annotate("MULTIINDEX_TAKE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def take(self, indices): if isinstance(indices, cudf.Series) and indices.has_nulls: raise ValueError("Column must have no nulls.") @@ -905,7 +893,7 @@ def take(self, indices): obj.names = self.names return obj - @annotate("MULTIINDEX_SERIALIZE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def serialize(self): header, frames = super().serialize() # Overwrite the names in _data with the true names. @@ -913,7 +901,7 @@ def serialize(self): return header, frames @classmethod - @annotate("MULTIINDEX_DESERIALIZE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def deserialize(cls, header, frames): # Spoof the column names to construct the frame, then set manually. column_names = pickle.loads(header["column_names"]) @@ -921,7 +909,7 @@ def deserialize(cls, header, frames): obj = super().deserialize(header, frames) return obj._set_names(column_names) - @annotate("MULTIINDEX_GETITEM", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __getitem__(self, index): flatten = isinstance(index, int) @@ -944,7 +932,7 @@ def __getitem__(self, index): result.names = self.names return result - @annotate("MULTIINDEX_TO_FRAME", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_frame(self, index=True, name=None): # TODO: Currently this function makes a shallow copy, which is # incorrect. We want to make a deep copy, otherwise further @@ -961,9 +949,7 @@ def to_frame(self, index=True, name=None): df.columns = name return df - @annotate( - "MULTIINDEX_GET_LEVEL_VALUES", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def get_level_values(self, level): """ Return the values at the requested level @@ -1017,7 +1003,7 @@ def is_interval(self): return False @classmethod - @annotate("MULTIINDEX_CONCAT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _concat(cls, objs): source_data = [o.to_frame(index=False) for o in objs] @@ -1038,7 +1024,7 @@ def _concat(cls, objs): return cudf.MultiIndex.from_frame(source_data, names=names) @classmethod - @annotate("MULTIINDEX_FROM_TUPLES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def from_tuples(cls, tuples, names=None): """ Convert list of tuples to MultiIndex. @@ -1076,7 +1062,7 @@ def from_tuples(cls, tuples, names=None): return cls.from_pandas(pdi) @property # type: ignore - @annotate("MULTIINDEX_VALUES_HOST", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def values_host(self): """ Return a numpy representation of the MultiIndex. @@ -1104,7 +1090,7 @@ def values_host(self): return self.to_pandas().values @property # type: ignore - @annotate("MULTIINDEX_VALUES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def values(self): """ Return a CuPy representation of the MultiIndex. @@ -1136,7 +1122,7 @@ def values(self): return self.to_frame(index=False).values @classmethod - @annotate("MULTIINDEX_FROM_FRAME", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def from_frame(cls, df, names=None): """ Make a MultiIndex from a DataFrame. @@ -1210,7 +1196,7 @@ def from_frame(cls, df, names=None): return obj @classmethod - @annotate("MULTIINDEX_FROM_PRODUCT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def from_product(cls, arrays, names=None): """ Make a MultiIndex from the cartesian product of multiple iterables. @@ -1251,7 +1237,7 @@ def from_product(cls, arrays, names=None): pdi = pd.MultiIndex.from_product(arrays, names=names) return cls.from_pandas(pdi) - @annotate("MULTIINDEX_POP_LEVELS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _poplevels(self, level): """ Remove and return the specified levels from self. @@ -1302,7 +1288,7 @@ def _poplevels(self, level): return popped - @annotate("MULTIINDEX_DROP_LEVEL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def droplevel(self, level=-1): """ Removes the specified levels from the MultiIndex. @@ -1365,13 +1351,13 @@ def droplevel(self, level=-1): else: return mi - @annotate("MULTIINDEX_TO_PANDAS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_pandas(self, nullable=False, **kwargs): result = self.to_frame(index=False).to_pandas(nullable=nullable) return pd.MultiIndex.from_frame(result, names=self.names) @classmethod - @annotate("MULTIINDEX_FROM_PANDAS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def from_pandas(cls, multiindex, nan_as_null=None): """ Convert from a Pandas MultiIndex @@ -1408,16 +1394,12 @@ def from_pandas(cls, multiindex, nan_as_null=None): return cls.from_frame(df, names=multiindex.names) @cached_property - @annotate("MULTIINDEX_IS_UNIQUE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def is_unique(self): return len(self) == len(self.unique()) @property # type: ignore - @annotate( - "MULTIINDEX_IS_MONOTONIC_INCREASING", - color="green", - domain="cudf_python", - ) + @_cudf_nvtx_annotate def is_monotonic_increasing(self): """ Return if the index is monotonic increasing @@ -1426,11 +1408,7 @@ def is_monotonic_increasing(self): return self._is_sorted(ascending=None, null_position=None) @property # type: ignore - @annotate( - "MULTIINDEX_IS_MONOTONIC_DECREASING", - color="green", - domain="cudf_python", - ) + @_cudf_nvtx_annotate def is_monotonic_decreasing(self): """ Return if the index is monotonic decreasing @@ -1440,7 +1418,7 @@ def is_monotonic_decreasing(self): ascending=[False] * len(self.levels), null_position=None ) - @annotate("MULTIINDEX_FILLNA", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def fillna(self, value): """ Fill null values with the specified value. @@ -1481,7 +1459,7 @@ def fillna(self, value): return super().fillna(value=value) - @annotate("MULTIINDEX_UNIQUE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def unique(self): return self.drop_duplicates(keep="first") @@ -1495,7 +1473,7 @@ def _clean_nulls_from_index(self): index_df._clean_nulls_from_dataframe(index_df), names=self.names ) - @annotate("MULTIINDEX_MEMORY_USAGE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def memory_usage(self, deep=False): usage = sum(super().memory_usage(deep=deep).values()) if self.levels: @@ -1506,13 +1484,13 @@ def memory_usage(self, deep=False): usage += col.memory_usage return usage - @annotate("MULTIINDEX_DIFFERENCE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def difference(self, other, sort=None): if hasattr(other, "to_pandas"): other = other.to_pandas() return self.to_pandas().difference(other, sort) - @annotate("MULTIINDEX_APPEND", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def append(self, other): """ Append a collection of MultiIndex objects together @@ -1575,7 +1553,7 @@ def append(self, other): return MultiIndex._concat(to_concat) - @annotate("MULTIINDEX_ARRAY_FUNCTION", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __array_function__(self, func, types, args, kwargs): cudf_df_module = MultiIndex @@ -1622,7 +1600,7 @@ def _level_index_from_level(self, level): ) from None return level - @annotate("MULTIINDEX_GET_LOC", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def get_loc(self, key, method=None, tolerance=None): """ Get location for a label or a tuple of labels. @@ -1759,7 +1737,7 @@ def _maybe_match_names(self, other): for self_name, other_name in zip(self.names, other.names) ] - @annotate("MULTIINDEX_UNION", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _union(self, other, sort=None): # TODO: When to_frame is refactored to return a # deep copy in future, we should push most of the common @@ -1785,7 +1763,7 @@ def _union(self, other, sort=None): return midx.sort_values() return midx - @annotate("MULTIINDEX_INTERSECTION", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _intersection(self, other, sort=None): if self.names != other.names: deep = True @@ -1808,9 +1786,7 @@ def _intersection(self, other, sort=None): return midx.sort_values() return midx - @annotate( - "MULTIINDEX_COPY_TYPE_METADATA", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _copy_type_metadata( self, other: Frame, include_index: bool = True ) -> Frame: @@ -1818,7 +1794,7 @@ def _copy_type_metadata( res._names = other._names return res - @annotate("MULTIINDEX_SPLIT_LEVELS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _split_columns_by_levels(self, levels): # This function assumes that for levels with duplicate names, they are # specified by indices, not name by ``levels``. E.g. [None, None] can diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index af4ea9806cc..b3b73b8961c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -14,7 +14,6 @@ import cupy import numpy as np import pandas as pd -from nvtx import annotate from pandas._config import get_option import cudf @@ -75,7 +74,7 @@ is_mixed_with_object_dtype, min_scalar_type, ) -from cudf.utils.utils import to_cudf_compatible_scalar +from cudf.utils.utils import _cudf_nvtx_annotate, to_cudf_compatible_scalar def _append_new_row_inplace(col: ColumnLike, value: ScalarLike): @@ -94,7 +93,7 @@ class _SeriesIlocIndexer(_FrameIndexer): For integer-location based selection. """ - @annotate("SERIESILOC_GETITEM", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __getitem__(self, arg): if isinstance(arg, tuple): arg = list(arg) @@ -110,7 +109,7 @@ def __getitem__(self, arg): {self._frame.name: data}, index=cudf.Index(self._frame.index[arg]), ) - @annotate("SERIESILOC_SETITEM", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __setitem__(self, key, value): from cudf.core.column import column @@ -154,7 +153,7 @@ class _SeriesLocIndexer(_FrameIndexer): Label-based selection """ - @annotate("SERIESLOC_GETITEM", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]: if isinstance(arg, pd.MultiIndex): arg = cudf.from_pandas(arg) @@ -177,7 +176,7 @@ def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]: return self._frame.iloc[arg] - @annotate("SERIESLOC_SETITEM", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __setitem__(self, key, value): try: key = self._loc_to_iloc(key) @@ -301,7 +300,7 @@ def _constructor_expanddim(self): return cudf.DataFrame @classmethod - @annotate("SERIES_FROM_CATEGORICAL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def from_categorical(cls, categorical, codes=None): """Creates from a pandas.Categorical @@ -342,7 +341,7 @@ def from_categorical(cls, categorical, codes=None): return Series(data=col) @classmethod - @annotate("SERIES_FROM_MASKED_ARRAY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def from_masked_array(cls, data, mask, null_count=None): """Create a Series with null-mask. This is equivalent to: @@ -391,7 +390,7 @@ def from_masked_array(cls, data, mask, null_count=None): col = column.as_column(data).set_mask(mask) return cls(data=col) - @annotate("SERIES_INIT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __init__( self, data=None, index=None, dtype=None, name=None, nan_as_null=True, ): @@ -453,7 +452,7 @@ def __init__( self._index = RangeIndex(len(data)) if index is None else index @classmethod - @annotate("SERIES_FROM_DATA", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _from_data( cls, data: MutableMapping, @@ -468,12 +467,12 @@ def _from_data( out._index = RangeIndex(out._data.nrows) return out - @annotate("SERIES_CONTAINS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __contains__(self, item): return item in self._index @classmethod - @annotate("SERIES_FROM_PANDAS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def from_pandas(cls, s, nan_as_null=None): """ Convert from a Pandas Series. @@ -515,7 +514,7 @@ def from_pandas(cls, s, nan_as_null=None): return cls(s, nan_as_null=nan_as_null) @property # type: ignore - @annotate("SERIES_DT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def dt(self): """ Accessor object for datetime-like properties of the Series values. @@ -556,7 +555,7 @@ def dt(self): "Can only use .dt accessor with datetimelike values" ) - @annotate("SERIES_SERIALIZE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def serialize(self): header, frames = super().serialize() @@ -569,7 +568,7 @@ def serialize(self): return header, frames @classmethod - @annotate("SERIES_DESERIALIZE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def deserialize(cls, header, frames): index_nframes = header["index_frame_count"] obj = super().deserialize( @@ -596,7 +595,7 @@ def _get_columns_by_label(self, labels, downcast=False): else self.__class__(dtype=self.dtype, name=self.name) ) - @annotate("SERIES_DROP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def drop( self, labels=None, @@ -616,7 +615,7 @@ def drop( labels, axis, index, columns, level, inplace, errors ) - @annotate("SERIES_APPEND", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def append(self, to_append, ignore_index=False, verify_integrity=False): """Append values from another ``Series`` or array-like object. If ``ignore_index=True``, the index is reset. @@ -692,7 +691,7 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): """ return super()._append(to_append, ignore_index, verify_integrity) - @annotate("SERIES_REINDEX", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def reindex(self, index=None, copy=True): """Return a Series that conforms to a new index @@ -728,7 +727,7 @@ def reindex(self, index=None, copy=True): series.name = self.name return series - @annotate("SERIES_RESET_INDEX", color="green", domain="cudf_python") + @_cudf_nvtx_annotate @docutils.doc_apply( doc_reset_index_template.format( klass="Series", @@ -810,7 +809,7 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): inplace=inplace, ) - @annotate("SERIES_TO_FRAME", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_frame(self, name=None): """Convert Series into a DataFrame @@ -853,11 +852,11 @@ def to_frame(self, name=None): return cudf.DataFrame({col: self._column}, index=self.index) - @annotate("SERIES_MEMORY_USAGE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def memory_usage(self, index=True, deep=False): return sum(super().memory_usage(index, deep).values()) - @annotate("SERIES_ARRAY_FUNCTION", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __array_function__(self, func, types, args, kwargs): if "out" in kwargs or not all(issubclass(t, Series) for t in types): return NotImplemented @@ -913,7 +912,7 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented - @annotate("SERIES_MAP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def map(self, arg, na_action=None) -> "Series": """ Map values of Series according to input correspondence. @@ -1015,7 +1014,7 @@ def map(self, arg, na_action=None) -> "Series": result = self.applymap(arg) return result - @annotate("SERIES_GETITEM", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __getitem__(self, arg): if isinstance(arg, slice): return self.iloc[arg] @@ -1026,7 +1025,7 @@ def __getitem__(self, arg): items = SingleColumnFrame.__iter__ - @annotate("SERIES_SETITEM", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __setitem__(self, key, value): if isinstance(key, slice): self.iloc[key] = value @@ -1178,7 +1177,7 @@ def _make_operands_and_index_for_binop( operands = lhs._make_operands_for_binop(other, fill_value, reflect) return operands, lhs._index - @annotate("SERIES_LOGICAL_AND", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def logical_and(self, other): warnings.warn( "Series.logical_and is deprecated and will be removed.", @@ -1186,7 +1185,7 @@ def logical_and(self, other): ) return self._binaryop(other, "__l_and__").astype(np.bool_) - @annotate("SERIES_REMAINDER", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def remainder(self, other): warnings.warn( "Series.remainder is deprecated and will be removed.", @@ -1194,7 +1193,7 @@ def remainder(self, other): ) return self._binaryop(other, "__mod__") - @annotate("SERIES_LOGICAL_OR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def logical_or(self, other): warnings.warn( "Series.logical_or is deprecated and will be removed.", @@ -1202,7 +1201,7 @@ def logical_or(self, other): ) return self._binaryop(other, "__l_or__").astype(np.bool_) - @annotate("SERIES_LOGICAL_NOT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def logical_not(self): warnings.warn( "Series.logical_not is deprecated and will be removed.", @@ -1212,36 +1211,36 @@ def logical_not(self): @copy_docstring(CategoricalAccessor) # type: ignore @property - @annotate("SERIES_CAT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def cat(self): return CategoricalAccessor(parent=self) @copy_docstring(StringMethods) # type: ignore @property - @annotate("SERIES_STR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def str(self): return StringMethods(parent=self) @copy_docstring(ListMethods) # type: ignore @property - @annotate("SERIES_LIST", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def list(self): return ListMethods(parent=self) @copy_docstring(StructMethods) # type: ignore @property - @annotate("SERIES_STRUCT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def struct(self): return StructMethods(parent=self) @property # type: ignore - @annotate("SERIES_DTYPE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def dtype(self): """dtype of the Series""" return self._column.dtype @classmethod - @annotate("SERIES_CONCAT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _concat(cls, objs, axis=0, index=True): # Concatenate index if not provided if index is True: @@ -1312,25 +1311,25 @@ def _concat(cls, objs, axis=0, index=True): return cls(data=col, index=index, name=name) @property # type: ignore - @annotate("SERIES_VALID_COUNT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def valid_count(self): """Number of non-null values""" return self._column.valid_count @property # type: ignore - @annotate("SERIES_NULL_COUNT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def null_count(self): """Number of null values""" return self._column.null_count @property # type: ignore - @annotate("SERIES_NULLABLE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def nullable(self): """A boolean indicating whether a null-mask is needed""" return self._column.nullable @property # type: ignore - @annotate("SERIES_HAS_NULLS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def has_nulls(self): """ Indicator whether Series contains null values. @@ -1359,7 +1358,7 @@ def has_nulls(self): """ return self._column.has_nulls() - @annotate("SERIES_DROPNA", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def dropna(self, axis=0, inplace=False, how=None): """ Return a Series with null values removed. @@ -1439,7 +1438,7 @@ def dropna(self, axis=0, inplace=False, how=None): return self._mimic_inplace(result, inplace=inplace) - @annotate("SERIES_DROP_DUPLICATES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): """ Return Series with duplicate values removed. @@ -1513,7 +1512,7 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False): return self._mimic_inplace(result, inplace=inplace) - @annotate("SERIES_FILLNA", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): @@ -1537,7 +1536,7 @@ def fillna( value=value, method=method, axis=axis, inplace=inplace, limit=limit ) - @annotate("SERIES_ALL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): if bool_only not in (None, True): raise NotImplementedError( @@ -1545,7 +1544,7 @@ def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): ) return super().all(axis, skipna, level, **kwargs) - @annotate("SERIES_ANY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): if bool_only not in (None, True): raise NotImplementedError( @@ -1553,7 +1552,7 @@ def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): ) return super().any(axis, skipna, level, **kwargs) - @annotate("SERIES_TO_PANDAS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_pandas(self, index=True, nullable=False, **kwargs): """ Convert to a Pandas Series. @@ -1618,7 +1617,7 @@ def to_pandas(self, index=True, nullable=False, **kwargs): return s @property # type: ignore - @annotate("SERIES_DATA", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def data(self): """The gpu buffer for the data @@ -1645,12 +1644,12 @@ def data(self): return self._column.data @property # type: ignore - @annotate("SERIES_NULLMASK", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def nullmask(self): """The gpu buffer for the null-mask""" return cudf.Series(self._column.nullmask) - @annotate("SERIES_ASTYPE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def astype(self, dtype, copy=False, errors="raise", **kwargs): if is_dict_like(dtype): if len(dtype) > 1 or self.name not in dtype: @@ -1662,13 +1661,13 @@ def astype(self, dtype, copy=False, errors="raise", **kwargs): dtype = {self.name: dtype} return super().astype(dtype, copy, errors, **kwargs) - @annotate("SERIES_SORT_INDEX", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def sort_index(self, axis=0, *args, **kwargs): if axis not in (0, "index"): raise ValueError("Only axis=0 is valid for Series.") return super().sort_index(axis=axis, *args, **kwargs) - @annotate("SERIES_SORT_VALUES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def sort_values( self, axis=0, @@ -1723,7 +1722,7 @@ def sort_values( ignore_index=ignore_index, ) - @annotate("SERIES_NLARGEST", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def nlargest(self, n=5, keep="first"): """Returns a new Series of the *n* largest element. @@ -1786,7 +1785,7 @@ def nlargest(self, n=5, keep="first"): """ return self._n_largest_or_smallest(True, n, [self.name], keep) - @annotate("SERIES_NSMALLEST", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def nsmallest(self, n=5, keep="first"): """ Returns a new Series of the *n* smallest element. @@ -1862,7 +1861,7 @@ def nsmallest(self, n=5, keep="first"): """ return self._n_largest_or_smallest(False, n, [self.name], keep) - @annotate("SERIES_ARGSORT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def argsort( self, axis=0, @@ -1885,7 +1884,7 @@ def argsort( obj.name = self.name return obj - @annotate("SERIES_REPLACE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def replace(self, to_replace=None, value=None, *args, **kwargs): if is_dict_like(to_replace) and value is not None: raise ValueError( @@ -1895,7 +1894,7 @@ def replace(self, to_replace=None, value=None, *args, **kwargs): return super().replace(to_replace, value, *args, **kwargs) - @annotate("SERIES_UPDATE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def update(self, other): """ Modify Series in place using values from passed Series. @@ -2000,7 +1999,7 @@ def update(self, other): self.mask(mask, other, inplace=True) - @annotate("SERIES_LABEL_ENCODING", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _label_encoding(self, cats, dtype=None, na_sentinel=-1): # Private implementation of deprecated public label_encoding method def _return_sentinel_series(): @@ -2044,7 +2043,7 @@ def _return_sentinel_series(): return codes # UDF related - @annotate("SERIES_APPLY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def apply(self, func, convert_dtype=True, args=(), **kwargs): """ Apply a scalar function to the values of a Series. @@ -2133,7 +2132,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs): raise ValueError("Series.apply only supports convert_dtype=True") return self._apply(func, _get_scalar_kernel, *args, **kwargs) - @annotate("SERIES_APPLY_MAP", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def applymap(self, udf, out_dtype=None): """Apply an elementwise function to transform the values in the Column. @@ -2249,7 +2248,7 @@ def applymap(self, udf, out_dtype=None): # # Stats # - @annotate("SERIES_COUNT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def count(self, level=None, **kwargs): """ Return number of non-NA/null observations in the Series @@ -2276,7 +2275,7 @@ def count(self, level=None, **kwargs): return self.valid_count - @annotate("SERIES_MODE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def mode(self, dropna=True): """ Return the mode(s) of the dataset. @@ -2345,7 +2344,7 @@ def mode(self, dropna=True): return Series(val_counts.index.sort_values(), name=self.name) - @annotate("SERIES_ROUND", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def round(self, decimals=0, how="half_even"): if not is_integer(decimals): raise ValueError( @@ -2354,7 +2353,7 @@ def round(self, decimals=0, how="half_even"): decimals = int(decimals) return super().round(decimals, how) - @annotate("SERIES_COV", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def cov(self, other, min_periods=None): """ Compute covariance with Series, excluding missing values. @@ -2404,7 +2403,7 @@ def cov(self, other, min_periods=None): f"{other.dtype}" ) - @annotate("SERIES_TRANSPOSE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def transpose(self): """Return the transpose, which is by definition self. """ @@ -2413,7 +2412,7 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) - @annotate("SERIES_CORR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def corr(self, other, method="pearson", min_periods=None): """Calculates the sample correlation between two Series, excluding missing values. @@ -2447,7 +2446,7 @@ def corr(self, other, method="pearson", min_periods=None): f"cannot perform corr with types {self.dtype}, {other.dtype}" ) - @annotate("SERIES_AUTOCORR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def autocorr(self, lag=1): """Compute the lag-N autocorrelation. This method computes the Pearson correlation between the Series and its shifted self. @@ -2473,7 +2472,7 @@ def autocorr(self, lag=1): """ return self.corr(self.shift(lag)) - @annotate("SERIES_ISIN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def isin(self, values): """Check whether values are contained in Series. @@ -2543,7 +2542,7 @@ def isin(self, values): {self.name: self._column.isin(values)}, index=self.index ) - @annotate("SERIES_UNIQUE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def unique(self): """ Returns unique values of this Series. @@ -2576,7 +2575,7 @@ def unique(self): res = self._column.unique() return Series(res, name=self.name) - @annotate("SERIES_VALUE_COUNTS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def value_counts( self, normalize=False, @@ -2699,7 +2698,7 @@ def value_counts( res = res / float(res._column.sum()) return res - @annotate("SERIES_QUANTILE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def quantile( self, q=0.5, interpolation="linear", exact=True, quant_index=True ): @@ -2764,7 +2763,7 @@ def quantile( return Series(result, index=index, name=self.name) @docutils.doc_describe() - @annotate("SERIES_DESCRIBE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def describe( self, percentiles=None, @@ -2920,7 +2919,7 @@ def _describe_timestamp(self): else: return _describe_categorical(self) - @annotate("SERIES_DIGITIZE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def digitize(self, bins, right=False): """Return the indices of the bins to which each value in series belongs. @@ -2956,7 +2955,7 @@ def digitize(self, bins, right=False): cudf.core.column.numerical.digitize(self._column, bins, right) ) - @annotate("SERIES_DIFF", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def diff(self, periods=1): """Calculate the difference between values at positions i and i - N in an array and store the output in a new array. @@ -3045,7 +3044,7 @@ def diff(self, periods=1): return Series(output_col, name=self.name, index=self.index) @copy_docstring(SeriesGroupBy) - @annotate("SERIES_GROUPBY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def groupby( self, by=None, @@ -3091,7 +3090,7 @@ def groupby( ) ) - @annotate("SERIES_RENAME", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def rename(self, index=None, copy=True): """ Alter Series name @@ -3137,7 +3136,7 @@ def rename(self, index=None, copy=True): out_data = self._data.copy(deep=copy) return Series._from_data(out_data, self.index, name=index) - @annotate("SERIES_MERGE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def merge( self, other, @@ -3189,21 +3188,21 @@ def merge( return result - @annotate("SERIES_ADD_PREFIX", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def add_prefix(self, prefix): return Series._from_data( data=self._data.copy(deep=True), index=prefix + self.index.astype(str), ) - @annotate("SERIES_ADD_SUFFIX", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def add_suffix(self, suffix): return Series._from_data( data=self._data.copy(deep=True), index=self.index.astype(str) + suffix, ) - @annotate("SERIES_KEYS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def keys(self): """ Return alias for index. @@ -3247,7 +3246,7 @@ def keys(self): """ return self.index - @annotate("SERIES_EXPLODE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def explode(self, ignore_index=False): """ Transform each element of a list-like to a row, replicating index @@ -3284,7 +3283,7 @@ def explode(self, ignore_index=False): """ return super()._explode(self.name, ignore_index) - @annotate("SERIES_PCT_CHANGE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def pct_change( self, periods=1, fill_method="ffill", limit=None, freq=None ): @@ -3437,7 +3436,7 @@ def __init__(self, series): self.series = series @property # type: ignore - @annotate("SERIES_DT_YEAR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def year(self): """ The year of the datetime. @@ -3462,7 +3461,7 @@ def year(self): return self._get_dt_field("year") @property # type: ignore - @annotate("SERIES_DT_MONTH", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def month(self): """ The month as January=1, December=12. @@ -3487,7 +3486,7 @@ def month(self): return self._get_dt_field("month") @property # type: ignore - @annotate("SERIES_DT_DAY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def day(self): """ The day of the datetime. @@ -3512,7 +3511,7 @@ def day(self): return self._get_dt_field("day") @property # type: ignore - @annotate("SERIES_DT_HOUR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def hour(self): """ The hours of the datetime. @@ -3537,7 +3536,7 @@ def hour(self): return self._get_dt_field("hour") @property # type: ignore - @annotate("SERIES_DT_MINUTE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def minute(self): """ The minutes of the datetime. @@ -3562,7 +3561,7 @@ def minute(self): return self._get_dt_field("minute") @property # type: ignore - @annotate("SERIES_DT_SECOND", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def second(self): """ The seconds of the datetime. @@ -3587,7 +3586,7 @@ def second(self): return self._get_dt_field("second") @property # type: ignore - @annotate("SERIES_DT_WEEKDAY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def weekday(self): """ The day of the week with Monday=0, Sunday=6. @@ -3624,7 +3623,7 @@ def weekday(self): return self._get_dt_field("weekday") @property # type: ignore - @annotate("SERIES_DT_DAYOFWEEK", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def dayofweek(self): """ The day of the week with Monday=0, Sunday=6. @@ -3661,7 +3660,7 @@ def dayofweek(self): return self._get_dt_field("weekday") @property # type: ignore - @annotate("SERIES_DT_DAYOFYEAR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def dayofyear(self): """ The day of the year, from 1-365 in non-leap years and @@ -3699,7 +3698,7 @@ def dayofyear(self): return self._get_dt_field("day_of_year") @property # type: ignore - @annotate("SERIES_DT_DAY_OF_YEAR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def day_of_year(self): """ The day of the year, from 1-365 in non-leap years and @@ -3737,7 +3736,7 @@ def day_of_year(self): return self._get_dt_field("day_of_year") @property # type: ignore - @annotate("SERIES_DT_IS_LEAP_YEAR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def is_leap_year(self): """ Boolean indicator if the date belongs to a leap year. @@ -3796,7 +3795,7 @@ def is_leap_year(self): ) @property # type: ignore - @annotate("SERIES_DT_QUARTER", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def quarter(self): """ Integer indicator for which quarter of the year the date belongs in. @@ -3827,7 +3826,7 @@ def quarter(self): {None: res}, index=self.series._index, name=self.series.name, ) - @annotate("SERIES_DT_ISOCALENDAR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def isocalendar(self): """ Returns a DataFrame with the year, week, and day @@ -3872,7 +3871,7 @@ def isocalendar(self): return cudf.core.tools.datetimes._to_iso_calendar(self) @property # type: ignore - @annotate("SERIES_DT_IS_MONTH_START", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def is_month_start(self): """ Booleans indicating if dates are the first day of the month. @@ -3880,7 +3879,7 @@ def is_month_start(self): return (self.day == 1).fillna(False) @property # type: ignore - @annotate("SERIES_DT_DAYS_IN_MONTH", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def days_in_month(self): """ Get the total number of days in the month that the date falls on. @@ -3932,7 +3931,7 @@ def days_in_month(self): ) @property # type: ignore - @annotate("SERIES_DT_IS_MONTH_END", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def is_month_end(self): """ Boolean indicator if the date is the last day of the month. @@ -3979,9 +3978,7 @@ def is_month_end(self): return (self.day == last_day.dt.day).fillna(False) @property # type: ignore - @annotate( - "SERIES_DT_IS_EQUARTER_START", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def is_quarter_start(self): """ Boolean indicator if the date is the first day of a quarter. @@ -4028,7 +4025,7 @@ def is_quarter_start(self): ) @property # type: ignore - @annotate("SERIES_DT_IS_QUARTER_END", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def is_quarter_end(self): """ Boolean indicator if the date is the last day of a quarter. @@ -4077,7 +4074,7 @@ def is_quarter_end(self): ) @property # type: ignore - @annotate("SERIES_DT_IS_YEAR_START", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def is_year_start(self): """ Boolean indicator if the date is the first day of the year. @@ -4112,7 +4109,7 @@ def is_year_start(self): ) @property # type: ignore - @annotate("SERIES_DT_IS_YEAR_END", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def is_year_end(self): """ Boolean indicator if the date is the last day of the year. @@ -4148,14 +4145,14 @@ def is_year_end(self): {None: result}, index=self.series._index, name=self.series.name, ) - @annotate("SERIES_DT_GET_DT_FIELD", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _get_dt_field(self, field): out_column = self.series._column.get_dt_field(field) return Series( data=out_column, index=self.series._index, name=self.series.name ) - @annotate("SERIES_DT_CEIL", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def ceil(self, freq): """ Perform ceil operation on the data to the specified freq. @@ -4192,7 +4189,7 @@ def ceil(self, freq): data={self.series.name: out_column}, index=self.series._index ) - @annotate("SERIES_DT_FLOOR", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def floor(self, freq): """ Perform floor operation on the data to the specified freq. @@ -4229,7 +4226,7 @@ def floor(self, freq): data={self.series.name: out_column}, index=self.series._index ) - @annotate("SERIES_DT_ROUND", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def round(self, freq): """ Perform round operation on the data to the specified freq. @@ -4269,7 +4266,7 @@ def round(self, freq): data={self.series.name: out_column}, index=self.series._index ) - @annotate("SERIES_DT_STRFTIME", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def strftime(self, date_format, *args, **kwargs): """ Convert to Series using specified ``date_format``. @@ -4424,7 +4421,7 @@ def __init__(self, series): self.series = series @property # type: ignore - @annotate("SERIES_TD_DAYS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def days(self): """ Number of days. @@ -4456,7 +4453,7 @@ def days(self): return self._get_td_field("days") @property # type: ignore - @annotate("SERIES_TD_SECONDS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def seconds(self): """ Number of seconds (>= 0 and less than 1 day). @@ -4495,7 +4492,7 @@ def seconds(self): return self._get_td_field("seconds") @property # type: ignore - @annotate("SERIES_TD_MICROSECONDS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def microseconds(self): """ Number of microseconds (>= 0 and less than 1 second). @@ -4527,7 +4524,7 @@ def microseconds(self): return self._get_td_field("microseconds") @property # type: ignore - @annotate("SERIES_TD_NANOSECONDS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def nanoseconds(self): """ Return the number of nanoseconds (n), where 0 <= n < 1 microsecond. @@ -4559,7 +4556,7 @@ def nanoseconds(self): return self._get_td_field("nanoseconds") @property # type: ignore - @annotate("SERIES_TD_COMPONENTS", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def components(self): """ Return a Dataframe of the components of the Timedeltas. @@ -4588,7 +4585,7 @@ def components(self): """ # noqa: E501 return self.series._column.components(index=self.series._index) - @annotate("SERIES_TD_GET_TD_FIELD", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _get_td_field(self, field): out_column = getattr(self.series._column, field) return Series( @@ -4596,7 +4593,7 @@ def _get_td_field(self, field): ) -@annotate("SERIES__ALIGN_INDICES", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def _align_indices(series_list, how="outer", allow_non_unique=False): """ Internal util to align the indices of a list of Series objects @@ -4664,7 +4661,7 @@ def _align_indices(series_list, how="outer", allow_non_unique=False): return result -@annotate("CUDF_ISCLOSE", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): """Returns a boolean array where two arrays are equal within a tolerance. diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 666b743f7ef..85f1d253293 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -18,14 +18,13 @@ import cupy import numpy as np import pandas as pd -from nvtx import annotate import cudf from cudf._typing import Dtype from cudf.api.types import _is_scalar_or_zero_d_array from cudf.core.column import ColumnBase, as_column from cudf.core.frame import Frame -from cudf.utils.utils import NotIterable +from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate T = TypeVar("T", bound="Frame") @@ -43,7 +42,7 @@ class SingleColumnFrame(Frame, NotIterable): "index": 0, } - @annotate("SINGLECOLUMNFRAME_REDUCE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _reduce( self, op, axis=None, level=None, numeric_only=None, **kwargs, ): @@ -62,7 +61,7 @@ def _reduce( except AttributeError: raise TypeError(f"cannot perform {op} with type {self.dtype}") - @annotate("SINGLECOLUMNFRAME_SCAN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _scan(self, op, axis=None, *args, **kwargs): if axis not in (None, 0): raise NotImplementedError("axis parameter is not implemented yet") @@ -70,9 +69,7 @@ def _scan(self, op, axis=None, *args, **kwargs): return super()._scan(op, axis=axis, *args, **kwargs) @classmethod - @annotate( - "SINGLECOLUMNFRAME_FROM_DATA", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _from_data( cls, data: MutableMapping, @@ -86,26 +83,24 @@ def _from_data( return out @property # type: ignore - @annotate("SINGLECOLUMNFRAME_NAME", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def name(self): """Get the name of this object.""" return next(iter(self._data.names)) @name.setter # type: ignore - @annotate( - "SINGLECOLUMNFRAME_NAME_SETTER", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def name(self, value): self._data[value] = self._data.pop(self.name) @property # type: ignore - @annotate("SINGLECOLUMNFRAME_NDIM", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def ndim(self): """Get the dimensionality (always 1 for single-columned frames).""" return 1 @property # type: ignore - @annotate("SINGLECOLUMNFRAME_SHAPE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def shape(self): """Get a tuple representing the dimensionality of the Index.""" return (len(self),) @@ -117,37 +112,31 @@ def __bool__(self): ) @property # type: ignore - @annotate( - "SINGLECOLUMNFRAME_NUM_COLUMNS", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _num_columns(self): return 1 @property # type: ignore - @annotate("SINGLECOLUMNFRAME_COLUMN", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def _column(self): return self._data[self.name] @_column.setter # type: ignore - @annotate( - "SINGLECOLUMNFRAME_COLUMN_SETTER", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def _column(self, value): self._data[self.name] = value @property # type: ignore - @annotate("SINGLECOLUMNFRAME_VALUES", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def values(self): # noqa: D102 return self._column.values @property # type: ignore - @annotate( - "SINGLECOLUMNFRAME_VALUES_HOST", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def values_host(self): # noqa: D102 return self._column.values_host - @annotate("SINGLECOLUMNFRAME_TO_CUPY", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def to_cupy( self, dtype: Union[Dtype, None] = None, @@ -156,9 +145,7 @@ def to_cupy( ) -> cupy.ndarray: # noqa: D102 return super().to_cupy(dtype, copy, na_value).flatten() - @annotate( - "SINGLECOLUMNFRAME_TO_NUMPY", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def to_numpy( self, dtype: Union[Dtype, None] = None, @@ -178,9 +165,7 @@ def tolist(self): # noqa: D102 to_list = tolist @classmethod - @annotate( - "SINGLECOLUMNFRAME_FROM_ARROW", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def from_arrow(cls, array): """Create from PyArrow Array/ChunkedArray. @@ -211,9 +196,7 @@ def from_arrow(cls, array): """ return cls(ColumnBase.from_arrow(array)) - @annotate( - "SINGLECOLUMNFRAME_TO_ARROW", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def to_arrow(self): """ Convert to a PyArrow Array. @@ -245,9 +228,7 @@ def to_arrow(self): return self._column.to_arrow() @property # type: ignore - @annotate( - "SINGLECOLUMNFRAME_IS_UNIQUE", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def is_unique(self): """Return boolean if values in the object are unique. @@ -258,9 +239,7 @@ def is_unique(self): return self._column.is_unique @property # type: ignore - @annotate( - "SINGLECOLUMNFRAME_IS_MONOTONIC", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def is_monotonic(self): """Return boolean if values in the object are monotonically increasing. @@ -273,11 +252,7 @@ def is_monotonic(self): return self.is_monotonic_increasing @property # type: ignore - @annotate( - "SINGLECOLUMNFRAME_IS_MONOTONIC_INCREASING", - color="green", - domain="cudf_python", - ) + @_cudf_nvtx_annotate def is_monotonic_increasing(self): """Return boolean if values in the object are monotonically increasing. @@ -288,11 +263,7 @@ def is_monotonic_increasing(self): return self._column.is_monotonic_increasing @property # type: ignore - @annotate( - "SINGLECOLUMNFRAME_IS_MONOTONIC_DECREASING", - color="green", - domain="cudf_python", - ) + @_cudf_nvtx_annotate def is_monotonic_decreasing(self): """Return boolean if values in the object are monotonically decreasing. @@ -303,17 +274,11 @@ def is_monotonic_decreasing(self): return self._column.is_monotonic_decreasing @property # type: ignore - @annotate( - "SINGLECOLUMNFRAME_CUDA_ARRAY_INTERFACE", - color="green", - domain="cudf_python", - ) + @_cudf_nvtx_annotate def __cuda_array_interface__(self): return self._column.__cuda_array_interface__ - @annotate( - "SINGLECOLUMNFRAME_FACTORIZE", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def factorize(self, na_sentinel=-1): """Encode the input values as integer labels. @@ -341,11 +306,7 @@ def factorize(self, na_sentinel=-1): """ return cudf.core.algorithms.factorize(self, na_sentinel=na_sentinel) - @annotate( - "SINGLECOLUMNFRAME_MAKE_OPERANDS_FOR_BINOP", - color="green", - domain="cudf_python", - ) + @_cudf_nvtx_annotate def _make_operands_for_binop( self, other: Any, @@ -399,7 +360,7 @@ def _make_operands_for_binop( return {result_name: (self._column, other, reflect, fill_value)} - @annotate("SINGLECOLUMNFRAME_NUNIQUE", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def nunique(self, method: builtins.str = "sort", dropna: bool = True): """ Return count of unique values for the column. diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py index a98ee40274e..f5c270a3705 100644 --- a/python/cudf/cudf/core/udf/utils.py +++ b/python/cudf/cudf/core/udf/utils.py @@ -1,3 +1,5 @@ +# Copyright (c) 2020-2022, NVIDIA CORPORATION. + from typing import Callable import cachetools @@ -6,7 +8,6 @@ from numba.core.errors import TypingError from numba.np import numpy_support from numba.types import Poison, Tuple, boolean, int64, void -from nvtx import annotate from cudf.core.dtypes import CategoricalDtype from cudf.core.udf.typing import MaskedType @@ -17,6 +18,7 @@ NUMERIC_TYPES, TIMEDELTA_TYPES, ) +from cudf.utils.utils import _cudf_nvtx_annotate JIT_SUPPORTED_TYPES = ( NUMERIC_TYPES | BOOL_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES @@ -28,7 +30,7 @@ precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32) -@annotate("NUMBA JIT", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def _get_udf_return_type(argty, func: Callable, args=()): """ Get the return type of a masked UDF for a given set of argument dtypes. It @@ -165,7 +167,7 @@ def _generate_cache_key(frame, func: Callable): ) -@annotate("UDF COMPILATION", color="darkgreen", domain="cudf_python") +@_cudf_nvtx_annotate def _compile_or_get(frame, func, args, kernel_getter=None): """ Return a compiled kernel in terms of MaskedTypes that launches a diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 4694243ad18..f15fef19c07 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -1,17 +1,17 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. from io import BytesIO, StringIO -from nvtx import annotate from pyarrow.lib import NativeFile import cudf from cudf import _lib as libcudf from cudf.api.types import is_scalar from cudf.utils import ioutils +from cudf.utils.utils import _cudf_nvtx_annotate -@annotate("READ_CSV", color="purple", domain="cudf_python") +@_cudf_nvtx_annotate @ioutils.doc_read_csv() def read_csv( filepath_or_buffer, @@ -106,7 +106,7 @@ def read_csv( ) -@annotate("WRITE_CSV", color="purple", domain="cudf_python") +@_cudf_nvtx_annotate @ioutils.doc_to_csv() def to_csv( df, diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index 6e4e104df4d..253d7950c54 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -7,7 +7,6 @@ from uuid import uuid4 import numpy as np -from nvtx import annotate from pyarrow import dataset as ds, parquet as pq import cudf @@ -15,9 +14,10 @@ from cudf.api.types import is_list_like from cudf.core.column import as_column, build_categorical_column from cudf.utils import ioutils +from cudf.utils.utils import _cudf_nvtx_annotate -@annotate("_WRITE_PARQUET", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def _write_parquet( df, paths, @@ -75,7 +75,7 @@ def _write_parquet( # Logic chosen to match: https://arrow.apache.org/ # docs/_modules/pyarrow/parquet.html#write_to_dataset -@annotate("WRITE_TO_DATASET", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def write_to_dataset( df, root_path, @@ -164,7 +164,7 @@ def write_to_dataset( @ioutils.doc_read_parquet_metadata() -@annotate("READ_PARQUET_METADATA", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def read_parquet_metadata(path): """{docstring}""" @@ -177,7 +177,7 @@ def read_parquet_metadata(path): return num_rows, num_row_groups, col_names -@annotate("_PROCESS_DATASET", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def _process_dataset( paths, fs, filters=None, row_groups=None, categorical_partitions=True, ): @@ -313,7 +313,7 @@ def _process_dataset( @ioutils.doc_read_parquet() -@annotate("READ_PARQUET", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def read_parquet( filepath_or_buffer, engine="cudf", @@ -441,7 +441,7 @@ def read_parquet( ) -@annotate("_PARQUET_TO_FRAME", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def _parquet_to_frame( paths_or_buffers, *args, @@ -509,7 +509,7 @@ def _parquet_to_frame( ) -@annotate("_WRITE_PARQUET", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def _read_parquet( filepaths_or_buffers, engine, @@ -543,7 +543,7 @@ def _read_parquet( @ioutils.doc_to_parquet() -@annotate("TO_PARQUET", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def to_parquet( df, path, @@ -655,7 +655,7 @@ def _generate_filename(): return uuid4().hex + ".parquet" -@annotate("_GET_PARTITIONED", color="green", domain="cudf_python") +@_cudf_nvtx_annotate def _get_partitioned( df, root_path, @@ -699,7 +699,7 @@ def _get_partitioned( class ParquetDatasetWriter: - @annotate("ParquetDatasetWriter_INIT", color="green", domain="cudf_python") + @_cudf_nvtx_annotate def __init__( self, path, @@ -776,9 +776,7 @@ def __init__( self.path_cw_map: Dict[str, int] = {} self.filename = None - @annotate( - "ParquetDatasetWriter_WRITE_TABLE", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def write_table(self, df): """ Write a dataframe to the file/dataset @@ -835,9 +833,7 @@ def write_table(self, df): self.path_cw_map.update({k: new_cw_idx for k in new_paths}) self._chunked_writers[-1][0].write_table(grouped_df, part_info) - @annotate( - "ParquetDatasetWriter_CLOSE", color="green", domain="cudf_python" - ) + @_cudf_nvtx_annotate def close(self, return_metadata=False): """ Close all open files and optionally return footer metadata as a binary diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index 04809f8fd59..e5a3beb7d61 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -2,14 +2,13 @@ from io import BytesIO, StringIO -from nvtx import annotate - import cudf from cudf._lib import text as libtext from cudf.utils import ioutils +from cudf.utils.utils import _cudf_nvtx_annotate -@annotate("READ_TEXT", color="purple", domain="cudf_python") +@_cudf_nvtx_annotate @ioutils.doc_read_text() def read_text( filepath_or_buffer, delimiter=None, byte_range=None, **kwargs, diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 4fa6b7d934c..1bd3fa7558e 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -2,13 +2,16 @@ import decimal import functools +import hashlib import os import traceback +from functools import partial from typing import FrozenSet, Set, Union import cupy as cp import numpy as np import pandas as pd +from nvtx import annotate import rmm @@ -31,6 +34,7 @@ "__ge__", } +_NVTX_COLORS = ["green", "blue", "purple", "rapids"] # The test root is set by pytest to support situations where tests are run from # a source tree on a built version of cudf. @@ -400,3 +404,25 @@ def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: if (indices == cp.arange(start, stop, step)).all(): return slice(start, stop, step) return indices + + +def _get_color_for_nvtx(name): + m = hashlib.sha256() + m.update(name.encode()) + hash_value = int(m.hexdigest(), 16) + idx = hash_value % len(_NVTX_COLORS) + return _NVTX_COLORS[idx] + + +def _cudf_nvtx_annotate(func, domain="cudf_python"): + """Decorator for applying nvtx annotations to methods in cudf.""" + return annotate( + message=func.__qualname__, + color=_get_color_for_nvtx(func.__qualname__), + domain=domain, + )(func) + + +_dask_cudf_nvtx_annotate = partial( + _cudf_nvtx_annotate, domain="dask_cudf_python" +) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 1b1f3e29ab2..bd9a8fc2769 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -6,7 +6,6 @@ import numpy as np import pandas as pd import pyarrow as pa -from nvtx import annotate from dask.dataframe.core import get_parallel_type, meta_nonempty from dask.dataframe.dispatch import ( @@ -31,6 +30,7 @@ import cudf from cudf.api.types import is_string_dtype +from cudf.utils.utils import _dask_cudf_nvtx_annotate from .core import DataFrame, Index, Series @@ -40,7 +40,7 @@ @meta_nonempty.register(cudf.BaseIndex) -@annotate("_nonempty_index", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _nonempty_index(idx): if isinstance(idx, cudf.core.index.RangeIndex): return cudf.core.index.RangeIndex(2, name=idx.name) @@ -75,7 +75,7 @@ def _nonempty_index(idx): raise TypeError(f"Don't know how to handle index of type {type(idx)}") -@annotate("_get_non_empty_data", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _get_non_empty_data(s): if isinstance(s._column, cudf.core.column.CategoricalColumn): categories = ( @@ -103,7 +103,7 @@ def _get_non_empty_data(s): @meta_nonempty.register(cudf.Series) -@annotate("_nonempty_series", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _nonempty_series(s, idx=None): if idx is None: idx = _nonempty_index(s.index) @@ -113,7 +113,7 @@ def _nonempty_series(s, idx=None): @meta_nonempty.register(cudf.DataFrame) -@annotate("meta_nonempty_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def meta_nonempty_cudf(x): idx = meta_nonempty(x.index) columns_with_dtype = dict() @@ -129,18 +129,18 @@ def meta_nonempty_cudf(x): @make_meta_dispatch.register((cudf.Series, cudf.DataFrame)) -@annotate("make_meta_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def make_meta_cudf(x, index=None): return x.head(0) @make_meta_dispatch.register(cudf.BaseIndex) -@annotate("make_meta_cudf_index", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def make_meta_cudf_index(x, index=None): return x[:0] -@annotate("_empty_series", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _empty_series(name, dtype, index=None): if isinstance(dtype, str) and dtype == "category": return cudf.Series( @@ -150,7 +150,7 @@ def _empty_series(name, dtype, index=None): @make_meta_obj.register(object) -@annotate("make_meta_object_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def make_meta_object_cudf(x, index=None): """Create an empty cudf object containing the desired metadata. @@ -221,7 +221,7 @@ def make_meta_object_cudf(x, index=None): @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.BaseIndex)) -@annotate("concat_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def concat_cudf( dfs, axis=0, @@ -246,13 +246,13 @@ def concat_cudf( @categorical_dtype_dispatch.register( (cudf.DataFrame, cudf.Series, cudf.BaseIndex) ) -@annotate("categorical_dtype_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def categorical_dtype_cudf(categories=None, ordered=None): return cudf.CategoricalDtype(categories=categories, ordered=ordered) @tolist_dispatch.register((cudf.Series, cudf.BaseIndex)) -@annotate("tolist_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def tolist_cudf(obj): return obj.to_arrow().to_pylist() @@ -260,9 +260,7 @@ def tolist_cudf(obj): @is_categorical_dtype_dispatch.register( (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series) ) -@annotate( - "is_categorical_dtype_cudf", color="green", domain="dask_cudf_python" -) +@_dask_cudf_nvtx_annotate def is_categorical_dtype_cudf(obj): return cudf.api.types.is_categorical_dtype(obj) @@ -276,7 +274,7 @@ def is_categorical_dtype_cudf(obj): ) @percentile_lookup.register((cudf.Series, cp.ndarray, cudf.BaseIndex)) - @annotate("percentile_cudf", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def percentile_cudf(a, q, interpolation="linear"): # Cudf dispatch to the equivalent of `np.percentile`: # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html @@ -321,7 +319,7 @@ def percentile_cudf(a, q, interpolation="linear"): @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex)) -@annotate("union_categoricals_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def union_categoricals_cudf( to_union, sort_categories=False, ignore_order=False ): @@ -330,13 +328,13 @@ def union_categoricals_cudf( ) -@annotate("safe_hash", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def safe_hash(frame): return cudf.Series(frame.hash_values(), index=frame.index) @hash_object_dispatch.register((cudf.DataFrame, cudf.Series)) -@annotate("hash_object_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def hash_object_cudf(frame, index=True): if index: return safe_hash(frame.reset_index()) @@ -344,7 +342,7 @@ def hash_object_cudf(frame, index=True): @hash_object_dispatch.register(cudf.BaseIndex) -@annotate("hash_object_cudf_index", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def hash_object_cudf_index(ind, index=None): if isinstance(ind, cudf.MultiIndex): @@ -355,7 +353,7 @@ def hash_object_cudf_index(ind, index=None): @group_split_dispatch.register((cudf.Series, cudf.DataFrame)) -@annotate("group_split_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def group_split_cudf(df, c, k, ignore_index=False): return dict( zip( @@ -370,12 +368,12 @@ def group_split_cudf(df, c, k, ignore_index=False): @sizeof_dispatch.register(cudf.DataFrame) -@annotate("sizeof_cudf_dataframe", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def sizeof_cudf_dataframe(df): return int(df.memory_usage().sum()) @sizeof_dispatch.register((cudf.Series, cudf.BaseIndex)) -@annotate("sizeof_cudf_series_index", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def sizeof_cudf_series_index(obj): return obj.memory_usage() diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index d8802f33941..4d193f34b9f 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -6,7 +6,6 @@ import numpy as np import pandas as pd -from nvtx import annotate from tlz import partition_all import dask @@ -26,6 +25,7 @@ import cudf from cudf import _lib as libcudf +from cudf.utils.utils import _dask_cudf_nvtx_annotate from dask_cudf import sorting from dask_cudf.accessors import ListMethods, StructMethods @@ -58,7 +58,7 @@ def __dask_postcompute__(self): def __dask_postpersist__(self): return type(self), (self._name, self._meta, self.divisions) - @annotate("_FRAME_INIT", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def __init__(self, dsk, name, meta, divisions): if not isinstance(dsk, HighLevelGraph): dsk = HighLevelGraph.from_collections(name, dsk, dependencies=[]) @@ -84,9 +84,7 @@ def __repr__(self): s = "" return s % (type(self).__name__, len(self.dask), self.npartitions) - @annotate( - "_FRAME_to_dask_dataframe", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def to_dask_dataframe(self, **kwargs): """Create a dask.dataframe object from a dask_cudf object""" nullable_pd_dtype = kwargs.get("nullable_pd_dtype", False) @@ -104,9 +102,7 @@ def to_dask_dataframe(self, **kwargs): class DataFrame(_Frame, dd.core.DataFrame): _partition_type = cudf.DataFrame - @annotate( - "DATAFRAME_assign_column", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def _assign_column(self, k, v): def assigner(df, k, v): out = df.copy() @@ -116,7 +112,7 @@ def assigner(df, k, v): meta = assigner(self._meta, k, dask_make_meta(v)) return self.map_partitions(assigner, k, v, meta=meta) - @annotate("DATAFRAME_apply_rows", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): import uuid @@ -136,7 +132,7 @@ def do_apply_rows(df, func, incols, outcols, kwargs): do_apply_rows, func, incols, outcols, kwargs, meta=meta ) - @annotate("DATAFRAME_merge", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def merge(self, other, **kwargs): if kwargs.pop("shuffle", "tasks") != "tasks": raise ValueError( @@ -148,7 +144,7 @@ def merge(self, other, **kwargs): on = list(on) return super().merge(other, on=on, shuffle="tasks", **kwargs) - @annotate("DATAFRAME_join", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def join(self, other, **kwargs): if kwargs.pop("shuffle", "tasks") != "tasks": raise ValueError( @@ -166,7 +162,7 @@ def join(self, other, **kwargs): on = list(on) return super().join(other, how=how, on=on, shuffle="tasks", **kwargs) - @annotate("DATAFRAME_set_index", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def set_index(self, other, sorted=False, divisions=None, **kwargs): if kwargs.pop("shuffle", "tasks") != "tasks": raise ValueError( @@ -238,9 +234,7 @@ def set_index(self, other, sorted=False, divisions=None, **kwargs): **kwargs, ) - @annotate( - "DATAFRAME_sort_values", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def sort_values( self, by, @@ -276,14 +270,14 @@ def sort_values( return df.reset_index(drop=True) return df - @annotate("DATAFRAME_to_parquet", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def to_parquet(self, path, *args, **kwargs): """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" from dask_cudf.io import to_parquet return to_parquet(self, path, *args, **kwargs) - @annotate("DATAFRAME_to_orc", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def to_orc(self, path, **kwargs): """Calls dask_cudf.io.to_orc""" from dask_cudf.io import to_orc @@ -291,7 +285,7 @@ def to_orc(self, path, **kwargs): return to_orc(self, path, **kwargs) @derived_from(pd.DataFrame) - @annotate("DATAFRAME_var", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def var( self, axis=None, @@ -320,9 +314,7 @@ def var( else: return _parallel_var(self, meta, skipna, split_every, out) - @annotate( - "DATAFRAME_repartition", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def repartition(self, *args, **kwargs): """Wraps dask.dataframe DataFrame.repartition method. Uses DataFrame.shuffle if `columns=` is specified. @@ -345,7 +337,7 @@ def repartition(self, *args, **kwargs): ) return super().repartition(*args, **kwargs) - @annotate("DATAFRAME_shuffle", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def shuffle(self, *args, **kwargs): """Wraps dask.dataframe DataFrame.shuffle method""" shuffle_arg = kwargs.pop("shuffle", None) @@ -353,21 +345,21 @@ def shuffle(self, *args, **kwargs): raise ValueError("dask_cudf does not support disk-based shuffle.") return super().shuffle(*args, shuffle="tasks", **kwargs) - @annotate("DATAFRAME_groupby", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def groupby(self, by=None, **kwargs): from .groupby import CudfDataFrameGroupBy return CudfDataFrameGroupBy(self, by=by, **kwargs) -@annotate("DATAFRAME_sum_of_squares", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def sum_of_squares(x): x = x.astype("f8")._column outcol = libcudf.reduce.reduce("sum_of_squares", x) return cudf.Series(outcol) -@annotate("DATAFRAME_var_aggregate", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def var_aggregate(x2, x, n, ddof): try: with warnings.catch_warnings(record=True): @@ -380,12 +372,12 @@ def var_aggregate(x2, x, n, ddof): return np.float64(np.nan) -@annotate("DATAFRAME_nlargest_agg", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def nlargest_agg(x, **kwargs): return cudf.concat(x).nlargest(**kwargs) -@annotate("DATAFRAME_nsmallest_agg", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def nsmallest_agg(x, **kwargs): return cudf.concat(x).nsmallest(**kwargs) @@ -393,7 +385,7 @@ def nsmallest_agg(x, **kwargs): class Series(_Frame, dd.core.Series): _partition_type = cudf.Series - @annotate("Series_count", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def count(self, split_every=False): return reduction( [self], @@ -403,14 +395,14 @@ def count(self, split_every=False): meta="i8", ) - @annotate("Series_mean", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def mean(self, split_every=False): sum = self.sum(split_every=split_every) n = self.count(split_every=split_every) return sum / n @derived_from(pd.DataFrame) - @annotate("Series_var", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def var( self, axis=None, @@ -439,19 +431,19 @@ def var( else: return _parallel_var(self, meta, skipna, split_every, out) - @annotate("Series_groupby", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def groupby(self, *args, **kwargs): from .groupby import CudfSeriesGroupBy return CudfSeriesGroupBy(self, *args, **kwargs) @property - @annotate("Series_list", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def list(self): return ListMethods(self) @property - @annotate("Series_struct", color="green", domain="dask_cudf_python") + @_dask_cudf_nvtx_annotate def struct(self): return StructMethods(self) @@ -460,7 +452,7 @@ class Index(Series, dd.core.Index): _partition_type = cudf.Index # type: ignore -@annotate("_naive_var", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _naive_var(ddf, meta, skipna, ddof, split_every, out): num = ddf._get_numeric_data() x = 1.0 * num.sum(skipna=skipna, split_every=split_every) @@ -475,7 +467,7 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out): return handle_out(out, result) -@annotate("_parallel_var", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _parallel_var(ddf, meta, skipna, split_every, out): def _local_var(x, skipna): if skipna: @@ -542,7 +534,7 @@ def _finalize_var(vals): return handle_out(out, result) -@annotate("_extract_meta", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _extract_meta(x): """ Extract internal cache data (``_meta``) from dask_cudf objects @@ -558,7 +550,7 @@ def _extract_meta(x): return x -@annotate("_emulate", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _emulate(func, *args, **kwargs): """ Apply a function using args / kwargs. If arguments contain dd.DataFrame / @@ -568,7 +560,7 @@ def _emulate(func, *args, **kwargs): return func(*_extract_meta(args), **_extract_meta(kwargs)) -@annotate("align_partitions", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def align_partitions(args): """Align partitions between dask_cudf objects. @@ -584,7 +576,7 @@ def align_partitions(args): return args -@annotate("reduction", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def reduction( args, chunk=None, @@ -723,7 +715,7 @@ def reduction( return dd.core.new_dd_object(graph, b, meta, (None, None)) -@annotate("from_cudf", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): if isinstance(getattr(data, "index", None), cudf.MultiIndex): raise NotImplementedError( @@ -745,7 +737,7 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): ) -@annotate("from_dask_dataframe", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def from_dask_dataframe(df): return df.map_partitions(cudf.from_pandas) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 658e63ea923..76533706030 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -6,7 +6,6 @@ import numpy as np import pandas as pd -from nvtx import annotate from dask.base import tokenize from dask.dataframe.core import ( @@ -20,6 +19,7 @@ from dask.highlevelgraph import HighLevelGraph import cudf +from cudf.utils.utils import _dask_cudf_nvtx_annotate SUPPORTED_AGGS = ( "count", @@ -36,19 +36,13 @@ class CudfDataFrameGroupBy(DataFrameGroupBy): - @annotate( - "CudfDataFrameGroupBy_INIT", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def __init__(self, *args, **kwargs): self.sep = kwargs.pop("sep", "___") self.as_index = kwargs.pop("as_index", True) super().__init__(*args, **kwargs) - @annotate( - "CudfDataFrameGroupBy_GETITEM", - color="green", - domain="dask_cudf_python", - ) + @_dask_cudf_nvtx_annotate def __getitem__(self, key): if isinstance(key, list): g = CudfDataFrameGroupBy( @@ -62,9 +56,7 @@ def __getitem__(self, key): g._meta = g._meta[key] return g - @annotate( - "CudfDataFrameGroupBy_MEAN", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def mean(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -78,11 +70,7 @@ def mean(self, split_every=None, split_out=1): as_index=self.as_index, ) - @annotate( - "CudfDataFrameGroupBy_COLLECT", - color="green", - domain="dask_cudf_python", - ) + @_dask_cudf_nvtx_annotate def collect(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -96,11 +84,7 @@ def collect(self, split_every=None, split_out=1): as_index=self.as_index, ) - @annotate( - "CudfDataFrameGroupBy_AGGREGATE", - color="green", - domain="dask_cudf_python", - ) + @_dask_cudf_nvtx_annotate def aggregate(self, arg, split_every=None, split_out=1): if arg == "size": return self.size() @@ -140,17 +124,13 @@ def aggregate(self, arg, split_every=None, split_out=1): class CudfSeriesGroupBy(SeriesGroupBy): - @annotate( - "CudfSeriesGroupBy_INIT", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def __init__(self, *args, **kwargs): self.sep = kwargs.pop("sep", "___") self.as_index = kwargs.pop("as_index", True) super().__init__(*args, **kwargs) - @annotate( - "CudfSeriesGroupBy_MEAN", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def mean(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -164,9 +144,7 @@ def mean(self, split_every=None, split_out=1): as_index=self.as_index, )[self._slice] - @annotate( - "CudfSeriesGroupBy_STD", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def std(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -180,9 +158,7 @@ def std(self, split_every=None, split_out=1): as_index=self.as_index, )[self._slice] - @annotate( - "CudfSeriesGroupBy_VAR", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def var(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -196,9 +172,7 @@ def var(self, split_every=None, split_out=1): as_index=self.as_index, )[self._slice] - @annotate( - "CudfSeriesGroupBy_COLLECT", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def collect(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -212,9 +186,7 @@ def collect(self, split_every=None, split_out=1): as_index=self.as_index, )[self._slice] - @annotate( - "CudfSeriesGroupBy_AGGREGATE", color="green", domain="dask_cudf_python" - ) + @_dask_cudf_nvtx_annotate def aggregate(self, arg, split_every=None, split_out=1): if arg == "size": return self.size() @@ -245,7 +217,7 @@ def aggregate(self, arg, split_every=None, split_out=1): ) -@annotate("groupby_agg", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def groupby_agg( ddf, gb_cols, @@ -412,7 +384,7 @@ def groupby_agg( return new_dd_object(graph, gb_agg_name, _meta, divisions) -@annotate("_redirect_aggs", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _redirect_aggs(arg): """Redirect aggregations to their corresponding name in cuDF""" redirects = { @@ -439,7 +411,7 @@ def _redirect_aggs(arg): return redirects.get(arg, arg) -@annotate("_is_supported", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _is_supported(arg, supported: set): """Check that aggregations in `arg` are a subset of `supported`""" if isinstance(arg, (list, dict)): @@ -465,7 +437,7 @@ def _make_name(*args, sep="_"): return sep.join(_args) -@annotate("_groupby_partition_agg", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _groupby_partition_agg( df, gb_cols, aggs, columns, split_out, dropna, sort, sep ): @@ -523,7 +495,7 @@ def _groupby_partition_agg( return output -@annotate("_tree_node_agg", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep): """Node in groupby-aggregation reduction tree. @@ -558,7 +530,7 @@ def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep): return gb -@annotate("_var_agg", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): """Calculate variance (given count, sum, and sum-squared columns).""" @@ -580,7 +552,7 @@ def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): return var -@annotate("_finalize_gb_agg", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _finalize_gb_agg( gb, gb_cols, diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index ada738c5a9b..5b286b0ff3d 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -5,7 +5,6 @@ import cupy import numpy as np import tlz as toolz -from nvtx import annotate from dask.base import tokenize from dask.dataframe import methods @@ -16,16 +15,17 @@ import cudf as gd from cudf.api.types import is_categorical_dtype +from cudf.utils.utils import _dask_cudf_nvtx_annotate -@annotate("set_index_post", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def set_index_post(df, index_name, drop, column_dtype): df2 = df.set_index(index_name, drop=drop) df2.columns = df2.columns.astype(column_dtype) return df2 -@annotate("_set_partitions_pre", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): if ascending: partitions = divisions.searchsorted(s, side="right") - 1 @@ -42,7 +42,7 @@ def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): return partitions -@annotate("_quantile", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _quantile(a, q): n = len(a) if not len(a): @@ -50,7 +50,7 @@ def _quantile(a, q): return (a.quantiles(q=q.tolist(), interpolation="nearest"), n) -@annotate("merge_quantiles", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def merge_quantiles(finalq, qs, vals): """Combine several quantile calculations of different data. [NOTE: Same logic as dask.array merge_percentiles] @@ -113,7 +113,7 @@ def _append_counts(val, count): return rv.reset_index(drop=True) -@annotate("_approximate_quantile", color="green", domain="dask_cudf_python") +@_dask_cudf_nvtx_annotate def _approximate_quantile(df, q): """Approximate quantiles of DataFrame or Series. [NOTE: Same logic as dask.dataframe Series quantile] @@ -187,7 +187,7 @@ def set_quantile_index(df): return df -@annotate("quantile_divisions", color="green", domain="cudf_python") +@_dask_cudf_nvtx_annotate def quantile_divisions(df, by, npartitions): qn = np.linspace(0.0, 1.0, npartitions + 1).tolist() divisions = _approximate_quantile(df[by], qn).compute() @@ -221,7 +221,7 @@ def quantile_divisions(df, by, npartitions): return divisions -@annotate("sort_values", color="green", domain="cudf_python") +@_dask_cudf_nvtx_annotate def sort_values( df, by,