diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8a49efabcde..c3c79135c34 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -112,6 +112,7 @@ def __setitem__(self, key, value): key = (key, slice(None)) return self._setitem_tuple_arg(key, value) + @annotate("_CAN_DOWNCAST_TO_SERIES", color="green", domain="cudf_python") def _can_downcast_to_series(self, df, arg): """ This method encapsulates the logic used @@ -152,6 +153,7 @@ def _can_downcast_to_series(self, df, arg): return True return False + @annotate("_DOWNCAST_TO_SERIES", color="green", domain="cudf_python") def _downcast_to_series(self, df, arg): """ "Downcast" from a DataFrame to a Series @@ -193,6 +195,7 @@ class _DataFrameLocIndexer(_DataFrameIndexer): For selection by label. """ + @annotate("_GETITEM_SCALAR", color="green", domain="cudf_python") def _getitem_scalar(self, arg): return self._frame[arg[1]].loc[arg[0]] @@ -634,6 +637,9 @@ def __init__( if dtype: self._data = self.astype(dtype)._data + @annotate( + "DATAFRAME_INIT_FROM_SERIES_LIST", color="blue", domain="cudf_python" + ) def _init_from_series_list(self, data, columns, index): if index is None: # When `index` is `None`, the final index of @@ -732,6 +738,9 @@ def _init_from_series_list(self, data, columns, index): ) self._data = self._data.select_by_label(columns) + @annotate( + "DATAFRAME_INIT_FROM_LIST_LIKE", color="blue", domain="cudf_python" + ) def _init_from_list_like(self, data, index=None, columns=None): if index is None: index = RangeIndex(start=0, stop=len(data)) @@ -768,6 +777,9 @@ def _init_from_list_like(self, data, index=None, columns=None): self.columns = columns + @annotate( + "DATAFRAME_INIT_FROM_DICT_LIKE", color="blue", domain="cudf_python" + ) def _init_from_dict_like( self, data, index=None, columns=None, nan_as_null=None ): @@ -841,6 +853,11 @@ def _from_data( return out @staticmethod + @annotate( + "DATAFRAME_ALIGN_INPUT_SERIES_INDICES", + color="blue", + domain="cudf_python", + ) def _align_input_series_indices(data, index): data = data.copy() @@ -1186,6 +1203,7 @@ def __delitem__(self, name): """ self._drop_column(name) + @annotate("DATAFRAME_SLICE", color="blue", domain="cudf_python") def _slice(self: T, arg: slice) -> T: """ _slice : slice the frame as per the arg @@ -1247,11 +1265,13 @@ def _slice(self: T, arg: slice) -> T: result.columns = self.columns return result + @annotate("DATAFRAME_MEMORY_USAGE", color="blue", domain="cudf_python") def memory_usage(self, index=True, deep=False): return Series( {str(k): v for k, v in super().memory_usage(index, deep).items()} ) + @annotate("DATAFRAME_ARRAY_UFUNC", color="blue", domain="cudf_python") def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if method == "__call__" and hasattr(cudf, ufunc.__name__): func = getattr(cudf, ufunc.__name__) @@ -1259,6 +1279,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): else: return NotImplemented + @annotate("DATAFRAME_ARRAY_FUNCTION", color="blue", domain="cudf_python") def __array_function__(self, func, types, args, kwargs): cudf_df_module = DataFrame @@ -1297,6 +1318,7 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented # The _get_numeric_data method is necessary for dask compatibility. 
+ @annotate("DATAFRAME_GET_NUMERIC_DATA", color="blue", domain="cudf_python") def _get_numeric_data(self): """Return a dataframe with only numeric data types""" columns = [ @@ -1306,6 +1328,7 @@ def _get_numeric_data(self): ] return self[columns] + @annotate("DATAFRAME_ASSIGN", color="blue", domain="cudf_python") def assign(self, **kwargs): """ Assign columns to DataFrame from keyword arguments. @@ -1784,10 +1807,12 @@ def _get_renderable_dataframe(self): return output + @annotate("DATAFRAME_REPR", color="blue", domain="cudf_python") def __repr__(self): output = self._get_renderable_dataframe() return self._clean_renderable_dataframe(output) + @annotate("DATAFRAME_REPR_HTML", color="blue", domain="cudf_python") def _repr_html_(self): lines = ( self._get_renderable_dataframe() @@ -1804,9 +1829,13 @@ def _repr_html_(self): lines.append("") return "\n".join(lines) + @annotate("DATAFRAME_REPR_LATEX", color="blue", domain="cudf_python") def _repr_latex_(self): return self._get_renderable_dataframe().to_pandas()._repr_latex_() + @annotate( + "DATAFRAME_GET_COLUMNS_BY_LABEL", color="blue", domain="cudf_python" + ) def _get_columns_by_label(self, labels, downcast=False): """ Return columns of dataframe by `labels` @@ -1829,6 +1858,7 @@ def _get_columns_by_label(self, labels, downcast=False): ) return out + @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python") def _binaryop( self, other: Any, @@ -1919,6 +1949,7 @@ def _binaryop( index=lhs._index, ) + @annotate("DATAFRAME_UPDATE", color="blue", domain="cudf_python") def update( self, other, @@ -2012,14 +2043,17 @@ def update( self._mimic_inplace(source_df, inplace=True) + @annotate("DATAFRAME_ITER", color="blue", domain="cudf_python") def __iter__(self): return iter(self.columns) + @annotate("DATAFRAME_ITERITEMS", color="blue", domain="cudf_python") def iteritems(self): """Iterate over column names and series pairs""" for k in self: yield (k, self[k]) + @annotate("DATAFRAME_EQUALS", color="blue", domain="cudf_python") def equals(self, other, **kwargs): ret = super().equals(other) # If all other checks matched, validate names. @@ -2082,6 +2116,7 @@ def columns(self, columns): data, multiindex=is_multiindex, level_names=columns.names, ) + @annotate("DATAFRAME_REINDEX_INTERNAL", color="blue", domain="cudf_python") def _reindex( self, columns, dtypes=None, deep=False, index=None, inplace=False ): @@ -2158,6 +2193,7 @@ def _reindex( return self._mimic_inplace(result, inplace=inplace) + @annotate("DATAFRAME_REINDEX", color="blue", domain="cudf_python") def reindex( self, labels=None, axis=None, index=None, columns=None, copy=True ): @@ -2236,6 +2272,7 @@ def reindex( inplace=False, ) + @annotate("DATAFRAME_SET_INDEX", color="blue", domain="cudf_python") def set_index( self, keys, @@ -2496,7 +2533,7 @@ def reset_index( inplace=inplace, ) - @annotate("INSERT", color="green", domain="cudf_python") + @annotate("DATAFRAME_INSERT", color="green", domain="cudf_python") def insert(self, loc, name, value, nan_as_null=None): """Add a column to DataFrame at the index specified by loc. 
@@ -2622,6 +2659,7 @@ def diff(self, periods=1, axis=0): return self - self.shift(periods=periods) + @annotate("DATAFRAME_DROP", color="green", domain="cudf_python") def drop( self, labels=None, @@ -2795,12 +2833,14 @@ def drop( if not inplace: return out + @annotate("DATAFRAME_DROP_COLUMN", color="green", domain="cudf_python") def _drop_column(self, name): """Drop a column by *name*""" if name not in self._data: raise KeyError(f"column '{name}' does not exist") del self._data[name] + @annotate("DATAFRAME_DROP_DUPLICATES", color="green", domain="cudf_python") def drop_duplicates( self, subset=None, keep="first", inplace=False, ignore_index=False ): @@ -2878,12 +2918,14 @@ def drop_duplicates( return self._mimic_inplace(outdf, inplace=inplace) + @annotate("DATAFRAME_POP", color="green", domain="cudf_python") def pop(self, item): """Return a column and drop it from the DataFrame.""" popped = self[item] del self[item] return popped + @annotate("DATAFRAME_RENAME", color="green", domain="cudf_python") def rename( self, mapper=None, @@ -3027,6 +3069,7 @@ def rename( else: return out.copy(deep=copy) + @annotate("DATAFRAME_ADD_PREFIX", color="green", domain="cudf_python") def add_prefix(self, prefix): out = self.copy(deep=True) out.columns = [ @@ -3034,6 +3077,7 @@ def add_prefix(self, prefix): ] return out + @annotate("DATAFRAME_ADD_SUFFIX", color="green", domain="cudf_python") def add_suffix(self, suffix): out = self.copy(deep=True) out.columns = [ @@ -3041,6 +3085,7 @@ def add_suffix(self, suffix): ] return out + @annotate("DATAFRAME_AGG", color="green", domain="cudf_python") def agg(self, aggs, axis=None): """ Aggregate using one or more operations over the specified axis. @@ -3172,6 +3217,7 @@ def agg(self, aggs, axis=None): else: raise ValueError("argument must be a string, list or dict") + @annotate("DATAFRAME_NLARGEST", color="green", domain="cudf_python") def nlargest(self, n, columns, keep="first"): """Get the rows of the DataFrame sorted by the n largest value of *columns* @@ -3303,6 +3349,7 @@ def nsmallest(self, n, columns, keep="first"): """ return self._n_largest_or_smallest(False, n, columns, keep) + @annotate("DATAFRAME_TRANSPOSE", color="green", domain="cudf_python") def transpose(self): """Transpose index and columns. @@ -3333,6 +3380,7 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) + @annotate("DATAFRAME_MELT", color="green", domain="cudf_python") def melt(self, **kwargs): """Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -3362,7 +3410,7 @@ def melt(self, **kwargs): return melt(self, **kwargs) - @annotate("JOIN", color="blue", domain="cudf_python") + @annotate("DATAFRAME_JOIN", color="blue", domain="cudf_python") def merge( self, right, @@ -3544,6 +3592,7 @@ def join( ) return df + @annotate("DATAFRAME_GROUPBY", color="green", domain="cudf_python") @copy_docstring(DataFrameGroupBy) def groupby( self, @@ -3659,7 +3708,7 @@ def query(self, expr, local_dict=None): """ # can't use `annotate` decorator here as we inspect the calling # environment. 
- with annotate("QUERY", color="purple", domain="cudf_python"): + with annotate("DATAFRAME_QUERY", color="purple", domain="cudf_python"): if local_dict is None: local_dict = {} @@ -3683,6 +3732,7 @@ def query(self, expr, local_dict=None): boolmask = queryutils.query_execute(self, expr, callenv) return self._apply_boolean_mask(boolmask) + @annotate("DATAFRAME_APPLY", color="green", domain="cudf_python") def apply( self, func, axis=1, raw=False, result_type=None, args=(), **kwargs ): @@ -3831,6 +3881,7 @@ def apply( return self._apply(func, _get_row_kernel, *args, **kwargs) + @annotate("DATAFRAME_APPLY_ROWS", color="green", domain="cudf_python") @applyutils.doc_apply() def apply_rows( self, @@ -3909,6 +3960,7 @@ def apply_rows( cache_key=cache_key, ) + @annotate("DATAFRAME_APPLY_CHUNKS", color="green", domain="cudf_python") @applyutils.doc_applychunks() def apply_chunks( self, @@ -3976,6 +4028,9 @@ def apply_chunks( tpb=tpb, ) + @annotate( + "DATAFRAME_PARTITION_BY_HASH", color="green", domain="cudf_python" + ) def partition_by_hash(self, columns, nparts, keep_index=True): """Partition the dataframe by the hashed value of data in *columns*. @@ -4313,6 +4368,7 @@ def _sizeof_fmt(num, size_qualifier): cudf.utils.ioutils.buffer_write_lines(buf, lines) + @annotate("DATAFRAME_DESCRIBE", color="green", domain="cudf_python") @docutils.doc_describe() def describe( self, @@ -4372,6 +4428,7 @@ def describe( sort=False, ) + @annotate("DATAFRAME_TO_PANDAS", color="green", domain="cudf_python") def to_pandas(self, nullable=False, **kwargs): """ Convert to a Pandas DataFrame. @@ -4458,6 +4515,7 @@ def to_pandas(self, nullable=False, **kwargs): return out_df @classmethod + @annotate("DATAFRAME_FROM_PANDAS", color="green", domain="cudf_python") def from_pandas(cls, dataframe, nan_as_null=None): """ Convert from a Pandas DataFrame. @@ -4527,6 +4585,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): return result @classmethod + @annotate("DATAFRAME_FROM_ARROW", color="green", domain="cudf_python") def from_arrow(cls, table): """ Convert from PyArrow Table to DataFrame. @@ -4582,6 +4641,7 @@ def from_arrow(cls, table): return out + @annotate("DATAFRAME_TO_ARROW", color="green", domain="cudf_python") def to_arrow(self, preserve_index=True): """ Convert to a PyArrow Table. @@ -4663,6 +4723,7 @@ def to_arrow(self, preserve_index=True): return out.replace_schema_metadata(metadata) + @annotate("DATAFRAME_TO_RECORDS", color="green", domain="cudf_python") def to_records(self, index=True): """Convert to a numpy recarray @@ -4686,6 +4747,7 @@ def to_records(self, index=True): return ret @classmethod + @annotate("DATAFRAME_FROM_RECORDS", color="green", domain="cudf_python") def from_records(cls, data, index=None, columns=None, nan_as_null=False): """ Convert structured or record ndarray to DataFrame. @@ -4747,6 +4809,9 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): return df @classmethod + @annotate( + "DATAFRAME_FROM_ARRAYS_INTERNAL", color="green", domain="cudf_python" + ) def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): """Convert a numpy/cupy array to DataFrame. 
@@ -4806,6 +4871,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): df._index = as_index(index) return df + @annotate("DATAFRAME_INTERPOLATE", color="green", domain="cudf_python") def interpolate( self, method="linear", @@ -4836,6 +4902,7 @@ def interpolate( **kwargs, ) + @annotate("DATAFRAME_QUANTILE", color="green", domain="cudf_python") def quantile( self, q=0.5, @@ -4951,6 +5018,7 @@ def quantile( result.index = q return result + @annotate("DATAFRAME_QUANTILES", color="green", domain="cudf_python") def quantiles(self, q=0.5, interpolation="nearest"): """ Return values at the given quantile. @@ -4990,6 +5058,7 @@ def quantiles(self, q=0.5, interpolation="nearest"): result.index = as_index(q) return result + @annotate("DATAFRAME_ISIN", color="green", domain="cudf_python") def isin(self, values): """ Whether each element in the DataFrame is contained in values. @@ -5127,6 +5196,9 @@ def isin(self, values): # # Stats # + @annotate( + "DATAFRAME_PREPARE_FOR_ROWWISE_OP", color="green", domain="cudf_python" + ) def _prepare_for_rowwise_op(self, method, skipna): """Prepare a DataFrame for CuPy-based row-wise operations.""" @@ -5176,6 +5248,7 @@ def _prepare_for_rowwise_op(self, method, skipna): coerced = coerced.astype("int64", copy=False) return coerced, mask, common_dtype + @annotate("DATAFRAME_COUNT", color="green", domain="cudf_python") def count(self, axis=0, level=None, numeric_only=False, **kwargs): """ Count ``non-NA`` cells for each column or row. @@ -5222,6 +5295,7 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): "columns": 1, } + @annotate("DATAFRAME_REDUCE", color="green", domain="cudf_python") def _reduce( self, op, axis=None, level=None, numeric_only=None, **kwargs, ): @@ -5246,6 +5320,7 @@ def _reduce( elif axis == 1: return self._apply_cupy_method_axis_1(op, **kwargs) + @annotate("DATAFRAME_SCAN", color="green", domain="cudf_python") def _scan( self, op, axis=None, *args, **kwargs, ): @@ -5256,6 +5331,7 @@ def _scan( elif axis == 1: return self._apply_cupy_method_axis_1(f"cum{op}", **kwargs) + @annotate("DATAFRAME_MODE", color="green", domain="cudf_python") def mode(self, axis=0, numeric_only=False, dropna=True): """ Get the mode(s) of each element along the selected axis. 
@@ -5355,6 +5431,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True): return df + @annotate("DATAFRAME_KURTOSIS", color="green", domain="cudf_python") def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -5363,6 +5440,7 @@ def kurtosis( axis, skipna, level, numeric_only, **kwargs ) + @annotate("DATAFRAME_SKEW", color="green", domain="cudf_python") def skew( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -5371,14 +5449,17 @@ def skew( axis, skipna, level, numeric_only, **kwargs ) + @annotate("DATAFRAME_ALL", color="green", domain="cudf_python") def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).all(axis, skipna, level, **kwargs) + @annotate("DATAFRAME_ANY", color="green", domain="cudf_python") def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).any(axis, skipna, level, **kwargs) + @annotate("DATAFRAME_APPLY_CUPY", color="green", domain="cudf_python") def _apply_cupy_method_axis_1(self, method, *args, **kwargs): # This method uses cupy to perform scans and reductions along rows of a # DataFrame. Since cuDF is designed around columnar storage and @@ -5479,6 +5560,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result_df.columns = prepared.columns return result_df + @annotate("DATAFRAME_COLUMNS_VIEW", color="green", domain="cudf_python") def _columns_view(self, columns): """ Return a subset of the DataFrame's columns as a view. @@ -5487,6 +5569,7 @@ def _columns_view(self, columns): {col: self._data[col] for col in columns}, index=self.index ) + @annotate("DATAFRAME_SELECT_DTYPES", color="green", domain="cudf_python") def select_dtypes(self, include=None, exclude=None): """Return a subset of the DataFrame’s columns based on the column dtypes. @@ -5673,6 +5756,7 @@ def to_orc(self, fname, compression=None, *args, **kwargs): orc.to_orc(self, fname, compression, *args, **kwargs) + @annotate("DATAFRAME_STACK", color="green", domain="cudf_python") def stack(self, level=-1, dropna=True): """Stack the prescribed level(s) from columns to index @@ -5734,6 +5818,7 @@ def stack(self, level=-1, dropna=True): else: return result + @annotate("DATAFRAME_COV", color="green", domain="cudf_python") def cov(self, **kwargs): """Compute the covariance matrix of a DataFrame. @@ -5751,6 +5836,7 @@ def cov(self, **kwargs): df.columns = self.columns return df + @annotate("DATAFRAME_CORR", color="green", domain="cudf_python") def corr(self): """Compute the correlation matrix of a DataFrame.""" corr = cupy.corrcoef(self.values, rowvar=False) @@ -5758,6 +5844,7 @@ def corr(self): df.columns = self.columns return df + @annotate("DATAFRAME_TO_STRUCT", color="green", domain="cudf_python") def to_struct(self, name=None): """ Return a struct Series composed of the columns of the DataFrame. @@ -5782,6 +5869,7 @@ def to_struct(self, name=None): name=name, ) + @annotate("DATAFRAME_KEYS", color="green", domain="cudf_python") def keys(self): """ Get the columns. @@ -5829,6 +5917,7 @@ def iterrows(self): "if you wish to iterate over each row." 
) + @annotate("DATAFRAME_APPEND", color="green", domain="cudf_python") def append( self, other, ignore_index=False, verify_integrity=False, sort=False ): @@ -5981,6 +6070,7 @@ def append( return cudf.concat(to_concat, ignore_index=ignore_index, sort=sort) + @annotate("DATAFRAME_PIVOT", color="green", domain="cudf_python") @copy_docstring(reshape.pivot) def pivot(self, index, columns, values=None): @@ -5988,12 +6078,14 @@ def pivot(self, index, columns, values=None): self, index=index, columns=columns, values=values ) + @annotate("DATAFRAME_UNSTACK", color="green", domain="cudf_python") @copy_docstring(reshape.unstack) def unstack(self, level=-1, fill_value=None): return cudf.core.reshape.unstack( self, level=level, fill_value=fill_value ) + @annotate("DATAFRAME_EXPLODE", color="green", domain="cudf_python") def explode(self, column, ignore_index=False): """ Transform each element of a list-like to a row, replicating index @@ -6199,6 +6291,7 @@ def func(left, right, output): ) +@annotate("CUDF_FROM_PANDAS", color="green", domain="cudf_python") def from_pandas(obj, nan_as_null=None): """ Convert certain Pandas objects into the cudf equivalent. @@ -6319,6 +6412,7 @@ def from_pandas(obj, nan_as_null=None): ) +@annotate("CUDF_MERGE", color="green", domain="cudf_python") def merge(left, right, *args, **kwargs): return left.merge(right, *args, **kwargs) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8d8dd2a1bc0..2d073d46891 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -123,6 +123,7 @@ def deserialize(cls, header, frames): return cls_deserialize._from_data(dict(zip(column_names, columns))) @classmethod + @annotate("FRAME_FROM_DATA", color="green", domain="cudf_python") def _from_data( cls, data: MutableMapping, @@ -133,6 +134,7 @@ def _from_data( return obj @classmethod + @annotate("FRAME_FROM_COLUMNS", color="green", domain="cudf_python") def _from_columns( cls, columns: List[ColumnBase], @@ -163,6 +165,9 @@ def _from_columns( return cls._from_data(data, index) + @annotate( + "FRAME_FROM_COLUMNS_LIKE_SELF", color="green", domain="cudf_python" + ) def _from_columns_like_self( self, columns: List[ColumnBase], @@ -360,6 +365,7 @@ def memory_usage(self, deep=False): def __len__(self): return self._num_rows + @annotate("FRAME_COPY", color="green", domain="cudf_python") def copy(self: T, deep: bool = True) -> T: """ Make a copy of this object's indices and data. @@ -445,6 +451,7 @@ def copy(self: T, deep: bool = True) -> T: return new_frame + @annotate("FRAME_EQUALS", color="green", domain="cudf_python") def equals(self, other, **kwargs): """ Test whether two objects contain the same elements. @@ -527,6 +534,7 @@ def equals(self, other, **kwargs): else: return self._index.equals(other._index) + @annotate("FRAME_EXPLODE", color="green", domain="cudf_python") def _explode(self, explode_column: Any, ignore_index: bool): """Helper function for `explode` in `Series` and `Dataframe`, explodes a specified nested column. 
Other columns' corresponding rows are @@ -550,6 +558,9 @@ def _explode(self, explode_column: Any, ignore_index: bool): res.index.names = self._index.names return res + @annotate( + "FRAME_GET_COLUMNS_BY_LABEL", color="green", domain="cudf_python" + ) def _get_columns_by_label(self, labels, downcast=False): """ Returns columns of the Frame specified by `labels` @@ -557,6 +568,9 @@ def _get_columns_by_label(self, labels, downcast=False): """ return self._data.select_by_label(labels) + @annotate( + "FRAME_GET_COLUMNS_BY_INDEX", color="green", domain="cudf_python" + ) def _get_columns_by_index(self, indices): """ Returns columns of the Frame specified by `labels` @@ -580,6 +594,7 @@ def _as_column(self): return self._data[None].copy(deep=False) + @annotate("FRAME_EMPTY_LIKE", color="green", domain="cudf_python") def _empty_like(self, keep_index=True): result = self.__class__._from_data( *libcudf.copying.table_empty_like(self, keep_index) @@ -675,6 +690,7 @@ def get_column_values_na(col): # particular, we need to benchmark how much of the overhead is coming from # (potentially unavoidable) local copies in to_cupy and how much comes from # inefficiencies in the implementation. + @annotate("FRAME_TO_CUPY", color="green", domain="cudf_python") def to_cupy( self, dtype: Union[Dtype, None] = None, @@ -709,6 +725,7 @@ def to_cupy( na_value, ) + @annotate("FRAME_TO_NUMPY", color="green", domain="cudf_python") def to_numpy( self, dtype: Union[Dtype, None] = None, @@ -743,6 +760,7 @@ def to_numpy( (lambda col: col.values_host), np.empty, dtype, na_value ) + @annotate("FRAME_CLIP", color="green", domain="cudf_python") def clip(self, lower=None, upper=None, inplace=False, axis=1): """ Trim values at input threshold(s). @@ -870,6 +888,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): return self._mimic_inplace(output, inplace=inplace) + @annotate("FRAME_WHERE", color="green", domain="cudf_python") def where(self, cond, other=None, inplace=False): """ Replace values where the condition is False. @@ -928,6 +947,7 @@ def where(self, cond, other=None, inplace=False): frame=self, cond=cond, other=other, inplace=inplace ) + @annotate("FRAME_MASK", color="green", domain="cudf_python") def mask(self, cond, other=None, inplace=False): """ Replace values where the condition is True. @@ -989,6 +1009,7 @@ def mask(self, cond, other=None, inplace=False): return self.where(cond=~cond, other=other, inplace=inplace) + @annotate("FRAME_PIPE", color="green", domain="cudf_python") def pipe(self, func, *args, **kwargs): """ Apply ``func(self, *args, **kwargs)``. 
@@ -1119,6 +1140,7 @@ def scatter_by_map( return result + @annotate("FRAME_FILLNA", color="green", domain="cudf_python") def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): @@ -1273,6 +1295,7 @@ def fillna( inplace=inplace, ) + @annotate("FRAME_DROPNA_COLUMNS", color="green", domain="cudf_python") def _drop_na_columns(self, how="any", subset=None, thresh=None): """ Drop columns containing nulls @@ -1300,6 +1323,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): return self[out_cols] + @annotate("FRAME_INTERPOLATE", color="green", domain="cudf_python") def interpolate( self, method="linear", @@ -1369,6 +1393,7 @@ def interpolate( else result._gather(perm_sort.argsort()) ) + @annotate("FRAME_QUANTILES", color="green", domain="cudf_python") def _quantiles( self, q, @@ -1401,6 +1426,7 @@ def _quantiles( result._copy_type_metadata(self) return result + @annotate("FRAME_RANK", color="green", domain="cudf_python") def rank( self, axis=0, @@ -1477,6 +1503,7 @@ def rank( return self._from_data(data, index).astype(np.float64) + @annotate("FRAME_REPEAT", color="green", domain="cudf_python") def repeat(self, repeats, axis=None): """Repeats elements consecutively. @@ -1566,6 +1593,7 @@ def repeat(self, repeats, axis=None): result._copy_type_metadata(self) return result + @annotate("FRAME_SHIFT", color="green", domain="cudf_python") def shift(self, periods=1, freq=None, axis=0, fill_value=None): """Shift values by `periods` positions.""" axis = self._get_axis_from_axis_arg(axis) @@ -1581,7 +1609,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): zip(self._column_names, data_columns), self._index ) - @annotate("SAMPLE", color="orange", domain="cudf_python") + @annotate("FRAME_SAMPLE", color="orange", domain="cudf_python") def sample( self, n=None, @@ -1775,7 +1803,7 @@ def sample( return result @classmethod - @annotate("FROM_ARROW", color="orange", domain="cudf_python") + @annotate("FRAME_FROM_ARROW", color="orange", domain="cudf_python") def from_arrow(cls, data): """Convert from PyArrow Table to Frame @@ -1915,7 +1943,7 @@ def from_arrow(cls, data): return cls._from_data({name: result[name] for name in column_names}) - @annotate("TO_ARROW", color="orange", domain="cudf_python") + @annotate("FRAME_TO_ARROW", color="orange", domain="cudf_python") def to_arrow(self): """ Convert to arrow Table @@ -1951,6 +1979,7 @@ def _positions_from_column_names(self, column_names): if name in set(column_names) ] + @annotate("FRAME_REPLACE", color="green", domain="cudf_python") def replace( self, to_replace=None, @@ -2237,6 +2266,7 @@ def _copy_type_metadata( return self + @annotate("FRAME_ISNULL", color="green", domain="cudf_python") def isnull(self): """ Identify missing values. @@ -2318,6 +2348,7 @@ def isnull(self): # Alias for isnull isna = isnull + @annotate("FRAME_NOTNULL", color="green", domain="cudf_python") def notnull(self): """ Identify non-missing values. @@ -2399,6 +2430,7 @@ def notnull(self): # Alias for notnull notna = notnull + @annotate("FRAME_INTERLEAVE_COLUMNS", color="green", domain="cudf_python") def interleave_columns(self): """ Interleave Series columns of a table into a single column. 
@@ -2438,6 +2470,7 @@ def interleave_columns(self): return result + @annotate("FRAME_TILE", color="green", domain="cudf_python") def tile(self, count): """ Repeats the rows from `self` DataFrame `count` times to form a @@ -2467,6 +2500,7 @@ def tile(self, count): result._copy_type_metadata(self) return result + @annotate("FRAME_SEARCHSORTED", color="green", domain="cudf_python") def searchsorted( self, values, side="left", ascending=True, na_position="last" ): @@ -2551,7 +2585,7 @@ def searchsorted( else: return result - @annotate("ARGSORT", color="yellow", domain="cudf_python") + @annotate("FRAME_ARGSORT", color="yellow", domain="cudf_python") def argsort( self, by=None, @@ -2654,6 +2688,7 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): return libcudf.sort.order_by(to_sort, ascending, na_position) + @annotate("FRAME_SIN", color="green", domain="cudf_python") def sin(self): """ Get Trigonometric sine, element-wise. @@ -2715,6 +2750,7 @@ def sin(self): """ return self._unaryop("sin") + @annotate("FRAME_COS", color="green", domain="cudf_python") def cos(self): """ Get Trigonometric cosine, element-wise. @@ -2776,6 +2812,7 @@ def cos(self): """ return self._unaryop("cos") + @annotate("FRAME_TAN", color="green", domain="cudf_python") def tan(self): """ Get Trigonometric tangent, element-wise. @@ -2837,6 +2874,7 @@ def tan(self): """ return self._unaryop("tan") + @annotate("FRAME_ASIN", color="green", domain="cudf_python") def asin(self): """ Get Trigonometric inverse sine, element-wise. @@ -2887,6 +2925,7 @@ def asin(self): """ return self._unaryop("asin") + @annotate("FRAME_ACOS", color="green", domain="cudf_python") def acos(self): """ Get Trigonometric inverse cosine, element-wise. @@ -2945,6 +2984,7 @@ def acos(self): result = result.mask((result < 0) | (result > np.pi + 1)) return result + @annotate("FRAME_ATAN", color="green", domain="cudf_python") def atan(self): """ Get Trigonometric inverse tangent, element-wise. @@ -3005,6 +3045,7 @@ def atan(self): """ return self._unaryop("atan") + @annotate("FRAME_EXP", color="green", domain="cudf_python") def exp(self): """ Get the exponential of all elements, element-wise. @@ -3067,6 +3108,7 @@ def exp(self): """ return self._unaryop("exp") + @annotate("FRAME_LOG", color="green", domain="cudf_python") def log(self): """ Get the natural logarithm of all elements, element-wise. @@ -3128,6 +3170,7 @@ def log(self): """ return self._unaryop("log") + @annotate("FRAME_SQRT", color="green", domain="cudf_python") def sqrt(self): """ Get the non-negative square-root of all elements, element-wise. @@ -3183,6 +3226,7 @@ def sqrt(self): """ return self._unaryop("sqrt") + @annotate("FRAME_ABS", color="green", domain="cudf_python") def abs(self): """ Return a Series/DataFrame with absolute numeric value of each element. @@ -3209,6 +3253,7 @@ def abs(self): return self._unaryop("abs") # Rounding + @annotate("FRAME_CEIL", color="green", domain="cudf_python") def ceil(self): """ Rounds each value upward to the smallest integral value not less @@ -3245,6 +3290,7 @@ def ceil(self): return self._unaryop("ceil") + @annotate("FRAME_FLOOR", color="green", domain="cudf_python") def floor(self): """Rounds each value downward to the largest integral value not greater than the original. 
@@ -3284,6 +3330,7 @@ def floor(self): return self._unaryop("floor") + @annotate("FRAME_SCALE", color="green", domain="cudf_python") def scale(self): """ Scale values to [0, 1] in float64 @@ -3318,6 +3365,7 @@ def scale(self): scaled._index = self._index.copy(deep=False) return scaled + @annotate("FRAME_INTERNAL_MERGE", color="green", domain="cudf_python") def _merge( self, right, @@ -3361,6 +3409,7 @@ def _merge( suffixes=suffixes, ).perform_merge() + @annotate("FRAME_IS_SORTED", color="green", domain="cudf_python") def _is_sorted(self, ascending=None, null_position=None): """ Returns a boolean indicating whether the data of the Frame are sorted @@ -3391,12 +3440,14 @@ def _is_sorted(self, ascending=None, null_position=None): self, ascending=ascending, null_position=null_position ) + @annotate("FRAME_SPLIT", color="green", domain="cudf_python") def _split(self, splits, keep_index=True): results = libcudf.copying.table_split( self, splits, keep_index=keep_index ) return [self.__class__._from_data(*result) for result in results] + @annotate("FRAME_ENCODE", color="green", domain="cudf_python") def _encode(self): data, index, indices = libcudf.transform.table_encode(self) for name, col in data.items(): @@ -3404,6 +3455,7 @@ def _encode(self): keys = self.__class__._from_data(data, index) return keys, indices + @annotate("FRAME_UNARYOP", color="green", domain="cudf_python") def _unaryop(self, op): data_columns = (col.unary_operator(op) for col in self._columns) return self.__class__._from_data( @@ -3443,6 +3495,7 @@ def _binaryop( raise NotImplementedError @classmethod + @annotate("FRAME_COLWISE_BINOP", color="green", domain="cudf_python") def _colwise_binop( cls, operands: Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], @@ -3601,6 +3654,7 @@ def _colwise_binop( return output + @annotate("FRAME_DOT", color="green", domain="cudf_python") def dot(self, other, reflect=False): """ Get dot product of frame and other, (binary operator `dot`). @@ -3773,6 +3827,7 @@ def _reduce(self, *args, **kwargs): f"Reductions are not supported for objects of type {type(self)}." ) + @annotate("FRAME_MIN", color="green", domain="cudf_python") def min( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): @@ -3818,6 +3873,7 @@ def min( **kwargs, ) + @annotate("FRAME_MAX", color="green", domain="cudf_python") def max( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): @@ -3863,6 +3919,7 @@ def max( **kwargs, ) + @annotate("FRAME_SUM", color="green", domain="cudf_python") def sum( self, axis=None, @@ -3921,6 +3978,7 @@ def sum( **kwargs, ) + @annotate("FRAME_PRODUCT", color="green", domain="cudf_python") def product( self, axis=None, @@ -3985,6 +4043,7 @@ def product( # Alias for pandas compatibility. 
prod = product + @annotate("FRAME_MEAN", color="green", domain="cudf_python") def mean( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4029,6 +4088,7 @@ def mean( **kwargs, ) + @annotate("FRAME_STD", color="green", domain="cudf_python") def std( self, axis=None, @@ -4085,6 +4145,7 @@ def std( **kwargs, ) + @annotate("FRAME_VAR", color="green", domain="cudf_python") def var( self, axis=None, @@ -4140,6 +4201,7 @@ def var( **kwargs, ) + @annotate("FRAME_KURTOSIS", color="green", domain="cudf_python") def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4208,6 +4270,7 @@ def kurt( **kwargs, ) + @annotate("FRAME_SKEW", color="green", domain="cudf_python") def skew( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4265,6 +4328,7 @@ def skew( **kwargs, ) + @annotate("FRAME_ALL", color="green", domain="cudf_python") def all(self, axis=0, skipna=True, level=None, **kwargs): """ Return whether all elements are True in DataFrame. @@ -4300,6 +4364,7 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): "all", axis=axis, skipna=skipna, level=level, **kwargs, ) + @annotate("FRAME_ANY", color="green", domain="cudf_python") def any(self, axis=0, skipna=True, level=None, **kwargs): """ Return whether any elements is True in DataFrame. @@ -4335,6 +4400,7 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): "any", axis=axis, skipna=skipna, level=level, **kwargs, ) + @annotate("FRAME_SUM_OF_SQUARES", color="green", domain="cudf_python") def sum_of_squares(self, dtype=None): """Return the sum of squares of values. @@ -4358,6 +4424,7 @@ def sum_of_squares(self, dtype=None): """ return self._reduce("sum_of_squares", dtype=dtype) + @annotate("FRAME_MEDIAN", color="green", domain="cudf_python") def median( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4403,6 +4470,7 @@ def median( ) # Scans + @annotate("FRAME_SCAN", color="green", domain="cudf_python") def _scan(self, op, axis=None, skipna=True, cast_to_int=False): skipna = True if skipna is None else skipna @@ -4437,6 +4505,7 @@ def _scan(self, op, axis=None, skipna=True, cast_to_int=False): # for Index._from_data and simplify. return self._from_data(results, index=self._index) + @annotate("FRAME_CUMMIN", color="green", domain="cudf_python") def cummin(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative minimum of the Series or DataFrame. @@ -4480,6 +4549,7 @@ def cummin(self, axis=None, skipna=True, *args, **kwargs): """ return self._scan("min", axis=axis, skipna=skipna, *args, **kwargs) + @annotate("FRAME_CUMMAX", color="green", domain="cudf_python") def cummax(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative maximum of the Series or DataFrame. @@ -4523,6 +4593,7 @@ def cummax(self, axis=None, skipna=True, *args, **kwargs): """ return self._scan("max", axis=axis, skipna=skipna, *args, **kwargs) + @annotate("FRAME_CUMSUM", color="green", domain="cudf_python") def cumsum(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative sum of the Series or DataFrame. @@ -4569,6 +4640,7 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): "sum", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs ) + @annotate("FRAME_CUMPROD", color="green", domain="cudf_python") def cumprod(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative product of the Series or DataFrame. 
@@ -4614,6 +4686,7 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): "prod", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs ) + @annotate("FRAME_TO_JSON", color="green", domain="cudf_python") @ioutils.doc_to_json() def to_json(self, path_or_buf=None, *args, **kwargs): """{docstring}""" @@ -4622,18 +4695,21 @@ def to_json(self, path_or_buf=None, *args, **kwargs): self, path_or_buf=path_or_buf, *args, **kwargs ) + @annotate("FRAME_TO_HDF", color="green", domain="cudf_python") @ioutils.doc_to_hdf() def to_hdf(self, path_or_buf, key, *args, **kwargs): """{docstring}""" cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) + @annotate("FRAME_TO_DLPACK", color="green", domain="cudf_python") @ioutils.doc_to_dlpack() def to_dlpack(self): """{docstring}""" return cudf.io.dlpack.to_dlpack(self) + @annotate("FRAME_TO_STRING", color="green", domain="cudf_python") def to_string(self): """ Convert to string @@ -4659,12 +4735,15 @@ def to_string(self): def __str__(self): return self.to_string() + @annotate("FRAME_DEEP_COPY", color="green", domain="cudf_python") def __deepcopy__(self, memo): return self.copy(deep=True) + @annotate("FRAME_COPY", color="green", domain="cudf_python") def __copy__(self): return self.copy(deep=False) + @annotate("FRAME_HEAD", color="green", domain="cudf_python") def head(self, n=5): """ Return the first `n` rows. @@ -4748,6 +4827,7 @@ def head(self, n=5): """ return self.iloc[:n] + @annotate("FRAME_TAIL", color="green", domain="cudf_python") def tail(self, n=5): """ Returns the last n rows as a new DataFrame or Series @@ -4779,6 +4859,7 @@ def tail(self, n=5): return self.iloc[-n:] + @annotate("FRAME_ROLLING", color="green", domain="cudf_python") @copy_docstring(Rolling) def rolling( self, window, min_periods=None, center=False, axis=0, win_type=None @@ -4792,6 +4873,7 @@ def rolling( win_type=win_type, ) + @annotate("FRAME_NANS_TO_NULLS", color="green", domain="cudf_python") def nans_to_nulls(self): """ Convert nans (if any) to nulls @@ -4846,6 +4928,7 @@ def nans_to_nulls(self): self._index, ) + @annotate("FRAME_INVERT", color="green", domain="cudf_python") def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data( @@ -4856,6 +4939,7 @@ def __invert__(self): self._index, ) + @annotate("FRAME_ADD", color="green", domain="cudf_python") def add(self, other, axis, level=None, fill_value=None): """ Get Addition of dataframe or series and other, element-wise (binary @@ -4926,6 +5010,7 @@ def add(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "add", fill_value) + @annotate("FRAME_RADD", color="green", domain="cudf_python") def radd(self, other, axis, level=None, fill_value=None): """ Get Addition of dataframe or series and other, element-wise (binary @@ -5005,6 +5090,7 @@ def radd(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "add", fill_value, reflect=True) + @annotate("FRAME_SUBTRACT", color="green", domain="cudf_python") def subtract(self, other, axis, level=None, fill_value=None): """ Get Subtraction of dataframe or series and other, element-wise (binary @@ -5087,6 +5173,7 @@ def subtract(self, other, axis, level=None, fill_value=None): sub = subtract + @annotate("FRAME_RSUB", color="green", domain="cudf_python") def rsub(self, other, axis, level=None, fill_value=None): """ Get Subtraction of dataframe or series and other, element-wise (binary @@ -5170,6 +5257,7 @@ def rsub(self, other, axis, level=None, fill_value=None): return 
self._binaryop(other, "sub", fill_value, reflect=True) + @annotate("FRAME_MULTIPLY", color="green", domain="cudf_python") def multiply(self, other, axis, level=None, fill_value=None): """ Get Multiplication of dataframe or series and other, element-wise @@ -5254,6 +5342,7 @@ def multiply(self, other, axis, level=None, fill_value=None): mul = multiply + @annotate("FRAME_RMUL", color="green", domain="cudf_python") def rmul(self, other, axis, level=None, fill_value=None): """ Get Multiplication of dataframe or series and other, element-wise @@ -5338,6 +5427,7 @@ def rmul(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "mul", fill_value, reflect=True) + @annotate("FRAME_MOD", color="green", domain="cudf_python") def mod(self, other, axis, level=None, fill_value=None): """ Get Modulo division of dataframe or series and other, element-wise @@ -5408,6 +5498,7 @@ def mod(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "mod", fill_value) + @annotate("FRAME_RMOD", color="green", domain="cudf_python") def rmod(self, other, axis, level=None, fill_value=None): """ Get Modulo division of dataframe or series and other, element-wise @@ -5490,6 +5581,7 @@ def rmod(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "mod", fill_value, reflect=True) + @annotate("FRAME_POW", color="green", domain="cudf_python") def pow(self, other, axis, level=None, fill_value=None): """ Get Exponential power of dataframe series and other, element-wise @@ -5569,6 +5661,7 @@ def pow(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "pow", fill_value) + @annotate("FRAME_RPOW", color="green", domain="cudf_python") def rpow(self, other, axis, level=None, fill_value=None): """ Get Exponential power of dataframe or series and other, element-wise @@ -5648,6 +5741,7 @@ def rpow(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "pow", fill_value, reflect=True) + @annotate("FRAME_FLOORDIV", color="green", domain="cudf_python") def floordiv(self, other, axis, level=None, fill_value=None): """ Get Integer division of dataframe or series and other, element-wise @@ -5727,6 +5821,7 @@ def floordiv(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "floordiv", fill_value) + @annotate("FRAME_RFLOORDIV", color="green", domain="cudf_python") def rfloordiv(self, other, axis, level=None, fill_value=None): """ Get Integer division of dataframe or series and other, element-wise @@ -5823,6 +5918,7 @@ def rfloordiv(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "floordiv", fill_value, reflect=True) + @annotate("FRAME_TRUEDIV", color="green", domain="cudf_python") def truediv(self, other, axis, level=None, fill_value=None): """ Get Floating division of dataframe or series and other, element-wise @@ -5911,6 +6007,7 @@ def truediv(self, other, axis, level=None, fill_value=None): div = truediv divide = truediv + @annotate("FRAME_RTRUEDIV", color="green", domain="cudf_python") def rtruediv(self, other, axis, level=None, fill_value=None): """ Get Floating division of dataframe or series and other, element-wise @@ -6003,6 +6100,7 @@ def rtruediv(self, other, axis, level=None, fill_value=None): # Alias for rtruediv rdiv = rtruediv + @annotate("FRAME_EQ", color="green", domain="cudf_python") def eq(self, other, axis="columns", level=None, fill_value=None): """Equal to, element-wise (binary operator eq). 
@@ -6078,6 +6176,7 @@ def eq(self, other, axis="columns", level=None, fill_value=None): other=other, fn="eq", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_NE", color="green", domain="cudf_python") def ne(self, other, axis="columns", level=None, fill_value=None): """Not equal to, element-wise (binary operator ne). @@ -6153,6 +6252,7 @@ def ne(self, other, axis="columns", level=None, fill_value=None): other=other, fn="ne", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_LT", color="green", domain="cudf_python") def lt(self, other, axis="columns", level=None, fill_value=None): """Less than, element-wise (binary operator lt). @@ -6228,6 +6328,7 @@ def lt(self, other, axis="columns", level=None, fill_value=None): other=other, fn="lt", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_LE", color="green", domain="cudf_python") def le(self, other, axis="columns", level=None, fill_value=None): """Less than or equal, element-wise (binary operator le). @@ -6303,6 +6404,7 @@ def le(self, other, axis="columns", level=None, fill_value=None): other=other, fn="le", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_GT", color="green", domain="cudf_python") def gt(self, other, axis="columns", level=None, fill_value=None): """Greater than, element-wise (binary operator gt). @@ -6378,6 +6480,7 @@ def gt(self, other, axis="columns", level=None, fill_value=None): other=other, fn="gt", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_GE", color="green", domain="cudf_python") def ge(self, other, axis="columns", level=None, fill_value=None): """Greater than or equal, element-wise (binary operator ge). @@ -6476,6 +6579,11 @@ def nunique(self, method: builtins.str = "sort", dropna: bool = True): } +@annotate( + "FRAME_GET_REPLACEMENT_VALUES_FOR_COLUMNS", + color="green", + domain="cudf_python", +) def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] ) -> Tuple[Dict[Any, bool], Dict[Any, Any], Dict[Any, Any]]: @@ -6640,6 +6748,7 @@ def _is_series(obj): return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None +@annotate("FRAME_DROP_ROWS_BY_LABELS", color="green", domain="cudf_python") def _drop_rows_by_labels( obj: DataFrameOrSeries, labels: Union[ColumnLike, abc.Iterable, str], diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index a919b00692d..948428de4f0 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -7,6 +7,7 @@ from uuid import uuid4 import numpy as np +from nvtx import annotate from pyarrow import dataset as ds, parquet as pq import cudf @@ -16,6 +17,7 @@ from cudf.utils import ioutils +@annotate("_WRITE_PARQUET", color="green", domain="cudf_python") def _write_parquet( df, paths, @@ -73,6 +75,7 @@ def _write_parquet( # Logic chosen to match: https://arrow.apache.org/ # docs/_modules/pyarrow/parquet.html#write_to_dataset +@annotate("WRITE_TO_DATASET", color="green", domain="cudf_python") def write_to_dataset( df, root_path, @@ -161,6 +164,7 @@ def write_to_dataset( @ioutils.doc_read_parquet_metadata() +@annotate("READ_PARQUET_METADATA", color="green", domain="cudf_python") def read_parquet_metadata(path): """{docstring}""" @@ -173,6 +177,7 @@ def read_parquet_metadata(path): return num_rows, num_row_groups, col_names +@annotate("_PROCESS_DATASET", color="green", domain="cudf_python") def _process_dataset( paths, fs, filters=None, row_groups=None, categorical_partitions=True, ): @@ -308,6 +313,7 @@ def _process_dataset( 
@ioutils.doc_read_parquet() +@annotate("READ_PARQUET", color="green", domain="cudf_python") def read_parquet( filepath_or_buffer, engine="cudf", @@ -435,6 +441,7 @@ def read_parquet( ) +@annotate("_PARQUET_TO_FRAME", color="green", domain="cudf_python") def _parquet_to_frame( paths_or_buffers, *args, @@ -502,6 +509,7 @@ def _parquet_to_frame( ) +@annotate("_READ_PARQUET", color="green", domain="cudf_python") def _read_parquet( filepaths_or_buffers, engine, @@ -535,6 +543,7 @@ def _read_parquet( @ioutils.doc_to_parquet() +@annotate("TO_PARQUET", color="green", domain="cudf_python") def to_parquet( df, path, @@ -646,6 +655,7 @@ def _generate_filename(): return uuid4().hex + ".parquet" +@annotate("_GET_PARTITIONED", color="green", domain="cudf_python") def _get_partitioned( df, root_path, @@ -689,6 +699,7 @@ def _get_partitioned( class ParquetDatasetWriter: + @annotate("ParquetDatasetWriter_INIT", color="green", domain="cudf_python") def __init__( self, path, @@ -765,6 +776,9 @@ def __init__( self.path_cw_map: Dict[str, int] = {} self.filename = None + @annotate( "ParquetDatasetWriter_WRITE_TABLE", color="green", domain="cudf_python" ) def write_table(self, df): """ Write a dataframe to the file/dataset @@ -821,6 +835,9 @@ def write_table(self, df): self.path_cw_map.update({k: new_cw_idx for k in new_paths}) self._chunked_writers[-1][0].write_table(grouped_df, part_info) + @annotate( "ParquetDatasetWriter_CLOSE", color="green", domain="cudf_python" ) def close(self, return_metadata=False): """ Close all open files and optionally return footer metadata as a binary diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 89b5301ee83..1b1f3e29ab2 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
from collections.abc import Iterator @@ -6,6 +6,7 @@ import numpy as np import pandas as pd import pyarrow as pa +from nvtx import annotate from dask.dataframe.core import get_parallel_type, meta_nonempty from dask.dataframe.dispatch import ( @@ -39,6 +40,7 @@ @meta_nonempty.register(cudf.BaseIndex) +@annotate("_nonempty_index", color="green", domain="dask_cudf_python") def _nonempty_index(idx): if isinstance(idx, cudf.core.index.RangeIndex): return cudf.core.index.RangeIndex(2, name=idx.name) @@ -73,6 +75,7 @@ def _nonempty_index(idx): raise TypeError(f"Don't know how to handle index of type {type(idx)}") +@annotate("_get_non_empty_data", color="green", domain="dask_cudf_python") def _get_non_empty_data(s): if isinstance(s._column, cudf.core.column.CategoricalColumn): categories = ( @@ -100,6 +103,7 @@ def _get_non_empty_data(s): @meta_nonempty.register(cudf.Series) +@annotate("_nonempty_series", color="green", domain="dask_cudf_python") def _nonempty_series(s, idx=None): if idx is None: idx = _nonempty_index(s.index) @@ -109,6 +113,7 @@ def _nonempty_series(s, idx=None): @meta_nonempty.register(cudf.DataFrame) +@annotate("meta_nonempty_cudf", color="green", domain="dask_cudf_python") def meta_nonempty_cudf(x): idx = meta_nonempty(x.index) columns_with_dtype = dict() @@ -124,15 +129,18 @@ def meta_nonempty_cudf(x): @make_meta_dispatch.register((cudf.Series, cudf.DataFrame)) +@annotate("make_meta_cudf", color="green", domain="dask_cudf_python") def make_meta_cudf(x, index=None): return x.head(0) @make_meta_dispatch.register(cudf.BaseIndex) +@annotate("make_meta_cudf_index", color="green", domain="dask_cudf_python") def make_meta_cudf_index(x, index=None): return x[:0] +@annotate("_empty_series", color="green", domain="dask_cudf_python") def _empty_series(name, dtype, index=None): if isinstance(dtype, str) and dtype == "category": return cudf.Series( @@ -142,6 +150,7 @@ def _empty_series(name, dtype, index=None): @make_meta_obj.register(object) +@annotate("make_meta_object_cudf", color="green", domain="dask_cudf_python") def make_meta_object_cudf(x, index=None): """Create an empty cudf object containing the desired metadata. 
@@ -212,6 +221,7 @@ def make_meta_object_cudf(x, index=None): @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.BaseIndex)) +@annotate("concat_cudf", color="green", domain="dask_cudf_python") def concat_cudf( dfs, axis=0, @@ -236,11 +246,13 @@ def concat_cudf( @categorical_dtype_dispatch.register( (cudf.DataFrame, cudf.Series, cudf.BaseIndex) ) +@annotate("categorical_dtype_cudf", color="green", domain="dask_cudf_python") def categorical_dtype_cudf(categories=None, ordered=None): return cudf.CategoricalDtype(categories=categories, ordered=ordered) @tolist_dispatch.register((cudf.Series, cudf.BaseIndex)) +@annotate("tolist_cudf", color="green", domain="dask_cudf_python") def tolist_cudf(obj): return obj.to_arrow().to_pylist() @@ -248,6 +260,9 @@ def tolist_cudf(obj): @is_categorical_dtype_dispatch.register( (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series) ) +@annotate( + "is_categorical_dtype_cudf", color="green", domain="dask_cudf_python" +) def is_categorical_dtype_cudf(obj): return cudf.api.types.is_categorical_dtype(obj) @@ -261,6 +276,7 @@ def is_categorical_dtype_cudf(obj): ) @percentile_lookup.register((cudf.Series, cp.ndarray, cudf.BaseIndex)) + @annotate("percentile_cudf", color="green", domain="dask_cudf_python") def percentile_cudf(a, q, interpolation="linear"): # Cudf dispatch to the equivalent of `np.percentile`: # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html @@ -305,6 +321,7 @@ def percentile_cudf(a, q, interpolation="linear"): @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex)) +@annotate("union_categoricals_cudf", color="green", domain="dask_cudf_python") def union_categoricals_cudf( to_union, sort_categories=False, ignore_order=False ): @@ -313,11 +330,13 @@ def union_categoricals_cudf( ) +@annotate("safe_hash", color="green", domain="dask_cudf_python") def safe_hash(frame): return cudf.Series(frame.hash_values(), index=frame.index) @hash_object_dispatch.register((cudf.DataFrame, cudf.Series)) +@annotate("hash_object_cudf", color="green", domain="dask_cudf_python") def hash_object_cudf(frame, index=True): if index: return safe_hash(frame.reset_index()) @@ -325,6 +344,7 @@ def hash_object_cudf(frame, index=True): @hash_object_dispatch.register(cudf.BaseIndex) +@annotate("hash_object_cudf_index", color="green", domain="dask_cudf_python") def hash_object_cudf_index(ind, index=None): if isinstance(ind, cudf.MultiIndex): @@ -335,6 +355,7 @@ def hash_object_cudf_index(ind, index=None): @group_split_dispatch.register((cudf.Series, cudf.DataFrame)) +@annotate("group_split_cudf", color="green", domain="dask_cudf_python") def group_split_cudf(df, c, k, ignore_index=False): return dict( zip( @@ -349,10 +370,12 @@ def group_split_cudf(df, c, k, ignore_index=False): @sizeof_dispatch.register(cudf.DataFrame) +@annotate("sizeof_cudf_dataframe", color="green", domain="dask_cudf_python") def sizeof_cudf_dataframe(df): return int(df.memory_usage().sum()) @sizeof_dispatch.register((cudf.Series, cudf.BaseIndex)) +@annotate("sizeof_cudf_series_index", color="green", domain="dask_cudf_python") def sizeof_cudf_series_index(obj): return obj.memory_usage() diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 729db6c232d..d8802f33941 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
import math import warnings @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +from nvtx import annotate from tlz import partition_all import dask @@ -57,6 +58,7 @@ def __dask_postcompute__(self): def __dask_postpersist__(self): return type(self), (self._name, self._meta, self.divisions) + @annotate("_FRAME_INIT", color="green", domain="dask_cudf_python") def __init__(self, dsk, name, meta, divisions): if not isinstance(dsk, HighLevelGraph): dsk = HighLevelGraph.from_collections(name, dsk, dependencies=[]) @@ -82,6 +84,9 @@ def __repr__(self): s = "" return s % (type(self).__name__, len(self.dask), self.npartitions) + @annotate( + "_FRAME_to_dask_dataframe", color="green", domain="dask_cudf_python" + ) def to_dask_dataframe(self, **kwargs): """Create a dask.dataframe object from a dask_cudf object""" nullable_pd_dtype = kwargs.get("nullable_pd_dtype", False) @@ -99,6 +104,9 @@ def to_dask_dataframe(self, **kwargs): class DataFrame(_Frame, dd.core.DataFrame): _partition_type = cudf.DataFrame + @annotate( + "DATAFRAME_assign_column", color="green", domain="dask_cudf_python" + ) def _assign_column(self, k, v): def assigner(df, k, v): out = df.copy() @@ -108,6 +116,7 @@ def assigner(df, k, v): meta = assigner(self._meta, k, dask_make_meta(v)) return self.map_partitions(assigner, k, v, meta=meta) + @annotate("DATAFRAME_apply_rows", color="green", domain="dask_cudf_python") def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): import uuid @@ -127,6 +136,7 @@ def do_apply_rows(df, func, incols, outcols, kwargs): do_apply_rows, func, incols, outcols, kwargs, meta=meta ) + @annotate("DATAFRAME_merge", color="green", domain="dask_cudf_python") def merge(self, other, **kwargs): if kwargs.pop("shuffle", "tasks") != "tasks": raise ValueError( @@ -138,6 +148,7 @@ def merge(self, other, **kwargs): on = list(on) return super().merge(other, on=on, shuffle="tasks", **kwargs) + @annotate("DATAFRAME_join", color="green", domain="dask_cudf_python") def join(self, other, **kwargs): if kwargs.pop("shuffle", "tasks") != "tasks": raise ValueError( @@ -155,6 +166,7 @@ def join(self, other, **kwargs): on = list(on) return super().join(other, how=how, on=on, shuffle="tasks", **kwargs) + @annotate("DATAFRAME_set_index", color="green", domain="dask_cudf_python") def set_index(self, other, sorted=False, divisions=None, **kwargs): if kwargs.pop("shuffle", "tasks") != "tasks": raise ValueError( @@ -226,6 +238,9 @@ def set_index(self, other, sorted=False, divisions=None, **kwargs): **kwargs, ) + @annotate( + "DATAFRAME_sort_values", color="green", domain="dask_cudf_python" + ) def sort_values( self, by, @@ -261,12 +276,14 @@ def sort_values( return df.reset_index(drop=True) return df + @annotate("DATAFRAME_to_parquet", color="green", domain="dask_cudf_python") def to_parquet(self, path, *args, **kwargs): """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" from dask_cudf.io import to_parquet return to_parquet(self, path, *args, **kwargs) + @annotate("DATAFRAME_to_orc", color="green", domain="dask_cudf_python") def to_orc(self, path, **kwargs): """Calls dask_cudf.io.to_orc""" from dask_cudf.io import to_orc @@ -274,6 +291,7 @@ def to_orc(self, path, **kwargs): return to_orc(self, path, **kwargs) @derived_from(pd.DataFrame) + @annotate("DATAFRAME_var", color="green", domain="dask_cudf_python") def var( self, axis=None, @@ -302,6 +320,9 @@ def var( else: return _parallel_var(self, meta, skipna, split_every, out) + @annotate( + "DATAFRAME_repartition", color="green", 
domain="dask_cudf_python" + ) def repartition(self, *args, **kwargs): """Wraps dask.dataframe DataFrame.repartition method. Uses DataFrame.shuffle if `columns=` is specified. @@ -324,6 +345,7 @@ def repartition(self, *args, **kwargs): ) return super().repartition(*args, **kwargs) + @annotate("DATAFRAME_shuffle", color="green", domain="dask_cudf_python") def shuffle(self, *args, **kwargs): """Wraps dask.dataframe DataFrame.shuffle method""" shuffle_arg = kwargs.pop("shuffle", None) @@ -331,18 +353,21 @@ def shuffle(self, *args, **kwargs): raise ValueError("dask_cudf does not support disk-based shuffle.") return super().shuffle(*args, shuffle="tasks", **kwargs) + @annotate("DATAFRAME_groupby", color="green", domain="dask_cudf_python") def groupby(self, by=None, **kwargs): from .groupby import CudfDataFrameGroupBy return CudfDataFrameGroupBy(self, by=by, **kwargs) +@annotate("DATAFRAME_sum_of_squares", color="green", domain="dask_cudf_python") def sum_of_squares(x): x = x.astype("f8")._column outcol = libcudf.reduce.reduce("sum_of_squares", x) return cudf.Series(outcol) +@annotate("DATAFRAME_var_aggregate", color="green", domain="dask_cudf_python") def var_aggregate(x2, x, n, ddof): try: with warnings.catch_warnings(record=True): @@ -355,10 +380,12 @@ def var_aggregate(x2, x, n, ddof): return np.float64(np.nan) +@annotate("DATAFRAME_nlargest_agg", color="green", domain="dask_cudf_python") def nlargest_agg(x, **kwargs): return cudf.concat(x).nlargest(**kwargs) +@annotate("DATAFRAME_nsmallest_agg", color="green", domain="dask_cudf_python") def nsmallest_agg(x, **kwargs): return cudf.concat(x).nsmallest(**kwargs) @@ -366,6 +393,7 @@ def nsmallest_agg(x, **kwargs): class Series(_Frame, dd.core.Series): _partition_type = cudf.Series + @annotate("Series_count", color="green", domain="dask_cudf_python") def count(self, split_every=False): return reduction( [self], @@ -375,12 +403,14 @@ def count(self, split_every=False): meta="i8", ) + @annotate("Series_mean", color="green", domain="dask_cudf_python") def mean(self, split_every=False): sum = self.sum(split_every=split_every) n = self.count(split_every=split_every) return sum / n @derived_from(pd.DataFrame) + @annotate("Series_var", color="green", domain="dask_cudf_python") def var( self, axis=None, @@ -409,16 +439,19 @@ def var( else: return _parallel_var(self, meta, skipna, split_every, out) + @annotate("Series_groupby", color="green", domain="dask_cudf_python") def groupby(self, *args, **kwargs): from .groupby import CudfSeriesGroupBy return CudfSeriesGroupBy(self, *args, **kwargs) @property + @annotate("Series_list", color="green", domain="dask_cudf_python") def list(self): return ListMethods(self) @property + @annotate("Series_struct", color="green", domain="dask_cudf_python") def struct(self): return StructMethods(self) @@ -427,6 +460,7 @@ class Index(Series, dd.core.Index): _partition_type = cudf.Index # type: ignore +@annotate("_naive_var", color="green", domain="dask_cudf_python") def _naive_var(ddf, meta, skipna, ddof, split_every, out): num = ddf._get_numeric_data() x = 1.0 * num.sum(skipna=skipna, split_every=split_every) @@ -441,6 +475,7 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out): return handle_out(out, result) +@annotate("_parallel_var", color="green", domain="dask_cudf_python") def _parallel_var(ddf, meta, skipna, split_every, out): def _local_var(x, skipna): if skipna: @@ -507,6 +542,7 @@ def _finalize_var(vals): return handle_out(out, result) +@annotate("_extract_meta", color="green", domain="dask_cudf_python") def 
     """
     Extract internal cache data (``_meta``) from dask_cudf objects
@@ -522,6 +558,7 @@ def _extract_meta(x):
     return x
 
 
+@annotate("_emulate", color="green", domain="dask_cudf_python")
 def _emulate(func, *args, **kwargs):
     """
     Apply a function using args / kwargs. If arguments contain dd.DataFrame /
@@ -531,6 +568,7 @@ def _emulate(func, *args, **kwargs):
     return func(*_extract_meta(args), **_extract_meta(kwargs))
 
 
+@annotate("align_partitions", color="green", domain="dask_cudf_python")
 def align_partitions(args):
     """Align partitions between dask_cudf objects.
 
@@ -546,6 +584,7 @@ def align_partitions(args):
     return args
 
 
+@annotate("reduction", color="green", domain="dask_cudf_python")
 def reduction(
     args,
     chunk=None,
@@ -684,6 +723,7 @@ def reduction(
     return dd.core.new_dd_object(graph, b, meta, (None, None))
 
 
+@annotate("from_cudf", color="green", domain="dask_cudf_python")
 def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None):
     if isinstance(getattr(data, "index", None), cudf.MultiIndex):
         raise NotImplementedError(
@@ -705,6 +745,7 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None):
     )
 
 
+@annotate("from_dask_dataframe", color="green", domain="dask_cudf_python")
 def from_dask_dataframe(df):
     return df.map_partitions(cudf.from_pandas)
 
diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py
index 1bc270a5b9f..658e63ea923 100644
--- a/python/dask_cudf/dask_cudf/groupby.py
+++ b/python/dask_cudf/dask_cudf/groupby.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 import pandas as pd
+from nvtx import annotate
 
 from dask.base import tokenize
 from dask.dataframe.core import (
@@ -35,11 +36,19 @@
 
 
 class CudfDataFrameGroupBy(DataFrameGroupBy):
+    @annotate(
+        "CudfDataFrameGroupBy_INIT", color="green", domain="dask_cudf_python"
+    )
     def __init__(self, *args, **kwargs):
         self.sep = kwargs.pop("sep", "___")
         self.as_index = kwargs.pop("as_index", True)
         super().__init__(*args, **kwargs)
 
+    @annotate(
+        "CudfDataFrameGroupBy_GETITEM",
+        color="green",
+        domain="dask_cudf_python",
+    )
     def __getitem__(self, key):
         if isinstance(key, list):
             g = CudfDataFrameGroupBy(
@@ -53,6 +62,9 @@ def __getitem__(self, key):
             g._meta = g._meta[key]
         return g
 
+    @annotate(
+        "CudfDataFrameGroupBy_MEAN", color="green", domain="dask_cudf_python"
+    )
     def mean(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -66,6 +78,11 @@ def mean(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )
 
+    @annotate(
+        "CudfDataFrameGroupBy_COLLECT",
+        color="green",
+        domain="dask_cudf_python",
+    )
     def collect(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -79,6 +96,11 @@ def collect(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )
 
+    @annotate(
+        "CudfDataFrameGroupBy_AGGREGATE",
+        color="green",
+        domain="dask_cudf_python",
+    )
     def aggregate(self, arg, split_every=None, split_out=1):
         if arg == "size":
             return self.size()
@@ -118,11 +140,17 @@
 
 
 class CudfSeriesGroupBy(SeriesGroupBy):
+    @annotate(
+        "CudfSeriesGroupBy_INIT", color="green", domain="dask_cudf_python"
+    )
     def __init__(self, *args, **kwargs):
         self.sep = kwargs.pop("sep", "___")
         self.as_index = kwargs.pop("as_index", True)
         super().__init__(*args, **kwargs)
 
+    @annotate(
+        "CudfSeriesGroupBy_MEAN", color="green", domain="dask_cudf_python"
+    )
     def mean(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -136,6 +164,9 @@ def mean(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )[self._slice]
 
+    @annotate(
+        "CudfSeriesGroupBy_STD", color="green", domain="dask_cudf_python"
+    )
     def std(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -149,6 +180,9 @@ def std(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )[self._slice]
 
+    @annotate(
+        "CudfSeriesGroupBy_VAR", color="green", domain="dask_cudf_python"
+    )
     def var(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -162,6 +196,9 @@ def var(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )[self._slice]
 
+    @annotate(
+        "CudfSeriesGroupBy_COLLECT", color="green", domain="dask_cudf_python"
+    )
    def collect(self, split_every=None, split_out=1):
         return groupby_agg(
             self.obj,
@@ -175,6 +212,9 @@ def collect(self, split_every=None, split_out=1):
             as_index=self.as_index,
         )[self._slice]
 
+    @annotate(
+        "CudfSeriesGroupBy_AGGREGATE", color="green", domain="dask_cudf_python"
+    )
     def aggregate(self, arg, split_every=None, split_out=1):
         if arg == "size":
             return self.size()
@@ -205,6 +245,7 @@ def aggregate(self, arg, split_every=None, split_out=1):
         )
 
 
+@annotate("groupby_agg", color="green", domain="dask_cudf_python")
 def groupby_agg(
     ddf,
     gb_cols,
@@ -371,6 +412,7 @@ def groupby_agg(
     return new_dd_object(graph, gb_agg_name, _meta, divisions)
 
 
+@annotate("_redirect_aggs", color="green", domain="dask_cudf_python")
 def _redirect_aggs(arg):
     """Redirect aggregations to their corresponding name in cuDF"""
     redirects = {
@@ -397,6 +439,7 @@ def _redirect_aggs(arg):
     return redirects.get(arg, arg)
 
 
+@annotate("_is_supported", color="green", domain="dask_cudf_python")
 def _is_supported(arg, supported: set):
     """Check that aggregations in `arg` are a subset of `supported`"""
     if isinstance(arg, (list, dict)):
@@ -422,6 +465,7 @@ def _make_name(*args, sep="_"):
     return sep.join(_args)
 
 
+@annotate("_groupby_partition_agg", color="green", domain="dask_cudf_python")
 def _groupby_partition_agg(
     df, gb_cols, aggs, columns, split_out, dropna, sort, sep
 ):
@@ -479,6 +523,7 @@ def _groupby_partition_agg(
     return output
 
 
+@annotate("_tree_node_agg", color="green", domain="dask_cudf_python")
 def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep):
     """Node in groupby-aggregation reduction tree.
 
@@ -513,6 +558,7 @@ def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep):
     return gb
 
 
+@annotate("_var_agg", color="green", domain="dask_cudf_python")
 def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1):
     """Calculate variance (given count, sum, and sum-squared columns)."""
 
@@ -534,6 +580,7 @@ def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1):
     return var
 
 
+@annotate("_finalize_gb_agg", color="green", domain="dask_cudf_python")
 def _finalize_gb_agg(
     gb,
     gb_cols,
diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py
index af40d9ca41b..ada738c5a9b 100644
--- a/python/dask_cudf/dask_cudf/sorting.py
+++ b/python/dask_cudf/dask_cudf/sorting.py
@@ -1,9 +1,11 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+
 from collections.abc import Iterator
 
 import cupy
 import numpy as np
 import tlz as toolz
+from nvtx import annotate
 
 from dask.base import tokenize
 from dask.dataframe import methods
@@ -16,12 +18,14 @@
 from cudf.api.types import is_categorical_dtype
 
 
+@annotate("set_index_post", color="green", domain="dask_cudf_python")
 def set_index_post(df, index_name, drop, column_dtype):
     df2 = df.set_index(index_name, drop=drop)
     df2.columns = df2.columns.astype(column_dtype)
     return df2
 
 
+@annotate("_set_partitions_pre", color="green", domain="dask_cudf_python")
 def _set_partitions_pre(s, divisions, ascending=True, na_position="last"):
     if ascending:
         partitions = divisions.searchsorted(s, side="right") - 1
@@ -38,6 +42,7 @@ def _set_partitions_pre(s, divisions, ascending=True, na_position="last"):
     return partitions
 
 
+@annotate("_quantile", color="green", domain="dask_cudf_python")
 def _quantile(a, q):
     n = len(a)
     if not len(a):
@@ -45,6 +50,7 @@ def _quantile(a, q):
     return (a.quantiles(q=q.tolist(), interpolation="nearest"), n)
 
 
+@annotate("merge_quantiles", color="green", domain="dask_cudf_python")
 def merge_quantiles(finalq, qs, vals):
     """Combine several quantile calculations of different data.
     [NOTE: Same logic as dask.array merge_percentiles]
@@ -107,6 +113,7 @@ def _append_counts(val, count):
     return rv.reset_index(drop=True)
 
 
+@annotate("_approximate_quantile", color="green", domain="dask_cudf_python")
 def _approximate_quantile(df, q):
     """Approximate quantiles of DataFrame or Series.
     [NOTE: Same logic as dask.dataframe Series quantile]
@@ -180,6 +187,7 @@ def set_quantile_index(df):
     return df
 
 
+@annotate("quantile_divisions", color="green", domain="dask_cudf_python")
 def quantile_divisions(df, by, npartitions):
     qn = np.linspace(0.0, 1.0, npartitions + 1).tolist()
     divisions = _approximate_quantile(df[by], qn).compute()
@@ -213,6 +221,7 @@ def quantile_divisions(df, by, npartitions):
     return divisions
 
 
+@annotate("sort_values", color="green", domain="dask_cudf_python")
 def sort_values(
     df,
     by,