From 5cce7b616b78be7fbc0553e153400e6e922a58c1 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 2 Feb 2022 13:49:21 -0800 Subject: [PATCH 1/8] add new annotations --- python/cudf/cudf/core/dataframe.py | 100 +++++++++++++++++++++++- python/cudf/cudf/core/frame.py | 117 ++++++++++++++++++++++++++++- python/cudf/cudf/io/parquet.py | 17 +++++ 3 files changed, 228 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 323a5ad088a..3131e10d52c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -111,6 +111,7 @@ def __setitem__(self, key, value): key = (key, slice(None)) return self._setitem_tuple_arg(key, value) + @annotate("_CAN_DOWNCAST_TO_SERIES", color="green", domain="cudf_python") def _can_downcast_to_series(self, df, arg): """ This method encapsulates the logic used @@ -151,6 +152,7 @@ def _can_downcast_to_series(self, df, arg): return True return False + @annotate("_DOWNCAST_TO_SERIES", color="green", domain="cudf_python") def _downcast_to_series(self, df, arg): """ "Downcast" from a DataFrame to a Series @@ -192,6 +194,7 @@ class _DataFrameLocIndexer(_DataFrameIndexer): For selection by label. """ + @annotate("_GETITEM_SCALAR", color="green", domain="cudf_python") def _getitem_scalar(self, arg): return self._frame[arg[1]].loc[arg[0]] @@ -628,6 +631,9 @@ def __init__( if dtype: self._data = self.astype(dtype)._data + @annotate( + "DATAFRAME_INIT_FROM_SERIES_LIST", color="blue", domain="cudf_python" + ) def _init_from_series_list(self, data, columns, index): if index is None: # When `index` is `None`, the final index of @@ -726,6 +732,9 @@ def _init_from_series_list(self, data, columns, index): ) self._data = self._data.select_by_label(columns) + @annotate( + "DATAFRAME_INIT_FROM_LIST_LIKE", color="blue", domain="cudf_python" + ) def _init_from_list_like(self, data, index=None, columns=None): if index is None: index = RangeIndex(start=0, stop=len(data)) @@ -762,6 +771,9 @@ def _init_from_list_like(self, data, index=None, columns=None): self.columns = columns + @annotate( + "DATAFRAME_INIT_FROM_DICT_LIKE", color="blue", domain="cudf_python" + ) def _init_from_dict_like( self, data, index=None, columns=None, nan_as_null=None ): @@ -835,6 +847,11 @@ def _from_data( return out @staticmethod + @annotate( + "DATAFRAME_ALIGN_INPUT_SERIES_INDICES", + color="blue", + domain="cudf_python", + ) def _align_input_series_indices(data, index): data = data.copy() @@ -1180,6 +1197,7 @@ def __delitem__(self, name): """ self._drop_column(name) + @annotate("DATAFRAME_SLICE", color="blue", domain="cudf_python") def _slice(self: T, arg: slice) -> T: """ _slice : slice the frame as per the arg @@ -1241,6 +1259,7 @@ def _slice(self: T, arg: slice) -> T: result.columns = self.columns return result + @annotate("DATAFRAME_MEMORY_USAGE", color="blue", domain="cudf_python") def memory_usage(self, index=True, deep=False): """ Return the memory usage of each column in bytes. @@ -1303,6 +1322,7 @@ def memory_usage(self, index=True, deep=False): sizes.append(self.index.memory_usage()) return Series(sizes, index=ind) + @annotate("DATAFRAME_ARRAY_UFUNC", color="blue", domain="cudf_python") def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): if method == "__call__" and hasattr(cudf, ufunc.__name__): func = getattr(cudf, ufunc.__name__) @@ -1310,6 +1330,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): else: return NotImplemented + @annotate("DATAFRAME_ARRAY_FUNCTION", color="blue", domain="cudf_python") def __array_function__(self, func, types, args, kwargs): cudf_df_module = DataFrame @@ -1348,6 +1369,7 @@ def __array_function__(self, func, types, args, kwargs): return NotImplemented # The _get_numeric_data method is necessary for dask compatibility. + @annotate("DATAFRAME_GET_NUMERIC_DATA", color="blue", domain="cudf_python") def _get_numeric_data(self): """Return a dataframe with only numeric data types""" columns = [ @@ -1357,6 +1379,7 @@ def _get_numeric_data(self): ] return self[columns] + @annotate("DATAFRAME_ASSIGN", color="blue", domain="cudf_python") def assign(self, **kwargs): """ Assign columns to DataFrame from keyword arguments. @@ -1835,10 +1858,12 @@ def _get_renderable_dataframe(self): return output + @annotate("DATAFRAME_REPR", color="blue", domain="cudf_python") def __repr__(self): output = self._get_renderable_dataframe() return self._clean_renderable_dataframe(output) + @annotate("DATAFRAME_REPR_HTML", color="blue", domain="cudf_python") def _repr_html_(self): lines = ( self._get_renderable_dataframe() @@ -1855,9 +1880,13 @@ def _repr_html_(self): lines.append("") return "\n".join(lines) + @annotate("DATAFRAME_REPR_LATEX", color="blue", domain="cudf_python") def _repr_latex_(self): return self._get_renderable_dataframe().to_pandas()._repr_latex_() + @annotate( + "DATAFRAME_GET_COLUMNS_BY_LABEL", color="blue", domain="cudf_python" + ) def _get_columns_by_label(self, labels, downcast=False): """ Return columns of dataframe by `labels` @@ -1880,6 +1909,7 @@ def _get_columns_by_label(self, labels, downcast=False): ) return out + @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python") def _binaryop( self, other: Any, @@ -1970,6 +2000,7 @@ def _binaryop( index=lhs._index, ) + @annotate("DATAFRAME_UPDATE", color="blue", domain="cudf_python") def update( self, other, @@ -2063,14 +2094,17 @@ def update( self._mimic_inplace(source_df, inplace=True) + @annotate("DATAFRAME_ITER", color="blue", domain="cudf_python") def __iter__(self): return iter(self.columns) + @annotate("DATAFRAME_ITERITEMS", color="blue", domain="cudf_python") def iteritems(self): """Iterate over column names and series pairs""" for k in self: yield (k, self[k]) + @annotate("DATAFRAME_EQUALS", color="blue", domain="cudf_python") def equals(self, other, **kwargs): ret = super().equals(other) # If all other checks matched, validate names. @@ -2133,6 +2167,7 @@ def columns(self, columns): data, multiindex=is_multiindex, level_names=columns.names, ) + @annotate("DATAFRAME_REINDEX_INTERNAL", color="blue", domain="cudf_python") def _reindex( self, columns, dtypes=None, deep=False, index=None, inplace=False ): @@ -2209,6 +2244,7 @@ def _reindex( return self._mimic_inplace(result, inplace=inplace) + @annotate("DATAFRAME_REINDEX", color="blue", domain="cudf_python") def reindex( self, labels=None, axis=None, index=None, columns=None, copy=True ): @@ -2287,6 +2323,7 @@ def reindex( inplace=False, ) + @annotate("DATAFRAME_SET_INDEX", color="blue", domain="cudf_python") def set_index( self, keys, @@ -2547,12 +2584,13 @@ def reset_index( inplace=inplace, ) + @annotate("DATAFRAME_TAKE", color="blue", domain="cudf_python") def take(self, indices, axis=0): out = super().take(indices) out.columns = self.columns return out - @annotate("INSERT", color="green", domain="cudf_python") + @annotate("DATAFRAME_INSERT", color="green", domain="cudf_python") def insert(self, loc, name, value, nan_as_null=None): """Add a column to DataFrame at the index specified by loc. @@ -2604,6 +2642,7 @@ def insert(self, loc, name, value, nan_as_null=None): self._data.insert(name, value, loc=loc) + @annotate("DATAFRAME_DROP", color="green", domain="cudf_python") def drop( self, labels=None, @@ -2777,12 +2816,14 @@ def drop( if not inplace: return out + @annotate("DATAFRAME_DROP_COLUMN", color="green", domain="cudf_python") def _drop_column(self, name): """Drop a column by *name*""" if name not in self._data: raise KeyError(f"column '{name}' does not exist") del self._data[name] + @annotate("DATAFRAME_DROP_DUPLICATES", color="green", domain="cudf_python") def drop_duplicates( self, subset=None, keep="first", inplace=False, ignore_index=False ): @@ -2860,12 +2901,14 @@ def drop_duplicates( return self._mimic_inplace(outdf, inplace=inplace) + @annotate("DATAFRAME_POP", color="green", domain="cudf_python") def pop(self, item): """Return a column and drop it from the DataFrame.""" popped = self[item] del self[item] return popped + @annotate("DATAFRAME_RENAME", color="green", domain="cudf_python") def rename( self, mapper=None, @@ -3009,6 +3052,7 @@ def rename( else: return out.copy(deep=copy) + @annotate("DATAFRAME_ADD_PREFIX", color="green", domain="cudf_python") def add_prefix(self, prefix): out = self.copy(deep=True) out.columns = [ @@ -3016,6 +3060,7 @@ def add_prefix(self, prefix): ] return out + @annotate("DATAFRAME_ADD_SUFFIX", color="green", domain="cudf_python") def add_suffix(self, suffix): out = self.copy(deep=True) out.columns = [ @@ -3023,6 +3068,7 @@ def add_suffix(self, suffix): ] return out + @annotate("DATAFRAME_AGG", color="green", domain="cudf_python") def agg(self, aggs, axis=None): """ Aggregate using one or more operations over the specified axis. @@ -3154,6 +3200,7 @@ def agg(self, aggs, axis=None): else: raise ValueError("argument must be a string, list or dict") + @annotate("DATAFRAME_NLARGEST", color="green", domain="cudf_python") def nlargest(self, n, columns, keep="first"): """Get the rows of the DataFrame sorted by the n largest value of *columns* @@ -3285,6 +3332,7 @@ def nsmallest(self, n, columns, keep="first"): """ return self._n_largest_or_smallest(False, n, columns, keep) + @annotate("DATAFRAME_TRANSPOSE", color="green", domain="cudf_python") def transpose(self): """Transpose index and columns. @@ -3315,6 +3363,7 @@ def transpose(self): T = property(transpose, doc=transpose.__doc__) + @annotate("DATAFRAME_MELT", color="green", domain="cudf_python") def melt(self, **kwargs): """Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. @@ -3344,7 +3393,7 @@ def melt(self, **kwargs): return melt(self, **kwargs) - @annotate("JOIN", color="blue", domain="cudf_python") + @annotate("DATAFRAME_JOIN", color="blue", domain="cudf_python") def merge( self, right, @@ -3526,6 +3575,7 @@ def join( ) return df + @annotate("DATAFRAME_GROUPBY", color="green", domain="cudf_python") @copy_docstring(DataFrameGroupBy) def groupby( self, @@ -3575,6 +3625,7 @@ def groupby( ) ) + @annotate("DATAFRAME_QUERY", color="green", domain="cudf_python") def query(self, expr, local_dict=None): """ Query with a boolean expression using Numba to compile a GPU kernel. @@ -3665,6 +3716,7 @@ def query(self, expr, local_dict=None): boolmask = queryutils.query_execute(self, expr, callenv) return self._apply_boolean_mask(boolmask) + @annotate("DATAFRAME_APPLY", color="green", domain="cudf_python") def apply( self, func, axis=1, raw=False, result_type=None, args=(), **kwargs ): @@ -3813,6 +3865,7 @@ def apply( return self._apply(func, _get_row_kernel, *args, **kwargs) + @annotate("DATAFRAME_APPLY_ROWS", color="green", domain="cudf_python") @applyutils.doc_apply() def apply_rows( self, @@ -3891,6 +3944,7 @@ def apply_rows( cache_key=cache_key, ) + @annotate("DATAFRAME_APPLY_CHUNKS", color="green", domain="cudf_python") @applyutils.doc_applychunks() def apply_chunks( self, @@ -3958,6 +4012,9 @@ def apply_chunks( tpb=tpb, ) + @annotate( + "DATAFRAME_PARTITION_BY_HASH", color="green", domain="cudf_python" + ) def partition_by_hash(self, columns, nparts, keep_index=True): """Partition the dataframe by the hashed value of data in *columns*. @@ -4295,6 +4352,7 @@ def _sizeof_fmt(num, size_qualifier): cudf.utils.ioutils.buffer_write_lines(buf, lines) + @annotate("DATAFRAME_DESCRIBE", color="green", domain="cudf_python") @docutils.doc_describe() def describe( self, @@ -4354,6 +4412,7 @@ def describe( sort=False, ) + @annotate("DATAFRAME_TO_PANDAS", color="green", domain="cudf_python") def to_pandas(self, nullable=False, **kwargs): """ Convert to a Pandas DataFrame. @@ -4439,6 +4498,7 @@ def to_pandas(self, nullable=False, **kwargs): out_df.columns = out_columns return out_df + @annotate("DATAFRAME_FROM_PANDAS", color="green", domain="cudf_python") @classmethod def from_pandas(cls, dataframe, nan_as_null=None): """ @@ -4508,6 +4568,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): return result + @annotate("DATAFRAME_FROM_ARROW", color="green", domain="cudf_python") @classmethod def from_arrow(cls, table): """ @@ -4564,6 +4625,7 @@ def from_arrow(cls, table): return out + @annotate("DATAFRAME_TO_ARROW", color="green", domain="cudf_python") def to_arrow(self, preserve_index=True): """ Convert to a PyArrow Table. @@ -4638,6 +4700,7 @@ def to_arrow(self, preserve_index=True): return out.replace_schema_metadata(metadata) + @annotate("DATAFRAME_TO_RECORDS", color="green", domain="cudf_python") def to_records(self, index=True): """Convert to a numpy recarray @@ -4660,6 +4723,7 @@ def to_records(self, index=True): ret[col] = self[col].to_numpy() return ret + @annotate("DATAFRAME_FROM_RECORDS", color="green", domain="cudf_python") @classmethod def from_records(cls, data, index=None, columns=None, nan_as_null=False): """ @@ -4722,6 +4786,9 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): return df @classmethod + @annotate( + "DATAFRAME_FROM_ARRAYS_INTERNAL", color="green", domain="cudf_python" + ) def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): """Convert a numpy/cupy array to DataFrame. @@ -4781,6 +4848,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): df._index = as_index(index) return df + @annotate("DATAFRAME_INTERPOLATE", color="green", domain="cudf_python") def interpolate( self, method="linear", @@ -4811,6 +4879,7 @@ def interpolate( **kwargs, ) + @annotate("DATAFRAME_QUANTILE", color="green", domain="cudf_python") def quantile( self, q=0.5, @@ -4926,6 +4995,7 @@ def quantile( result.index = q return result + @annotate("DATAFRAME_QUANTILES", color="green", domain="cudf_python") def quantiles(self, q=0.5, interpolation="nearest"): """ Return values at the given quantile. @@ -4965,6 +5035,7 @@ def quantiles(self, q=0.5, interpolation="nearest"): result.index = as_index(q) return result + @annotate("DATAFRAME_ISIN", color="green", domain="cudf_python") def isin(self, values): """ Whether each element in the DataFrame is contained in values. @@ -5102,6 +5173,9 @@ def isin(self, values): # # Stats # + @annotate( + "DATAFRAME_PREPARE_FOR_ROWWISE_OP", color="green", domain="cudf_python" + ) def _prepare_for_rowwise_op(self, method, skipna): """Prepare a DataFrame for CuPy-based row-wise operations.""" @@ -5151,6 +5225,7 @@ def _prepare_for_rowwise_op(self, method, skipna): coerced = coerced.astype("int64", copy=False) return coerced, mask, common_dtype + @annotate("DATAFRAME_COUNT", color="green", domain="cudf_python") def count(self, axis=0, level=None, numeric_only=False, **kwargs): """ Count ``non-NA`` cells for each column or row. @@ -5197,6 +5272,7 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): "columns": 1, } + @annotate("DATAFRAME_REDUCE", color="green", domain="cudf_python") def _reduce( self, op, axis=None, level=None, numeric_only=None, **kwargs, ): @@ -5221,6 +5297,7 @@ def _reduce( elif axis == 1: return self._apply_cupy_method_axis_1(op, **kwargs) + @annotate("DATAFRAME_SCAN", color="green", domain="cudf_python") def _scan( self, op, axis=None, *args, **kwargs, ): @@ -5231,6 +5308,7 @@ def _scan( elif axis == 1: return self._apply_cupy_method_axis_1(f"cum{op}", **kwargs) + @annotate("DATAFRAME_MODE", color="green", domain="cudf_python") def mode(self, axis=0, numeric_only=False, dropna=True): """ Get the mode(s) of each element along the selected axis. @@ -5330,6 +5408,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True): return df + @annotate("DATAFRAME_KURTOSIS", color="green", domain="cudf_python") def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -5338,6 +5417,7 @@ def kurtosis( axis, skipna, level, numeric_only, **kwargs ) + @annotate("DATAFRAME_SKEW", color="green", domain="cudf_python") def skew( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -5346,14 +5426,17 @@ def skew( axis, skipna, level, numeric_only, **kwargs ) + @annotate("DATAFRAME_ALL", color="green", domain="cudf_python") def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).all(axis, skipna, level, **kwargs) + @annotate("DATAFRAME_ANY", color="green", domain="cudf_python") def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): obj = self.select_dtypes(include="bool") if bool_only else self return super(DataFrame, obj).any(axis, skipna, level, **kwargs) + @annotate("DATAFRAME_APPLY_CUPY", color="green", domain="cudf_python") def _apply_cupy_method_axis_1(self, method, *args, **kwargs): # This method uses cupy to perform scans and reductions along rows of a # DataFrame. Since cuDF is designed around columnar storage and @@ -5454,6 +5537,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): result_df.columns = prepared.columns return result_df + @annotate("DATAFRAME_COLUMNS_VIEW", color="green", domain="cudf_python") def _columns_view(self, columns): """ Return a subset of the DataFrame's columns as a view. @@ -5462,6 +5546,7 @@ def _columns_view(self, columns): {col: self._data[col] for col in columns}, index=self.index ) + @annotate("DATAFRAME_SELECT_DTYPES", color="green", domain="cudf_python") def select_dtypes(self, include=None, exclude=None): """Return a subset of the DataFrame’s columns based on the column dtypes. @@ -5650,6 +5735,7 @@ def to_orc(self, fname, compression=None, *args, **kwargs): orc.to_orc(self, fname, compression, *args, **kwargs) + @annotate("DATAFRAME_STACK", color="green", domain="cudf_python") def stack(self, level=-1, dropna=True): """Stack the prescribed level(s) from columns to index @@ -5711,6 +5797,7 @@ def stack(self, level=-1, dropna=True): else: return result + @annotate("DATAFRAME_COV", color="green", domain="cudf_python") def cov(self, **kwargs): """Compute the covariance matrix of a DataFrame. @@ -5728,6 +5815,7 @@ def cov(self, **kwargs): df.columns = self.columns return df + @annotate("DATAFRAME_CORR", color="green", domain="cudf_python") def corr(self): """Compute the correlation matrix of a DataFrame.""" corr = cupy.corrcoef(self.values, rowvar=False) @@ -5735,6 +5823,7 @@ def corr(self): df.columns = self.columns return df + @annotate("DATAFRAME_TO_STRUCT", color="green", domain="cudf_python") def to_struct(self, name=None): """ Return a struct Series composed of the columns of the DataFrame. @@ -5759,6 +5848,7 @@ def to_struct(self, name=None): name=name, ) + @annotate("DATAFRAME_KEYS", color="green", domain="cudf_python") def keys(self): """ Get the columns. @@ -5806,6 +5896,7 @@ def iterrows(self): "if you wish to iterate over each row." ) + @annotate("DATAFRAME_APPEND", color="green", domain="cudf_python") def append( self, other, ignore_index=False, verify_integrity=False, sort=False ): @@ -5958,6 +6049,7 @@ def append( return cudf.concat(to_concat, ignore_index=ignore_index, sort=sort) + @annotate("DATAFRAME_PIVOT", color="green", domain="cudf_python") @copy_docstring(reshape.pivot) def pivot(self, index, columns, values=None): @@ -5965,12 +6057,14 @@ def pivot(self, index, columns, values=None): self, index=index, columns=columns, values=values ) + @annotate("DATAFRAME_UNSTACK", color="green", domain="cudf_python") @copy_docstring(reshape.unstack) def unstack(self, level=-1, fill_value=None): return cudf.core.reshape.unstack( self, level=level, fill_value=fill_value ) + @annotate("DATAFRAME_EXPLODE", color="green", domain="cudf_python") def explode(self, column, ignore_index=False): """ Transform each element of a list-like to a row, replicating index @@ -6145,6 +6239,7 @@ def func(left, right, output): ) +@annotate("CUDF_FROM_PANDAS", color="green", domain="cudf_python") def from_pandas(obj, nan_as_null=None): """ Convert certain Pandas objects into the cudf equivalent. @@ -6265,6 +6360,7 @@ def from_pandas(obj, nan_as_null=None): ) +@annotate("CUDF_MERGE", color="green", domain="cudf_python") def merge(left, right, *args, **kwargs): return left.merge(right, *args, **kwargs) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 891f58657b0..6b0f978b6b1 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -121,6 +121,7 @@ def deserialize(cls, header, frames): columns = deserialize_columns(header["columns"], frames) return cls_deserialize._from_data(dict(zip(column_names, columns))) + @annotate("FRAME_FROM_DATA", color="green", domain="cudf_python") @classmethod def _from_data( cls, @@ -131,6 +132,7 @@ def _from_data( Frame.__init__(obj, data, index) return obj + @annotate("FRAME_FROM_COLUMNS", color="green", domain="cudf_python") @classmethod def _from_columns( cls, @@ -162,6 +164,9 @@ def _from_columns( return cls._from_data(data, index) + @annotate( + "FRAME_FROM_COLUMNS_LIKE_SELF", color="green", domain="cudf_python" + ) def _from_columns_like_self( self, columns: List[ColumnBase], @@ -339,6 +344,7 @@ def empty(self): def __len__(self): return self._num_rows + @annotate("FRAME_COPY", color="green", domain="cudf_python") def copy(self: T, deep: bool = True) -> T: """ Make a copy of this object's indices and data. @@ -424,6 +430,7 @@ def copy(self: T, deep: bool = True) -> T: return new_frame + @annotate("FRAME_EQUALS", color="green", domain="cudf_python") def equals(self, other, **kwargs): """ Test whether two objects contain the same elements. @@ -506,6 +513,7 @@ def equals(self, other, **kwargs): else: return self._index.equals(other._index) + @annotate("FRAME_EXPLODE", color="green", domain="cudf_python") def _explode(self, explode_column: Any, ignore_index: bool): """Helper function for `explode` in `Series` and `Dataframe`, explodes a specified nested column. Other columns' corresponding rows are @@ -529,6 +537,9 @@ def _explode(self, explode_column: Any, ignore_index: bool): res.index.names = self._index.names return res + @annotate( + "FRAME_GET_COLUMNS_BY_LABEL", color="green", domain="cudf_python" + ) def _get_columns_by_label(self, labels, downcast=False): """ Returns columns of the Frame specified by `labels` @@ -536,6 +547,9 @@ def _get_columns_by_label(self, labels, downcast=False): """ return self._data.select_by_label(labels) + @annotate( + "FRAME_GET_COLUMNS_BY_INDEX", color="green", domain="cudf_python" + ) def _get_columns_by_index(self, indices): """ Returns columns of the Frame specified by `labels` @@ -559,6 +573,7 @@ def _as_column(self): return self._data[None].copy(deep=False) + @annotate("FRAME_EMPTY_LIKE", color="green", domain="cudf_python") def _empty_like(self, keep_index=True): result = self.__class__._from_data( *libcudf.copying.table_empty_like(self, keep_index) @@ -648,6 +663,7 @@ def get_column_values_na(col): matrix[:, i] = get_column_values_na(col) return matrix + @annotate("FRAME_TO_CUPY", color="green", domain="cudf_python") def to_cupy( self, dtype: Union[Dtype, None] = None, @@ -682,6 +698,7 @@ def to_cupy( na_value, ) + @annotate("FRAME_TO_NUMPY", color="green", domain="cudf_python") def to_numpy( self, dtype: Union[Dtype, None] = None, @@ -716,6 +733,7 @@ def to_numpy( (lambda col: col.values_host), np.empty, dtype, na_value ) + @annotate("FRAME_CLIP", color="green", domain="cudf_python") def clip(self, lower=None, upper=None, inplace=False, axis=1): """ Trim values at input threshold(s). @@ -843,6 +861,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): return self._mimic_inplace(output, inplace=inplace) + @annotate("FRAME_WHERE", color="green", domain="cudf_python") def where(self, cond, other=None, inplace=False): """ Replace values where the condition is False. @@ -901,6 +920,7 @@ def where(self, cond, other=None, inplace=False): frame=self, cond=cond, other=other, inplace=inplace ) + @annotate("FRAME_MASK", color="green", domain="cudf_python") def mask(self, cond, other=None, inplace=False): """ Replace values where the condition is True. @@ -962,6 +982,7 @@ def mask(self, cond, other=None, inplace=False): return self.where(cond=~cond, other=other, inplace=inplace) + @annotate("FRAME_PIPE", color="green", domain="cudf_python") def pipe(self, func, *args, **kwargs): """ Apply ``func(self, *args, **kwargs)``. @@ -1092,6 +1113,7 @@ def scatter_by_map( return result + @annotate("FRAME_FILLNA", color="green", domain="cudf_python") def fillna( self, value=None, method=None, axis=None, inplace=False, limit=None ): @@ -1238,6 +1260,7 @@ def fillna( return self._mimic_inplace(result, inplace=inplace) + @annotate("FRAME_DROPNA_COLUMNS", color="green", domain="cudf_python") def _drop_na_columns(self, how="any", subset=None, thresh=None): """ Drop columns containing nulls @@ -1265,6 +1288,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None): return self[out_cols] + @annotate("FRAME_INTERPOLATE", color="green", domain="cudf_python") def interpolate( self, method="linear", @@ -1334,6 +1358,7 @@ def interpolate( else result._gather(perm_sort.argsort()) ) + @annotate("FRAME_QUANTILES", color="green", domain="cudf_python") def _quantiles( self, q, @@ -1366,6 +1391,7 @@ def _quantiles( result._copy_type_metadata(self) return result + @annotate("FRAME_RANK", color="green", domain="cudf_python") def rank( self, axis=0, @@ -1442,6 +1468,7 @@ def rank( return self._from_data(data, index).astype(np.float64) + @annotate("FRAME_REPEAT", color="green", domain="cudf_python") def repeat(self, repeats, axis=None): """Repeats elements consecutively. @@ -1531,6 +1558,7 @@ def repeat(self, repeats, axis=None): result._copy_type_metadata(self) return result + @annotate("FRAME_SHIFT", color="green", domain="cudf_python") def shift(self, periods=1, freq=None, axis=0, fill_value=None): """Shift values by `periods` positions.""" axis = self._get_axis_from_axis_arg(axis) @@ -1546,7 +1574,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): zip(self._column_names, data_columns), self._index ) - @annotate("SAMPLE", color="orange", domain="cudf_python") + @annotate("FRAME_SAMPLE", color="orange", domain="cudf_python") def sample( self, n=None, @@ -1740,7 +1768,7 @@ def sample( return result @classmethod - @annotate("FROM_ARROW", color="orange", domain="cudf_python") + @annotate("FRAME_FROM_ARROW", color="orange", domain="cudf_python") def from_arrow(cls, data): """Convert from PyArrow Table to Frame @@ -1880,7 +1908,7 @@ def from_arrow(cls, data): return cls._from_data({name: result[name] for name in column_names}) - @annotate("TO_ARROW", color="orange", domain="cudf_python") + @annotate("FRAME_TO_ARROW", color="orange", domain="cudf_python") def to_arrow(self): """ Convert to arrow Table @@ -1912,6 +1940,7 @@ def _positions_from_column_names(self, column_names): if name in set(column_names) ] + @annotate("FRAME_REPLACE", color="green", domain="cudf_python") def replace( self, to_replace=None, @@ -2198,6 +2227,7 @@ def _copy_type_metadata( return self + @annotate("FRAME_ISNULL", color="green", domain="cudf_python") def isnull(self): """ Identify missing values. @@ -2279,6 +2309,7 @@ def isnull(self): # Alias for isnull isna = isnull + @annotate("FRAME_NOTNULL", color="green", domain="cudf_python") def notnull(self): """ Identify non-missing values. @@ -2360,6 +2391,7 @@ def notnull(self): # Alias for notnull notna = notnull + @annotate("FRAME_INTERLEAVE_COLUMNS", color="green", domain="cudf_python") def interleave_columns(self): """ Interleave Series columns of a table into a single column. @@ -2399,6 +2431,7 @@ def interleave_columns(self): return result + @annotate("FRAME_TILE", color="green", domain="cudf_python") def tile(self, count): """ Repeats the rows from `self` DataFrame `count` times to form a @@ -2428,6 +2461,7 @@ def tile(self, count): result._copy_type_metadata(self) return result + @annotate("FRAME_SEARCHSORTED", color="green", domain="cudf_python") def searchsorted( self, values, side="left", ascending=True, na_position="last" ): @@ -2512,7 +2546,7 @@ def searchsorted( else: return result - @annotate("ARGSORT", color="yellow", domain="cudf_python") + @annotate("FRAME_ARGSORT", color="yellow", domain="cudf_python") def argsort( self, by=None, @@ -2615,6 +2649,7 @@ def _get_sorted_inds(self, by=None, ascending=True, na_position="last"): return libcudf.sort.order_by(to_sort, ascending, na_position) + @annotate("FRAME_SIN", color="green", domain="cudf_python") def sin(self): """ Get Trigonometric sine, element-wise. @@ -2676,6 +2711,7 @@ def sin(self): """ return self._unaryop("sin") + @annotate("FRAME_COS", color="green", domain="cudf_python") def cos(self): """ Get Trigonometric cosine, element-wise. @@ -2737,6 +2773,7 @@ def cos(self): """ return self._unaryop("cos") + @annotate("FRAME_TAN", color="green", domain="cudf_python") def tan(self): """ Get Trigonometric tangent, element-wise. @@ -2798,6 +2835,7 @@ def tan(self): """ return self._unaryop("tan") + @annotate("FRAME_ASIN", color="green", domain="cudf_python") def asin(self): """ Get Trigonometric inverse sine, element-wise. @@ -2848,6 +2886,7 @@ def asin(self): """ return self._unaryop("asin") + @annotate("FRAME_ACOS", color="green", domain="cudf_python") def acos(self): """ Get Trigonometric inverse cosine, element-wise. @@ -2906,6 +2945,7 @@ def acos(self): result = result.mask((result < 0) | (result > np.pi + 1)) return result + @annotate("FRAME_ATAN", color="green", domain="cudf_python") def atan(self): """ Get Trigonometric inverse tangent, element-wise. @@ -2966,6 +3006,7 @@ def atan(self): """ return self._unaryop("atan") + @annotate("FRAME_EXP", color="green", domain="cudf_python") def exp(self): """ Get the exponential of all elements, element-wise. @@ -3028,6 +3069,7 @@ def exp(self): """ return self._unaryop("exp") + @annotate("FRAME_LOG", color="green", domain="cudf_python") def log(self): """ Get the natural logarithm of all elements, element-wise. @@ -3089,6 +3131,7 @@ def log(self): """ return self._unaryop("log") + @annotate("FRAME_SQRT", color="green", domain="cudf_python") def sqrt(self): """ Get the non-negative square-root of all elements, element-wise. @@ -3144,6 +3187,7 @@ def sqrt(self): """ return self._unaryop("sqrt") + @annotate("FRAME_ABS", color="green", domain="cudf_python") def abs(self): """ Return a Series/DataFrame with absolute numeric value of each element. @@ -3170,6 +3214,7 @@ def abs(self): return self._unaryop("abs") # Rounding + @annotate("FRAME_CEIL", color="green", domain="cudf_python") def ceil(self): """ Rounds each value upward to the smallest integral value not less @@ -3206,6 +3251,7 @@ def ceil(self): return self._unaryop("ceil") + @annotate("FRAME_FLOOR", color="green", domain="cudf_python") def floor(self): """Rounds each value downward to the largest integral value not greater than the original. @@ -3245,6 +3291,7 @@ def floor(self): return self._unaryop("floor") + @annotate("FRAME_SCALE", color="green", domain="cudf_python") def scale(self): """ Scale values to [0, 1] in float64 @@ -3279,6 +3326,7 @@ def scale(self): scaled._index = self._index.copy(deep=False) return scaled + @annotate("FRAME_INTERNAL_MERGE", color="green", domain="cudf_python") def _merge( self, right, @@ -3322,6 +3370,7 @@ def _merge( suffixes=suffixes, ).perform_merge() + @annotate("FRAME_IS_SORTED", color="green", domain="cudf_python") def _is_sorted(self, ascending=None, null_position=None): """ Returns a boolean indicating whether the data of the Frame are sorted @@ -3352,12 +3401,14 @@ def _is_sorted(self, ascending=None, null_position=None): self, ascending=ascending, null_position=null_position ) + @annotate("FRAME_SPLIT", color="green", domain="cudf_python") def _split(self, splits, keep_index=True): results = libcudf.copying.table_split( self, splits, keep_index=keep_index ) return [self.__class__._from_data(*result) for result in results] + @annotate("FRAME_ENCODE", color="green", domain="cudf_python") def _encode(self): data, index, indices = libcudf.transform.table_encode(self) for name, col in data.items(): @@ -3365,6 +3416,7 @@ def _encode(self): keys = self.__class__._from_data(data, index) return keys, indices + @annotate("FRAME_UNARYOP", color="green", domain="cudf_python") def _unaryop(self, op): data_columns = (col.unary_operator(op) for col in self._columns) return self.__class__._from_data( @@ -3404,6 +3456,7 @@ def _binaryop( raise NotImplementedError @classmethod + @annotate("FRAME_COLWISE_BINOP", color="green", domain="cudf_python") def _colwise_binop( cls, operands: Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]], @@ -3562,6 +3615,7 @@ def _colwise_binop( return output + @annotate("FRAME_DOT", color="green", domain="cudf_python") def dot(self, other, reflect=False): """ Get dot product of frame and other, (binary operator `dot`). @@ -3722,6 +3776,7 @@ def _reduce(self, *args, **kwargs): f"Reductions are not supported for objects of type {type(self)}." ) + @annotate("FRAME_MIN", color="green", domain="cudf_python") def min( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): @@ -3767,6 +3822,7 @@ def min( **kwargs, ) + @annotate("FRAME_MAX", color="green", domain="cudf_python") def max( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, ): @@ -3812,6 +3868,7 @@ def max( **kwargs, ) + @annotate("FRAME_SUM", color="green", domain="cudf_python") def sum( self, axis=None, @@ -3870,6 +3927,7 @@ def sum( **kwargs, ) + @annotate("FRAME_PRODUCT", color="green", domain="cudf_python") def product( self, axis=None, @@ -3934,6 +3992,7 @@ def product( # Alias for pandas compatibility. prod = product + @annotate("FRAME_MEAN", color="green", domain="cudf_python") def mean( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -3978,6 +4037,7 @@ def mean( **kwargs, ) + @annotate("FRAME_STD", color="green", domain="cudf_python") def std( self, axis=None, @@ -4034,6 +4094,7 @@ def std( **kwargs, ) + @annotate("FRAME_VAR", color="green", domain="cudf_python") def var( self, axis=None, @@ -4089,6 +4150,7 @@ def var( **kwargs, ) + @annotate("FRAME_KURTOSIS", color="green", domain="cudf_python") def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4157,6 +4219,7 @@ def kurt( **kwargs, ) + @annotate("FRAME_SKEW", color="green", domain="cudf_python") def skew( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4214,6 +4277,7 @@ def skew( **kwargs, ) + @annotate("FRAME_ALL", color="green", domain="cudf_python") def all(self, axis=0, skipna=True, level=None, **kwargs): """ Return whether all elements are True in DataFrame. @@ -4249,6 +4313,7 @@ def all(self, axis=0, skipna=True, level=None, **kwargs): "all", axis=axis, skipna=skipna, level=level, **kwargs, ) + @annotate("FRAME_ANY", color="green", domain="cudf_python") def any(self, axis=0, skipna=True, level=None, **kwargs): """ Return whether any elements is True in DataFrame. @@ -4284,6 +4349,7 @@ def any(self, axis=0, skipna=True, level=None, **kwargs): "any", axis=axis, skipna=skipna, level=level, **kwargs, ) + @annotate("FRAME_SUM_OF_SQUARES", color="green", domain="cudf_python") def sum_of_squares(self, dtype=None): """Return the sum of squares of values. @@ -4307,6 +4373,7 @@ def sum_of_squares(self, dtype=None): """ return self._reduce("sum_of_squares", dtype=dtype) + @annotate("FRAME_MEDIAN", color="green", domain="cudf_python") def median( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -4352,6 +4419,7 @@ def median( ) # Scans + @annotate("FRAME_SCAN", color="green", domain="cudf_python") def _scan(self, op, axis=None, skipna=True, cast_to_int=False): skipna = True if skipna is None else skipna @@ -4386,6 +4454,7 @@ def _scan(self, op, axis=None, skipna=True, cast_to_int=False): # for Index._from_data and simplify. return self._from_data(results, index=self._index) + @annotate("FRAME_CUMMIN", color="green", domain="cudf_python") def cummin(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative minimum of the Series or DataFrame. @@ -4429,6 +4498,7 @@ def cummin(self, axis=None, skipna=True, *args, **kwargs): """ return self._scan("min", axis=axis, skipna=skipna, *args, **kwargs) + @annotate("FRAME_CUMMAX", color="green", domain="cudf_python") def cummax(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative maximum of the Series or DataFrame. @@ -4472,6 +4542,7 @@ def cummax(self, axis=None, skipna=True, *args, **kwargs): """ return self._scan("max", axis=axis, skipna=skipna, *args, **kwargs) + @annotate("FRAME_CUMSUM", color="green", domain="cudf_python") def cumsum(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative sum of the Series or DataFrame. @@ -4518,6 +4589,7 @@ def cumsum(self, axis=None, skipna=True, *args, **kwargs): "sum", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs ) + @annotate("FRAME_CUMPROD", color="green", domain="cudf_python") def cumprod(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative product of the Series or DataFrame. @@ -4563,6 +4635,7 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): "prod", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs ) + @annotate("FRAME_TO_JSON", color="green", domain="cudf_python") @ioutils.doc_to_json() def to_json(self, path_or_buf=None, *args, **kwargs): """{docstring}""" @@ -4571,18 +4644,21 @@ def to_json(self, path_or_buf=None, *args, **kwargs): self, path_or_buf=path_or_buf, *args, **kwargs ) + @annotate("FRAME_TO_HDF", color="green", domain="cudf_python") @ioutils.doc_to_hdf() def to_hdf(self, path_or_buf, key, *args, **kwargs): """{docstring}""" cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) + @annotate("FRAME_TO_DLPACK", color="green", domain="cudf_python") @ioutils.doc_to_dlpack() def to_dlpack(self): """{docstring}""" return cudf.io.dlpack.to_dlpack(self) + @annotate("FRAME_TO_STRING", color="green", domain="cudf_python") def to_string(self): """ Convert to string @@ -4608,12 +4684,15 @@ def to_string(self): def __str__(self): return self.to_string() + @annotate("FRAME_DEEP_COPY", color="green", domain="cudf_python") def __deepcopy__(self, memo): return self.copy(deep=True) + @annotate("FRAME_COPY", color="green", domain="cudf_python") def __copy__(self): return self.copy(deep=False) + @annotate("FRAME_HEAD", color="green", domain="cudf_python") def head(self, n=5): """ Return the first `n` rows. @@ -4697,6 +4776,7 @@ def head(self, n=5): """ return self.iloc[:n] + @annotate("FRAME_TAIL", color="green", domain="cudf_python") def tail(self, n=5): """ Returns the last n rows as a new DataFrame or Series @@ -4728,6 +4808,7 @@ def tail(self, n=5): return self.iloc[-n:] + @annotate("FRAME_ROLLING", color="green", domain="cudf_python") @copy_docstring(Rolling) def rolling( self, window, min_periods=None, center=False, axis=0, win_type=None @@ -4741,6 +4822,7 @@ def rolling( win_type=win_type, ) + @annotate("FRAME_NANS_TO_NULLS", color="green", domain="cudf_python") def nans_to_nulls(self): """ Convert nans (if any) to nulls @@ -4795,6 +4877,7 @@ def nans_to_nulls(self): self._index, ) + @annotate("FRAME_INVERT", color="green", domain="cudf_python") def __invert__(self): """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" return self._from_data( @@ -4805,6 +4888,7 @@ def __invert__(self): self._index, ) + @annotate("FRAME_ADD", color="green", domain="cudf_python") def add(self, other, axis, level=None, fill_value=None): """ Get Addition of dataframe or series and other, element-wise (binary @@ -4875,6 +4959,7 @@ def add(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "add", fill_value) + @annotate("FRAME_RADD", color="green", domain="cudf_python") def radd(self, other, axis, level=None, fill_value=None): """ Get Addition of dataframe or series and other, element-wise (binary @@ -4954,6 +5039,7 @@ def radd(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "add", fill_value, reflect=True) + @annotate("FRAME_SUBTRACT", color="green", domain="cudf_python") def subtract(self, other, axis, level=None, fill_value=None): """ Get Subtraction of dataframe or series and other, element-wise (binary @@ -5036,6 +5122,7 @@ def subtract(self, other, axis, level=None, fill_value=None): sub = subtract + @annotate("FRAME_RSUB", color="green", domain="cudf_python") def rsub(self, other, axis, level=None, fill_value=None): """ Get Subtraction of dataframe or series and other, element-wise (binary @@ -5119,6 +5206,7 @@ def rsub(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "sub", fill_value, reflect=True) + @annotate("FRAME_MULTIPLY", color="green", domain="cudf_python") def multiply(self, other, axis, level=None, fill_value=None): """ Get Multiplication of dataframe or series and other, element-wise @@ -5203,6 +5291,7 @@ def multiply(self, other, axis, level=None, fill_value=None): mul = multiply + @annotate("FRAME_RMUL", color="green", domain="cudf_python") def rmul(self, other, axis, level=None, fill_value=None): """ Get Multiplication of dataframe or series and other, element-wise @@ -5287,6 +5376,7 @@ def rmul(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "mul", fill_value, reflect=True) + @annotate("FRAME_MOD", color="green", domain="cudf_python") def mod(self, other, axis, level=None, fill_value=None): """ Get Modulo division of dataframe or series and other, element-wise @@ -5357,6 +5447,7 @@ def mod(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "mod", fill_value) + @annotate("FRAME_RMOD", color="green", domain="cudf_python") def rmod(self, other, axis, level=None, fill_value=None): """ Get Modulo division of dataframe or series and other, element-wise @@ -5439,6 +5530,7 @@ def rmod(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "mod", fill_value, reflect=True) + @annotate("FRAME_POW", color="green", domain="cudf_python") def pow(self, other, axis, level=None, fill_value=None): """ Get Exponential power of dataframe series and other, element-wise @@ -5518,6 +5610,7 @@ def pow(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "pow", fill_value) + @annotate("FRAME_RPOW", color="green", domain="cudf_python") def rpow(self, other, axis, level=None, fill_value=None): """ Get Exponential power of dataframe or series and other, element-wise @@ -5597,6 +5690,7 @@ def rpow(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "pow", fill_value, reflect=True) + @annotate("FRAME_FLOORDIV", color="green", domain="cudf_python") def floordiv(self, other, axis, level=None, fill_value=None): """ Get Integer division of dataframe or series and other, element-wise @@ -5676,6 +5770,7 @@ def floordiv(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "floordiv", fill_value) + @annotate("FRAME_RFLOORDIV", color="green", domain="cudf_python") def rfloordiv(self, other, axis, level=None, fill_value=None): """ Get Integer division of dataframe or series and other, element-wise @@ -5772,6 +5867,7 @@ def rfloordiv(self, other, axis, level=None, fill_value=None): return self._binaryop(other, "floordiv", fill_value, reflect=True) + @annotate("FRAME_TRUEDIV", color="green", domain="cudf_python") def truediv(self, other, axis, level=None, fill_value=None): """ Get Floating division of dataframe or series and other, element-wise @@ -5860,6 +5956,7 @@ def truediv(self, other, axis, level=None, fill_value=None): div = truediv divide = truediv + @annotate("FRAME_RTRUEDIV", color="green", domain="cudf_python") def rtruediv(self, other, axis, level=None, fill_value=None): """ Get Floating division of dataframe or series and other, element-wise @@ -5952,6 +6049,7 @@ def rtruediv(self, other, axis, level=None, fill_value=None): # Alias for rtruediv rdiv = rtruediv + @annotate("FRAME_EQ", color="green", domain="cudf_python") def eq(self, other, axis="columns", level=None, fill_value=None): """Equal to, element-wise (binary operator eq). @@ -6027,6 +6125,7 @@ def eq(self, other, axis="columns", level=None, fill_value=None): other=other, fn="eq", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_NE", color="green", domain="cudf_python") def ne(self, other, axis="columns", level=None, fill_value=None): """Not equal to, element-wise (binary operator ne). @@ -6102,6 +6201,7 @@ def ne(self, other, axis="columns", level=None, fill_value=None): other=other, fn="ne", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_LT", color="green", domain="cudf_python") def lt(self, other, axis="columns", level=None, fill_value=None): """Less than, element-wise (binary operator lt). @@ -6177,6 +6277,7 @@ def lt(self, other, axis="columns", level=None, fill_value=None): other=other, fn="lt", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_LE", color="green", domain="cudf_python") def le(self, other, axis="columns", level=None, fill_value=None): """Less than or equal, element-wise (binary operator le). @@ -6252,6 +6353,7 @@ def le(self, other, axis="columns", level=None, fill_value=None): other=other, fn="le", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_GT", color="green", domain="cudf_python") def gt(self, other, axis="columns", level=None, fill_value=None): """Greater than, element-wise (binary operator gt). @@ -6327,6 +6429,7 @@ def gt(self, other, axis="columns", level=None, fill_value=None): other=other, fn="gt", fill_value=fill_value, can_reindex=True ) + @annotate("FRAME_GE", color="green", domain="cudf_python") def ge(self, other, axis="columns", level=None, fill_value=None): """Greater than or equal, element-wise (binary operator ge). @@ -6403,6 +6506,11 @@ def ge(self, other, axis="columns", level=None, fill_value=None): ) +@annotate( + "FRAME_GET_REPLACEMENT_VALUES_FOR_COLUMNS", + color="green", + domain="cudf_python", +) def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] ) -> Tuple[Dict[Any, bool], Dict[Any, Any], Dict[Any, Any]]: @@ -6567,6 +6675,7 @@ def _is_series(obj): return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None +@annotate("FRAME_DROP_ROWS_BY_LABELS", color="green", domain="cudf_python") def _drop_rows_by_labels( obj: DataFrameOrSeries, labels: Union[ColumnLike, abc.Iterable, str], diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index a919b00692d..948428de4f0 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -7,6 +7,7 @@ from uuid import uuid4 import numpy as np +from nvtx import annotate from pyarrow import dataset as ds, parquet as pq import cudf @@ -16,6 +17,7 @@ from cudf.utils import ioutils +@annotate("_WRITE_PARQUET", color="green", domain="cudf_python") def _write_parquet( df, paths, @@ -73,6 +75,7 @@ def _write_parquet( # Logic chosen to match: https://arrow.apache.org/ # docs/_modules/pyarrow/parquet.html#write_to_dataset +@annotate("WRITE_TO_DATASET", color="green", domain="cudf_python") def write_to_dataset( df, root_path, @@ -161,6 +164,7 @@ def write_to_dataset( @ioutils.doc_read_parquet_metadata() +@annotate("READ_PARQUET_METADATA", color="green", domain="cudf_python") def read_parquet_metadata(path): """{docstring}""" @@ -173,6 +177,7 @@ def read_parquet_metadata(path): return num_rows, num_row_groups, col_names +@annotate("_PROCESS_DATASET", color="green", domain="cudf_python") def _process_dataset( paths, fs, filters=None, row_groups=None, categorical_partitions=True, ): @@ -308,6 +313,7 @@ def _process_dataset( @ioutils.doc_read_parquet() +@annotate("READ_PARQUET", color="green", domain="cudf_python") def read_parquet( filepath_or_buffer, engine="cudf", @@ -435,6 +441,7 @@ def read_parquet( ) +@annotate("_PARQUET_TO_FRAME", color="green", domain="cudf_python") def _parquet_to_frame( paths_or_buffers, *args, @@ -502,6 +509,7 @@ def _parquet_to_frame( ) +@annotate("_WRITE_PARQUET", color="green", domain="cudf_python") def _read_parquet( filepaths_or_buffers, engine, @@ -535,6 +543,7 @@ def _read_parquet( @ioutils.doc_to_parquet() +@annotate("TO_PARQUET", color="green", domain="cudf_python") def to_parquet( df, path, @@ -646,6 +655,7 @@ def _generate_filename(): return uuid4().hex + ".parquet" +@annotate("_GET_PARTITIONED", color="green", domain="cudf_python") def _get_partitioned( df, root_path, @@ -689,6 +699,7 @@ def _get_partitioned( class ParquetDatasetWriter: + @annotate("ParquetDatasetWriter_INIT", color="green", domain="cudf_python") def __init__( self, path, @@ -765,6 +776,9 @@ def __init__( self.path_cw_map: Dict[str, int] = {} self.filename = None + @annotate( + "ParquetDatasetWriter_WRITE_TABLE", color="green", domain="cudf_python" + ) def write_table(self, df): """ Write a dataframe to the file/dataset @@ -821,6 +835,9 @@ def write_table(self, df): self.path_cw_map.update({k: new_cw_idx for k in new_paths}) self._chunked_writers[-1][0].write_table(grouped_df, part_info) + @annotate( + "ParquetDatasetWriter_CLOSE", color="green", domain="cudf_python" + ) def close(self, return_metadata=False): """ Close all open files and optionally return footer metadata as a binary From 814ed75bac6d873a970ac4103c810c66a9b8bdbf Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 2 Feb 2022 18:39:48 -0600 Subject: [PATCH 2/8] Update frame.py --- python/cudf/cudf/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6b0f978b6b1..2123569e482 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -121,8 +121,8 @@ def deserialize(cls, header, frames): columns = deserialize_columns(header["columns"], frames) return cls_deserialize._from_data(dict(zip(column_names, columns))) - @annotate("FRAME_FROM_DATA", color="green", domain="cudf_python") @classmethod + @annotate("FRAME_FROM_DATA", color="green", domain="cudf_python") def _from_data( cls, data: MutableMapping, @@ -132,8 +132,8 @@ def _from_data( Frame.__init__(obj, data, index) return obj - @annotate("FRAME_FROM_COLUMNS", color="green", domain="cudf_python") @classmethod + @annotate("FRAME_FROM_COLUMNS", color="green", domain="cudf_python") def _from_columns( cls, columns: List[ColumnBase], From 6377d68bcbdeed327806e001dfd8cc1faef600dd Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 2 Feb 2022 18:56:11 -0600 Subject: [PATCH 3/8] Update dataframe.py --- python/cudf/cudf/core/dataframe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3131e10d52c..48ef140c949 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4498,8 +4498,8 @@ def to_pandas(self, nullable=False, **kwargs): out_df.columns = out_columns return out_df - @annotate("DATAFRAME_FROM_PANDAS", color="green", domain="cudf_python") @classmethod + @annotate("DATAFRAME_FROM_PANDAS", color="green", domain="cudf_python") def from_pandas(cls, dataframe, nan_as_null=None): """ Convert from a Pandas DataFrame. @@ -4568,8 +4568,8 @@ def from_pandas(cls, dataframe, nan_as_null=None): return result - @annotate("DATAFRAME_FROM_ARROW", color="green", domain="cudf_python") @classmethod + @annotate("DATAFRAME_FROM_ARROW", color="green", domain="cudf_python") def from_arrow(cls, table): """ Convert from PyArrow Table to DataFrame. @@ -4723,8 +4723,8 @@ def to_records(self, index=True): ret[col] = self[col].to_numpy() return ret - @annotate("DATAFRAME_FROM_RECORDS", color="green", domain="cudf_python") @classmethod + @annotate("DATAFRAME_FROM_RECORDS", color="green", domain="cudf_python") def from_records(cls, data, index=None, columns=None, nan_as_null=False): """ Convert structured or record ndarray to DataFrame. From 24e9d4985afa0d6ff403b54b3ed2650b0bfda724 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 2 Feb 2022 17:41:12 -0800 Subject: [PATCH 4/8] remove annotation --- python/cudf/cudf/core/dataframe.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 48ef140c949..8cbb9b0df5f 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3625,7 +3625,6 @@ def groupby( ) ) - @annotate("DATAFRAME_QUERY", color="green", domain="cudf_python") def query(self, expr, local_dict=None): """ Query with a boolean expression using Numba to compile a GPU kernel. @@ -3692,7 +3691,7 @@ def query(self, expr, local_dict=None): """ # can't use `annotate` decorator here as we inspect the calling # environment. - with annotate("QUERY", color="purple", domain="cudf_python"): + with annotate("DATAFRAME_QUERY", color="purple", domain="cudf_python"): if local_dict is None: local_dict = {} From 98676756024597e8449176f62fca340757970d84 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Thu, 3 Feb 2022 07:40:56 -0800 Subject: [PATCH 5/8] annotations in dask_cudf --- python/dask_cudf/dask_cudf/backends.py | 23 +++++++++++++ python/dask_cudf/dask_cudf/core.py | 41 ++++++++++++++++++++++ python/dask_cudf/dask_cudf/groupby.py | 47 ++++++++++++++++++++++++++ python/dask_cudf/dask_cudf/sorting.py | 8 +++++ 4 files changed, 119 insertions(+) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 89b5301ee83..9ad4bfb5b82 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd import pyarrow as pa +from nvtx import annotate from dask.dataframe.core import get_parallel_type, meta_nonempty from dask.dataframe.dispatch import ( @@ -39,6 +40,7 @@ @meta_nonempty.register(cudf.BaseIndex) +@annotate("_nonempty_index", color="green", domain="dask_cudf_python") def _nonempty_index(idx): if isinstance(idx, cudf.core.index.RangeIndex): return cudf.core.index.RangeIndex(2, name=idx.name) @@ -73,6 +75,7 @@ def _nonempty_index(idx): raise TypeError(f"Don't know how to handle index of type {type(idx)}") +@annotate("_get_non_empty_data", color="green", domain="dask_cudf_python") def _get_non_empty_data(s): if isinstance(s._column, cudf.core.column.CategoricalColumn): categories = ( @@ -100,6 +103,7 @@ def _get_non_empty_data(s): @meta_nonempty.register(cudf.Series) +@annotate("_nonempty_series", color="green", domain="dask_cudf_python") def _nonempty_series(s, idx=None): if idx is None: idx = _nonempty_index(s.index) @@ -109,6 +113,7 @@ def _nonempty_series(s, idx=None): @meta_nonempty.register(cudf.DataFrame) +@annotate("meta_nonempty_cudf", color="green", domain="dask_cudf_python") def meta_nonempty_cudf(x): idx = meta_nonempty(x.index) columns_with_dtype = dict() @@ -124,15 +129,18 @@ def meta_nonempty_cudf(x): @make_meta_dispatch.register((cudf.Series, cudf.DataFrame)) +@annotate("make_meta_cudf", color="green", domain="dask_cudf_python") def make_meta_cudf(x, index=None): return x.head(0) @make_meta_dispatch.register(cudf.BaseIndex) +@annotate("make_meta_cudf_index", color="green", domain="dask_cudf_python") def make_meta_cudf_index(x, index=None): return x[:0] +@annotate("_empty_series", color="green", domain="dask_cudf_python") def _empty_series(name, dtype, index=None): if isinstance(dtype, str) and dtype == "category": return cudf.Series( @@ -142,6 +150,7 @@ def _empty_series(name, dtype, index=None): @make_meta_obj.register(object) +@annotate("make_meta_object_cudf", color="green", domain="dask_cudf_python") def make_meta_object_cudf(x, index=None): """Create an empty cudf object containing the desired metadata. @@ -212,6 +221,7 @@ def make_meta_object_cudf(x, index=None): @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.BaseIndex)) +@annotate("concat_cudf", color="green", domain="dask_cudf_python") def concat_cudf( dfs, axis=0, @@ -236,11 +246,13 @@ def concat_cudf( @categorical_dtype_dispatch.register( (cudf.DataFrame, cudf.Series, cudf.BaseIndex) ) +@annotate("categorical_dtype_cudf", color="green", domain="dask_cudf_python") def categorical_dtype_cudf(categories=None, ordered=None): return cudf.CategoricalDtype(categories=categories, ordered=ordered) @tolist_dispatch.register((cudf.Series, cudf.BaseIndex)) +@annotate("tolist_cudf", color="green", domain="dask_cudf_python") def tolist_cudf(obj): return obj.to_arrow().to_pylist() @@ -248,6 +260,9 @@ def tolist_cudf(obj): @is_categorical_dtype_dispatch.register( (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series) ) +@annotate( + "is_categorical_dtype_cudf", color="green", domain="dask_cudf_python" +) def is_categorical_dtype_cudf(obj): return cudf.api.types.is_categorical_dtype(obj) @@ -261,6 +276,7 @@ def is_categorical_dtype_cudf(obj): ) @percentile_lookup.register((cudf.Series, cp.ndarray, cudf.BaseIndex)) + @annotate("percentile_cudf", color="green", domain="dask_cudf_python") def percentile_cudf(a, q, interpolation="linear"): # Cudf dispatch to the equivalent of `np.percentile`: # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html @@ -305,6 +321,7 @@ def percentile_cudf(a, q, interpolation="linear"): @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex)) +@annotate("union_categoricals_cudf", color="green", domain="dask_cudf_python") def union_categoricals_cudf( to_union, sort_categories=False, ignore_order=False ): @@ -313,11 +330,13 @@ def union_categoricals_cudf( ) +@annotate("safe_hash", color="green", domain="dask_cudf_python") def safe_hash(frame): return cudf.Series(frame.hash_values(), index=frame.index) @hash_object_dispatch.register((cudf.DataFrame, cudf.Series)) +@annotate("hash_object_cudf", color="green", domain="dask_cudf_python") def hash_object_cudf(frame, index=True): if index: return safe_hash(frame.reset_index()) @@ -325,6 +344,7 @@ def hash_object_cudf(frame, index=True): @hash_object_dispatch.register(cudf.BaseIndex) +@annotate("hash_object_cudf_index", color="green", domain="dask_cudf_python") def hash_object_cudf_index(ind, index=None): if isinstance(ind, cudf.MultiIndex): @@ -335,6 +355,7 @@ def hash_object_cudf_index(ind, index=None): @group_split_dispatch.register((cudf.Series, cudf.DataFrame)) +@annotate("group_split_cudf", color="green", domain="dask_cudf_python") def group_split_cudf(df, c, k, ignore_index=False): return dict( zip( @@ -349,10 +370,12 @@ def group_split_cudf(df, c, k, ignore_index=False): @sizeof_dispatch.register(cudf.DataFrame) +@annotate("sizeof_cudf_dataframe", color="green", domain="dask_cudf_python") def sizeof_cudf_dataframe(df): return int(df.memory_usage().sum()) @sizeof_dispatch.register((cudf.Series, cudf.BaseIndex)) +@annotate("sizeof_cudf_series_index", color="green", domain="dask_cudf_python") def sizeof_cudf_series_index(obj): return obj.memory_usage() diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index e191873f82b..c195240501f 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +from nvtx import annotate from tlz import partition_all import dask @@ -57,6 +58,7 @@ def __dask_postcompute__(self): def __dask_postpersist__(self): return type(self), (self._name, self._meta, self.divisions) + @annotate("_FRAME_INIT", color="green", domain="dask_cudf_python") def __init__(self, dsk, name, meta, divisions): if not isinstance(dsk, HighLevelGraph): dsk = HighLevelGraph.from_collections(name, dsk, dependencies=[]) @@ -82,6 +84,9 @@ def __repr__(self): s = "" return s % (type(self).__name__, len(self.dask), self.npartitions) + @annotate( + "_FRAME_to_dask_dataframe", color="green", domain="dask_cudf_python" + ) def to_dask_dataframe(self, **kwargs): """Create a dask.dataframe object from a dask_cudf object""" nullable_pd_dtype = kwargs.get("nullable_pd_dtype", False) @@ -99,6 +104,9 @@ def to_dask_dataframe(self, **kwargs): class DataFrame(_Frame, dd.core.DataFrame): _partition_type = cudf.DataFrame + @annotate( + "DATAFRAME_assign_column", color="green", domain="dask_cudf_python" + ) def _assign_column(self, k, v): def assigner(df, k, v): out = df.copy() @@ -108,6 +116,7 @@ def assigner(df, k, v): meta = assigner(self._meta, k, dask_make_meta(v)) return self.map_partitions(assigner, k, v, meta=meta) + @annotate("DATAFRAME_apply_rows", color="green", domain="dask_cudf_python") def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): import uuid @@ -127,6 +136,7 @@ def do_apply_rows(df, func, incols, outcols, kwargs): do_apply_rows, func, incols, outcols, kwargs, meta=meta ) + @annotate("DATAFRAME_merge", color="green", domain="dask_cudf_python") def merge(self, other, **kwargs): if kwargs.pop("shuffle", "tasks") != "tasks": raise ValueError( @@ -138,6 +148,7 @@ def merge(self, other, **kwargs): on = list(on) return super().merge(other, on=on, shuffle="tasks", **kwargs) + @annotate("DATAFRAME_join", color="green", domain="dask_cudf_python") def join(self, other, **kwargs): if kwargs.pop("shuffle", "tasks") != "tasks": raise ValueError( @@ -155,6 +166,7 @@ def join(self, other, **kwargs): on = list(on) return super().join(other, how=how, on=on, shuffle="tasks", **kwargs) + @annotate("DATAFRAME_set_index", color="green", domain="dask_cudf_python") def set_index(self, other, sorted=False, divisions=None, **kwargs): if kwargs.pop("shuffle", "tasks") != "tasks": raise ValueError( @@ -226,6 +238,9 @@ def set_index(self, other, sorted=False, divisions=None, **kwargs): **kwargs, ) + @annotate( + "DATAFRAME_sort_values", color="green", domain="dask_cudf_python" + ) def sort_values( self, by, @@ -261,12 +276,14 @@ def sort_values( return df.reset_index(drop=True) return df + @annotate("DATAFRAME_to_parquet", color="green", domain="dask_cudf_python") def to_parquet(self, path, *args, **kwargs): """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" from dask_cudf.io import to_parquet return to_parquet(self, path, *args, **kwargs) + @annotate("DATAFRAME_to_orc", color="green", domain="dask_cudf_python") def to_orc(self, path, **kwargs): """Calls dask_cudf.io.to_orc""" from dask_cudf.io import to_orc @@ -274,6 +291,7 @@ def to_orc(self, path, **kwargs): return to_orc(self, path, **kwargs) @derived_from(pd.DataFrame) + @annotate("DATAFRAME_var", color="green", domain="dask_cudf_python") def var( self, axis=None, @@ -302,6 +320,9 @@ def var( else: return _parallel_var(self, meta, skipna, split_every, out) + @annotate( + "DATAFRAME_repartition", color="green", domain="dask_cudf_python" + ) def repartition(self, *args, **kwargs): """Wraps dask.dataframe DataFrame.repartition method. Uses DataFrame.shuffle if `columns=` is specified. @@ -324,6 +345,7 @@ def repartition(self, *args, **kwargs): ) return super().repartition(*args, **kwargs) + @annotate("DATAFRAME_shuffle", color="green", domain="dask_cudf_python") def shuffle(self, *args, **kwargs): """Wraps dask.dataframe DataFrame.shuffle method""" shuffle_arg = kwargs.pop("shuffle", None) @@ -331,18 +353,21 @@ def shuffle(self, *args, **kwargs): raise ValueError("dask_cudf does not support disk-based shuffle.") return super().shuffle(*args, shuffle="tasks", **kwargs) + @annotate("DATAFRAME_groupby", color="green", domain="dask_cudf_python") def groupby(self, by=None, **kwargs): from .groupby import CudfDataFrameGroupBy return CudfDataFrameGroupBy(self, by=by, **kwargs) +@annotate("DATAFRAME_sum_of_squares", color="green", domain="dask_cudf_python") def sum_of_squares(x): x = x.astype("f8")._column outcol = libcudf.reduce.reduce("sum_of_squares", x) return cudf.Series(outcol) +@annotate("DATAFRAME_var_aggregate", color="green", domain="dask_cudf_python") def var_aggregate(x2, x, n, ddof): try: with warnings.catch_warnings(record=True): @@ -355,10 +380,12 @@ def var_aggregate(x2, x, n, ddof): return np.float64(np.nan) +@annotate("DATAFRAME_nlargest_agg", color="green", domain="dask_cudf_python") def nlargest_agg(x, **kwargs): return cudf.concat(x).nlargest(**kwargs) +@annotate("DATAFRAME_nsmallest_agg", color="green", domain="dask_cudf_python") def nsmallest_agg(x, **kwargs): return cudf.concat(x).nsmallest(**kwargs) @@ -366,6 +393,7 @@ def nsmallest_agg(x, **kwargs): class Series(_Frame, dd.core.Series): _partition_type = cudf.Series + @annotate("Series_count", color="green", domain="dask_cudf_python") def count(self, split_every=False): return reduction( [self], @@ -375,12 +403,14 @@ def count(self, split_every=False): meta="i8", ) + @annotate("Series_mean", color="green", domain="dask_cudf_python") def mean(self, split_every=False): sum = self.sum(split_every=split_every) n = self.count(split_every=split_every) return sum / n @derived_from(pd.DataFrame) + @annotate("Series_var", color="green", domain="dask_cudf_python") def var( self, axis=None, @@ -409,16 +439,19 @@ def var( else: return _parallel_var(self, meta, skipna, split_every, out) + @annotate("Series_groupby", color="green", domain="dask_cudf_python") def groupby(self, *args, **kwargs): from .groupby import CudfSeriesGroupBy return CudfSeriesGroupBy(self, *args, **kwargs) @property + @annotate("Series_list", color="green", domain="dask_cudf_python") def list(self): return ListMethods(self) @property + @annotate("Series_struct", color="green", domain="dask_cudf_python") def struct(self): return StructMethods(self) @@ -427,6 +460,7 @@ class Index(Series, dd.core.Index): _partition_type = cudf.Index # type: ignore +@annotate("_naive_var", color="green", domain="dask_cudf_python") def _naive_var(ddf, meta, skipna, ddof, split_every, out): num = ddf._get_numeric_data() x = 1.0 * num.sum(skipna=skipna, split_every=split_every) @@ -441,6 +475,7 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out): return handle_out(out, result) +@annotate("_parallel_var", color="green", domain="dask_cudf_python") def _parallel_var(ddf, meta, skipna, split_every, out): def _local_var(x, skipna): if skipna: @@ -507,6 +542,7 @@ def _finalize_var(vals): return handle_out(out, result) +@annotate("_extract_meta", color="green", domain="dask_cudf_python") def _extract_meta(x): """ Extract internal cache data (``_meta``) from dask_cudf objects @@ -522,6 +558,7 @@ def _extract_meta(x): return x +@annotate("_emulate", color="green", domain="dask_cudf_python") def _emulate(func, *args, **kwargs): """ Apply a function using args / kwargs. If arguments contain dd.DataFrame / @@ -531,6 +568,7 @@ def _emulate(func, *args, **kwargs): return func(*_extract_meta(args), **_extract_meta(kwargs)) +@annotate("align_partitions", color="green", domain="dask_cudf_python") def align_partitions(args): """Align partitions between dask_cudf objects. @@ -546,6 +584,7 @@ def align_partitions(args): return args +@annotate("reduction", color="green", domain="dask_cudf_python") def reduction( args, chunk=None, @@ -686,6 +725,7 @@ def reduction( return dd.core.new_dd_object(graph, b, meta, (None, None)) +@annotate("from_cudf", color="green", domain="dask_cudf_python") def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): if isinstance(getattr(data, "index", None), cudf.MultiIndex): raise NotImplementedError( @@ -707,6 +747,7 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): ) +@annotate("from_dask_dataframe", color="green", domain="dask_cudf_python") def from_dask_dataframe(df): return df.map_partitions(cudf.from_pandas) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py index 1bc270a5b9f..658e63ea923 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/groupby.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +from nvtx import annotate from dask.base import tokenize from dask.dataframe.core import ( @@ -35,11 +36,19 @@ class CudfDataFrameGroupBy(DataFrameGroupBy): + @annotate( + "CudfDataFrameGroupBy_INIT", color="green", domain="dask_cudf_python" + ) def __init__(self, *args, **kwargs): self.sep = kwargs.pop("sep", "___") self.as_index = kwargs.pop("as_index", True) super().__init__(*args, **kwargs) + @annotate( + "CudfDataFrameGroupBy_GETITEM", + color="green", + domain="dask_cudf_python", + ) def __getitem__(self, key): if isinstance(key, list): g = CudfDataFrameGroupBy( @@ -53,6 +62,9 @@ def __getitem__(self, key): g._meta = g._meta[key] return g + @annotate( + "CudfDataFrameGroupBy_MEAN", color="green", domain="dask_cudf_python" + ) def mean(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -66,6 +78,11 @@ def mean(self, split_every=None, split_out=1): as_index=self.as_index, ) + @annotate( + "CudfDataFrameGroupBy_COLLECT", + color="green", + domain="dask_cudf_python", + ) def collect(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -79,6 +96,11 @@ def collect(self, split_every=None, split_out=1): as_index=self.as_index, ) + @annotate( + "CudfDataFrameGroupBy_AGGREGATE", + color="green", + domain="dask_cudf_python", + ) def aggregate(self, arg, split_every=None, split_out=1): if arg == "size": return self.size() @@ -118,11 +140,17 @@ def aggregate(self, arg, split_every=None, split_out=1): class CudfSeriesGroupBy(SeriesGroupBy): + @annotate( + "CudfSeriesGroupBy_INIT", color="green", domain="dask_cudf_python" + ) def __init__(self, *args, **kwargs): self.sep = kwargs.pop("sep", "___") self.as_index = kwargs.pop("as_index", True) super().__init__(*args, **kwargs) + @annotate( + "CudfSeriesGroupBy_MEAN", color="green", domain="dask_cudf_python" + ) def mean(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -136,6 +164,9 @@ def mean(self, split_every=None, split_out=1): as_index=self.as_index, )[self._slice] + @annotate( + "CudfSeriesGroupBy_STD", color="green", domain="dask_cudf_python" + ) def std(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -149,6 +180,9 @@ def std(self, split_every=None, split_out=1): as_index=self.as_index, )[self._slice] + @annotate( + "CudfSeriesGroupBy_VAR", color="green", domain="dask_cudf_python" + ) def var(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -162,6 +196,9 @@ def var(self, split_every=None, split_out=1): as_index=self.as_index, )[self._slice] + @annotate( + "CudfSeriesGroupBy_COLLECT", color="green", domain="dask_cudf_python" + ) def collect(self, split_every=None, split_out=1): return groupby_agg( self.obj, @@ -175,6 +212,9 @@ def collect(self, split_every=None, split_out=1): as_index=self.as_index, )[self._slice] + @annotate( + "CudfSeriesGroupBy_AGGREGATE", color="green", domain="dask_cudf_python" + ) def aggregate(self, arg, split_every=None, split_out=1): if arg == "size": return self.size() @@ -205,6 +245,7 @@ def aggregate(self, arg, split_every=None, split_out=1): ) +@annotate("groupby_agg", color="green", domain="dask_cudf_python") def groupby_agg( ddf, gb_cols, @@ -371,6 +412,7 @@ def groupby_agg( return new_dd_object(graph, gb_agg_name, _meta, divisions) +@annotate("_redirect_aggs", color="green", domain="dask_cudf_python") def _redirect_aggs(arg): """Redirect aggregations to their corresponding name in cuDF""" redirects = { @@ -397,6 +439,7 @@ def _redirect_aggs(arg): return redirects.get(arg, arg) +@annotate("_is_supported", color="green", domain="dask_cudf_python") def _is_supported(arg, supported: set): """Check that aggregations in `arg` are a subset of `supported`""" if isinstance(arg, (list, dict)): @@ -422,6 +465,7 @@ def _make_name(*args, sep="_"): return sep.join(_args) +@annotate("_groupby_partition_agg", color="green", domain="dask_cudf_python") def _groupby_partition_agg( df, gb_cols, aggs, columns, split_out, dropna, sort, sep ): @@ -479,6 +523,7 @@ def _groupby_partition_agg( return output +@annotate("_tree_node_agg", color="green", domain="dask_cudf_python") def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep): """Node in groupby-aggregation reduction tree. @@ -513,6 +558,7 @@ def _tree_node_agg(dfs, gb_cols, split_out, dropna, sort, sep): return gb +@annotate("_var_agg", color="green", domain="dask_cudf_python") def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): """Calculate variance (given count, sum, and sum-squared columns).""" @@ -534,6 +580,7 @@ def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): return var +@annotate("_finalize_gb_agg", color="green", domain="dask_cudf_python") def _finalize_gb_agg( gb, gb_cols, diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index af40d9ca41b..63755cf9b89 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -4,6 +4,7 @@ import cupy import numpy as np import tlz as toolz +from nvtx import annotate from dask.base import tokenize from dask.dataframe import methods @@ -16,12 +17,14 @@ from cudf.api.types import is_categorical_dtype +@annotate("set_index_post", color="green", domain="dask_cudf_python") def set_index_post(df, index_name, drop, column_dtype): df2 = df.set_index(index_name, drop=drop) df2.columns = df2.columns.astype(column_dtype) return df2 +@annotate("_set_partitions_pre", color="green", domain="dask_cudf_python") def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): if ascending: partitions = divisions.searchsorted(s, side="right") - 1 @@ -38,6 +41,7 @@ def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): return partitions +@annotate("_quantile", color="green", domain="dask_cudf_python") def _quantile(a, q): n = len(a) if not len(a): @@ -45,6 +49,7 @@ def _quantile(a, q): return (a.quantiles(q=q.tolist(), interpolation="nearest"), n) +@annotate("merge_quantiles", color="green", domain="dask_cudf_python") def merge_quantiles(finalq, qs, vals): """Combine several quantile calculations of different data. [NOTE: Same logic as dask.array merge_percentiles] @@ -107,6 +112,7 @@ def _append_counts(val, count): return rv.reset_index(drop=True) +@annotate("_approximate_quantile", color="green", domain="dask_cudf_python") def _approximate_quantile(df, q): """Approximate quantiles of DataFrame or Series. [NOTE: Same logic as dask.dataframe Series quantile] @@ -180,6 +186,7 @@ def set_quantile_index(df): return df +@annotate("quantile_divisions", color="green", domain="cudf_python") def quantile_divisions(df, by, npartitions): qn = np.linspace(0.0, 1.0, npartitions + 1).tolist() divisions = _approximate_quantile(df[by], qn).compute() @@ -213,6 +220,7 @@ def quantile_divisions(df, by, npartitions): return divisions +@annotate("sort_values", color="green", domain="cudf_python") def sort_values( df, by, From ed1519b566c31949b9663b4e52fb08997254a3d6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 9 Feb 2022 19:50:52 -0600 Subject: [PATCH 6/8] Update backends.py --- python/dask_cudf/dask_cudf/backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 9ad4bfb5b82..1b1f3e29ab2 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from collections.abc import Iterator From 6170c5c06b2a743bd7ecce5d223b0fe3e93daba4 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 9 Feb 2022 19:51:07 -0600 Subject: [PATCH 7/8] Update core.py --- python/dask_cudf/dask_cudf/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index f4764beeb57..d8802f33941 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. import math import warnings From e201d11ee2e98d62c2cf9c6c15ac60b7e76cd98c Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 9 Feb 2022 19:51:25 -0600 Subject: [PATCH 8/8] Update sorting.py --- python/dask_cudf/dask_cudf/sorting.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index 63755cf9b89..ada738c5a9b 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. + from collections.abc import Iterator import cupy