From 8a5c876be6fef2e26d94bd1c81a3433a2f83243f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 16 May 2024 17:30:15 -0700 Subject: [PATCH 1/4] Remove private uses of ._index --- python/cudf/cudf/core/dataframe.py | 196 ++++++++++++-------- python/cudf/cudf/core/groupby/groupby.py | 10 +- python/cudf/cudf/core/indexed_frame.py | 167 ++++++++--------- python/cudf/cudf/core/join/_join_helpers.py | 4 +- python/cudf/cudf/core/join/join.py | 12 +- python/cudf/cudf/core/reshape.py | 10 +- python/cudf/cudf/core/series.py | 56 +++--- python/cudf/cudf/tests/test_dataframe.py | 1 + 8 files changed, 243 insertions(+), 213 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 8442cf05f01..41d6c2defc7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -254,7 +254,7 @@ def _getitem_tuple_arg(self, arg): # Step 1: Gather columns if isinstance(arg, tuple): columns_df = self._frame._get_columns_by_label(arg[1]) - columns_df._index = self._frame._index + columns_df.index = self._frame.index else: columns_df = self._frame @@ -545,7 +545,7 @@ def __getitem__(self, arg): @_cudf_nvtx_annotate def _setitem_tuple_arg(self, key, value): columns_df = self._frame._from_data( - self._frame._data.select_by_index(key[1]), self._frame._index + self._frame._data.select_by_index(key[1]), self._frame.index ) if is_scalar(value): @@ -710,11 +710,11 
@@ def __init__( if index is not None: if not data.index.equals(index): data = data.reindex(index) - index = data._index + index = data.index else: index = as_index(index) else: - index = data._index + index = data.index self._index = index @@ -1176,7 +1176,7 @@ def _constructor_expanddim(self): def serialize(self): header, frames = super().serialize() - header["index"], index_frames = self._index.serialize() + header["index"], index_frames = self.index.serialize() header["index_frame_count"] = len(index_frames) # For backwards compatibility with older versions of cuDF, index # columns are placed before data columns. @@ -1193,7 +1193,7 @@ def deserialize(cls, header, frames): idx_typ = pickle.loads(header["index"]["type-serialized"]) index = idx_typ.deserialize(header["index"], frames[:index_nframes]) - obj._index = index + obj.index = index return obj @@ -1397,9 +1397,9 @@ def __setitem__(self, arg, value): if arg in self._data: if not is_scalar(value) and len(self) == 0: if isinstance(value, (pd.Series, Series)): - self._index = as_index(value.index) + self.index = as_index(value.index) elif len(value) > 0: - self._index = RangeIndex(start=0, stop=len(value)) + self.index = RangeIndex(start=0, stop=len(value)) value = column.as_column(value) new_data = self._data.__class__() for key in self._data: @@ -1416,7 +1416,7 @@ def __setitem__(self, arg, value): return elif isinstance(value, (pd.Series, Series)): value = Series(value)._align_to_index( - self._index, + self.index, how="right", sort=False, allow_non_unique=True, @@ -1489,7 +1489,7 @@ def memory_usage(self, index=True, deep=False): mem_usage = [col.memory_usage for col in self._data.columns] names = [str(name) for name in self._data.names] if index: - mem_usage.append(self._index.memory_usage()) + mem_usage.append(self.index.memory_usage()) names.append("Index") return Series._from_data( data={None: as_column(mem_usage)}, @@ -1698,7 +1698,7 @@ def _concat( [] if are_all_range_index or (ignore_index and not 
empty_has_index) - else list(f._index._data.columns) + else list(f.index._data.columns) ) + [f._data[name] if name in f._data else None for name in names] for f in objs @@ -1761,11 +1761,9 @@ def _concat( # least one input frame has an index, assign a new RangeIndex # to the result frame. if empty_has_index and num_empty_input_frames == len(objs): - out._index = cudf.RangeIndex(result_index_length) + out.index = cudf.RangeIndex(result_index_length) elif are_all_range_index and not ignore_index: - out._index = cudf.core.index.Index._concat( - [o._index for o in objs] - ) + out.index = cudf.core.index.Index._concat([o.index for o in objs]) # Reassign the categories for any categorical table cols _reassign_categories( @@ -1773,14 +1771,14 @@ def _concat( ) # Reassign the categories for any categorical index cols - if not isinstance(out._index, cudf.RangeIndex): + if not isinstance(out.index, cudf.RangeIndex): _reassign_categories( categories, - out._index._data, + out.index._data, indices[:first_data_column_position], ) - if not isinstance(out._index, MultiIndex) and isinstance( - out._index.dtype, cudf.CategoricalDtype + if not isinstance(out.index, MultiIndex) and isinstance( + out.index.dtype, cudf.CategoricalDtype ): out = out.set_index( cudf.core.index.as_index(out.index._values) @@ -1796,8 +1794,8 @@ def _concat( else: out.columns = names if not ignore_index: - out._index.name = objs[0]._index.name - out._index.names = objs[0]._index.names + out.index.name = objs[0].index.name + out.index.names = objs[0].index.names return out @@ -1965,7 +1963,7 @@ def _get_renderable_dataframe(self): output = cudf.concat([upper, lower]) output = self._clean_nulls_from_dataframe(output) - output._index = output._index._clean_nulls_from_index() + output.index = output.index._clean_nulls_from_index() return output @@ -2036,7 +2034,7 @@ def _make_operands_and_index_for_binop( bool, ]: lhs, rhs = self._data, other - index = self._index + index = self.index fill_requires_key = False 
left_default: Any = False equal_columns = False @@ -2069,7 +2067,7 @@ def _make_operands_and_index_for_binop( ) can_use_self_column_name = ( equal_columns - or list(other._index._data.names) == self._data._level_names + or list(other.index._data.names) == self._data._level_names ) elif isinstance(other, DataFrame): if ( @@ -2086,7 +2084,7 @@ def _make_operands_and_index_for_binop( "Can only compare identically-labeled DataFrame objects" ) new_lhs, new_rhs = _align_indices(self, other) - index = new_lhs._index + index = new_lhs.index lhs, rhs = new_lhs._data, new_rhs._data fill_requires_key = True # For DataFrame-DataFrame ops, always default to operating against @@ -2460,7 +2458,7 @@ def scatter_by_map( ) partitioned_columns, output_offsets = libcudf.partitioning.partition( - [*(self._index._columns if keep_index else ()), *self._columns], + [*(self.index._columns if keep_index else ()), *self._columns], map_index, map_size, ) @@ -3275,23 +3273,31 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if len(self) == 0: if isinstance(value, (pd.Series, Series)): if not ignore_index: - self._index = as_index(value.index) - elif len(value) > 0: - self._index = RangeIndex(start=0, stop=len(value)) - new_data = self._data.__class__() + self.index = as_index(value.index) + elif (length := len(value)) > 0: if num_cols != 0: - for col_name in self._data: - new_data[col_name] = column.column_empty_like( - self._data[col_name], - masked=True, - newsize=len(value), - ) - self._data = new_data + ca_data = ColumnAccessor( + { + col_name: column.column_empty_like( + col_data, masked=True, newsize=length + ) + for col_name, col_data in self._data.items() + }, + verify=False, + ) + else: + ca_data = {} + # TODO: Clear self._data cache here + self._mimic_inplace( + self._from_data(ca_data, index=RangeIndex(length)), + inplace=True, + ) + elif isinstance(value, (pd.Series, Series)): value = Series(value, nan_as_null=nan_as_null) if not ignore_index: value = 
value._align_to_index( - self._index, how="right", sort=False + self.index, how="right", sort=False ) value = column.as_column(value, nan_as_null=nan_as_null) @@ -3320,7 +3326,7 @@ def axes(self): Index(['key', 'k2', 'val', 'temp'], dtype='object')] """ - return [self._index, self._data.to_pandas_index()] + return [self.index, self._data.to_pandas_index()] def diff(self, periods=1, axis=0): """ @@ -4880,8 +4886,8 @@ def partition_by_hash(self, columns, nparts, keep_index=True): """ key_indices = [self._column_names.index(k) for k in columns] if keep_index: - cols = [*self._index._columns, *self._columns] - key_indices = [i + len(self._index._columns) for i in key_indices] + cols = [*self.index._columns, *self._columns] + key_indices = [i + len(self.index._columns) for i in key_indices] else: cols = [*self._columns] @@ -5046,13 +5052,13 @@ def info( lines = [str(type(self))] - index_name = type(self._index).__name__ - if len(self._index) > 0: - entries_summary = f", {self._index[0]} to {self._index[-1]}" + index_name = type(self.index).__name__ + if len(self.index) > 0: + entries_summary = f", {self.index[0]} to {self.index[-1]}" else: entries_summary = "" index_summary = ( - f"{index_name}: {len(self._index)} entries{entries_summary}" + f"{index_name}: {len(self.index)} entries{entries_summary}" ) lines.append(index_summary) @@ -5656,7 +5662,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): num_cols = len(data[0]) if columns is None and data.dtype.names is None: - names = [i for i in range(num_cols)] + names = range(num_cols) elif data.dtype.names is not None: names = data.dtype.names @@ -5669,28 +5675,42 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): ) names = columns - df = DataFrame() - if data.ndim == 2: - for i, k in enumerate(names): - df._data[k] = column.as_column( - data[:, i], nan_as_null=nan_as_null - ) + ca_data = { + k: column.as_column(data[:, i], nan_as_null=nan_as_null) + for i, k in 
enumerate(names) + } elif data.ndim == 1: - for k in names: - df._data[k] = column.as_column( - data[k], nan_as_null=nan_as_null - ) + ca_data = {} + for k in names: + ca_data[k] = column.as_column(data[k], nan_as_null=nan_as_null) - if index is None: - df._index = RangeIndex(start=0, stop=len(data)) - elif is_scalar(index): - df._index = RangeIndex(start=0, stop=len(data)) - df = df.set_index(index) + if not is_scalar(index): + new_index = as_index(index) else: - df._index = as_index(index) - if isinstance(columns, pd.Index): - df._data._level_names = tuple(columns.names) + new_index = None + + if isinstance(columns, (pd.Index, cudf.Index)): + level_names = tuple(columns.names) + else: + level_names = None + + df = cls._from_data( + ColumnAccessor( + data=ca_data, + multiindex=isinstance( + columns, (pd.MultiIndex, cudf.MultiIndex) + ), + rangeindex=isinstance( + columns, (range, pd.RangeIndex, cudf.RangeIndex) + ), + level_names=level_names, + label_dtype=getattr(columns, "dtype", None), + ), + index=new_index, + ) + if is_scalar(index) and index is not None: + df = df.set_index(index) return df @classmethod @@ -5739,26 +5759,38 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): raise ValueError("Duplicate column names are not allowed") names = columns - df = cls() if data.ndim == 2: - for i, k in enumerate(names): - df._data[k] = column.as_column( - data[:, i], nan_as_null=nan_as_null - ) + ca_data = { + k: column.as_column(data[:, i], nan_as_null=nan_as_null) + for i, k in enumerate(names) + } elif data.ndim == 1: - df._data[names[0]] = column.as_column( - data, nan_as_null=nan_as_null - ) - if isinstance(columns, pd.Index): - df._data._level_names = tuple(columns.names) - if isinstance(columns, (range, pd.RangeIndex, cudf.RangeIndex)): - df._data.rangeindex = True + ca_data = { + names[0]: column.as_column(data, nan_as_null=nan_as_null) + } - if index is None: - df._index = RangeIndex(start=0, stop=len(data)) + if index is not None: + index = 
as_index(index) + + if isinstance(columns, (pd.Index, cudf.Index)): + level_names = tuple(columns.names) else: - df._index = as_index(index) - return df + level_names = None + + return cls._from_data( + ColumnAccessor( + data=ca_data, + multiindex=isinstance( + columns, (pd.MultiIndex, cudf.MultiIndex) + ), + rangeindex=isinstance( + columns, (range, pd.RangeIndex, cudf.RangeIndex) + ), + level_names=level_names, + label_dtype=getattr(columns, "dtype", None), + ), + index=index, + ) @_cudf_nvtx_annotate def interpolate( @@ -7033,7 +7065,7 @@ def stack(self, level=-1, dropna=no_default, future_stack=False): # Assemble the final index new_index_columns = [*repeated_index._columns, *tiled_index] - index_names = [*self._index.names, *unique_named_levels.names] + index_names = [*self.index.names, *unique_named_levels.names] new_index = MultiIndex.from_frame( DataFrame._from_data( dict(zip(range(0, len(new_index_columns)), new_index_columns)) @@ -7824,7 +7856,7 @@ def value_counts( result = result / result._column.sum() # Pandas always returns MultiIndex even if only one column. 
if not isinstance(result.index, MultiIndex): - result.index = MultiIndex._from_data(result._index._data) + result.index = MultiIndex._from_data(result.index._data) result.name = "proportion" if normalize else "count" return result diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 3e4b8192888..33410ce23a3 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -940,7 +940,7 @@ def nth(self, n): result = result[sizes > n] - result._index = self.obj.index.take( + result.index = self.obj.index.take( result._data["__groupbynth_order__"] ) del result._data["__groupbynth_order__"] @@ -1029,7 +1029,7 @@ def ngroup(self, ascending=True): if has_null_group: group_ids.iloc[-1] = cudf.NA - group_ids._index = index + group_ids.index = index return self._broadcast(group_ids) def sample( @@ -1199,7 +1199,7 @@ def deserialize(cls, header, frames): def _grouped(self, *, include_groups: bool = True): offsets, grouped_key_cols, grouped_value_cols = self._groupby.groups( - [*self.obj._index._columns, *self.obj._columns] + [*self.obj.index._columns, *self.obj._columns] ) grouped_keys = cudf.core.index._index_from_data( dict(enumerate(grouped_key_cols)) @@ -2839,8 +2839,8 @@ def _handle_label(self, by): self._key_columns.append(self._obj._data[by]) except KeyError as e: # `by` can be index name(label) too. 
- if by in self._obj._index.names: - self._key_columns.append(self._obj._index._data[by]) + if by in self._obj.index.names: + self._key_columns.append(self._obj.index._data[by]) else: raise e self.names.append(by) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 7aae0d1729e..a166c256689 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -289,11 +289,11 @@ def __init__(self, data=None, index=None): @property def _num_rows(self) -> int: # Important to use the index because the data may be empty. - return len(self._index) + return len(self.index) @property def _index_names(self) -> Tuple[Any, ...]: # TODO: Tuple[str]? - return self._index._data.names + return self.index._data.names @classmethod def _from_data( @@ -307,7 +307,7 @@ def _from_data( @_cudf_nvtx_annotate def _from_data_like_self(self, data: MutableMapping): - out = self._from_data(data, self._index) + out = self._from_data(data, self.index) out._data._level_names = self._data._level_names return out @@ -350,7 +350,7 @@ def _from_columns_like_self( frame = self.__class__._from_data(data) if index is not None: - frame._index = index + frame.index = index return frame._copy_type_metadata( self, include_index=bool(index_names), @@ -367,7 +367,7 @@ def _mimic_inplace( self, result: Self, inplace: bool = False ) -> Optional[Self]: if inplace: - self._index = result._index + self._index = result.index return super()._mimic_inplace(result, inplace) # Scans @@ -442,15 +442,15 @@ def _scan(self, op, axis=None, skipna=True): # pandas returns an int64 dtype for all int or bool dtypes. result_col = result_col.astype(np.int64) results[name] = getattr(result_col, op)() - return self._from_data(results, self._index) + return self._from_data(results, self.index) def _check_data_index_length_match(self) -> None: # Validate that the number of rows in the data matches the index if the # data is not empty. 
This is a helper for the constructor. - if self._data.nrows > 0 and self._data.nrows != len(self._index): + if self._data.nrows > 0 and self._data.nrows != len(self.index): raise ValueError( f"Length of values ({self._data.nrows}) does not " - f"match length of index ({len(self._index)})" + f"match length of index ({len(self.index)})" ) @property @@ -618,14 +618,14 @@ def copy(self, deep: bool = True) -> Self: return self._from_data( self._data.copy(deep=deep), # Indexes are immutable so copies can always be shallow. - self._index.copy(deep=False), + self.index.copy(deep=False), ) @_cudf_nvtx_annotate def equals(self, other): # noqa: D102 if not super().equals(other): return False - return self._index.equals(other._index) + return self.index.equals(other.index) @property def index(self): @@ -908,7 +908,7 @@ def replace( else: copy_data = self._data.copy(deep=True) - result = self._from_data(copy_data, self._index) + result = self._from_data(copy_data, self.index) return self._mimic_inplace(result, inplace=inplace) @@ -1033,7 +1033,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1): name: col.clip(lower[i], upper[i]) for i, (name, col) in enumerate(self._data.items()) } - output = self._from_data(data, self._index) + output = self._from_data(data, self.index) output._copy_type_metadata(self, include_index=False) return self._mimic_inplace(output, inplace=inplace) @@ -1935,29 +1935,27 @@ def _copy_type_metadata( super()._copy_type_metadata(other, override_dtypes=override_dtypes) if ( include_index - and self._index is not None - and other._index is not None + and self.index is not None + and other.index is not None ): - self._index._copy_type_metadata(other._index) - # When other._index is a CategoricalIndex, the current index + self.index._copy_type_metadata(other.index) + # When other.index is a CategoricalIndex, the current index # will be a NumericalIndex with an underlying CategoricalColumn # (the above _copy_type_metadata call will have 
converted the # column). Calling cudf.Index on that column generates the # appropriate index. if isinstance( - other._index, cudf.core.index.CategoricalIndex - ) and not isinstance( - self._index, cudf.core.index.CategoricalIndex - ): - self._index = cudf.Index( - cast("cudf.Index", self._index)._column, - name=self._index.name, + other.index, cudf.core.index.CategoricalIndex + ) and not isinstance(self.index, cudf.core.index.CategoricalIndex): + self.index = cudf.Index( + cast("cudf.Index", self.index)._column, + name=self.index.name, ) - elif isinstance(other._index, cudf.MultiIndex) and not isinstance( - self._index, cudf.MultiIndex + elif isinstance(other.index, cudf.MultiIndex) and not isinstance( + self.index, cudf.MultiIndex ): - self._index = cudf.MultiIndex._from_data( - self._index._data, name=self._index.name + self.index = cudf.MultiIndex._from_data( + self.index._data, name=self.index.name ) return self @@ -2017,8 +2015,8 @@ def interpolate( data = self - if not isinstance(data._index, cudf.RangeIndex): - perm_sort = data._index.argsort() + if not isinstance(data.index, cudf.RangeIndex): + perm_sort = data.index.argsort() data = data._gather( GatherMap.from_column_unchecked( cudf.core.column.as_column(perm_sort), @@ -2040,13 +2038,13 @@ def interpolate( col = col.astype("float64").fillna(np.nan) # Interpolation methods may or may not need the index - columns[colname] = interpolator(col, index=data._index) + columns[colname] = interpolator(col, index=data.index) - result = self._from_data(columns, index=data._index) + result = self._from_data(columns, index=data.index) return ( result - if isinstance(data._index, cudf.RangeIndex) + if isinstance(data.index, cudf.RangeIndex) # TODO: This should be a scatter, avoiding an argsort. 
else result._gather( GatherMap.from_column_unchecked( @@ -2070,7 +2068,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): col.shift(periods, fill_value) for col in self._columns ) return self.__class__._from_data( - zip(self._column_names, data_columns), self._index + zip(self._column_names, data_columns), self.index ) @_cudf_nvtx_annotate @@ -2254,7 +2252,7 @@ def truncate(self, before=None, after=None, axis=0, copy=True): if not copy: raise ValueError("Truncating with copy=False is not supported.") axis = self._get_axis_from_axis_arg(axis) - ax = self._index if axis == 0 else self._data.to_pandas_index() + ax = self.index if axis == 0 else self._data.to_pandas_index() if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing: raise ValueError("truncate requires a sorted index") @@ -2585,7 +2583,7 @@ def scale(self): vmin = self.min() vmax = self.max() scaled = (self - vmin) / (vmax - vmin) - scaled._index = self._index.copy(deep=False) + scaled.index = self.index.copy(deep=False) return scaled @_cudf_nvtx_annotate @@ -2919,14 +2917,14 @@ def _gather( raise IndexError("Gather map is out of bounds") return self._from_columns_like_self( libcudf.copying.gather( - list(self._index._columns + self._columns) + list(self.index._columns + self._columns) if keep_index else list(self._columns), gather_map.column, nullify=gather_map.nullify, ), self._column_names, - self._index.names if keep_index else None, + self.index.names if keep_index else None, ) def _slice(self, arg: slice, keep_index: bool = True) -> Self: @@ -3000,7 +2998,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: columns_to_slice = [ *( - self._index._data.columns + self.index._data.columns if keep_index and not has_range_index else [] ), @@ -3009,7 +3007,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: result = self._from_columns_like_self( libcudf.copying.columns_slice(columns_to_slice, [start, stop])[0], self._column_names, - None if 
has_range_index or not keep_index else self._index.names, + None if has_range_index or not keep_index else self.index.names, ) result._data.label_dtype = self._data.label_dtype result._data.rangeindex = self._data.rangeindex @@ -3028,7 +3026,7 @@ def _positions_from_column_names( indices returned corresponds to the column order in this Frame. """ num_index_columns = ( - len(self._index._data) if offset_by_index_columns else 0 + len(self.index._data) if offset_by_index_columns else 0 ) return [ i + num_index_columns @@ -3073,13 +3071,13 @@ def drop_duplicates( libcudf.stream_compaction.drop_duplicates( list(self._columns) if ignore_index - else list(self._index._columns + self._columns), + else list(self.index._columns + self._columns), keys=keys, keep=keep, nulls_are_equal=nulls_are_equal, ), self._column_names, - self._index.names if not ignore_index else None, + self.index.names if not ignore_index else None, ) @_cudf_nvtx_annotate @@ -3197,12 +3195,12 @@ def _empty_like(self, keep_index=True) -> Self: result = self._from_columns_like_self( libcudf.copying.columns_empty_like( [ - *(self._index._data.columns if keep_index else ()), + *(self.index._data.columns if keep_index else ()), *self._columns, ] ), self._column_names, - self._index.names if keep_index else None, + self.index.names if keep_index else None, ) result._data.label_dtype = self._data.label_dtype result._data.rangeindex = self._data.rangeindex @@ -3214,7 +3212,7 @@ def _split(self, splits, keep_index=True): columns_split = libcudf.copying.columns_split( [ - *(self._index._data.columns if keep_index else []), + *(self.index._data.columns if keep_index else []), *self._columns, ], splits, @@ -3224,7 +3222,7 @@ def _split(self, splits, keep_index=True): self._from_columns_like_self( columns_split[i], self._column_names, - self._index.names if keep_index else None, + self.index.names if keep_index else None, ) for i in range(len(splits) + 1) ] @@ -3244,12 +3242,12 @@ def fillna( "Use obj.ffill() or 
obj.bfill() instead.", FutureWarning, ) - old_index = self._index + old_index = self.index ret = super().fillna(value, method, axis, inplace, limit) if inplace: - self._index = old_index + self.index = old_index else: - ret._index = old_index + ret.index = old_index return ret @_cudf_nvtx_annotate @@ -3479,7 +3477,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs): col = _post_process_output_col(ans_col, retty) col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask)) - result = cudf.Series._from_data({None: col}, self._index) + result = cudf.Series._from_data({None: col}, self.index) return result @@ -3706,12 +3704,12 @@ def _reindex( df = self if index is not None: - if not df._index.is_unique: + if not df.index.is_unique: raise ValueError( "cannot reindex on an axis with duplicate labels" ) index = cudf.core.index.as_index( - index, name=getattr(index, "name", self._index.name) + index, name=getattr(index, "name", self.index.name) ) idx_dtype_match = (df.index.nlevels == index.nlevels) and all( @@ -3739,7 +3737,7 @@ def _reindex( else name: col for name, col in df._data.items() }, - index=df._index, + index=df.index, ) df = lhs.join(rhs, how="left", sort=True) # double-argsort to map back from sorted to unsorted positions @@ -3915,7 +3913,7 @@ def round(self, decimals=0, how="half_even"): multiindex=self._data.multiindex, level_names=self._data.level_names, ), - index=self._index, + index=self.index, ) def resample( @@ -4267,7 +4265,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None): return self._from_columns_like_self( libcudf.stream_compaction.drop_nulls( - [*self._index._data.columns, *data_columns], + [*self.index._data.columns, *data_columns], how=how, keys=self._positions_from_column_names( subset, offset_by_index_columns=True @@ -4275,7 +4273,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None): thresh=thresh, ), self._column_names, - self._index.names, + self.index.names, ) def _apply_boolean_mask(self, 
boolean_mask: BooleanMask, keep_index=True): @@ -4292,13 +4290,13 @@ def _apply_boolean_mask(self, boolean_mask: BooleanMask, keep_index=True): ) return self._from_columns_like_self( libcudf.stream_compaction.apply_boolean_mask( - list(self._index._columns + self._columns) + list(self.index._columns + self._columns) if keep_index else list(self._columns), boolean_mask.column, ), column_names=self._column_names, - index_names=self._index.names if keep_index else None, + index_names=self.index.names if keep_index else None, ) def take(self, indices, axis=0): @@ -4358,7 +4356,7 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""): ) if not isinstance(level, (tuple, list)): level = (level,) - _check_duplicate_level_names(level, self._index.names) + _check_duplicate_level_names(level, self.index.names) index = self.index._new_index_for_reset_index(level, self.index.name) if index is None: @@ -4394,7 +4392,7 @@ def _first_or_last( self, offset, idx: int, op: Callable, side: str, slice_func: Callable ) -> "IndexedFrame": """Shared code path for ``first`` and ``last``.""" - if not isinstance(self._index, cudf.core.index.DatetimeIndex): + if not isinstance(self.index, cudf.core.index.DatetimeIndex): raise TypeError("'first' only supports a DatetimeIndex index.") if not isinstance(offset, str): raise NotImplementedError( @@ -4406,20 +4404,20 @@ def _first_or_last( pd_offset = pd.tseries.frequencies.to_offset(offset) to_search = op( - pd.Timestamp(self._index._column.element_indexing(idx)), pd_offset + pd.Timestamp(self.index._column.element_indexing(idx)), pd_offset ) if ( idx == 0 and not isinstance(pd_offset, pd.tseries.offsets.Tick) - and pd_offset.is_on_offset(pd.Timestamp(self._index[0])) + and pd_offset.is_on_offset(pd.Timestamp(self.index[0])) ): # Special handle is required when the start time of the index # is on the end of the offset. See pandas gh29623 for detail. 
to_search = to_search - pd_offset.base return self.loc[:to_search] - needle = as_column(to_search, dtype=self._index.dtype) + needle = as_column(to_search, dtype=self.index.dtype) end_point = int( - self._index._column.searchsorted( + self.index._column.searchsorted( needle, side=side ).element_indexing(0) ) @@ -4802,7 +4800,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): name: (col, None, False, None) for name, col in self._data.items() } - index = self._index + index = self.index data = self._apply_cupy_ufunc_to_operands( ufunc, cupy_func, inputs, **kwargs @@ -4880,7 +4878,7 @@ def repeat(self, repeats, axis=None): """ res = self._from_columns_like_self( Frame._repeat( - [*self._index._data.columns, *self._columns], repeats, axis + [*self.index._data.columns, *self._columns], repeats, axis ), self._column_names, self._index_names, @@ -5011,7 +5009,7 @@ def astype( raise e return self - return self._from_data(data, index=self._index) + return self._from_data(data, index=self.index) @_cudf_nvtx_annotate def drop( @@ -5220,8 +5218,7 @@ def drop( columns = _get_host_unique(columns) _drop_columns(dropped, columns, errors) - out._data = dropped._data - out._index = dropped._index + out._mimic_inplace(dropped, inplace=True) if not inplace: return out @@ -5234,18 +5231,18 @@ def _explode(self, explode_column: Any, ignore_index: bool): # exploded and will be replaced with a `RangeIndex`. 
if not isinstance(self._data[explode_column].dtype, ListDtype): data = self._data.copy(deep=True) - idx = None if ignore_index else self._index.copy(deep=True) + idx = None if ignore_index else self.index.copy(deep=True) return self.__class__._from_data(data, index=idx) column_index = self._column_names.index(explode_column) - if not ignore_index and self._index is not None: - index_offset = self._index.nlevels + if not ignore_index and self.index is not None: + index_offset = self.index.nlevels else: index_offset = 0 exploded = libcudf.lists.explode_outer( [ - *(self._index._data.columns if not ignore_index else ()), + *(self.index._data.columns if not ignore_index else ()), *self._columns, ], column_index + index_offset, @@ -5292,7 +5289,7 @@ def tile(self, count): """ return self._from_columns_like_self( libcudf.reshape.tile( - [*self._index._columns, *self._columns], count + [*self.index._columns, *self._columns], count ), column_names=self._column_names, index_names=self._index_names, @@ -6273,7 +6270,7 @@ def rank( return self.__class__._from_data( dict(zip(source._column_names, result_columns)), - index=source._index, + index=source.index, ).astype(np.float64) def convert_dtypes( @@ -6505,7 +6502,7 @@ def _is_series(obj): Checks if the `obj` is of type `cudf.Series` instead of checking for isinstance(obj, cudf.Series) """ - return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None + return isinstance(obj, Frame) and obj.ndim == 1 and obj.index is not None @_cudf_nvtx_annotate @@ -6518,7 +6515,7 @@ def _drop_rows_by_labels( """Remove rows specified by `labels`. If `errors="raise"`, an error is raised if some items in `labels` do not - exist in `obj._index`. + exist in `obj.index`. Will raise if level(int) is greater or equal to index nlevels. """ @@ -6539,17 +6536,17 @@ def _drop_rows_by_labels( if isinstance(level, int): ilevel = level else: - ilevel = obj._index.names.index(level) + ilevel = obj.index.names.index(level) # 1. 
Merge Index df and data df along column axis: - # | id | ._index df | data column(s) | - idx_nlv = obj._index.nlevels - working_df = obj._index.to_frame(index=False) + # | id | .index df | data column(s) | + idx_nlv = obj.index.nlevels + working_df = obj.index.to_frame(index=False) working_df.columns = list(range(idx_nlv)) for i, col in enumerate(obj._data): working_df[idx_nlv + i] = obj._data[col] # 2. Set `level` as common index: - # | level | ._index df w/o level | data column(s) | + # | level | .index df w/o level | data column(s) | working_df = working_df.set_index(level) # 3. Use "leftanti" join to drop @@ -6560,11 +6557,11 @@ def _drop_rows_by_labels( # 4. Reconstruct original layout, and rename join_res._insert( - ilevel, name=join_res._index.name, value=join_res._index + ilevel, name=join_res.index.name, value=join_res.index ) midx = cudf.MultiIndex.from_frame( - join_res.iloc[:, 0:idx_nlv], names=obj._index.names + join_res.iloc[:, 0:idx_nlv], names=obj.index.names ) if isinstance(obj, cudf.Series): @@ -6596,7 +6593,7 @@ def _drop_rows_by_labels( # Join changes the index to common type, # but we need to preserve the type of # index being returned, Hence this type-cast. 
- res._index = res.index.astype(obj.index.dtype) + res.index = res.index.astype(obj.index.dtype) return res diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index 6a619945e75..05cbb4429b9 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -43,10 +43,10 @@ def set(self, obj: cudf.DataFrame, value: ColumnBase, validate=False): class _IndexIndexer(_Indexer): def get(self, obj: cudf.DataFrame) -> ColumnBase: - return obj._index._data[self.name] + return obj.index._data[self.name] def set(self, obj: cudf.DataFrame, value: ColumnBase, validate=False): - obj._index._data.set_by_label(self.name, value, validate=validate) + obj.index._data.set_by_label(self.name, value, validate=validate) def _match_join_keys( diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 1ef2915bc59..da999441ca3 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -373,10 +373,10 @@ def _merge_results( index: Optional[cudf.BaseIndex] if self._using_right_index: # right_index and left_on - index = left_result._index + index = left_result.index elif self._using_left_index: # left_index and right_on - index = right_result._index + index = right_result.index else: index = None @@ -400,7 +400,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: # producing the input result. 
by: List[Any] = [] if self._using_left_index and self._using_right_index: - by.extend(result._index._data.columns) + by.extend(result.index._data.columns) if not self._using_left_index: by.extend([result._data[col.name] for col in self._left_keys]) if not self._using_right_index: @@ -408,8 +408,8 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame: if by: keep_index = self._using_left_index or self._using_right_index if keep_index: - to_sort = [*result._index._columns, *result._columns] - index_names = result._index.names + to_sort = [*result.index._columns, *result._columns] + index_names = result.index.names else: to_sort = [*result._columns] index_names = None @@ -547,4 +547,4 @@ class MergeSemi(Merge): def _merge_results(self, lhs: cudf.DataFrame, rhs: cudf.DataFrame): # semi-join result includes only lhs columns - return lhs._data, lhs._index + return lhs._data, lhs.index diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 0b44ab58f30..d4772d5b4c2 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -836,7 +836,7 @@ def get_dummies( dtype=dtype, ) result_data.update(col_enc_data) - return cudf.DataFrame._from_data(result_data, index=df._index) + return cudf.DataFrame._from_data(result_data, index=df.index) else: ser = cudf.Series(df) unique = _get_unique(column=ser._column, dummy_na=dummy_na) @@ -847,7 +847,7 @@ def get_dummies( prefix_sep=prefix_sep, dtype=dtype, ) - return cudf.DataFrame._from_data(data, index=ser._index) + return cudf.DataFrame._from_data(data, index=ser.index) def _merge_sorted( @@ -899,7 +899,7 @@ def _merge_sorted( raise ValueError("`by_index` and `ignore_index` cannot both be True") if by_index: - key_columns_indices = list(range(0, objs[0]._index.nlevels)) + key_columns_indices = list(range(0, objs[0].index.nlevels)) else: if keys is None: key_columns_indices = list(range(0, objs[0]._num_columns)) @@ -909,12 +909,12 @@ def _merge_sorted( ] if not 
ignore_index: key_columns_indices = [ - idx + objs[0]._index.nlevels for idx in key_columns_indices + idx + objs[0].index.nlevels for idx in key_columns_indices ] columns = [ [ - *(obj._index._data.columns if not ignore_index else ()), + *(obj.index._data.columns if not ignore_index else ()), *obj._columns, ] for obj in objs diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index c7bc97edd68..41fbf269699 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -296,7 +296,7 @@ def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]: result = self._frame.index._get_row_major(self._frame, row_arg) if ( isinstance(arg, tuple) - and len(arg) == self._frame._index.nlevels + and len(arg) == self._frame.index.nlevels and not any(isinstance(x, slice) for x in arg) ): result = result.iloc[0] @@ -318,7 +318,7 @@ def __setitem__(self, key, value): and not isinstance(self._frame.index, cudf.MultiIndex) and is_scalar(value) ): - idx = self._frame._index + idx = self._frame.index if isinstance(idx, cudf.RangeIndex): if isinstance(key, int) and (key == idx[-1] + idx.step): idx_copy = cudf.RangeIndex( @@ -682,7 +682,7 @@ def _from_data( @_cudf_nvtx_annotate def __contains__(self, item): - return item in self._index + return item in self.index @classmethod @_cudf_nvtx_annotate @@ -832,7 +832,7 @@ def hasnans(self): def serialize(self): header, frames = super().serialize() - header["index"], index_frames = self._index.serialize() + header["index"], index_frames = self.index.serialize() header["index_frame_count"] = len(index_frames) # For backwards compatibility with older versions of cuDF, index # columns are placed before data columns. 
@@ -850,7 +850,7 @@ def deserialize(cls, header, frames): idx_typ = pickle.loads(header["index"]["type-serialized"]) index = idx_typ.deserialize(header["index"], frames[:index_nframes]) - obj._index = index + obj.index = index return obj @@ -995,7 +995,7 @@ def reindex(self, *args, **kwargs): "'index' passed as both positional and keyword argument" ) else: - index = kwargs.get("index", self._index) + index = kwargs.get("index", self.index) name = self.name or 0 series = self._reindex( @@ -1140,7 +1140,7 @@ def to_frame(self, name=None): @_cudf_nvtx_annotate def memory_usage(self, index=True, deep=False): return self._column.memory_usage + ( - self._index.memory_usage() if index else 0 + self.index.memory_usage() if index else 0 ) @_cudf_nvtx_annotate @@ -1506,7 +1506,7 @@ def _make_operands_and_index_for_binop( can_use_self_column_name = False operands = lhs._make_operands_for_binop(other, fill_value, reflect) - return operands, lhs._index, can_use_self_column_name + return operands, lhs.index, can_use_self_column_name @copy_docstring(CategoricalAccessor) # type: ignore @property @@ -1917,7 +1917,7 @@ def between(self, left, right, inclusive="both") -> Series: "Inclusive has to be either string of 'both', " "'left', 'right', or 'neither'." 
) - return self._from_data({self.name: lmask & rmask}, self._index) + return self._from_data({self.name: lmask & rmask}, self.index) @_cudf_nvtx_annotate def all(self, axis=0, bool_only=None, skipna=True, **kwargs): @@ -3119,7 +3119,7 @@ def value_counts( # TODO: Remove this workaround once `observed` # parameter support is added to `groupby` res = res.reindex(self.dtype.categories).fillna(0) - res._index = res._index.astype(self.dtype) + res.index = res.index.astype(self.dtype) res.index.name = self.name @@ -3927,7 +3927,7 @@ def microsecond(self): * cudf.Scalar(1000, dtype="int32") ) + self.series._column.get_dt_field("microsecond"), - index=self.series._index, + index=self.series.index, name=self.series.name, ) @@ -4161,7 +4161,7 @@ def is_leap_year(self): res = libcudf.datetime.is_leap_year(self.series._column).fillna(False) return Series._from_data( ColumnAccessor({None: res}), - index=self.series._index, + index=self.series.index, name=self.series.name, ) @@ -4195,7 +4195,7 @@ def quarter(self): ) return Series._from_data( {None: res}, - index=self.series._index, + index=self.series.index, name=self.series.name, ) @@ -4299,7 +4299,7 @@ def days_in_month(self): res = libcudf.datetime.days_in_month(self.series._column) return Series._from_data( ColumnAccessor({None: res}), - index=self.series._index, + index=self.series.index, name=self.series.name, ) @@ -4345,7 +4345,7 @@ def is_month_end(self): last_day = libcudf.datetime.last_day_of_month(self.series._column) last_day = Series._from_data( ColumnAccessor({None: last_day}), - index=self.series._index, + index=self.series.index, name=self.series.name, ) return (self.day == last_day.dt.day).fillna(False) @@ -4395,7 +4395,7 @@ def is_quarter_start(self): result = ((day == cudf.Scalar(1)) & first_month).fillna(False) return Series._from_data( {None: result}, - index=self.series._index, + index=self.series.index, name=self.series.name, ) @@ -4446,7 +4446,7 @@ def is_quarter_end(self): result = ((day == last_day) & 
last_month).fillna(False) return Series._from_data( {None: result}, - index=self.series._index, + index=self.series.index, name=self.series.name, ) @@ -4481,7 +4481,7 @@ def is_year_start(self): ) == cudf.Scalar(1) return Series._from_data( {None: outcol.fillna(False)}, - index=self.series._index, + index=self.series.index, name=self.series.name, ) @@ -4520,7 +4520,7 @@ def is_year_end(self): result = result.fillna(False) return Series._from_data( {None: result}, - index=self.series._index, + index=self.series.index, name=self.series.name, ) @@ -4528,7 +4528,7 @@ def is_year_end(self): def _get_dt_field(self, field): out_column = self.series._column.get_dt_field(field) return Series( - data=out_column, index=self.series._index, name=self.series.name + data=out_column, index=self.series.index, name=self.series.name ) @_cudf_nvtx_annotate @@ -4565,7 +4565,7 @@ def ceil(self, freq): out_column = self.series._column.ceil(freq) return Series._from_data( - data={self.series.name: out_column}, index=self.series._index + data={self.series.name: out_column}, index=self.series.index ) @_cudf_nvtx_annotate @@ -4602,7 +4602,7 @@ def floor(self, freq): out_column = self.series._column.floor(freq) return Series._from_data( - data={self.series.name: out_column}, index=self.series._index + data={self.series.name: out_column}, index=self.series.index ) @_cudf_nvtx_annotate @@ -4642,7 +4642,7 @@ def round(self, freq): out_column = self.series._column.round(freq) return Series._from_data( - data={self.series.name: out_column}, index=self.series._index + data={self.series.name: out_column}, index=self.series.index ) @_cudf_nvtx_annotate @@ -4724,7 +4724,7 @@ def strftime(self, date_format, *args, **kwargs): dtype="str", format=date_format ) return Series( - data=str_col, index=self.series._index, name=self.series.name + data=str_col, index=self.series.index, name=self.series.name ) @copy_docstring(DatetimeIndex.tz_localize) @@ -4739,7 +4739,7 @@ def tz_localize( ) return 
Series._from_data( data={self.series.name: result_col}, - index=self.series._index, + index=self.series.index, ) @copy_docstring(DatetimeIndex.tz_convert) @@ -4755,7 +4755,7 @@ def tz_convert(self, tz: str | None): """ result_col = self.series._column.tz_convert(tz) return Series._from_data( - {self.series.name: result_col}, index=self.series._index + {self.series.name: result_col}, index=self.series.index ) @@ -4993,13 +4993,13 @@ def components(self): 3 0 0 35 35 656 0 0 4 37 13 12 14 234 0 0 """ # noqa: E501 - return self.series._column.components(index=self.series._index) + return self.series._column.components(index=self.series.index) @_cudf_nvtx_annotate def _get_td_field(self, field): out_column = getattr(self.series._column, field) return Series( - data=out_column, index=self.series._index, name=self.series.name + data=out_column, index=self.series.index, name=self.series.name ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 8b18e53d320..24b55062be8 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4216,6 +4216,7 @@ def test_empty_dataframe_any(axis): ) @pytest.mark.parametrize("non_list_data", [123, "abc", "zyx", "rapids", 0.8]) def test_create_dataframe_cols_empty_data(a, b, misc_data, non_list_data): + breakpoint() expected = pd.DataFrame({"a": a}) actual = cudf.DataFrame.from_pandas(expected) expected["b"] = b From bd3f3973ee6bfdccd18479621c38d30c0d352e07 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 17 May 2024 16:35:47 -0700 Subject: [PATCH 2/4] Go back to setting private ._index --- python/cudf/cudf/core/column_accessor.py | 1 - python/cudf/cudf/core/dataframe.py | 19 ++++++++----------- python/cudf/cudf/tests/test_dataframe.py | 1 - 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index 
86ff6077fd4..9f3de061ee8 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -346,7 +346,6 @@ def insert( # TODO: we should move all insert logic here if name in self._data: raise ValueError(f"Cannot insert '{name}', already exists") - breakpoint() if loc == old_ncols: if validate: value = column.as_column(value) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 41d6c2defc7..cf30c042026 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3275,23 +3275,20 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if not ignore_index: self.index = as_index(value.index) elif (length := len(value)) > 0: - breakpoint() if num_cols != 0: - ca_data = ColumnAccessor( - { - col_name: column.column_empty_like( + ca = self._data._from_columns_like_self( + ( + column.column_empty_like( col_data, masked=True, newsize=length ) - for col_name, col_data in self._data.items() - }, + for col_data in self._data.values() + ), verify=False, ) else: - ca_data = {} - # TODO: Clear self._data cache here - self._mimic_inplace( - self._from_data(ca_data, index=RangeIndex(length)) - ) + ca = ColumnAccessor({}) + self._data = ca + self._index = RangeIndex(length) elif isinstance(value, (pd.Series, Series)): value = Series(value, nan_as_null=nan_as_null) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 24b55062be8..8b18e53d320 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4216,7 +4216,6 @@ def test_empty_dataframe_any(axis): ) @pytest.mark.parametrize("non_list_data", [123, "abc", "zyx", "rapids", 0.8]) def test_create_dataframe_cols_empty_data(a, b, misc_data, non_list_data): - breakpoint() expected = pd.DataFrame({"a": a}) actual = cudf.DataFrame.from_pandas(expected) expected["b"] = b From 53f345722ba5f99218afb735839a2c2540aecc87 
Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 17 May 2024 17:42:50 -0700 Subject: [PATCH 3/4] Address last failures --- python/cudf/cudf/core/dataframe.py | 33 +++++++++++++++--------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index cf30c042026..bde8dd5e3e7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1396,23 +1396,23 @@ def __setitem__(self, arg, value): else: if arg in self._data: if not is_scalar(value) and len(self) == 0: + value = column.as_column(value) + length = len(value) + new_columns = ( + value + if key == arg + else column.column_empty_like( + col, masked=True, newsize=length + ) + for key, col in self._data.items() + ) + self._data = self._data._from_columns_like_self( + new_columns, verify=False + ) if isinstance(value, (pd.Series, Series)): - self.index = as_index(value.index) + self._index = as_index(value.index) elif len(value) > 0: - self.index = RangeIndex(start=0, stop=len(value)) - value = column.as_column(value) - new_data = self._data.__class__() - for key in self._data: - if key == arg: - new_data[key] = value - else: - new_data[key] = column.column_empty_like( - self._data[key], - masked=True, - newsize=len(value), - ) - - self._data = new_data + self._index = RangeIndex(length) return elif isinstance(value, (pd.Series, Series)): value = Series(value)._align_to_index( @@ -5679,7 +5679,8 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): } elif data.ndim == 1: ca_data = { - names[0]: column.as_column(data, nan_as_null=nan_as_null) + name: column.as_column(data[name], nan_as_null=nan_as_null) + for name in names } if not is_scalar(index): From b826e6ae92f776d3fde47e7d75cf05ea5d95c789 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 21 May 2024 17:35:45 -0700 
Subject: [PATCH 4/4] test passing now --- python/cudf/cudf/tests/test_dlpack.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index 6e34817c4fd..aafe920d3a1 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2022, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import itertools from contextlib import ExitStack as does_not_raise @@ -201,12 +201,7 @@ def test_to_dlpack_mixed_dtypes(): "shape", [ (0, 3), - pytest.param( - (3, 0), - marks=pytest.mark.xfail( - reason="Index information not available via from_dlpack" - ), - ), + (3, 0), (0, 0), ], )