diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index d25092a7526..0116ee858ab 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -298,7 +298,7 @@ cpdef write_csv( if col_name not in cols: raise NameError('column {!r} does not exist in DataFrame' .format(col_name)) - col = cols[col_name]._column + col = cols[col_name] check_gdf_compatibility(col) # Workaround for string columns if col.dtype.type == np.object_: @@ -308,7 +308,6 @@ cpdef write_csv( list_cols.push_back(c_col) else: for idx, (col_name, col) in enumerate(cols.items()): - col = col._column check_gdf_compatibility(col) # Workaround for string columns if col.dtype.type == np.object_: diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0e2a05d45ee..9fc3e1b1853 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -176,7 +176,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): ) else: if is_list_like(data): - if len(data) > 0 and not is_list_like(data[0]): + if len(data) > 0 and is_scalar(data[0]): data = [data] self._init_from_list_like(data, index=index, columns=columns) @@ -201,9 +201,12 @@ def _init_from_list_like(self, data, index=None, columns=None): data = list(itertools.zip_longest(*data)) if columns is None: columns = range(len(data)) + for col_name, col in enumerate(data): self._cols[col_name] = column.as_column(col) + self.columns = columns + def _init_from_dict_like(self, data, index=None, columns=None): data = data.copy() num_rows = 0 @@ -425,7 +428,7 @@ def __getitem__(self, arg): s = cudf.Series(self._cols[arg], name=arg, index=self.index) return s elif isinstance(arg, slice): - df = DataFrame() + df = DataFrame(index=self.index[arg]) for k, col in self._cols.items(): df[k] = col[arg] return df @@ -449,6 +452,7 @@ def __getitem__(self, arg): return df for col in arg: df[col] = self[col] + df.index = self.index return df elif isinstance(arg, DataFrame): return self.mask(arg) @@ -1498,6 +1502,10 @@ def insert(self, loc, name, value, forceindex=False): if is_scalar(value): value = utils.scalar_broadcast_to(value, len(self)) + if isinstance(value, (pd.Series, Series)): + if len(self) == 0: + self._index = as_index(value.index) + value = column.as_column(value) if len(self.index) == 0: @@ -1653,9 +1661,9 @@ def drop_duplicates(self, subset=None, keep="first", inplace=False): if isinstance(self.index, cudf.core.multiindex.MultiIndex): new_index = self.index.take(new_index) if inplace: - self.index = new_index + self._index = new_index for k, new_col in zip(self._cols, out_cols): - self[k] = Series(new_col, new_index) + self[k] = new_col else: outdf = DataFrame() for k, new_col in zip(self._cols, out_cols): @@ -3291,7 +3299,10 @@ def to_pandas(self): out_columns.names = self.columns.names else: out_columns.name = self.columns.name - return pd.DataFrame(out_data, index=out_index, columns=out_columns) + out_df = pd.DataFrame(out_data, index=out_index) + if out_columns is not None: + out_df.columns = out_columns + return out_df @classmethod def from_pandas(cls, dataframe, nan_as_null=True): diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 4abedb7fba8..78541c730fb 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -153,6 +153,7 @@ def __getattr__(self, key): raise AttributeError() if key in self._df.columns: by_list = [] + for by_name, by in zip( self._groupby.key_names, self._groupby.key_columns ): @@ -446,10 +447,10 @@ def construct_result(self, out_key_columns, out_value_columns): index = self.compute_result_index(out_key_columns, out_value_columns) if len(result) == 0 and len(index) != 0: - # len(result) must be len(index) for - # ``result.index = index`` to work: - result._size = len(index) - result.index = index + # Can't go through the setter in this case + result._index = index + else: + result.index = index if isinstance(self.obj, cudf.Series): # May need to downcast from DataFrame to Series: diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py index faa88ba9447..e769872c905 100755 --- a/python/cudf/cudf/core/indexing.py +++ b/python/cudf/cudf/core/indexing.py @@ -203,9 +203,10 @@ def _getitem_tuple_arg(self, arg): columns_df = self._df.columns._get_column_major(self._df, arg[1]) else: columns = self._get_column_selection(arg[1]) - columns_df = DataFrame() + columns_df = DataFrame(index=self._df.index) for i, col in enumerate(columns): columns_df.insert(i, col, self._df[col]) + # Step 2: Gather rows if isinstance(columns_df.index, MultiIndex): return columns_df.index._get_row_major(columns_df, arg[0]) @@ -221,6 +222,7 @@ def _getitem_tuple_arg(self, arg): df = DataFrame() for col in columns_df.columns: df[col] = columns_df[col].loc[arg[0]] + # Step 3: Gather index if df.shape[0] == 1: # we have a single row if isinstance(arg[0], slice): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index fcfe46c6e58..c8e0c71a444 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -127,9 +127,8 @@ def __init__( if index is not None and not isinstance(index, Index): index = as_index(index) + assert isinstance(data, column.ColumnBase) - if name is None: - name = data.name super().__init__([data], [name]) self._index = RangeIndex(len(data)) if index is None else index @@ -2597,7 +2596,9 @@ def weekday(self): def get_dt_field(self, field): out_column = self.series._column.get_dt_field(field) - return Series(data=out_column, index=self.series._index) + return Series( + data=out_column, index=self.series._index, name=self.series.name + ) def _align_indices(lhs, rhs, join="outer"): diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index e3ed62db858..92f073cb8b7 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -68,7 +68,7 @@ def to_dlpack(cudf_obj): raise ValueError("Cannot create DLPack tensor of 0 size") if isinstance(cudf_obj, DataFrame): - gdf_cols = [col[1]._column for col in cudf_obj._cols.items()] + gdf_cols = list(cudf_obj._cols.values()) elif isinstance(cudf_obj, Series): gdf_cols = [cudf_obj._column] elif isinstance(cudf_obj, Index): diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 6d09c887bb3..fe560706cc7 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -129,7 +129,7 @@ def is_scalar(val): or isinstance(val, numbers.Number) or np.isscalar(val) or isinstance(val, pd.Timestamp) - or isinstance(val, pd.Categorical) + or (isinstance(val, pd.Categorical) and len(val) == 1) )