Skip to content

Commit

Permalink
Fixes to csv, dlpack and groupby to support DataFrame._cols
Browse files (browse the repository at this point in the history)
  • Loading branch information
shwina committed Dec 4, 2019
1 parent e2c1cd6 commit 6251122
Show file tree
Hide file tree
Showing 7 changed files with 31 additions and 17 deletions.
3 changes: 1 addition & 2 deletions python/cudf/cudf/_lib/csv.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,7 @@ cpdef write_csv(
if col_name not in cols:
raise NameError('column {!r} does not exist in DataFrame'
.format(col_name))
col = cols[col_name]._column
col = cols[col_name]
check_gdf_compatibility(col)
# Workaround for string columns
if col.dtype.type == np.object_:
Expand All @@ -308,7 +308,6 @@ cpdef write_csv(
list_cols.push_back(c_col)
else:
for idx, (col_name, col) in enumerate(cols.items()):
col = col._column
check_gdf_compatibility(col)
# Workaround for string columns
if col.dtype.type == np.object_:
Expand Down
21 changes: 16 additions & 5 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
)
else:
if is_list_like(data):
if len(data) > 0 and not is_list_like(data[0]):
if len(data) > 0 and is_scalar(data[0]):
data = [data]
self._init_from_list_like(data, index=index, columns=columns)

Expand All @@ -201,9 +201,12 @@ def _init_from_list_like(self, data, index=None, columns=None):
data = list(itertools.zip_longest(*data))
if columns is None:
columns = range(len(data))

for col_name, col in enumerate(data):
self._cols[col_name] = column.as_column(col)

self.columns = columns

def _init_from_dict_like(self, data, index=None, columns=None):
data = data.copy()
num_rows = 0
Expand Down Expand Up @@ -425,7 +428,7 @@ def __getitem__(self, arg):
s = cudf.Series(self._cols[arg], name=arg, index=self.index)
return s
elif isinstance(arg, slice):
df = DataFrame()
df = DataFrame(index=self.index[arg])
for k, col in self._cols.items():
df[k] = col[arg]
return df
Expand All @@ -449,6 +452,7 @@ def __getitem__(self, arg):
return df
for col in arg:
df[col] = self[col]
df.index = self.index
return df
elif isinstance(arg, DataFrame):
return self.mask(arg)
Expand Down Expand Up @@ -1498,6 +1502,10 @@ def insert(self, loc, name, value, forceindex=False):
if is_scalar(value):
value = utils.scalar_broadcast_to(value, len(self))

if isinstance(value, (pd.Series, Series)):
if len(self) == 0:
self._index = as_index(value.index)

value = column.as_column(value)

if len(self.index) == 0:
Expand Down Expand Up @@ -1653,9 +1661,9 @@ def drop_duplicates(self, subset=None, keep="first", inplace=False):
if isinstance(self.index, cudf.core.multiindex.MultiIndex):
new_index = self.index.take(new_index)
if inplace:
self.index = new_index
self._index = new_index
for k, new_col in zip(self._cols, out_cols):
self[k] = Series(new_col, new_index)
self[k] = new_col
else:
outdf = DataFrame()
for k, new_col in zip(self._cols, out_cols):
Expand Down Expand Up @@ -3291,7 +3299,10 @@ def to_pandas(self):
out_columns.names = self.columns.names
else:
out_columns.name = self.columns.name
return pd.DataFrame(out_data, index=out_index, columns=out_columns)
out_df = pd.DataFrame(out_data, index=out_index)
if out_columns is not None:
out_df.columns = out_columns
return out_df

@classmethod
def from_pandas(cls, dataframe, nan_as_null=True):
Expand Down
9 changes: 5 additions & 4 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ def __getattr__(self, key):
raise AttributeError()
if key in self._df.columns:
by_list = []

for by_name, by in zip(
self._groupby.key_names, self._groupby.key_columns
):
Expand Down Expand Up @@ -446,10 +447,10 @@ def construct_result(self, out_key_columns, out_value_columns):

index = self.compute_result_index(out_key_columns, out_value_columns)
if len(result) == 0 and len(index) != 0:
# len(result) must be len(index) for
# ``result.index = index`` to work:
result._size = len(index)
result.index = index
# Can't go through the setter in this case
result._index = index
else:
result.index = index

if isinstance(self.obj, cudf.Series):
# May need to downcast from DataFrame to Series:
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,9 +203,10 @@ def _getitem_tuple_arg(self, arg):
columns_df = self._df.columns._get_column_major(self._df, arg[1])
else:
columns = self._get_column_selection(arg[1])
columns_df = DataFrame()
columns_df = DataFrame(index=self._df.index)
for i, col in enumerate(columns):
columns_df.insert(i, col, self._df[col])

# Step 2: Gather rows
if isinstance(columns_df.index, MultiIndex):
return columns_df.index._get_row_major(columns_df, arg[0])
Expand All @@ -221,6 +222,7 @@ def _getitem_tuple_arg(self, arg):
df = DataFrame()
for col in columns_df.columns:
df[col] = columns_df[col].loc[arg[0]]

# Step 3: Gather index
if df.shape[0] == 1: # we have a single row
if isinstance(arg[0], slice):
Expand Down
7 changes: 4 additions & 3 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,9 +127,8 @@ def __init__(

if index is not None and not isinstance(index, Index):
index = as_index(index)

assert isinstance(data, column.ColumnBase)
if name is None:
name = data.name

super().__init__([data], [name])
self._index = RangeIndex(len(data)) if index is None else index
Expand Down Expand Up @@ -2597,7 +2596,9 @@ def weekday(self):

def get_dt_field(self, field):
out_column = self.series._column.get_dt_field(field)
return Series(data=out_column, index=self.series._index)
return Series(
data=out_column, index=self.series._index, name=self.series.name
)


def _align_indices(lhs, rhs, join="outer"):
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/io/dlpack.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def to_dlpack(cudf_obj):
raise ValueError("Cannot create DLPack tensor of 0 size")

if isinstance(cudf_obj, DataFrame):
gdf_cols = [col[1]._column for col in cudf_obj._cols.items()]
gdf_cols = list(cudf_obj._cols.values())
elif isinstance(cudf_obj, Series):
gdf_cols = [cudf_obj._column]
elif isinstance(cudf_obj, Index):
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/utils/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ def is_scalar(val):
or isinstance(val, numbers.Number)
or np.isscalar(val)
or isinstance(val, pd.Timestamp)
or isinstance(val, pd.Categorical)
or (isinstance(val, pd.Categorical) and len(val) == 1)
)


Expand Down

0 comments on commit 6251122

Please sign in to comment.