Fixes to csv, dlpack and groupby to support DataFrame._cols

vyasr · Dec 4, 2019 · 6251122
1 parent e2c1cd6
commit 6251122
Show file tree

Hide file tree

Showing 7 changed files with 31 additions and 17 deletions.
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
@@ -298,7 +298,7 @@ cpdef write_csv(
             if col_name not in cols:
                 raise NameError('column {!r} does not exist in DataFrame'
                                 .format(col_name))
-            col = cols[col_name]._column
+            col = cols[col_name]
             check_gdf_compatibility(col)
             # Workaround for string columns
             if col.dtype.type == np.object_:
@@ -308,7 +308,6 @@ cpdef write_csv(
             list_cols.push_back(c_col)
     else:
         for idx, (col_name, col) in enumerate(cols.items()):
-            col = col._column
             check_gdf_compatibility(col)
             # Workaround for string columns
             if col.dtype.type == np.object_:

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -176,7 +176,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None):
                 )
         else:
             if is_list_like(data):
-                if len(data) > 0 and not is_list_like(data[0]):
+                if len(data) > 0 and is_scalar(data[0]):
                     data = [data]
                 self._init_from_list_like(data, index=index, columns=columns)
 
@@ -201,9 +201,12 @@ def _init_from_list_like(self, data, index=None, columns=None):
         data = list(itertools.zip_longest(*data))
         if columns is None:
             columns = range(len(data))
+
         for col_name, col in enumerate(data):
             self._cols[col_name] = column.as_column(col)
 
+        self.columns = columns
+
     def _init_from_dict_like(self, data, index=None, columns=None):
         data = data.copy()
         num_rows = 0
@@ -425,7 +428,7 @@ def __getitem__(self, arg):
             s = cudf.Series(self._cols[arg], name=arg, index=self.index)
             return s
         elif isinstance(arg, slice):
-            df = DataFrame()
+            df = DataFrame(index=self.index[arg])
             for k, col in self._cols.items():
                 df[k] = col[arg]
             return df
@@ -449,6 +452,7 @@ def __getitem__(self, arg):
                     return df
                 for col in arg:
                     df[col] = self[col]
+                df.index = self.index
             return df
         elif isinstance(arg, DataFrame):
             return self.mask(arg)
@@ -1498,6 +1502,10 @@ def insert(self, loc, name, value, forceindex=False):
         if is_scalar(value):
             value = utils.scalar_broadcast_to(value, len(self))
 
+        if isinstance(value, (pd.Series, Series)):
+            if len(self) == 0:
+                self._index = as_index(value.index)
+
         value = column.as_column(value)
 
         if len(self.index) == 0:
@@ -1653,9 +1661,9 @@ def drop_duplicates(self, subset=None, keep="first", inplace=False):
         if isinstance(self.index, cudf.core.multiindex.MultiIndex):
             new_index = self.index.take(new_index)
         if inplace:
-            self.index = new_index
+            self._index = new_index
             for k, new_col in zip(self._cols, out_cols):
-                self[k] = Series(new_col, new_index)
+                self[k] = new_col
         else:
             outdf = DataFrame()
             for k, new_col in zip(self._cols, out_cols):
@@ -3291,7 +3299,10 @@ def to_pandas(self):
                     out_columns.names = self.columns.names
             else:
                 out_columns.name = self.columns.name
-        return pd.DataFrame(out_data, index=out_index, columns=out_columns)
+        out_df = pd.DataFrame(out_data, index=out_index)
+        if out_columns is not None:
+            out_df.columns = out_columns
+        return out_df
 
     @classmethod
     def from_pandas(cls, dataframe, nan_as_null=True):

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
@@ -153,6 +153,7 @@ def __getattr__(self, key):
             raise AttributeError()
         if key in self._df.columns:
             by_list = []
+
             for by_name, by in zip(
                 self._groupby.key_names, self._groupby.key_columns
             ):
@@ -446,10 +447,10 @@ def construct_result(self, out_key_columns, out_value_columns):
 
         index = self.compute_result_index(out_key_columns, out_value_columns)
         if len(result) == 0 and len(index) != 0:
-            # len(result) must be len(index) for
-            # ``result.index = index`` to work:
-            result._size = len(index)
-        result.index = index
+            # Can't use the setter: it requires len(result) == len(index)
+            result._index = index
+        else:
+            result.index = index
 
         if isinstance(self.obj, cudf.Series):
             # May need to downcast from DataFrame to Series:

diff --git a/python/cudf/cudf/core/indexing.py b/python/cudf/cudf/core/indexing.py
@@ -203,9 +203,10 @@ def _getitem_tuple_arg(self, arg):
             columns_df = self._df.columns._get_column_major(self._df, arg[1])
         else:
             columns = self._get_column_selection(arg[1])
-            columns_df = DataFrame()
+            columns_df = DataFrame(index=self._df.index)
             for i, col in enumerate(columns):
                 columns_df.insert(i, col, self._df[col])
+
         # Step 2: Gather rows
         if isinstance(columns_df.index, MultiIndex):
             return columns_df.index._get_row_major(columns_df, arg[0])
@@ -221,6 +222,7 @@ def _getitem_tuple_arg(self, arg):
                 df = DataFrame()
                 for col in columns_df.columns:
                     df[col] = columns_df[col].loc[arg[0]]
+
         # Step 3: Gather index
         if df.shape[0] == 1:  # we have a single row
             if isinstance(arg[0], slice):

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -127,9 +127,8 @@ def __init__(
 
         if index is not None and not isinstance(index, Index):
             index = as_index(index)
+
         assert isinstance(data, column.ColumnBase)
-        if name is None:
-            name = data.name
 
         super().__init__([data], [name])
         self._index = RangeIndex(len(data)) if index is None else index
@@ -2597,7 +2596,9 @@ def weekday(self):
 
     def get_dt_field(self, field):
         out_column = self.series._column.get_dt_field(field)
-        return Series(data=out_column, index=self.series._index)
+        return Series(
+            data=out_column, index=self.series._index, name=self.series.name
+        )
 
 
 def _align_indices(lhs, rhs, join="outer"):

diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py
@@ -68,7 +68,7 @@ def to_dlpack(cudf_obj):
         raise ValueError("Cannot create DLPack tensor of 0 size")
 
     if isinstance(cudf_obj, DataFrame):
-        gdf_cols = [col[1]._column for col in cudf_obj._cols.items()]
+        gdf_cols = list(cudf_obj._cols.values())
     elif isinstance(cudf_obj, Series):
         gdf_cols = [cudf_obj._column]
     elif isinstance(cudf_obj, Index):

diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
@@ -129,7 +129,7 @@ def is_scalar(val):
         or isinstance(val, numbers.Number)
         or np.isscalar(val)
         or isinstance(val, pd.Timestamp)
-        or isinstance(val, pd.Categorical)
+        or (isinstance(val, pd.Categorical) and len(val) == 1)
     )