From d08fba7adabc846358d5c14f72867506b16b3f25 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 27 Nov 2023 19:06:30 -0800 Subject: [PATCH 01/22] Start refactoring DataFrame init --- python/cudf/cudf/core/dataframe.py | 375 ++++++++--------------- python/cudf/cudf/tests/test_dataframe.py | 5 + 2 files changed, 127 insertions(+), 253 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 785f3d98712..47ac856ef86 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -666,38 +666,26 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): def __init__( self, data=None, index=None, columns=None, dtype=None, nan_as_null=True ): - super().__init__() + if columns is not None: + columns = as_index(columns).to_pandas() - if isinstance(columns, (Series, cudf.BaseIndex)): - columns = columns.to_pandas() + if index is not None: + index = as_index(index) + + if data is None: + data = [] + elif isinstance(data, Iterator) and not isinstance(data, str): + data = list(data) + + index_from_data = None + columns_from_data = None if isinstance(data, (DataFrame, pd.DataFrame)): if isinstance(data, pd.DataFrame): data = self.from_pandas(data, nan_as_null=nan_as_null) - - if index is not None: - if not data.index.equals(index): - data = data.reindex(index) - index = data._index - else: - index = as_index(index) - else: - index = data._index - - self._index = index - - if columns is not None: - self._data = data._data - self._reindex( - column_names=columns, index=index, deep=False, inplace=True - ) - if isinstance( - columns, (range, pd.RangeIndex, cudf.RangeIndex) - ): - self._data.rangeindex = True - else: - self._data = data._data - self._data.rangeindex = True + col_dict = data._data + index_from_data = data.index + columns_from_data = data.columns elif isinstance(data, (cudf.Series, pd.Series)): if isinstance(data, pd.Series): data = cudf.Series.from_pandas(data, nan_as_null=nan_as_null) @@ -719,35 +707,8 @@ def __init__( name = columns[0] else: name = data.name or 0 - self._init_from_dict_like( - {name: data}, - index=index, - columns=columns, - nan_as_null=nan_as_null, - ) - elif data is None: - if index is None: - self._index = RangeIndex(0) - else: - self._index = as_index(index) - if columns is not None: - rangeindex = isinstance( - columns, (range, pd.RangeIndex, cudf.RangeIndex) - ) - label_dtype = getattr(columns, "dtype", None) - self._data = ColumnAccessor( - { - k: column.column_empty( - len(self), dtype="object", masked=True - ) - for k in columns - }, - level_names=tuple(columns.names) - if isinstance(columns, pd.Index) - else None, - rangeindex=rangeindex, - label_dtype=label_dtype, - ) + col_dict = {name: data._column} + index_from_data = data.index elif isinstance(data, ColumnAccessor): raise TypeError( "Use cudf.Series._from_data for constructing a Series from " @@ -759,69 +720,76 @@ def __init__( # descr is an optional field of the _cuda_ary_iface_ if "descr" in arr_interface: if len(arr_interface["descr"]) == 1: - new_df = self._from_arrays( + col_dict = self._from_arrays( data, index=index, columns=columns ) else: - new_df = self.from_records( + col_dict = self.from_records( data, index=index, columns=columns - ) + )._data else: - new_df = self._from_arrays(data, index=index, columns=columns) + col_dict = self._from_arrays( + data, index=index, columns=columns + ) - self._data = new_df._data - self._index = new_df._index - self._check_data_index_length_match() elif hasattr(data, "__array_interface__"): arr_interface = data.__array_interface__ if len(arr_interface["descr"]) == 1: # not record arrays - new_df = self._from_arrays(data, index=index, columns=columns) + col_dict = self._from_arrays( + data, index=index, columns=columns + ) else: - new_df = self.from_records(data, index=index, columns=columns) - self._data = new_df._data - self._index = new_df._index - self._check_data_index_length_match() - else: - if isinstance(data, Iterator): - data = list(data) - if is_list_like(data): - if len(data) > 0 and is_scalar(data[0]): - if columns is not None: - data = dict(zip(columns, [data])) - rangeindex = isinstance( - columns, (range, pd.RangeIndex, cudf.RangeIndex) - ) - else: - data = dict(enumerate([data])) - rangeindex = True - new_df = DataFrame(data=data, index=index) - - self._data = new_df._data - self._index = new_df._index - self._data._level_names = ( - tuple(columns.names) - if isinstance(columns, pd.Index) - else self._data._level_names - ) - self._data.rangeindex = rangeindex - elif len(data) > 0 and isinstance(data[0], Series): - self._init_from_series_list( - data=data, columns=columns, index=index + col_dict = self.from_records( + data, index=index, columns=columns + )._data + elif is_scalar(data): + if index is None or columns is None: + raise ValueError("DataFrame constructor not properly called!") + col_dict = { + col_label: as_column( + data, nan_as_null=nan_as_null, length=len(index) + ) + for col_label in columns + } + elif is_list_like(data): + if len(data) > 0 and is_scalar(data[0]): + if columns is not None: + data = dict(zip(columns, [data])) + rangeindex = isinstance( + columns, (range, pd.RangeIndex, cudf.RangeIndex) ) else: - self._init_from_list_like( - data, index=index, columns=columns - ) - self._check_data_index_length_match() + data = dict(enumerate([data])) + rangeindex = True + new_df = DataFrame(data=data, index=index) + + self._data = new_df._data + self._index = new_df._index + self._data._level_names = ( + tuple(columns.names) + if isinstance(columns, pd.Index) + else self._data._level_names + ) + self._data.rangeindex = rangeindex + elif len(data) > 0 and isinstance(data[0], Series): + self._init_from_series_list( + data=data, columns=columns, index=index + ) else: - if not is_dict_like(data): - raise TypeError("data must be list or dict-like") + self._init_from_list_like(data, index=index, columns=columns) + self._check_data_index_length_match() + elif is_dict_like(data): + col_dict, index_from_data = self._init_from_dict_like( + data, nan_as_null=nan_as_null + ) + else: + raise TypeError( + f"data must be list or dict-like, not {type(data).__name__}" + ) - self._init_from_dict_like( - data, index=index, columns=columns, nan_as_null=nan_as_null - ) - self._check_data_index_length_match() + super().__init__(col_dict, index=index) + self._check_data_index_length_match() if dtype: self._data = self.astype(dtype)._data @@ -1001,80 +969,18 @@ def _init_from_list_like(self, data, index=None, columns=None): @_cudf_nvtx_annotate def _init_from_dict_like( - self, data, index=None, columns=None, nan_as_null=None - ): - label_dtype = None - if columns is not None: - label_dtype = getattr(columns, "dtype", None) - # remove all entries in data that are not in columns, - # inserting new empty columns for entries in columns that - # are not in data - if any(c in data for c in columns): - # Let the downstream logic determine the length of the - # empty columns here - empty_column = lambda: None # noqa: E731 - else: - # If keys is empty, none of the data keys match the - # columns, so we need to create an empty DataFrame. To - # match pandas, the size of the dataframe must match - # the provided index, so we need to return a masked - # array of nulls if an index is given. - empty_column = functools.partial( - cudf.core.column.column_empty, - row_count=(0 if index is None else len(index)), - dtype=None, - masked=index is not None, - ) - - data = { - c: data[c] if c in data else empty_column() for c in columns - } - - data, index = self._align_input_series_indices(data, index=index) - - if index is None: - num_rows = 0 - if data: - keys, values, lengths = zip( - *( - (k, v, 1) - if is_scalar(v) - else ( - k, - vc := as_column(v, nan_as_null=nan_as_null), - len(vc), - ) - for k, v in data.items() - ) - ) - data = dict(zip(keys, values)) - try: - (num_rows,) = (set(lengths) - {1}) or {1} - except ValueError: - raise ValueError("All arrays must be the same length") - - self._index = RangeIndex(0, num_rows) - else: - self._index = as_index(index) - - if len(data): - self._data.multiindex = True - for i, col_name in enumerate(data): - self._data.multiindex = self._data.multiindex and isinstance( - col_name, tuple - ) - self._insert( - i, - col_name, - data[col_name], - nan_as_null=nan_as_null, - ) - self._data._level_names = ( - tuple(columns.names) - if isinstance(columns, pd.Index) - else self._data._level_names + self, data: dict, nan_as_null: bool | None = None + ) -> tuple[dict, None | cudf.Index]: + if not data: + return data, None + data, index_from_data, value_length = self._align_input_series_indices( + data, nan_as_null=nan_as_null ) - self._data.label_dtype = label_dtype + col_data = { + key: as_column(value, nan_as_null=nan_as_null, length=value_length) + for key, value in data.items() + } + return col_data, index_from_data @classmethod def _from_data( @@ -1090,33 +996,33 @@ def _from_data( @staticmethod @_cudf_nvtx_annotate - def _align_input_series_indices(data, index): + def _align_input_series_indices( + data: dict, nan_as_null: bool | None = None + ) -> tuple[dict, None | cudf.Index, int]: + input_series = {} + value_lengths: set[int] = set() + for key, val in data.items(): + if isinstance(val, (pd.Series, Series, dict)): + val = Series(val, nan_as_null=nan_as_null) + input_series[key] = val + if not is_scalar(val): + value_lengths.add(len(val)) + if len(value_lengths) > 1: + raise ValueError(f"Found varying data lengths: {value_lengths}") + + if not input_series: + return data, None, value_lengths.pop() + + aligned_input_series = cudf.core.series._align_indices( + list(input_series.values()) + ) + index = aligned_input_series[0].index data = data.copy() - - input_series = [ - Series(val) - for val in data.values() - if isinstance(val, (pd.Series, Series, dict)) - ] - - if input_series: - if index is not None: - aligned_input_series = [ - sr._align_to_index(index, how="right", sort=False) - for sr in input_series - ] - - else: - aligned_input_series = cudf.core.series._align_indices( - input_series - ) - index = aligned_input_series[0].index - - for name, val in data.items(): - if isinstance(val, (pd.Series, Series, dict)): - data[name] = aligned_input_series.pop(0) - - return data, index + for key, aligned_series in zip( + input_series.keys(), aligned_input_series + ): + data[key] = aligned_series + return data, index, value_lengths.pop() # The `constructor*` properties are used by `dask` (and `dask_cudf`) @property @@ -5531,70 +5437,33 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): @classmethod @_cudf_nvtx_annotate - def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False): - """Convert a numpy/cupy array to DataFrame. + def _from_arrays(cls, data, nan_as_null=False) -> dict[int, ColumnBase]: + """Convert a numpy/cupy array to a dict of columns. Parameters ---------- data : numpy/cupy array of ndim 1 or 2, - dimensions greater than 2 are not supported yet. - index : Index or array-like - Index to use for resulting frame. Will default to - RangeIndex if no indexing information part of input data and - no index provided. - columns : list of str - List of column names to include. + dimensions greater than 2 are not supported. + nan_as_null : bool + whether the NaN should represent NA Returns ------- - DataFrame + {int: Column} """ data = cupy.asarray(data) - if data.ndim != 1 and data.ndim != 2: + if data.ndim not in (1, 2): raise ValueError( f"records dimension expected 1 or 2 but found: {data.ndim}" ) - if data.ndim == 2: - num_cols = data.shape[1] - else: - # Since we validate ndim to be either 1 or 2 above, - # this case can be assumed to be ndim == 1. - num_cols = 1 - - if columns is None: - names = range(num_cols) - else: - if len(columns) != num_cols: - raise ValueError( - f"columns length expected {num_cols} but " - f"found {len(columns)}" - ) - elif len(columns) != len(set(columns)): - raise ValueError("Duplicate column names are not allowed") - names = columns - - df = cls() - if data.ndim == 2: - for i, k in enumerate(names): - df._data[k] = column.as_column( - data[:, i], nan_as_null=nan_as_null - ) - elif data.ndim == 1: - df._data[names[0]] = column.as_column( - data, nan_as_null=nan_as_null - ) - if isinstance(columns, pd.Index): - df._data._level_names = tuple(columns.names) - if isinstance(columns, (range, pd.RangeIndex, cudf.RangeIndex)): - df._data.rangeindex = True - - if index is None: - df._index = RangeIndex(start=0, stop=len(data)) - else: - df._index = as_index(index) - return df + if data.ndim == 1: + data = data.reshape(1, len(data)) + return { + i: column.as_column(data[:, i], nan_as_null=nan_as_null) + for i in range(data.shape[1]) + } @_cudf_nvtx_annotate def interpolate( diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 97c89217f9f..f79cc7ed875 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10653,6 +10653,11 @@ def test_dataframe_from_ndarray_dup_columns(): cudf.DataFrame(np.eye(2), columns=["A", "A"]) +def test_dataframe_from_dict_only_scalar_values_raises(): + with pytest.raises(ValueError): + cudf.DataFrame({0: 3, 1: 2}) + + @pytest.mark.parametrize("name", ["a", 0, None, np.nan, cudf.NA]) @pytest.mark.parametrize("contains", ["a", 0, None, np.nan, cudf.NA]) @pytest.mark.parametrize("other_names", [[], ["b", "c"], [1, 2]]) From 09690654f7c77a9225aef03c7d1716be7cb638cf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 28 Nov 2023 13:44:37 -0800 Subject: [PATCH 02/22] Add dataframe reindexing tests, refactor logic --- python/cudf/cudf/core/dataframe.py | 50 +++++++++++++++++------- python/cudf/cudf/tests/test_dataframe.py | 20 ++++++++++ 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d90fba8ab26..7f683b99329 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -684,8 +684,8 @@ def __init__( if isinstance(data, pd.DataFrame): data = self.from_pandas(data, nan_as_null=nan_as_null) col_dict = data._data - index_from_data = data.index - columns_from_data = data.columns + index, index_from_data = data.index, index + columns, columns_from_data = data.columns, columns elif isinstance(data, (cudf.Series, pd.Series)): if isinstance(data, pd.Series): data = cudf.Series.from_pandas(data, nan_as_null=nan_as_null) @@ -698,17 +698,21 @@ def __init__( # -> return 1 column DataFrame # Series.name is None and columns # -> return 1 column DataFrame if len(columns) in {0, 1} - if data.name is None and columns is not None: - if len(columns) > 1: - raise ValueError( - "Length of columns must be less than 2 if " - f"{type(data).__name__}.name is None." - ) - name = columns[0] + if data.name is None: + if columns is not None: + if len(columns) > 1: + raise ValueError( + "Length of columns must be less than 2 if " + f"{type(data).__name__}.name is None." + ) + name = columns[0] + else: + name = 0 else: - name = data.name or 0 + name = data.name + columns, columns_from_data = pd.Index([data.name]), columns col_dict = {name: data._column} - index_from_data = data.index + index, index_from_data = data.index, index elif isinstance(data, ColumnAccessor): raise TypeError( "Use cudf.Series._from_data for constructing a Series from " @@ -752,7 +756,12 @@ def __init__( ) for col_label in columns } + elif is_dict_like(data): + result = self._init_from_dict_like(data, nan_as_null=nan_as_null) + col_dict = result[0] + index, index_from_data = result[1], index elif is_list_like(data): + super().__init__() if len(data) > 0 and is_scalar(data[0]): if columns is not None: data = dict(zip(columns, [data])) @@ -779,16 +788,27 @@ def __init__( else: self._init_from_list_like(data, index=index, columns=columns) self._check_data_index_length_match() - elif is_dict_like(data): - col_dict, index_from_data = self._init_from_dict_like( - data, nan_as_null=nan_as_null - ) + return else: raise TypeError( f"data must be list or dict-like, not {type(data).__name__}" ) super().__init__(col_dict, index=index) + if columns_from_data is not None: + # TODO: This there a better way to do this? + columns_from_data = as_index(columns_from_data) + reindexed = self.reindex( + columns=columns_from_data.to_pandas(), copy=False + ) + self._data = reindexed._data + self._index = index + if index_from_data is not None: + # TODO: This there a better way to do this? + index_from_data = as_index(index_from_data) + reindexed = self.reindex(index=index_from_data, copy=False) + self._data = reindexed._data + self._index = index_from_data self._check_data_index_length_match() if dtype: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f79cc7ed875..65b874fe85a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10658,6 +10658,26 @@ def test_dataframe_from_dict_only_scalar_values_raises(): cudf.DataFrame({0: 3, 1: 2}) +@pytest.mark.parametrize("klass", [cudf.DataFrame, pd.DataFrame]) +@pytest.mark.parametrize( + "axis_kwargs, exp_data", + [ + [ + {"index": [1, 2], "columns": [1, 2]}, + np.array([[1.0, np.nan], [np.nan, np.nan]]), + ], + [{"index": [1, 2]}, np.array([[0.0, 1.0], [np.nan, np.nan]])], + [{"columns": [1, 2]}, np.array([[0.0, np.nan], [1.0, np.nan]])], + ], +) +def test_dataframe_from_frame_with_index_or_columns_reindexes( + klass, axis_kwargs, exp_data +): + result = cudf.DataFrame(klass(np.eye(2)), **axis_kwargs) + expected = cudf.DataFrame(exp_data, **axis_kwargs) + assert_eq(result, expected) + + @pytest.mark.parametrize("name", ["a", 0, None, np.nan, cudf.NA]) @pytest.mark.parametrize("contains", ["a", 0, None, np.nan, cudf.NA]) @pytest.mark.parametrize("other_names", [[], ["b", "c"], [1, 2]]) From 2fa5f3a5a9c6b5a3a959ad97ae610fc67740c681 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 28 Nov 2023 19:08:24 -0800 Subject: [PATCH 03/22] Fix more logic --- python/cudf/cudf/core/dataframe.py | 126 ++++++++++++++++++++--------- 1 file changed, 86 insertions(+), 40 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7f683b99329..a0f2ab98f99 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -666,8 +666,16 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): def __init__( self, data=None, index=None, columns=None, dtype=None, nan_as_null=True ): + col_is_rangeindex = False + col_is_multiindex = False + if columns is not None: - columns = as_index(columns).to_pandas() + columns = as_index(columns) + if columns.nunique() != len(columns): + raise ValueError("Columns cannot contain duplicate values") + columns = columns.to_pandas() + col_is_rangeindex = isinstance(columns, pd.RangeIndex) + col_is_multiindex = isinstance(columns, pd.MultiIndex) if index is not None: index = as_index(index) @@ -708,10 +716,15 @@ def __init__( name = columns[0] else: name = 0 + col_is_rangeindex = True + col_dict = {name: data._column} else: - name = data.name - columns, columns_from_data = pd.Index([data.name]), columns - col_dict = {name: data._column} + if columns is not None and not columns.isin([data.name]).any(): + data = data.copy()[:0] + col_dict = {col: data._column for col in columns} + else: + col_dict = {data.name: data._column} + columns, columns_from_data = pd.Index([data.name]), columns index, index_from_data = data.index, index elif isinstance(data, ColumnAccessor): raise TypeError( @@ -725,7 +738,7 @@ def __init__( if "descr" in arr_interface: if len(arr_interface["descr"]) == 1: col_dict = self._from_arrays( - data, index=index, columns=columns + data, columns=columns, nan_as_null=nan_as_null ) else: col_dict = self.from_records( @@ -733,33 +746,32 @@ def __init__( )._data else: col_dict = self._from_arrays( - data, index=index, columns=columns + data, columns=columns, nan_as_null=nan_as_null ) - + index, index_from_data = RangeIndex(data.shape[0]), index elif hasattr(data, "__array_interface__"): arr_interface = data.__array_interface__ if len(arr_interface["descr"]) == 1: # not record arrays col_dict = self._from_arrays( - data, index=index, columns=columns + data, columns=columns, nan_as_null=nan_as_null ) else: col_dict = self.from_records( data, index=index, columns=columns )._data + index, index_from_data = RangeIndex(data.shape[0]), index elif is_scalar(data): if index is None or columns is None: - raise ValueError("DataFrame constructor not properly called!") + raise ValueError( + "Must provide an index and columns if data is a scalar." + ) col_dict = { col_label: as_column( data, nan_as_null=nan_as_null, length=len(index) ) for col_label in columns } - elif is_dict_like(data): - result = self._init_from_dict_like(data, nan_as_null=nan_as_null) - col_dict = result[0] - index, index_from_data = result[1], index elif is_list_like(data): super().__init__() if len(data) > 0 and is_scalar(data[0]): @@ -789,6 +801,12 @@ def __init__( self._init_from_list_like(data, index=index, columns=columns) self._check_data_index_length_match() return + elif is_dict_like(data): + result = self._init_from_dict_like( + data, index, nan_as_null=nan_as_null + ) + col_dict = result[0] + index, index_from_data = result[1], index else: raise TypeError( f"data must be list or dict-like, not {type(data).__name__}" @@ -798,6 +816,8 @@ def __init__( if columns_from_data is not None: # TODO: This there a better way to do this? columns_from_data = as_index(columns_from_data) + col_is_rangeindex = isinstance(columns, cudf.RangeIndex) + col_is_multiindex = isinstance(columns, cudf.MultiIndex) reindexed = self.reindex( columns=columns_from_data.to_pandas(), copy=False ) @@ -814,9 +834,8 @@ def __init__( if dtype: self._data = self.astype(dtype)._data - self._data.multiindex = self._data.multiindex or isinstance( - columns, pd.MultiIndex - ) + self._data.rangeindex = self._data.rangeindex or col_is_rangeindex + self._data.multiindex = self._data.multiindex or col_is_multiindex @_cudf_nvtx_annotate def _init_from_series_list(self, data, columns, index): @@ -989,13 +1008,33 @@ def _init_from_list_like(self, data, index=None, columns=None): @_cudf_nvtx_annotate def _init_from_dict_like( - self, data: dict, nan_as_null: bool | None = None - ) -> tuple[dict, None | cudf.Index]: + self, data: dict, index: None | cudf.Index, nan_as_null=None + ) -> tuple[dict, cudf.Index]: if not data: - return data, None - data, index_from_data, value_length = self._align_input_series_indices( - data, nan_as_null=nan_as_null - ) + return data, cudf.RangeIndex(0) + data, index_from_data = self._align_input_series_indices(data) + + value_lengths = set() + if index_from_data is not None: + value_lengths.add(len(index_from_data)) + + scalar_keys = [] + col_data = {} + for key, value in data: + if is_scalar(value): + scalar_keys.append(key) + col_data[key] = value + else: + value_lengths.add(len(value)) + col_data[key] = as_column(value, nan_as_null=nan_as_null) + + if len(scalar_keys) != len(data) and len(value_lengths) > 1: + raise ValueError( + "Found varying value lengths when all values " + f"must have the same length: {value_lengths}" + ) + # TODO: If all scalars, use index length + col_data = { key: as_column(value, nan_as_null=nan_as_null, length=value_length) for key, value in data.items() @@ -1017,32 +1056,27 @@ def _from_data( @staticmethod @_cudf_nvtx_annotate def _align_input_series_indices( - data: dict, nan_as_null: bool | None = None - ) -> tuple[dict, None | cudf.Index, int]: - input_series = {} - value_lengths: set[int] = set() - for key, val in data.items(): - if isinstance(val, (pd.Series, Series, dict)): - val = Series(val, nan_as_null=nan_as_null) - input_series[key] = val - if not is_scalar(val): - value_lengths.add(len(val)) - if len(value_lengths) > 1: - raise ValueError(f"Found varying data lengths: {value_lengths}") + data: dict, + ) -> tuple[dict, None | cudf.Index]: + """If data.values() contains Series/dicts, align their indexes before processing""" + input_series = { + key: val + for key, val in data.items() + if isinstance(val, (pd.Series, Series, dict)) + } if not input_series: - return data, None, value_lengths.pop() + return data, None aligned_input_series = cudf.core.series._align_indices( list(input_series.values()) ) - index = aligned_input_series[0].index data = data.copy() for key, aligned_series in zip( input_series.keys(), aligned_input_series ): data[key] = aligned_series - return data, index, value_lengths.pop() + return data, aligned_series.index # The `constructor*` properties are used by `dask` (and `dask_cudf`) @property @@ -5448,7 +5482,9 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False): @classmethod @_cudf_nvtx_annotate - def _from_arrays(cls, data, nan_as_null=False) -> dict[int, ColumnBase]: + def _from_arrays( + cls, data, columns, nan_as_null=False + ) -> dict[Any, ColumnBase]: """Convert a numpy/cupy array to a dict of columns. Parameters @@ -5471,9 +5507,19 @@ def _from_arrays(cls, data, nan_as_null=False) -> dict[int, ColumnBase]: if data.ndim == 1: data = data.reshape(1, len(data)) + + if columns is not None: + if len(columns) != data.shape[1]: + raise ValueError( + f"columns length expected {data.shape[1]} but " + f"found {len(columns)}" + ) + columns_labels = columns + else: + columns_labels = range(data.shape[1]) return { - i: column.as_column(data[:, i], nan_as_null=nan_as_null) - for i in range(data.shape[1]) + column_label: column.as_column(data[:, i], nan_as_null=nan_as_null) + for column_label, i in zip(columns_labels, range(data.shape[1])) } @_cudf_nvtx_annotate From 89f92806a26e0eb644824a5c418c79939790ec7b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 29 Nov 2023 14:31:25 -0800 Subject: [PATCH 04/22] Adjust dict logic --- python/cudf/cudf/core/dataframe.py | 69 ++++++++++++++++-------- python/cudf/cudf/tests/test_dataframe.py | 10 +++- 2 files changed, 57 insertions(+), 22 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a0f2ab98f99..772e8e1923a 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -671,7 +671,9 @@ def __init__( if columns is not None: columns = as_index(columns) - if columns.nunique() != len(columns): + if not isinstance( + columns, MultiIndex + ) and columns.nunique() != len(columns): raise ValueError("Columns cannot contain duplicate values") columns = columns.to_pandas() col_is_rangeindex = isinstance(columns, pd.RangeIndex) @@ -748,7 +750,8 @@ def __init__( col_dict = self._from_arrays( data, columns=columns, nan_as_null=nan_as_null ) - index, index_from_data = RangeIndex(data.shape[0]), index + if index is None: + index = RangeIndex(arr_interface["shape"][0]) elif hasattr(data, "__array_interface__"): arr_interface = data.__array_interface__ if len(arr_interface["descr"]) == 1: @@ -760,7 +763,8 @@ def __init__( col_dict = self.from_records( data, index=index, columns=columns )._data - index, index_from_data = RangeIndex(data.shape[0]), index + if index is None: + index = RangeIndex(arr_interface["shape"][0]) elif is_scalar(data): if index is None or columns is None: raise ValueError( @@ -1010,36 +1014,58 @@ def _init_from_list_like(self, data, index=None, columns=None): def _init_from_dict_like( self, data: dict, index: None | cudf.Index, nan_as_null=None ) -> tuple[dict, cudf.Index]: + # 1) Align indexes of all data.values() that are Series/dicts + # 2) Convert all array-like data.values() to columns + # 3) Convert all remaining scalar data.values() to columns if not data: return data, cudf.RangeIndex(0) - data, index_from_data = self._align_input_series_indices(data) + data, index_from_data = self._align_input_series_indices( + data, nan_as_null=nan_as_null + ) value_lengths = set() + result_index = None if index_from_data is not None: value_lengths.add(len(index_from_data)) + result_index = index_from_data + elif index is not None: + result_index = index scalar_keys = [] col_data = {} - for key, value in data: + for key, value in data.items(): if is_scalar(value): scalar_keys.append(key) col_data[key] = value else: - value_lengths.add(len(value)) - col_data[key] = as_column(value, nan_as_null=nan_as_null) + column = as_column(value, nan_as_null=nan_as_null) + value_lengths.add(len(column)) + col_data[key] = column if len(scalar_keys) != len(data) and len(value_lengths) > 1: raise ValueError( "Found varying value lengths when all values " f"must have the same length: {value_lengths}" ) - # TODO: If all scalars, use index length + elif len(scalar_keys) == len(data): + # All data.values() are scalars + if index is None: + raise ValueError( + "If using all scalar values, you must pass an index" + ) + scalar_length = len(index) + else: + scalar_length = value_lengths.pop() + + for key in scalar_keys: + col_data[key] = as_column( + col_data[key], nan_as_null=nan_as_null, length=scalar_length + ) - col_data = { - key: as_column(value, nan_as_null=nan_as_null, length=value_length) - for key, value in data.items() - } - return col_data, index_from_data + if result_index is None: + result_index = cudf.RangeIndex(scalar_length) + + return col_data, result_index @classmethod def _from_data( @@ -1056,11 +1082,10 @@ def _from_data( @staticmethod @_cudf_nvtx_annotate def _align_input_series_indices( - data: dict, + data: dict, nan_as_null=None ) -> tuple[dict, None | cudf.Index]: - """If data.values() contains Series/dicts, align their indexes before processing""" input_series = { - key: val + key: Series(val, nan_as_null=nan_as_null) for key, val in data.items() if isinstance(val, (pd.Series, Series, dict)) } @@ -6408,11 +6433,13 @@ def select_dtypes(self, include=None, exclude=None): inclusion = set() # remove all exclude types inclusion = inclusion - exclude_subtypes - - for k, col in self._data.items(): - infered_type = cudf_dtype_from_pydata_dtype(col.dtype) - if infered_type in inclusion: - df._insert(len(df._data), k, col) + if inclusion: + for k, col in self._data.items(): + infered_type = cudf_dtype_from_pydata_dtype(col.dtype) + if infered_type in inclusion: + df._insert(len(df._data), k, col) + else: + df.columns = df.columns[:0] return df diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index c792bee2a58..d1e2f03420f 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1581,7 +1581,15 @@ def test_concat_empty_dataframe(df_1, df_2): # ignoring dtypes as pandas upcasts int to float # on concatenation with empty dataframes - assert_eq(got, expect, check_dtype=False, check_index_type=True) + # pandas>=2.0 has RangeIndex columns (matching cudf) + # pandas<=1.5 returns Index[object] columns + assert_eq( + got, + expect, + check_dtype=False, + check_index_type=True, + check_column_type=PANDAS_GE_200, + ) @pytest.mark.parametrize( From a4da710baeecd47249f75cbe9ce6ae8097cab16a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 30 Nov 2023 17:54:10 -0800 Subject: [PATCH 05/22] More bugs in dict and array logic --- python/cudf/cudf/core/column/column.py | 12 +++++++- python/cudf/cudf/core/column_accessor.py | 4 +++ python/cudf/cudf/core/dataframe.py | 39 +++++++++++++++++++----- python/cudf/cudf/core/frame.py | 4 +++ python/cudf/cudf/tests/test_dataframe.py | 35 +++++++++++++++++++-- 5 files changed, 84 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a76f4d7383c..c284b8c44bd 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2173,7 +2173,10 @@ def as_column( if dtype is not None: data = data.astype(dtype) - elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview): + elif arbitrary is None or ( + np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview) + ): + # TODO: use is_scalar instead of np.isscalar length = length or 1 if ( (nan_as_null is True) @@ -2183,6 +2186,8 @@ def as_column( arbitrary = None if dtype is None: dtype = cudf.dtype("float64") + elif arbitrary is None and dtype is None: + dtype = cudf.dtype("object") data = as_column(full(length, arbitrary, dtype=dtype)) if not nan_as_null and not is_decimal_dtype(data.dtype): @@ -2202,6 +2207,11 @@ def as_column( arbitrary = np.asarray(arbitrary) + if arbitrary.ndim == 0: + arbitrary = arbitrary.reshape( + 1, + ) + # Handle case that `arbitrary` elements are cupy arrays if ( shape diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py index b106b8bbb02..021d4994613 100644 --- a/python/cudf/cudf/core/column_accessor.py +++ b/python/cudf/cudf/core/column_accessor.py @@ -157,6 +157,8 @@ def _create_unsafe( data: Dict[Any, ColumnBase], multiindex: bool = False, level_names=None, + rangeindex: bool = False, + label_dtype: Dtype | None = None, ) -> ColumnAccessor: # create a ColumnAccessor without verifying column # type or size @@ -164,6 +166,8 @@ def _create_unsafe( obj._data = data obj.multiindex = multiindex obj._level_names = level_names + obj.rangeindex = rangeindex + obj.label_dtype = label_dtype return obj def __iter__(self): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 772e8e1923a..f0f7a666a10 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -668,6 +668,7 @@ def __init__( ): col_is_rangeindex = False col_is_multiindex = False + col_dtype = None if columns is not None: columns = as_index(columns) @@ -678,13 +679,13 @@ def __init__( columns = columns.to_pandas() col_is_rangeindex = isinstance(columns, pd.RangeIndex) col_is_multiindex = isinstance(columns, pd.MultiIndex) + if not isinstance(columns, pd.MultiIndex): + col_dtype = columns.dtype if index is not None: index = as_index(index) - if data is None: - data = [] - elif isinstance(data, Iterator) and not isinstance(data, str): + if isinstance(data, Iterator) and not isinstance(data, str): data = list(data) index_from_data = None @@ -728,6 +729,27 @@ def __init__( col_dict = {data.name: data._column} columns, columns_from_data = pd.Index([data.name]), columns index, index_from_data = data.index, index + elif data is None: + if index is None: + index = RangeIndex(0) + if columns is not None: + level_names = ( + tuple(columns.names) + if isinstance(columns, pd.Index) + else None + ) + col_dict = ColumnAccessor( + { + k: column.column_empty( + len(index), dtype="object", masked=True + ) + for k in columns + }, + level_names=level_names, + ) + else: + col_dict = {} + col_is_rangeindex = True elif isinstance(data, ColumnAccessor): raise TypeError( "Use cudf.Series._from_data for constructing a Series from " @@ -811,6 +833,8 @@ def __init__( ) col_dict = result[0] index, index_from_data = result[1], index + columns, columns_from_data = result[2], columns + col_is_multiindex = isinstance(columns, pd.MultiIndex) else: raise TypeError( f"data must be list or dict-like, not {type(data).__name__}" @@ -840,6 +864,7 @@ def __init__( self._data.rangeindex = self._data.rangeindex or col_is_rangeindex self._data.multiindex = self._data.multiindex or col_is_multiindex + self._data.label_dtype = self._data.label_dtype or col_dtype @_cudf_nvtx_annotate def _init_from_series_list(self, data, columns, index): @@ -1013,12 +1038,12 @@ def _init_from_list_like(self, data, index=None, columns=None): @_cudf_nvtx_annotate def _init_from_dict_like( self, data: dict, index: None | cudf.Index, nan_as_null=None - ) -> tuple[dict, cudf.Index]: + ) -> tuple[dict, cudf.Index, pd.Index]: # 1) Align indexes of all data.values() that are Series/dicts # 2) Convert all array-like data.values() to columns # 3) Convert all remaining scalar data.values() to columns if not data: - return data, cudf.RangeIndex(0) + return data, cudf.RangeIndex(0), pd.RangeIndex(0) data, index_from_data = self._align_input_series_indices( data, nan_as_null=nan_as_null ) @@ -1065,7 +1090,7 @@ def _init_from_dict_like( if result_index is None: result_index = cudf.RangeIndex(scalar_length) - return col_data, result_index + return col_data, result_index, pd.Index(col_data) @classmethod def _from_data( @@ -5531,7 +5556,7 @@ def _from_arrays( ) if data.ndim == 1: - data = data.reshape(1, len(data)) + data = data.reshape(len(data), 1) if columns is not None: if len(columns) != data.shape[1]: diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index b2f0651d576..e1b2f7d674d 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -280,6 +280,8 @@ def astype(self, dtype, copy=False, **kwargs): data=result_data, multiindex=self._data.multiindex, level_names=self._data.level_names, + rangeindex=self._data.rangeindex, + label_dtype=self._data.label_dtype, ) @_cudf_nvtx_annotate @@ -876,6 +878,8 @@ def fillna( data=filled_data, multiindex=self._data.multiindex, level_names=self._data.level_names, + rangeindex=self._data.rangeindex, + label_dtype=self._data.label_dtype, ) ), inplace=inplace, diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d1e2f03420f..be34cb65d17 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -4556,8 +4556,9 @@ def test_create_dataframe_column(): columns=["a", "b", "c"], index=["A", "Z", "X"], ) - - assert_eq(pdf, gdf) + # pandas C column is NaN of object type + # cudf C column is NA of type float + assert_eq(pdf, gdf, check_dtype=False) @pytest.mark.parametrize( @@ -4601,6 +4602,36 @@ def test_dataframe_columns_empty_data_preserves_dtype(dtype, idx_data, data): assert_eq(result, expected) +@pytest.mark.parametrize("dtype", ["int64", "datetime64[ns]", "int8"]) +def test_dataframe_astype_preserves_column_dtype(dtype): + result = cudf.DataFrame([1], columns=cudf.Index([1], dtype=dtype)) + result = result.astype(np.int32).columns + expected = pd.Index([1], dtype=dtype) + assert_eq(result, expected) + + +def test_dataframe_astype_preserves_column_rangeindex(): + result = cudf.DataFrame([1], columns=range(1)) + result = result.astype(np.int32).columns + expected = pd.RangeIndex(1) + assert_eq(result, expected) + + +@pytest.mark.parametrize("dtype", ["int64", "datetime64[ns]", "int8"]) +def test_dataframe_fillna_preserves_column_dtype(dtype): + result = cudf.DataFrame([1, None], columns=cudf.Index([1], dtype=dtype)) + result = result.fillna(2).columns + expected = pd.Index([1], dtype=dtype) + assert_eq(result, expected) + + +def test_dataframe_fillna_preserves_column_rangeindex(): + result = cudf.DataFrame([1, None], columns=range(1)) + result = result.fillna(2).columns + expected = pd.RangeIndex(1) + assert_eq(result, expected) + + @pytest.mark.parametrize( "data", [ From 8a547910d1ab57e4b76008e04fb5ae82c76e8176 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 1 Dec 2023 18:27:02 -0800 Subject: [PATCH 06/22] Fix mode initialization, remove working xfail now --- python/cudf/cudf/core/dataframe.py | 13 ++++++++++--- python/cudf/cudf/tests/test_dataframe.py | 23 ++++++++++++++--------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f0f7a666a10..c258220b429 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -729,7 +729,11 @@ def __init__( col_dict = {data.name: data._column} columns, columns_from_data = pd.Index([data.name]), columns index, index_from_data = data.index, index - elif data is None: + elif data is None or ( + isinstance(data, dict) + and columns is not None + and (~columns.isin(data.keys())).all() + ): if index is None: index = RangeIndex(0) if columns is not None: @@ -826,10 +830,13 @@ def __init__( else: self._init_from_list_like(data, index=index, columns=columns) self._check_data_index_length_match() + + if dtype: + self._data = self.astype(dtype)._data return elif is_dict_like(data): result = self._init_from_dict_like( - data, index, nan_as_null=nan_as_null + data, index=index, nan_as_null=nan_as_null ) col_dict = result[0] index, index_from_data = result[1], index @@ -6199,7 +6206,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True): ] if len(mode_results) == 0: - return DataFrame() + return DataFrame(columns=self.columns[:0]) df = cudf.concat(mode_results, axis=1) if isinstance(df, Series): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index be34cb65d17..1bf7f4b700e 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7539,7 +7539,11 @@ def test_dataframe_append_dataframe(df, other, sort, ignore_index): if expected.shape != df.shape: assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False) else: - assert_eq(expected, actual, check_index_type=not gdf.empty) + # pandas returns Index[object] while this should be an empty RangeIndex + # for empty df/other + assert_eq( + expected, actual, check_index_type=False, check_column_type=False + ) @pytest_unmark_spilling @@ -7579,8 +7583,8 @@ def test_dataframe_append_dataframe(df, other, sort, ignore_index): "https://github.com/pandas-dev/pandas/issues/35092", ), ), - {1: 1}, - {0: 10, 1: 100, 2: 102}, + {1: [1]}, + {0: [10], 1: [100], 2: [102]}, ], ) @pytest.mark.parametrize("sort", [False, True]) @@ -7769,7 +7773,11 @@ def test_dataframe_append_dataframe_lists(df, other, sort, ignore_index): if expected.shape != df.shape: assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False) else: - assert_eq(expected, actual, check_index_type=not gdf.empty) + # pandas returns Index[object] while this should be an empty RangeIndex + # for empty df/other + assert_eq( + expected, actual, check_index_type=False, check_column_type=False + ) @pytest.mark.parametrize( @@ -8152,11 +8160,7 @@ def test_series_empty(ps): "columns", [["a"], ["another column name"], None, pd.Index(["a"], name="index name")], ) -def test_dataframe_init_with_columns(data, columns, request): - if data == [] and columns is None and not PANDAS_GE_200: - request.node.add_marker( - pytest.mark.xfail(reason=".column returns Index[object]") - ) +def test_dataframe_init_with_columns(data, columns): pdf = pd.DataFrame(data, columns=columns) gdf = cudf.DataFrame(data, columns=columns) @@ -8164,6 +8168,7 @@ def test_dataframe_init_with_columns(data, columns, request): pdf, gdf, check_index_type=len(pdf.index) != 0, + check_column_type=data is not None and columns is not None, check_dtype=not (pdf.empty and len(pdf.columns)), ) From 36b85cc9cb3670adbefffeefed1679ec161c1d6e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 7 Dec 2023 15:12:31 -0800 Subject: [PATCH 07/22] Clean up tests, fix more bugs --- python/cudf/cudf/core/dataframe.py | 15 +++++++---- python/cudf/cudf/tests/test_dataframe.py | 34 +++++++++--------------- 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ef92dff7692..3707bd185c5 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -698,6 +698,10 @@ def __init__( col_dict = data._data index, index_from_data = data.index, index columns, columns_from_data = data.columns, columns + if columns_from_data is not None and len(columns_from_data) == 0: + # TODO: Can this be avoided? + # as_index([]) returns Index[float64] + columns_from_data = columns_from_data.astype(columns.dtype) elif isinstance(data, (cudf.Series, pd.Series)): if isinstance(data, pd.Series): data = cudf.Series.from_pandas(data, nan_as_null=nan_as_null) @@ -854,6 +858,7 @@ def __init__( columns_from_data = as_index(columns_from_data) col_is_rangeindex = isinstance(columns, cudf.RangeIndex) col_is_multiindex = isinstance(columns, cudf.MultiIndex) + col_dtype = columns_from_data.dtype reindexed = self.reindex( columns=columns_from_data.to_pandas(), copy=False ) @@ -3516,12 +3521,12 @@ def rename( ) if level is not None and isinstance(self.index, MultiIndex): - out_index = self.index.copy(deep=copy) - out_index.get_level_values(level).to_frame().replace( - to_replace=list(index.keys()), - value=list(index.values()), - inplace=True, + out_frame = self.index.to_frame(index=False) + level = self.index._get_level_label(level) + out_frame[level] = out_frame[level].replace( + to_replace=list(index.keys()), value=list(index.values()) ) + out_index = type(self.index).from_frame(out_frame) out = DataFrame(index=out_index) else: to_replace = list(index.keys()) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 2c157daa78c..836824ac879 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8951,8 +8951,9 @@ def test_dataframe_from_pandas_duplicate_columns(): ["column_not_exists1", "column_not_exists2"], ], ) -@pytest.mark.parametrize("index", [["abc", "def", "ghi"]]) -def test_dataframe_constructor_columns(df, columns, index, request): +def test_dataframe_constructor_columns(df, columns, request): + index = ["abc", "def", "ghi"] + def assert_local_eq(actual, df, expected, host_columns): check_index_type = not expected.empty if host_columns is not None and any( @@ -8967,12 +8968,6 @@ def assert_local_eq(actual, df, expected, host_columns): else: assert_eq(expected, actual, check_index_type=check_index_type) - if df.empty and columns is None and not PANDAS_GE_200: - request.node.add_marker( - pytest.mark.xfail( - reason="pandas returns Index[object] instead of RangeIndex" - ) - ) gdf = cudf.from_pandas(df) host_columns = ( columns.to_pandas() if isinstance(columns, cudf.BaseIndex) else columns @@ -9279,23 +9274,20 @@ def test_dataframe_setitem_cupy_array(): assert_eq(pdf, gdf) -@pytest.mark.parametrize( - "data", [{"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}] -) -@pytest.mark.parametrize( - "index", - [{0: 123, 1: 4, 2: 6}], -) @pytest.mark.parametrize( "level", ["x", 0], ) -def test_rename_for_level_MultiIndex_dataframe(data, index, level): +def test_rename_for_level_MultiIndex_dataframe(level): + data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} + index = {0: 123, 1: 4, 2: 6} pdf = pd.DataFrame( data, - index=pd.MultiIndex.from_tuples([(0, 1, 2), (1, 2, 3), (2, 3, 4)]), + index=pd.MultiIndex.from_tuples( + [(0, 1, 2), (1, 2, 3), (2, 3, 4)], names=["x", "y", "z"] + ), ) - pdf.index.names = ["x", "y", "z"] + gdf = cudf.from_pandas(pdf) expect = pdf.rename(index=index, level=level) @@ -9304,9 +9296,6 @@ def test_rename_for_level_MultiIndex_dataframe(data, index, level): assert_eq(expect, got) -@pytest.mark.parametrize( - "data", [{"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}] -) @pytest.mark.parametrize( "columns", [{"a": "f", "b": "g"}, {1: 3, 2: 4}, lambda s: 2 * s], @@ -9315,7 +9304,8 @@ def test_rename_for_level_MultiIndex_dataframe(data, index, level): "level", [0, 1], ) -def test_rename_for_level_MultiColumn_dataframe(data, columns, level): +def test_rename_for_level_MultiColumn_dataframe(columns, level): + data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} gdf = cudf.DataFrame(data) gdf.columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]) From 553fe3683f9e58ff7dfff7aa3b377272da43c865 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 7 Dec 2023 19:02:47 -0800 Subject: [PATCH 08/22] Fix more tests, test reindex bug --- python/cudf/cudf/core/dataframe.py | 17 +++++++++++------ python/cudf/cudf/core/indexed_frame.py | 11 ++++++++--- python/cudf/cudf/tests/test_dataframe.py | 12 +++++++++++- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3707bd185c5..fddf7a0d2a4 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -672,7 +672,12 @@ def __init__( col_dtype = None if columns is not None: - columns = as_index(columns) + dtype = None + if isinstance(columns, list) and len(columns) == 0: + # TODO: Generically, an empty dtype-less container + # TODO: Why does as_index([]) return FloatIndex + dtype = object + columns = as_index(columns, dtype=dtype) if not isinstance( columns, MultiIndex ) and columns.nunique() != len(columns): @@ -698,10 +703,6 @@ def __init__( col_dict = data._data index, index_from_data = data.index, index columns, columns_from_data = data.columns, columns - if columns_from_data is not None and len(columns_from_data) == 0: - # TODO: Can this be avoided? - # as_index([]) returns Index[float64] - columns_from_data = columns_from_data.astype(columns.dtype) elif isinstance(data, (cudf.Series, pd.Series)): if isinstance(data, pd.Series): data = cudf.Series.from_pandas(data, nan_as_null=nan_as_null) @@ -794,6 +795,7 @@ def __init__( col_dict = self.from_records( data, index=index, columns=columns )._data + columns_from_data = columns if index is None: index = RangeIndex(arr_interface["shape"][0]) elif is_scalar(data): @@ -851,8 +853,8 @@ def __init__( raise TypeError( f"data must be list or dict-like, not {type(data).__name__}" ) - super().__init__(col_dict, index=index) + self._check_data_index_length_match() if columns_from_data is not None: # TODO: This there a better way to do this? columns_from_data = as_index(columns_from_data) @@ -870,12 +872,15 @@ def __init__( reindexed = self.reindex(index=index_from_data, copy=False) self._data = reindexed._data self._index = index_from_data + # TODO this one might not be needed self._check_data_index_length_match() if dtype: self._data = self.astype(dtype)._data self._data.rangeindex = self._data.rangeindex or col_is_rangeindex + # TODO: multiindex assignment + # test_non_string_column_name_to_arrow to fail self._data.multiindex = self._data.multiindex or col_is_multiindex self._data.label_dtype = self._data.label_dtype or col_dtype diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index c81174482e0..f73299667d2 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2653,6 +2653,7 @@ def _reindex( name: ( df._data[name].copy(deep=deep) if name in df._data + # Why does this default to np.float64? else cudf.core.column.column.column_empty( dtype=dtypes.get(name, np.float64), masked=True, @@ -2661,13 +2662,17 @@ def _reindex( ) for name in names } + if column_names is None: + level_names = self._data.level_names + elif isinstance(column_names, pd.Index): + level_names = tuple(column_names.names) + else: + level_names = None result = self.__class__._from_data( data=cudf.core.column_accessor.ColumnAccessor( cols, multiindex=self._data.multiindex, - level_names=tuple(column_names.names) - if isinstance(column_names, pd.Index) - else None, + level_names=level_names, ), index=index, ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 836824ac879..8809ab33224 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9441,7 +9441,8 @@ def test_dataframe_init_from_series(data, columns, index): assert_eq( expected, actual, - check_index_type=len(expected) != 0, + # TODO: reindex creates new cols of float64, why not object? + check_dtype=False, ) @@ -10823,3 +10824,12 @@ def test_dataframe_duplicate_index_reindex(): lfunc_args_and_kwargs=([10, 11, 12, 13], {}), rfunc_args_and_kwargs=([10, 11, 12, 13], {}), ) + + +def test_dataframe_reindex_doesnt_remove_column_name(): + gdf = cudf.DataFrame([1], columns=pd.Index(["a"], name="foo")) + result = gdf.reindex(index=pd.Index([0, 1])) + expected = cudf.DataFrame( + [1, None], columns=pd.Index(["a"], name="foo"), index=pd.Index([0, 1]) + ) + assert_eq(result, expected) From 5baac4e26bce6c99e369f72d424f5ef3cd9ca4b7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Dec 2023 13:27:30 -0800 Subject: [PATCH 09/22] Fix dict like to avoid reindexing --- python/cudf/cudf/core/dataframe.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fddf7a0d2a4..c1189c63350 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -846,7 +846,7 @@ def __init__( data, index=index, nan_as_null=nan_as_null ) col_dict = result[0] - index, index_from_data = result[1], index + index = result[1] columns, columns_from_data = result[2], columns col_is_multiindex = isinstance(columns, pd.MultiIndex) else: @@ -1063,7 +1063,7 @@ def _init_from_dict_like( if not data: return data, cudf.RangeIndex(0), pd.RangeIndex(0) data, index_from_data = self._align_input_series_indices( - data, nan_as_null=nan_as_null + data, index=index, nan_as_null=nan_as_null ) value_lengths = set() @@ -1125,7 +1125,7 @@ def _from_data( @staticmethod @_cudf_nvtx_annotate def _align_input_series_indices( - data: dict, nan_as_null=None + data: dict, index: cudf.Index | None, nan_as_null=None ) -> tuple[dict, None | cudf.Index]: input_series = { key: Series(val, nan_as_null=nan_as_null) @@ -1143,6 +1143,8 @@ def _align_input_series_indices( for key, aligned_series in zip( input_series.keys(), aligned_input_series ): + if index is not None: + aligned_series = aligned_series.reindex(index=index) data[key] = aligned_series return data, aligned_series.index From 9ce0a69f921b6a259917834f31952aa83fe19a94 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Dec 2023 13:30:42 -0800 Subject: [PATCH 10/22] Adjust test_series_data_with_name_with_columns_matching_align --- python/cudf/cudf/tests/test_dataframe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 8809ab33224..8089f670efd 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10619,7 +10619,9 @@ def test_series_data_with_name_with_columns_not_matching(): def test_series_data_with_name_with_columns_matching_align(): gdf = cudf.DataFrame(cudf.Series([1], name=2), columns=[1, 2]) pdf = pd.DataFrame(pd.Series([1], name=2), columns=[1, 2]) - assert_eq(gdf, pdf) + # pandas A column is NaN of object type + # cudf A column is NA of type float + assert_eq(gdf, pdf, check_dtype=False) @pytest.mark.parametrize("digits", [0, 1, 3, 4, 10]) From 5fcce39648ba94487d1a9337bd720a8ede2c1ef7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Dec 2023 14:31:53 -0800 Subject: [PATCH 11/22] add comments --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c1189c63350..569f901a56b 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5569,7 +5569,7 @@ def _from_arrays( Returns ------- - {int: Column} + {Any: Column} """ data = cupy.asarray(data) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 8089f670efd..3fd4f26f909 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10619,8 +10619,8 @@ def test_series_data_with_name_with_columns_not_matching(): def test_series_data_with_name_with_columns_matching_align(): gdf = cudf.DataFrame(cudf.Series([1], name=2), columns=[1, 2]) pdf = pd.DataFrame(pd.Series([1], name=2), columns=[1, 2]) - # pandas A column is NaN of object type - # cudf A column is NA of type float + # pandas 1 column is NaN of object type + # cudf 1 column is NA of type float assert_eq(gdf, pdf, check_dtype=False) From df93b636d5475fadb5ee76999e9269b0cbf8ed0d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 11 Dec 2023 13:34:07 -0800 Subject: [PATCH 12/22] Fix some tests and a naming bug --- python/cudf/cudf/core/dataframe.py | 6 +++--- python/cudf/cudf/tests/test_dataframe.py | 10 +++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 569f901a56b..1bc77635cd2 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -672,12 +672,12 @@ def __init__( col_dtype = None if columns is not None: - dtype = None + as_idx_typ = None if isinstance(columns, list) and len(columns) == 0: # TODO: Generically, an empty dtype-less container # TODO: Why does as_index([]) return FloatIndex - dtype = object - columns = as_index(columns, dtype=dtype) + as_idx_typ = object + columns = as_index(columns, dtype=as_idx_typ) if not isinstance( columns, MultiIndex ) and columns.nunique() != len(columns): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 3aac6b2f54d..6c6d76835a2 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10467,10 +10467,18 @@ def test_dataframe_dict_like_with_columns(columns, index): data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]} expect = pd.DataFrame(data, columns=columns, index=index) actual = cudf.DataFrame(data, columns=columns, index=index) + # TODO(pandas2.0): New NA columns will be object instead of float type + check_dtype = isinstance(columns, list) and columns == [ + "a", + "d", + "b", + "e", + "c", + ] if index is None and len(columns) == 0: # We make an empty range index, pandas makes an empty index expect = expect.reset_index(drop=True) - assert_eq(expect, actual) + assert_eq(expect, actual, check_dtype=not check_dtype) def test_dataframe_init_columns_named_multiindex(): From 77ab160e27ae07d19a91e6b35f24e3804f35ea9e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 11 Dec 2023 14:57:18 -0800 Subject: [PATCH 13/22] pass arguments through colaccessor --- python/cudf/cudf/core/dataframe.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 1bc77635cd2..346f922da71 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -756,6 +756,9 @@ def __init__( for k in columns }, level_names=level_names, + multiindex=col_is_multiindex, + rangeindex=col_is_rangeindex, + label_dtype=col_dtype, ) else: col_dict = {} @@ -853,18 +856,28 @@ def __init__( raise TypeError( f"data must be list or dict-like, not {type(data).__name__}" ) - super().__init__(col_dict, index=index) + col_accessor = ColumnAccessor( + col_dict, + multiindex=col_is_multiindex, + rangeindex=col_is_rangeindex, + label_dtype=col_dtype, + ) + super().__init__(col_accessor, index=index) self._check_data_index_length_match() if columns_from_data is not None: # TODO: This there a better way to do this? columns_from_data = as_index(columns_from_data) - col_is_rangeindex = isinstance(columns, cudf.RangeIndex) - col_is_multiindex = isinstance(columns, cudf.MultiIndex) - col_dtype = columns_from_data.dtype reindexed = self.reindex( columns=columns_from_data.to_pandas(), copy=False ) self._data = reindexed._data + self._data.rangeindex = isinstance( + columns_from_data, cudf.RangeIndex + ) + self._data.multiindex = isinstance( + columns_from_data, cudf.MultiIndex + ) + self._data.label_dtype = columns_from_data.dtype self._index = index if index_from_data is not None: # TODO: This there a better way to do this? @@ -878,12 +891,6 @@ def __init__( if dtype: self._data = self.astype(dtype)._data - self._data.rangeindex = self._data.rangeindex or col_is_rangeindex - # TODO: multiindex assignment - # test_non_string_column_name_to_arrow to fail - self._data.multiindex = self._data.multiindex or col_is_multiindex - self._data.label_dtype = self._data.label_dtype or col_dtype - @_cudf_nvtx_annotate def _init_from_series_list(self, data, columns, index): if index is None: From 4981b05db478bbefcb1a763a0937096a4724b999 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 11 Dec 2023 17:08:21 -0800 Subject: [PATCH 14/22] Remove redundant check --- python/cudf/cudf/core/dataframe.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 346f922da71..68a19788d46 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -885,8 +885,6 @@ def __init__( reindexed = self.reindex(index=index_from_data, copy=False) self._data = reindexed._data self._index = index_from_data - # TODO this one might not be needed - self._check_data_index_length_match() if dtype: self._data = self.astype(dtype)._data From 3fdeb870e39f0c5bee45705b7f2b6e92278c7554 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 11 Dec 2023 18:59:04 -0800 Subject: [PATCH 15/22] Adjust test and add another one with defined behavior --- python/cudf/cudf/core/dataframe.py | 21 +++++++++++++++++++++ python/cudf/cudf/tests/test_dataframe.py | 15 +++++++++++++-- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 68a19788d46..584dd32f420 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1080,16 +1080,26 @@ def _init_from_dict_like( result_index = index scalar_keys = [] + tuple_key_count = 0 + tuple_key_lengths = set() col_data = {} for key, value in data.items(): if is_scalar(value): scalar_keys.append(key) col_data[key] = value else: + if isinstance(key, tuple): + tuple_key_count += 1 + tuple_key_lengths.add(len(key)) column = as_column(value, nan_as_null=nan_as_null) value_lengths.add(len(column)) col_data[key] = column + if tuple_key_count not in (0, len(data)): + raise ValueError( + "All dict keys must be tuples if a tuple key exists." + ) + if len(scalar_keys) != len(data) and len(value_lengths) > 1: raise ValueError( "Found varying value lengths when all values " @@ -1110,6 +1120,17 @@ def _init_from_dict_like( col_data[key], nan_as_null=nan_as_null, length=scalar_length ) + if tuple_key_count and len(tuple_key_lengths) > 1: + # All tuple keys must be the same length + final_length = max(tuple_key_lengths) + col_data = { + old_key + if len(old_key) == final_length + else old_key + + (cudf.NA,) * (final_length - len(old_key)): column + for old_key, column in col_data.items() + } + if result_index is None: result_index = cudf.RangeIndex(scalar_length) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 6c6d76835a2..4aa07537ab2 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10230,18 +10230,24 @@ def test_dataframe_assign_scalar_to_empty_series(): "data", [ {0: [1, 2, 3], 2: [10, 11, 23]}, - {("a", "b"): [1, 2, 3], ("2",): [10, 11, 23]}, + {("a", "b"): [1, 2, 3], ("2", "3"): [10, 11, 23]}, ], ) def test_non_string_column_name_to_arrow(data): df = cudf.DataFrame(data) - expected = df.to_arrow() actual = pa.Table.from_pandas(df.to_pandas()) assert expected.equals(actual) +def test_dict_uneven_tuple_keys_fill_with_NA(): + data = ({("a", "b"): [1, 2, 3], ("2",): [10, 11, 23]},) + result = cudf.DataFrame(data) + expected = pd.DataFrame(data) + assert_eq(result, expected) + + def test_complex_types_from_arrow(): expected = pa.Table.from_arrays( [ @@ -10824,6 +10830,11 @@ def test_dataframe_series_dot(): assert_eq(expected, actual) +def test_dict_tuple_keys_must_all_be_tuple_keys(): + with pytest.raises(ValueError): + cudf.DataFrame({(1, 2): [1], 3: [2]}) + + def test_dataframe_reindex_keep_colname(): gdf = cudf.DataFrame([1], columns=cudf.Index([1], name="foo")) result = gdf.reindex(index=[0, 1]) From 03f2e7f53c1a479df73cd147993edb6184f00e68 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 12 Dec 2023 09:21:52 -0800 Subject: [PATCH 16/22] Move all new tests together, reduce diff --- python/cudf/cudf/core/indexed_frame.py | 1 - python/cudf/cudf/tests/test_dataframe.py | 75 ++++++++++++------------ 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index f72d1b0a332..0c23d6dd45b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2645,7 +2645,6 @@ def _reindex( name: ( df._data[name].copy(deep=deep) if name in df._data - # Why does this default to np.float64? else cudf.core.column.column.column_empty( dtype=dtypes.get(name, np.float64), masked=True, diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 6cb42253b4c..0b7bcb6fb43 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10285,19 +10285,13 @@ def test_dataframe_assign_scalar_to_empty_series(): ) def test_non_string_column_name_to_arrow(data): df = cudf.DataFrame(data) + expected = df.to_arrow() actual = pa.Table.from_pandas(df.to_pandas()) assert expected.equals(actual) -def test_dict_uneven_tuple_keys_fill_with_NA(): - data = ({("a", "b"): [1, 2, 3], ("2",): [10, 11, 23]},) - result = cudf.DataFrame(data) - expected = pd.DataFrame(data) - assert_eq(result, expected) - - def test_complex_types_from_arrow(): expected = pa.Table.from_arrays( [ @@ -10772,31 +10766,6 @@ def test_dataframe_from_ndarray_dup_columns(): cudf.DataFrame(np.eye(2), columns=["A", "A"]) -def test_dataframe_from_dict_only_scalar_values_raises(): - with pytest.raises(ValueError): - cudf.DataFrame({0: 3, 1: 2}) - - -@pytest.mark.parametrize("klass", [cudf.DataFrame, pd.DataFrame]) -@pytest.mark.parametrize( - "axis_kwargs, exp_data", - [ - [ - {"index": [1, 2], "columns": [1, 2]}, - np.array([[1.0, np.nan], [np.nan, np.nan]]), - ], - [{"index": [1, 2]}, np.array([[0.0, 1.0], [np.nan, np.nan]])], - [{"columns": [1, 2]}, np.array([[0.0, np.nan], [1.0, np.nan]])], - ], -) -def test_dataframe_from_frame_with_index_or_columns_reindexes( - klass, axis_kwargs, exp_data -): - result = cudf.DataFrame(klass(np.eye(2)), **axis_kwargs) - expected = cudf.DataFrame(exp_data, **axis_kwargs) - assert_eq(result, expected) - - @pytest.mark.parametrize("name", ["a", 0, None, np.nan, cudf.NA]) @pytest.mark.parametrize("contains", ["a", 0, None, np.nan, cudf.NA]) @pytest.mark.parametrize("other_names", [[], ["b", "c"], [1, 2]]) @@ -10880,11 +10849,6 @@ def test_dataframe_series_dot(): assert_eq(expected, actual) -def test_dict_tuple_keys_must_all_be_tuple_keys(): - with pytest.raises(ValueError): - cudf.DataFrame({(1, 2): [1], 3: [2]}) - - def test_dataframe_reindex_keep_colname(): gdf = cudf.DataFrame([1], columns=cudf.Index([1], name="foo")) result = gdf.reindex(index=[0, 1]) @@ -10906,6 +10870,43 @@ def test_dataframe_duplicate_index_reindex(): ) +def test_dict_uneven_tuple_keys_fill_with_NA(): + data = ({("a", "b"): [1, 2, 3], ("2",): [10, 11, 23]},) + result = cudf.DataFrame(data) + expected = pd.DataFrame(data) + assert_eq(result, expected) + + +def test_dataframe_from_dict_only_scalar_values_raises(): + with pytest.raises(ValueError): + cudf.DataFrame({0: 3, 1: 2}) + + +@pytest.mark.parametrize("klass", [cudf.DataFrame, pd.DataFrame]) +@pytest.mark.parametrize( + "axis_kwargs, exp_data", + [ + [ + {"index": [1, 2], "columns": [1, 2]}, + np.array([[1.0, np.nan], [np.nan, np.nan]]), + ], + [{"index": [1, 2]}, np.array([[0.0, 1.0], [np.nan, np.nan]])], + [{"columns": [1, 2]}, np.array([[0.0, np.nan], [1.0, np.nan]])], + ], +) +def test_dataframe_from_frame_with_index_or_columns_reindexes( + klass, axis_kwargs, exp_data +): + result = cudf.DataFrame(klass(np.eye(2)), **axis_kwargs) + expected = cudf.DataFrame(exp_data, **axis_kwargs) + assert_eq(result, expected) + + +def test_dict_tuple_keys_must_all_be_tuple_keys(): + with pytest.raises(ValueError): + cudf.DataFrame({(1, 2): [1], 3: [2]}) + + def test_dataframe_reindex_doesnt_remove_column_name(): gdf = cudf.DataFrame([1], columns=pd.Index(["a"], name="foo")) result = gdf.reindex(index=pd.Index([0, 1])) From ad81d4b73c2be84b026190049acea9270a130021 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 12 Dec 2023 09:26:06 -0800 Subject: [PATCH 17/22] Remove redundant test --- python/cudf/cudf/tests/test_dataframe.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 0b7bcb6fb43..b213a9c6c4a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10905,12 +10905,3 @@ def test_dataframe_from_frame_with_index_or_columns_reindexes( def test_dict_tuple_keys_must_all_be_tuple_keys(): with pytest.raises(ValueError): cudf.DataFrame({(1, 2): [1], 3: [2]}) - - -def test_dataframe_reindex_doesnt_remove_column_name(): - gdf = cudf.DataFrame([1], columns=pd.Index(["a"], name="foo")) - result = gdf.reindex(index=pd.Index([0, 1])) - expected = cudf.DataFrame( - [1, None], columns=pd.Index(["a"], name="foo"), index=pd.Index([0, 1]) - ) - assert_eq(result, expected) From baeaa87b465be44ac814a55f72cc4393ab6b36d3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 14 Dec 2023 15:32:23 -0800 Subject: [PATCH 18/22] Ensure columns are maintained in slicing --- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/indexed_frame.py | 16 +++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fe48997edfd..7b39cc6e6e7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7167,7 +7167,7 @@ def append( >>> df = cudf.DataFrame(columns=['A']) >>> for i in range(5): - ... df = df.append({'A': i}, ignore_index=True) + ... df = df.append({'A': [i]}, ignore_index=True) >>> df A 0 0 diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index a2f3db681ec..b0b45e38d3b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1863,11 +1863,17 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: start, stop, stride = arg.indices(num_rows) index = self.index has_range_index = isinstance(index, RangeIndex) + col_was_multiindex = self._data.multiindex + col_was_rangeindex = self._data.rangeindex + col_label_dtype = self._data.label_dtype if len(range(start, stop, stride)) == 0: # Avoid materialising the range index column result = self._empty_like( keep_index=keep_index and not has_range_index ) + result._data.rangeindex = col_was_rangeindex + result._data.multiindex = col_was_multiindex + result._data.label_dtype = col_label_dtype if keep_index and has_range_index: lo = index.start + start * index.step hi = index.start + stop * index.step @@ -1896,7 +1902,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: stop = min(stop, num_rows) if stride != 1: - return self._gather( + result = self._gather( GatherMap.from_column_unchecked( cudf.core.column.arange( start, @@ -1909,6 +1915,10 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: ), keep_index=keep_index, ) + result._data.rangeindex = col_was_rangeindex + result._data.multiindex = col_was_multiindex + result._data.label_dtype = col_label_dtype + return result columns_to_slice = [ *( @@ -1924,6 +1934,10 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: None if has_range_index or not keep_index else self._index.names, ) + result._data.rangeindex = col_was_rangeindex + result._data.multiindex = col_was_multiindex + result._data.label_dtype = col_label_dtype + if keep_index and has_range_index: result.index = self.index[start:stop] return result From 645cc3368a123bb64daa3295d2523d62341052e3 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 14 Dec 2023 16:34:00 -0800 Subject: [PATCH 19/22] Fix .columns usage, fix for pandas 2.0 in concat --- python/cudf/cudf/core/dataframe.py | 6 ++++-- python/cudf/cudf/tests/test_concat.py | 9 +++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7b39cc6e6e7..c0d43790602 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -702,7 +702,7 @@ def __init__( data = self.from_pandas(data, nan_as_null=nan_as_null) col_dict = data._data index, index_from_data = data.index, index - columns, columns_from_data = data.columns, columns + columns, columns_from_data = data._data.to_pandas_index(), columns elif isinstance(data, (cudf.Series, pd.Series)): if isinstance(data, pd.Series): data = cudf.Series.from_pandas(data, nan_as_null=nan_as_null) @@ -6518,7 +6518,9 @@ def select_dtypes(self, include=None, exclude=None): if infered_type in inclusion: df._insert(len(df._data), k, col) else: - df.columns = df.columns[:0] + df._data.rangeindex = self._data.rangeindex + df._data.multiindex = self._data.multiindex + df._data.label_dtype = self._data.label_dtype return df diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index df743a96759..1fb3bc08413 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -8,7 +8,7 @@ import cudf as gd from cudf.api.types import is_categorical_dtype -from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140 +from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200, PANDAS_LT_140 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype from cudf.testing._utils import assert_eq, assert_exceptions_equal @@ -596,7 +596,12 @@ def test_concat_empty_dataframes(df, other, ignore_index): actual[key] = col.fillna(-1) assert_eq(expected, actual, check_dtype=False, check_index_type=True) else: - assert_eq(expected, actual, check_index_type=not gdf.empty) + assert_eq( + expected, + actual, + check_index_type=not gdf.empty, + check_column_type=PANDAS_GE_200, + ) @pytest.mark.parametrize("ignore_index", [True, False]) From d1ce06b28d1d17629b33db6f50b32982ed208032 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 19 Dec 2023 15:59:28 -0800 Subject: [PATCH 20/22] Address test failures --- python/cudf/cudf/core/dataframe.py | 2 +- .../cudf/cudf/tests/test_avro_reader_fastavro_integration.py | 2 +- python/cudf/cudf/tests/test_groupby.py | 4 ++-- python/cudf/cudf/tests/test_orc.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index baaeea35305..f4e71f43d18 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6251,7 +6251,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True): ] if len(mode_results) == 0: - return DataFrame(columns=self.columns[:0]) + return data_df.head(0) df = cudf.concat(mode_results, axis=1) if isinstance(df, Series): diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index 2272231fec1..9a3d3af3fd8 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -209,7 +209,7 @@ def test_can_parse_no_schema(): schema_root = None records = [] actual = cudf_from_avro_util(schema_root, records) - expected = cudf.DataFrame() + expected = cudf.DataFrame(columns=cudf.Index([], dtype="object")) assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 57aa6e72eae..3853b4aee12 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -3406,8 +3406,8 @@ def test_head_tail_empty(): # GH #13397 values = [1, 2, 3] - pdf = pd.DataFrame({}, index=values) - df = cudf.DataFrame({}, index=values) + pdf = pd.DataFrame(index=values) + df = cudf.DataFrame(index=values) expected = pdf.groupby(pd.Series(values)).head() got = df.groupby(cudf.Series(values)).head() diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 7407da9c4ac..67588e58fc0 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -885,7 +885,7 @@ def test_nanoseconds_overflow(): def test_empty_dataframe(): buffer = BytesIO() - expected = cudf.DataFrame() + expected = cudf.DataFrame(columns=cudf.Index([], dtype="object")) expected.to_orc(buffer) # Raise error if column name is mentioned, but it doesn't exist. From c62aaa6e5304df50541af4824b59dbf8b8e08fc0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 19 Dec 2023 18:02:48 -0800 Subject: [PATCH 21/22] Fix mode --- python/cudf/cudf/core/dataframe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f4e71f43d18..fc0ef017519 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6251,7 +6251,9 @@ def mode(self, axis=0, numeric_only=False, dropna=True): ] if len(mode_results) == 0: - return data_df.head(0) + result = data_df.head(0) + result.index = cudf.RangeIndex(0) + return result df = cudf.concat(mode_results, axis=1) if isinstance(df, Series): From 498fc756994b769a5a5fd1ee4545adfc645979b4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 3 Jan 2024 17:36:13 -0800 Subject: [PATCH 22/22] Allow columns to not be an index --- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/dataframe.py | 27 +++++++++++-------- python/cudf/cudf/core/indexed_frame.py | 2 +- .../test_avro_reader_fastavro_integration.py | 2 +- python/cudf/cudf/tests/test_concat.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 2 +- python/cudf/cudf/tests/test_groupby.py | 2 +- python/cudf/cudf/tests/test_orc.py | 2 +- 8 files changed, 23 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 64f9c25a9f0..47d8bad7cb1 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fc0ef017519..62bfdfc7922 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from __future__ import annotations @@ -675,16 +675,21 @@ def __init__( # TODO: Generically, an empty dtype-less container # TODO: Why does as_index([]) return FloatIndex as_idx_typ = object - columns = as_index(columns, dtype=as_idx_typ) - if not isinstance( - columns, MultiIndex - ) and columns.nunique() != len(columns): - raise ValueError("Columns cannot contain duplicate values") - columns = columns.to_pandas() - col_is_rangeindex = isinstance(columns, pd.RangeIndex) - col_is_multiindex = isinstance(columns, pd.MultiIndex) - if not isinstance(columns, pd.MultiIndex): - col_dtype = columns.dtype + try: + columns = as_index(columns, dtype=as_idx_typ) + except pa.lib.ArrowInvalid: + # mixed typed elements are allowed e.g. [(1, 2), "a"] + columns = list(columns) + else: + if not isinstance( + columns, MultiIndex + ) and columns.nunique() != len(columns): + raise ValueError("Columns cannot contain duplicate values") + columns = columns.to_pandas() + col_is_rangeindex = isinstance(columns, pd.RangeIndex) + col_is_multiindex = isinstance(columns, pd.MultiIndex) + if not isinstance(columns, pd.MultiIndex): + col_dtype = columns.dtype if index is not None: index = as_index(index) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 8d7d396f57d..f7c3e180fc5 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. """Base class for Frame types that have an index.""" from __future__ import annotations diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index 9a3d3af3fd8..2711926ae12 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2023, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 1fb3bc08413..d393e9f81cb 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. from decimal import Decimal diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index c2f105cdc34..dc82b1c1f3f 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import array as arr import contextlib diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 3853b4aee12..efb2dda12e4 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2023, NVIDIA CORPORATION. +# Copyright (c) 2018-2024, NVIDIA CORPORATION. import collections import datetime diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index 67588e58fc0..cafcb347d52 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2024, NVIDIA CORPORATION. import datetime import decimal