From d08fba7adabc846358d5c14f72867506b16b3f25 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 27 Nov 2023 19:06:30 -0800
Subject: [PATCH 01/22] Start refactoring DataFrame init

---
 python/cudf/cudf/core/dataframe.py       | 375 ++++++++---------------
 python/cudf/cudf/tests/test_dataframe.py |   5 +
 2 files changed, 127 insertions(+), 253 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 785f3d98712..47ac856ef86 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -666,38 +666,26 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin):
     def __init__(
         self, data=None, index=None, columns=None, dtype=None, nan_as_null=True
     ):
-        super().__init__()
+        if columns is not None:
+            columns = as_index(columns).to_pandas()
 
-        if isinstance(columns, (Series, cudf.BaseIndex)):
-            columns = columns.to_pandas()
+        if index is not None:
+            index = as_index(index)
+
+        if data is None:
+            data = []
+        elif isinstance(data, Iterator) and not isinstance(data, str):
+            data = list(data)
+
+        index_from_data = None
+        columns_from_data = None
 
         if isinstance(data, (DataFrame, pd.DataFrame)):
             if isinstance(data, pd.DataFrame):
                 data = self.from_pandas(data, nan_as_null=nan_as_null)
-
-            if index is not None:
-                if not data.index.equals(index):
-                    data = data.reindex(index)
-                    index = data._index
-                else:
-                    index = as_index(index)
-            else:
-                index = data._index
-
-            self._index = index
-
-            if columns is not None:
-                self._data = data._data
-                self._reindex(
-                    column_names=columns, index=index, deep=False, inplace=True
-                )
-                if isinstance(
-                    columns, (range, pd.RangeIndex, cudf.RangeIndex)
-                ):
-                    self._data.rangeindex = True
-            else:
-                self._data = data._data
-                self._data.rangeindex = True
+            col_dict = data._data
+            index_from_data = data.index
+            columns_from_data = data.columns
         elif isinstance(data, (cudf.Series, pd.Series)):
             if isinstance(data, pd.Series):
                 data = cudf.Series.from_pandas(data, nan_as_null=nan_as_null)
@@ -719,35 +707,8 @@ def __init__(
                 name = columns[0]
             else:
                 name = data.name or 0
-            self._init_from_dict_like(
-                {name: data},
-                index=index,
-                columns=columns,
-                nan_as_null=nan_as_null,
-            )
-        elif data is None:
-            if index is None:
-                self._index = RangeIndex(0)
-            else:
-                self._index = as_index(index)
-            if columns is not None:
-                rangeindex = isinstance(
-                    columns, (range, pd.RangeIndex, cudf.RangeIndex)
-                )
-                label_dtype = getattr(columns, "dtype", None)
-                self._data = ColumnAccessor(
-                    {
-                        k: column.column_empty(
-                            len(self), dtype="object", masked=True
-                        )
-                        for k in columns
-                    },
-                    level_names=tuple(columns.names)
-                    if isinstance(columns, pd.Index)
-                    else None,
-                    rangeindex=rangeindex,
-                    label_dtype=label_dtype,
-                )
+            col_dict = {name: data._column}
+            index_from_data = data.index
         elif isinstance(data, ColumnAccessor):
             raise TypeError(
                 "Use cudf.Series._from_data for constructing a Series from "
@@ -759,69 +720,76 @@ def __init__(
             # descr is an optional field of the _cuda_ary_iface_
             if "descr" in arr_interface:
                 if len(arr_interface["descr"]) == 1:
-                    new_df = self._from_arrays(
+                    col_dict = self._from_arrays(
                         data, index=index, columns=columns
                     )
                 else:
-                    new_df = self.from_records(
+                    col_dict = self.from_records(
                         data, index=index, columns=columns
-                    )
+                    )._data
             else:
-                new_df = self._from_arrays(data, index=index, columns=columns)
+                col_dict = self._from_arrays(
+                    data, index=index, columns=columns
+                )
 
-            self._data = new_df._data
-            self._index = new_df._index
-            self._check_data_index_length_match()
         elif hasattr(data, "__array_interface__"):
             arr_interface = data.__array_interface__
             if len(arr_interface["descr"]) == 1:
                 # not record arrays
-                new_df = self._from_arrays(data, index=index, columns=columns)
+                col_dict = self._from_arrays(
+                    data, index=index, columns=columns
+                )
             else:
-                new_df = self.from_records(data, index=index, columns=columns)
-            self._data = new_df._data
-            self._index = new_df._index
-            self._check_data_index_length_match()
-        else:
-            if isinstance(data, Iterator):
-                data = list(data)
-            if is_list_like(data):
-                if len(data) > 0 and is_scalar(data[0]):
-                    if columns is not None:
-                        data = dict(zip(columns, [data]))
-                        rangeindex = isinstance(
-                            columns, (range, pd.RangeIndex, cudf.RangeIndex)
-                        )
-                    else:
-                        data = dict(enumerate([data]))
-                        rangeindex = True
-                    new_df = DataFrame(data=data, index=index)
-
-                    self._data = new_df._data
-                    self._index = new_df._index
-                    self._data._level_names = (
-                        tuple(columns.names)
-                        if isinstance(columns, pd.Index)
-                        else self._data._level_names
-                    )
-                    self._data.rangeindex = rangeindex
-                elif len(data) > 0 and isinstance(data[0], Series):
-                    self._init_from_series_list(
-                        data=data, columns=columns, index=index
+                col_dict = self.from_records(
+                    data, index=index, columns=columns
+                )._data
+        elif is_scalar(data):
+            if index is None or columns is None:
+                raise ValueError("DataFrame constructor not properly called!")
+            col_dict = {
+                col_label: as_column(
+                    data, nan_as_null=nan_as_null, length=len(index)
+                )
+                for col_label in columns
+            }
+        elif is_list_like(data):
+            if len(data) > 0 and is_scalar(data[0]):
+                if columns is not None:
+                    data = dict(zip(columns, [data]))
+                    rangeindex = isinstance(
+                        columns, (range, pd.RangeIndex, cudf.RangeIndex)
                     )
                 else:
-                    self._init_from_list_like(
-                        data, index=index, columns=columns
-                    )
-                self._check_data_index_length_match()
+                    data = dict(enumerate([data]))
+                    rangeindex = True
+                new_df = DataFrame(data=data, index=index)
+
+                self._data = new_df._data
+                self._index = new_df._index
+                self._data._level_names = (
+                    tuple(columns.names)
+                    if isinstance(columns, pd.Index)
+                    else self._data._level_names
+                )
+                self._data.rangeindex = rangeindex
+            elif len(data) > 0 and isinstance(data[0], Series):
+                self._init_from_series_list(
+                    data=data, columns=columns, index=index
+                )
             else:
-                if not is_dict_like(data):
-                    raise TypeError("data must be list or dict-like")
+                self._init_from_list_like(data, index=index, columns=columns)
+            self._check_data_index_length_match()
+        elif is_dict_like(data):
+            col_dict, index_from_data = self._init_from_dict_like(
+                data, nan_as_null=nan_as_null
+            )
+        else:
+            raise TypeError(
+                f"data must be list or dict-like, not {type(data).__name__}"
+            )
 
-                self._init_from_dict_like(
-                    data, index=index, columns=columns, nan_as_null=nan_as_null
-                )
-                self._check_data_index_length_match()
+        super().__init__(col_dict, index=index)
+        self._check_data_index_length_match()
 
         if dtype:
             self._data = self.astype(dtype)._data
@@ -1001,80 +969,18 @@ def _init_from_list_like(self, data, index=None, columns=None):
 
     @_cudf_nvtx_annotate
     def _init_from_dict_like(
-        self, data, index=None, columns=None, nan_as_null=None
-    ):
-        label_dtype = None
-        if columns is not None:
-            label_dtype = getattr(columns, "dtype", None)
-            # remove all entries in data that are not in columns,
-            # inserting new empty columns for entries in columns that
-            # are not in data
-            if any(c in data for c in columns):
-                # Let the downstream logic determine the length of the
-                # empty columns here
-                empty_column = lambda: None  # noqa: E731
-            else:
-                # If keys is empty, none of the data keys match the
-                # columns, so we need to create an empty DataFrame. To
-                # match pandas, the size of the dataframe must match
-                # the provided index, so we need to return a masked
-                # array of nulls if an index is given.
-                empty_column = functools.partial(
-                    cudf.core.column.column_empty,
-                    row_count=(0 if index is None else len(index)),
-                    dtype=None,
-                    masked=index is not None,
-                )
-
-            data = {
-                c: data[c] if c in data else empty_column() for c in columns
-            }
-
-        data, index = self._align_input_series_indices(data, index=index)
-
-        if index is None:
-            num_rows = 0
-            if data:
-                keys, values, lengths = zip(
-                    *(
-                        (k, v, 1)
-                        if is_scalar(v)
-                        else (
-                            k,
-                            vc := as_column(v, nan_as_null=nan_as_null),
-                            len(vc),
-                        )
-                        for k, v in data.items()
-                    )
-                )
-                data = dict(zip(keys, values))
-                try:
-                    (num_rows,) = (set(lengths) - {1}) or {1}
-                except ValueError:
-                    raise ValueError("All arrays must be the same length")
-
-            self._index = RangeIndex(0, num_rows)
-        else:
-            self._index = as_index(index)
-
-        if len(data):
-            self._data.multiindex = True
-            for i, col_name in enumerate(data):
-                self._data.multiindex = self._data.multiindex and isinstance(
-                    col_name, tuple
-                )
-                self._insert(
-                    i,
-                    col_name,
-                    data[col_name],
-                    nan_as_null=nan_as_null,
-                )
-        self._data._level_names = (
-            tuple(columns.names)
-            if isinstance(columns, pd.Index)
-            else self._data._level_names
+        self, data: dict, nan_as_null: bool | None = None
+    ) -> tuple[dict, None | cudf.Index]:
+        if not data:
+            return data, None
+        data, index_from_data, value_length = self._align_input_series_indices(
+            data, nan_as_null=nan_as_null
         )
-        self._data.label_dtype = label_dtype
+        col_data = {
+            key: as_column(value, nan_as_null=nan_as_null, length=value_length)
+            for key, value in data.items()
+        }
+        return col_data, index_from_data
 
     @classmethod
     def _from_data(
@@ -1090,33 +996,33 @@ def _from_data(
 
     @staticmethod
     @_cudf_nvtx_annotate
-    def _align_input_series_indices(data, index):
+    def _align_input_series_indices(
+        data: dict, nan_as_null: bool | None = None
+    ) -> tuple[dict, None | cudf.Index, int]:
+        input_series = {}
+        value_lengths: set[int] = set()
+        for key, val in data.items():
+            if isinstance(val, (pd.Series, Series, dict)):
+                val = Series(val, nan_as_null=nan_as_null)
+                input_series[key] = val
+            if not is_scalar(val):
+                value_lengths.add(len(val))
+        if len(value_lengths) > 1:
+            raise ValueError(f"Found varying data lengths: {value_lengths}")
+
+        if not input_series:
+            return data, None, value_lengths.pop()
+
+        aligned_input_series = cudf.core.series._align_indices(
+            list(input_series.values())
+        )
+        index = aligned_input_series[0].index
         data = data.copy()
-
-        input_series = [
-            Series(val)
-            for val in data.values()
-            if isinstance(val, (pd.Series, Series, dict))
-        ]
-
-        if input_series:
-            if index is not None:
-                aligned_input_series = [
-                    sr._align_to_index(index, how="right", sort=False)
-                    for sr in input_series
-                ]
-
-            else:
-                aligned_input_series = cudf.core.series._align_indices(
-                    input_series
-                )
-                index = aligned_input_series[0].index
-
-            for name, val in data.items():
-                if isinstance(val, (pd.Series, Series, dict)):
-                    data[name] = aligned_input_series.pop(0)
-
-        return data, index
+        for key, aligned_series in zip(
+            input_series.keys(), aligned_input_series
+        ):
+            data[key] = aligned_series
+        return data, index, value_lengths.pop()
 
     # The `constructor*` properties are used by `dask` (and `dask_cudf`)
     @property
@@ -5531,70 +5437,33 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):
 
     @classmethod
     @_cudf_nvtx_annotate
-    def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
-        """Convert a numpy/cupy array to DataFrame.
+    def _from_arrays(cls, data, nan_as_null=False) -> dict[int, ColumnBase]:
+        """Convert a numpy/cupy array to a dict of columns.
 
         Parameters
         ----------
         data : numpy/cupy array of ndim 1 or 2,
-            dimensions greater than 2 are not supported yet.
-        index : Index or array-like
-            Index to use for resulting frame. Will default to
-            RangeIndex if no indexing information part of input data and
-            no index provided.
-        columns : list of str
-            List of column names to include.
+            dimensions greater than 2 are not supported.
+        nan_as_null : bool
+            whether the NaN should represent NA
 
         Returns
         -------
-        DataFrame
+        {int: Column}
         """
 
         data = cupy.asarray(data)
-        if data.ndim != 1 and data.ndim != 2:
+        if data.ndim not in (1, 2):
             raise ValueError(
                 f"records dimension expected 1 or 2 but found: {data.ndim}"
             )
 
-        if data.ndim == 2:
-            num_cols = data.shape[1]
-        else:
-            # Since we validate ndim to be either 1 or 2 above,
-            # this case can be assumed to be ndim == 1.
-            num_cols = 1
-
-        if columns is None:
-            names = range(num_cols)
-        else:
-            if len(columns) != num_cols:
-                raise ValueError(
-                    f"columns length expected {num_cols} but "
-                    f"found {len(columns)}"
-                )
-            elif len(columns) != len(set(columns)):
-                raise ValueError("Duplicate column names are not allowed")
-            names = columns
-
-        df = cls()
-        if data.ndim == 2:
-            for i, k in enumerate(names):
-                df._data[k] = column.as_column(
-                    data[:, i], nan_as_null=nan_as_null
-                )
-        elif data.ndim == 1:
-            df._data[names[0]] = column.as_column(
-                data, nan_as_null=nan_as_null
-            )
-        if isinstance(columns, pd.Index):
-            df._data._level_names = tuple(columns.names)
-        if isinstance(columns, (range, pd.RangeIndex, cudf.RangeIndex)):
-            df._data.rangeindex = True
-
-        if index is None:
-            df._index = RangeIndex(start=0, stop=len(data))
-        else:
-            df._index = as_index(index)
-        return df
+        if data.ndim == 1:
+            data = data.reshape(1, len(data))
+        return {
+            i: column.as_column(data[:, i], nan_as_null=nan_as_null)
+            for i in range(data.shape[1])
+        }
 
     @_cudf_nvtx_annotate
     def interpolate(
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 97c89217f9f..f79cc7ed875 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10653,6 +10653,11 @@ def test_dataframe_from_ndarray_dup_columns():
         cudf.DataFrame(np.eye(2), columns=["A", "A"])
 
 
+def test_dataframe_from_dict_only_scalar_values_raises():
+    with pytest.raises(ValueError):
+        cudf.DataFrame({0: 3, 1: 2})
+
+
 @pytest.mark.parametrize("name", ["a", 0, None, np.nan, cudf.NA])
 @pytest.mark.parametrize("contains", ["a", 0, None, np.nan, cudf.NA])
 @pytest.mark.parametrize("other_names", [[], ["b", "c"], [1, 2]])

From 09690654f7c77a9225aef03c7d1716be7cb638cf Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 28 Nov 2023 13:44:37 -0800
Subject: [PATCH 02/22] Add dataframe reindexing tests, refactor logic

---
 python/cudf/cudf/core/dataframe.py       | 50 +++++++++++++++++-------
 python/cudf/cudf/tests/test_dataframe.py | 20 ++++++++++
 2 files changed, 55 insertions(+), 15 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index d90fba8ab26..7f683b99329 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -684,8 +684,8 @@ def __init__(
             if isinstance(data, pd.DataFrame):
                 data = self.from_pandas(data, nan_as_null=nan_as_null)
             col_dict = data._data
-            index_from_data = data.index
-            columns_from_data = data.columns
+            index, index_from_data = data.index, index
+            columns, columns_from_data = data.columns, columns
         elif isinstance(data, (cudf.Series, pd.Series)):
             if isinstance(data, pd.Series):
                 data = cudf.Series.from_pandas(data, nan_as_null=nan_as_null)
@@ -698,17 +698,21 @@ def __init__(
             #   -> return 1 column DataFrame
             # Series.name is None and columns
             #   -> return 1 column DataFrame if len(columns) in {0, 1}
-            if data.name is None and columns is not None:
-                if len(columns) > 1:
-                    raise ValueError(
-                        "Length of columns must be less than 2 if "
-                        f"{type(data).__name__}.name is None."
-                    )
-                name = columns[0]
+            if data.name is None:
+                if columns is not None:
+                    if len(columns) > 1:
+                        raise ValueError(
+                            "Length of columns must be less than 2 if "
+                            f"{type(data).__name__}.name is None."
+                        )
+                    name = columns[0]
+                else:
+                    name = 0
             else:
-                name = data.name or 0
+                name = data.name
+                columns, columns_from_data = pd.Index([data.name]), columns
             col_dict = {name: data._column}
-            index_from_data = data.index
+            index, index_from_data = data.index, index
         elif isinstance(data, ColumnAccessor):
             raise TypeError(
                 "Use cudf.Series._from_data for constructing a Series from "
@@ -752,7 +756,12 @@ def __init__(
                 )
                 for col_label in columns
             }
+        elif is_dict_like(data):
+            result = self._init_from_dict_like(data, nan_as_null=nan_as_null)
+            col_dict = result[0]
+            index, index_from_data = result[1], index
         elif is_list_like(data):
+            super().__init__()
             if len(data) > 0 and is_scalar(data[0]):
                 if columns is not None:
                     data = dict(zip(columns, [data]))
@@ -779,16 +788,27 @@ def __init__(
             else:
                 self._init_from_list_like(data, index=index, columns=columns)
             self._check_data_index_length_match()
-        elif is_dict_like(data):
-            col_dict, index_from_data = self._init_from_dict_like(
-                data, nan_as_null=nan_as_null
-            )
+            return
         else:
             raise TypeError(
                 f"data must be list or dict-like, not {type(data).__name__}"
             )
 
         super().__init__(col_dict, index=index)
+        if columns_from_data is not None:
+            # TODO: This there a better way to do this?
+            columns_from_data = as_index(columns_from_data)
+            reindexed = self.reindex(
+                columns=columns_from_data.to_pandas(), copy=False
+            )
+            self._data = reindexed._data
+            self._index = index
+        if index_from_data is not None:
+            # TODO: This there a better way to do this?
+            index_from_data = as_index(index_from_data)
+            reindexed = self.reindex(index=index_from_data, copy=False)
+            self._data = reindexed._data
+            self._index = index_from_data
         self._check_data_index_length_match()
 
         if dtype:
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index f79cc7ed875..65b874fe85a 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10658,6 +10658,26 @@ def test_dataframe_from_dict_only_scalar_values_raises():
         cudf.DataFrame({0: 3, 1: 2})
 
 
+@pytest.mark.parametrize("klass", [cudf.DataFrame, pd.DataFrame])
+@pytest.mark.parametrize(
+    "axis_kwargs, exp_data",
+    [
+        [
+            {"index": [1, 2], "columns": [1, 2]},
+            np.array([[1.0, np.nan], [np.nan, np.nan]]),
+        ],
+        [{"index": [1, 2]}, np.array([[0.0, 1.0], [np.nan, np.nan]])],
+        [{"columns": [1, 2]}, np.array([[0.0, np.nan], [1.0, np.nan]])],
+    ],
+)
+def test_dataframe_from_frame_with_index_or_columns_reindexes(
+    klass, axis_kwargs, exp_data
+):
+    result = cudf.DataFrame(klass(np.eye(2)), **axis_kwargs)
+    expected = cudf.DataFrame(exp_data, **axis_kwargs)
+    assert_eq(result, expected)
+
+
 @pytest.mark.parametrize("name", ["a", 0, None, np.nan, cudf.NA])
 @pytest.mark.parametrize("contains", ["a", 0, None, np.nan, cudf.NA])
 @pytest.mark.parametrize("other_names", [[], ["b", "c"], [1, 2]])

From 2fa5f3a5a9c6b5a3a959ad97ae610fc67740c681 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 28 Nov 2023 19:08:24 -0800
Subject: [PATCH 03/22] Fix more logic

---
 python/cudf/cudf/core/dataframe.py | 126 ++++++++++++++++++++---------
 1 file changed, 86 insertions(+), 40 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 7f683b99329..a0f2ab98f99 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -666,8 +666,16 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin):
     def __init__(
         self, data=None, index=None, columns=None, dtype=None, nan_as_null=True
     ):
+        col_is_rangeindex = False
+        col_is_multiindex = False
+
         if columns is not None:
-            columns = as_index(columns).to_pandas()
+            columns = as_index(columns)
+            if columns.nunique() != len(columns):
+                raise ValueError("Columns cannot contain duplicate values")
+            columns = columns.to_pandas()
+            col_is_rangeindex = isinstance(columns, pd.RangeIndex)
+            col_is_multiindex = isinstance(columns, pd.MultiIndex)
 
         if index is not None:
             index = as_index(index)
@@ -708,10 +716,15 @@ def __init__(
                     name = columns[0]
                 else:
                     name = 0
+                    col_is_rangeindex = True
+                col_dict = {name: data._column}
             else:
-                name = data.name
-                columns, columns_from_data = pd.Index([data.name]), columns
-            col_dict = {name: data._column}
+                if columns is not None and not columns.isin([data.name]).any():
+                    data = data.copy()[:0]
+                    col_dict = {col: data._column for col in columns}
+                else:
+                    col_dict = {data.name: data._column}
+                    columns, columns_from_data = pd.Index([data.name]), columns
             index, index_from_data = data.index, index
         elif isinstance(data, ColumnAccessor):
             raise TypeError(
@@ -725,7 +738,7 @@ def __init__(
             if "descr" in arr_interface:
                 if len(arr_interface["descr"]) == 1:
                     col_dict = self._from_arrays(
-                        data, index=index, columns=columns
+                        data, columns=columns, nan_as_null=nan_as_null
                     )
                 else:
                     col_dict = self.from_records(
@@ -733,33 +746,32 @@ def __init__(
                     )._data
             else:
                 col_dict = self._from_arrays(
-                    data, index=index, columns=columns
+                    data, columns=columns, nan_as_null=nan_as_null
                 )
-
+            index, index_from_data = RangeIndex(data.shape[0]), index
         elif hasattr(data, "__array_interface__"):
             arr_interface = data.__array_interface__
             if len(arr_interface["descr"]) == 1:
                 # not record arrays
                 col_dict = self._from_arrays(
-                    data, index=index, columns=columns
+                    data, columns=columns, nan_as_null=nan_as_null
                 )
             else:
                 col_dict = self.from_records(
                     data, index=index, columns=columns
                 )._data
+            index, index_from_data = RangeIndex(data.shape[0]), index
         elif is_scalar(data):
             if index is None or columns is None:
-                raise ValueError("DataFrame constructor not properly called!")
+                raise ValueError(
+                    "Must provide an index and columns if data is a scalar."
+                )
             col_dict = {
                 col_label: as_column(
                     data, nan_as_null=nan_as_null, length=len(index)
                 )
                 for col_label in columns
             }
-        elif is_dict_like(data):
-            result = self._init_from_dict_like(data, nan_as_null=nan_as_null)
-            col_dict = result[0]
-            index, index_from_data = result[1], index
         elif is_list_like(data):
             super().__init__()
             if len(data) > 0 and is_scalar(data[0]):
@@ -789,6 +801,12 @@ def __init__(
                 self._init_from_list_like(data, index=index, columns=columns)
             self._check_data_index_length_match()
             return
+        elif is_dict_like(data):
+            result = self._init_from_dict_like(
+                data, index, nan_as_null=nan_as_null
+            )
+            col_dict = result[0]
+            index, index_from_data = result[1], index
         else:
             raise TypeError(
                 f"data must be list or dict-like, not {type(data).__name__}"
@@ -798,6 +816,8 @@ def __init__(
         if columns_from_data is not None:
             # TODO: This there a better way to do this?
             columns_from_data = as_index(columns_from_data)
+            col_is_rangeindex = isinstance(columns, cudf.RangeIndex)
+            col_is_multiindex = isinstance(columns, cudf.MultiIndex)
             reindexed = self.reindex(
                 columns=columns_from_data.to_pandas(), copy=False
             )
@@ -814,9 +834,8 @@ def __init__(
         if dtype:
             self._data = self.astype(dtype)._data
 
-        self._data.multiindex = self._data.multiindex or isinstance(
-            columns, pd.MultiIndex
-        )
+        self._data.rangeindex = self._data.rangeindex or col_is_rangeindex
+        self._data.multiindex = self._data.multiindex or col_is_multiindex
 
     @_cudf_nvtx_annotate
     def _init_from_series_list(self, data, columns, index):
@@ -989,13 +1008,33 @@ def _init_from_list_like(self, data, index=None, columns=None):
 
     @_cudf_nvtx_annotate
     def _init_from_dict_like(
-        self, data: dict, nan_as_null: bool | None = None
-    ) -> tuple[dict, None | cudf.Index]:
+        self, data: dict, index: None | cudf.Index, nan_as_null=None
+    ) -> tuple[dict, cudf.Index]:
         if not data:
-            return data, None
-        data, index_from_data, value_length = self._align_input_series_indices(
-            data, nan_as_null=nan_as_null
-        )
+            return data, cudf.RangeIndex(0)
+        data, index_from_data = self._align_input_series_indices(data)
+
+        value_lengths = set()
+        if index_from_data is not None:
+            value_lengths.add(len(index_from_data))
+
+        scalar_keys = []
+        col_data = {}
+        for key, value in data:
+            if is_scalar(value):
+                scalar_keys.append(key)
+                col_data[key] = value
+            else:
+                value_lengths.add(len(value))
+                col_data[key] = as_column(value, nan_as_null=nan_as_null)
+
+        if len(scalar_keys) != len(data) and len(value_lengths) > 1:
+            raise ValueError(
+                "Found varying value lengths when all values "
+                f"must have the same length: {value_lengths}"
+            )
+        # TODO: If all scalars, use index length
+
         col_data = {
             key: as_column(value, nan_as_null=nan_as_null, length=value_length)
             for key, value in data.items()
@@ -1017,32 +1056,27 @@ def _from_data(
     @staticmethod
     @_cudf_nvtx_annotate
     def _align_input_series_indices(
-        data: dict, nan_as_null: bool | None = None
-    ) -> tuple[dict, None | cudf.Index, int]:
-        input_series = {}
-        value_lengths: set[int] = set()
-        for key, val in data.items():
-            if isinstance(val, (pd.Series, Series, dict)):
-                val = Series(val, nan_as_null=nan_as_null)
-                input_series[key] = val
-            if not is_scalar(val):
-                value_lengths.add(len(val))
-        if len(value_lengths) > 1:
-            raise ValueError(f"Found varying data lengths: {value_lengths}")
+        data: dict,
+    ) -> tuple[dict, None | cudf.Index]:
+        """If data.values() contains Series/dicts, align their indexes before processing"""
+        input_series = {
+            key: val
+            for key, val in data.items()
+            if isinstance(val, (pd.Series, Series, dict))
+        }
 
         if not input_series:
-            return data, None, value_lengths.pop()
+            return data, None
 
         aligned_input_series = cudf.core.series._align_indices(
             list(input_series.values())
         )
-        index = aligned_input_series[0].index
         data = data.copy()
         for key, aligned_series in zip(
             input_series.keys(), aligned_input_series
         ):
             data[key] = aligned_series
-        return data, index, value_lengths.pop()
+        return data, aligned_series.index
 
     # The `constructor*` properties are used by `dask` (and `dask_cudf`)
     @property
@@ -5448,7 +5482,9 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):
 
     @classmethod
     @_cudf_nvtx_annotate
-    def _from_arrays(cls, data, nan_as_null=False) -> dict[int, ColumnBase]:
+    def _from_arrays(
+        cls, data, columns, nan_as_null=False
+    ) -> dict[Any, ColumnBase]:
         """Convert a numpy/cupy array to a dict of columns.
 
         Parameters
@@ -5471,9 +5507,19 @@ def _from_arrays(cls, data, nan_as_null=False) -> dict[int, ColumnBase]:
 
         if data.ndim == 1:
             data = data.reshape(1, len(data))
+
+        if columns is not None:
+            if len(columns) != data.shape[1]:
+                raise ValueError(
+                    f"columns length expected {data.shape[1]} but "
+                    f"found {len(columns)}"
+                )
+            columns_labels = columns
+        else:
+            columns_labels = range(data.shape[1])
         return {
-            i: column.as_column(data[:, i], nan_as_null=nan_as_null)
-            for i in range(data.shape[1])
+            column_label: column.as_column(data[:, i], nan_as_null=nan_as_null)
+            for column_label, i in zip(columns_labels, range(data.shape[1]))
         }
 
     @_cudf_nvtx_annotate

From 89f92806a26e0eb644824a5c418c79939790ec7b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 29 Nov 2023 14:31:25 -0800
Subject: [PATCH 04/22] Adjust dict logic

---
 python/cudf/cudf/core/dataframe.py       | 69 ++++++++++++++++--------
 python/cudf/cudf/tests/test_dataframe.py | 10 +++-
 2 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index a0f2ab98f99..772e8e1923a 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -671,7 +671,9 @@ def __init__(
 
         if columns is not None:
             columns = as_index(columns)
-            if columns.nunique() != len(columns):
+            if not isinstance(
+                columns, MultiIndex
+            ) and columns.nunique() != len(columns):
                 raise ValueError("Columns cannot contain duplicate values")
             columns = columns.to_pandas()
             col_is_rangeindex = isinstance(columns, pd.RangeIndex)
@@ -748,7 +750,8 @@ def __init__(
                 col_dict = self._from_arrays(
                     data, columns=columns, nan_as_null=nan_as_null
                 )
-            index, index_from_data = RangeIndex(data.shape[0]), index
+            if index is None:
+                index = RangeIndex(arr_interface["shape"][0])
         elif hasattr(data, "__array_interface__"):
             arr_interface = data.__array_interface__
             if len(arr_interface["descr"]) == 1:
@@ -760,7 +763,8 @@ def __init__(
                 col_dict = self.from_records(
                     data, index=index, columns=columns
                 )._data
-            index, index_from_data = RangeIndex(data.shape[0]), index
+            if index is None:
+                index = RangeIndex(arr_interface["shape"][0])
         elif is_scalar(data):
             if index is None or columns is None:
                 raise ValueError(
@@ -1010,36 +1014,58 @@ def _init_from_list_like(self, data, index=None, columns=None):
     def _init_from_dict_like(
         self, data: dict, index: None | cudf.Index, nan_as_null=None
     ) -> tuple[dict, cudf.Index]:
+        # 1) Align indexes of all data.values() that are Series/dicts
+        # 2) Convert all array-like data.values() to columns
+        # 3) Convert all remaining scalar data.values() to columns
         if not data:
             return data, cudf.RangeIndex(0)
-        data, index_from_data = self._align_input_series_indices(data)
+        data, index_from_data = self._align_input_series_indices(
+            data, nan_as_null=nan_as_null
+        )
 
         value_lengths = set()
+        result_index = None
         if index_from_data is not None:
             value_lengths.add(len(index_from_data))
+            result_index = index_from_data
+        elif index is not None:
+            result_index = index
 
         scalar_keys = []
         col_data = {}
-        for key, value in data:
+        for key, value in data.items():
             if is_scalar(value):
                 scalar_keys.append(key)
                 col_data[key] = value
             else:
-                value_lengths.add(len(value))
-                col_data[key] = as_column(value, nan_as_null=nan_as_null)
+                column = as_column(value, nan_as_null=nan_as_null)
+                value_lengths.add(len(column))
+                col_data[key] = column
 
         if len(scalar_keys) != len(data) and len(value_lengths) > 1:
             raise ValueError(
                 "Found varying value lengths when all values "
                 f"must have the same length: {value_lengths}"
             )
-        # TODO: If all scalars, use index length
+        elif len(scalar_keys) == len(data):
+            # All data.values() are scalars
+            if index is None:
+                raise ValueError(
+                    "If using all scalar values, you must pass an index"
+                )
+            scalar_length = len(index)
+        else:
+            scalar_length = value_lengths.pop()
+
+        for key in scalar_keys:
+            col_data[key] = as_column(
+                col_data[key], nan_as_null=nan_as_null, length=scalar_length
+            )
 
-        col_data = {
-            key: as_column(value, nan_as_null=nan_as_null, length=value_length)
-            for key, value in data.items()
-        }
-        return col_data, index_from_data
+        if result_index is None:
+            result_index = cudf.RangeIndex(scalar_length)
+
+        return col_data, result_index
 
     @classmethod
     def _from_data(
@@ -1056,11 +1082,10 @@ def _from_data(
     @staticmethod
     @_cudf_nvtx_annotate
     def _align_input_series_indices(
-        data: dict,
+        data: dict, nan_as_null=None
     ) -> tuple[dict, None | cudf.Index]:
-        """If data.values() contains Series/dicts, align their indexes before processing"""
         input_series = {
-            key: val
+            key: Series(val, nan_as_null=nan_as_null)
             for key, val in data.items()
             if isinstance(val, (pd.Series, Series, dict))
         }
@@ -6408,11 +6433,13 @@ def select_dtypes(self, include=None, exclude=None):
             inclusion = set()
         # remove all exclude types
         inclusion = inclusion - exclude_subtypes
-
-        for k, col in self._data.items():
-            infered_type = cudf_dtype_from_pydata_dtype(col.dtype)
-            if infered_type in inclusion:
-                df._insert(len(df._data), k, col)
+        if inclusion:
+            for k, col in self._data.items():
+                infered_type = cudf_dtype_from_pydata_dtype(col.dtype)
+                if infered_type in inclusion:
+                    df._insert(len(df._data), k, col)
+        else:
+            df.columns = df.columns[:0]
 
         return df
 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index c792bee2a58..d1e2f03420f 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -1581,7 +1581,15 @@ def test_concat_empty_dataframe(df_1, df_2):
     # ignoring dtypes as pandas upcasts int to float
     # on concatenation with empty dataframes
 
-    assert_eq(got, expect, check_dtype=False, check_index_type=True)
+    # pandas>=2.0 has RangeIndex columns (matching cudf)
+    # pandas<=1.5 returns Index[object] columns
+    assert_eq(
+        got,
+        expect,
+        check_dtype=False,
+        check_index_type=True,
+        check_column_type=PANDAS_GE_200,
+    )
 
 
 @pytest.mark.parametrize(

From a4da710baeecd47249f75cbe9ce6ae8097cab16a Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 30 Nov 2023 17:54:10 -0800
Subject: [PATCH 05/22] More bugs in dict and array logic

---
 python/cudf/cudf/core/column/column.py   | 12 +++++++-
 python/cudf/cudf/core/column_accessor.py |  4 +++
 python/cudf/cudf/core/dataframe.py       | 39 +++++++++++++++++++-----
 python/cudf/cudf/core/frame.py           |  4 +++
 python/cudf/cudf/tests/test_dataframe.py | 35 +++++++++++++++++++--
 5 files changed, 84 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index a76f4d7383c..c284b8c44bd 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -2173,7 +2173,10 @@ def as_column(
         if dtype is not None:
             data = data.astype(dtype)
 
-    elif np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview):
+    elif arbitrary is None or (
+        np.isscalar(arbitrary) and not isinstance(arbitrary, memoryview)
+    ):
+        # TODO: use is_scalar instead of np.isscalar
         length = length or 1
         if (
             (nan_as_null is True)
@@ -2183,6 +2186,8 @@ def as_column(
             arbitrary = None
             if dtype is None:
                 dtype = cudf.dtype("float64")
+        elif arbitrary is None and dtype is None:
+            dtype = cudf.dtype("object")
 
         data = as_column(full(length, arbitrary, dtype=dtype))
         if not nan_as_null and not is_decimal_dtype(data.dtype):
@@ -2202,6 +2207,11 @@ def as_column(
 
         arbitrary = np.asarray(arbitrary)
 
+        if arbitrary.ndim == 0:
+            arbitrary = arbitrary.reshape(
+                1,
+            )
+
         # Handle case that `arbitrary` elements are cupy arrays
         if (
             shape
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index b106b8bbb02..021d4994613 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -157,6 +157,8 @@ def _create_unsafe(
         data: Dict[Any, ColumnBase],
         multiindex: bool = False,
         level_names=None,
+        rangeindex: bool = False,
+        label_dtype: Dtype | None = None,
     ) -> ColumnAccessor:
         # create a ColumnAccessor without verifying column
         # type or size
@@ -164,6 +166,8 @@ def _create_unsafe(
         obj._data = data
         obj.multiindex = multiindex
         obj._level_names = level_names
+        obj.rangeindex = rangeindex
+        obj.label_dtype = label_dtype
         return obj
 
     def __iter__(self):
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 772e8e1923a..f0f7a666a10 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -668,6 +668,7 @@ def __init__(
     ):
         col_is_rangeindex = False
         col_is_multiindex = False
+        col_dtype = None
 
         if columns is not None:
             columns = as_index(columns)
@@ -678,13 +679,13 @@ def __init__(
             columns = columns.to_pandas()
             col_is_rangeindex = isinstance(columns, pd.RangeIndex)
             col_is_multiindex = isinstance(columns, pd.MultiIndex)
+            if not isinstance(columns, pd.MultiIndex):
+                col_dtype = columns.dtype
 
         if index is not None:
             index = as_index(index)
 
-        if data is None:
-            data = []
-        elif isinstance(data, Iterator) and not isinstance(data, str):
+        if isinstance(data, Iterator) and not isinstance(data, str):
             data = list(data)
 
         index_from_data = None
@@ -728,6 +729,27 @@ def __init__(
                     col_dict = {data.name: data._column}
                     columns, columns_from_data = pd.Index([data.name]), columns
             index, index_from_data = data.index, index
+        elif data is None:
+            if index is None:
+                index = RangeIndex(0)
+            if columns is not None:
+                level_names = (
+                    tuple(columns.names)
+                    if isinstance(columns, pd.Index)
+                    else None
+                )
+                col_dict = ColumnAccessor(
+                    {
+                        k: column.column_empty(
+                            len(index), dtype="object", masked=True
+                        )
+                        for k in columns
+                    },
+                    level_names=level_names,
+                )
+            else:
+                col_dict = {}
+                col_is_rangeindex = True
         elif isinstance(data, ColumnAccessor):
             raise TypeError(
                 "Use cudf.Series._from_data for constructing a Series from "
@@ -811,6 +833,8 @@ def __init__(
             )
             col_dict = result[0]
             index, index_from_data = result[1], index
+            columns, columns_from_data = result[2], columns
+            col_is_multiindex = isinstance(columns, pd.MultiIndex)
         else:
             raise TypeError(
                 f"data must be list or dict-like, not {type(data).__name__}"
@@ -840,6 +864,7 @@ def __init__(
 
         self._data.rangeindex = self._data.rangeindex or col_is_rangeindex
         self._data.multiindex = self._data.multiindex or col_is_multiindex
+        self._data.label_dtype = self._data.label_dtype or col_dtype
 
     @_cudf_nvtx_annotate
     def _init_from_series_list(self, data, columns, index):
@@ -1013,12 +1038,12 @@ def _init_from_list_like(self, data, index=None, columns=None):
     @_cudf_nvtx_annotate
     def _init_from_dict_like(
         self, data: dict, index: None | cudf.Index, nan_as_null=None
-    ) -> tuple[dict, cudf.Index]:
+    ) -> tuple[dict, cudf.Index, pd.Index]:
         # 1) Align indexes of all data.values() that are Series/dicts
         # 2) Convert all array-like data.values() to columns
         # 3) Convert all remaining scalar data.values() to columns
         if not data:
-            return data, cudf.RangeIndex(0)
+            return data, cudf.RangeIndex(0), pd.RangeIndex(0)
         data, index_from_data = self._align_input_series_indices(
             data, nan_as_null=nan_as_null
         )
@@ -1065,7 +1090,7 @@ def _init_from_dict_like(
         if result_index is None:
             result_index = cudf.RangeIndex(scalar_length)
 
-        return col_data, result_index
+        return col_data, result_index, pd.Index(col_data)
 
     @classmethod
     def _from_data(
@@ -5531,7 +5556,7 @@ def _from_arrays(
             )
 
         if data.ndim == 1:
-            data = data.reshape(1, len(data))
+            data = data.reshape(len(data), 1)
 
         if columns is not None:
             if len(columns) != data.shape[1]:
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index b2f0651d576..e1b2f7d674d 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -280,6 +280,8 @@ def astype(self, dtype, copy=False, **kwargs):
             data=result_data,
             multiindex=self._data.multiindex,
             level_names=self._data.level_names,
+            rangeindex=self._data.rangeindex,
+            label_dtype=self._data.label_dtype,
         )
 
     @_cudf_nvtx_annotate
@@ -876,6 +878,8 @@ def fillna(
                     data=filled_data,
                     multiindex=self._data.multiindex,
                     level_names=self._data.level_names,
+                    rangeindex=self._data.rangeindex,
+                    label_dtype=self._data.label_dtype,
                 )
             ),
             inplace=inplace,
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index d1e2f03420f..be34cb65d17 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -4556,8 +4556,9 @@ def test_create_dataframe_column():
         columns=["a", "b", "c"],
         index=["A", "Z", "X"],
     )
-
-    assert_eq(pdf, gdf)
+    # pandas C column is NaN of object type
+    # cudf C column is NA of type float
+    assert_eq(pdf, gdf, check_dtype=False)
 
 
 @pytest.mark.parametrize(
@@ -4601,6 +4602,36 @@ def test_dataframe_columns_empty_data_preserves_dtype(dtype, idx_data, data):
     assert_eq(result, expected)
 
 
+@pytest.mark.parametrize("dtype", ["int64", "datetime64[ns]", "int8"])
+def test_dataframe_astype_preserves_column_dtype(dtype):
+    result = cudf.DataFrame([1], columns=cudf.Index([1], dtype=dtype))
+    result = result.astype(np.int32).columns
+    expected = pd.Index([1], dtype=dtype)
+    assert_eq(result, expected)
+
+
+def test_dataframe_astype_preserves_column_rangeindex():
+    result = cudf.DataFrame([1], columns=range(1))
+    result = result.astype(np.int32).columns
+    expected = pd.RangeIndex(1)
+    assert_eq(result, expected)
+
+
+@pytest.mark.parametrize("dtype", ["int64", "datetime64[ns]", "int8"])
+def test_dataframe_fillna_preserves_column_dtype(dtype):
+    result = cudf.DataFrame([1, None], columns=cudf.Index([1], dtype=dtype))
+    result = result.fillna(2).columns
+    expected = pd.Index([1], dtype=dtype)
+    assert_eq(result, expected)
+
+
+def test_dataframe_fillna_preserves_column_rangeindex():
+    result = cudf.DataFrame([1, None], columns=range(1))
+    result = result.fillna(2).columns
+    expected = pd.RangeIndex(1)
+    assert_eq(result, expected)
+
+
 @pytest.mark.parametrize(
     "data",
     [

From 8a547910d1ab57e4b76008e04fb5ae82c76e8176 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 1 Dec 2023 18:27:02 -0800
Subject: [PATCH 06/22] Fix mode initialization, remove working xfail now

---
 python/cudf/cudf/core/dataframe.py       | 13 ++++++++++---
 python/cudf/cudf/tests/test_dataframe.py | 23 ++++++++++++++---------
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index f0f7a666a10..c258220b429 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -729,7 +729,11 @@ def __init__(
                     col_dict = {data.name: data._column}
                     columns, columns_from_data = pd.Index([data.name]), columns
             index, index_from_data = data.index, index
-        elif data is None:
+        elif data is None or (
+            isinstance(data, dict)
+            and columns is not None
+            and (~columns.isin(data.keys())).all()
+        ):
             if index is None:
                 index = RangeIndex(0)
             if columns is not None:
@@ -826,10 +830,13 @@ def __init__(
             else:
                 self._init_from_list_like(data, index=index, columns=columns)
             self._check_data_index_length_match()
+
+            if dtype:
+                self._data = self.astype(dtype)._data
             return
         elif is_dict_like(data):
             result = self._init_from_dict_like(
-                data, index, nan_as_null=nan_as_null
+                data, index=index, nan_as_null=nan_as_null
             )
             col_dict = result[0]
             index, index_from_data = result[1], index
@@ -6199,7 +6206,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True):
         ]
 
         if len(mode_results) == 0:
-            return DataFrame()
+            return DataFrame(columns=self.columns[:0])
 
         df = cudf.concat(mode_results, axis=1)
         if isinstance(df, Series):
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index be34cb65d17..1bf7f4b700e 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -7539,7 +7539,11 @@ def test_dataframe_append_dataframe(df, other, sort, ignore_index):
     if expected.shape != df.shape:
         assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False)
     else:
-        assert_eq(expected, actual, check_index_type=not gdf.empty)
+        # pandas returns Index[object] while this should be an empty RangeIndex
+        # for empty df/other
+        assert_eq(
+            expected, actual, check_index_type=False, check_column_type=False
+        )
 
 
 @pytest_unmark_spilling
@@ -7579,8 +7583,8 @@ def test_dataframe_append_dataframe(df, other, sort, ignore_index):
                 "https://github.com/pandas-dev/pandas/issues/35092",
             ),
         ),
-        {1: 1},
-        {0: 10, 1: 100, 2: 102},
+        {1: [1]},
+        {0: [10], 1: [100], 2: [102]},
     ],
 )
 @pytest.mark.parametrize("sort", [False, True])
@@ -7769,7 +7773,11 @@ def test_dataframe_append_dataframe_lists(df, other, sort, ignore_index):
     if expected.shape != df.shape:
         assert_eq(expected.fillna(-1), actual.fillna(-1), check_dtype=False)
     else:
-        assert_eq(expected, actual, check_index_type=not gdf.empty)
+        # pandas returns Index[object] while this should be an empty RangeIndex
+        # for empty df/other
+        assert_eq(
+            expected, actual, check_index_type=False, check_column_type=False
+        )
 
 
 @pytest.mark.parametrize(
@@ -8152,11 +8160,7 @@ def test_series_empty(ps):
     "columns",
     [["a"], ["another column name"], None, pd.Index(["a"], name="index name")],
 )
-def test_dataframe_init_with_columns(data, columns, request):
-    if data == [] and columns is None and not PANDAS_GE_200:
-        request.node.add_marker(
-            pytest.mark.xfail(reason=".column returns Index[object]")
-        )
+def test_dataframe_init_with_columns(data, columns):
     pdf = pd.DataFrame(data, columns=columns)
     gdf = cudf.DataFrame(data, columns=columns)
 
@@ -8164,6 +8168,7 @@ def test_dataframe_init_with_columns(data, columns, request):
         pdf,
         gdf,
         check_index_type=len(pdf.index) != 0,
+        check_column_type=data is not None and columns is not None,
         check_dtype=not (pdf.empty and len(pdf.columns)),
     )
 

From 36b85cc9cb3670adbefffeefed1679ec161c1d6e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 7 Dec 2023 15:12:31 -0800
Subject: [PATCH 07/22] Clean up tests, fix more bugs

---
 python/cudf/cudf/core/dataframe.py       | 15 +++++++----
 python/cudf/cudf/tests/test_dataframe.py | 34 +++++++++---------------
 2 files changed, 22 insertions(+), 27 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index ef92dff7692..3707bd185c5 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -698,6 +698,10 @@ def __init__(
             col_dict = data._data
             index, index_from_data = data.index, index
             columns, columns_from_data = data.columns, columns
+            if columns_from_data is not None and len(columns_from_data) == 0:
+                # TODO: Can this be avoided?
+                # as_index([]) returns Index[float64]
+                columns_from_data = columns_from_data.astype(columns.dtype)
         elif isinstance(data, (cudf.Series, pd.Series)):
             if isinstance(data, pd.Series):
                 data = cudf.Series.from_pandas(data, nan_as_null=nan_as_null)
@@ -854,6 +858,7 @@ def __init__(
             columns_from_data = as_index(columns_from_data)
             col_is_rangeindex = isinstance(columns, cudf.RangeIndex)
             col_is_multiindex = isinstance(columns, cudf.MultiIndex)
+            col_dtype = columns_from_data.dtype
             reindexed = self.reindex(
                 columns=columns_from_data.to_pandas(), copy=False
             )
@@ -3516,12 +3521,12 @@ def rename(
                 )
 
             if level is not None and isinstance(self.index, MultiIndex):
-                out_index = self.index.copy(deep=copy)
-                out_index.get_level_values(level).to_frame().replace(
-                    to_replace=list(index.keys()),
-                    value=list(index.values()),
-                    inplace=True,
+                out_frame = self.index.to_frame(index=False)
+                level = self.index._get_level_label(level)
+                out_frame[level] = out_frame[level].replace(
+                    to_replace=list(index.keys()), value=list(index.values())
                 )
+                out_index = type(self.index).from_frame(out_frame)
                 out = DataFrame(index=out_index)
             else:
                 to_replace = list(index.keys())
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 2c157daa78c..836824ac879 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -8951,8 +8951,9 @@ def test_dataframe_from_pandas_duplicate_columns():
         ["column_not_exists1", "column_not_exists2"],
     ],
 )
-@pytest.mark.parametrize("index", [["abc", "def", "ghi"]])
-def test_dataframe_constructor_columns(df, columns, index, request):
+def test_dataframe_constructor_columns(df, columns, request):
+    index = ["abc", "def", "ghi"]
+
     def assert_local_eq(actual, df, expected, host_columns):
         check_index_type = not expected.empty
         if host_columns is not None and any(
@@ -8967,12 +8968,6 @@ def assert_local_eq(actual, df, expected, host_columns):
         else:
             assert_eq(expected, actual, check_index_type=check_index_type)
 
-    if df.empty and columns is None and not PANDAS_GE_200:
-        request.node.add_marker(
-            pytest.mark.xfail(
-                reason="pandas returns Index[object] instead of RangeIndex"
-            )
-        )
     gdf = cudf.from_pandas(df)
     host_columns = (
         columns.to_pandas() if isinstance(columns, cudf.BaseIndex) else columns
@@ -9279,23 +9274,20 @@ def test_dataframe_setitem_cupy_array():
     assert_eq(pdf, gdf)
 
 
-@pytest.mark.parametrize(
-    "data", [{"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}]
-)
-@pytest.mark.parametrize(
-    "index",
-    [{0: 123, 1: 4, 2: 6}],
-)
 @pytest.mark.parametrize(
     "level",
     ["x", 0],
 )
-def test_rename_for_level_MultiIndex_dataframe(data, index, level):
+def test_rename_for_level_MultiIndex_dataframe(level):
+    data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
+    index = {0: 123, 1: 4, 2: 6}
     pdf = pd.DataFrame(
         data,
-        index=pd.MultiIndex.from_tuples([(0, 1, 2), (1, 2, 3), (2, 3, 4)]),
+        index=pd.MultiIndex.from_tuples(
+            [(0, 1, 2), (1, 2, 3), (2, 3, 4)], names=["x", "y", "z"]
+        ),
     )
-    pdf.index.names = ["x", "y", "z"]
+
     gdf = cudf.from_pandas(pdf)
 
     expect = pdf.rename(index=index, level=level)
@@ -9304,9 +9296,6 @@ def test_rename_for_level_MultiIndex_dataframe(data, index, level):
     assert_eq(expect, got)
 
 
-@pytest.mark.parametrize(
-    "data", [{"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}]
-)
 @pytest.mark.parametrize(
     "columns",
     [{"a": "f", "b": "g"}, {1: 3, 2: 4}, lambda s: 2 * s],
@@ -9315,7 +9304,8 @@ def test_rename_for_level_MultiIndex_dataframe(data, index, level):
     "level",
     [0, 1],
 )
-def test_rename_for_level_MultiColumn_dataframe(data, columns, level):
+def test_rename_for_level_MultiColumn_dataframe(columns, level):
+    data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
     gdf = cudf.DataFrame(data)
     gdf.columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])
 

From 553fe3683f9e58ff7dfff7aa3b377272da43c865 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 7 Dec 2023 19:02:47 -0800
Subject: [PATCH 08/22] Fix more tests, test reindex bug

---
 python/cudf/cudf/core/dataframe.py       | 17 +++++++++++------
 python/cudf/cudf/core/indexed_frame.py   | 11 ++++++++---
 python/cudf/cudf/tests/test_dataframe.py | 12 +++++++++++-
 3 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 3707bd185c5..fddf7a0d2a4 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -672,7 +672,12 @@ def __init__(
         col_dtype = None
 
         if columns is not None:
-            columns = as_index(columns)
+            dtype = None
+            if isinstance(columns, list) and len(columns) == 0:
+                # TODO: Generalize this to any empty, dtype-less container
+                # TODO: Why does as_index([]) return a FloatIndex?
+                dtype = object
+            columns = as_index(columns, dtype=dtype)
             if not isinstance(
                 columns, MultiIndex
             ) and columns.nunique() != len(columns):
@@ -698,10 +703,6 @@ def __init__(
             col_dict = data._data
             index, index_from_data = data.index, index
             columns, columns_from_data = data.columns, columns
-            if columns_from_data is not None and len(columns_from_data) == 0:
-                # TODO: Can this be avoided?
-                # as_index([]) returns Index[float64]
-                columns_from_data = columns_from_data.astype(columns.dtype)
         elif isinstance(data, (cudf.Series, pd.Series)):
             if isinstance(data, pd.Series):
                 data = cudf.Series.from_pandas(data, nan_as_null=nan_as_null)
@@ -794,6 +795,7 @@ def __init__(
                 col_dict = self.from_records(
                     data, index=index, columns=columns
                 )._data
+            columns_from_data = columns
             if index is None:
                 index = RangeIndex(arr_interface["shape"][0])
         elif is_scalar(data):
@@ -851,8 +853,8 @@ def __init__(
             raise TypeError(
                 f"data must be list or dict-like, not {type(data).__name__}"
             )
-
         super().__init__(col_dict, index=index)
+        self._check_data_index_length_match()
         if columns_from_data is not None:
             # TODO: This there a better way to do this?
             columns_from_data = as_index(columns_from_data)
@@ -870,12 +872,15 @@ def __init__(
             reindexed = self.reindex(index=index_from_data, copy=False)
             self._data = reindexed._data
             self._index = index_from_data
+        # TODO this one might not be needed
         self._check_data_index_length_match()
 
         if dtype:
             self._data = self.astype(dtype)._data
 
         self._data.rangeindex = self._data.rangeindex or col_is_rangeindex
+        # TODO: multiindex assignment
+        # test_non_string_column_name_to_arrow to fail
         self._data.multiindex = self._data.multiindex or col_is_multiindex
         self._data.label_dtype = self._data.label_dtype or col_dtype
 
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index c81174482e0..f73299667d2 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -2653,6 +2653,7 @@ def _reindex(
             name: (
                 df._data[name].copy(deep=deep)
                 if name in df._data
+                # Why does this default to np.float64?
                 else cudf.core.column.column.column_empty(
                     dtype=dtypes.get(name, np.float64),
                     masked=True,
@@ -2661,13 +2662,17 @@ def _reindex(
             )
             for name in names
         }
+        if column_names is None:
+            level_names = self._data.level_names
+        elif isinstance(column_names, pd.Index):
+            level_names = tuple(column_names.names)
+        else:
+            level_names = None
         result = self.__class__._from_data(
             data=cudf.core.column_accessor.ColumnAccessor(
                 cols,
                 multiindex=self._data.multiindex,
-                level_names=tuple(column_names.names)
-                if isinstance(column_names, pd.Index)
-                else None,
+                level_names=level_names,
             ),
             index=index,
         )
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 836824ac879..8809ab33224 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -9441,7 +9441,8 @@ def test_dataframe_init_from_series(data, columns, index):
     assert_eq(
         expected,
         actual,
-        check_index_type=len(expected) != 0,
+        # TODO: reindex creates new cols of float64, why not object?
+        check_dtype=False,
     )
 
 
@@ -10823,3 +10824,12 @@ def test_dataframe_duplicate_index_reindex():
         lfunc_args_and_kwargs=([10, 11, 12, 13], {}),
         rfunc_args_and_kwargs=([10, 11, 12, 13], {}),
     )
+
+
+def test_dataframe_reindex_doesnt_remove_column_name():
+    gdf = cudf.DataFrame([1], columns=pd.Index(["a"], name="foo"))
+    result = gdf.reindex(index=pd.Index([0, 1]))
+    expected = cudf.DataFrame(
+        [1, None], columns=pd.Index(["a"], name="foo"), index=pd.Index([0, 1])
+    )
+    assert_eq(result, expected)

From 5baac4e26bce6c99e369f72d424f5ef3cd9ca4b7 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 8 Dec 2023 13:27:30 -0800
Subject: [PATCH 09/22] Fix dict like to avoid reindexing

---
 python/cudf/cudf/core/dataframe.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index fddf7a0d2a4..c1189c63350 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -846,7 +846,7 @@ def __init__(
                 data, index=index, nan_as_null=nan_as_null
             )
             col_dict = result[0]
-            index, index_from_data = result[1], index
+            index = result[1]
             columns, columns_from_data = result[2], columns
             col_is_multiindex = isinstance(columns, pd.MultiIndex)
         else:
@@ -1063,7 +1063,7 @@ def _init_from_dict_like(
         if not data:
             return data, cudf.RangeIndex(0), pd.RangeIndex(0)
         data, index_from_data = self._align_input_series_indices(
-            data, nan_as_null=nan_as_null
+            data, index=index, nan_as_null=nan_as_null
         )
 
         value_lengths = set()
@@ -1125,7 +1125,7 @@ def _from_data(
     @staticmethod
     @_cudf_nvtx_annotate
     def _align_input_series_indices(
-        data: dict, nan_as_null=None
+        data: dict, index: cudf.Index | None, nan_as_null=None
     ) -> tuple[dict, None | cudf.Index]:
         input_series = {
             key: Series(val, nan_as_null=nan_as_null)
@@ -1143,6 +1143,8 @@ def _align_input_series_indices(
         for key, aligned_series in zip(
             input_series.keys(), aligned_input_series
         ):
+            if index is not None:
+                aligned_series = aligned_series.reindex(index=index)
             data[key] = aligned_series
         return data, aligned_series.index
 

From 9ce0a69f921b6a259917834f31952aa83fe19a94 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 8 Dec 2023 13:30:42 -0800
Subject: [PATCH 10/22] Adjust
 test_series_data_with_name_with_columns_matching_align

---
 python/cudf/cudf/tests/test_dataframe.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 8809ab33224..8089f670efd 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10619,7 +10619,9 @@ def test_series_data_with_name_with_columns_not_matching():
 def test_series_data_with_name_with_columns_matching_align():
     gdf = cudf.DataFrame(cudf.Series([1], name=2), columns=[1, 2])
     pdf = pd.DataFrame(pd.Series([1], name=2), columns=[1, 2])
-    assert_eq(gdf, pdf)
+    # pandas A column is NaN of object type
+    # cudf A column is NA of type float
+    assert_eq(gdf, pdf, check_dtype=False)
 
 
 @pytest.mark.parametrize("digits", [0, 1, 3, 4, 10])

From 5fcce39648ba94487d1a9337bd720a8ede2c1ef7 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 8 Dec 2023 14:31:53 -0800
Subject: [PATCH 11/22] add comments

---
 python/cudf/cudf/core/dataframe.py       | 2 +-
 python/cudf/cudf/tests/test_dataframe.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index c1189c63350..569f901a56b 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5569,7 +5569,7 @@ def _from_arrays(
 
         Returns
         -------
-        {int: Column}
+        {Any: Column}
         """
 
         data = cupy.asarray(data)
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 8089f670efd..3fd4f26f909 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10619,8 +10619,8 @@ def test_series_data_with_name_with_columns_not_matching():
 def test_series_data_with_name_with_columns_matching_align():
     gdf = cudf.DataFrame(cudf.Series([1], name=2), columns=[1, 2])
     pdf = pd.DataFrame(pd.Series([1], name=2), columns=[1, 2])
-    # pandas A column is NaN of object type
-    # cudf A column is NA of type float
+    # pandas: column 1 is NaN with object dtype
+    # cudf: column 1 is NA with float dtype
     assert_eq(gdf, pdf, check_dtype=False)
 
 

From df93b636d5475fadb5ee76999e9269b0cbf8ed0d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 11 Dec 2023 13:34:07 -0800
Subject: [PATCH 12/22] Fix some tests and a naming bug

---
 python/cudf/cudf/core/dataframe.py       |  6 +++---
 python/cudf/cudf/tests/test_dataframe.py | 10 +++++++++-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 569f901a56b..1bc77635cd2 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -672,12 +672,12 @@ def __init__(
         col_dtype = None
 
         if columns is not None:
-            dtype = None
+            as_idx_typ = None
             if isinstance(columns, list) and len(columns) == 0:
                 # TODO: Generically, an empty dtype-less container
                 # TODO: Why does as_index([]) return FloatIndex
-                dtype = object
-            columns = as_index(columns, dtype=dtype)
+                as_idx_typ = object
+            columns = as_index(columns, dtype=as_idx_typ)
             if not isinstance(
                 columns, MultiIndex
             ) and columns.nunique() != len(columns):
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 3aac6b2f54d..6c6d76835a2 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10467,10 +10467,18 @@ def test_dataframe_dict_like_with_columns(columns, index):
     data = {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}
     expect = pd.DataFrame(data, columns=columns, index=index)
     actual = cudf.DataFrame(data, columns=columns, index=index)
+    # TODO(pandas2.0): New NA columns will be object instead of float type
+    check_dtype = isinstance(columns, list) and columns == [
+        "a",
+        "d",
+        "b",
+        "e",
+        "c",
+    ]
     if index is None and len(columns) == 0:
         # We make an empty range index, pandas makes an empty index
         expect = expect.reset_index(drop=True)
-    assert_eq(expect, actual)
+    assert_eq(expect, actual, check_dtype=not check_dtype)
 
 
 def test_dataframe_init_columns_named_multiindex():

From 77ab160e27ae07d19a91e6b35f24e3804f35ea9e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 11 Dec 2023 14:57:18 -0800
Subject: [PATCH 13/22] pass arguments through colaccessor

---
 python/cudf/cudf/core/dataframe.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 1bc77635cd2..346f922da71 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -756,6 +756,9 @@ def __init__(
                         for k in columns
                     },
                     level_names=level_names,
+                    multiindex=col_is_multiindex,
+                    rangeindex=col_is_rangeindex,
+                    label_dtype=col_dtype,
                 )
             else:
                 col_dict = {}
@@ -853,18 +856,28 @@ def __init__(
             raise TypeError(
                 f"data must be list or dict-like, not {type(data).__name__}"
             )
-        super().__init__(col_dict, index=index)
+        col_accessor = ColumnAccessor(
+            col_dict,
+            multiindex=col_is_multiindex,
+            rangeindex=col_is_rangeindex,
+            label_dtype=col_dtype,
+        )
+        super().__init__(col_accessor, index=index)
         self._check_data_index_length_match()
         if columns_from_data is not None:
             # TODO: This there a better way to do this?
             columns_from_data = as_index(columns_from_data)
-            col_is_rangeindex = isinstance(columns, cudf.RangeIndex)
-            col_is_multiindex = isinstance(columns, cudf.MultiIndex)
-            col_dtype = columns_from_data.dtype
             reindexed = self.reindex(
                 columns=columns_from_data.to_pandas(), copy=False
             )
             self._data = reindexed._data
+            self._data.rangeindex = isinstance(
+                columns_from_data, cudf.RangeIndex
+            )
+            self._data.multiindex = isinstance(
+                columns_from_data, cudf.MultiIndex
+            )
+            self._data.label_dtype = columns_from_data.dtype
             self._index = index
         if index_from_data is not None:
             # TODO: This there a better way to do this?
@@ -878,12 +891,6 @@ def __init__(
         if dtype:
             self._data = self.astype(dtype)._data
 
-        self._data.rangeindex = self._data.rangeindex or col_is_rangeindex
-        # TODO: multiindex assignment
-        # test_non_string_column_name_to_arrow to fail
-        self._data.multiindex = self._data.multiindex or col_is_multiindex
-        self._data.label_dtype = self._data.label_dtype or col_dtype
-
     @_cudf_nvtx_annotate
     def _init_from_series_list(self, data, columns, index):
         if index is None:

From 4981b05db478bbefcb1a763a0937096a4724b999 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 11 Dec 2023 17:08:21 -0800
Subject: [PATCH 14/22] Remove redundant check

---
 python/cudf/cudf/core/dataframe.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 346f922da71..68a19788d46 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -885,8 +885,6 @@ def __init__(
             reindexed = self.reindex(index=index_from_data, copy=False)
             self._data = reindexed._data
             self._index = index_from_data
-        # TODO this one might not be needed
-        self._check_data_index_length_match()
 
         if dtype:
             self._data = self.astype(dtype)._data

From 3fdeb870e39f0c5bee45705b7f2b6e92278c7554 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 11 Dec 2023 18:59:04 -0800
Subject: [PATCH 15/22] Adjust test and add another one with defined behavior

---
 python/cudf/cudf/core/dataframe.py       | 21 +++++++++++++++++++++
 python/cudf/cudf/tests/test_dataframe.py | 15 +++++++++++++--
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 68a19788d46..584dd32f420 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1080,16 +1080,26 @@ def _init_from_dict_like(
             result_index = index
 
         scalar_keys = []
+        tuple_key_count = 0
+        tuple_key_lengths = set()
         col_data = {}
         for key, value in data.items():
             if is_scalar(value):
                 scalar_keys.append(key)
                 col_data[key] = value
             else:
+                if isinstance(key, tuple):
+                    tuple_key_count += 1
+                    tuple_key_lengths.add(len(key))
                 column = as_column(value, nan_as_null=nan_as_null)
                 value_lengths.add(len(column))
                 col_data[key] = column
 
+        if tuple_key_count not in (0, len(data)):
+            raise ValueError(
+                "All dict keys must be tuples if a tuple key exists."
+            )
+
         if len(scalar_keys) != len(data) and len(value_lengths) > 1:
             raise ValueError(
                 "Found varying value lengths when all values "
@@ -1110,6 +1120,17 @@ def _init_from_dict_like(
                 col_data[key], nan_as_null=nan_as_null, length=scalar_length
             )
 
+        if tuple_key_count and len(tuple_key_lengths) > 1:
+            # All tuple keys must be the same length
+            final_length = max(tuple_key_lengths)
+            col_data = {
+                old_key
+                if len(old_key) == final_length
+                else old_key
+                + (cudf.NA,) * (final_length - len(old_key)): column
+                for old_key, column in col_data.items()
+            }
+
         if result_index is None:
             result_index = cudf.RangeIndex(scalar_length)
 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 6c6d76835a2..4aa07537ab2 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10230,18 +10230,24 @@ def test_dataframe_assign_scalar_to_empty_series():
     "data",
     [
         {0: [1, 2, 3], 2: [10, 11, 23]},
-        {("a", "b"): [1, 2, 3], ("2",): [10, 11, 23]},
+        {("a", "b"): [1, 2, 3], ("2", "3"): [10, 11, 23]},
     ],
 )
 def test_non_string_column_name_to_arrow(data):
     df = cudf.DataFrame(data)
-
     expected = df.to_arrow()
     actual = pa.Table.from_pandas(df.to_pandas())
 
     assert expected.equals(actual)
 
 
+def test_dict_uneven_tuple_keys_fill_with_NA():
+    data = ({("a", "b"): [1, 2, 3], ("2",): [10, 11, 23]},)
+    result = cudf.DataFrame(data)
+    expected = pd.DataFrame(data)
+    assert_eq(result, expected)
+
+
 def test_complex_types_from_arrow():
     expected = pa.Table.from_arrays(
         [
@@ -10824,6 +10830,11 @@ def test_dataframe_series_dot():
     assert_eq(expected, actual)
 
 
+def test_dict_tuple_keys_must_all_be_tuple_keys():
+    with pytest.raises(ValueError):
+        cudf.DataFrame({(1, 2): [1], 3: [2]})
+
+
 def test_dataframe_reindex_keep_colname():
     gdf = cudf.DataFrame([1], columns=cudf.Index([1], name="foo"))
     result = gdf.reindex(index=[0, 1])

From 03f2e7f53c1a479df73cd147993edb6184f00e68 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 12 Dec 2023 09:21:52 -0800
Subject: [PATCH 16/22] Move all new tests together, reduce diff

---
 python/cudf/cudf/core/indexed_frame.py   |  1 -
 python/cudf/cudf/tests/test_dataframe.py | 75 ++++++++++++------------
 2 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index f72d1b0a332..0c23d6dd45b 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -2645,7 +2645,6 @@ def _reindex(
             name: (
                 df._data[name].copy(deep=deep)
                 if name in df._data
-                # Why does this default to np.float64?
                 else cudf.core.column.column.column_empty(
                     dtype=dtypes.get(name, np.float64),
                     masked=True,
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 6cb42253b4c..0b7bcb6fb43 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10285,19 +10285,13 @@ def test_dataframe_assign_scalar_to_empty_series():
 )
 def test_non_string_column_name_to_arrow(data):
     df = cudf.DataFrame(data)
+
     expected = df.to_arrow()
     actual = pa.Table.from_pandas(df.to_pandas())
 
     assert expected.equals(actual)
 
 
-def test_dict_uneven_tuple_keys_fill_with_NA():
-    data = ({("a", "b"): [1, 2, 3], ("2",): [10, 11, 23]},)
-    result = cudf.DataFrame(data)
-    expected = pd.DataFrame(data)
-    assert_eq(result, expected)
-
-
 def test_complex_types_from_arrow():
     expected = pa.Table.from_arrays(
         [
@@ -10772,31 +10766,6 @@ def test_dataframe_from_ndarray_dup_columns():
         cudf.DataFrame(np.eye(2), columns=["A", "A"])
 
 
-def test_dataframe_from_dict_only_scalar_values_raises():
-    with pytest.raises(ValueError):
-        cudf.DataFrame({0: 3, 1: 2})
-
-
-@pytest.mark.parametrize("klass", [cudf.DataFrame, pd.DataFrame])
-@pytest.mark.parametrize(
-    "axis_kwargs, exp_data",
-    [
-        [
-            {"index": [1, 2], "columns": [1, 2]},
-            np.array([[1.0, np.nan], [np.nan, np.nan]]),
-        ],
-        [{"index": [1, 2]}, np.array([[0.0, 1.0], [np.nan, np.nan]])],
-        [{"columns": [1, 2]}, np.array([[0.0, np.nan], [1.0, np.nan]])],
-    ],
-)
-def test_dataframe_from_frame_with_index_or_columns_reindexes(
-    klass, axis_kwargs, exp_data
-):
-    result = cudf.DataFrame(klass(np.eye(2)), **axis_kwargs)
-    expected = cudf.DataFrame(exp_data, **axis_kwargs)
-    assert_eq(result, expected)
-
-
 @pytest.mark.parametrize("name", ["a", 0, None, np.nan, cudf.NA])
 @pytest.mark.parametrize("contains", ["a", 0, None, np.nan, cudf.NA])
 @pytest.mark.parametrize("other_names", [[], ["b", "c"], [1, 2]])
@@ -10880,11 +10849,6 @@ def test_dataframe_series_dot():
     assert_eq(expected, actual)
 
 
-def test_dict_tuple_keys_must_all_be_tuple_keys():
-    with pytest.raises(ValueError):
-        cudf.DataFrame({(1, 2): [1], 3: [2]})
-
-
 def test_dataframe_reindex_keep_colname():
     gdf = cudf.DataFrame([1], columns=cudf.Index([1], name="foo"))
     result = gdf.reindex(index=[0, 1])
@@ -10906,6 +10870,43 @@ def test_dataframe_duplicate_index_reindex():
     )
 
 
+def test_dict_uneven_tuple_keys_fill_with_NA():
+    data = ({("a", "b"): [1, 2, 3], ("2",): [10, 11, 23]},)
+    result = cudf.DataFrame(data)
+    expected = pd.DataFrame(data)
+    assert_eq(result, expected)
+
+
+def test_dataframe_from_dict_only_scalar_values_raises():
+    with pytest.raises(ValueError):
+        cudf.DataFrame({0: 3, 1: 2})
+
+
+@pytest.mark.parametrize("klass", [cudf.DataFrame, pd.DataFrame])
+@pytest.mark.parametrize(
+    "axis_kwargs, exp_data",
+    [
+        [
+            {"index": [1, 2], "columns": [1, 2]},
+            np.array([[1.0, np.nan], [np.nan, np.nan]]),
+        ],
+        [{"index": [1, 2]}, np.array([[0.0, 1.0], [np.nan, np.nan]])],
+        [{"columns": [1, 2]}, np.array([[0.0, np.nan], [1.0, np.nan]])],
+    ],
+)
+def test_dataframe_from_frame_with_index_or_columns_reindexes(
+    klass, axis_kwargs, exp_data
+):
+    result = cudf.DataFrame(klass(np.eye(2)), **axis_kwargs)
+    expected = cudf.DataFrame(exp_data, **axis_kwargs)
+    assert_eq(result, expected)
+
+
+def test_dict_tuple_keys_must_all_be_tuple_keys():
+    with pytest.raises(ValueError):
+        cudf.DataFrame({(1, 2): [1], 3: [2]})
+
+
 def test_dataframe_reindex_doesnt_remove_column_name():
     gdf = cudf.DataFrame([1], columns=pd.Index(["a"], name="foo"))
     result = gdf.reindex(index=pd.Index([0, 1]))

From ad81d4b73c2be84b026190049acea9270a130021 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 12 Dec 2023 09:26:06 -0800
Subject: [PATCH 17/22] Remove redundant test

---
 python/cudf/cudf/tests/test_dataframe.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 0b7bcb6fb43..b213a9c6c4a 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10905,12 +10905,3 @@ def test_dataframe_from_frame_with_index_or_columns_reindexes(
 def test_dict_tuple_keys_must_all_be_tuple_keys():
     with pytest.raises(ValueError):
         cudf.DataFrame({(1, 2): [1], 3: [2]})
-
-
-def test_dataframe_reindex_doesnt_remove_column_name():
-    gdf = cudf.DataFrame([1], columns=pd.Index(["a"], name="foo"))
-    result = gdf.reindex(index=pd.Index([0, 1]))
-    expected = cudf.DataFrame(
-        [1, None], columns=pd.Index(["a"], name="foo"), index=pd.Index([0, 1])
-    )
-    assert_eq(result, expected)

From baeaa87b465be44ac814a55f72cc4393ab6b36d3 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 14 Dec 2023 15:32:23 -0800
Subject: [PATCH 18/22] Ensure columns are maintained in slicing

---
 python/cudf/cudf/core/dataframe.py     |  2 +-
 python/cudf/cudf/core/indexed_frame.py | 16 +++++++++++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index fe48997edfd..7b39cc6e6e7 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -7167,7 +7167,7 @@ def append(
 
         >>> df = cudf.DataFrame(columns=['A'])
         >>> for i in range(5):
-        ...     df = df.append({'A': i}, ignore_index=True)
+        ...     df = df.append({'A': [i]}, ignore_index=True)
         >>> df
            A
         0  0
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index a2f3db681ec..b0b45e38d3b 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -1863,11 +1863,17 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self:
         start, stop, stride = arg.indices(num_rows)
         index = self.index
         has_range_index = isinstance(index, RangeIndex)
+        col_was_multiindex = self._data.multiindex
+        col_was_rangeindex = self._data.rangeindex
+        col_label_dtype = self._data.label_dtype
         if len(range(start, stop, stride)) == 0:
             # Avoid materialising the range index column
             result = self._empty_like(
                 keep_index=keep_index and not has_range_index
             )
+            result._data.rangeindex = col_was_rangeindex
+            result._data.multiindex = col_was_multiindex
+            result._data.label_dtype = col_label_dtype
             if keep_index and has_range_index:
                 lo = index.start + start * index.step
                 hi = index.start + stop * index.step
@@ -1896,7 +1902,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self:
         stop = min(stop, num_rows)
 
         if stride != 1:
-            return self._gather(
+            result = self._gather(
                 GatherMap.from_column_unchecked(
                     cudf.core.column.arange(
                         start,
@@ -1909,6 +1915,10 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self:
                 ),
                 keep_index=keep_index,
             )
+            result._data.rangeindex = col_was_rangeindex
+            result._data.multiindex = col_was_multiindex
+            result._data.label_dtype = col_label_dtype
+            return result
 
         columns_to_slice = [
             *(
@@ -1924,6 +1934,10 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self:
             None if has_range_index or not keep_index else self._index.names,
         )
 
+        result._data.rangeindex = col_was_rangeindex
+        result._data.multiindex = col_was_multiindex
+        result._data.label_dtype = col_label_dtype
+
         if keep_index and has_range_index:
             result.index = self.index[start:stop]
         return result

From 645cc3368a123bb64daa3295d2523d62341052e3 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 14 Dec 2023 16:34:00 -0800
Subject: [PATCH 19/22] Fix .columns usage, fix for pandas 2.0 in concat

---
 python/cudf/cudf/core/dataframe.py    | 6 ++++--
 python/cudf/cudf/tests/test_concat.py | 9 +++++++--
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 7b39cc6e6e7..c0d43790602 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -702,7 +702,7 @@ def __init__(
                 data = self.from_pandas(data, nan_as_null=nan_as_null)
             col_dict = data._data
             index, index_from_data = data.index, index
-            columns, columns_from_data = data.columns, columns
+            columns, columns_from_data = data._data.to_pandas_index(), columns
         elif isinstance(data, (cudf.Series, pd.Series)):
             if isinstance(data, pd.Series):
                 data = cudf.Series.from_pandas(data, nan_as_null=nan_as_null)
@@ -6518,7 +6518,9 @@ def select_dtypes(self, include=None, exclude=None):
                 if infered_type in inclusion:
                     df._insert(len(df._data), k, col)
         else:
-            df.columns = df.columns[:0]
+            df._data.rangeindex = self._data.rangeindex
+            df._data.multiindex = self._data.multiindex
+            df._data.label_dtype = self._data.label_dtype
 
         return df
 
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
index df743a96759..1fb3bc08413 100644
--- a/python/cudf/cudf/tests/test_concat.py
+++ b/python/cudf/cudf/tests/test_concat.py
@@ -8,7 +8,7 @@
 
 import cudf as gd
 from cudf.api.types import is_categorical_dtype
-from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140
+from cudf.core._compat import PANDAS_GE_150, PANDAS_GE_200, PANDAS_LT_140
 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
 from cudf.testing._utils import assert_eq, assert_exceptions_equal
 
@@ -596,7 +596,12 @@ def test_concat_empty_dataframes(df, other, ignore_index):
                 actual[key] = col.fillna(-1)
         assert_eq(expected, actual, check_dtype=False, check_index_type=True)
     else:
-        assert_eq(expected, actual, check_index_type=not gdf.empty)
+        assert_eq(
+            expected,
+            actual,
+            check_index_type=not gdf.empty,
+            check_column_type=PANDAS_GE_200,
+        )
 
 
 @pytest.mark.parametrize("ignore_index", [True, False])

From d1ce06b28d1d17629b33db6f50b32982ed208032 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 19 Dec 2023 15:59:28 -0800
Subject: [PATCH 20/22] Address test failures

---
 python/cudf/cudf/core/dataframe.py                            | 2 +-
 .../cudf/cudf/tests/test_avro_reader_fastavro_integration.py  | 2 +-
 python/cudf/cudf/tests/test_groupby.py                        | 4 ++--
 python/cudf/cudf/tests/test_orc.py                            | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index baaeea35305..f4e71f43d18 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6251,7 +6251,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True):
         ]
 
         if len(mode_results) == 0:
-            return DataFrame(columns=self.columns[:0])
+            return data_df.head(0)
 
         df = cudf.concat(mode_results, axis=1)
         if isinstance(df, Series):
diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
index 2272231fec1..9a3d3af3fd8 100644
--- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
+++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
@@ -209,7 +209,7 @@ def test_can_parse_no_schema():
     schema_root = None
     records = []
     actual = cudf_from_avro_util(schema_root, records)
-    expected = cudf.DataFrame()
+    expected = cudf.DataFrame(columns=cudf.Index([], dtype="object"))
     assert_eq(expected, actual)
 
 
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 57aa6e72eae..3853b4aee12 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -3406,8 +3406,8 @@ def test_head_tail_empty():
     # GH #13397
 
     values = [1, 2, 3]
-    pdf = pd.DataFrame({}, index=values)
-    df = cudf.DataFrame({}, index=values)
+    pdf = pd.DataFrame(index=values)
+    df = cudf.DataFrame(index=values)
 
     expected = pdf.groupby(pd.Series(values)).head()
     got = df.groupby(cudf.Series(values)).head()
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 7407da9c4ac..67588e58fc0 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -885,7 +885,7 @@ def test_nanoseconds_overflow():
 
 def test_empty_dataframe():
     buffer = BytesIO()
-    expected = cudf.DataFrame()
+    expected = cudf.DataFrame(columns=cudf.Index([], dtype="object"))
     expected.to_orc(buffer)
 
     # Raise error if column name is mentioned, but it doesn't exist.

From c62aaa6e5304df50541af4824b59dbf8b8e08fc0 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 19 Dec 2023 18:02:48 -0800
Subject: [PATCH 21/22] Fix mode

---
 python/cudf/cudf/core/dataframe.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index f4e71f43d18..fc0ef017519 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6251,7 +6251,9 @@ def mode(self, axis=0, numeric_only=False, dropna=True):
         ]
 
         if len(mode_results) == 0:
-            return data_df.head(0)
+            result = data_df.head(0)
+            result.index = cudf.RangeIndex(0)
+            return result
 
         df = cudf.concat(mode_results, axis=1)
         if isinstance(df, Series):

From 498fc756994b769a5a5fd1ee4545adfc645979b4 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 3 Jan 2024 17:36:13 -0800
Subject: [PATCH 22/22] Allow columns to not be an index

---
 python/cudf/cudf/core/column/column.py        |  2 +-
 python/cudf/cudf/core/dataframe.py            | 27 +++++++++++--------
 python/cudf/cudf/core/indexed_frame.py        |  2 +-
 .../test_avro_reader_fastavro_integration.py  |  2 +-
 python/cudf/cudf/tests/test_concat.py         |  2 +-
 python/cudf/cudf/tests/test_dataframe.py      |  2 +-
 python/cudf/cudf/tests/test_groupby.py        |  2 +-
 python/cudf/cudf/tests/test_orc.py            |  2 +-
 8 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 64f9c25a9f0..47d8bad7cb1 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index fc0ef017519..62bfdfc7922 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -675,16 +675,21 @@ def __init__(
                 # TODO: Generically, an empty dtype-less container
                 # TODO: Why does as_index([]) return FloatIndex
                 as_idx_typ = object
-            columns = as_index(columns, dtype=as_idx_typ)
-            if not isinstance(
-                columns, MultiIndex
-            ) and columns.nunique() != len(columns):
-                raise ValueError("Columns cannot contain duplicate values")
-            columns = columns.to_pandas()
-            col_is_rangeindex = isinstance(columns, pd.RangeIndex)
-            col_is_multiindex = isinstance(columns, pd.MultiIndex)
-            if not isinstance(columns, pd.MultiIndex):
-                col_dtype = columns.dtype
+            try:
+                columns = as_index(columns, dtype=as_idx_typ)
+            except pa.lib.ArrowInvalid:
+                # Mixed-type elements are allowed, e.g. [(1, 2), "a"]
+                columns = list(columns)
+            else:
+                if not isinstance(
+                    columns, MultiIndex
+                ) and columns.nunique() != len(columns):
+                    raise ValueError("Columns cannot contain duplicate values")
+                columns = columns.to_pandas()
+                col_is_rangeindex = isinstance(columns, pd.RangeIndex)
+                col_is_multiindex = isinstance(columns, pd.MultiIndex)
+                if not isinstance(columns, pd.MultiIndex):
+                    col_dtype = columns.dtype
 
         if index is not None:
             index = as_index(index)
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 8d7d396f57d..f7c3e180fc5 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 """Base class for Frame types that have an index."""
 
 from __future__ import annotations
diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
index 9a3d3af3fd8..2711926ae12 100644
--- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
+++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
index 1fb3bc08413..d393e9f81cb 100644
--- a/python/cudf/cudf/tests/test_concat.py
+++ b/python/cudf/cudf/tests/test_concat.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from decimal import Decimal
 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index c2f105cdc34..dc82b1c1f3f 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 import array as arr
 import contextlib
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 3853b4aee12..efb2dda12e4 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 import collections
 import datetime
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 67588e58fc0..cafcb347d52 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import datetime
 import decimal