diff --git a/CHANGELOG.md b/CHANGELOG.md index b79561fbd89..5da3b46e1bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ ## Improvements +- PR #6838 Fix `columns` & `index` handling in dataframe constructor + ## Bug Fixes - PR #6912 Fix rmm_mode=managed parameter for gtests diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 33555236769..47dda88dfde 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -308,8 +308,8 @@ cdef class Column: self._size = other_col.size self._dtype = other_col._dtype self.set_base_data(other_col.base_data) - self.set_base_mask(other_col.base_mask) self.set_base_children(other_col.base_children) + self.set_base_mask(other_col.base_mask) else: return other_col diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index da5d54adec0..6c9b4006869 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -200,34 +200,52 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): """ super().__init__() + if isinstance(columns, (Series, cudf.Index)): + columns = columns.to_pandas() + if isinstance(data, ColumnAccessor): - self._data = data if index is None: - index = as_index(range(self._data.nrows)) - self.index = as_index(index) - return None + index = as_index(range(data.nrows)) + else: + index = as_index(index) + self._index = index - if isinstance(data, DataFrame): - self._data = data._data - self._index = data._index - self.columns = data.columns - return + if columns is not None: + self._data = data + self._reindex(columns=columns, deep=True, inplace=True) + else: + self._data = data - if isinstance(data, pd.DataFrame): - data = self.from_pandas(data) - self._data = data._data - self._index = data._index - self.columns = data.columns - return + elif isinstance(data, (DataFrame, pd.DataFrame)): + if isinstance(data, pd.DataFrame): + data = self.from_pandas(data) - if data is None: + if index is not None: + if not data.index.equals(index): + data = data.reindex(index) + index = data._index + else: + index = as_index(index) + else: + index = data._index + + self._index = index + + if columns is not None: + self._data = data._data + self._reindex( + columns=columns, index=index, deep=False, inplace=True + ) + else: + self._data = data._data + self.columns = data.columns + + elif data is None: if index is None: self._index = RangeIndex(0) else: self._index = as_index(index) if columns is not None: - if isinstance(columns, (Series, cudf.Index)): - columns = columns.to_pandas() self._data = ColumnAccessor( OrderedDict.fromkeys( @@ -2560,48 +2578,16 @@ def reindex( df = self cols = columns - original_cols = df._data dtypes = OrderedDict(df.dtypes) idx = labels if index is None and axis in (0, "index") else index cols = labels if cols is None and axis in (1, "columns") else cols df = df if cols is None else df[list(set(df.columns) & set(cols))] - if idx is not None: - idx = as_index(idx) - - if isinstance(idx, cudf.core.MultiIndex): - idx_dtype_match = ( - df.index._source_data.dtypes == idx._source_data.dtypes - ).all() - else: - idx_dtype_match = df.index.dtype == idx.dtype - - if not idx_dtype_match: - cols = cols if cols is not None else list(df.columns) - df = DataFrame() - else: - df = DataFrame(None, idx).join(df, how="left", sort=True) - # double-argsort to map back from sorted to unsorted positions - df = df.take(idx.argsort(ascending=True).argsort()) - - idx = idx if idx is not None else df.index - names = cols if cols is not None else list(df.columns) - - length = len(idx) - cols = OrderedDict() - - for name in names: - if name in df: - cols[name] = df._data[name].copy(deep=copy) - else: - dtype = dtypes.get(name, np.float64) - col = original_cols.get(name, Series(dtype=dtype)._column) - col = column.column_empty_like( - col, dtype=dtype, masked=True, newsize=length - ) - cols[name] = col + result = df._reindex( + columns=cols, dtypes=dtypes, deep=copy, index=idx, inplace=False + ) - return DataFrame(cols, idx) + return result def _set_index( self, index, to_drop=None, inplace=False, verify_integrity=False, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 6b7ed4f2f9a..7da3facb6ad 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1,9 +1,9 @@ # Copyright (c) 2020, NVIDIA CORPORATION. import copy import functools +import operator import warnings from collections import OrderedDict, abc as abc -import operator import cupy import numpy as np @@ -3301,6 +3301,79 @@ def _encode(self): keys = self.__class__._from_table(keys) return keys, indices + def _reindex( + self, columns, dtypes=None, deep=False, index=None, inplace=False + ): + """ + Helper for `.reindex` + + Parameters + ---------- + columns : array-like + The list of columns to select from the Frame, + if ``columns`` is a superset of ``Frame.columns`` new + columns are created. + dtypes : dict + Mapping of dtypes for the empty columns being created. + deep : boolean, optional, default False + Whether to make deep copy or shallow copy of the columns. + index : Index or array-like, default None + The ``index`` to be used to reindex the Frame with. + inplace : bool, default False + Whether to perform the operation in place on the data. + + Returns + ------- + DataFrame + """ + if dtypes is None: + dtypes = {} + + df = self + if index is not None: + index = cudf.core.index.as_index(index) + + if isinstance(index, cudf.core.MultiIndex): + idx_dtype_match = ( + df.index._source_data.dtypes == index._source_data.dtypes + ).all() + else: + idx_dtype_match = df.index.dtype == index.dtype + + if not idx_dtype_match: + columns = columns if columns is not None else list(df.columns) + df = cudf.DataFrame() + else: + df = cudf.DataFrame(None, index).join( + df, how="left", sort=True + ) + # double-argsort to map back from sorted to unsorted positions + df = df.take(index.argsort(ascending=True).argsort()) + + cols = OrderedDict() + index = index if index is not None else df.index + names = columns if columns is not None else list(df.columns) + for name in names: + if name in df._data: + cols[name] = df._data[name].copy(deep=deep) + else: + dtype = dtypes.get(name, np.float64) + cols[name] = column_empty( + dtype=dtype, masked=True, row_count=len(index) + ) + result = self.__class__._from_table( + Frame( + data=cudf.core.column_accessor.ColumnAccessor( + cols, + multiindex=self._data.multiindex, + level_names=self._data.level_names, + ) + ), + index=index, + ) + + return self._mimic_inplace(result, inplace=inplace) + def _get_replacement_values(to_replace, replacement, col_name, column): diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index a8c38c5e200..9dc7f8380f7 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8015,3 +8015,62 @@ def test_dataframe_from_pandas_duplicate_columns(): ValueError, match="Duplicate column names are not allowed" ): gd.from_pandas(pdf) + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame( + {"a": [1, 2, 3], "b": [10, 11, 20], "c": ["a", "bcd", "xyz"]} + ), + pd.DataFrame(), + ], +) +@pytest.mark.parametrize( + "columns", + [ + None, + ["a"], + ["c", "a"], + ["b", "a", "c"], + [], + pd.Index(["c", "a"]), + gd.Index(["c", "a"]), + ["abc", "a"], + ["column_not_exists1", "column_not_exists2"], + ], +) +@pytest.mark.parametrize("index", [["abc", "def", "ghi"]]) +def test_dataframe_constructor_columns(df, columns, index): + def assert_local_eq(actual, df, expected, host_columns): + check_index_type = False if expected.empty else True + if host_columns is not None and any( + col not in df.columns for col in host_columns + ): + assert_eq( + expected, + actual, + check_dtype=False, + check_index_type=check_index_type, + ) + else: + assert_eq(expected, actual, check_index_type=check_index_type) + + gdf = gd.from_pandas(df) + host_columns = ( + columns.to_pandas() if isinstance(columns, gd.Index) else columns + ) + + expected = pd.DataFrame(df, columns=host_columns, index=index) + actual = gd.DataFrame(gdf, columns=columns, index=index) + + assert_local_eq(actual, df, expected, host_columns) + + expected = pd.DataFrame(df, columns=host_columns) + actual = gd.DataFrame(gdf._data, columns=columns, index=index) + if index is not None: + if df.shape == (0, 0): + expected = pd.DataFrame(columns=host_columns, index=index) + else: + expected.index = index + assert_local_eq(actual, df, expected, host_columns) diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index 16454019929..bed383f4565 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -34,7 +34,7 @@ def _quantile(a, q): n = len(a) if not len(a): return None, n - return (a.quantiles(q.tolist(), interpolation="nearest"), n) + return (a.quantiles(q=q.tolist(), interpolation="nearest"), n) def merge_quantiles(finalq, qs, vals): @@ -121,7 +121,7 @@ def _approximate_quantile(df, q): # Define final action (create df with quantiles as index) def finalize_tsk(tsk): - return (final_type, tsk, q) + return (final_type, tsk) return_type = df.__class__ @@ -162,7 +162,14 @@ def finalize_tsk(tsk): } dsk = toolz.merge(val_dsk, merge_dsk) graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[df]) - return return_type(graph, name2, meta, new_divisions) + df = return_type(graph, name2, meta, new_divisions) + + def set_quantile_index(df): + df.index = q + return df + + df = df.map_partitions(set_quantile_index, meta=meta) + return df def quantile_divisions(df, by, npartitions):