From 210e34dcbc7716254d56d6c74164a025638e067b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 11 Nov 2020 07:41:52 -0800 Subject: [PATCH 01/16] fix typo --- python/cudf/cudf/_lib/parquet.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 3c2ff128685..b97529d39d1 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -142,7 +142,7 @@ cpdef generate_pandas_metadata(Table table, index): "'category' column dtypes are currently not " + "supported by the gpu accelerated parquet writer" ) - elif is_list_dtype(col): + elif is_list_dtype(idx): types.append(col.dtype.to_arrow()) else: types.append(np_to_pa_dtype(idx.dtype)) From f16ab6a1cedbfab0c71f80fcbf527c484bbae336 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 11 Nov 2020 20:54:01 -0600 Subject: [PATCH 02/16] Fix writing index in parquet writer. --- python/cudf/cudf/_lib/parquet.pyx | 98 +++++++++++++++----------- python/cudf/cudf/tests/test_parquet.py | 34 +++++++++ 2 files changed, 91 insertions(+), 41 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index b97529d39d1..a213d8c14fe 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -119,37 +119,34 @@ cpdef generate_pandas_metadata(Table table, index): # Indexes if index is not False: - for name in table._index.names: - if name is not None: - if isinstance(table._index, cudf.core.multiindex.MultiIndex): - idx = table.index.get_level_values(name) - else: - idx = table.index - - if isinstance(idx, cudf.core.index.RangeIndex): - descr = { - "kind": "range", - "name": table.index.name, - "start": table.index._start, - "stop": table.index._stop, - "step": 1, - } - else: - descr = name - col_names.append(name) - if is_categorical_dtype(idx): - raise ValueError( - "'category' column dtypes are currently not " - + "supported by the gpu accelerated parquet writer" - ) - elif is_list_dtype(idx): - types.append(col.dtype.to_arrow()) - else: - types.append(np_to_pa_dtype(idx.dtype)) - index_levels.append(idx) - index_descriptors.append(descr) + for level, name in enumerate(table._index.names): + if isinstance(table._index, cudf.core.multiindex.MultiIndex): + idx = table.index.get_level_values(level) + else: + idx = table.index + + if isinstance(idx, cudf.core.index.RangeIndex): + descr = { + "kind": "range", + "name": table.index.name, + "start": table.index._start, + "stop": table.index._stop, + "step": 1, + } else: - col_names.append(name) + descr = _index_level_name(idx.name, level, col_names) + if is_categorical_dtype(idx): + raise ValueError( + "'category' column dtypes are currently not " + + "supported by the gpu accelerated parquet writer" + ) + elif is_list_dtype(idx): + types.append(col.dtype.to_arrow()) + else: + types.append(np_to_pa_dtype(idx.dtype)) + index_levels.append(idx) + col_names.append(name) + index_descriptors.append(descr) metadata = pa.pandas_compat.construct_metadata( table, @@ -295,21 +292,20 @@ cpdef write_parquet( cdef vector[string] column_names cdef map[string, string] user_data - cdef table_view tv = table.data_view() + cdef table_view tv cdef unique_ptr[cudf_io_types.data_sink] _data_sink cdef cudf_io_types.sink_info sink = make_sink_info(path, _data_sink) - if index is not False: + if index is not False and not isinstance(table._index, cudf.RangeIndex): tv = table.view() - if isinstance(table._index, cudf.core.multiindex.MultiIndex): - for idx_name in table._index.names: - column_names.push_back(str.encode(idx_name)) - else: - if table._index.name is not None: - column_names.push_back(str.encode(table._index.name)) - else: - # No named index exists so just write out columns - tv = table.data_view() + for level, idx_name in enumerate(table._index.names): + column_names.push_back( + str.encode( + _index_level_name(idx_name, level, table._column_names) + ) + ) + else: + tv = table.data_view() for col_name in table._column_names: column_names.push_back(str.encode(col_name)) @@ -541,3 +537,23 @@ cdef Column _update_column_struct_field_names( ) col.set_base_children(tuple(children)) return col + + +def _index_level_name(index_name, level, column_names): + """ + Return the name of an index level or a default name + if `index_name` is None or is already a column name. + + Parameters + ---------- + index_name : name of an Index object + level : level of the Index object + + Returns + ------- + name : str + """ + if index_name is not None and index_name not in column_names: + return index_name + else: + return f"__index_level_{level}__" diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 633f5b472e7..73035a4717c 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1579,3 +1579,37 @@ def test_parquet_nullable_boolean(tmpdir, engine): actual_gdf = cudf.read_parquet(pandas_path, engine=engine) assert_eq(actual_gdf, expected_gdf) + + +@pytest.mark.parametrize( + "pdf", + [ + pd.DataFrame(index=[1, 2, 3]), + # pd.DataFrame(index=pd.RangeIndex(0, 10, 1)), + pd.DataFrame({"a": [1, 2, 3]}, index=[0.43534, 345, 0.34534]), + pd.DataFrame( + {"b": [11, 22, 33], "c": ["a", "b", "c"]}, + index=pd.Index(["a", "b", "c"], name="custom name"), + ), + pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")), + ], +) +@pytest.mark.parametrize("index", [None, True, False]) +def test_parquet_index(tmpdir, pdf, index): + pandas_path = tmpdir.join("pandas_index.parquet") + cudf_path = tmpdir.join("pandas_index.parquet") + + gdf = cudf.from_pandas(pdf) + + pdf.to_parquet(pandas_path, index=index) + gdf.to_parquet(cudf_path, index=index) + + expected = pd.read_parquet(cudf_path) + actual = cudf.read_parquet(cudf_path) + + assert_eq(expected, actual) + + expected = pd.read_parquet(pandas_path) + actual = cudf.read_parquet(pandas_path) + + assert_eq(expected, actual) From 614500a5fac0505ac9fde0e20fb53daf9224193e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 16 Nov 2020 14:52:34 -0600 Subject: [PATCH 03/16] Fix Parquet reader index handling --- python/cudf/cudf/_lib/parquet.pyx | 60 ++++++++++++++----- python/cudf/cudf/core/dataframe.py | 9 +-- python/cudf/cudf/core/index.py | 7 +++ python/cudf/cudf/tests/test_pandas_interop.py | 10 ++-- python/cudf/cudf/tests/test_parquet.py | 52 +++++++++------- 5 files changed, 89 insertions(+), 49 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index a213d8c14fe..3d8d04e07a9 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -6,6 +6,7 @@ import cudf import errno import os import pyarrow as pa +from collections import OrderedDict try: import ujson as json @@ -129,9 +130,9 @@ cpdef generate_pandas_metadata(Table table, index): descr = { "kind": "range", "name": table.index.name, - "start": table.index._start, - "stop": table.index._stop, - "step": 1, + "start": table.index.start, + "stop": table.index.stop, + "step": table.index.step, } else: descr = _index_level_name(idx.name, level, col_names) @@ -222,15 +223,24 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, column_names = [x.decode() for x in c_out_table.metadata.column_names] # Access the Parquet user_data json to find the index - index_col = '' + index_col = None cdef map[string, string] user_data = c_out_table.metadata.user_data json_str = user_data[b'pandas'].decode('utf-8') meta = None if json_str != "": meta = json.loads(json_str) if 'index_columns' in meta and len(meta['index_columns']) > 0: - index_col = meta['index_columns'][0] - + index_col = meta['index_columns'] + if isinstance(index_col[0], dict) and \ + index_col[0]['kind'] == 'range': + is_range_index = True + else: + is_range_index = False + index_col_names = OrderedDict() + for idx_col in index_col: + for c in meta['columns']: + if c['field_name'] == idx_col: + index_col_names[idx_col] = c['name'] df = cudf.DataFrame._from_table( Table.from_unique_ptr( move(c_out_table.tbl), @@ -247,7 +257,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, if not column_names: column_names = [o['name'] for o in meta['columns']] - if index_col in cols_dtype_map: + if not is_range_index and index_col in cols_dtype_map: column_names.remove(index_col) for col in column_names: @@ -258,16 +268,38 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, ) # Set the index column - if index_col is not '' and isinstance(index_col, str): - if index_col in column_names: - df = df.set_index(index_col) - new_index_name = pa.pandas_compat._backwards_compatible_index_name( - df.index.name, df.index.name + if index_col is not None and len(index_col) > 0: + if is_range_index: + range_index_meta = index_col[0] + idx = cudf.RangeIndex( + start=range_index_meta['start'], + stop=range_index_meta['stop'], + step=range_index_meta['step'], + name=range_index_meta['name'] ) - df.index.name = new_index_name + if skiprows is not None: + idx = idx[skiprows:] + if num_rows is not None: + idx = idx[:num_rows] + df.index = idx + elif set(index_col).issubset(column_names): + index_data = df[index_col] + actual_index_names = list(index_col_names.values()) + if len(index_data._data) == 1: + idx = cudf.Index( + index_data._data.columns[0], + name=actual_index_names[0] + ) + else: + idx = cudf.MultiIndex.from_frame( + index_data, + names=actual_index_names + ) + df.drop(columns=index_col, inplace=True) + df.index = idx else: if use_pandas_metadata: - df.index.name = index_col + df.index.names = index_col return df diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a730e3488eb..107d2d20e38 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4940,10 +4940,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): df.columns = dataframe.columns # Set index - if isinstance(dataframe.index, pd.MultiIndex): - index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null) - else: - index = dataframe.index + index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null) result = df.set_index(index) return result @@ -7137,10 +7134,8 @@ def from_pandas(obj, nan_as_null=None): elif isinstance(obj, pd.MultiIndex): return cudf.MultiIndex.from_pandas(obj, nan_as_null=nan_as_null) elif isinstance(obj, pd.RangeIndex): - if obj._step and obj._step != 1: - raise ValueError("cudf RangeIndex requires step == 1") return cudf.core.index.RangeIndex( - obj._start, stop=obj._stop, name=obj.name + start=obj.start, stop=obj.stop, step=obj.step, name=obj.name ) elif isinstance(obj, pd.Index): return cudf.Index.from_pandas(obj, nan_as_null=nan_as_null) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 7485b99b0ce..56348e4a1a4 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1532,6 +1532,13 @@ def stop(self): """ return self._stop + @property + def step(self): + """ + The value of the step parameter. + """ + return self._step + @property def _num_columns(self): return 1 diff --git a/python/cudf/cudf/tests/test_pandas_interop.py b/python/cudf/cudf/tests/test_pandas_interop.py index 064b73f1052..15b1acdfc08 100644 --- a/python/cudf/cudf/tests/test_pandas_interop.py +++ b/python/cudf/cudf/tests/test_pandas_interop.py @@ -1,8 +1,7 @@ -# Copyright (c) 2018, NVIDIA CORPORATION. +# Copyright (c) 2018-2020, NVIDIA CORPORATION. import numpy as np import pandas as pd -import pytest import cudf from cudf.core import DataFrame @@ -85,6 +84,7 @@ def test_from_pandas_rangeindex(): def test_from_pandas_rangeindex_step(): - idx1 = pd.RangeIndex(start=0, stop=8, step=2, name="myindex") - with pytest.raises(ValueError): - cudf.from_pandas(idx1) + expected = pd.RangeIndex(start=0, stop=8, step=2, name="myindex") + actual = cudf.from_pandas(expected) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 73035a4717c..e66adff7d5e 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1,4 +1,5 @@ # Copyright (c) 2019-2020, NVIDIA CORPORATION. + import os import pathlib import random @@ -967,15 +968,7 @@ def test_parquet_reader_list_skiprows(skip, tmpdir): src.to_parquet(fname) assert os.path.exists(fname) - expect = pd.DataFrame( - { - "a": list_gen(int_gen, skip, num_rows - skip, 80, 50), - "b": list_gen(string_gen, skip, num_rows - skip, 80, 50), - "c": list_gen( - int_gen, skip, num_rows - skip, 80, 50, include_validity=True - ), - } - ) + expect = src.iloc[skip:] got = cudf.read_parquet(fname, skiprows=skip) assert_eq(expect, got, check_dtype=False) @@ -998,18 +991,7 @@ def test_parquet_reader_list_num_rows(skip, tmpdir): assert os.path.exists(fname) rows_to_read = min(3, num_rows - skip) - expect = pd.DataFrame( - { - "a": list_gen(int_gen, skip, rows_to_read, 80, 50), - "b": list_gen(string_gen, skip, rows_to_read, 80, 50), - "c": list_gen( - int_gen, skip, rows_to_read, 80, 50, include_validity=True - ), - "d": list_gen( - string_gen, skip, rows_to_read, 80, 50, include_validity=True - ), - } - ) + expect = src.iloc[skip:].head(rows_to_read) got = cudf.read_parquet(fname, skiprows=skip, num_rows=rows_to_read) assert_eq(expect, got, check_dtype=False) @@ -1514,7 +1496,7 @@ def test_parquet_writer_sliced(tmpdir): df_select = df.iloc[1:3] df_select.to_parquet(cudf_path) - assert_eq(cudf.read_parquet(cudf_path), df_select.reset_index(drop=True)) + assert_eq(cudf.read_parquet(cudf_path), df_select) def test_parquet_writer_list_basic(tmpdir): @@ -1585,13 +1567,37 @@ def test_parquet_nullable_boolean(tmpdir, engine): "pdf", [ pd.DataFrame(index=[1, 2, 3]), - # pd.DataFrame(index=pd.RangeIndex(0, 10, 1)), + pytest.param( + pd.DataFrame(index=pd.RangeIndex(0, 10, 1)), + marks=pytest.mark.xfail( + reason="https://github.com/pandas-dev/pandas/issues/37897" + "https://github.com/pandas-dev/pandas/issues/37896" + ), + ), pd.DataFrame({"a": [1, 2, 3]}, index=[0.43534, 345, 0.34534]), pd.DataFrame( {"b": [11, 22, 33], "c": ["a", "b", "c"]}, index=pd.Index(["a", "b", "c"], name="custom name"), ), + pd.DataFrame( + {"a": [10, 11, 12], "b": [99, 88, 77]}, + index=pd.RangeIndex(12, 17, 2), + ), + pd.DataFrame( + {"b": [99, 88, 77]}, + index=pd.RangeIndex(22, 27, 2, name="hello index"), + ), pd.DataFrame(index=pd.Index(["a", "b", "c"], name="custom name")), + pd.DataFrame( + {"a": ["a", "bb", "cc"], "b": [10, 21, 32]}, + index=pd.MultiIndex.from_tuples([[1, 2], [10, 11], [15, 16]]), + ), + pd.DataFrame( + {"a": ["a", "bb", "cc"], "b": [10, 21, 32]}, + index=pd.MultiIndex.from_tuples( + [[1, 2], [10, 11], [15, 16]], names=["first", "second"] + ), + ), ], ) @pytest.mark.parametrize("index", [None, True, False]) From 3bb0aecc152f45ec3a5477ac32c3184d1dcca2a0 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 16 Nov 2020 15:01:46 -0600 Subject: [PATCH 04/16] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ff00d2ac33b..8375bbd4026 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -123,6 +123,7 @@ - PR #6742 Fix concat bug in dask_cudf Series/Index creation - PR #6632 Fix DataFrame initialization from list of dicts - PR #6767 Fix sort order of parameters in `test_scalar_invalid_implicit_conversion` pytest +- PR #6771 Fix index handling in parquet reader and writer # cuDF 0.16.0 (21 Oct 2020) From 01c6423fe8c7298516a2fbc61f5bcd38b9b048f7 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 23 Nov 2020 10:48:18 -0800 Subject: [PATCH 05/16] enable respecting columns and index --- python/cudf/cudf/core/dataframe.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index a730e3488eb..6fd0d6a8040 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -200,6 +200,9 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): """ super().__init__() + if isinstance(columns, (Series, cudf.Index)): + columns = columns.to_pandas() + if isinstance(data, ColumnAccessor): self._data = data if index is None: @@ -207,16 +210,15 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): self.index = as_index(index) return None - if isinstance(data, DataFrame): - self._data = data._data - self._index = data._index - self.columns = data.columns - return + if isinstance(data, (DataFrame, pd.DataFrame)): + if columns is not None: + data = data[columns] + + if isinstance(data, pd.DataFrame): + data = self.from_pandas(data) - if isinstance(data, pd.DataFrame): - data = self.from_pandas(data) self._data = data._data - self._index = data._index + self._index = data._index if index is None else as_index(index) self.columns = data.columns return @@ -226,8 +228,6 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): else: self._index = as_index(index) if columns is not None: - if isinstance(columns, (Series, cudf.Index)): - columns = columns.to_pandas() self._data = ColumnAccessor( OrderedDict.fromkeys( From a9d920011df06156affc4a26aef71f4a344a5447 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 23 Nov 2020 13:59:00 -0800 Subject: [PATCH 06/16] fix columnAccessor constructor --- python/cudf/cudf/core/dataframe.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 6fd0d6a8040..c4c3ad6e1f1 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -204,6 +204,15 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): columns = columns.to_pandas() if isinstance(data, ColumnAccessor): + if columns is not None: + data = ColumnAccessor( + data=OrderedDict( + (col_name, data[col_name]) for col_name in columns + ), + multiindex=data.multiindex, + level_names=data.level_names, + ) + self._data = data if index is None: index = as_index(range(self._data.nrows)) From b2c45b3b72da6e013d3daa4e29ce92ae4c49226a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 23 Nov 2020 17:22:52 -0800 Subject: [PATCH 07/16] handle non-existent columns --- python/cudf/cudf/core/dataframe.py | 37 +++++++++++++++++------- python/cudf/cudf/tests/test_dataframe.py | 34 ++++++++++++++++++++++ 2 files changed, 61 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c4c3ad6e1f1..35838f1189e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -205,12 +205,8 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): if isinstance(data, ColumnAccessor): if columns is not None: - data = ColumnAccessor( - data=OrderedDict( - (col_name, data[col_name]) for col_name in columns - ), - multiindex=data.multiindex, - level_names=data.level_names, + data = _get_columns_from_column_accessor( + data=data, columns=columns ) self._data = data @@ -220,13 +216,16 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): return None if isinstance(data, (DataFrame, pd.DataFrame)): - if columns is not None: - data = data[columns] - if isinstance(data, pd.DataFrame): data = self.from_pandas(data) - self._data = data._data + if columns is not None: + self._data = _get_columns_from_column_accessor( + data=data._data, columns=columns + ) + else: + self._data = data._data + self._index = data._index if index is None else as_index(index) self.columns = data.columns return @@ -7306,3 +7305,21 @@ def _get_host_unique(array): return [array] else: return set(array) + + +def _get_columns_from_column_accessor(data, columns): + return ColumnAccessor( + data=OrderedDict( + ( + col_name, + data[col_name] + if col_name in data + else cudf.core.column.column_empty( + row_count=data.nrows, dtype="float64", masked=True + ), + ) + for col_name in columns + ), + multiindex=data.multiindex, + level_names=data.level_names, + ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 69eb70e7201..82ce87c668b 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7995,3 +7995,37 @@ def test_dataframe_from_pandas_duplicate_columns(): ValueError, match="Duplicate column names are not allowed" ): gd.from_pandas(pdf) + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame( + {"a": [1, 2, 3], "b": [10, 11, 20], "c": ["a", "bcd", "xyz"]} + ) + ], +) +@pytest.mark.parametrize( + "columns", + [ + None, + ["a"], + ["c", "a"], + ["b", "a", "c"], + [], + pd.Index(["c", "a"]), + gd.Index(["c", "a"]), + ], +) +def test_dataframe_constructor_columns(df, columns): + gdf = gd.from_pandas(df) + + expected = pd.DataFrame( + df, + columns=columns.to_pandas() + if isinstance(columns, gd.Index) + else columns, + ) + actual = gd.DataFrame(gdf, columns=columns) + + assert_eq(expected, actual) From c1cf47ec9e9de25e9749f38d294475422fe49401 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 24 Nov 2020 13:00:17 -0600 Subject: [PATCH 08/16] Handle more cases and add tests for the same. --- python/cudf/cudf/core/dataframe.py | 34 +++++++++++------------- python/cudf/cudf/tests/test_dataframe.py | 26 +++++++++++++----- 2 files changed, 35 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 35838f1189e..d07b03db7c2 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -204,33 +204,29 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): columns = columns.to_pandas() if isinstance(data, ColumnAccessor): + if index is None: + index = as_index(range(data.nrows)) + if columns is not None: data = _get_columns_from_column_accessor( - data=data, columns=columns + column_accessor=data, columns=columns ) - self._data = data - if index is None: - index = as_index(range(self._data.nrows)) self.index = as_index(index) - return None - - if isinstance(data, (DataFrame, pd.DataFrame)): + elif isinstance(data, (DataFrame, pd.DataFrame)): if isinstance(data, pd.DataFrame): data = self.from_pandas(data) if columns is not None: self._data = _get_columns_from_column_accessor( - data=data._data, columns=columns + column_accessor=data._data, columns=columns ) else: self._data = data._data + self.columns = data.columns self._index = data._index if index is None else as_index(index) - self.columns = data.columns - return - - if data is None: + elif data is None: if index is None: self._index = RangeIndex(0) else: @@ -7307,19 +7303,21 @@ def _get_host_unique(array): return set(array) -def _get_columns_from_column_accessor(data, columns): +def _get_columns_from_column_accessor(column_accessor, columns): return ColumnAccessor( data=OrderedDict( ( col_name, - data[col_name] - if col_name in data + column_accessor[col_name] + if col_name in column_accessor else cudf.core.column.column_empty( - row_count=data.nrows, dtype="float64", masked=True + row_count=column_accessor.nrows, + dtype="object", + masked=True, ), ) for col_name in columns ), - multiindex=data.multiindex, - level_names=data.level_names, + multiindex=column_accessor.multiindex, + level_names=column_accessor.level_names, ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 82ce87c668b..7476e516679 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8015,17 +8015,29 @@ def test_dataframe_from_pandas_duplicate_columns(): [], pd.Index(["c", "a"]), gd.Index(["c", "a"]), + ["abc", "a"], + ["column_not_exists1", "column_not_exists2"], ], ) def test_dataframe_constructor_columns(df, columns): - gdf = gd.from_pandas(df) + def assert_local_eq(actual, df, expected, host_columns): + if host_columns is not None and any( + col not in df.columns for col in host_columns + ): + assert_eq(expected, actual, check_dtype=False) + else: + assert_eq(expected, actual) - expected = pd.DataFrame( - df, - columns=columns.to_pandas() - if isinstance(columns, gd.Index) - else columns, + gdf = gd.from_pandas(df) + host_columns = ( + columns.to_pandas() if isinstance(columns, gd.Index) else columns ) + + expected = pd.DataFrame(df, columns=host_columns) actual = gd.DataFrame(gdf, columns=columns) - assert_eq(expected, actual) + assert_local_eq(actual, df, expected, host_columns) + + actual = gd.DataFrame(gdf._data, columns=columns) + + assert_local_eq(actual, df, expected, host_columns) From 264c4cf6dcf40280a399a10e8d23cfc0ae6295b6 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 24 Nov 2020 16:05:44 -0600 Subject: [PATCH 09/16] Fix dask issue and add tests --- python/cudf/cudf/core/dataframe.py | 5 ++++- python/cudf/cudf/tests/test_dataframe.py | 13 ++++++++----- python/dask_cudf/dask_cudf/sorting.py | 2 +- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d07b03db7c2..fd5d31c1329 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -217,6 +217,9 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): if isinstance(data, pd.DataFrame): data = self.from_pandas(data) + if index is not None and not data.index.equals(index): + data = data.reindex(index) + if columns is not None: self._data = _get_columns_from_column_accessor( column_accessor=data._data, columns=columns @@ -225,7 +228,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): self._data = data._data self.columns = data.columns - self._index = data._index if index is None else as_index(index) + self._index = data._index elif data is None: if index is None: self._index = RangeIndex(0) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 7476e516679..fb73bf35b80 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8019,7 +8019,8 @@ def test_dataframe_from_pandas_duplicate_columns(): ["column_not_exists1", "column_not_exists2"], ], ) -def test_dataframe_constructor_columns(df, columns): +@pytest.mark.parametrize("index", [None, ["abc", "def", "ghi"]]) +def test_dataframe_constructor_columns(df, columns, index): def assert_local_eq(actual, df, expected, host_columns): if host_columns is not None and any( col not in df.columns for col in host_columns @@ -8033,11 +8034,13 @@ def assert_local_eq(actual, df, expected, host_columns): columns.to_pandas() if isinstance(columns, gd.Index) else columns ) - expected = pd.DataFrame(df, columns=host_columns) - actual = gd.DataFrame(gdf, columns=columns) + expected = pd.DataFrame(df, columns=host_columns, index=index) + actual = gd.DataFrame(gdf, columns=columns, index=index) assert_local_eq(actual, df, expected, host_columns) - actual = gd.DataFrame(gdf._data, columns=columns) - + expected = pd.DataFrame(df, columns=host_columns) + actual = gd.DataFrame(gdf._data, columns=columns, index=index) + if index is not None: + expected.index = index assert_local_eq(actual, df, expected, host_columns) diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index 16454019929..0a908ba1389 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -96,7 +96,7 @@ def _append_counts(val, count): index = lower # alias; we no longer need lower index[mask] = upper[mask] rv = combined_vals.iloc[index] - return rv.reset_index(drop=True) + return rv.reset_index(drop=True)._data def _approximate_quantile(df, q): From cfeef0823f298fd682319d03e6847bd1365eede7 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 24 Nov 2020 16:08:10 -0600 Subject: [PATCH 10/16] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index af326d16ebb..6a8ea5908e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -158,6 +158,7 @@ - PR #6824 Fix JNI build - PR #6826 Fix resource management in Java ColumnBuilder - PR #6830 Fix categorical scalar insertion +- PR #6838 Fix `columns` & `index` handling in dataframe constructor # cuDF 0.16.0 (21 Oct 2020) From 004f7c1691a0779298099fa5022b4c66991d440b Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 25 Nov 2020 04:32:39 -0600 Subject: [PATCH 11/16] Add more changes --- python/cudf/cudf/core/dataframe.py | 27 ++++++++++++++++-------- python/cudf/cudf/tests/test_dataframe.py | 20 +++++++++++++----- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index fd5d31c1329..57251008a04 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -206,12 +206,18 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): if isinstance(data, ColumnAccessor): if index is None: index = as_index(range(data.nrows)) + else: + index = as_index(index) if columns is not None: - data = _get_columns_from_column_accessor( - column_accessor=data, columns=columns + self._data = _get_columns_from_column_accessor( + column_accessor=data, + columns=columns, + nrows=len(index) if data.nrows == 0 else data.nrows, ) - self._data = data + else: + self._data = data + self.index = as_index(index) elif isinstance(data, (DataFrame, pd.DataFrame)): if isinstance(data, pd.DataFrame): @@ -219,16 +225,21 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): if index is not None and not data.index.equals(index): data = data.reindex(index) + index = data._index if columns is not None: self._data = _get_columns_from_column_accessor( - column_accessor=data._data, columns=columns + column_accessor=data._data, + columns=columns, + nrows=len(index) + if data._data.nrows == 0 + else data._data.nrows, ) else: self._data = data._data self.columns = data.columns - self._index = data._index + self._index = index elif data is None: if index is None: self._index = RangeIndex(0) @@ -7306,7 +7317,7 @@ def _get_host_unique(array): return set(array) -def _get_columns_from_column_accessor(column_accessor, columns): +def _get_columns_from_column_accessor(column_accessor, columns, nrows): return ColumnAccessor( data=OrderedDict( ( @@ -7314,9 +7325,7 @@ def _get_columns_from_column_accessor(column_accessor, columns): column_accessor[col_name] if col_name in column_accessor else cudf.core.column.column_empty( - row_count=column_accessor.nrows, - dtype="object", - masked=True, + row_count=nrows, dtype="object", masked=True, ), ) for col_name in columns diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index fb73bf35b80..a370217efd4 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -8002,7 +8002,8 @@ def test_dataframe_from_pandas_duplicate_columns(): [ pd.DataFrame( {"a": [1, 2, 3], "b": [10, 11, 20], "c": ["a", "bcd", "xyz"]} - ) + ), + pd.DataFrame(), ], ) @pytest.mark.parametrize( @@ -8019,15 +8020,21 @@ def test_dataframe_from_pandas_duplicate_columns(): ["column_not_exists1", "column_not_exists2"], ], ) -@pytest.mark.parametrize("index", [None, ["abc", "def", "ghi"]]) +@pytest.mark.parametrize("index", [["abc", "def", "ghi"]]) def test_dataframe_constructor_columns(df, columns, index): def assert_local_eq(actual, df, expected, host_columns): + check_index_type = False if expected.empty else True if host_columns is not None and any( col not in df.columns for col in host_columns ): - assert_eq(expected, actual, check_dtype=False) + assert_eq( + expected, + actual, + check_dtype=False, + check_index_type=check_index_type, + ) else: - assert_eq(expected, actual) + assert_eq(expected, actual, check_index_type=check_index_type) gdf = gd.from_pandas(df) host_columns = ( @@ -8042,5 +8049,8 @@ def assert_local_eq(actual, df, expected, host_columns): expected = pd.DataFrame(df, columns=host_columns) actual = gd.DataFrame(gdf._data, columns=columns, index=index) if index is not None: - expected.index = index + if df.shape == (0, 0): + expected = pd.DataFrame(columns=host_columns, index=index) + else: + expected.index = index assert_local_eq(actual, df, expected, host_columns) From 12849ba4d30491aa0a7ca0206a3f6c9936996cb9 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 1 Dec 2020 11:24:17 -0800 Subject: [PATCH 12/16] handle index slicing when row groups is used --- python/cudf/cudf/_lib/parquet.pyx | 48 ++++++++++++++++++++------ python/cudf/cudf/tests/test_parquet.py | 5 ++- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index 67ff8b6404b..b63d7762269 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -271,16 +271,44 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, if index_col is not None and len(index_col) > 0: if is_range_index: range_index_meta = index_col[0] - idx = cudf.RangeIndex( - start=range_index_meta['start'], - stop=range_index_meta['stop'], - step=range_index_meta['step'], - name=range_index_meta['name'] - ) - if skiprows is not None: - idx = idx[skiprows:] - if num_rows is not None: - idx = idx[:num_rows] + if row_groups is not None: + per_file_metadata = [] + for s in filepaths_or_buffers: + per_file_metadata.append(pa.parquet.read_metadata(s)) + + filtered_idx = [] + for i, file_meta in enumerate(per_file_metadata): + row_groups_i = [] + start = 0 + for row_group in range(file_meta.num_row_groups): + stop = start + file_meta.row_group(row_group).num_rows + row_groups_i.append((start, stop)) + start = stop + + for rg in row_groups[i]: + filtered_idx.append( + cudf.RangeIndex( + start=rg[k][0], + stop=rg[k][1], + step=range_index_meta['step'] + ) + ) + + if len(filtered_idx) > 0: + idx = cudf.concat(filtered_idx) + else: + idx = cudf.Index([]) + else: + idx = cudf.RangeIndex( + start=range_index_meta['start'], + stop=range_index_meta['stop'], + step=range_index_meta['step'], + name=range_index_meta['name'] + ) + if skiprows is not None: + idx = idx[skiprows:] + if num_rows is not None: + idx = idx[:num_rows] df.index = idx elif set(index_col).issubset(column_names): index_data = df[index_col] diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 3f5ae04f341..54b5901c9eb 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -460,7 +460,10 @@ def test_parquet_read_filtered_multiple_files(tmpdir): [fname_0, fname_1, fname_2], filters=[("x", "==", 2)] ) assert_eq( - filtered_df, cudf.DataFrame({"x": [2, 3, 2, 3], "y": list("bbcc")}) + filtered_df, + cudf.DataFrame( + {"x": [2, 3, 2, 3], "y": list("bbcc")}, index=[2, 3, 2, 3] + ), ) From fa8aa69cb88b2cc74abc79acf20f08c27e6d74ef Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 1 Dec 2020 13:55:26 -0800 Subject: [PATCH 13/16] address review comments --- python/cudf/cudf/_lib/parquet.pyx | 12 ++++++------ python/cudf/cudf/tests/test_parquet.py | 3 +-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index b63d7762269..19da062f7c2 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -272,9 +272,9 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, if is_range_index: range_index_meta = index_col[0] if row_groups is not None: - per_file_metadata = [] - for s in filepaths_or_buffers: - per_file_metadata.append(pa.parquet.read_metadata(s)) + per_file_metadata = [ + pa.parquet.read_metadata(s) for s in filepaths_or_buffers + ] filtered_idx = [] for i, file_meta in enumerate(per_file_metadata): @@ -288,8 +288,8 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, for rg in row_groups[i]: filtered_idx.append( cudf.RangeIndex( - start=rg[k][0], - stop=rg[k][1], + start=row_groups_i[rg][0], + stop=row_groups_i[rg][1], step=range_index_meta['step'] ) ) @@ -297,7 +297,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None, if len(filtered_idx) > 0: idx = cudf.concat(filtered_idx) else: - idx = cudf.Index([]) + idx = cudf.Index(cudf.core.column.column_empty(0)) else: idx = cudf.RangeIndex( start=range_index_meta['start'], diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 54b5901c9eb..fb8c293017a 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1636,8 +1636,7 @@ def test_parquet_nullable_boolean(tmpdir, engine): pytest.param( pd.DataFrame(index=pd.RangeIndex(0, 10, 1)), marks=pytest.mark.xfail( - reason="https://github.com/pandas-dev/pandas/issues/37897" - "https://github.com/pandas-dev/pandas/issues/37896" + reason="https://issues.apache.org/jira/browse/ARROW-10643" ), ), pd.DataFrame({"a": [1, 2, 3]}, index=[0.43534, 345, 0.34534]), From 820307ce9561de42a58a5edba99b14edb0a56ea2 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 1 Dec 2020 14:07:48 -0800 Subject: [PATCH 14/16] remove unrelated commits --- CHANGELOG.md | 1 - python/cudf/cudf/core/dataframe.py | 75 +++++++----------------- python/cudf/cudf/tests/test_dataframe.py | 59 ------------------- 3 files changed, 21 insertions(+), 114 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad757afe5f3..db2142805c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -176,7 +176,6 @@ - PR #6855 Fix `.str.replace_with_backrefs` docs examples - PR #6853 Fix contiguous split of null string columns - PR #6861 Fix compile error in type_dispatch_benchmark.cu -- PR #6838 Fix `columns` & `index` handling in dataframe constructor # cuDF 0.16.0 (21 Oct 2020) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index af8aef2180e..d299f6e63fc 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -200,52 +200,34 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): """ super().__init__() - if isinstance(columns, (Series, cudf.Index)): - columns = columns.to_pandas() - if isinstance(data, ColumnAccessor): + self._data = data if index is None: - index = as_index(range(data.nrows)) - else: - index = as_index(index) - - if columns is not None: - self._data = _get_columns_from_column_accessor( - column_accessor=data, - columns=columns, - nrows=len(index) if data.nrows == 0 else data.nrows, - ) - else: - self._data = data - + index = as_index(range(self._data.nrows)) self.index = as_index(index) - elif isinstance(data, (DataFrame, pd.DataFrame)): - if isinstance(data, pd.DataFrame): - data = self.from_pandas(data) + return None - if index is not None and not data.index.equals(index): - data = data.reindex(index) - index = data._index + if isinstance(data, DataFrame): + self._data = data._data + self._index = data._index + self.columns = data.columns + return - if columns is not None: - self._data = _get_columns_from_column_accessor( - column_accessor=data._data, - columns=columns, - nrows=len(index) - if data._data.nrows == 0 - else data._data.nrows, - ) - else: - self._data = data._data - self.columns = data.columns + if isinstance(data, pd.DataFrame): + data = self.from_pandas(data) + self._data = data._data + self._index = data._index + self.columns = data.columns + return - self._index = index - elif data is None: + if data is None: if index is None: self._index = RangeIndex(0) else: self._index = as_index(index) if columns is not None: + if isinstance(columns, (Series, cudf.Index)): + columns = columns.to_pandas() self._data = ColumnAccessor( OrderedDict.fromkeys( @@ -4958,7 +4940,10 @@ def from_pandas(cls, dataframe, nan_as_null=None): df.columns = dataframe.columns # Set index - index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null) + if isinstance(dataframe.index, pd.MultiIndex): + index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null) + else: + index = dataframe.index result = df.set_index(index) return result @@ -7310,21 +7295,3 @@ def _get_host_unique(array): return [array] else: return set(array) - - -def _get_columns_from_column_accessor(column_accessor, columns, nrows): - return ColumnAccessor( - data=OrderedDict( - ( - col_name, - column_accessor[col_name] - if col_name in column_accessor - else cudf.core.column.column_empty( - row_count=nrows, dtype="object", masked=True, - ), - ) - for col_name in columns - ), - multiindex=column_accessor.multiindex, - level_names=column_accessor.level_names, - ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index a370217efd4..69eb70e7201 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -7995,62 +7995,3 @@ def test_dataframe_from_pandas_duplicate_columns(): ValueError, match="Duplicate column names are not allowed" ): gd.from_pandas(pdf) - - -@pytest.mark.parametrize( - "df", - [ - pd.DataFrame( - {"a": [1, 2, 3], "b": [10, 11, 20], "c": ["a", "bcd", "xyz"]} - ), - pd.DataFrame(), - ], -) -@pytest.mark.parametrize( - "columns", - [ - None, - ["a"], - ["c", "a"], - ["b", "a", "c"], - [], - pd.Index(["c", "a"]), - gd.Index(["c", "a"]), - ["abc", "a"], - ["column_not_exists1", "column_not_exists2"], - ], -) -@pytest.mark.parametrize("index", [["abc", "def", "ghi"]]) -def test_dataframe_constructor_columns(df, columns, index): - def assert_local_eq(actual, df, expected, host_columns): - check_index_type = False if expected.empty else True - if host_columns is not None and any( - col not in df.columns for col in host_columns - ): - assert_eq( - expected, - actual, - check_dtype=False, - check_index_type=check_index_type, - ) - else: - assert_eq(expected, actual, check_index_type=check_index_type) - - gdf = gd.from_pandas(df) - host_columns = ( - columns.to_pandas() if isinstance(columns, gd.Index) else columns - ) - - expected = pd.DataFrame(df, columns=host_columns, index=index) - actual = gd.DataFrame(gdf, columns=columns, index=index) - - assert_local_eq(actual, df, expected, host_columns) - - expected = pd.DataFrame(df, columns=host_columns) - actual = gd.DataFrame(gdf._data, columns=columns, index=index) - if index is not None: - if df.shape == (0, 0): - expected = pd.DataFrame(columns=host_columns, index=index) - else: - expected.index = index - assert_local_eq(actual, df, expected, host_columns) From d509894b2a844c2d405a1f68ff7b0caa71919115 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 1 Dec 2020 14:08:59 -0800 Subject: [PATCH 15/16] revert unrelated changes --- python/dask_cudf/dask_cudf/sorting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py index 0a908ba1389..16454019929 100644 --- a/python/dask_cudf/dask_cudf/sorting.py +++ b/python/dask_cudf/dask_cudf/sorting.py @@ -96,7 +96,7 @@ def _append_counts(val, count): index = lower # alias; we no longer need lower index[mask] = upper[mask] rv = combined_vals.iloc[index] - return rv.reset_index(drop=True)._data + return rv.reset_index(drop=True) def _approximate_quantile(df, q): From 2a9b65c1f8127634ae17015c52131ab47c56ec5a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Tue, 1 Dec 2020 14:31:20 -0800 Subject: [PATCH 16/16] add back required change --- python/cudf/cudf/core/dataframe.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index d299f6e63fc..107d2d20e38 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -4940,10 +4940,7 @@ def from_pandas(cls, dataframe, nan_as_null=None): df.columns = dataframe.columns # Set index - if isinstance(dataframe.index, pd.MultiIndex): - index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null) - else: - index = dataframe.index + index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null) result = df.set_index(index) return result