From 0bd9cda5aa462df0afd48591fed760bdbec6e6c8 Mon Sep 17 00:00:00 2001 From: Gregory Shimansky Date: Wed, 27 May 2020 12:51:34 -0500 Subject: [PATCH 1/2] Fixed #1490. New column case is checked first in __setitem__ Signed-off-by: Gregory Shimansky --- modin/pandas/dataframe.py | 69 ++++++++++++++++------------- modin/pandas/test/test_dataframe.py | 32 +++++++++---- 2 files changed, 62 insertions(+), 39 deletions(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 97953b9fbc2..2a9765488be 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -2059,6 +2059,41 @@ def __setattr__(self, key, value): object.__setattr__(self, key, value) def __setitem__(self, key, value): + if key not in self.columns: + # Handle new column case first + if isinstance(value, Series): + if len(self.columns) == 0: + self._query_compiler = value._query_compiler.copy() + else: + self._create_or_update_from_compiler( + self._query_compiler.concat(1, value._query_compiler), + inplace=True, + ) + # Now that the data is appended, we need to update the column name for + # that column to `key`, otherwise the name could be incorrect. Drop the + # last column name from the list (the appended value's name and append + # the new name. + self.columns = self.columns[:-1].append(pandas.Index([key])) + elif ( + isinstance(value, np.ndarray) + and len(value.shape) > 1 + and value.shape[1] != 1 + ): + raise ValueError( + "Wrong number of items passed %i, placement implies 1" + % value.shape[1] + ) + elif ( + isinstance(value, (pandas.DataFrame, DataFrame)) and value.shape[1] != 1 + ): + raise ValueError( + "Wrong number of items passed %i, placement implies 1" + % value.shape[1] + ) + else: + self.insert(loc=len(self.columns), column=key, value=value) + return + if not isinstance(key, str): def setitem_without_string_columns(df): @@ -2071,24 +2106,11 @@ def setitem_without_string_columns(df): return self._update_inplace( self._default_to_pandas(setitem_without_string_columns)._query_compiler ) + if is_list_like(value): if isinstance(value, (pandas.DataFrame, DataFrame)): - if value.shape[1] != 1 and key not in self.columns: - raise ValueError( - "Wrong number of items passed %i, placement implies 1" - % value.shape[1] - ) value = value[value.columns[0]].values elif isinstance(value, np.ndarray): - if ( - len(value.shape) > 1 - and value.shape[1] != 1 - and key not in self.columns - ): - raise ValueError( - "Wrong number of items passed %i, placement implies 1" - % value.shape[1] - ) assert ( len(value.shape) < 3 ), "Shape of new values must be compatible with manager shape" @@ -2097,23 +2119,8 @@ def setitem_without_string_columns(df): value = value[: len(self)] if not isinstance(value, Series): value = list(value) - if key not in self.columns: - if isinstance(value, Series): - if len(self.columns) == 0: - self._query_compiler = value._query_compiler.copy() - else: - self._create_or_update_from_compiler( - self._query_compiler.concat(1, value._query_compiler), - inplace=True, - ) - # Now that the data is appended, we need to update the column name for - # that column to `key`, otherwise the name could be incorrect. Drop the - # last column name from the list (the appended value's name and append - # the new name. - self.columns = self.columns[:-1].append(pandas.Index([key])) - else: - self.insert(loc=len(self.columns), column=key, value=value) - elif len(self.index) == 0: + + if len(self.index) == 0: new_self = DataFrame({key: value}, columns=self.columns) self._update_inplace(new_self._query_compiler) else: diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py index 74db237ceac..d2da880f205 100644 --- a/modin/pandas/test/test_dataframe.py +++ b/modin/pandas/test/test_dataframe.py @@ -5131,14 +5131,30 @@ def test___setitem__(self, data): df_equals(modin_df, pandas_df) - def test_setitem_on_empty_df(self): - columns = ["id", "max_speed", "health"] - modin_df = pd.DataFrame(columns=columns) - pandas_df = pandas.DataFrame(columns=columns) - a = np.array(["one", "two"]) - - modin_df["id"] = a - pandas_df["id"] = a + @pytest.mark.parametrize( + "data", + [ + {}, + pytest.param( + {"id": [], "max_speed": [], "health": []}, + marks=pytest.mark.xfail( + reason="Throws an exception because generally assigning Series or other objects of length different from DataFrame does not work right now" + ), + ), + ], + ids=["empty", "empty_columns"], + ) + @pytest.mark.parametrize( + "value", [np.array(["one", "two"]), [11, 22]], ids=["ndarray", "list"], + ) + @pytest.mark.parametrize("convert_to_series", [False, True]) + @pytest.mark.parametrize("new_col_id", [123, "new_col"], ids=["integer", "string"]) + def test_setitem_on_empty_df(self, data, value, convert_to_series, new_col_id): + pandas_df = pandas.DataFrame(data) + modin_df = pd.DataFrame(data) + + pandas_df[new_col_id] = pandas.Series(value) if convert_to_series else value + modin_df[new_col_id] = pd.Series(value) if convert_to_series else value df_equals(modin_df, pandas_df) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) From 18e255621f44adacf11e1fdddc9f6ad26b7dcdc4 Mon Sep 17 00:00:00 2001 From: Gregory Shimansky Date: Wed, 27 May 2020 18:49:37 -0500 Subject: [PATCH 2/2] Remove extra new line in modin/pandas/dataframe.py Co-authored-by: Devin Petersohn --- modin/pandas/dataframe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 2a9765488be..ec03f2a64dc 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -2106,7 +2106,6 @@ def setitem_without_string_columns(df): return self._update_inplace( self._default_to_pandas(setitem_without_string_columns)._query_compiler ) - if is_list_like(value): if isinstance(value, (pandas.DataFrame, DataFrame)): value = value[value.columns[0]].values