From 0bd9cda5aa462df0afd48591fed760bdbec6e6c8 Mon Sep 17 00:00:00 2001
From: Gregory Shimansky <gregory.shimansky@intel.com>
Date: Wed, 27 May 2020 12:51:34 -0500
Subject: [PATCH 1/2] Fixed #1490. New column case is checked first in
 __setitem__

Signed-off-by: Gregory Shimansky <gregory.shimansky@intel.com>
---
 modin/pandas/dataframe.py           | 69 ++++++++++++++++-------------
 modin/pandas/test/test_dataframe.py | 32 +++++++++----
 2 files changed, 62 insertions(+), 39 deletions(-)

diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index 97953b9fbc2..2a9765488be 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -2059,6 +2059,41 @@ def __setattr__(self, key, value):
         object.__setattr__(self, key, value)
 
     def __setitem__(self, key, value):
+        if key not in self.columns:
+            # Handle new column case first
+            if isinstance(value, Series):
+                if len(self.columns) == 0:
+                    self._query_compiler = value._query_compiler.copy()
+                else:
+                    self._create_or_update_from_compiler(
+                        self._query_compiler.concat(1, value._query_compiler),
+                        inplace=True,
+                    )
+                # Now that the data is appended, we need to update the column name for
+                # that column to `key`, otherwise the name could be incorrect. Drop the
+                # last column name from the list (the appended value's name and append
+                # the new name.
+                self.columns = self.columns[:-1].append(pandas.Index([key]))
+            elif (
+                isinstance(value, np.ndarray)
+                and len(value.shape) > 1
+                and value.shape[1] != 1
+            ):
+                raise ValueError(
+                    "Wrong number of items passed %i, placement implies 1"
+                    % value.shape[1]
+                )
+            elif (
+                isinstance(value, (pandas.DataFrame, DataFrame)) and value.shape[1] != 1
+            ):
+                raise ValueError(
+                    "Wrong number of items passed %i, placement implies 1"
+                    % value.shape[1]
+                )
+            else:
+                self.insert(loc=len(self.columns), column=key, value=value)
+            return
+
         if not isinstance(key, str):
 
             def setitem_without_string_columns(df):
@@ -2071,24 +2106,11 @@ def setitem_without_string_columns(df):
             return self._update_inplace(
                 self._default_to_pandas(setitem_without_string_columns)._query_compiler
             )
+
         if is_list_like(value):
             if isinstance(value, (pandas.DataFrame, DataFrame)):
-                if value.shape[1] != 1 and key not in self.columns:
-                    raise ValueError(
-                        "Wrong number of items passed %i, placement implies 1"
-                        % value.shape[1]
-                    )
                 value = value[value.columns[0]].values
             elif isinstance(value, np.ndarray):
-                if (
-                    len(value.shape) > 1
-                    and value.shape[1] != 1
-                    and key not in self.columns
-                ):
-                    raise ValueError(
-                        "Wrong number of items passed %i, placement implies 1"
-                        % value.shape[1]
-                    )
                 assert (
                     len(value.shape) < 3
                 ), "Shape of new values must be compatible with manager shape"
@@ -2097,23 +2119,8 @@ def setitem_without_string_columns(df):
                     value = value[: len(self)]
             if not isinstance(value, Series):
                 value = list(value)
-        if key not in self.columns:
-            if isinstance(value, Series):
-                if len(self.columns) == 0:
-                    self._query_compiler = value._query_compiler.copy()
-                else:
-                    self._create_or_update_from_compiler(
-                        self._query_compiler.concat(1, value._query_compiler),
-                        inplace=True,
-                    )
-                # Now that the data is appended, we need to update the column name for
-                # that column to `key`, otherwise the name could be incorrect. Drop the
-                # last column name from the list (the appended value's name and append
-                # the new name.
-                self.columns = self.columns[:-1].append(pandas.Index([key]))
-            else:
-                self.insert(loc=len(self.columns), column=key, value=value)
-        elif len(self.index) == 0:
+
+        if len(self.index) == 0:
             new_self = DataFrame({key: value}, columns=self.columns)
             self._update_inplace(new_self._query_compiler)
         else:
diff --git a/modin/pandas/test/test_dataframe.py b/modin/pandas/test/test_dataframe.py
index 74db237ceac..d2da880f205 100644
--- a/modin/pandas/test/test_dataframe.py
+++ b/modin/pandas/test/test_dataframe.py
@@ -5131,14 +5131,30 @@ def test___setitem__(self, data):
 
         df_equals(modin_df, pandas_df)
 
-    def test_setitem_on_empty_df(self):
-        columns = ["id", "max_speed", "health"]
-        modin_df = pd.DataFrame(columns=columns)
-        pandas_df = pandas.DataFrame(columns=columns)
-        a = np.array(["one", "two"])
-
-        modin_df["id"] = a
-        pandas_df["id"] = a
+    @pytest.mark.parametrize(
+        "data",
+        [
+            {},
+            pytest.param(
+                {"id": [], "max_speed": [], "health": []},
+                marks=pytest.mark.xfail(
+                    reason="Throws an exception because generally assigning Series or other objects of length different from DataFrame does not work right now"
+                ),
+            ),
+        ],
+        ids=["empty", "empty_columns"],
+    )
+    @pytest.mark.parametrize(
+        "value", [np.array(["one", "two"]), [11, 22]], ids=["ndarray", "list"],
+    )
+    @pytest.mark.parametrize("convert_to_series", [False, True])
+    @pytest.mark.parametrize("new_col_id", [123, "new_col"], ids=["integer", "string"])
+    def test_setitem_on_empty_df(self, data, value, convert_to_series, new_col_id):
+        pandas_df = pandas.DataFrame(data)
+        modin_df = pd.DataFrame(data)
+
+        pandas_df[new_col_id] = pandas.Series(value) if convert_to_series else value
+        modin_df[new_col_id] = pd.Series(value) if convert_to_series else value
         df_equals(modin_df, pandas_df)
 
     @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)

From 18e255621f44adacf11e1fdddc9f6ad26b7dcdc4 Mon Sep 17 00:00:00 2001
From: Gregory Shimansky <gshimansky@users.noreply.github.com>
Date: Wed, 27 May 2020 18:49:37 -0500
Subject: [PATCH 2/2] Remove extra new line in modin/pandas/dataframe.py

Co-authored-by: Devin Petersohn <devin-petersohn@users.noreply.github.com>
---
 modin/pandas/dataframe.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index 2a9765488be..ec03f2a64dc 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -2106,7 +2106,6 @@ def setitem_without_string_columns(df):
             return self._update_inplace(
                 self._default_to_pandas(setitem_without_string_columns)._query_compiler
             )
-
         if is_list_like(value):
             if isinstance(value, (pandas.DataFrame, DataFrame)):
                 value = value[value.columns[0]].values