Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed #1490. New column case is checked first in __setitem__ #1495

Merged
merged 2 commits into from
May 28, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 37 additions & 31 deletions modin/pandas/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2059,6 +2059,41 @@ def __setattr__(self, key, value):
object.__setattr__(self, key, value)

def __setitem__(self, key, value):
if key not in self.columns:
# Handle new column case first
if isinstance(value, Series):
if len(self.columns) == 0:
self._query_compiler = value._query_compiler.copy()
else:
self._create_or_update_from_compiler(
self._query_compiler.concat(1, value._query_compiler),
inplace=True,
)
# Now that the data is appended, we need to update the column name for
# that column to `key`, otherwise the name could be incorrect. Drop the
# last column name from the list (the appended value's name) and append
# the new name.
self.columns = self.columns[:-1].append(pandas.Index([key]))
elif (
isinstance(value, np.ndarray)
and len(value.shape) > 1
and value.shape[1] != 1
):
raise ValueError(
"Wrong number of items passed %i, placement implies 1"
% value.shape[1]
)
elif (
isinstance(value, (pandas.DataFrame, DataFrame)) and value.shape[1] != 1
):
raise ValueError(
"Wrong number of items passed %i, placement implies 1"
% value.shape[1]
)
else:
self.insert(loc=len(self.columns), column=key, value=value)
return

if not isinstance(key, str):

def setitem_without_string_columns(df):
Expand All @@ -2073,22 +2108,8 @@ def setitem_without_string_columns(df):
)
if is_list_like(value):
if isinstance(value, (pandas.DataFrame, DataFrame)):
if value.shape[1] != 1 and key not in self.columns:
raise ValueError(
"Wrong number of items passed %i, placement implies 1"
% value.shape[1]
)
value = value[value.columns[0]].values
elif isinstance(value, np.ndarray):
if (
len(value.shape) > 1
and value.shape[1] != 1
and key not in self.columns
):
raise ValueError(
"Wrong number of items passed %i, placement implies 1"
% value.shape[1]
)
assert (
len(value.shape) < 3
), "Shape of new values must be compatible with manager shape"
Expand All @@ -2097,23 +2118,8 @@ def setitem_without_string_columns(df):
value = value[: len(self)]
if not isinstance(value, Series):
value = list(value)
if key not in self.columns:
if isinstance(value, Series):
if len(self.columns) == 0:
self._query_compiler = value._query_compiler.copy()
else:
self._create_or_update_from_compiler(
self._query_compiler.concat(1, value._query_compiler),
inplace=True,
)
# Now that the data is appended, we need to update the column name for
# that column to `key`, otherwise the name could be incorrect. Drop the
# last column name from the list (the appended value's name) and append
# the new name.
self.columns = self.columns[:-1].append(pandas.Index([key]))
else:
self.insert(loc=len(self.columns), column=key, value=value)
elif len(self.index) == 0:

if len(self.index) == 0:
new_self = DataFrame({key: value}, columns=self.columns)
self._update_inplace(new_self._query_compiler)
else:
Expand Down
32 changes: 24 additions & 8 deletions modin/pandas/test/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5131,14 +5131,30 @@ def test___setitem__(self, data):

df_equals(modin_df, pandas_df)

def test_setitem_on_empty_df(self):
columns = ["id", "max_speed", "health"]
modin_df = pd.DataFrame(columns=columns)
pandas_df = pandas.DataFrame(columns=columns)
a = np.array(["one", "two"])

modin_df["id"] = a
pandas_df["id"] = a
@pytest.mark.parametrize(
"data",
[
{},
pytest.param(
{"id": [], "max_speed": [], "health": []},
marks=pytest.mark.xfail(
reason="Throws an exception because generally assigning Series or other objects of length different from DataFrame does not work right now"
),
),
],
ids=["empty", "empty_columns"],
)
@pytest.mark.parametrize(
"value", [np.array(["one", "two"]), [11, 22]], ids=["ndarray", "list"],
)
@pytest.mark.parametrize("convert_to_series", [False, True])
@pytest.mark.parametrize("new_col_id", [123, "new_col"], ids=["integer", "string"])
def test_setitem_on_empty_df(self, data, value, convert_to_series, new_col_id):
pandas_df = pandas.DataFrame(data)
modin_df = pd.DataFrame(data)

pandas_df[new_col_id] = pandas.Series(value) if convert_to_series else value
modin_df[new_col_id] = pd.Series(value) if convert_to_series else value
df_equals(modin_df, pandas_df)

@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
Expand Down