Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Fix #1683 - losing index names in pd.concat #1684

Merged
merged 12 commits into from
Jul 22, 2020
47 changes: 46 additions & 1 deletion modin/pandas/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,13 @@
# governing permissions and limitations under the License.

import pandas
import numpy as np

from typing import Hashable, Iterable, Mapping, Optional, Union
from pandas._typing import FrameOrSeriesUnion
from pandas.core.dtypes.common import is_list_like

from modin.backends.base.query_compiler import BaseQueryCompiler
from .dataframe import DataFrame
from .series import Series

Expand Down Expand Up @@ -108,8 +111,18 @@ def concat(
new_idx_labels = {
k: v.index if axis == 0 else v.columns for k, v in zip(keys, objs)
}
tuples = [(k, o) for k, obj in new_idx_labels.items() for o in obj]
tuples = [
(k, *o) if isinstance(o, tuple) else (k, o)
dchigarev marked this conversation as resolved.
Show resolved Hide resolved
for k, obj in new_idx_labels.items()
for o in obj
]
new_idx = pandas.MultiIndex.from_tuples(tuples)
if names is not None:
new_idx.names = names
else:
old_name = _determine_name(objs, axis)
if old_name is not None:
new_idx.names = [None] + old_name
else:
new_idx = None
new_query_compiler = objs[0].concat(
Expand All @@ -132,3 +145,35 @@ def concat(
else:
result_df.columns = new_idx
return result_df


def _determine_name(objs: Iterable[BaseQueryCompiler], axis: Union[int, str]):
    """
    Determine names of index after concatenation along passed axis.

    Parameters
    ----------
    objs : iterable of QueryCompilers
        Objects to concatenate.
    axis : int or str
        The axis to concatenate along.

    Returns
    -------
    list or None
        The level names shared by every object along `axis` (one entry per
        level), or `None` if the objects disagree on the names or `objs`
        is empty.
    """
    axis = pandas.DataFrame()._get_axis_number(axis)

    # Compare plain Python lists rather than a NumPy array: objects whose axes
    # have different numbers of levels would form a ragged array, and string
    # names would come back as `numpy.str_` instead of `str`.
    all_names = [
        list(obj.columns.names if axis else obj.index.names) for obj in objs
    ]
    if not all_names:
        return None

    # Keep the old names only if every object carries exactly the same ones.
    reference = all_names[0]
    if all(current == reference for current in all_names[1:]):
        return reference
    return None
66 changes: 19 additions & 47 deletions modin/pandas/test/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,57 +17,11 @@

import modin.pandas as pd
from modin.pandas.utils import from_pandas
from .utils import df_equals
from .utils import df_equals, generate_dfs, generate_multiindex_dfs, generate_none_dfs

pd.DEFAULT_NPARTITIONS = 4


def generate_dfs():
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moving this is fine for this PR, but in the future I prefer new "REFACTOR" PRs for moved code so we can keep a more detailed commit history.

df = pandas.DataFrame(
{
"col1": [0, 1, 2, 3],
"col2": [4, 5, 6, 7],
"col3": [8, 9, 10, 11],
"col4": [12, 13, 14, 15],
"col5": [0, 0, 0, 0],
}
)

df2 = pandas.DataFrame(
{
"col1": [0, 1, 2, 3],
"col2": [4, 5, 6, 7],
"col3": [8, 9, 10, 11],
"col6": [12, 13, 14, 15],
"col7": [0, 0, 0, 0],
}
)
return df, df2


def generate_none_dfs():
    """
    Build two small pandas frames, the first of which contains missing values.

    Returns
    -------
    tuple of pandas.DataFrame
        A frame with columns ``col1``..``col5`` where ``col2`` has one `None`
        and ``col5`` is entirely `None`, and a frame with columns ``col1``,
        ``col2``, ``col3``, ``col6``, ``col7`` holding only integers.
    """
    data_with_gaps = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, None, 7],
        "col3": [8, 9, 10, 11],
        "col4": [12, 13, 14, 15],
        "col5": [None, None, None, None],
    }
    data_complete = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 10, 11],
        "col6": [12, 13, 14, 15],
        "col7": [0, 0, 0, 0],
    }
    return pandas.DataFrame(data_with_gaps), pandas.DataFrame(data_complete)


def test_df_concat():
df, df2 = generate_dfs()

Expand Down Expand Up @@ -207,3 +161,21 @@ def test_concat_with_empty_frame():
pd.concat([modin_empty_df, modin_row]),
pandas.concat([pandas_empty_df, pandas_row]),
)


@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("names", [False, True])
def test_concat_multiindex(axis, names):
    """
    Check that keyed `pd.concat` matches pandas for the generated frames.

    Parameters
    ----------
    axis : int
        Axis to concatenate along.
    names : bool
        When true, explicit level names are passed to `concat`; one name per
        existing level of the concatenation axis plus one more.
    """
    pd_df1, pd_df2 = generate_multiindex_dfs(axis=axis)
    md_df1 = from_pandas(pd_df1)
    md_df2 = from_pandas(pd_df2)

    if names:
        level_count = pd_df1.axes[axis].nlevels + 1
        level_names = [str(i) for i in np.arange(level_count)]
    else:
        level_names = None

    concat_kwargs = {
        "keys": ["first", "second"],
        "axis": axis,
        "names": level_names,
    }
    modin_result = pd.concat([md_df1, md_df2], **concat_kwargs)
    pandas_result = pandas.concat([pd_df1, pd_df2], **concat_kwargs)
    df_equals(modin_result, pandas_result)
59 changes: 59 additions & 0 deletions modin/pandas/test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,3 +592,62 @@ def execute_callable(fn, md_kwargs={}, pd_kwargs={}):

def create_test_dfs(*args, **kwargs):
return pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)


def generate_dfs():
    """
    Build two small integer pandas frames with partially overlapping columns.

    Returns
    -------
    tuple of pandas.DataFrame
        Both frames have four rows and share ``col1``, ``col2`` and ``col3``;
        the first adds ``col4`` and ``col5``, the second ``col6`` and ``col7``.
    """
    shared_columns = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 10, 11],
    }
    first = pandas.DataFrame(
        {**shared_columns, "col4": [12, 13, 14, 15], "col5": [0, 0, 0, 0]}
    )
    second = pandas.DataFrame(
        {**shared_columns, "col6": [12, 13, 14, 15], "col7": [0, 0, 0, 0]}
    )
    return first, second


def generate_multiindex_dfs(axis=1):
    """
    Build the two `generate_dfs` frames with a two-level MultiIndex on `axis`.

    Parameters
    ----------
    axis : int, default 1
        Position of the axis to convert: 0 for the index, 1 for the columns.

    Returns
    -------
    tuple of pandas.DataFrame
        The two frames, whose labels along `axis` are replaced by
        ``("a", label)`` tuples with level names ``name1`` and ``name2``.
    """

    def generate_multiindex(index):
        return pandas.MultiIndex.from_tuples(
            [("a", x) for x in index.values], names=["name1", "name2"]
        )

    # `DataFrame.axes` returns a freshly built list, so assigning into it
    # never reaches the frame; set the `index` / `columns` attribute instead.
    axis_attr = ("index", "columns")[axis]

    df1, df2 = generate_dfs()
    for df in (df1, df2):
        setattr(df, axis_attr, generate_multiindex(getattr(df, axis_attr)))
    return df1, df2


def generate_none_dfs():
    """
    Build two small pandas frames, the first of which contains missing values.

    Returns
    -------
    tuple of pandas.DataFrame
        A frame with columns ``col1``..``col5`` where ``col2`` has one `None`
        and ``col5`` is entirely `None`, and a frame with columns ``col1``,
        ``col2``, ``col3``, ``col6``, ``col7`` holding only integers.
    """
    data_with_gaps = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, None, 7],
        "col3": [8, 9, 10, 11],
        "col4": [12, 13, 14, 15],
        "col5": [None, None, None, None],
    }
    data_complete = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 10, 11],
        "col6": [12, 13, 14, 15],
        "col7": [0, 0, 0, 0],
    }
    return pandas.DataFrame(data_with_gaps), pandas.DataFrame(data_complete)