Skip to content

Commit

Permalink
BUG: coerce pd.wide_to_long suffixes to numeric
Browse files Browse the repository at this point in the history
  • Loading branch information
tdpetrou committed Sep 25, 2017
1 parent 45a795e commit ad359f6
Show file tree
Hide file tree
Showing 3 changed files with 151 additions and 59 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,8 @@ Other API Changes
- :func:`to_datetime` when passed a tz-aware ``origin=`` kwarg will now raise a more informative ``ValueError`` rather than a ``TypeError`` (:issue:`16842`)
- Renamed non-functional ``index`` to ``index_col`` in :func:`read_stata` to improve API consistency (:issue:`16342`)
- Bug in :func:`DataFrame.drop` caused boolean labels ``False`` and ``True`` to be treated as labels 0 and 1 respectively when dropping indices from a numeric index. This will now raise a ValueError (:issue:`16877`)
- :func:`wide_to_long` previously left suffixes as ``object`` dtype. They are now cast to a numeric dtype if possible (:issue:`17627`)


.. _whatsnew_0210.deprecations:

Expand Down
20 changes: 13 additions & 7 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.missing import notna
import pandas.core.dtypes.concat as _concat
from pandas.core.tools.numeric import to_numeric

from pandas.core.series import Series
from pandas.core.frame import DataFrame
Expand Down Expand Up @@ -895,6 +896,10 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
.. versionadded:: 0.20.0
When all suffixes are numeric, they are cast to int64/float64.
.. versionadded:: 0.21.0
Returns
-------
DataFrame
Expand Down Expand Up @@ -1033,22 +1038,24 @@ def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'):
-----
All extra variables are left untouched. This simply uses
`pandas.melt` under the hood, but is hard-coded to "do the right thing"
in a typicaly case.
in a typical case.
"""
def get_var_names(df, stub, sep, suffix):
regex = "^{stub}{sep}{suffix}".format(
stub=re.escape(stub), sep=re.escape(sep), suffix=suffix)
return df.filter(regex=regex).columns.tolist()
regex = '^{0}{1}{2}$'.format(re.escape(stub), re.escape(sep), suffix)
return [col for col in df.columns if re.match(regex, col)]

def melt_stub(df, stub, i, j, value_vars, sep):
newdf = melt(df, id_vars=i, value_vars=value_vars,
value_name=stub.rstrip(sep), var_name=j)
newdf[j] = Categorical(newdf[j])
newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "")

# GH17627 Cast numeric suffixes to int/float
newdf[j] = to_numeric(newdf[j], errors='ignore')

return newdf.set_index(i + [j])

if any(map(lambda s: s in df.columns.tolist(), stubnames)):
if any([col in stubnames for col in df.columns]):
raise ValueError("stubname can't be identical to a column name")

if not is_list_like(stubnames):
Expand All @@ -1064,8 +1071,7 @@ def melt_stub(df, stub, i, j, value_vars, sep):
if df[i].duplicated().any():
raise ValueError("the id variables need to uniquely identify each row")

value_vars = list(map(lambda stub:
get_var_names(df, stub, sep, suffix), stubnames))
value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames]

value_vars_flattened = [e for sublist in value_vars for e in sublist]
id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))
Expand Down
188 changes: 136 additions & 52 deletions pandas/tests/reshape/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -764,12 +764,12 @@ def test_simple(self):
exp_data = {"X": x.tolist() + x.tolist(),
"A": ['a', 'b', 'c', 'd', 'e', 'f'],
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
"year": ['1970', '1970', '1970', '1980', '1980', '1980'],
"year": [1970, 1970, 1970, 1980, 1980, 1980],
"id": [0, 1, 2, 0, 1, 2]}
exp_frame = DataFrame(exp_data)
exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]]
long_frame = wide_to_long(df, ["A", "B"], i="id", j="year")
tm.assert_frame_equal(long_frame, exp_frame)
expected = DataFrame(exp_data)
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
result = wide_to_long(df, ["A", "B"], i="id", j="year")
tm.assert_frame_equal(result, expected)

def test_stubs(self):
# GH9204
Expand Down Expand Up @@ -804,12 +804,12 @@ def test_separating_character(self):
exp_data = {"X": x.tolist() + x.tolist(),
"A": ['a', 'b', 'c', 'd', 'e', 'f'],
"B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
"year": ['1970', '1970', '1970', '1980', '1980', '1980'],
"year": [1970, 1970, 1970, 1980, 1980, 1980],
"id": [0, 1, 2, 0, 1, 2]}
exp_frame = DataFrame(exp_data)
exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]]
long_frame = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
tm.assert_frame_equal(long_frame, exp_frame)
expected = DataFrame(exp_data)
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
tm.assert_frame_equal(result, expected)

def test_escapable_characters(self):
np.random.seed(123)
Expand All @@ -832,14 +832,14 @@ def test_escapable_characters(self):
exp_data = {"X": x.tolist() + x.tolist(),
"A(quarterly)": ['a', 'b', 'c', 'd', 'e', 'f'],
"B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
"year": ['1970', '1970', '1970', '1980', '1980', '1980'],
"year": [1970, 1970, 1970, 1980, 1980, 1980],
"id": [0, 1, 2, 0, 1, 2]}
exp_frame = DataFrame(exp_data)
exp_frame = exp_frame.set_index(
expected = DataFrame(exp_data)
expected = expected.set_index(
['id', 'year'])[["X", "A(quarterly)", "B(quarterly)"]]
long_frame = wide_to_long(df, ["A(quarterly)", "B(quarterly)"],
i="id", j="year")
tm.assert_frame_equal(long_frame, exp_frame)
result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"],
i="id", j="year")
tm.assert_frame_equal(result, expected)

def test_unbalanced(self):
# test that we can have a varying amount of time variables
Expand All @@ -852,11 +852,11 @@ def test_unbalanced(self):
'A': [1.0, 3.0, 2.0, 4.0],
'B': [5.0, np.nan, 6.0, np.nan],
'id': [0, 0, 1, 1],
'year': ['2010', '2011', '2010', '2011']}
exp_frame = pd.DataFrame(exp_data)
exp_frame = exp_frame.set_index(['id', 'year'])[["X", "A", "B"]]
long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year')
tm.assert_frame_equal(long_frame, exp_frame)
'year': [2010, 2011, 2010, 2011]}
expected = pd.DataFrame(exp_data)
expected = expected.set_index(['id', 'year'])[["X", "A", "B"]]
result = wide_to_long(df, ['A', 'B'], i='id', j='year')
tm.assert_frame_equal(result, expected)

def test_character_overlap(self):
# Test we handle overlapping characters in both id_vars and value_vars
Expand All @@ -871,19 +871,19 @@ def test_character_overlap(self):
'BBBZ': [91, 92, 93]
})
df['id'] = df.index
exp_frame = pd.DataFrame({
expected = pd.DataFrame({
'BBBX': [91, 92, 93, 91, 92, 93],
'BBBZ': [91, 92, 93, 91, 92, 93],
'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
'BB': [1, 2, 3, 4, 5, 6],
'id': [0, 1, 2, 0, 1, 2],
'year': ['11', '11', '11', '12', '12', '12']})
exp_frame = exp_frame.set_index(['id', 'year'])[
'year': [11, 11, 11, 12, 12, 12]})
expected = expected.set_index(['id', 'year'])[
['BBBX', 'BBBZ', 'A', 'B', 'BB']]
long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
tm.assert_frame_equal(long_frame.sort_index(axis=1),
exp_frame.sort_index(axis=1))
result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
tm.assert_frame_equal(result.sort_index(axis=1),
expected.sort_index(axis=1))

def test_invalid_separator(self):
# if an invalid separator is supplied, an empty data frame is returned
Expand All @@ -901,13 +901,13 @@ def test_invalid_separator(self):
'year': [],
'A': [],
'B': []}
exp_frame = pd.DataFrame(exp_data)
exp_frame = exp_frame.set_index(['id', 'year'])[[
expected = pd.DataFrame(exp_data).astype({'year': 'int'})
expected = expected.set_index(['id', 'year'])[[
'X', 'A2010', 'A2011', 'B2010', 'A', 'B']]
exp_frame.index.set_levels([[0, 1], []], inplace=True)
long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep)
tm.assert_frame_equal(long_frame.sort_index(axis=1),
exp_frame.sort_index(axis=1))
expected.index.set_levels([0, 1], level=0, inplace=True)
result = wide_to_long(df, ['A', 'B'], i='id', j='year', sep=sep)
tm.assert_frame_equal(result.sort_index(axis=1),
expected.sort_index(axis=1))

def test_num_string_disambiguation(self):
# Test that we can disambiguate number value_vars from
Expand All @@ -923,19 +923,19 @@ def test_num_string_disambiguation(self):
'Arating_old': [91, 92, 93]
})
df['id'] = df.index
exp_frame = pd.DataFrame({
expected = pd.DataFrame({
'Arating': [91, 92, 93, 91, 92, 93],
'Arating_old': [91, 92, 93, 91, 92, 93],
'A': ['a11', 'a22', 'a33', 'a21', 'a22', 'a23'],
'B': ['b11', 'b12', 'b13', 'b21', 'b22', 'b23'],
'BB': [1, 2, 3, 4, 5, 6],
'id': [0, 1, 2, 0, 1, 2],
'year': ['11', '11', '11', '12', '12', '12']})
exp_frame = exp_frame.set_index(['id', 'year'])[
'year': [11, 11, 11, 12, 12, 12]})
expected = expected.set_index(['id', 'year'])[
['Arating', 'Arating_old', 'A', 'B', 'BB']]
long_frame = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
tm.assert_frame_equal(long_frame.sort_index(axis=1),
exp_frame.sort_index(axis=1))
result = wide_to_long(df, ['A', 'B', 'BB'], i='id', j='year')
tm.assert_frame_equal(result.sort_index(axis=1),
expected.sort_index(axis=1))

def test_invalid_suffixtype(self):
# If all stub names end with a string, but a numeric suffix is
Expand All @@ -953,13 +953,13 @@ def test_invalid_suffixtype(self):
'year': [],
'A': [],
'B': []}
exp_frame = pd.DataFrame(exp_data)
exp_frame = exp_frame.set_index(['id', 'year'])[[
'X', 'Aone', 'Atwo', 'Bone', 'A', 'B']]
exp_frame.index.set_levels([[0, 1], []], inplace=True)
long_frame = wide_to_long(df, ['A', 'B'], i='id', j='year')
tm.assert_frame_equal(long_frame.sort_index(axis=1),
exp_frame.sort_index(axis=1))
expected = pd.DataFrame(exp_data).astype({'year': 'int'})

expected = expected.set_index(['id', 'year'])
expected.index.set_levels([0, 1], level=0, inplace=True)
result = wide_to_long(df, ['A', 'B'], i='id', j='year')
tm.assert_frame_equal(result.sort_index(axis=1),
expected.sort_index(axis=1))

def test_multiple_id_columns(self):
# Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
Expand All @@ -969,17 +969,17 @@ def test_multiple_id_columns(self):
'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
})
exp_frame = pd.DataFrame({
expected = pd.DataFrame({
'ht': [2.8, 3.4, 2.9, 3.8, 2.2, 2.9, 2.0, 3.2, 1.8,
2.8, 1.9, 2.4, 2.2, 3.3, 2.3, 3.4, 2.1, 2.9],
'famid': [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
'birth': [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
'age': ['1', '2', '1', '2', '1', '2', '1', '2', '1',
'2', '1', '2', '1', '2', '1', '2', '1', '2']
'age': [1, 2, 1, 2, 1, 2, 1, 2, 1,
2, 1, 2, 1, 2, 1, 2, 1, 2]
})
exp_frame = exp_frame.set_index(['famid', 'birth', 'age'])[['ht']]
long_frame = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age')
tm.assert_frame_equal(long_frame, exp_frame)
expected = expected.set_index(['famid', 'birth', 'age'])[['ht']]
result = wide_to_long(df, 'ht', i=['famid', 'birth'], j='age')
tm.assert_frame_equal(result, expected)

def test_non_unique_idvars(self):
# GH16382
Expand All @@ -991,3 +991,87 @@ def test_non_unique_idvars(self):
})
with pytest.raises(ValueError):
wide_to_long(df, ['A_A', 'B_B'], i='x', j='colname')

def test_cast_j_int(self):
df = pd.DataFrame({
'actor_1': ['CCH Pounder', 'Johnny Depp', 'Christoph Waltz'],
'actor_2': ['Joel David Moore', 'Orlando Bloom', 'Rory Kinnear'],
'actor_fb_likes_1': [1000.0, 40000.0, 11000.0],
'actor_fb_likes_2': [936.0, 5000.0, 393.0],
'title': ['Avatar', "Pirates of the Caribbean", 'Spectre']})

expected = pd.DataFrame({
'actor': ['CCH Pounder',
'Johnny Depp',
'Christoph Waltz',
'Joel David Moore',
'Orlando Bloom',
'Rory Kinnear'],
'actor_fb_likes': [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0],
'num': [1, 1, 1, 2, 2, 2],
'title': ['Avatar',
'Pirates of the Caribbean',
'Spectre',
'Avatar',
'Pirates of the Caribbean',
'Spectre']}).set_index(['title', 'num'])
result = wide_to_long(df, ['actor', 'actor_fb_likes'],
i='title', j='num', sep='_')

tm.assert_frame_equal(result, expected)

def test_identical_stubnames(self):
df = pd.DataFrame({'A2010': [1.0, 2.0],
'A2011': [3.0, 4.0],
'B2010': [5.0, 6.0],
'A': ['X1', 'X2']})
with pytest.raises(ValueError):
wide_to_long(df, ['A', 'B'], i='A', j='colname')

def test_nonnumeric_suffix(self):
df = pd.DataFrame({'treatment_placebo': [1.0, 2.0],
'treatment_test': [3.0, 4.0],
'result_placebo': [5.0, 6.0],
'A': ['X1', 'X2']})
expected = pd.DataFrame({
'A': ['X1', 'X1', 'X2', 'X2'],
'colname': ['placebo', 'test', 'placebo', 'test'],
'result': [5.0, np.nan, 6.0, np.nan],
'treatment': [1.0, 3.0, 2.0, 4.0]})
expected = expected.set_index(['A', 'colname'])
result = wide_to_long(df, ['result', 'treatment'],
i='A', j='colname', suffix='[a-z]+', sep='_')
tm.assert_frame_equal(result, expected)

def test_mixed_type_suffix(self):
df = pd.DataFrame({
'treatment_1': [1.0, 2.0],
'treatment_foo': [3.0, 4.0],
'result_foo': [5.0, 6.0],
'result_1': [0, 9],
'A': ['X1', 'X2']})
expected = pd.DataFrame({
'A': ['X1', 'X2', 'X1', 'X2'],
'colname': ['1', '1', 'foo', 'foo'],
'result': [0.0, 9.0, 5.0, 6.0],
'treatment': [1.0, 2.0, 3.0, 4.0]}).set_index(['A', 'colname'])
result = wide_to_long(df, ['result', 'treatment'],
i='A', j='colname', suffix='.+', sep='_')
tm.assert_frame_equal(result, expected)

def test_float_suffix(self):
df = pd.DataFrame({
'treatment_1.1': [1.0, 2.0],
'treatment_2.1': [3.0, 4.0],
'result_1.2': [5.0, 6.0],
'result_1': [0, 9],
'A': ['X1', 'X2']})
expected = pd.DataFrame({
'A': ['X1', 'X1', 'X1', 'X1', 'X2', 'X2', 'X2', 'X2'],
'colname': [1, 1.1, 1.2, 2.1, 1, 1.1, 1.2, 2.1],
'result': [0.0, np.nan, 5.0, np.nan, 9.0, np.nan, 6.0, np.nan],
'treatment': [np.nan, 1.0, np.nan, 3.0, np.nan, 2.0, np.nan, 4.0]})
expected = expected.set_index(['A', 'colname'])
result = wide_to_long(df, ['result', 'treatment'],
i='A', j='colname', suffix='[0-9.]+', sep='_')
tm.assert_frame_equal(result, expected)

0 comments on commit ad359f6

Please sign in to comment.