Skip to content

Commit

Permalink
BUG: fix issue with concat creating SparseFrame if not all series are…
Browse files Browse the repository at this point in the history
… sparse. (#18924)
  • Loading branch information
hexgnu authored and jreback committed Feb 1, 2018
1 parent 113f788 commit d3851ac
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 38 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,7 @@ Reshaping
- Bug in :func:`DataFrame.merge` in which merging using ``Index`` objects as vectors raised an Exception (:issue:`19038`)
- Bug in :func:`DataFrame.stack`, :func:`DataFrame.unstack`, :func:`Series.unstack` which were not returning subclasses (:issue:`15563`)
- Bug in timezone comparisons, manifesting as a conversion of the index to UTC in ``.concat()`` (:issue:`18523`)
- Bug in :func:`concat` when concatting sparse and dense series it returns only a ``SparseDataFrame``. Should be a ``DataFrame``. (:issue:`18914`, :issue:`18686`, and :issue:`16874`)
-


Expand Down
10 changes: 6 additions & 4 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
_TD_DTYPE)
from pandas.core.dtypes.generic import (
ABCDatetimeIndex, ABCTimedeltaIndex,
ABCPeriodIndex, ABCRangeIndex)
ABCPeriodIndex, ABCRangeIndex, ABCSparseDataFrame)


def get_dtype_kinds(l):
Expand Down Expand Up @@ -89,14 +89,16 @@ def _get_series_result_type(result, objs=None):
def _get_frame_result_type(result, objs):
"""
return appropriate class of DataFrame-like concat
if any block is SparseBlock, return SparseDataFrame
if all blocks are SparseBlock, return SparseDataFrame
otherwise, return 1st obj
"""
if any(b.is_sparse for b in result.blocks):

if result.blocks and all(b.is_sparse for b in result.blocks):
from pandas.core.sparse.api import SparseDataFrame
return SparseDataFrame
else:
return objs[0]
return next(obj for obj in objs if not isinstance(obj,
ABCSparseDataFrame))


def _concat_compat(to_concat, axis=0):
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/dtypes/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ def _check(cls, inst):

ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series", ))
ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe", ))
ABCSparseDataFrame = create_pandas_abc_type("ABCSparseDataFrame", "_subtyp",
("sparse_frame", ))
ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",))
ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp",
('sparse_series',
Expand Down
2 changes: 2 additions & 0 deletions pandas/tests/dtypes/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ class TestABCClasses(object):
df = pd.DataFrame({'names': ['a', 'b', 'c']}, index=multi_index)
sparse_series = pd.Series([1, 2, 3]).to_sparse()
sparse_array = pd.SparseArray(np.random.randn(10))
sparse_frame = pd.SparseDataFrame({'a': [1, -1, None]})

def test_abc_types(self):
assert isinstance(pd.Index(['a', 'b', 'c']), gt.ABCIndex)
Expand All @@ -37,6 +38,7 @@ def test_abc_types(self):
assert isinstance(self.df.to_panel(), gt.ABCPanel)
assert isinstance(self.sparse_series, gt.ABCSparseSeries)
assert isinstance(self.sparse_array, gt.ABCSparseArray)
assert isinstance(self.sparse_frame, gt.ABCSparseDataFrame)
assert isinstance(self.categorical, gt.ABCCategorical)
assert isinstance(pd.Period('2012', freq='A-DEC'), gt.ABCPeriod)

Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/reshape/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,6 +454,15 @@ def test_dataframe_dummies_preserve_categorical_dtype(self, dtype):

tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize('sparse', [True, False])
def test_get_dummies_dont_sparsify_all_columns(self, sparse):
# GH18914
df = DataFrame.from_items([('GDP', [1, 2]), ('Nation', ['AB', 'CD'])])
df = get_dummies(df, columns=['Nation'], sparse=sparse)
df2 = df.reindex(columns=['GDP'])

tm.assert_frame_equal(df[['GDP']], df2)


class TestCategoricalReshape(object):

Expand Down
85 changes: 51 additions & 34 deletions pandas/tests/sparse/test_combine_concat.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
# pylint: disable-msg=E1101,W0612
import pytest

import numpy as np
import pandas as pd
import pandas.util.testing as tm
import itertools


class TestSparseSeriesConcat(object):
Expand Down Expand Up @@ -317,37 +319,52 @@ def test_concat_axis1(self):
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

def test_concat_sparse_dense(self):
sparse = self.dense1.to_sparse()

res = pd.concat([sparse, self.dense2])
exp = pd.concat([self.dense1, self.dense2])
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

res = pd.concat([self.dense2, sparse])
exp = pd.concat([self.dense2, self.dense1])
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

sparse = self.dense1.to_sparse(fill_value=0)

res = pd.concat([sparse, self.dense2])
exp = pd.concat([self.dense1, self.dense2])
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

res = pd.concat([self.dense2, sparse])
exp = pd.concat([self.dense2, self.dense1])
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

res = pd.concat([self.dense3, sparse], axis=1)
exp = pd.concat([self.dense3, self.dense1], axis=1)
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res, exp)

res = pd.concat([sparse, self.dense3], axis=1)
exp = pd.concat([self.dense1, self.dense3], axis=1)
assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res, exp)
@pytest.mark.parametrize('fill_value,sparse_idx,dense_idx',
itertools.product([None, 0, 1, np.nan],
[0, 1],
[1, 0]))
def test_concat_sparse_dense_rows(self, fill_value, sparse_idx, dense_idx):
frames = [self.dense1, self.dense2]
sparse_frame = [frames[dense_idx],
frames[sparse_idx].to_sparse(fill_value=fill_value)]
dense_frame = [frames[dense_idx], frames[sparse_idx]]

# This will try both directions sparse + dense and dense + sparse
for _ in range(2):
res = pd.concat(sparse_frame)
exp = pd.concat(dense_frame)

assert isinstance(res, pd.SparseDataFrame)
tm.assert_frame_equal(res.to_dense(), exp)

sparse_frame = sparse_frame[::-1]
dense_frame = dense_frame[::-1]

@pytest.mark.parametrize('fill_value,sparse_idx,dense_idx',
itertools.product([None, 0, 1, np.nan],
[0, 1],
[1, 0]))
def test_concat_sparse_dense_cols(self, fill_value, sparse_idx, dense_idx):
# See GH16874, GH18914 and #18686 for why this should be a DataFrame

frames = [self.dense1, self.dense3]

sparse_frame = [frames[dense_idx],
frames[sparse_idx].to_sparse(fill_value=fill_value)]
dense_frame = [frames[dense_idx], frames[sparse_idx]]

# This will try both directions sparse + dense and dense + sparse
for _ in range(2):
res = pd.concat(sparse_frame, axis=1)
exp = pd.concat(dense_frame, axis=1)

for column in frames[dense_idx].columns:
if dense_idx == sparse_idx:
tm.assert_frame_equal(res[column], exp[column])
else:
tm.assert_series_equal(res[column], exp[column])

tm.assert_frame_equal(res, exp)

sparse_frame = sparse_frame[::-1]
dense_frame = dense_frame[::-1]

0 comments on commit d3851ac

Please sign in to comment.