Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TST: add method/dtype coverage to str-accessor; precursor to #23167 #23582

Merged
merged 12 commits into from
Nov 28, 2018
Merged
82 changes: 81 additions & 1 deletion pandas/conftest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from datetime import date, time, timedelta
from decimal import Decimal
import importlib
import os

Expand All @@ -8,7 +10,7 @@
import pytest
from pytz import FixedOffset, utc

from pandas.compat import PY3
from pandas.compat import PY3, u
import pandas.util._test_decorators as td

import pandas as pd
Expand Down Expand Up @@ -514,6 +516,84 @@ def any_numpy_dtype(request):
return request.param


# Table of (inferred dtype label, sample values) pairs used to parametrize
# the fixture below; categoricals are handled separately
_any_skipna_inferred_dtype = [
    ('string', ['a', np.nan, 'c']),
    ('string' if PY3 else 'unicode', [u('a'), np.nan, u('c')]),
    ('bytes' if PY3 else 'string', [b'a', np.nan, b'c']),
    ('empty', [np.nan, np.nan, np.nan]),
    ('empty', []),
    ('mixed-integer', ['a', np.nan, 2]),
    ('mixed', ['a', np.nan, 2.0]),
    ('floating', [1.0, np.nan, 2.0]),
    ('integer', [1, np.nan, 2]),
    ('mixed-integer-float', [1, np.nan, 2.0]),
    ('decimal', [Decimal(1), np.nan, Decimal(2)]),
    ('boolean', [True, np.nan, False]),
    ('datetime64', [
        np.datetime64('2013-01-01'),
        np.nan,
        np.datetime64('2018-01-01'),
    ]),
    ('datetime', [
        pd.Timestamp('20130101'),
        np.nan,
        pd.Timestamp('20180101'),
    ]),
    ('date', [date(2013, 1, 1), np.nan, date(2018, 1, 1)]),
    # The following two dtypes are commented out due to GH 23554
    # ('complex', [1 + 1j, np.nan, 2 + 2j]),
    # ('timedelta64', [np.timedelta64(1, 'D'),
    #                  np.nan, np.timedelta64(2, 'D')]),
    ('timedelta', [timedelta(1), np.nan, timedelta(2)]),
    ('time', [time(1), np.nan, time(2)]),
    ('period', [pd.Period(2013), pd.NaT, pd.Period(2018)]),
    ('interval', [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)]),
]
# use inferred type as fixture-id (first column of the table above)
ids, _ = zip(*_any_skipna_inferred_dtype)


@pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids)
def any_skipna_inferred_dtype(request):
    """
    Parametrized fixture over the inferred dtypes of _libs.lib.infer_dtype

    Each parametrization is one entry of `_any_skipna_inferred_dtype`.
    The inferred types covered by that table are:
    * 'string'
    * 'unicode' (if PY2)
    * 'empty'
    * 'bytes' (if PY3)
    * 'mixed'
    * 'mixed-integer'
    * 'mixed-integer-float'
    * 'floating'
    * 'integer'
    * 'decimal'
    * 'boolean'
    * 'datetime64'
    * 'datetime'
    * 'date'
    * 'timedelta'
    * 'time'
    * 'period'
    * 'interval'

    Returns
    -------
    inferred_dtype : str
        The string for the inferred dtype from _libs.lib.infer_dtype
    values : np.ndarray
        An array of object dtype that will be inferred to have
        `inferred_dtype`

    Examples
    --------
    >>> import pandas._libs.lib as lib
    >>>
    >>> def test_something(any_skipna_inferred_dtype):
    ...     inferred_dtype, values = any_skipna_inferred_dtype
    ...     # will pass
    ...     assert lib.infer_dtype(values, skipna=True) == inferred_dtype
    """
    expected_label, raw_values = request.param
    # build with object dtype so numpy does not cast the sample values
    boxed_values = np.array(raw_values, dtype=object)

    # correctness of inference tested in tests/dtypes/test_inference.py
    return expected_label, boxed_values


@pytest.fixture
def mock():
"""
Expand Down
7 changes: 7 additions & 0 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,13 @@ class TestTypeInference(object):
class Dummy():
pass

def test_inferred_dtype_fixture(self, any_skipna_inferred_dtype):
# see pandas/conftest.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's nice that you added this, but you didn't remove any code. If you are not going to do that, then there is not much point in putting the fixture in conftest.py in the first place.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback
This is coming out of your review above:

I'm asking if you want me to move this particular fixture to pandas/conftest.py and then test it within the dtype tests (because this is effectively a dtype thing).

Yes

And I don't get how adding this fixture is tied to code removal? I'm testing the .str-accessor on all the inferred dtypes to make sure it raises correctly, that's what I mainly need this fixture for.

I'm testing the validity of the fixture in test_inference.py for consistency, because it belongs there thematically (but I could otherwise test that directly in the fixture constructor).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback
This fixture would be a perfect candidate for splitting off a PR, but then, I'm afraid you're gonna say it doesn't do anything interesting (yet).

Do you want me to:

  • split it up, and leave the fixture unused until this PR is merged,
  • or do you want me to keep things logically together (i.e. in this PR)?

inferred_dtype, values = any_skipna_inferred_dtype

# make sure the inferred dtype of the fixture is as requested
assert inferred_dtype == lib.infer_dtype(values, skipna=True)

def test_length_zero(self):
result = lib.infer_dtype(np.array([], dtype='i4'))
assert result == 'integer'
Expand Down
153 changes: 147 additions & 6 deletions pandas/tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import numpy as np
from numpy.random import randint

from pandas.compat import range, u
from pandas.compat import range, u, PY3
import pandas.compat as compat
from pandas import Index, Series, DataFrame, isna, MultiIndex, notna, concat

Expand Down Expand Up @@ -118,6 +118,55 @@ def any_string_method(request):
return request.param


# Table of (inferred dtype label, sample values) pairs;
# subset of the full set from pandas/conftest.py
_any_allowed_skipna_inferred_dtype = [
    ('string', ['a', np.nan, 'c']),
    ('string' if PY3 else 'unicode', [u('a'), np.nan, u('c')]),
    ('bytes' if PY3 else 'string', [b'a', np.nan, b'c']),
    ('empty', [np.nan, np.nan, np.nan]),
    ('empty', []),
    ('mixed-integer', ['a', np.nan, 2]),
]
# use inferred type as id (first column of the table above)
ids, _ = zip(*_any_allowed_skipna_inferred_dtype)


@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids)
def any_allowed_skipna_inferred_dtype(request):
    """
    Fixture for (inferred) dtypes allowed in StringMethods.__init__

    Each parametrization is one entry of
    `_any_allowed_skipna_inferred_dtype`. The (inferred) types covered by
    that table are:
    * 'string'
    * 'unicode' (if PY2)
    * 'empty'
    * 'bytes' (if PY3)
    * 'mixed-integer'

    Note that 'mixed' has no entry in the parameter table and is therefore
    not exercised through this fixture.

    Returns
    -------
    inferred_dtype : str
        The string for the inferred dtype from _libs.lib.infer_dtype
    values : np.ndarray
        An array of object dtype that will be inferred to have
        `inferred_dtype`

    Examples
    --------
    >>> import pandas._libs.lib as lib
    >>>
    >>> def test_something(any_allowed_skipna_inferred_dtype):
    ...     inferred_dtype, values = any_allowed_skipna_inferred_dtype
    ...     # will pass
    ...     assert lib.infer_dtype(values, skipna=True) == inferred_dtype
    """
    inferred_dtype, values = request.param
    values = np.array(values, dtype=object)  # object dtype to avoid casting

    # correctness of inference tested in tests/dtypes/test_inference.py
    return inferred_dtype, values


class TestStringMethods(object):

def test_api(self):
Expand All @@ -126,11 +175,103 @@ def test_api(self):
assert Series.str is strings.StringMethods
assert isinstance(Series(['']).str, strings.StringMethods)

# GH 9184
invalid = Series([1])
with pytest.raises(AttributeError, match="only use .str accessor"):
invalid.str
assert not hasattr(invalid, 'str')
@pytest.mark.parametrize('dtype', [object, 'category'])
@pytest.mark.parametrize('box', [Series, Index])
def test_api_per_dtype(self, box, dtype, any_skipna_inferred_dtype):
# one instance of parametrized fixture
inferred_dtype, values = any_skipna_inferred_dtype

t = box(values, dtype=dtype) # explicit dtype to avoid casting
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can actually split this into two tests by putting the xfails in a function. Then each test becomes single-purpose and you don't need the if statement near the bottom.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand what you're asking for here, sorry.

The test is already very single-purpose (except the xfails, which will be gone with #23167 and follow-up PRs), and the final if-switch makes it transparent which types are actually passing the constructor, with all other types raising. this would only get harder to understand if it's split into two, no?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, you have an if/else branch. It makes reasoning about this impossible, as it's very fixture/data dependent.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The if branch is transparently for the cases where the .str-accessor raises or not. I do not understand how you want me to structure this test (resp. this function you mentioned).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The if/else makes the test very hard to understand. Please break it in two.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This particular test can be broken into _passes and _raises, but that last if condition is really not that hard. Don't get that objection, tbh.

if inferred_dtype in types_passing_constructor:
    # pass
else:
    # raise

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Respectfully, I find it an unrealistic criterion to not be able to use one simple if-condition in a test (aside from xfails, which will be gone soon after). The whole point is that it's got an extensively parametrized fixture (any_skipna_inferred_dtype), and I have to make a single distinction based on the content of that fixture.

Even if I were to split this test into _passes and _raises, I'd have to use the same kind of if-condition, unless I were to needlessly break up the parametrized fixtures into smaller subsets.


# TODO: get rid of these xfails
if dtype == 'category' and inferred_dtype in ['period', 'interval']:
pytest.xfail(reason='Conversion to numpy array fails because '
'the ._values-attribute is not a numpy array for '
'PeriodArray/IntervalArray; see GH 23553')
if box == Index and inferred_dtype in ['empty', 'bytes']:
pytest.xfail(reason='Raising too restrictively; '
'solved by GH 23167')
if (box == Index and dtype == object
and inferred_dtype in ['boolean', 'date', 'time']):
pytest.xfail(reason='Inferring incorrectly because of NaNs; '
'solved by GH 23167')
if (box == Series
and (dtype == object and inferred_dtype not in [
'string', 'unicode', 'empty',
'bytes', 'mixed', 'mixed-integer'])
or (dtype == 'category'
and inferred_dtype in ['decimal', 'boolean', 'time'])):
pytest.xfail(reason='Not raising correctly; solved by GH 23167')

types_passing_constructor = ['string', 'unicode', 'empty',
'bytes', 'mixed', 'mixed-integer']
if inferred_dtype in types_passing_constructor:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

see my comment above

# GH 6106
assert isinstance(t.str, strings.StringMethods)
else:
# GH 9184, GH 23011, GH 23163
with pytest.raises(AttributeError, match='Can only use .str '
'accessor with string values.*'):
t.str
assert not hasattr(t, 'str')

@pytest.mark.parametrize('dtype', [object, 'category'])
@pytest.mark.parametrize('box', [Series, Index])
def test_api_per_method(self, box, dtype,
any_allowed_skipna_inferred_dtype,
any_string_method):
# this test does not check correctness of the different methods,
# just that the methods work on the specified (inferred) dtypes,
# and raise on all others

# one instance of each parametrized fixture
inferred_dtype, values = any_allowed_skipna_inferred_dtype
method_name, args, kwargs = any_string_method

# TODO: get rid of these xfails
if (method_name not in ['encode', 'decode', 'len']
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same comment as above

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These xfails will be gone, and then the test reads very clearly, IMO

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this test cannot be broken up as easily, because the allowed types depend on the method being checked!

and inferred_dtype == 'bytes'):
pytest.xfail(reason='Not raising for "bytes", see GH 23011;'
'Also: malformed method names, see GH 23551; '
'solved by GH 23167')
if (method_name == 'cat'
and inferred_dtype in ['mixed', 'mixed-integer']):
pytest.xfail(reason='Bad error message; should raise better; '
'solved by GH 23167')
if box == Index and inferred_dtype in ['empty', 'bytes']:
pytest.xfail(reason='Raising too restrictively; '
'solved by GH 23167')
if (box == Index and dtype == object
and inferred_dtype in ['boolean', 'date', 'time']):
pytest.xfail(reason='Inferring incorrectly because of NaNs; '
'solved by GH 23167')
if box == Index and dtype == 'category':
pytest.xfail(reason='Broken methods on CategoricalIndex; '
'see GH 23556')

t = box(values, dtype=dtype) # explicit dtype to avoid casting
method = getattr(t.str, method_name)

bytes_allowed = method_name in ['encode', 'decode', 'len']
# as of v0.23.4, all methods except 'cat' are very lenient with the
# allowed data types, just returning NaN for entries that error.
# This could be changed with an 'errors'-kwarg to the `str`-accessor,
# see discussion in GH 13877
mixed_allowed = method_name not in ['cat']

allowed_types = (['string', 'unicode', 'empty']
+ ['bytes'] * bytes_allowed
+ ['mixed', 'mixed-integer'] * mixed_allowed)

if inferred_dtype in allowed_types:
method(*args, **kwargs) # works!
else:
# GH 23011, GH 23163
msg = ('Cannot use .str.{name} with values of inferred dtype '
'{inferred_dtype!r}.'.format(name=method_name,
inferred_dtype=inferred_dtype))
with pytest.raises(TypeError, match=msg):
method(*args, **kwargs)

def test_api_for_categorical(self, any_string_method):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are the remnants of test_str_accessor_api_for_categorical after parametrization.

# https://github.com/pandas-dev/pandas/issues/10661
Expand Down