-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Hypothesis tests for roundtrip to & from pandas #3285
Merged
dcherian
merged 17 commits into
pydata:master
from
takluyver:hypothesis-pandas-roundtrip
Oct 30, 2019
Merged
Changes from all commits
Commits
Show all changes
17 commits
Select commit
Hold shift + click to select a range
1e8ac35
Move hypothesis deadline configuration to conftest.py
takluyver 9f14426
Add simple roundtrip test for xarray-pandas-xarray
takluyver 18790cc
Test roundtrip pd.Series->DataArray->Series
takluyver 2449ac2
Test roundtrip DataFrame->DataArray->DataFrame
takluyver 54900f0
Test roundtrip Dataset->Dataframe->Dataset
takluyver 02fd311
Relax to allow 0 entries in each dataset var
takluyver e8fb3da
Relax to allow empty string names
takluyver 67c7034
Add print_blob to config
takluyver 4ba4f7b
Extra half-roundtrip from pandas series to xarray
takluyver fb222c5
Extra half roundtrip from pandas dataframe to Xarray
takluyver 7b39a6f
Redesign strategy for generating datasets with 1D variables
takluyver a328739
Make pep8 happy
takluyver 3f462be
Merge branch 'master' into hypothesis-pandas-roundtrip
takluyver ecd016a
Autoformat test file
takluyver 351b40b
Skip hypothesis tests if hypothesis not available
takluyver 044c67d
Don't require hypothesis for conftest file
takluyver 5b0ae82
Mark failing test as xfail
takluyver File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
# Hypothesis is an optional dependency: configure it only when importable,
# so that this conftest can still be loaded without it.
try:
    from hypothesis import settings
except ImportError:
    pass
else:
    # Array strategies span a much larger search space than usual, so the
    # "ci" profile lifts the per-example deadline and prints the blob needed
    # to reproduce a failure.
    settings.register_profile("ci", deadline=None, print_blob=True)
    settings.load_profile("ci")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
""" | ||
Property-based tests for roundtripping between xarray and pandas objects. | ||
""" | ||
import pytest | ||
|
||
pytest.importorskip("hypothesis") | ||
|
||
from functools import partial | ||
import hypothesis.extra.numpy as npst | ||
import hypothesis.extra.pandas as pdst | ||
import hypothesis.strategies as st | ||
from hypothesis import given | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import xarray as xr | ||
|
||
# Any unsigned-integer, signed-integer or floating-point NumPy dtype.
numeric_dtypes = st.one_of(
    npst.unsigned_integer_dtypes(),
    npst.integer_dtypes(),
    npst.floating_dtypes(),
)

# pandas Series whose dtype is drawn from ``numeric_dtypes``.
numeric_series = numeric_dtypes.flatmap(lambda dtype: pdst.series(dtype=dtype))

# Numeric arrays limited to two dimensions, because only 1D/2D data can be
# converted to pandas.
an_array = npst.arrays(
    dtype=numeric_dtypes,
    shape=npst.array_shapes(max_dims=2),
)
|
||
|
||
@st.composite
def datasets_1d_vars(draw):
    """Strategy producing datasets whose variables are all one-dimensional.

    Every variable lies along a single shared ``"rows"`` dimension, which
    makes the dataset suitable for converting to a pandas DataFrame.
    """
    # Draw the shared index first; its length fixes the size of every variable.
    row_index = draw(pdst.indexes(dtype="u8", min_size=0, max_size=100))
    n_rows = len(row_index)

    # A numeric array of that length, wrapped as a Variable along "rows".
    column_arrays = npst.arrays(dtype=numeric_dtypes, shape=n_rows)
    row_variables = column_arrays.map(partial(xr.Variable, ("rows",)))

    # One to three variables, keyed by arbitrary text names.
    data_vars = draw(
        st.dictionaries(
            keys=st.text(),
            values=row_variables,
            min_size=1,
            max_size=3,
        )
    )
    return xr.Dataset(data_vars, coords={"rows": row_index})
|
||
|
||
@given(st.data(), an_array)
def test_roundtrip_dataarray(data, arr):
    """Check DataArray -> pandas -> DataArray yields an identical object."""
    # Draw exactly one unique dimension name per axis of the array.
    n_dims = arr.ndim
    name_lists = st.lists(st.text(), min_size=n_dims, max_size=n_dims, unique=True)
    names = data.draw(name_lists.map(tuple))

    # Give each dimension a 0..n-1 integer coordinate.
    coords = dict(zip(names, map(np.arange, arr.shape)))

    original = xr.DataArray(arr, dims=names, coords=coords)
    as_pandas = original.to_pandas()
    xr.testing.assert_identical(original, xr.DataArray(as_pandas))
|
||
|
||
@given(datasets_1d_vars())
def test_roundtrip_dataset(dataset):
    """Check Dataset -> DataFrame -> Dataset yields an identical dataset."""
    frame = dataset.to_dataframe()
    assert isinstance(frame, pd.DataFrame)
    xr.testing.assert_identical(dataset, xr.Dataset(frame))
|
||
|
||
@given(numeric_series, st.text())
def test_roundtrip_pandas_series(ser, ix_name):
    """Check Series -> DataArray -> Series, plus the extra hop back to xarray."""
    # The index must be named; otherwise xarray calls the dimension 'dim_0'.
    ser.index.name = ix_name

    as_xarray = xr.DataArray(ser)
    back_to_pandas = as_xarray.to_pandas()

    pd.testing.assert_series_equal(ser, back_to_pandas)
    xr.testing.assert_identical(as_xarray, back_to_pandas.to_xarray())
|
||
|
||
# DataFrames whose columns "a", "b" and "c" all share one numeric dtype, as
# needed for a roundtrip through a single DataArray.
numeric_homogeneous_dataframe = numeric_dtypes.flatmap(
    lambda dtype: pdst.data_frames(
        columns=pdst.columns(["a", "b", "c"], dtype=dtype)
    )
)
|
||
|
||
@pytest.mark.xfail
@given(numeric_homogeneous_dataframe)
def test_roundtrip_pandas_dataframe(df):
    """Check DataFrame -> DataArray -> DataFrame, plus the hop back to xarray.

    Currently marked as an expected failure.
    """
    # Both axes must be named; otherwise xarray names them 'dim_0', 'dim_1'.
    df.index.name = "rows"
    df.columns.name = "cols"

    as_xarray = xr.DataArray(df)
    back_to_pandas = as_xarray.to_pandas()

    pd.testing.assert_frame_equal(df, back_to_pandas)
    xr.testing.assert_identical(as_xarray, back_to_pandas.to_xarray())
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These may need to be guarded too, using `pytest.importorskip` perhaps? @max-sixty what do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes same as here! https://github.com/max-sixty/xarray/blob/black/properties/test_encode_decode.py#L9
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Aha, I was being distracted by the other errors around the real one. Let's see if the latest commit helps.