Skip to content

Commit

Permalink
TST: Refactor some slow tests (pandas-dev#53784)
Browse files Browse the repository at this point in the history
* Cleanup single used method

* Clean plotting test

* Improve test_series_groupby_nunique

* Address more slow tests

* Undo changes
  • Loading branch information
mroeschke authored and im-vinicius committed Jul 8, 2023
1 parent 1418b11 commit c318752
Show file tree
Hide file tree
Showing 7 changed files with 199 additions and 183 deletions.
18 changes: 11 additions & 7 deletions pandas/tests/frame/methods/test_duplicated.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
import sys

import numpy as np
import pytest
Expand All @@ -21,14 +22,17 @@ def test_duplicated_with_misspelled_column_name(subset):
df.duplicated(subset)


@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes():
def test_duplicated_implemented_no_recursion():
# gh-21524
# Given the wide dataframe with a lot of columns
# with different (important!) values
data = {f"col_{i:02d}": np.random.randint(0, 1000, 30000) for i in range(100)}
df = DataFrame(data).T
result = df.duplicated()
# Ensure duplicated isn't implemented using recursion that
# can fail on wide frames
df = DataFrame(np.random.randint(0, 1000, (10, 1000)))
rec_limit = sys.getrecursionlimit()
try:
sys.setrecursionlimit(100)
result = df.duplicated()
finally:
sys.setrecursionlimit(rec_limit)

# Then duplicates produce the bool Series as a result and don't fail during
# calculation. Actual values don't matter here, though usually it's all
Expand Down
64 changes: 28 additions & 36 deletions pandas/tests/groupby/test_nunique.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,51 +17,43 @@


@pytest.mark.slow
@pytest.mark.parametrize("n", 10 ** np.arange(2, 6))
@pytest.mark.parametrize("m", [10, 100, 1000])
@pytest.mark.parametrize("sort", [False, True])
@pytest.mark.parametrize("dropna", [False, True])
def test_series_groupby_nunique(n, m, sort, dropna):
def check_nunique(df, keys, as_index=True):
original_df = df.copy()
gr = df.groupby(keys, as_index=as_index, sort=sort)
left = gr["julie"].nunique(dropna=dropna)

gr = df.groupby(keys, as_index=as_index, sort=sort)
right = gr["julie"].apply(Series.nunique, dropna=dropna)
if not as_index:
right = right.reset_index(drop=True)

if as_index:
tm.assert_series_equal(left, right, check_names=False)
else:
tm.assert_frame_equal(left, right, check_names=False)
tm.assert_frame_equal(df, original_df)

@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("with_nan", [True, False])
@pytest.mark.parametrize("keys", [["joe"], ["joe", "jim"]])
def test_series_groupby_nunique(sort, dropna, as_index, with_nan, keys):
n = 100
m = 10
days = date_range("2015-08-23", periods=10)

frame = DataFrame(
df = DataFrame(
{
"jim": np.random.choice(list(ascii_lowercase), n),
"joe": np.random.choice(days, n),
"julie": np.random.randint(0, m, n),
}
)

check_nunique(frame, ["jim"])
check_nunique(frame, ["jim", "joe"])

frame = frame.astype({"julie": float}) # Explicit cast to avoid implicit cast below
frame.loc[1::17, "jim"] = None
frame.loc[3::37, "joe"] = None
frame.loc[7::19, "julie"] = None
frame.loc[8::19, "julie"] = None
frame.loc[9::19, "julie"] = None

check_nunique(frame, ["jim"])
check_nunique(frame, ["jim", "joe"])
check_nunique(frame, ["jim"], as_index=False)
check_nunique(frame, ["jim", "joe"], as_index=False)
if with_nan:
df = df.astype({"julie": float}) # Explicit cast to avoid implicit cast below
df.loc[1::17, "jim"] = None
df.loc[3::37, "joe"] = None
df.loc[7::19, "julie"] = None
df.loc[8::19, "julie"] = None
df.loc[9::19, "julie"] = None
original_df = df.copy()
gr = df.groupby(keys, as_index=as_index, sort=sort)
left = gr["julie"].nunique(dropna=dropna)

gr = df.groupby(keys, as_index=as_index, sort=sort)
right = gr["julie"].apply(Series.nunique, dropna=dropna)
if not as_index:
right = right.reset_index(drop=True)

if as_index:
tm.assert_series_equal(left, right, check_names=False)
else:
tm.assert_frame_equal(left, right, check_names=False)
tm.assert_frame_equal(df, original_df)


def test_nunique():
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/formats/test_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
IS64,
PYPY,
)
import pandas.util._test_decorators as td

from pandas import (
CategoricalIndex,
Expand Down Expand Up @@ -504,9 +503,10 @@ def test_memory_usage_empty_no_warning():
tm.assert_series_equal(result, expected)


@td.skip_if_no("numba")
@pytest.mark.single_cpu
def test_info_compute_numba():
# GH#51922
pytest.importorskip("numba")
df = DataFrame([[1, 2], [3, 4]])

with option_context("compute.use_numba", True):
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1831,9 +1831,10 @@ def test_encoding_latin1_118(self, datapath):
# will block pytests skip mechanism from triggering (failing the test)
# if the path is not present
path = datapath("io", "data", "stata", "stata1_encoding_118.dta")
with tm.assert_produces_warning(UnicodeWarning) as w:
with tm.assert_produces_warning(UnicodeWarning, filter_level="once") as w:
encoded = read_stata(path)
assert len(w) == 151
# with filter_level="always", produces 151 warnings which can be slow
assert len(w) == 1
assert w[0].message.args[0] == msg

expected = DataFrame([["Düsseldorf"]] * 151, columns=["kreis1849"])
Expand Down
48 changes: 21 additions & 27 deletions pandas/tests/plotting/frame/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1782,39 +1782,33 @@ def _check(axes):
_check_visible(ax.get_xticklabels(minor=True), visible=True)

@td.skip_if_no_scipy
def test_memory_leak(self):
@pytest.mark.parametrize("kind", plotting.PlotAccessor._all_kinds)
def test_memory_leak(self, kind):
"""Check that every plot type gets properly collected."""
results = {}
for kind in plotting.PlotAccessor._all_kinds:
args = {}
if kind in ["hexbin", "scatter", "pie"]:
df = DataFrame(
{
"A": np.random.uniform(size=20),
"B": np.random.uniform(size=20),
"C": np.arange(20) + np.random.uniform(size=20),
}
)
args = {"x": "A", "y": "B"}
elif kind == "area":
df = tm.makeTimeDataFrame().abs()
else:
df = tm.makeTimeDataFrame()

# Use a weakref so we can see if the object gets collected without
# also preventing it from being collected
results[kind] = weakref.proxy(df.plot(kind=kind, **args))
args = {}
if kind in ["hexbin", "scatter", "pie"]:
df = DataFrame(
{
"A": np.random.uniform(size=20),
"B": np.random.uniform(size=20),
"C": np.arange(20) + np.random.uniform(size=20),
}
)
args = {"x": "A", "y": "B"}
elif kind == "area":
df = tm.makeTimeDataFrame().abs()
else:
df = tm.makeTimeDataFrame()

# Use a weakref so we can see if the object gets collected without
# also preventing it from being collected
ref = weakref.ref(df.plot(kind=kind, **args))

# have matplotlib delete all the figures
tm.close()
# force a garbage collection
gc.collect()
msg = "weakly-referenced object no longer exists"
for result_value in results.values():
# check that every plot was collected
with pytest.raises(ReferenceError, match=msg):
# need to actually access something to get an error
result_value.lines
assert ref() is None

def test_df_gridspec_patterns(self):
# GH 10819
Expand Down
134 changes: 76 additions & 58 deletions pandas/tests/plotting/test_boxplot_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,38 +392,51 @@ def test_grouped_box_return_type(self, hist_df):
result, None, expected_keys=["height", "weight", "category"]
)

@pytest.mark.slow
def test_grouped_box_return_type_groupby(self, hist_df):
df = hist_df
# now for groupby
result = df.groupby("gender").boxplot(return_type="dict")
_check_box_return_type(result, "dict", expected_keys=["Male", "Female"])

columns2 = "X B C D A G Y N Q O".split()
df2 = DataFrame(np.random.randn(50, 10), columns=columns2)
categories2 = "A B C D E F G H I J".split()
df2["category"] = categories2 * 5
@pytest.mark.slow
@pytest.mark.parametrize("return_type", ["dict", "axes", "both"])
def test_grouped_box_return_type_arg(self, hist_df, return_type):
df = hist_df

for t in ["dict", "axes", "both"]:
returned = df.groupby("classroom").boxplot(return_type=t)
_check_box_return_type(returned, t, expected_keys=["A", "B", "C"])
returned = df.groupby("classroom").boxplot(return_type=return_type)
_check_box_return_type(returned, return_type, expected_keys=["A", "B", "C"])

returned = df.boxplot(by="classroom", return_type=t)
_check_box_return_type(
returned, t, expected_keys=["height", "weight", "category"]
)
returned = df.boxplot(by="classroom", return_type=return_type)
_check_box_return_type(
returned, return_type, expected_keys=["height", "weight", "category"]
)

returned = df2.groupby("category").boxplot(return_type=t)
_check_box_return_type(returned, t, expected_keys=categories2)
@pytest.mark.slow
@pytest.mark.parametrize("return_type", ["dict", "axes", "both"])
def test_grouped_box_return_type_arg_duplcate_cats(self, return_type):
columns2 = "X B C D A".split()
df2 = DataFrame(np.random.randn(6, 5), columns=columns2)
categories2 = "A B".split()
df2["category"] = categories2 * 3

returned = df2.groupby("category").boxplot(return_type=return_type)
_check_box_return_type(returned, return_type, expected_keys=categories2)

returned = df2.boxplot(by="category", return_type=t)
_check_box_return_type(returned, t, expected_keys=columns2)
returned = df2.boxplot(by="category", return_type=return_type)
_check_box_return_type(returned, return_type, expected_keys=columns2)

@pytest.mark.slow
def test_grouped_box_layout(self, hist_df):
def test_grouped_box_layout_too_small(self, hist_df):
df = hist_df

msg = "Layout of 1x1 must be larger than required size 2"
with pytest.raises(ValueError, match=msg):
df.boxplot(column=["weight", "height"], by=df.gender, layout=(1, 1))

@pytest.mark.slow
def test_grouped_box_layout_needs_by(self, hist_df):
df = hist_df
msg = "The 'layout' keyword is not supported when 'by' is None"
with pytest.raises(ValueError, match=msg):
df.boxplot(
Expand All @@ -432,79 +445,84 @@ def test_grouped_box_layout(self, hist_df):
return_type="dict",
)

@pytest.mark.slow
def test_grouped_box_layout_positive_layout(self, hist_df):
df = hist_df
msg = "At least one dimension of layout must be positive"
with pytest.raises(ValueError, match=msg):
df.boxplot(column=["weight", "height"], by=df.gender, layout=(-1, -1))

# _check_plot_works adds an ax so catch warning. see GH #13188
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
_check_plot_works(
df.groupby("gender").boxplot, column="height", return_type="dict"
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=2, layout=(1, 2))

with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
_check_plot_works(
df.groupby("category").boxplot, column="height", return_type="dict"
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=4, layout=(2, 2))

# GH 6769
@pytest.mark.slow
@pytest.mark.parametrize(
"gb_key, axes_num, rows",
[["gender", 2, 1], ["category", 4, 2], ["classroom", 3, 2]],
)
def test_grouped_box_layout_positive_layout_axes(
self, hist_df, gb_key, axes_num, rows
):
df = hist_df
# _check_plot_works adds an ax so catch warning. See GH #13188 and GH #6769
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
_check_plot_works(
df.groupby("classroom").boxplot, column="height", return_type="dict"
df.groupby(gb_key).boxplot, column="height", return_type="dict"
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(2, 2))
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=axes_num, layout=(rows, 2))

@pytest.mark.slow
@pytest.mark.parametrize(
"col, visible", [["height", False], ["weight", True], ["category", True]]
)
def test_grouped_box_layout_visible(self, hist_df, col, visible):
df = hist_df
# GH 5897
axes = df.boxplot(
column=["height", "weight", "category"], by="gender", return_type="axes"
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(2, 2))
for ax in [axes["height"]]:
_check_visible(ax.get_xticklabels(), visible=False)
_check_visible([ax.xaxis.get_label()], visible=False)
for ax in [axes["weight"], axes["category"]]:
_check_visible(ax.get_xticklabels())
_check_visible([ax.xaxis.get_label()])
ax = axes[col]
_check_visible(ax.get_xticklabels(), visible=visible)
_check_visible([ax.xaxis.get_label()], visible=visible)

@pytest.mark.slow
def test_grouped_box_layout_shape(self, hist_df):
df = hist_df
df.groupby("classroom").boxplot(
column=["height", "weight", "category"], return_type="dict"
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(2, 2))

@pytest.mark.slow
@pytest.mark.parametrize("cols", [2, -1])
def test_grouped_box_layout_works(self, hist_df, cols):
df = hist_df
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
_check_plot_works(
df.groupby("category").boxplot,
column="height",
layout=(3, 2),
return_type="dict",
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=4, layout=(3, 2))
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
_check_plot_works(
df.groupby("category").boxplot,
column="height",
layout=(3, -1),
layout=(3, cols),
return_type="dict",
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=4, layout=(3, 2))

df.boxplot(column=["height", "weight", "category"], by="gender", layout=(4, 1))
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(4, 1))

df.boxplot(column=["height", "weight", "category"], by="gender", layout=(-1, 1))
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(3, 1))

df.groupby("classroom").boxplot(
column=["height", "weight", "category"], layout=(1, 4), return_type="dict"
@pytest.mark.slow
@pytest.mark.parametrize("rows, res", [[4, 4], [-1, 3]])
def test_grouped_box_layout_axes_shape_rows(self, hist_df, rows, res):
df = hist_df
df.boxplot(
column=["height", "weight", "category"], by="gender", layout=(rows, 1)
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(1, 4))
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(res, 1))

@pytest.mark.slow
@pytest.mark.parametrize("cols, res", [[4, 4], [-1, 3]])
def test_grouped_box_layout_axes_shape_cols_groupby(self, hist_df, cols, res):
df = hist_df
df.groupby("classroom").boxplot(
column=["height", "weight", "category"], layout=(1, -1), return_type="dict"
column=["height", "weight", "category"],
layout=(1, cols),
return_type="dict",
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(1, 3))
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(1, res))

@pytest.mark.slow
def test_grouped_box_multiple_axes(self, hist_df):
Expand Down
Loading

0 comments on commit c318752

Please sign in to comment.