Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TST: Refactor some slow tests #53784

Merged
merged 7 commits into from
Jun 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 11 additions & 7 deletions pandas/tests/frame/methods/test_duplicated.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
import sys

import numpy as np
import pytest
Expand All @@ -21,14 +22,17 @@ def test_duplicated_with_misspelled_column_name(subset):
df.duplicated(subset)


@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes():
def test_duplicated_implemented_no_recursion():
# gh-21524
# Given the wide dataframe with a lot of columns
# with different (important!) values
data = {f"col_{i:02d}": np.random.randint(0, 1000, 30000) for i in range(100)}
df = DataFrame(data).T
result = df.duplicated()
# Ensure duplicated isn't implemented using recursion that
# can fail on wide frames
df = DataFrame(np.random.randint(0, 1000, (10, 1000)))
rec_limit = sys.getrecursionlimit()
try:
sys.setrecursionlimit(100)
result = df.duplicated()
finally:
sys.setrecursionlimit(rec_limit)

# Then duplicates produce the bool Series as a result and don't fail during
# calculation. Actual values don't matter here, though usually it's all
Expand Down
64 changes: 28 additions & 36 deletions pandas/tests/groupby/test_nunique.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,51 +17,43 @@


@pytest.mark.slow
@pytest.mark.parametrize("n", 10 ** np.arange(2, 6))
@pytest.mark.parametrize("m", [10, 100, 1000])
@pytest.mark.parametrize("sort", [False, True])
@pytest.mark.parametrize("dropna", [False, True])
def test_series_groupby_nunique(n, m, sort, dropna):
def check_nunique(df, keys, as_index=True):
original_df = df.copy()
gr = df.groupby(keys, as_index=as_index, sort=sort)
left = gr["julie"].nunique(dropna=dropna)

gr = df.groupby(keys, as_index=as_index, sort=sort)
right = gr["julie"].apply(Series.nunique, dropna=dropna)
if not as_index:
right = right.reset_index(drop=True)

if as_index:
tm.assert_series_equal(left, right, check_names=False)
else:
tm.assert_frame_equal(left, right, check_names=False)
tm.assert_frame_equal(df, original_df)

@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("with_nan", [True, False])
@pytest.mark.parametrize("keys", [["joe"], ["joe", "jim"]])
def test_series_groupby_nunique(sort, dropna, as_index, with_nan, keys):
n = 100
m = 10
days = date_range("2015-08-23", periods=10)

frame = DataFrame(
df = DataFrame(
{
"jim": np.random.choice(list(ascii_lowercase), n),
"joe": np.random.choice(days, n),
"julie": np.random.randint(0, m, n),
}
)

check_nunique(frame, ["jim"])
check_nunique(frame, ["jim", "joe"])

frame = frame.astype({"julie": float}) # Explicit cast to avoid implicit cast below
frame.loc[1::17, "jim"] = None
frame.loc[3::37, "joe"] = None
frame.loc[7::19, "julie"] = None
frame.loc[8::19, "julie"] = None
frame.loc[9::19, "julie"] = None

check_nunique(frame, ["jim"])
check_nunique(frame, ["jim", "joe"])
check_nunique(frame, ["jim"], as_index=False)
check_nunique(frame, ["jim", "joe"], as_index=False)
if with_nan:
df = df.astype({"julie": float}) # Explicit cast to avoid implicit cast below
df.loc[1::17, "jim"] = None
df.loc[3::37, "joe"] = None
df.loc[7::19, "julie"] = None
df.loc[8::19, "julie"] = None
df.loc[9::19, "julie"] = None
original_df = df.copy()
gr = df.groupby(keys, as_index=as_index, sort=sort)
left = gr["julie"].nunique(dropna=dropna)

gr = df.groupby(keys, as_index=as_index, sort=sort)
right = gr["julie"].apply(Series.nunique, dropna=dropna)
if not as_index:
right = right.reset_index(drop=True)

if as_index:
tm.assert_series_equal(left, right, check_names=False)
else:
tm.assert_frame_equal(left, right, check_names=False)
tm.assert_frame_equal(df, original_df)


def test_nunique():
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/io/formats/test_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
IS64,
PYPY,
)
import pandas.util._test_decorators as td

from pandas import (
CategoricalIndex,
Expand Down Expand Up @@ -504,9 +503,10 @@ def test_memory_usage_empty_no_warning():
tm.assert_series_equal(result, expected)


@td.skip_if_no("numba")
@pytest.mark.single_cpu
def test_info_compute_numba():
# GH#51922
pytest.importorskip("numba")
df = DataFrame([[1, 2], [3, 4]])

with option_context("compute.use_numba", True):
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1831,9 +1831,10 @@ def test_encoding_latin1_118(self, datapath):
# will block pytests skip mechanism from triggering (failing the test)
# if the path is not present
path = datapath("io", "data", "stata", "stata1_encoding_118.dta")
with tm.assert_produces_warning(UnicodeWarning) as w:
with tm.assert_produces_warning(UnicodeWarning, filter_level="once") as w:
encoded = read_stata(path)
assert len(w) == 151
# filter_level="always" would produce 151 warnings, which can be slow
assert len(w) == 1
assert w[0].message.args[0] == msg

expected = DataFrame([["Düsseldorf"]] * 151, columns=["kreis1849"])
Expand Down
48 changes: 21 additions & 27 deletions pandas/tests/plotting/frame/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1782,39 +1782,33 @@ def _check(axes):
_check_visible(ax.get_xticklabels(minor=True), visible=True)

@td.skip_if_no_scipy
def test_memory_leak(self):
@pytest.mark.parametrize("kind", plotting.PlotAccessor._all_kinds)
def test_memory_leak(self, kind):
"""Check that every plot type gets properly collected."""
results = {}
for kind in plotting.PlotAccessor._all_kinds:
args = {}
if kind in ["hexbin", "scatter", "pie"]:
df = DataFrame(
{
"A": np.random.uniform(size=20),
"B": np.random.uniform(size=20),
"C": np.arange(20) + np.random.uniform(size=20),
}
)
args = {"x": "A", "y": "B"}
elif kind == "area":
df = tm.makeTimeDataFrame().abs()
else:
df = tm.makeTimeDataFrame()

# Use a weakref so we can see if the object gets collected without
# also preventing it from being collected
results[kind] = weakref.proxy(df.plot(kind=kind, **args))
args = {}
if kind in ["hexbin", "scatter", "pie"]:
df = DataFrame(
{
"A": np.random.uniform(size=20),
"B": np.random.uniform(size=20),
"C": np.arange(20) + np.random.uniform(size=20),
}
)
args = {"x": "A", "y": "B"}
elif kind == "area":
df = tm.makeTimeDataFrame().abs()
else:
df = tm.makeTimeDataFrame()

# Use a weakref so we can see if the object gets collected without
# also preventing it from being collected
ref = weakref.ref(df.plot(kind=kind, **args))

# have matplotlib delete all the figures
tm.close()
# force a garbage collection
gc.collect()
msg = "weakly-referenced object no longer exists"
for result_value in results.values():
# check that every plot was collected
with pytest.raises(ReferenceError, match=msg):
# need to actually access something to get an error
result_value.lines
assert ref() is None

def test_df_gridspec_patterns(self):
# GH 10819
Expand Down
134 changes: 76 additions & 58 deletions pandas/tests/plotting/test_boxplot_method.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,38 +392,51 @@ def test_grouped_box_return_type(self, hist_df):
result, None, expected_keys=["height", "weight", "category"]
)

@pytest.mark.slow
def test_grouped_box_return_type_groupby(self, hist_df):
df = hist_df
# now for groupby
result = df.groupby("gender").boxplot(return_type="dict")
_check_box_return_type(result, "dict", expected_keys=["Male", "Female"])

columns2 = "X B C D A G Y N Q O".split()
df2 = DataFrame(np.random.randn(50, 10), columns=columns2)
categories2 = "A B C D E F G H I J".split()
df2["category"] = categories2 * 5
@pytest.mark.slow
@pytest.mark.parametrize("return_type", ["dict", "axes", "both"])
def test_grouped_box_return_type_arg(self, hist_df, return_type):
df = hist_df

for t in ["dict", "axes", "both"]:
returned = df.groupby("classroom").boxplot(return_type=t)
_check_box_return_type(returned, t, expected_keys=["A", "B", "C"])
returned = df.groupby("classroom").boxplot(return_type=return_type)
_check_box_return_type(returned, return_type, expected_keys=["A", "B", "C"])

returned = df.boxplot(by="classroom", return_type=t)
_check_box_return_type(
returned, t, expected_keys=["height", "weight", "category"]
)
returned = df.boxplot(by="classroom", return_type=return_type)
_check_box_return_type(
returned, return_type, expected_keys=["height", "weight", "category"]
)

returned = df2.groupby("category").boxplot(return_type=t)
_check_box_return_type(returned, t, expected_keys=categories2)
@pytest.mark.slow
@pytest.mark.parametrize("return_type", ["dict", "axes", "both"])
def test_grouped_box_return_type_arg_duplcate_cats(self, return_type):
columns2 = "X B C D A".split()
df2 = DataFrame(np.random.randn(6, 5), columns=columns2)
categories2 = "A B".split()
df2["category"] = categories2 * 3

returned = df2.groupby("category").boxplot(return_type=return_type)
_check_box_return_type(returned, return_type, expected_keys=categories2)

returned = df2.boxplot(by="category", return_type=t)
_check_box_return_type(returned, t, expected_keys=columns2)
returned = df2.boxplot(by="category", return_type=return_type)
_check_box_return_type(returned, return_type, expected_keys=columns2)

@pytest.mark.slow
def test_grouped_box_layout(self, hist_df):
def test_grouped_box_layout_too_small(self, hist_df):
df = hist_df

msg = "Layout of 1x1 must be larger than required size 2"
with pytest.raises(ValueError, match=msg):
df.boxplot(column=["weight", "height"], by=df.gender, layout=(1, 1))

@pytest.mark.slow
def test_grouped_box_layout_needs_by(self, hist_df):
df = hist_df
msg = "The 'layout' keyword is not supported when 'by' is None"
with pytest.raises(ValueError, match=msg):
df.boxplot(
Expand All @@ -432,79 +445,84 @@ def test_grouped_box_layout(self, hist_df):
return_type="dict",
)

@pytest.mark.slow
def test_grouped_box_layout_positive_layout(self, hist_df):
df = hist_df
msg = "At least one dimension of layout must be positive"
with pytest.raises(ValueError, match=msg):
df.boxplot(column=["weight", "height"], by=df.gender, layout=(-1, -1))

# _check_plot_works adds an ax so catch warning. see GH #13188
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
_check_plot_works(
df.groupby("gender").boxplot, column="height", return_type="dict"
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=2, layout=(1, 2))

with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
_check_plot_works(
df.groupby("category").boxplot, column="height", return_type="dict"
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=4, layout=(2, 2))

# GH 6769
@pytest.mark.slow
@pytest.mark.parametrize(
"gb_key, axes_num, rows",
[["gender", 2, 1], ["category", 4, 2], ["classroom", 3, 2]],
)
def test_grouped_box_layout_positive_layout_axes(
self, hist_df, gb_key, axes_num, rows
):
df = hist_df
# _check_plot_works adds an ax, so catch the warning; see GH #13188 and GH #6769
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
_check_plot_works(
df.groupby("classroom").boxplot, column="height", return_type="dict"
df.groupby(gb_key).boxplot, column="height", return_type="dict"
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(2, 2))
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=axes_num, layout=(rows, 2))

@pytest.mark.slow
@pytest.mark.parametrize(
"col, visible", [["height", False], ["weight", True], ["category", True]]
)
def test_grouped_box_layout_visible(self, hist_df, col, visible):
df = hist_df
# GH 5897
axes = df.boxplot(
column=["height", "weight", "category"], by="gender", return_type="axes"
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(2, 2))
for ax in [axes["height"]]:
_check_visible(ax.get_xticklabels(), visible=False)
_check_visible([ax.xaxis.get_label()], visible=False)
for ax in [axes["weight"], axes["category"]]:
_check_visible(ax.get_xticklabels())
_check_visible([ax.xaxis.get_label()])
ax = axes[col]
_check_visible(ax.get_xticklabels(), visible=visible)
_check_visible([ax.xaxis.get_label()], visible=visible)

@pytest.mark.slow
def test_grouped_box_layout_shape(self, hist_df):
df = hist_df
df.groupby("classroom").boxplot(
column=["height", "weight", "category"], return_type="dict"
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(2, 2))

@pytest.mark.slow
@pytest.mark.parametrize("cols", [2, -1])
def test_grouped_box_layout_works(self, hist_df, cols):
df = hist_df
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
_check_plot_works(
df.groupby("category").boxplot,
column="height",
layout=(3, 2),
return_type="dict",
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=4, layout=(3, 2))
with tm.assert_produces_warning(UserWarning, check_stacklevel=False):
_check_plot_works(
df.groupby("category").boxplot,
column="height",
layout=(3, -1),
layout=(3, cols),
return_type="dict",
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=4, layout=(3, 2))

df.boxplot(column=["height", "weight", "category"], by="gender", layout=(4, 1))
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(4, 1))

df.boxplot(column=["height", "weight", "category"], by="gender", layout=(-1, 1))
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(3, 1))

df.groupby("classroom").boxplot(
column=["height", "weight", "category"], layout=(1, 4), return_type="dict"
@pytest.mark.slow
@pytest.mark.parametrize("rows, res", [[4, 4], [-1, 3]])
def test_grouped_box_layout_axes_shape_rows(self, hist_df, rows, res):
df = hist_df
df.boxplot(
column=["height", "weight", "category"], by="gender", layout=(rows, 1)
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(1, 4))
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(res, 1))

@pytest.mark.slow
@pytest.mark.parametrize("cols, res", [[4, 4], [-1, 3]])
def test_grouped_box_layout_axes_shape_cols_groupby(self, hist_df, cols, res):
df = hist_df
df.groupby("classroom").boxplot(
column=["height", "weight", "category"], layout=(1, -1), return_type="dict"
column=["height", "weight", "category"],
layout=(1, cols),
return_type="dict",
)
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(1, 3))
_check_axes_shape(mpl.pyplot.gcf().axes, axes_num=3, layout=(1, res))

@pytest.mark.slow
def test_grouped_box_multiple_axes(self, hist_df):
Expand Down
Loading