Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce pytest runtime #10203

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions python/cudf/cudf/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from io import BytesIO, StringIO
bdice marked this conversation as resolved.
Show resolved Hide resolved
from pathlib import Path

import cupy as cp
import numpy as np
import pandas as pd
import pytest
Expand Down Expand Up @@ -1009,17 +1010,17 @@ def test_small_zip(tmpdir):
def test_csv_reader_carriage_return(tmpdir):
    """Check that ``read_csv`` parses rows terminated by ``\\r\\n``.

    A two-column CSV (a row counter and its double) is built in memory with
    CRLF line endings, parsed with ``read_csv``, and compared against a
    DataFrame built from the same values.

    ``tmpdir`` is not used by the body; it is kept so the test's signature
    stays unchanged.
    """
    rows = 1000
    names = ["int_row", "int_double_row"]

    # Assemble the header and all data rows in one pass instead of growing
    # the string with ``+=`` inside a loop.
    header = ",".join(names) + "\r\n"
    body = "".join(f"{row}, {2 * row}\r\n" for row in range(rows))
    buffer = header + body

    df = read_csv(StringIO(buffer))
    expect = cudf.DataFrame(
        {"int_row": cp.arange(rows), "int_double_row": cp.arange(rows) * 2}
    )

    # Cheap shape check first so a row-count mismatch fails with a short,
    # obvious message.
    assert len(df) == rows
    # One whole-frame comparison replaces the former per-row loop, which
    # indexed every element of both columns individually (2 * rows scalar
    # lookups) to verify the same values.
    assert_eq(expect, df)


def test_csv_reader_tabs():
Expand Down
22 changes: 10 additions & 12 deletions python/cudf/cudf/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1294,8 +1294,8 @@ def test_loc_datetime_index(sli, is_dataframe):
@pytest.mark.parametrize(
bdice marked this conversation as resolved.
Show resolved Hide resolved
"gdf",
[
cudf.DataFrame({"a": range(1000000)}),
cudf.DataFrame({"a": range(1000000), "b": range(1000000)}),
cudf.DataFrame({"a": range(100000)}),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we remove the construction of GPU objects from the parametrize call? Those objects are built at test-collection time, which is very expensive. They can instead be constructed lazily inside the test, like this:

@pytest.mark.parametrize(
    "gdf_kwargs",
    [
        dict(data={"a": range(100000)}),
        dict(data={"a": range(100000), "b": range(100000)}),
        # ...
        dict(index=[1, 2, 3]),
        # ...
    ],
)

then:

def test_dataframe_sliced(gdf_kwargs, slice):
    gdf = cudf.DataFrame(**gdf_kwargs)
    pdf = gdf.to_pandas()
    # ...

cudf.DataFrame({"a": range(100000), "b": range(100000)}),
cudf.DataFrame({"a": range(20), "b": range(20)}),
brandon-b-miller marked this conversation as resolved.
Show resolved Hide resolved
cudf.DataFrame(
{
Expand All @@ -1305,25 +1305,23 @@ def test_loc_datetime_index(sli, is_dataframe):
}
),
cudf.DataFrame(index=[1, 2, 3]),
cudf.DataFrame(index=range(1000000)),
cudf.DataFrame(index=range(100000)),
cudf.DataFrame(columns=["a", "b", "c", "d"]),
cudf.DataFrame(columns=["a"], index=range(1000000)),
cudf.DataFrame(
columns=["a", "col2", "...col n"], index=range(1000000)
),
cudf.DataFrame(index=cudf.Series(range(1000000)).astype("str")),
cudf.DataFrame(columns=["a"], index=range(100000)),
cudf.DataFrame(columns=["a", "col2", "...col n"], index=range(100000)),
cudf.DataFrame(index=cudf.Series(range(100000)).astype("str")),
cudf.DataFrame(
columns=["a", "b", "c", "d"],
index=cudf.Series(range(1000000)).astype("str"),
index=cudf.Series(range(100000)).astype("str"),
),
],
)
@pytest.mark.parametrize(
"slice",
[
slice(250000, 500000),
slice(250000, 250001),
slice(500000),
slice(25000, 50000),
slice(25000, 25001),
slice(50000),
slice(1, 10),
slice(10, 20),
slice(15, 24000),
Expand Down
10 changes: 5 additions & 5 deletions python/cudf/cudf/tests/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import cudf
from cudf.io.orc import ORCWriter
from cudf.testing import assert_frame_equal
from cudf.testing._utils import (
assert_eq,
gen_rand_series,
Expand Down Expand Up @@ -93,7 +94,7 @@ def test_orc_reader_basic(datadir, inputfile, columns, use_index, engine):
path, engine=engine, columns=columns, use_index=use_index
)

assert_eq(expect, got, check_categorical=False)
assert_frame_equal(cudf.from_pandas(expect), got, check_categorical=False)


def test_orc_reader_filenotfound(tmpdir):
Expand Down Expand Up @@ -388,7 +389,7 @@ def test_orc_writer(datadir, tmpdir, reference_file, columns, compression):
cudf.from_pandas(expect).to_orc(gdf_fname.strpath, compression=compression)
got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()

assert_eq(expect, got)
assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got))
bdice marked this conversation as resolved.
Show resolved Hide resolved


@pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"])
Expand All @@ -409,7 +410,7 @@ def test_orc_writer_statistics_frequency(datadir, tmpdir, stats_freq):
cudf.from_pandas(expect).to_orc(gdf_fname.strpath, statistics=stats_freq)
got = pa.orc.ORCFile(gdf_fname).read().to_pandas()

assert_eq(expect, got)
assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got))


@pytest.mark.parametrize("stats_freq", ["NONE", "STRIPE", "ROWGROUP"])
Expand Down Expand Up @@ -492,8 +493,7 @@ def test_chunked_orc_writer(
writer.close()

got = pa.orc.ORCFile(gdf_fname).read(columns=columns).to_pandas()

assert_eq(expect, got)
assert_frame_equal(cudf.from_pandas(expect), cudf.from_pandas(got))


@pytest.mark.parametrize(
Expand Down