-
Notifications
You must be signed in to change notification settings - Fork 928
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Reduce execution time of Python ORC tests #14776
Changes from all commits
b1f60d8
ac7ba75
af56dd4
dc021d7
0d4de48
1f9b727
fc3a005
eedfc0d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -604,13 +604,13 @@ def normalized_equals(value1, value2): | |
|
||
|
||
@pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) | ||
@pytest.mark.parametrize("nrows", [1, 100, 6000000]) | ||
@pytest.mark.parametrize("nrows", [1, 100, 100000]) | ||
def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): | ||
from pyarrow import orc | ||
|
||
supported_stat_types = supported_numpy_dtypes + ["str"] | ||
# Can't write random bool columns until issue #6763 is fixed | ||
if nrows == 6000000: | ||
# Writing bool columns to multiple row groups is disabled until #6763 is fixed | ||
if nrows == 100000: | ||
supported_stat_types.remove("bool") | ||
|
||
# Make a dataframe | ||
|
@@ -623,7 +623,7 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): | |
fname = tmpdir.join("gdf.orc") | ||
|
||
# Write said dataframe to ORC with cuDF | ||
gdf.to_orc(fname.strpath, statistics=stats_freq) | ||
gdf.to_orc(fname.strpath, statistics=stats_freq, stripe_size_rows=30000) | ||
|
||
# Read back written ORC's statistics | ||
orc_file = orc.ORCFile(fname) | ||
|
@@ -678,28 +678,30 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): | |
|
||
|
||
@pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) | ||
@pytest.mark.parametrize("nrows", [2, 100, 6000000]) | ||
@pytest.mark.parametrize("nrows", [2, 100, 200000]) | ||
def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): | ||
from pyarrow import orc | ||
|
||
np.random.seed(0) | ||
supported_stat_types = supported_numpy_dtypes + ["str"] | ||
# Can't write random bool columns until issue #6763 is fixed | ||
if nrows == 6000000: | ||
# Writing bool columns to multiple row groups is disabled until #6763 is fixed | ||
if nrows == 200000: | ||
supported_stat_types.remove("bool") | ||
|
||
gdf_fname = tmpdir.join("chunked_stats.orc") | ||
writer = ORCWriter(gdf_fname) | ||
writer = ORCWriter( | ||
gdf_fname, statistics=stats_freq, stripe_size_rows=30000 | ||
) | ||
|
||
max_char_length = 1000 if nrows < 10000 else 100 | ||
max_char_length = 100 if nrows < 10000 else 10 | ||
|
||
# Make a dataframe | ||
gdf = cudf.DataFrame( | ||
{ | ||
"col_" | ||
+ str(dtype): gen_rand_series( | ||
dtype, | ||
int(nrows / 2), | ||
nrows // 2, | ||
has_nulls=True, | ||
low=0, | ||
high=max_char_length, | ||
|
@@ -718,7 +720,7 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): | |
"col_" | ||
+ str(dtype): gen_rand_series( | ||
dtype, | ||
int(nrows / 2), | ||
nrows // 2, | ||
has_nulls=True, | ||
low=0, | ||
high=max_char_length, | ||
|
@@ -785,7 +787,7 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): | |
assert stats_num_vals == actual_num_vals | ||
|
||
|
||
@pytest.mark.parametrize("nrows", [1, 100, 6000000]) | ||
@pytest.mark.parametrize("nrows", [1, 100, 100000]) | ||
def test_orc_write_bool_statistics(tmpdir, datadir, nrows): | ||
from pyarrow import orc | ||
|
||
|
@@ -794,7 +796,7 @@ def test_orc_write_bool_statistics(tmpdir, datadir, nrows): | |
fname = tmpdir.join("gdf.orc") | ||
|
||
# Write said dataframe to ORC with cuDF | ||
gdf.to_orc(fname.strpath) | ||
gdf.to_orc(fname.strpath, stripe_size_rows=30000) | ||
|
||
# Read back written ORC's statistics | ||
orc_file = orc.ORCFile(fname) | ||
|
@@ -848,21 +850,20 @@ def test_orc_bool_encode_fail(): | |
np.random.seed(0) | ||
buffer = BytesIO() | ||
|
||
# Generate a boolean column longer than a single stripe | ||
fail_df = cudf.DataFrame({"col": gen_rand_series("bool", 600000)}) | ||
# Invalidate the first row in the second stripe to break encoding | ||
fail_df["col"][500000] = None | ||
# Generate a boolean column longer than a single row group | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Modified this test based on the actual checks we perform on bool columns - all row groups except for the last one in each stripe need to have the number of valid elements divisible by 8. The row group size is 10k, so a single null fails this check and the writer should throw. |
||
fail_df = cudf.DataFrame({"col": gen_rand_series("bool", 20000)}) | ||
# Invalidate a row in the first row group | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the old comment was incorrect, the test file had a single stripe |
||
fail_df["col"][5000] = None | ||
|
||
# Should throw instead of generating a file that is incompatible | ||
# with other readers (see issue #6763) | ||
with pytest.raises(RuntimeError): | ||
fail_df.to_orc(buffer) | ||
|
||
# Generate a boolean column that fits into a single stripe | ||
okay_df = cudf.DataFrame({"col": gen_rand_series("bool", 500000)}) | ||
okay_df["col"][500000 - 1] = None | ||
# Invalid row is in the last row group of the stripe; | ||
# encoding is assumed to be correct | ||
# Generate a boolean column longer than a single row group | ||
okay_df = cudf.DataFrame({"col": gen_rand_series("bool", 20000)}) | ||
okay_df["col"][15000] = None | ||
# Invalid row is in the last row group; encoding is assumed to be correct | ||
okay_df.to_orc(buffer) | ||
|
||
# Also validate data | ||
|
@@ -1130,7 +1131,7 @@ def test_pyspark_struct(datadir): | |
assert_eq(pdf, gdf) | ||
|
||
|
||
def gen_map_buff(size=10000): | ||
def gen_map_buff(size): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. default value was unused |
||
from string import ascii_letters as al | ||
|
||
from pyarrow import orc | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why does this work for nrows=1 or nrows=100?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can write a single row group of random bools, just not multiple (at least not in the way that does not cause issues with other readers). So anything below 10k rows is fine. I know this is very hacky :(