-
Notifications
You must be signed in to change notification settings - Fork 915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Expand statistics support in ORC writer #13848
Changes from 15 commits
b614fe8
9a7988a
70c3b28
b392376
eb6cdae
4776140
e277645
9fbe6c5
d833ff0
1ca376a
1db56c5
950cec8
50b67d8
cb61069
96b3112
01af60b
2f35d5a
cc54019
864bdd1
fcfa662
ee1347f
d7facda
bfa0d8b
cb71541
227a9c1
23a5e14
f2f6090
64636f9
75fe574
c3b7410
e79496c
7d0e3c7
b5dbea1
7b27f56
e65310d
8c58c6f
eacb578
3afcd64
3746cb4
7808cb3
f181df2
3c0da37
3a61eee
8742633
8920b71
d027019
2136109
62862b4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -633,16 +633,19 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): | |
for col in gdf: | ||
if "minimum" in file_stats[0][col]: | ||
stats_min = file_stats[0][col]["minimum"] | ||
actual_min = gdf[col].min() | ||
assert normalized_equals(actual_min, stats_min) | ||
if stats_min is not None: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Under what circumstances does There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Great question! |
||
actual_min = gdf[col].min() | ||
assert normalized_equals(actual_min, stats_min) | ||
if "maximum" in file_stats[0][col]: | ||
stats_max = file_stats[0][col]["maximum"] | ||
actual_max = gdf[col].max() | ||
assert normalized_equals(actual_max, stats_max) | ||
if stats_max is not None: | ||
actual_max = gdf[col].max() | ||
assert normalized_equals(actual_max, stats_max) | ||
if "number_of_values" in file_stats[0][col]: | ||
stats_num_vals = file_stats[0][col]["number_of_values"] | ||
actual_num_vals = gdf[col].count() | ||
assert stats_num_vals == actual_num_vals | ||
if stats_num_vals is not None: | ||
actual_num_vals = gdf[col].count() | ||
assert stats_num_vals == actual_num_vals | ||
|
||
# compare stripe statistics with actual min/max | ||
for stripe_idx in range(0, orc_file.nstripes): | ||
|
@@ -651,21 +654,24 @@ def test_orc_write_statistics(tmpdir, datadir, nrows, stats_freq): | |
stripe_df = cudf.DataFrame(stripe.to_pandas()) | ||
for col in stripe_df: | ||
if "minimum" in stripes_stats[stripe_idx][col]: | ||
actual_min = stripe_df[col].min() | ||
stats_min = stripes_stats[stripe_idx][col]["minimum"] | ||
assert normalized_equals(actual_min, stats_min) | ||
if stats_min is not None: | ||
actual_min = stripe_df[col].min() | ||
assert normalized_equals(actual_min, stats_min) | ||
|
||
if "maximum" in stripes_stats[stripe_idx][col]: | ||
actual_max = stripe_df[col].max() | ||
stats_max = stripes_stats[stripe_idx][col]["maximum"] | ||
assert normalized_equals(actual_max, stats_max) | ||
if stats_max is not None: | ||
actual_max = stripe_df[col].max() | ||
assert normalized_equals(actual_max, stats_max) | ||
|
||
if "number_of_values" in stripes_stats[stripe_idx][col]: | ||
stats_num_vals = stripes_stats[stripe_idx][col][ | ||
"number_of_values" | ||
] | ||
actual_num_vals = stripe_df[col].count() | ||
assert stats_num_vals == actual_num_vals | ||
if stats_num_vals is not None: | ||
actual_num_vals = stripe_df[col].count() | ||
assert stats_num_vals == actual_num_vals | ||
|
||
|
||
@pytest.mark.parametrize("stats_freq", ["STRIPE", "ROWGROUP"]) | ||
|
@@ -733,16 +739,19 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): | |
for col in expect: | ||
if "minimum" in file_stats[0][col]: | ||
stats_min = file_stats[0][col]["minimum"] | ||
actual_min = expect[col].min() | ||
assert normalized_equals(actual_min, stats_min) | ||
if stats_min is not None: | ||
actual_min = expect[col].min() | ||
assert normalized_equals(actual_min, stats_min) | ||
if "maximum" in file_stats[0][col]: | ||
stats_max = file_stats[0][col]["maximum"] | ||
actual_max = expect[col].max() | ||
assert normalized_equals(actual_max, stats_max) | ||
if stats_max is not None: | ||
actual_max = expect[col].max() | ||
assert normalized_equals(actual_max, stats_max) | ||
if "number_of_values" in file_stats[0][col]: | ||
stats_num_vals = file_stats[0][col]["number_of_values"] | ||
actual_num_vals = expect[col].count() | ||
assert stats_num_vals == actual_num_vals | ||
if stats_num_vals is not None: | ||
actual_num_vals = expect[col].count() | ||
assert stats_num_vals == actual_num_vals | ||
|
||
# compare stripe statistics with actual min/max | ||
for stripe_idx in range(0, orc_file.nstripes): | ||
|
@@ -751,21 +760,24 @@ def test_orc_chunked_write_statistics(tmpdir, datadir, nrows, stats_freq): | |
stripe_df = cudf.DataFrame(stripe.to_pandas()) | ||
for col in stripe_df: | ||
if "minimum" in stripes_stats[stripe_idx][col]: | ||
actual_min = stripe_df[col].min() | ||
stats_min = stripes_stats[stripe_idx][col]["minimum"] | ||
assert normalized_equals(actual_min, stats_min) | ||
if stats_min is not None: | ||
actual_min = stripe_df[col].min() | ||
assert normalized_equals(actual_min, stats_min) | ||
|
||
if "maximum" in stripes_stats[stripe_idx][col]: | ||
actual_max = stripe_df[col].max() | ||
stats_max = stripes_stats[stripe_idx][col]["maximum"] | ||
assert normalized_equals(actual_max, stats_max) | ||
if stats_max is not None: | ||
actual_max = stripe_df[col].max() | ||
assert normalized_equals(actual_max, stats_max) | ||
|
||
if "number_of_values" in stripes_stats[stripe_idx][col]: | ||
stats_num_vals = stripes_stats[stripe_idx][col][ | ||
"number_of_values" | ||
] | ||
actual_num_vals = stripe_df[col].count() | ||
assert stats_num_vals == actual_num_vals | ||
if stats_num_vals is not None: | ||
actual_num_vals = stripe_df[col].count() | ||
assert stats_num_vals == actual_num_vals | ||
|
||
|
||
@pytest.mark.parametrize("nrows", [1, 100, 6000000]) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
was unused