-
Notifications
You must be signed in to change notification settings - Fork 912
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Fix arrow-based round trip of empty dataframes (#15373)
Previously, materializing a range index did not create the correct metadata; this change creates it correctly. While here, it also tidies up a few corner cases around creating range indices when constructing empty data frames. - Closes #12243 - Closes #14159 Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: #15373
- Loading branch information
Showing
4 changed files
with
115 additions
and
41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,58 @@ | ||
# Copyright (c) 2023, NVIDIA CORPORATION. | ||
# Copyright (c) 2023-2024, NVIDIA CORPORATION. | ||
import contextlib | ||
from io import BytesIO | ||
|
||
import pandas as pd | ||
import pyarrow as pa | ||
import pyarrow.parquet as pq | ||
import pytest | ||
|
||
import cudf | ||
from cudf.testing._utils import assert_eq | ||
|
||
|
||
@pytest.mark.parametrize(
    "index",
    [range(1, 11), list(range(1, 11)), range(1, 11)[::2]],
    ids=["RangeIndex", "IntIndex", "StridedRange"],
)
@pytest.mark.parametrize("write_index", [False, True, None])
@pytest.mark.parametrize("empty", [False, True], ids=["nonempty", "empty"])
def test_dataframe_parquet_roundtrip(index, write_index, empty):
    """Write matching cudf and pandas frames to parquet and compare them.

    Two things are compared: the pandas metadata stored in each parquet
    schema, and the frames obtained by reading each buffer back with cudf.
    """
    data = {} if empty else {"a": [2 * i for i in index]}
    gdf = cudf.DataFrame(data=data, index=index)
    pdf = pd.DataFrame(data=data, index=index)

    gpu_buf, cpu_buf = BytesIO(), BytesIO()
    gdf.to_parquet(gpu_buf, index=write_index)
    pdf.to_parquet(cpu_buf, index=write_index)

    gpu_meta = pq.read_table(gpu_buf).schema.pandas_metadata
    cpu_meta = pq.read_table(cpu_buf).schema.pandas_metadata

    if not empty or write_index is False:
        expectation = contextlib.nullcontext()
    else:
        # Empty frames written with an index are expected to disagree:
        # https://github.com/rapidsai/cudf/issues/15372
        expectation = pytest.raises(AssertionError)

    # The same expectation applies to both the metadata and the data check.
    with expectation:
        assert gpu_meta == cpu_meta

    from_gpu = cudf.read_parquet(gpu_buf)
    from_cpu = cudf.read_parquet(cpu_buf)
    with expectation:
        assert_eq(from_gpu, from_cpu)
|
||
|
||
@pytest.mark.parametrize("preserve_index", [False, True, None])
def test_dataframe_to_arrow_preserve_index(preserve_index):
    """Check that cudf's ``to_arrow`` schema matches pyarrow's ``from_pandas``.

    The same ``preserve_index`` value is passed to both conversions.
    """
    gdf = cudf.DataFrame({"x": ["cat", "dog"] * 5})
    pandas_table = pa.Table.from_pandas(
        gdf.to_pandas(), preserve_index=preserve_index
    )
    cudf_table = gdf.to_arrow(preserve_index=preserve_index)
    assert pandas_table.schema == cudf_table.schema
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters