Skip to content

Commit

Permalink
Use decimal precision metadata when reading from parquet files (#9162)
Browse files Browse the repository at this point in the history
Closes #8354.

Authors:
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #9162
  • Loading branch information
shwina authored Sep 2, 2021
1 parent cd4c8c7 commit 858944b
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 16 deletions.
23 changes: 7 additions & 16 deletions python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -185,22 +185,13 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,

update_struct_field_names(df, c_out_table.metadata.schema_info)

if df.empty and meta is not None:
cols_dtype_map = {}
for col in meta['columns']:
cols_dtype_map[col['name']] = col['numpy_type']

if not column_names:
column_names = [o['name'] for o in meta['columns']]
if not is_range_index and index_col in cols_dtype_map:
column_names.remove(index_col)

for col in column_names:
meta_dtype = cols_dtype_map.get(col, None)
df._data[col] = cudf.core.column.column_empty(
row_count=0,
dtype=cudf.dtype(meta_dtype)
)
# update the decimal precision of each column
if meta is not None:
for col, col_meta in zip(column_names, meta["columns"]):
if isinstance(df._data[col].dtype, cudf.Decimal64Dtype):
df._data[col].dtype.precision = (
col_meta["metadata"]["precision"]
)

# Set the index column
if index_col is not None and len(index_col) > 0:
Expand Down
26 changes: 26 additions & 0 deletions python/cudf/cudf/tests/test_parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -2047,3 +2047,29 @@ def test_parquet_writer_nulls_pandas_read(tmpdir, pdf):
got = pd.read_parquet(fname)
nullable = True if num_rows > 0 else False
assert_eq(gdf.to_pandas(nullable=nullable), got)


def test_parquet_decimal_precision(tmpdir):
    """Decimal precision must survive a parquet write/read round trip."""
    decimal_dtype = cudf.Decimal64Dtype(5, 2)
    expected = cudf.DataFrame({"val": ["3.5", "4.2"]}).astype(decimal_dtype)
    # Sanity check: the cast itself carries the requested precision.
    assert expected.val.dtype.precision == 5

    path = tmpdir.join("decimal_test.parquet")
    expected.to_parquet(path)

    # The frame read back must report the same precision that was written.
    got = cudf.read_parquet(path)
    assert got.val.dtype.precision == 5


def test_parquet_decimal_precision_empty(tmpdir):
    """Decimal precision must survive a round trip for a zero-row frame."""
    populated = cudf.DataFrame({"val": ["3.5", "4.2"]})
    typed = populated.astype(cudf.Decimal64Dtype(5, 2))
    # Slice away every row while keeping the decimal column and its dtype.
    expected = typed.iloc[:0]
    assert expected.val.dtype.precision == 5

    path = tmpdir.join("decimal_test.parquet")
    expected.to_parquet(path)

    # The frame read back must report the same precision that was written.
    got = cudf.read_parquet(path)
    assert got.val.dtype.precision == 5

0 comments on commit 858944b

Please sign in to comment.