From 7e86a1b73d357e83b2d0bb166ac3a0fdbf77fa99 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Tue, 13 Sep 2022 12:25:35 -0700
Subject: [PATCH] Default to Snappy compression in `to_orc` when using cuDF or
 Dask (#11690)

Fix `to_orc` defaults for the compression type in cuDF and Dask.
Aligns the default to the libcudf default (and to the Parquet default).

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/11690
---
 python/cudf/cudf/_lib/orc.pyx        | 4 ++--
 python/cudf/cudf/_lib/parquet.pyx    | 2 +-
 python/cudf/cudf/core/dataframe.py   | 2 +-
 python/cudf/cudf/io/orc.py           | 2 +-
 python/cudf/cudf/io/parquet.py       | 2 +-
 python/cudf/cudf/utils/ioutils.py    | 6 +++---
 python/dask_cudf/dask_cudf/io/orc.py | 4 ++--
 7 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index 1c9f388873c..be7b29da515 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -242,7 +242,7 @@ cdef cudf_io_types.statistics_freq _get_orc_stat_freq(object statistics):

 cpdef write_orc(table,
                 object path_or_buf,
-                object compression=None,
+                object compression="snappy",
                 object statistics="ROWGROUP",
                 object stripe_size_bytes=None,
                 object stripe_size_rows=None,
@@ -381,7 +381,7 @@ cdef class ORCWriter:
     def __cinit__(self,
                   object path,
                   object index=None,
-                  object compression=None,
+                  object compression="snappy",
                   object statistics="ROWGROUP",
                   object cols_as_map_type=None):
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 3c8e78bd87a..891f259a828 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -487,7 +487,7 @@ cdef class ParquetWriter:
     cdef size_type max_page_size_rows

     def __cinit__(self, object filepath_or_buffer, object index=None,
-                  object compression=None, str statistics="ROWGROUP",
+                  object compression="snappy", str statistics="ROWGROUP",
                   int row_group_size_bytes=134217728,
                   int row_group_size_rows=1000000,
                   int max_page_size_bytes=524288,
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index c07a88e9396..2f1695e4445 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6012,7 +6012,7 @@ def to_csv(
         )

     @ioutils.doc_to_orc()
-    def to_orc(self, fname, compression=None, *args, **kwargs):
+    def to_orc(self, fname, compression="snappy", *args, **kwargs):
         """{docstring}"""
         from cudf.io import orc

diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index 378cb25fafb..718b9c4144f 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -407,7 +407,7 @@ def read_orc_stripe(orc_file, stripe, columns):
 def to_orc(
     df,
     fname,
-    compression=None,
+    compression="snappy",
     statistics="ROWGROUP",
     stripe_size_bytes=None,
     stripe_size_rows=None,
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 4fab657d9a0..7ac391c5f3d 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -911,7 +911,7 @@ def __init__(
         path,
         partition_cols,
         index=None,
-        compression=None,
+        compression="snappy",
         statistics="ROWGROUP",
         max_file_size=None,
         file_name_prefix=None,
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 19815c7c506..fb1b0235822 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -429,7 +429,7 @@
 ----------
 fname : str
     File path or object where the ORC dataset will be stored.
-compression : {{ 'snappy', 'ZLIB', 'ZSTD', None }}, default None
+compression : {{ 'snappy', 'ZLIB', 'ZSTD', None }}, default 'snappy'
     Name of the compression to use. Use None for no compression.
 enable_statistics: boolean, default True
     Enable writing column statistics.
@@ -1013,10 +1013,10 @@
 line_terminator : char, default '\\n'
 chunksize : int or None, default None
     Rows to write at a time
-encoding: str, default 'utf-8'
+encoding : str, default 'utf-8'
     A string representing the encoding to use in the output file
     Only ‘utf-8’ is currently supported
-compression: str, None
+compression : str, None
     A string representing the compression scheme to use in the the output file
     Compression while writing csv is not supported currently
 Returns
diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py
index f5df0e261c9..e731057ed90 100644
--- a/python/dask_cudf/dask_cudf/io/orc.py
+++ b/python/dask_cudf/dask_cudf/io/orc.py
@@ -115,7 +115,7 @@ def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs):
     return dd.core.new_dd_object(dsk, name, meta, divisions)


-def write_orc_partition(df, path, fs, filename, compression=None):
+def write_orc_partition(df, path, fs, filename, compression="snappy"):
     full_path = fs.sep.join([path, filename])
     with fs.open(full_path, mode="wb") as out_file:
         if not isinstance(out_file, IOBase):
@@ -129,7 +129,7 @@ def to_orc(
     path,
     write_index=True,
     storage_options=None,
-    compression=None,
+    compression="snappy",
     compute=True,
     **kwargs,
 ):
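As a quick, hedged illustration of what the new defaults mean for callers (this sketch is
not part of the patch): the output paths below are made up, and the Dask example assumes a
cuDF/Dask-cuDF build that includes this change and the usual dask_cudf DataFrame.to_orc
wrapper around the dask_cudf.io.orc.to_orc function shown in the diff above.

# Illustrative only -- not part of the patch.
import cudf
import dask_cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# With this change, omitting `compression` writes Snappy-compressed ORC,
# matching the libcudf default and the Parquet writer default.
df.to_orc("example.orc")

# Uncompressed output must now be requested explicitly.
df.to_orc("example_uncompressed.orc", compression=None)

# The Dask-cuDF writer applies the same default to every partition it writes.
ddf = dask_cudf.from_cudf(df, npartitions=2)
ddf.to_orc("orc_dataset")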