Skip to content

Commit

Permalink
Default to Snappy compression in to_orc when using cuDF or Dask (#11690)
Browse files Browse the repository at this point in the history

Fix `to_orc` defaults for the compression type in cuDF and Dask. Aligns the default to the libcudf default (and to the Parquet default).

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #11690
  • Loading branch information
vuule authored Sep 13, 2022
1 parent 7b0d597 commit 7e86a1b
Show file tree
Hide file tree
Showing 7 changed files with 11 additions and 11 deletions.
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/orc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ cdef cudf_io_types.statistics_freq _get_orc_stat_freq(object statistics):

cpdef write_orc(table,
object path_or_buf,
object compression=None,
object compression="snappy",
object statistics="ROWGROUP",
object stripe_size_bytes=None,
object stripe_size_rows=None,
Expand Down Expand Up @@ -381,7 +381,7 @@ cdef class ORCWriter:
def __cinit__(self,
object path,
object index=None,
object compression=None,
object compression="snappy",
object statistics="ROWGROUP",
object cols_as_map_type=None):

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -487,7 +487,7 @@ cdef class ParquetWriter:
cdef size_type max_page_size_rows

def __cinit__(self, object filepath_or_buffer, object index=None,
object compression=None, str statistics="ROWGROUP",
object compression="snappy", str statistics="ROWGROUP",
int row_group_size_bytes=134217728,
int row_group_size_rows=1000000,
int max_page_size_bytes=524288,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6012,7 +6012,7 @@ def to_csv(
)

@ioutils.doc_to_orc()
def to_orc(self, fname, compression=None, *args, **kwargs):
def to_orc(self, fname, compression="snappy", *args, **kwargs):
"""{docstring}"""
from cudf.io import orc

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ def read_orc_stripe(orc_file, stripe, columns):
def to_orc(
df,
fname,
compression=None,
compression="snappy",
statistics="ROWGROUP",
stripe_size_bytes=None,
stripe_size_rows=None,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -911,7 +911,7 @@ def __init__(
path,
partition_cols,
index=None,
compression=None,
compression="snappy",
statistics="ROWGROUP",
max_file_size=None,
file_name_prefix=None,
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@
----------
fname : str
File path or object where the ORC dataset will be stored.
compression : {{ 'snappy', 'ZLIB', 'ZSTD', None }}, default None
compression : {{ 'snappy', 'ZLIB', 'ZSTD', None }}, default 'snappy'
Name of the compression to use. Use None for no compression.
enable_statistics: boolean, default True
Enable writing column statistics.
Expand Down Expand Up @@ -1013,10 +1013,10 @@
line_terminator : char, default '\\n'
chunksize : int or None, default None
Rows to write at a time
encoding: str, default 'utf-8'
encoding : str, default 'utf-8'
A string representing the encoding to use in the output file
Only ‘utf-8’ is currently supported
compression: str, None
compression : str, None
A string representing the compression scheme to use in the output file
Compression while writing csv is not supported currently
Returns
Expand Down
4 changes: 2 additions & 2 deletions python/dask_cudf/dask_cudf/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs):
return dd.core.new_dd_object(dsk, name, meta, divisions)


def write_orc_partition(df, path, fs, filename, compression=None):
def write_orc_partition(df, path, fs, filename, compression="snappy"):
full_path = fs.sep.join([path, filename])
with fs.open(full_path, mode="wb") as out_file:
if not isinstance(out_file, IOBase):
Expand All @@ -129,7 +129,7 @@ def to_orc(
path,
write_index=True,
storage_options=None,
compression=None,
compression="snappy",
compute=True,
**kwargs,
):
Expand Down

0 comments on commit 7e86a1b

Please sign in to comment.