From 7e86a1b73d357e83b2d0bb166ac3a0fdbf77fa99 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Tue, 13 Sep 2022 12:25:35 -0700
Subject: [PATCH] Default to Snappy compression in `to_orc` when using cuDF or
 Dask (#11690)

Fix `to_orc` defaults for the compression type in cuDF and Dask.
Aligns the default to the libcudf default (and to the Parquet default).

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/11690
---
 python/cudf/cudf/_lib/orc.pyx        | 4 ++--
 python/cudf/cudf/_lib/parquet.pyx    | 2 +-
 python/cudf/cudf/core/dataframe.py   | 2 +-
 python/cudf/cudf/io/orc.py           | 2 +-
 python/cudf/cudf/io/parquet.py       | 2 +-
 python/cudf/cudf/utils/ioutils.py    | 6 +++---
 python/dask_cudf/dask_cudf/io/orc.py | 4 ++--
 7 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index 1c9f388873c..be7b29da515 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -242,7 +242,7 @@ cdef cudf_io_types.statistics_freq _get_orc_stat_freq(object statistics):

 cpdef write_orc(table,
                 object path_or_buf,
-                object compression=None,
+                object compression="snappy",
                 object statistics="ROWGROUP",
                 object stripe_size_bytes=None,
                 object stripe_size_rows=None,
@@ -381,7 +381,7 @@ cdef class ORCWriter:
     def __cinit__(self,
                   object path,
                   object index=None,
-                  object compression=None,
+                  object compression="snappy",
                   object statistics="ROWGROUP",
                   object cols_as_map_type=None):
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 3c8e78bd87a..891f259a828 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -487,7 +487,7 @@ cdef class ParquetWriter:
     cdef size_type max_page_size_rows

     def __cinit__(self, object filepath_or_buffer, object index=None,
-                  object compression=None, str statistics="ROWGROUP",
+                  object compression="snappy", str statistics="ROWGROUP",
                   int row_group_size_bytes=134217728,
                   int row_group_size_rows=1000000,
                   int max_page_size_bytes=524288,
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index c07a88e9396..2f1695e4445 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6012,7 +6012,7 @@ def to_csv(
         )

     @ioutils.doc_to_orc()
-    def to_orc(self, fname, compression=None, *args, **kwargs):
+    def to_orc(self, fname, compression="snappy", *args, **kwargs):
         """{docstring}"""
         from cudf.io import orc

diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index 378cb25fafb..718b9c4144f 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -407,7 +407,7 @@ def read_orc_stripe(orc_file, stripe, columns):
 def to_orc(
     df,
     fname,
-    compression=None,
+    compression="snappy",
     statistics="ROWGROUP",
     stripe_size_bytes=None,
     stripe_size_rows=None,
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 4fab657d9a0..7ac391c5f3d 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -911,7 +911,7 @@ def __init__(
         path,
         partition_cols,
         index=None,
-        compression=None,
+        compression="snappy",
         statistics="ROWGROUP",
         max_file_size=None,
         file_name_prefix=None,
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 19815c7c506..fb1b0235822 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -429,7 +429,7 @@
 ----------
 fname : str
     File path or object where the ORC dataset will be stored.
-compression : {{ 'snappy', 'ZLIB', 'ZSTD', None }}, default None
+compression : {{ 'snappy', 'ZLIB', 'ZSTD', None }}, default 'snappy'
     Name of the compression to use. Use None for no compression.
 enable_statistics: boolean, default True
     Enable writing column statistics.
@@ -1013,10 +1013,10 @@
 line_terminator : char, default '\\n'
 chunksize : int or None, default None
     Rows to write at a time
-encoding: str, default 'utf-8'
+encoding : str, default 'utf-8'
     A string representing the encoding to use in the output file
     Only ‘utf-8’ is currently supported
-compression: str, None
+compression : str, None
     A string representing the compression scheme to use in the the output file
     Compression while writing csv is not supported currently
 Returns
diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py
index f5df0e261c9..e731057ed90 100644
--- a/python/dask_cudf/dask_cudf/io/orc.py
+++ b/python/dask_cudf/dask_cudf/io/orc.py
@@ -115,7 +115,7 @@ def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs):
     return dd.core.new_dd_object(dsk, name, meta, divisions)


-def write_orc_partition(df, path, fs, filename, compression=None):
+def write_orc_partition(df, path, fs, filename, compression="snappy"):
     full_path = fs.sep.join([path, filename])
     with fs.open(full_path, mode="wb") as out_file:
         if not isinstance(out_file, IOBase):
@@ -129,7 +129,7 @@ def to_orc(
     path,
     write_index=True,
     storage_options=None,
-    compression=None,
+    compression="snappy",
     compute=True,
     **kwargs,
 ):
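As a quick, hedged illustration of what the new defaults mean for callers (this sketch is
not part of the patch): the output paths below are made up, and the Dask example assumes a
cuDF/Dask-cuDF build that includes this change and the usual dask_cudf DataFrame.to_orc
wrapper around the dask_cudf.io.orc.to_orc function shown in the diff above.

# Illustrative only -- not part of the patch.
import cudf
import dask_cudf

df = cudf.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# With this change, omitting `compression` writes Snappy-compressed ORC,
# matching the libcudf default and the Parquet writer default.
df.to_orc("example.orc")

# Uncompressed output must now be requested explicitly.
df.to_orc("example_uncompressed.orc", compression=None)

# The Dask-cuDF writer applies the same default to every partition it writes.
ddf = dask_cudf.from_cudf(df, npartitions=2)
ddf.to_orc("orc_dataset")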