Skip to content

Commit

Permalink
Default to Snappy compression in to_orc when using cuDF or Dask (#11690)
Browse files Browse the repository at this point in the history

Fix `to_orc` defaults for the compression type in cuDF and Dask. Aligns the default to the libcudf default (and to the Parquet default).

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #11690
  • Loading branch information
vuule authored Sep 13, 2022
1 parent 7b0d597 commit 7e86a1b
Show file tree
Hide file tree
Showing 7 changed files with 11 additions and 11 deletions.
4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/orc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ cdef cudf_io_types.statistics_freq _get_orc_stat_freq(object statistics):

cpdef write_orc(table,
object path_or_buf,
object compression=None,
object compression="snappy",
object statistics="ROWGROUP",
object stripe_size_bytes=None,
object stripe_size_rows=None,
Expand Down Expand Up @@ -381,7 +381,7 @@ cdef class ORCWriter:
def __cinit__(self,
object path,
object index=None,
object compression=None,
object compression="snappy",
object statistics="ROWGROUP",
object cols_as_map_type=None):

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -487,7 +487,7 @@ cdef class ParquetWriter:
cdef size_type max_page_size_rows

def __cinit__(self, object filepath_or_buffer, object index=None,
object compression=None, str statistics="ROWGROUP",
object compression="snappy", str statistics="ROWGROUP",
int row_group_size_bytes=134217728,
int row_group_size_rows=1000000,
int max_page_size_bytes=524288,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6012,7 +6012,7 @@ def to_csv(
)

@ioutils.doc_to_orc()
def to_orc(self, fname, compression=None, *args, **kwargs):
def to_orc(self, fname, compression="snappy", *args, **kwargs):
"""{docstring}"""
from cudf.io import orc

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,7 @@ def read_orc_stripe(orc_file, stripe, columns):
def to_orc(
df,
fname,
compression=None,
compression="snappy",
statistics="ROWGROUP",
stripe_size_bytes=None,
stripe_size_rows=None,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/io/parquet.py
Original file line number Diff line number Diff line change
Expand Up @@ -911,7 +911,7 @@ def __init__(
path,
partition_cols,
index=None,
compression=None,
compression="snappy",
statistics="ROWGROUP",
max_file_size=None,
file_name_prefix=None,
Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@
----------
fname : str
File path or object where the ORC dataset will be stored.
compression : {{ 'snappy', 'ZLIB', 'ZSTD', None }}, default None
compression : {{ 'snappy', 'ZLIB', 'ZSTD', None }}, default 'snappy'
Name of the compression to use. Use None for no compression.
enable_statistics: boolean, default True
Enable writing column statistics.
Expand Down Expand Up @@ -1013,10 +1013,10 @@
line_terminator : char, default '\\n'
chunksize : int or None, default None
Rows to write at a time
encoding: str, default 'utf-8'
encoding : str, default 'utf-8'
A string representing the encoding to use in the output file
Only ‘utf-8’ is currently supported
compression: str, None
compression : str, None
A string representing the compression scheme to use in the output file
Compression while writing csv is not supported currently
Returns
Expand Down
4 changes: 2 additions & 2 deletions python/dask_cudf/dask_cudf/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs):
return dd.core.new_dd_object(dsk, name, meta, divisions)


def write_orc_partition(df, path, fs, filename, compression=None):
def write_orc_partition(df, path, fs, filename, compression="snappy"):
full_path = fs.sep.join([path, filename])
with fs.open(full_path, mode="wb") as out_file:
if not isinstance(out_file, IOBase):
Expand All @@ -129,7 +129,7 @@ def to_orc(
path,
write_index=True,
storage_options=None,
compression=None,
compression="snappy",
compute=True,
**kwargs,
):
Expand Down

0 comments on commit 7e86a1b

Please sign in to comment.