[REVIEW] Remove kwargs in read_csv & to_csv #11762

Merged
merged 6 commits into from Sep 27, 2022
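In effect, the PR replaces the old catch-all **kwargs in read_csv and to_csv with explicit storage_options (and, for read_csv, bytes_per_thread) keyword arguments. A minimal sketch of the user-facing difference, assuming an illustrative public S3 path (bucket and file names are hypothetical):

import cudf

# Before this change, fsspec options travelled through **kwargs, so a
# misspelled keyword such as storage_optons={...} was silently swallowed.
# With an explicit keyword argument, the same typo raises a TypeError.
df = cudf.read_csv(
    "s3://bucket/data.csv",          # hypothetical path
    storage_options={"anon": True},  # forwarded to fsspec.open
)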
8 changes: 4 additions & 4 deletions python/cudf/cudf/core/dataframe.py
@@ -5987,11 +5987,11 @@ def to_csv(
         columns=None,
         header=True,
         index=True,
-        line_terminator="\n",
-        chunksize=None,
         encoding=None,
         compression=None,
-        **kwargs,
+        line_terminator="\n",
+        chunksize=None,
+        storage_options=None,
     ):
         """{docstring}"""
         from cudf.io import csv
@@ -6008,7 +6008,7 @@ def to_csv(
             chunksize=chunksize,
             encoding=encoding,
             compression=compression,
-            **kwargs,
+            storage_options=storage_options,
         )

     @ioutils.doc_to_orc()
66 changes: 38 additions & 28 deletions python/cudf/cudf/io/csv.py
@@ -18,45 +18,52 @@
 @ioutils.doc_read_csv()
 def read_csv(
     filepath_or_buffer,
-    lineterminator="\n",
-    quotechar='"',
-    quoting=0,
-    doublequote=True,
-    header="infer",
-    mangle_dupe_cols=True,
-    usecols=None,
     sep=",",
     delimiter=None,
-    delim_whitespace=False,
-    skipinitialspace=False,
+    header="infer",
     names=None,
+    index_col=None,
+    usecols=None,
+    prefix=None,
+    mangle_dupe_cols=True,
     dtype=None,
-    skipfooter=0,
-    skiprows=0,
-    dayfirst=False,
-    compression="infer",
-    thousands=None,
-    decimal=".",
     true_values=None,
     false_values=None,
+    skipinitialspace=False,
+    skiprows=0,
+    skipfooter=0,
     nrows=None,
-    byte_range=None,
-    skip_blank_lines=True,
-    parse_dates=None,
-    comment=None,
     na_values=None,
     keep_default_na=True,
     na_filter=True,
-    prefix=None,
-    index_col=None,
+    skip_blank_lines=True,
+    parse_dates=None,
+    dayfirst=False,
+    compression="infer",
+    thousands=None,
+    decimal=".",
+    lineterminator="\n",
+    quotechar='"',
+    quoting=0,
+    doublequote=True,
+    comment=None,
+    delim_whitespace=False,
+    byte_range=None,
     use_python_file_object=True,
-    **kwargs,
+    storage_options=None,
+    bytes_per_thread=None,
 ):
     """{docstring}"""

+    if use_python_file_object and bytes_per_thread is not None:
+        raise ValueError(
+            "bytes_per_thread is only supported when "
+            "`use_python_file_object=False`"
+        )
+
     is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
         path_or_data=filepath_or_buffer,
-        **kwargs,
+        storage_options=storage_options,
     )
     if not is_single_filepath_or_buffer:
         raise NotImplementedError(
@@ -68,7 +75,10 @@ def read_csv(
         compression=compression,
         iotypes=(BytesIO, StringIO, NativeFile),
         use_python_file_object=use_python_file_object,
-        **kwargs,
+        storage_options=storage_options,
+        bytes_per_thread=256_000_000
+        if bytes_per_thread is None
+        else bytes_per_thread,
     )

     if na_values is not None and is_scalar(na_values):
@@ -142,11 +152,11 @@ def to_csv(
     columns=None,
     header=True,
     index=True,
-    line_terminator="\n",
-    chunksize=None,
     encoding=None,
     compression=None,
-    **kwargs,
+    line_terminator="\n",
+    chunksize=None,
+    storage_options=None,
 ):
     """{docstring}"""
@@ -172,7 +182,7 @@ def to_csv(
         return_as_string = True

     path_or_buf = ioutils.get_writer_filepath_or_buffer(
-        path_or_data=path_or_buf, mode="w", **kwargs
+        path_or_data=path_or_buf, mode="w", storage_options=storage_options
     )

     if columns is not None:
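The new guard at the top of read_csv makes the two remote-IO knobs mutually exclusive: bytes_per_thread only affects fsspec AbstractBufferedFile reads, so it is now rejected when Arrow PythonFile objects are in use. A sketch of both sides of the guard, with hypothetical paths:

import cudf

# Invalid after this PR: bytes_per_thread has no effect on PythonFile reads,
# so the combination raises instead of being silently ignored.
try:
    cudf.read_csv(
        "s3://bucket/data.csv",
        use_python_file_object=True,
        bytes_per_thread=64_000_000,
    )
except ValueError as err:
    print(err)  # bytes_per_thread is only supported when `use_python_file_object=False`

# Valid: fall back to fsspec buffered files and transfer ~64 MB blocks per thread.
df = cudf.read_csv(
    "s3://bucket/data.csv",
    use_python_file_object=False,
    bytes_per_thread=64_000_000,
)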
5 changes: 3 additions & 2 deletions python/cudf/cudf/tests/test_s3.py
@@ -151,7 +151,6 @@ def test_read_csv(s3_base, s3so, pdf, bytes_per_thread):
     got = cudf.read_csv(
         f"s3://{bucket}/{fname}",
         storage_options=s3so,
-        bytes_per_thread=bytes_per_thread,
         use_python_file_object=True,
     )
     assert_eq(pdf, got)
@@ -188,7 +187,9 @@ def test_read_csv_byte_range(
         f"s3://{bucket}/{fname}",
         storage_options=s3so,
         byte_range=(74, 73),
-        bytes_per_thread=bytes_per_thread,
+        bytes_per_thread=bytes_per_thread
+        if not use_python_file_object
+        else None,
         header=None,
         names=["Integer", "Float", "Integer2", "String", "Boolean"],
         use_python_file_object=use_python_file_object,
154 changes: 88 additions & 66 deletions python/cudf/cudf/utils/ioutils.py
@@ -842,79 +842,43 @@
     Delimiter to be used.
 delimiter : char, default None
     Alternative argument name for sep.
-delim_whitespace : bool, default False
-    Determines whether to use whitespace as delimiter.
-lineterminator : char, default '\\n'
-    Character to indicate end of line.
-skipinitialspace : bool, default False
-    Skip spaces after delimiter.
-names : list of str, default None
-    List of column names to be used.
-dtype : type, str, list of types, or dict of column -> type, default None
-    Data type(s) for data or columns. If `dtype` is a type/str, all columns
-    are mapped to the particular type passed. If list, types are applied in
-    the same order as the column names. If dict, types are mapped to the
-    column names.
-    E.g. {{‘a’: np.float64, ‘b’: int32, ‘c’: ‘float’}}
-    If `None`, dtypes are inferred from the dataset. Use `str` to preserve data
-    and not infer or interpret to dtype.
-quotechar : char, default '"'
-    Character to indicate start and end of quote item.
-quoting : str or int, default 0
-    Controls quoting behavior. Set to one of
-    0 (csv.QUOTE_MINIMAL), 1 (csv.QUOTE_ALL),
-    2 (csv.QUOTE_NONNUMERIC) or 3 (csv.QUOTE_NONE).
-    Quoting is enabled with all values except 3.
-doublequote : bool, default True
-    When quoting is enabled, indicates whether to interpret two
-    consecutive quotechar inside fields as single quotechar
 header : int, default 'infer'
     Row number to use as the column names. Default behavior is to infer
     the column names: if no names are passed, header=0;
     if column names are passed explicitly, header=None.
+names : list of str, default None
+    List of column names to be used.
+index_col : int, string or False, default None
+    Column to use as the row labels of the DataFrame. Passing `index_col=False`
+    explicitly disables index column inference and discards the last column.
 usecols : list of int or str, default None
     Returns subset of the columns given in the list. All elements must be
     either integer indices (column number) or strings that correspond to
     column names
+prefix : str, default None
+    Prefix to add to column numbers when parsing without a header row
 mangle_dupe_cols : boolean, default True
     Duplicate columns will be specified as 'X','X.1',...'X.N'.
-skiprows : int, default 0
-    Number of rows to be skipped from the start of file.
-skipfooter : int, default 0
-    Number of rows to be skipped at the bottom of file.
-compression : {{'infer', 'gzip', 'zip', None}}, default 'infer'
-    For on-the-fly decompression of on-disk data. If ‘infer’, then detect
-    compression from the following extensions: ‘.gz’,‘.zip’ (otherwise no
-    decompression). If using ‘zip’, the ZIP file must contain only one
-    data file to be read in, otherwise the first non-zero-sized file will
-    be used. Set to None for no decompression.
-decimal : char, default '.'
-    Character used as a decimal point.
-thousands : char, default None
-    Character used as a thousands delimiter.
+dtype : type, str, list of types, or dict of column -> type, default None
+    Data type(s) for data or columns. If `dtype` is a type/str, all columns
+    are mapped to the particular type passed. If list, types are applied in
+    the same order as the column names. If dict, types are mapped to the
+    column names.
+    E.g. {{‘a’: np.float64, ‘b’: int32, ‘c’: ‘float’}}
+    If `None`, dtypes are inferred from the dataset. Use `str` to preserve data
+    and not infer or interpret to dtype.
 true_values : list, default None
     Values to consider as boolean True
 false_values : list, default None
     Values to consider as boolean False
+skipinitialspace : bool, default False
+    Skip spaces after delimiter.
+skiprows : int, default 0
+    Number of rows to be skipped from the start of file.
+skipfooter : int, default 0
+    Number of rows to be skipped at the bottom of file.
 nrows : int, default None
     If specified, maximum number of rows to read
-byte_range : list or tuple, default None
-    Byte range within the input file to be read. The first number is the
-    offset in bytes, the second number is the range size in bytes. Set the
-    size to zero to read all data after the offset location. Reads the row
-    that starts before or at the end of the range, even if it ends after
-    the end of the range.
-skip_blank_lines : bool, default True
-    If True, discard and do not parse empty lines
-    If False, interpret empty lines as NaN values
-parse_dates : list of int or names, default None
-    If list of columns, then attempt to parse each entry as a date.
-    Columns may not always be recognized as dates, for instance due to
-    unusual or non-standard formats. To guarantee a date and increase parsing
-    speed, explicitly specify `dtype='date'` for the desired columns.
-comment : char, default None
-    Character used as a comments indicator. If found at the beginning of a
-    line, the line will be ignored altogether.
 na_values : scalar, str, or list-like, optional
     Additional strings to recognize as nulls.
     By default the following values are interpreted as
@@ -927,16 +891,67 @@
 na_filter : bool, default True
     Detect missing values (empty strings and the values in na_values).
     Passing False can improve performance.
-prefix : str, default None
-    Prefix to add to column numbers when parsing without a header row
-index_col : int, string or False, default None
-    Column to use as the row labels of the DataFrame. Passing `index_col=False`
-    explicitly disables index column inference and discards the last column.
+skip_blank_lines : bool, default True
+    If True, discard and do not parse empty lines
+    If False, interpret empty lines as NaN values
+parse_dates : list of int or names, default None
+    If list of columns, then attempt to parse each entry as a date.
+    Columns may not always be recognized as dates, for instance due to
+    unusual or non-standard formats. To guarantee a date and increase parsing
+    speed, explicitly specify `dtype='date'` for the desired columns.
 dayfirst : bool, default False
     DD/MM format dates, international and European format.
+compression : {{'infer', 'gzip', 'zip', None}}, default 'infer'
+    For on-the-fly decompression of on-disk data. If ‘infer’, then detect
+    compression from the following extensions: ‘.gz’,‘.zip’ (otherwise no
+    decompression). If using ‘zip’, the ZIP file must contain only one
+    data file to be read in, otherwise the first non-zero-sized file will
+    be used. Set to None for no decompression.
+thousands : char, default None
+    Character used as a thousands delimiter.
+decimal : char, default '.'
+    Character used as a decimal point.
+lineterminator : char, default '\\n'
+    Character to indicate end of line.
+quotechar : char, default '"'
+    Character to indicate start and end of quote item.
+quoting : str or int, default 0
+    Controls quoting behavior. Set to one of
+    0 (csv.QUOTE_MINIMAL), 1 (csv.QUOTE_ALL),
+    2 (csv.QUOTE_NONNUMERIC) or 3 (csv.QUOTE_NONE).
+    Quoting is enabled with all values except 3.
+doublequote : bool, default True
+    When quoting is enabled, indicates whether to interpret two
+    consecutive quotechar inside fields as single quotechar
+comment : char, default None
+    Character used as a comments indicator. If found at the beginning of a
+    line, the line will be ignored altogether.
+delim_whitespace : bool, default False
+    Determines whether to use whitespace as delimiter.
+byte_range : list or tuple, default None
+    Byte range within the input file to be read. The first number is the
+    offset in bytes, the second number is the range size in bytes. Set the
+    size to zero to read all data after the offset location. Reads the row
+    that starts before or at the end of the range, even if it ends after
+    the end of the range.
 use_python_file_object : boolean, default True
     If True, Arrow-backed PythonFile objects will be used in place of fsspec
     AbstractBufferedFile objects at IO time. This option is likely to improve
     performance when making small reads from larger CSV files.
+storage_options : dict, optional, default None
+    Extra options that make sense for a particular storage connection,
+    e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value
+    pairs are forwarded to ``urllib.request.Request`` as header options.
+    For other URLs (e.g. starting with “s3://”, and “gcs://”) the key-value
+    pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and
+    ``urllib`` for more details.
+bytes_per_thread : int, default None
+    Determines the number of bytes to be allocated per thread to read the
+    files in parallel. When there is a file of large size, we get slightly
+    better throughput by decomposing it and transferring multiple "blocks"
+    in parallel (using a python thread pool). Default allocation is
+    256_000_000 bytes.
+    This parameter is functional only when `use_python_file_object=False`.
 Returns
 -------
 GPU ``DataFrame`` object.
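The byte_range semantics documented above are easy to misread, so a small sketch may help; the file name is illustrative, and header=None avoids treating the first in-range row as column names:

import cudf

# Parse rows that start within bytes [100, 100 + 500) of the file. The row
# beginning before byte 600 is read to completion even if it ends past the
# range; a size of 0 would read everything after the offset.
df = cudf.read_csv("data.csv", byte_range=(100, 500), header=None)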
@@ -1010,15 +1025,22 @@
     Write out the column names
 index : bool, default True
     Write out the index as a column
-line_terminator : char, default '\\n'
-chunksize : int or None, default None
-    Rows to write at a time
 encoding : str, default 'utf-8'
     A string representing the encoding to use in the output file
     Only ‘utf-8’ is currently supported
 compression : str, None
     A string representing the compression scheme to use in the output file
     Compression while writing csv is not supported currently
+line_terminator : char, default '\\n'
+chunksize : int or None, default None
+    Rows to write at a time
+storage_options : dict, optional, default None
+    Extra options that make sense for a particular storage connection,
+    e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value
+    pairs are forwarded to ``urllib.request.Request`` as header options.
+    For other URLs (e.g. starting with “s3://”, and “gcs://”) the key-value
+    pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and
+    ``urllib`` for more details.
 Returns
 -------
 None or str
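On the writer side, a short sketch of the now-explicit storage_options and of the None-or-str return contract (the bucket path is hypothetical):

import cudf

df = cudf.DataFrame({"a": [1, 2], "b": ["x", "y"]})

# storage_options is forwarded to fsspec.open for remote paths.
df.to_csv("s3://bucket/out.csv", index=False, storage_options={"anon": False})

# With path_or_buf left as None, to_csv returns the CSV text as a str.
text = df.to_csv(index=False)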
Expand Down