[REVIEW] Remove kwargs in read_csv & to_csv #11762

Merged
merged 6 commits into from Sep 27, 2022
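In effect, the PR replaces the old catch-all **kwargs in read_csv and to_csv with explicit storage_options (and, for read_csv, bytes_per_thread) keyword arguments. A minimal sketch of the user-facing difference, assuming an illustrative public S3 path (bucket and file names are hypothetical):

import cudf

# Before this change, fsspec options travelled through **kwargs, so a
# misspelled keyword such as storage_optons={...} was silently swallowed.
# With an explicit keyword argument, the same typo raises a TypeError.
df = cudf.read_csv(
    "s3://bucket/data.csv",          # hypothetical path
    storage_options={"anon": True},  # forwarded to fsspec.open
)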
8 changes: 4 additions & 4 deletions python/cudf/cudf/core/dataframe.py
@@ -5987,11 +5987,11 @@ def to_csv(
         columns=None,
         header=True,
         index=True,
-        line_terminator="\n",
-        chunksize=None,
         encoding=None,
         compression=None,
-        **kwargs,
+        line_terminator="\n",
+        chunksize=None,
+        storage_options=None,
     ):
         """{docstring}"""
         from cudf.io import csv
@@ -6008,7 +6008,7 @@ def to_csv(
             chunksize=chunksize,
             encoding=encoding,
             compression=compression,
-            **kwargs,
+            storage_options=storage_options,
         )

     @ioutils.doc_to_orc()
66 changes: 38 additions & 28 deletions python/cudf/cudf/io/csv.py
@@ -18,45 +18,52 @@
 @ioutils.doc_read_csv()
 def read_csv(
     filepath_or_buffer,
-    lineterminator="\n",
-    quotechar='"',
-    quoting=0,
-    doublequote=True,
-    header="infer",
-    mangle_dupe_cols=True,
-    usecols=None,
     sep=",",
     delimiter=None,
-    delim_whitespace=False,
-    skipinitialspace=False,
+    header="infer",
     names=None,
+    index_col=None,
+    usecols=None,
+    prefix=None,
+    mangle_dupe_cols=True,
     dtype=None,
-    skipfooter=0,
-    skiprows=0,
-    dayfirst=False,
-    compression="infer",
-    thousands=None,
-    decimal=".",
     true_values=None,
     false_values=None,
+    skipinitialspace=False,
+    skiprows=0,
+    skipfooter=0,
     nrows=None,
-    byte_range=None,
-    skip_blank_lines=True,
-    parse_dates=None,
-    comment=None,
     na_values=None,
     keep_default_na=True,
     na_filter=True,
-    prefix=None,
-    index_col=None,
+    skip_blank_lines=True,
+    parse_dates=None,
+    dayfirst=False,
+    compression="infer",
+    thousands=None,
+    decimal=".",
+    lineterminator="\n",
+    quotechar='"',
+    quoting=0,
+    doublequote=True,
+    comment=None,
+    delim_whitespace=False,
+    byte_range=None,
     use_python_file_object=True,
-    **kwargs,
+    storage_options=None,
+    bytes_per_thread=None,
 ):
     """{docstring}"""

+    if use_python_file_object and bytes_per_thread is not None:
+        raise ValueError(
+            "bytes_per_thread is only supported when "
+            "`use_python_file_object=False`"
+        )
+
     is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
         path_or_data=filepath_or_buffer,
-        **kwargs,
+        storage_options=storage_options,
     )
     if not is_single_filepath_or_buffer:
         raise NotImplementedError(
@@ -68,7 +75,10 @@ def read_csv(
         compression=compression,
         iotypes=(BytesIO, StringIO, NativeFile),
         use_python_file_object=use_python_file_object,
-        **kwargs,
+        storage_options=storage_options,
+        bytes_per_thread=256_000_000
+        if bytes_per_thread is None
+        else bytes_per_thread,
     )

     if na_values is not None and is_scalar(na_values):
@@ -142,11 +152,11 @@ def to_csv(
     columns=None,
     header=True,
     index=True,
-    line_terminator="\n",
-    chunksize=None,
     encoding=None,
     compression=None,
-    **kwargs,
+    line_terminator="\n",
+    chunksize=None,
+    storage_options=None,
 ):
     """{docstring}"""
@@ -172,7 +182,7 @@ def to_csv(
         return_as_string = True

     path_or_buf = ioutils.get_writer_filepath_or_buffer(
-        path_or_data=path_or_buf, mode="w", **kwargs
+        path_or_data=path_or_buf, mode="w", storage_options=storage_options
     )

     if columns is not None:
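The new guard at the top of read_csv makes the two remote-IO knobs mutually exclusive: bytes_per_thread only affects fsspec AbstractBufferedFile reads, so it is now rejected when Arrow PythonFile objects are in use. A sketch of both sides of the guard, with hypothetical paths:

import cudf

# Invalid after this PR: bytes_per_thread has no effect on PythonFile reads,
# so the combination raises instead of being silently ignored.
try:
    cudf.read_csv(
        "s3://bucket/data.csv",
        use_python_file_object=True,
        bytes_per_thread=64_000_000,
    )
except ValueError as err:
    print(err)  # bytes_per_thread is only supported when `use_python_file_object=False`

# Valid: fall back to fsspec buffered files and transfer ~64 MB blocks per thread.
df = cudf.read_csv(
    "s3://bucket/data.csv",
    use_python_file_object=False,
    bytes_per_thread=64_000_000,
)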
5 changes: 3 additions & 2 deletions python/cudf/cudf/tests/test_s3.py
@@ -151,7 +151,6 @@ def test_read_csv(s3_base, s3so, pdf, bytes_per_thread):
     got = cudf.read_csv(
         f"s3://{bucket}/{fname}",
         storage_options=s3so,
-        bytes_per_thread=bytes_per_thread,
         use_python_file_object=True,
     )
     assert_eq(pdf, got)
@@ -188,7 +187,9 @@ def test_read_csv_byte_range(
         f"s3://{bucket}/{fname}",
         storage_options=s3so,
         byte_range=(74, 73),
-        bytes_per_thread=bytes_per_thread,
+        bytes_per_thread=bytes_per_thread
+        if not use_python_file_object
+        else None,
         header=None,
         names=["Integer", "Float", "Integer2", "String", "Boolean"],
         use_python_file_object=use_python_file_object,
154 changes: 88 additions & 66 deletions python/cudf/cudf/utils/ioutils.py
@@ -842,79 +842,43 @@
     Delimiter to be used.
 delimiter : char, default None
     Alternative argument name for sep.
-delim_whitespace : bool, default False
-    Determines whether to use whitespace as delimiter.
-lineterminator : char, default '\\n'
-    Character to indicate end of line.
-skipinitialspace : bool, default False
-    Skip spaces after delimiter.
-names : list of str, default None
-    List of column names to be used.
-dtype : type, str, list of types, or dict of column -> type, default None
-    Data type(s) for data or columns. If `dtype` is a type/str, all columns
-    are mapped to the particular type passed. If list, types are applied in
-    the same order as the column names. If dict, types are mapped to the
-    column names.
-    E.g. {{‘a’: np.float64, ‘b’: int32, ‘c’: ‘float’}}
-    If `None`, dtypes are inferred from the dataset. Use `str` to preserve data
-    and not infer or interpret to dtype.
-quotechar : char, default '"'
-    Character to indicate start and end of quote item.
-quoting : str or int, default 0
-    Controls quoting behavior. Set to one of
-    0 (csv.QUOTE_MINIMAL), 1 (csv.QUOTE_ALL),
-    2 (csv.QUOTE_NONNUMERIC) or 3 (csv.QUOTE_NONE).
-    Quoting is enabled with all values except 3.
-doublequote : bool, default True
-    When quoting is enabled, indicates whether to interpret two
-    consecutive quotechar inside fields as single quotechar
 header : int, default 'infer'
     Row number to use as the column names. Default behavior is to infer
     the column names: if no names are passed, header=0;
     if column names are passed explicitly, header=None.
+names : list of str, default None
+    List of column names to be used.
+index_col : int, string or False, default None
+    Column to use as the row labels of the DataFrame. Passing `index_col=False`
+    explicitly disables index column inference and discards the last column.
 usecols : list of int or str, default None
     Returns subset of the columns given in the list. All elements must be
     either integer indices (column number) or strings that correspond to
     column names
+prefix : str, default None
+    Prefix to add to column numbers when parsing without a header row
 mangle_dupe_cols : boolean, default True
     Duplicate columns will be specified as 'X','X.1',...'X.N'.
-skiprows : int, default 0
-    Number of rows to be skipped from the start of file.
-skipfooter : int, default 0
-    Number of rows to be skipped at the bottom of file.
-compression : {{'infer', 'gzip', 'zip', None}}, default 'infer'
-    For on-the-fly decompression of on-disk data. If ‘infer’, then detect
-    compression from the following extensions: ‘.gz’,‘.zip’ (otherwise no
-    decompression). If using ‘zip’, the ZIP file must contain only one
-    data file to be read in, otherwise the first non-zero-sized file will
-    be used. Set to None for no decompression.
-decimal : char, default '.'
-    Character used as a decimal point.
-thousands : char, default None
-    Character used as a thousands delimiter.
+dtype : type, str, list of types, or dict of column -> type, default None
+    Data type(s) for data or columns. If `dtype` is a type/str, all columns
+    are mapped to the particular type passed. If list, types are applied in
+    the same order as the column names. If dict, types are mapped to the
+    column names.
+    E.g. {{‘a’: np.float64, ‘b’: int32, ‘c’: ‘float’}}
+    If `None`, dtypes are inferred from the dataset. Use `str` to preserve data
+    and not infer or interpret to dtype.
 true_values : list, default None
     Values to consider as boolean True
 false_values : list, default None
     Values to consider as boolean False
+skipinitialspace : bool, default False
+    Skip spaces after delimiter.
+skiprows : int, default 0
+    Number of rows to be skipped from the start of file.
+skipfooter : int, default 0
+    Number of rows to be skipped at the bottom of file.
 nrows : int, default None
     If specified, maximum number of rows to read
-byte_range : list or tuple, default None
-    Byte range within the input file to be read. The first number is the
-    offset in bytes, the second number is the range size in bytes. Set the
-    size to zero to read all data after the offset location. Reads the row
-    that starts before or at the end of the range, even if it ends after
-    the end of the range.
-skip_blank_lines : bool, default True
-    If True, discard and do not parse empty lines
-    If False, interpret empty lines as NaN values
-parse_dates : list of int or names, default None
-    If list of columns, then attempt to parse each entry as a date.
-    Columns may not always be recognized as dates, for instance due to
-    unusual or non-standard formats. To guarantee a date and increase parsing
-    speed, explicitly specify `dtype='date'` for the desired columns.
-comment : char, default None
-    Character used as a comments indicator. If found at the beginning of a
-    line, the line will be ignored altogether.
 na_values : scalar, str, or list-like, optional
     Additional strings to recognize as nulls.
     By default the following values are interpreted as
@@ -927,16 +891,67 @@
 na_filter : bool, default True
     Detect missing values (empty strings and the values in na_values).
     Passing False can improve performance.
-prefix : str, default None
-    Prefix to add to column numbers when parsing without a header row
-index_col : int, string or False, default None
-    Column to use as the row labels of the DataFrame. Passing `index_col=False`
-    explicitly disables index column inference and discards the last column.
+skip_blank_lines : bool, default True
+    If True, discard and do not parse empty lines
+    If False, interpret empty lines as NaN values
+parse_dates : list of int or names, default None
+    If list of columns, then attempt to parse each entry as a date.
+    Columns may not always be recognized as dates, for instance due to
+    unusual or non-standard formats. To guarantee a date and increase parsing
+    speed, explicitly specify `dtype='date'` for the desired columns.
 dayfirst : bool, default False
     DD/MM format dates, international and European format.
+compression : {{'infer', 'gzip', 'zip', None}}, default 'infer'
+    For on-the-fly decompression of on-disk data. If ‘infer’, then detect
+    compression from the following extensions: ‘.gz’,‘.zip’ (otherwise no
+    decompression). If using ‘zip’, the ZIP file must contain only one
+    data file to be read in, otherwise the first non-zero-sized file will
+    be used. Set to None for no decompression.
+thousands : char, default None
+    Character used as a thousands delimiter.
+decimal : char, default '.'
+    Character used as a decimal point.
+lineterminator : char, default '\\n'
+    Character to indicate end of line.
+quotechar : char, default '"'
+    Character to indicate start and end of quote item.
+quoting : str or int, default 0
+    Controls quoting behavior. Set to one of
+    0 (csv.QUOTE_MINIMAL), 1 (csv.QUOTE_ALL),
+    2 (csv.QUOTE_NONNUMERIC) or 3 (csv.QUOTE_NONE).
+    Quoting is enabled with all values except 3.
+doublequote : bool, default True
+    When quoting is enabled, indicates whether to interpret two
+    consecutive quotechar inside fields as single quotechar
+comment : char, default None
+    Character used as a comments indicator. If found at the beginning of a
+    line, the line will be ignored altogether.
+delim_whitespace : bool, default False
+    Determines whether to use whitespace as delimiter.
+byte_range : list or tuple, default None
+    Byte range within the input file to be read. The first number is the
+    offset in bytes, the second number is the range size in bytes. Set the
+    size to zero to read all data after the offset location. Reads the row
+    that starts before or at the end of the range, even if it ends after
+    the end of the range.
 use_python_file_object : boolean, default True
     If True, Arrow-backed PythonFile objects will be used in place of fsspec
     AbstractBufferedFile objects at IO time. This option is likely to improve
     performance when making small reads from larger CSV files.
+storage_options : dict, optional, default None
+    Extra options that make sense for a particular storage connection,
+    e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value
+    pairs are forwarded to ``urllib.request.Request`` as header options.
+    For other URLs (e.g. starting with “s3://”, and “gcs://”) the key-value
+    pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and
+    ``urllib`` for more details.
+bytes_per_thread : int, default None
+    Determines the number of bytes to be allocated per thread to read the
+    files in parallel. When there is a file of large size, we get slightly
+    better throughput by decomposing it and transferring multiple "blocks"
+    in parallel (using a python thread pool). Default allocation is
+    256_000_000 bytes.
+    This parameter is functional only when `use_python_file_object=False`.
 Returns
 -------
 GPU ``DataFrame`` object.
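The byte_range semantics documented above are easy to misread, so a small sketch may help; the file name is illustrative, and header=None avoids treating the first in-range row as column names:

import cudf

# Parse rows that start within bytes [100, 100 + 500) of the file. The row
# beginning before byte 600 is read to completion even if it ends past the
# range; a size of 0 would read everything after the offset.
df = cudf.read_csv("data.csv", byte_range=(100, 500), header=None)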
@@ -1010,15 +1025,22 @@
     Write out the column names
 index : bool, default True
     Write out the index as a column
-line_terminator : char, default '\\n'
-chunksize : int or None, default None
-    Rows to write at a time
 encoding : str, default 'utf-8'
     A string representing the encoding to use in the output file
     Only ‘utf-8’ is currently supported
 compression : str, None
     A string representing the compression scheme to use in the output file
     Compression while writing csv is not supported currently
+line_terminator : char, default '\\n'
+chunksize : int or None, default None
+    Rows to write at a time
+storage_options : dict, optional, default None
+    Extra options that make sense for a particular storage connection,
+    e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value
+    pairs are forwarded to ``urllib.request.Request`` as header options.
+    For other URLs (e.g. starting with “s3://”, and “gcs://”) the key-value
+    pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and
+    ``urllib`` for more details.
 Returns
 -------
 None or str
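On the writer side, a short sketch of the now-explicit storage_options and of the None-or-str return contract (the bucket path is hypothetical):

import cudf

df = cudf.DataFrame({"a": [1, 2], "b": ["x", "y"]})

# storage_options is forwarded to fsspec.open for remote paths.
df.to_csv("s3://bucket/out.csv", index=False, storage_options={"anon": False})

# With path_or_buf left as None, to_csv returns the CSV text as a str.
text = df.to_csv(index=False)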
Expand Down