diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py
index 9b6abeb1276..f8f674fecec 100644
--- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py
+++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.

 import sys
 from io import StringIO
@@ -54,12 +54,12 @@ def csv_writer_test(pdf):
         ],
         "columns": ALL_POSSIBLE_VALUES,
         "index": [True, False],
-        "line_terminator": ["\n", "\r", "\r\n"],
+        "lineterminator": ["\n", "\r", "\r\n"],
         "chunksize": ALL_POSSIBLE_VALUES,
     },
 )
 def csv_writer_test_params(
-    pdf, sep, header, na_rep, columns, index, line_terminator, chunksize
+    pdf, sep, header, na_rep, columns, index, lineterminator, chunksize
 ):
     gdf = cudf.from_pandas(pdf)

@@ -69,7 +69,7 @@ def csv_writer_test_params(
         na_rep=na_rep,
         columns=columns,
         index=index,
-        line_terminator=line_terminator,
+        lineterminator=lineterminator,
         chunksize=chunksize,
     )
     gd_buffer = gdf.to_csv(
@@ -78,7 +78,7 @@ def csv_writer_test_params(
         na_rep=na_rep,
         columns=columns,
         index=index,
-        line_terminator=line_terminator,
+        lineterminator=lineterminator,
         chunksize=chunksize,
     )

@@ -90,13 +90,13 @@ def csv_writer_test_params(
         StringIO(gd_buffer),
         delimiter=sep,
         na_values=na_rep,
-        lineterminator=line_terminator,
+        lineterminator=lineterminator,
     )
     expected = pd.read_csv(
         StringIO(pd_buffer),
         delimiter=sep,
         na_values=na_rep,
-        lineterminator=line_terminator,
+        lineterminator=lineterminator,
     )
     if not header:
         # TODO: Remove renaming columns once the following bug is fixed:
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index 09de1f1724e..ce57ea26360 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -472,7 +472,7 @@ def write_csv(
     object sep=",",
     object na_rep="",
     bool header=True,
-    object line_terminator="\n",
+    object lineterminator="\n",
     int rows_per_chunk=8,
     bool index=True,
 ):
@@ -488,7 +488,7 @@ def write_csv(
     )
     cdef bool include_header_c = header
     cdef char delim_c = ord(sep)
-    cdef string line_term_c = line_terminator.encode()
+    cdef string line_term_c = lineterminator.encode()
     cdef string na_c = na_rep.encode()
     cdef int rows_per_chunk_c = rows_per_chunk
     cdef vector[string] col_names
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index fd6e9e2687d..128f3485063 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6,6 +6,7 @@
 import inspect
 import itertools
 import numbers
+import os
 import pickle
 import re
 import sys
@@ -604,7 +605,6 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin):
     def __init__(
         self, data=None, index=None, columns=None, dtype=None, nan_as_null=True
     ):
-
         super().__init__()

         if isinstance(columns, (Series, cudf.BaseIndex)):
@@ -918,7 +918,7 @@ def _init_from_dict_like(
         if len(data):
             self._data.multiindex = True

-            for (i, col_name) in enumerate(data):
+            for i, col_name in enumerate(data):
                 self._data.multiindex = self._data.multiindex and isinstance(
                     col_name, tuple
                 )
@@ -1199,7 +1199,6 @@ def __setitem__(self, arg, value):
                 if is_scalar(value):
                     self._data[col_name][scatter_map] = value
                 else:
-
                     self._data[col_name][scatter_map] = column.as_column(
                         value
                     )[scatter_map]
@@ -5445,7 +5444,6 @@ def interpolate(
        downcast=None,
         **kwargs,
     ):
-
         if all(dt == np.dtype("object") for dt in self.dtypes):
             raise TypeError(
                 "Cannot interpolate with all object-dtype "
@@ -6358,13 +6356,29 @@ def to_csv(
         index=True,
         encoding=None,
         compression=None,
-        line_terminator="\n",
+        lineterminator=None,
+        line_terminator=None,
         chunksize=None,
         storage_options=None,
     ):
         """{docstring}"""
         from cudf.io import csv

+        if line_terminator is not None:
+            warnings.warn(
+                "line_terminator is a deprecated keyword argument, "
+                "use lineterminator instead.",
+                FutureWarning,
+            )
+            if lineterminator is not None:
+                warnings.warn(
+                    f"Ignoring {line_terminator=} in favour "
+                    f"of {lineterminator=}"
+                )
+            else:
+                lineterminator = line_terminator
+        if lineterminator is None:
+            lineterminator = os.linesep
         return csv.to_csv(
             self,
             path_or_buf=path_or_buf,
@@ -6373,7 +6387,7 @@ def to_csv(
             columns=columns,
             header=header,
             index=index,
-            line_terminator=line_terminator,
+            lineterminator=lineterminator,
             chunksize=chunksize,
             encoding=encoding,
             compression=compression,
@@ -6738,7 +6752,6 @@ def append(
         current_cols = self._data.to_pandas_index()
         combined_columns = other.index.to_pandas()
         if len(current_cols):
-
             if cudf.utils.dtypes.is_mixed_with_object_dtype(
                 current_cols, combined_columns
             ):
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index 1eacbbb4458..95e0aa18070 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2023, NVIDIA CORPORATION.

 from collections import abc
 from io import BytesIO, StringIO
@@ -155,7 +155,7 @@ def to_csv(
     index=True,
     encoding=None,
     compression=None,
-    line_terminator="\n",
+    lineterminator="\n",
     chunksize=None,
     storage_options=None,
 ):
@@ -233,7 +233,7 @@ def to_csv(
                 sep=sep,
                 na_rep=na_rep,
                 header=header,
-                line_terminator=line_terminator,
+                lineterminator=lineterminator,
                 rows_per_chunk=rows_per_chunk,
                 index=index,
             )
@@ -244,7 +244,7 @@ def to_csv(
             sep=sep,
             na_rep=na_rep,
             header=header,
-            line_terminator=line_terminator,
+            lineterminator=lineterminator,
             rows_per_chunk=rows_per_chunk,
             index=index,
         )
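The resolution order implemented above in DataFrame.to_csv is: an explicit lineterminator always wins, a lone line_terminator is still honoured but raises a FutureWarning, and when neither is given the writer falls back to os.linesep. Note that the value forwarded to csv.to_csv must be the resolved lineterminator, not the raw deprecated keyword. A minimal standalone sketch of that logic, for reference only (the resolve_lineterminator helper name is invented for illustration and is not part of this patch):

import os
import warnings


def resolve_lineterminator(lineterminator=None, line_terminator=None):
    # Mirrors the keyword handling added to DataFrame.to_csv above.
    if line_terminator is not None:
        warnings.warn(
            "line_terminator is a deprecated keyword argument, "
            "use lineterminator instead.",
            FutureWarning,
        )
        if lineterminator is not None:
            # Both spellings passed: the new keyword wins; the second
            # warning has no category, so it defaults to UserWarning.
            warnings.warn(
                f"Ignoring {line_terminator=} in favour of {lineterminator=}"
            )
        else:
            lineterminator = line_terminator
    if lineterminator is None:
        lineterminator = os.linesep
    return lineterminator


assert resolve_lineterminator() == os.linesep
assert resolve_lineterminator(lineterminator="\r\n") == "\r\n"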
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 6066cd3b03e..4a7804da62c 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -223,7 +223,6 @@ def _make_path_or_buf(src):
 @pytest.mark.parametrize("dtype", dtypes)
 @pytest.mark.parametrize("nelem", nelem)
 def test_csv_reader_numeric_data(dtype, nelem, tmpdir):
-
     fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file1.csv")

     df = make_numeric_dataframe(nelem, dtype)
@@ -262,7 +261,6 @@ def test_csv_reader_datetime(parse_dates):
 def test_csv_reader_mixed_data_delimiter_sep(
     tmpdir, pandas_arg, cudf_arg, pd_mixed_dataframe
 ):
-
     fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file3.csv")

     pd_mixed_dataframe.to_csv(fname, sep="|", index=False, header=False)
@@ -342,7 +340,6 @@ def test_csv_reader_dtype_extremes(use_names):


 def test_csv_reader_skiprows_skipfooter(tmpdir, pd_mixed_dataframe):
-
     fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file5.csv")

     pd_mixed_dataframe.to_csv(
@@ -531,7 +528,6 @@ def test_csv_reader_float_decimal(tmpdir):


 def test_csv_reader_NaN_values():
-
     names = dtypes = ["float32"]
     empty_cells = '\n""\n'
     default_na_cells = (
@@ -638,7 +634,6 @@ def test_csv_reader_thousands(tmpdir):


 def test_csv_reader_buffer_strings():
-
     names = ["text", "int"]
     dtypes = ["str", "int"]
     lines = [",".join(names), "a,0", "b,0", "c,0", "d,0"]
@@ -682,7 +677,6 @@ def test_csv_reader_buffer_strings():
 def test_csv_reader_compression(
     tmpdir, ext, out_comp, in_comp, pd_mixed_dataframe
 ):
-
     fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_compression" + ext)

     df = pd_mixed_dataframe
@@ -959,7 +953,6 @@ def test_csv_reader_gzip_compression_strings(tmpdir):
 @pytest.mark.parametrize("skip_rows", [0, 2, 4])
 @pytest.mark.parametrize("header_row", [0, 2])
 def test_csv_reader_skiprows_header(skip_rows, header_row):
-
     names = ["float_point", "integer"]
     dtypes = ["float64", "int64"]
     lines = [
@@ -1023,7 +1016,6 @@ def test_csv_reader_dtype_inference_whitespace():


 def test_csv_reader_empty_dataframe():
-
     dtypes = ["float64", "int64"]
     buffer = "float_point, integer"

@@ -1208,24 +1200,23 @@ def test_csv_reader_byte_range_strings(segment_bytes):
         ("infer", 5, False),
     ],
 )
-@pytest.mark.parametrize("line_terminator", ["\n", "\r\n"])
+@pytest.mark.parametrize("lineterminator", ["\n", "\r\n"])
 def test_csv_reader_blanks_and_comments(
-    skip_rows, header_row, skip_blanks, line_terminator
+    skip_rows, header_row, skip_blanks, lineterminator
 ):
-
     lines = [
         "# first comment line",
-        line_terminator,
+        lineterminator,
         "# third comment line",
         "1,2,3",
         "4,5,6",
         "7,8,9",
-        line_terminator,
+        lineterminator,
         "# last comment line",
-        line_terminator,
+        lineterminator,
         "1,1,1",
     ]
-    buffer = line_terminator.join(lines)
+    buffer = lineterminator.join(lines)

     cu_df = read_csv(
         StringIO(buffer),
@@ -1247,7 +1238,6 @@ def test_csv_reader_blanks_and_comments(


 def test_csv_reader_prefix():
-
     lines = ["1, 1, 1, 1"]
     buffer = "\n".join(lines)

@@ -1521,11 +1511,10 @@ def test_csv_reader_scientific_type_detection():
         assert np.isclose(df[col][0], expected[int(col)])


-@pytest.mark.parametrize("line_terminator", ["\n", "\r\n"])
-def test_csv_blank_first_row(line_terminator):
-
+@pytest.mark.parametrize("lineterminator", ["\n", "\r\n"])
+def test_csv_blank_first_row(lineterminator):
     lines = ["colA,colB", "", "1, 1.1", "2, 2.2"]
-    buffer = line_terminator.join(lines)
+    buffer = lineterminator.join(lines)

     cu_df = read_csv(StringIO(buffer))

@@ -1588,7 +1577,6 @@ def test_csv_reader_partial_dtype(dtype):


 def test_csv_writer_file_handle(tmpdir):
-
     df = pd.DataFrame({"a": [1, 2, 3], "b": ["xxx", "yyyy", "zzzzz"]})
     gdf = cudf.from_pandas(df)

@@ -1602,7 +1590,6 @@ def test_csv_writer_file_handle(tmpdir):


 def test_csv_writer_file_append(tmpdir):
-
     gdf1 = cudf.DataFrame({"a": [1, 2, 3], "b": ["xxx", "yyyy", "zzzzz"]})
     gdf2 = cudf.DataFrame({"a": [4, 5, 6], "b": ["foo", "bar", "baz"]})

@@ -1618,7 +1605,6 @@ def test_csv_writer_file_append(tmpdir):


 def test_csv_writer_buffer(tmpdir):
-
     gdf = cudf.DataFrame({"a": [1, 2, 3], "b": ["xxx", "yyyy", "zzzzz"]})

     buffer = BytesIO()
@@ -1631,7 +1617,6 @@ def test_csv_writer_buffer(tmpdir):
 @pytest.mark.parametrize("dtype", dtypes)
 @pytest.mark.parametrize("nelem", nelem)
 def test_csv_writer_numeric_data(dtype, nelem, tmpdir):
-
     pdf_df_fname = tmpdir.join("pdf_df_1.csv")
     gdf_df_fname = tmpdir.join("gdf_df_1.csv")

@@ -1665,24 +1650,22 @@ def test_csv_writer_datetime_data(tmpdir):
     assert_eq(expect, got)


-@pytest.mark.parametrize("line_terminator", ["\r", "\n", "\t", np.str_("\n")])
+@pytest.mark.parametrize("lineterminator", ["\r", "\n", "\t", np.str_("\n")])
 @pytest.mark.parametrize("sep", [",", "/", np.str_(",")])
-def test_csv_writer_terminator_sep(line_terminator, sep, cudf_mixed_dataframe):
+def test_csv_writer_terminator_sep(lineterminator, sep, cudf_mixed_dataframe):
     df = cudf_mixed_dataframe

     buffer = BytesIO()
-    df.to_csv(buffer, line_terminator=line_terminator, sep=sep, index=False)
+    df.to_csv(buffer, lineterminator=lineterminator, sep=sep, index=False)

-    got = read_csv(buffer, lineterminator=line_terminator, sep=sep)
+    got = read_csv(buffer, lineterminator=lineterminator, sep=sep)
     assert_eq(df, got)


 @pytest.mark.parametrize(
-    "line_terminator", ["\r\n", "ABC", "\t\t", np.str_("\r\n")]
+    "lineterminator", ["\r\n", "ABC", "\t\t", np.str_("\r\n")]
 )
-def test_csv_writer_multichar_terminator(
-    line_terminator, cudf_mixed_dataframe
-):
+def test_csv_writer_multichar_terminator(lineterminator, cudf_mixed_dataframe):
     df = cudf_mixed_dataframe

     default_terminator_csv = StringIO()
@@ -1690,10 +1673,10 @@ def test_csv_writer_multichar_terminator(

     # Need to check manually since readers don't support
     # multicharacter line terminators
-    expected = default_terminator_csv.getvalue().replace("\n", line_terminator)
+    expected = default_terminator_csv.getvalue().replace("\n", lineterminator)

     buffer = StringIO()
-    df.to_csv(buffer, line_terminator=line_terminator)
+    df.to_csv(buffer, lineterminator=lineterminator)
     got = buffer.getvalue()

     assert_eq(expected, got)
@@ -1827,7 +1810,6 @@ def test_to_csv_StringIO(df):


 def test_csv_writer_empty_dataframe(tmpdir):
-
     df_fname = tmpdir.join("gdf_df_5.csv")
     gdf = cudf.DataFrame({"float_point": [], "integer": []})
     gdf["float_point"] = gdf["float_point"].astype("float")
@@ -2225,7 +2207,6 @@ def test_default_float_bitwidth_partial(default_float_bitwidth):
     ],
 )
 def test_column_selection_plus_column_names(usecols, names):
-
     lines = [
         "num,datetime,text",
         "123,2018-11-13T12:00:00,abc",
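The test changes above only migrate the existing writer tests to the new keyword. A compatibility test for the deprecation path itself could look roughly like the sketch below (hypothetical, not part of this diff):

from io import StringIO

import pytest

import cudf


def test_csv_writer_line_terminator_deprecated():
    # Hypothetical test: the old keyword should still work but warn.
    gdf = cudf.DataFrame({"a": [1, 2, 3]})
    buffer = StringIO()
    with pytest.warns(FutureWarning):
        gdf.to_csv(buffer, line_terminator="\r\n")
    assert "\r\n" in buffer.getvalue()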
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 924cc62fb15..c9ae76ae06e 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -1226,7 +1226,15 @@
 compression : str, None
     A string representing the compression scheme to use in the the output file
     Compression while writing csv is not supported currently
-line_terminator : char, default '\\n'
+line_terminator : str, optional
+
+    .. deprecated:: 23.04
+
+        Replaced with lineterminator for consistency with read_csv and pandas.
+
+lineterminator : str, optional
+    The newline character or character sequence to use in the output file.
+    Defaults to :attr:`os.linesep`.
 chunksize : int or None, default None
     Rows to write at a time
 storage_options : dict, optional, default None
@@ -1651,7 +1659,6 @@ def get_reader_filepath_or_buffer(
     path_or_data = stringify_pathlike(path_or_data)

     if isinstance(path_or_data, str):
-
         # Get a filesystem object if one isn't already available
         paths = [path_or_data]
         if fs is None:
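With the docstring updated, the user-facing behaviour is roughly as follows (an illustrative sketch assuming a cudf build that includes this change, not an excerpt from the patch):

import os
from io import StringIO

import cudf

df = cudf.DataFrame({"a": [1, 2], "b": ["x", "y"]})

# No keyword: rows are separated with os.linesep, as documented above.
buf = StringIO()
df.to_csv(buf, index=False)
assert os.linesep in buf.getvalue()

# New spelling, consistent with read_csv and pandas.
df.to_csv(StringIO(), lineterminator="\r\n")

# Deprecated spelling still works for now but emits a FutureWarning.
df.to_csv(StringIO(), line_terminator="\r\n")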