diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index 86808b8ba00..403b7acde5c 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -297,7 +297,7 @@ cdef csv_reader_options make_csv_reader_options(
 
     if false_values is not None:
         c_false_values.reserve(len(false_values))
-        for fv in c_false_values:
+        for fv in false_values:  # iterate the argument, not the output vector
             c_false_values.push_back(fv.encode())
         csv_reader_options_c.set_false_values(c_false_values)
 
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index b91893d8991..e85d404d2c4 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -766,6 +766,81 @@ def test_csv_reader_bools(tmpdir, names, dtypes, data, trues, falses):
     assert_eq(df_out, out)
 
 
+def test_csv_reader_bools_custom():  # custom true/false strings vs pandas
+    names = ["text", "bool"]
+    dtypes = {"text": "str", "bool": "bool"}
+    trues = ["foo", "1"]
+    falses = ["bar", "0"]
+    lines = [
+        ",".join(names),
+        "true,true",
+        "false,false",
+        "foo,foo",
+        "bar,bar",
+        "0,0",
+        "1,1",
+    ]
+    buffer = "\n".join(lines)
+
+    df = read_csv(
+        StringIO(buffer),
+        names=names,
+        dtype=dtypes,
+        skiprows=1,  # skip the header row; names= supplies the columns
+        true_values=trues,
+        false_values=falses,
+    )
+
+    # Note: bool literals give parsing errors as int
+    # "0" and "1" give parsing errors as bool in pandas
+    expected = pd.read_csv(
+        StringIO(buffer),
+        names=names,
+        dtype=dtypes,
+        skiprows=1,
+        true_values=trues,
+        false_values=falses,
+    )
+    assert_eq(df, expected, check_dtype=True)
+
+
+def test_csv_reader_bools_NA():  # bool strings read into an int column
+    names = ["text", "int"]
+    dtypes = ["str", "int"]
+    trues = ["foo"]
+    falses = ["bar"]
+    lines = [
+        ",".join(names),
+        "true,true",
+        "false,false",
+        "foo,foo",
+        "bar,bar",
+        "qux,qux",
+    ]
+
+    buffer = "\n".join(lines)
+
+    df = read_csv(
+        StringIO(buffer),
+        names=names,
+        dtype=dtypes,
+        skiprows=1,  # skip the header row; names= supplies the columns
+        true_values=trues,
+        false_values=falses,
+    )
+    assert len(df.columns) == 2
+    assert df["text"].dtype == np.dtype("object")
+    assert df["int"].dtype == np.dtype("int64")
+    expected = pd.DataFrame(
+        {
+            "text": ["true", "false", "foo", "bar", "qux"],
+            "int": [1, 0, 1, 0, 0],
+        }
+    )
+    # "qux" is no true/false value: 0 is expected; np.nan would be breaking
+    assert_eq(df, expected)
+
+
 def test_csv_quotednumbers(tmpdir):
     fname = tmpdir.mkdir("gdf_csv").join("tmp_csvreader_file12.csv")
 