Skip to content

Commit

Permalink
Fix dtype-argument bug in dask_cudf read_csv (#9796)
Browse files Browse the repository at this point in the history
Closes #9719

`dask_cudf.read_csv` currently fails when both `usecols` and `dtype` are specified. This PR is  a simple fix.  In the near future, the `_internal_read_csv` implementation should also be modified to produce a `Blockwise` HLG Layer, but I will leave that for a separate PR.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #9796
  • Loading branch information
rjzamora authored Nov 30, 2021
1 parent 27b7190 commit dca8a0a
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 10 deletions.
19 changes: 11 additions & 8 deletions python/dask_cudf/dask_cudf/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,17 @@ def _internal_read_csv(path, chunksize="256 MiB", **kwargs):
if chunksize is None:
return read_csv_without_chunksize(path, **kwargs)

# Let dask.dataframe generate meta
dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV")
usecols = kwargs.pop("usecols", None)
meta = dask_reader(filenames[0], **kwargs)._meta
kwargs1 = kwargs.copy()
usecols = kwargs1.pop("usecols", None)
dtype = kwargs1.pop("dtype", None)
meta = dask_reader(filenames[0], **kwargs1)._meta
names = meta.columns
if usecols or dtype:
# Regenerate meta with original kwargs if
# `usecols` or `dtype` was specified
meta = dask_reader(filenames[0], **kwargs)._meta

dsk = {}
i = 0
Expand All @@ -127,18 +135,13 @@ def _internal_read_csv(path, chunksize="256 MiB", **kwargs):
chunksize,
) # specify which chunk of the file we care about
if start != 0:
kwargs2[
"names"
] = meta.columns # no header in the middle of the file
kwargs2["names"] = names # no header in the middle of the file
kwargs2["header"] = None
kwargs2["usecols"] = usecols
dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2)

i += 1

divisions = [None] * (len(dsk) + 1)
if usecols is not None:
meta = meta[usecols]
return dd.core.new_dd_object(dsk, name, meta, divisions)


Expand Down
5 changes: 3 additions & 2 deletions python/dask_cudf/dask_cudf/io/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,8 @@ def test_read_csv_chunksize_none(tmp_path, compression, size):
dd.assert_eq(df, df2)


def test_csv_reader_usecols(tmp_path):
@pytest.mark.parametrize("dtype", [{"b": str, "c": int}, None])
def test_csv_reader_usecols(tmp_path, dtype):
df = cudf.DataFrame(
{
"a": [1, 2, 3, 4] * 100,
Expand All @@ -147,6 +148,6 @@ def test_csv_reader_usecols(tmp_path):
csv_path = str(tmp_path / "usecols_data.csv")
df.to_csv(csv_path, index=False)
ddf = dask_cudf.from_cudf(df[["b", "c"]], npartitions=5)
ddf2 = dask_cudf.read_csv(csv_path, usecols=["b", "c"])
ddf2 = dask_cudf.read_csv(csv_path, usecols=["b", "c"], dtype=dtype)

dd.assert_eq(ddf, ddf2, check_divisions=False, check_index=False)

0 comments on commit dca8a0a

Please sign in to comment.