-
Notifications
You must be signed in to change notification settings - Fork 915
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Handle csv reader options in cudf-polars #16211
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,9 +15,9 @@ | |
|
||
import dataclasses | ||
import itertools | ||
import json | ||
import types | ||
from functools import cache | ||
from pathlib import Path | ||
from typing import TYPE_CHECKING, Any, Callable, ClassVar | ||
|
||
import pyarrow as pa | ||
|
@@ -185,8 +185,10 @@ class Scan(IR): | |
|
||
typ: str | ||
"""What type of file are we reading? Parquet, CSV, etc...""" | ||
options: tuple[Any, ...] | ||
"""Type specific options, as json-encoded strings.""" | ||
reader_options: dict[str, Any] | ||
"""Reader-specific options, as dictionary.""" | ||
cloud_options: dict[str, Any] | None | ||
"""Cloud-related authentication options, currently ignored.""" | ||
paths: list[str] | ||
"""List of paths to read from.""" | ||
file_options: Any | ||
|
@@ -206,24 +208,104 @@ def __post_init__(self) -> None: | |
if self.file_options.n_rows is not None: | ||
raise NotImplementedError("row limit in scan") | ||
if self.typ not in ("csv", "parquet"): | ||
raise NotImplementedError(f"Unhandled scan type: {self.typ}") | ||
if self.cloud_options is not None and any( | ||
self.cloud_options[k] is not None for k in ("aws", "azure", "gcp") | ||
): | ||
raise NotImplementedError( | ||
f"Unhandled scan type: {self.typ}" | ||
) # pragma: no cover; polars raises on the rust side for now | ||
"Read from cloud storage" | ||
) # pragma: no cover; no test yet | ||
if self.typ == "csv": | ||
if self.reader_options["skip_rows_after_header"] != 0: | ||
raise NotImplementedError("Skipping rows after header in CSV reader") | ||
parse_options = self.reader_options["parse_options"] | ||
if ( | ||
null_values := parse_options["null_values"] | ||
) is not None and "Named" in null_values: | ||
raise NotImplementedError( | ||
"Per column null value specification not supported for CSV reader" | ||
) | ||
if ( | ||
comment := parse_options["comment_prefix"] | ||
) is not None and "Multi" in comment: | ||
raise NotImplementedError( | ||
"Multi-character comment prefix not supported for CSV reader" | ||
) | ||
if not self.reader_options["has_header"]: | ||
# Need to do some file introspection to get the number | ||
# of columns so that column projection works right. | ||
raise NotImplementedError("Reading CSV without header") | ||
|
||
def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: | ||
"""Evaluate and return a dataframe.""" | ||
options = self.file_options | ||
with_columns = options.with_columns | ||
row_index = options.row_index | ||
if self.typ == "csv": | ||
opts, cloud_opts = map(json.loads, self.options) | ||
df = DataFrame.from_cudf( | ||
cudf.concat( | ||
[cudf.read_csv(p, usecols=with_columns) for p in self.paths] | ||
dtype_map = { | ||
name: cudf._lib.types.PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[typ.id()] | ||
for name, typ in self.schema.items() | ||
} | ||
parse_options = self.reader_options["parse_options"] | ||
sep = chr(parse_options["separator"]) | ||
quote = chr(parse_options["quote_char"]) | ||
eol = chr(parse_options["eol_char"]) | ||
if self.reader_options["schema"] is not None: | ||
# Reader schema provides names | ||
column_names = list(self.reader_options["schema"]["inner"].keys()) | ||
else: | ||
# file provides column names | ||
column_names = None | ||
usecols = with_columns | ||
# TODO: support has_header=False | ||
header = 0 | ||
|
||
# polars defaults to no null recognition | ||
null_values = [""] | ||
if parse_options["null_values"] is not None: | ||
((typ, nulls),) = parse_options["null_values"].items() | ||
if typ == "AllColumnsSingle": | ||
# Single value | ||
null_values.append(nulls) | ||
else: | ||
# List of values | ||
null_values.extend(nulls) | ||
if parse_options["comment_prefix"] is not None: | ||
comment = chr(parse_options["comment_prefix"]["Single"]) | ||
else: | ||
comment = None | ||
decimal = "," if parse_options["decimal_comma"] else "." | ||
|
||
# polars skips blank lines at the beginning of the file | ||
pieces = [] | ||
for p in self.paths: | ||
skiprows = self.reader_options["skip_rows"] | ||
# TODO: read_csv expands globs which we should not do, | ||
# because polars will already have handled them. | ||
path = Path(p) | ||
with path.open() as f: | ||
while f.readline() == "\n": | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this something that — There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. At some point (maybe soon?), cudf-polars is not going to use |
||
skiprows += 1 | ||
pieces.append( | ||
cudf.read_csv( | ||
path, | ||
sep=sep, | ||
quotechar=quote, | ||
lineterminator=eol, | ||
names=column_names, | ||
header=header, | ||
usecols=usecols, | ||
na_filter=True, | ||
na_values=null_values, | ||
keep_default_na=False, | ||
skiprows=skiprows, | ||
comment=comment, | ||
decimal=decimal, | ||
dtype=dtype_map, | ||
) | ||
) | ||
) | ||
df = DataFrame.from_cudf(cudf.concat(pieces)) | ||
elif self.typ == "parquet": | ||
opts, cloud_opts = map(json.loads, self.options) | ||
cdf = cudf.read_parquet(self.paths, columns=with_columns) | ||
assert isinstance(cdf, cudf.DataFrame) | ||
df = DataFrame.from_cudf(cdf) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Confirming that these strings are guaranteed to be in the dict at this point? If this option isn't supported at all, would it be worthwhile just to only accept `None` here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
csv reader has non-None cloud_options but the values are None, so I can't punt for `cloud_options is not None`.