Skip to content

Commit

Permalink
Migrate CSV reader to pylibcudf
Browse files Browse the repository at this point in the history
  • Loading branch information
lithomas1 committed Jun 12, 2024
1 parent dfa79d4 commit 24a9d94
Show file tree
Hide file tree
Showing 17 changed files with 454 additions and 290 deletions.
415 changes: 138 additions & 277 deletions python/cudf/cudf/_lib/csv.pyx

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions python/cudf/cudf/_lib/io/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
Expand All @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources datasource.pyx utils.pyx)
set(cython_sources utils.pyx)
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
CXX
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/io/utils.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ from libcpp.utility cimport move
from libcpp.vector cimport vector

from cudf._lib.column cimport Column
from cudf._lib.io.datasource cimport Datasource
from cudf._lib.pylibcudf.io.datasource cimport Datasource
from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
from cudf._lib.pylibcudf.libcudf.io.types cimport (
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/orc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ except ImportError:

cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
from cudf._lib.column cimport Column
from cudf._lib.io.datasource cimport NativeFileDatasource
from cudf._lib.io.utils cimport (
make_sink_info,
make_source_info,
update_column_struct_field_names,
)
from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource
from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
from cudf._lib.pylibcudf.libcudf.io.orc cimport (
chunked_orc_writer_options,
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
cimport cudf._lib.pylibcudf.libcudf.types as cudf_types
from cudf._lib.column cimport Column
from cudf._lib.expressions cimport Expression
from cudf._lib.io.datasource cimport NativeFileDatasource
from cudf._lib.io.utils cimport (
make_sinks_info,
make_source_info,
update_struct_field_names,
)
from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource
from cudf._lib.pylibcudf.libcudf.expressions cimport expression
from cudf._lib.pylibcudf.libcudf.io.parquet cimport (
chunked_parquet_reader as cpp_chunked_parquet_reader,
Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# the License.
# =============================================================================

set(cython_sources avro.pyx types.pyx)
set(cython_sources avro.pyx csv.pyx datasource.pyx types.pyx)

set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
Expand All @@ -21,5 +21,7 @@ rapids_cython_create_modules(
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf
)

set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_types)
set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_csv pylibcudf_io_datasource
pylibcudf_io_types
)
link_to_pyarrow_headers("${targets_using_arrow_headers}")
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . cimport avro, types
from . cimport avro, csv, types
from .types cimport SourceInfo, TableWithMetadata
2 changes: 1 addition & 1 deletion python/cudf/cudf/_lib/pylibcudf/io/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from . import avro, types
from . import avro, csv, types
from .types import SourceInfo, TableWithMetadata
49 changes: 49 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/csv.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp cimport bool

from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type, quote_style
from cudf._lib.pylibcudf.libcudf.types cimport size_type, type_id
from cudf._lib.pylibcudf.types cimport DataType

ctypedef fused dtypes_t:
dict
list

cpdef TableWithMetadata read_csv(
SourceInfo source_info,
compression_type compression = *,
size_t byte_range_offset = *,
size_t byte_range_size = *,
list col_names = *,
str prefix = *,
bool mangle_dupe_cols = *,
list usecols = *,
size_type nrows = *,
size_type skiprows = *,
size_type skipfooter = *,
size_type header = *,
str lineterminator = *,
str delimiter = *,
str thousands = *,
str decimal = *,
str comment = *,
bool delim_whitespace = *,
bool skipinitialspace = *,
bool skip_blank_lines = *,
quote_style quoting = *,
str quotechar = *,
bool doublequote = *,
bool detect_whitespace_around_quotes = *,
list parse_dates = *,
list parse_hex = *,
dtypes_t dtypes = *,
list true_values = *,
list false_values = *,
list na_values = *,
bool keep_default_na = *,
bool na_filter = *,
bool dayfirst = *,
DataType timestamp_type = *
)
207 changes: 207 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/io/csv.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.map cimport map
from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector

from cudf._lib.pylibcudf.io.csv cimport dtypes_t
from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
from cudf._lib.pylibcudf.libcudf.io.csv cimport (
csv_reader_options,
read_csv as cpp_read_csv,
)
from cudf._lib.pylibcudf.libcudf.io.types cimport (
compression_type,
quote_style,
table_with_metadata,
)
from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type, type_id
from cudf._lib.pylibcudf.types cimport DataType


cpdef TableWithMetadata read_csv(
SourceInfo source_info,
compression_type compression = compression_type.AUTO,
size_t byte_range_offset = 0,
size_t byte_range_size = 0,
list col_names = None,
str prefix = "",
bool mangle_dupe_cols = True,
list usecols = None,
size_type nrows = -1,
size_type skiprows = 0,
size_type skipfooter = 0,
size_type header = 0,
str lineterminator = "\n",
str delimiter = None,
str thousands = None,
str decimal = None,
str comment = None,
bool delim_whitespace = False,
bool skipinitialspace = False,
bool skip_blank_lines = True,
quote_style quoting = quote_style.MINIMAL,
str quotechar = '"',
bool doublequote = True,
bool detect_whitespace_around_quotes = False,
list parse_dates = None,
list parse_hex = None,
dtypes_t dtypes = None,
list true_values = None,
list false_values = None,
list na_values = None,
bool keep_default_na = True,
bool na_filter = True,
bool dayfirst = False,
DataType timestamp_type = DataType(type_id.EMPTY)
):
"""
"""
cdef vector[string] c_names
cdef vector[int] c_use_cols_indexes
cdef vector[string] c_use_cols_names
cdef vector[string] c_parse_dates_names
cdef vector[int] c_parse_dates_indexes
cdef vector[int] c_parse_hex_names
cdef vector[int] c_parse_hex_indexes
cdef vector[data_type] c_dtypes_list
cdef map[string, data_type] c_dtypes_map
cdef vector[string] c_true_values
cdef vector[string] c_false_values
cdef vector[string] c_na_values

cdef csv_reader_options options = move(
csv_reader_options.builder(source_info.c_obj)
.compression(compression)
.mangle_dupe_cols(mangle_dupe_cols)
.byte_range_offset(byte_range_offset)
.byte_range_size(byte_range_size)
.nrows(nrows)
.skiprows(skiprows)
.skipfooter(skipfooter)
.quoting(quoting)
.lineterminator(ord(lineterminator))
.quotechar(ord(quotechar))
.decimal(ord(decimal))
.delim_whitespace(delim_whitespace)
.skipinitialspace(skipinitialspace)
.skip_blank_lines(skip_blank_lines)
.doublequote(doublequote)
.keep_default_na(keep_default_na)
.na_filter(na_filter)
.dayfirst(dayfirst)
.build()
)

options.set_header(header)

if col_names is not None:
c_names.reserve(len(col_names))
for name in col_names:
c_names.push_back(str(name).encode())
options.set_names(c_names)

if prefix is not None:
options.set_prefix(prefix.encode())

if usecols is not None:
all_int = all([isinstance(col, int) for col in usecols])
if all_int:
c_use_cols_indexes.reserve(len(usecols))
c_use_cols_indexes = usecols
options.set_use_cols_indexes(c_use_cols_indexes)
else:
c_use_cols_names.reserve(len(usecols))
for col_name in usecols:
c_use_cols_names.push_back(
str(col_name).encode()
)
options.set_use_cols_names(c_use_cols_names)

if delimiter is not None:
options.set_delimiter(ord(delimiter))

if thousands is not None:
options.set_thousands(ord(thousands))

if comment is not None:
options.set_comment(ord(comment))

if parse_dates is not None:
for col in parse_dates:
if isinstance(col, str):
c_parse_dates_names.push_back(col.encode())
elif isinstance(col, int):
c_parse_dates_indexes.push_back(col)
else:
raise NotImplementedError(
"`parse_dates`: Must pass a list of column names/indices")

# Set both since users are allowed to mix column names and indices
options.set_parse_dates(c_parse_dates_names)
options.set_parse_dates(c_parse_dates_indexes)

if parse_hex is not None:
for col in parse_hex:
if isinstance(col, str):
c_parse_hex_names.push_back(col.encode())
elif isinstance(col, int):
c_parse_hex_indexes.push_back(col)
else:
raise NotImplementedError(
"`parse_hex`: Must pass a list of column names/indices")
# Set both since users are allowed to mix column names and indices
options.set_parse_hex(c_parse_hex_names)
options.set_parse_hex(c_parse_hex_indexes)

cdef string k_str
if dtypes is not None:
if dtypes_t is list:
for dtype in dtypes:
if not isinstance(dtype, DataType):
raise TypeError("If passing list to read_csv, "
"all elements must be of type `DataType`!")
c_dtypes_list.push_back((<DataType>dtype).c_obj)
options.set_dtypes(c_dtypes_list)
else:
# dtypes_t is dict
for k, v in dtypes.items():
k_str = str(k).encode()
if not isinstance(v, DataType):
raise TypeError("If passing dict to read_csv, "
"all values must be of type `DataType`!")
c_dtypes_map[k_str] = (<DataType>v).c_obj
options.set_dtypes(c_dtypes_map)

if true_values is not None:
c_true_values.reserve(len(true_values))
for tv in true_values:
if not isinstance(tv, str):
raise TypeError("true_values must be a list of str!")
c_true_values.push_back(tv.encode())
options.set_true_values(c_true_values)

if false_values is not None:
c_false_values.reserve(len(false_values))
for fv in false_values:
if not isinstance(fv, str):
raise TypeError("false_values must be a list of str!")
c_false_values.push_back(fv.encode())
options.set_false_values(c_false_values)

if na_values is not None:
c_na_values.reserve(len(na_values))
for nv in na_values:
if not isinstance(nv, str):
raise TypeError("na_values must be a list of str!")
c_na_values.push_back(nv.encode())
options.set_na_values(c_na_values)

cdef table_with_metadata c_result
with nogil:
c_result = move(cpp_read_csv(options))

return TableWithMetadata.from_libcudf(c_result)
File renamed without changes.
File renamed without changes.
23 changes: 19 additions & 4 deletions python/cudf/cudf/_lib/pylibcudf/io/types.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ from libcpp.string cimport string
from libcpp.utility cimport move
from libcpp.vector cimport vector

from cudf._lib.pylibcudf.io.datasource cimport Datasource
from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
from cudf._lib.pylibcudf.libcudf.io.types cimport (
host_buffer,
source_info,
Expand Down Expand Up @@ -56,9 +58,8 @@ cdef class SourceInfo:
Parameters
----------
sources : List[Union[str, os.PathLike, bytes, io.BytesIO]]
A homogeneous list of sources (this can be a string filename,
an os.PathLike, bytes, or an io.BytesIO) to read from.
sources : List[Union[str, os.PathLike, bytes, io.BytesIO, DataSource]]
A homogeneous list of sources to read from.
Mixing different types of sources will raise a `ValueError`.
"""
Expand All @@ -68,6 +69,7 @@ cdef class SourceInfo:
raise ValueError("Need to pass at least one source")

cdef vector[string] c_files
cdef vector[datasource*] c_datasources

if isinstance(sources[0], (os.PathLike, str)):
c_files.reserve(len(sources))
Expand All @@ -84,6 +86,13 @@ cdef class SourceInfo:

self.c_obj = move(source_info(c_files))
return
elif isinstance(sources[0], Datasource):
for csrc in sources:
if not isinstance(csrc, Datasource):
raise ValueError("All sources must be of the same type!")
c_datasources.push_back((<Datasource>csrc).get_datasource())
self.c_obj = move(source_info(c_datasources))
return

# TODO: host_buffer is deprecated API, use host_span instead
cdef vector[host_buffer] c_host_buffers
Expand All @@ -106,5 +115,11 @@ cdef class SourceInfo:
c_buffer = bio.getbuffer() # check if empty?
c_host_buffers.push_back(host_buffer(<char*>&c_buffer[0],
c_buffer.shape[0]))
else:
raise ValueError("Sources must be a list of str/paths, "
"bytes, io.BytesIO, or a Datasource")

if empty_buffer is True:
c_host_buffers.push_back(host_buffer(<char*>NULL, 0))

self.c_obj = source_info(c_host_buffers)
self.c_obj = move(source_info(c_host_buffers))
Loading

0 comments on commit 24a9d94

Please sign in to comment.