Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add StringIO support to read_text #10465

Merged
merged 8 commits into from
Mar 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 15 additions & 10 deletions python/cudf/cudf/_lib/text.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from io import TextIOBase

import cudf

from cython.operator cimport dereference
Expand Down Expand Up @@ -28,30 +30,33 @@ def read_text(object filepaths_or_buffers,
--------
cudf.io.text.read_text
"""
cdef string filename = filepaths_or_buffers.encode()
cdef string delim = delimiter.encode()

cdef unique_ptr[data_chunk_source] datasource
cdef unique_ptr[column] c_col

cdef size_t c_byte_range_offset
cdef size_t c_byte_range_size
cdef byte_range_info c_byte_range

if (byte_range is not None):
if isinstance(filepaths_or_buffers, TextIOBase):
datasource = move(make_source(filepaths_or_buffers.read().encode()))
else:
datasource = move(make_source_from_file(filepaths_or_buffers.encode()))

if (byte_range is None):
with nogil:
c_col = move(multibyte_split(dereference(datasource), delim))
else:
c_byte_range_offset = byte_range[0]
c_byte_range_size = byte_range[1]
c_byte_range = byte_range_info(
c_byte_range_offset,
c_byte_range_size)
with nogil:
datasource = move(make_source_from_file(filename))
c_byte_range = byte_range_info(
c_byte_range_offset,
c_byte_range_size)
c_col = move(multibyte_split(
dereference(datasource),
delim,
c_byte_range))
else:
with nogil:
datasource = move(make_source_from_file(filename))
c_col = move(multibyte_split(dereference(datasource), delim))

return {None: Column.from_unique_ptr(move(c_col))}
12 changes: 12 additions & 0 deletions python/cudf/cudf/tests/test_text.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.

from io import StringIO

import numpy as np
import pytest

Expand Down Expand Up @@ -829,3 +831,13 @@ def test_read_text_byte_range_large(datadir):
f.write(content)

cudf.read_text(temp_file, delimiter=delimiter)


def test_read_text_in_memory(datadir):
    """Check that read_text accepts an in-memory StringIO buffer.

    read_text leaves the delimiter attached to the end of each row it
    produces, so "x::y::z" split on "::" is expected to come back as
    "x::", "y::" and a final "z" with no trailing delimiter.
    """
    buffer = StringIO("x::y::z")
    actual = cudf.read_text(buffer, delimiter="::")

    expected = cudf.Series(["x::", "y::", "z"])

    assert_eq(expected, actual)