Skip to content

Commit

Permalink
Add StringIO support to read_text (#10465)
Browse files Browse the repository at this point in the history
Add StringIO support to `read_text`.

Authors:
  - Christopher Harris (https://github.com/cwharris)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ashwin Srinath (https://github.com/shwina)
  - Jeremy Dyer (https://github.com/jdye64)

URL: #10465
  • Loading branch information
cwharris authored Mar 23, 2022
1 parent 9edcbd4 commit 12b66a3
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 10 deletions.
25 changes: 15 additions & 10 deletions python/cudf/cudf/_lib/text.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from io import TextIOBase

import cudf

from cython.operator cimport dereference
Expand Down Expand Up @@ -28,30 +30,33 @@ def read_text(object filepaths_or_buffers,
--------
cudf.io.text.read_text
"""
cdef string filename = filepaths_or_buffers.encode()
cdef string delim = delimiter.encode()

cdef unique_ptr[data_chunk_source] datasource
cdef unique_ptr[column] c_col

cdef size_t c_byte_range_offset
cdef size_t c_byte_range_size
cdef byte_range_info c_byte_range

if (byte_range is not None):
if isinstance(filepaths_or_buffers, TextIOBase):
datasource = move(make_source(filepaths_or_buffers.read().encode()))
else:
datasource = move(make_source_from_file(filepaths_or_buffers.encode()))

if (byte_range is None):
with nogil:
c_col = move(multibyte_split(dereference(datasource), delim))
else:
c_byte_range_offset = byte_range[0]
c_byte_range_size = byte_range[1]
c_byte_range = byte_range_info(
c_byte_range_offset,
c_byte_range_size)
with nogil:
datasource = move(make_source_from_file(filename))
c_byte_range = byte_range_info(
c_byte_range_offset,
c_byte_range_size)
c_col = move(multibyte_split(
dereference(datasource),
delim,
c_byte_range))
else:
with nogil:
datasource = move(make_source_from_file(filename))
c_col = move(multibyte_split(dereference(datasource), delim))

return {None: Column.from_unique_ptr(move(c_col))}
12 changes: 12 additions & 0 deletions python/cudf/cudf/tests/test_text.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.

from io import StringIO

import numpy as np
import pytest

Expand Down Expand Up @@ -829,3 +831,13 @@ def test_read_text_byte_range_large(datadir):
f.write(content)

cudf.read_text(temp_file, delimiter=delimiter)


def test_read_text_in_memory(datadir):
    """Check that ``cudf.read_text`` accepts an in-memory ``StringIO`` source."""
    # The expected strings keep the trailing "::" on every field except the
    # last one, i.e. the delimiter is not stripped from the split output.
    source = StringIO("x::y::z")
    actual = cudf.read_text(source, delimiter="::")

    expected = cudf.Series(["x::", "y::", "z"])

    assert_eq(expected, actual)

0 comments on commit 12b66a3

Please sign in to comment.