From 12b66a3e60d00dd9e3f9abb890b47d6e83727fa6 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 23 Mar 2022 15:07:18 -0500 Subject: [PATCH] Add StringIO support to read_text (#10465) Add StringIO support to `read_text`. Authors: - Christopher Harris (https://github.com/cwharris) Approvers: - Bradley Dice (https://github.com/bdice) - Ashwin Srinath (https://github.com/shwina) - Jeremy Dyer (https://github.com/jdye64) URL: https://github.com/rapidsai/cudf/pull/10465 --- python/cudf/cudf/_lib/text.pyx | 25 +++++++++++++++---------- python/cudf/cudf/tests/test_text.py | 12 ++++++++++++ 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx index daea227cc39..868574be187 100644 --- a/python/cudf/cudf/_lib/text.pyx +++ b/python/cudf/cudf/_lib/text.pyx @@ -1,5 +1,7 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. +from io import TextIOBase + import cudf from cython.operator cimport dereference @@ -28,30 +30,33 @@ def read_text(object filepaths_or_buffers, -------- cudf.io.text.read_text """ - cdef string filename = filepaths_or_buffers.encode() cdef string delim = delimiter.encode() cdef unique_ptr[data_chunk_source] datasource cdef unique_ptr[column] c_col + cdef size_t c_byte_range_offset cdef size_t c_byte_range_size cdef byte_range_info c_byte_range - if (byte_range is not None): + if isinstance(filepaths_or_buffers, TextIOBase): + datasource = move(make_source(filepaths_or_buffers.read().encode())) + else: + datasource = move(make_source_from_file(filepaths_or_buffers.encode())) + + if (byte_range is None): + with nogil: + c_col = move(multibyte_split(dereference(datasource), delim)) + else: c_byte_range_offset = byte_range[0] c_byte_range_size = byte_range[1] + c_byte_range = byte_range_info( + c_byte_range_offset, + c_byte_range_size) with nogil: - datasource = move(make_source_from_file(filename)) - c_byte_range = byte_range_info( - c_byte_range_offset, - c_byte_range_size) c_col 
= move(multibyte_split( dereference(datasource), delim, c_byte_range)) - else: - with nogil: - datasource = move(make_source_from_file(filename)) - c_col = move(multibyte_split(dereference(datasource), delim)) return {None: Column.from_unique_ptr(move(c_col))} diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index fb6505f5f92..c332924fd8b 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -1,5 +1,7 @@ # Copyright (c) 2019-2022, NVIDIA CORPORATION. +from io import StringIO + import numpy as np import pytest @@ -829,3 +831,13 @@ def test_read_text_byte_range_large(datadir): f.write(content) cudf.read_text(temp_file, delimiter=delimiter) + + +def test_read_text_in_memory(datadir): + # Python's str.split removes the delimiter but read_text does + # not, so the expected values keep the trailing delimiter + expected = cudf.Series(["x::", "y::", "z"]) + + actual = cudf.read_text(StringIO("x::y::z"), delimiter="::") + + assert_eq(expected, actual)