From ab0b133594b217d5aad605491fedccef7137d936 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 19 Mar 2022 14:19:25 -0500 Subject: [PATCH 1/7] add StringIO support to read_text --- python/cudf/cudf/_lib/text.pyx | 18 ++++++++++++++---- python/cudf/cudf/tests/test_text.py | 15 ++++++++++++++- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx index 9f33f32bdaf..e5940593a11 100644 --- a/python/cudf/cudf/_lib/text.pyx +++ b/python/cudf/cudf/_lib/text.pyx @@ -1,5 +1,7 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. +from io import StringIO + import cudf from cython.operator cimport dereference @@ -26,14 +28,22 @@ def read_text(object filepaths_or_buffers, -------- cudf.io.text.read_text """ - cdef string filename = filepaths_or_buffers.encode() + cdef string filepath + cdef string data cdef string delim = delimiter.encode() cdef unique_ptr[data_chunk_source] datasource cdef unique_ptr[column] c_col - with nogil: - datasource = move(make_source_from_file(filename)) - c_col = move(multibyte_split(dereference(datasource), delim)) + if isinstance(filepaths_or_buffers, (StringIO)): + data = filepaths_or_buffers.read().encode() + with nogil: + datasource = move(make_source(data)) + c_col = move(multibyte_split(dereference(datasource), delim)) + else: + filepath = filepaths_or_buffers.encode() + with nogil: + datasource = move(make_source_from_file(filepath)) + c_col = move(multibyte_split(dereference(datasource), delim)) return {None: Column.from_unique_ptr(move(c_col))} diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index a447a60c709..a507432d0e2 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -1,4 +1,5 @@ -# Copyright (c) 2019, NVIDIA CORPORATION. + +from io import StringIO# Copyright (c) 2019, NVIDIA CORPORATION. import numpy as np import pytest @@ -778,3 +779,15 @@ def test_read_text(datadir): actual = cudf.read_text(chess_file, delimiter=delimiter) assert_eq(expected, actual) + + +def test_read_text_in_memory(datadir): + delimiter = "::" + + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series(["x::", "y::", "z"]) + + actual = cudf.read_text(StringIO("x::y::z"), delimiter=delimiter) + + assert_eq(expected, actual) From 3fa26da4c0ff184f33d0f87439f4edf057a016cc Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 19 Mar 2022 14:25:15 -0500 Subject: [PATCH 2/7] fix copyright --- python/cudf/cudf/tests/test_text.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index a507432d0e2..a1f49bec932 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -1,5 +1,7 @@ -from io import StringIO# Copyright (c) 2019, NVIDIA CORPORATION. +# Copyright (c) 2019-2022, NVIDIA CORPORATION. + +from io import StringIO import numpy as np import pytest @@ -344,7 +346,7 @@ def test_character_tokenize_series(): "w", "o", ":", - "t", + "t",already-in-memory "h", "r", "e", From af0a0b1a48004214c3cc5ba9e73faf4ba4007cbb Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 19 Mar 2022 15:11:53 -0500 Subject: [PATCH 3/7] rm extra whitespace --- python/cudf/cudf/tests/test_text.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index a1f49bec932..6435630974f 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -1,4 +1,3 @@ - # Copyright (c) 2019-2022, NVIDIA CORPORATION. from io import StringIO From 61e8f54caa87e545787dbd73ac0d98f316d8adaf Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Sat, 19 Mar 2022 15:12:43 -0500 Subject: [PATCH 4/7] fix typos --- python/cudf/cudf/tests/test_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index 6435630974f..dd516e46ebf 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -345,7 +345,7 @@ def test_character_tokenize_series(): "w", "o", ":", - "t",already-in-memory + "t", "h", "r", "e", From 197bd73f5bb2cd5d77d274e7c1690c02dd69756c Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Mon, 21 Mar 2022 22:45:44 -0500 Subject: [PATCH 5/7] remove unnecessary file --- python/cudf/cudf/tests/data/text/temp.txt | 1 - 1 file changed, 1 deletion(-) delete mode 100644 python/cudf/cudf/tests/data/text/temp.txt diff --git a/python/cudf/cudf/tests/data/text/temp.txt b/python/cudf/cudf/tests/data/text/temp.txt deleted file mode 100644 index 860e21333e6..00000000000 --- a/python/cudf/cudf/tests/data/text/temp.txt +++ /dev/null @@ -1 +0,0 @@ -. at 0x7f1e5aa306d0> \ No newline at end of file From 68eb7e143bd5e1d0b5615de741f6cbcee69514bd Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Tue, 22 Mar 2022 14:28:15 -0500 Subject: [PATCH 6/7] support TextIOBase in read_text --- python/cudf/cudf/_lib/text.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx index 671a85b90ee..868574be187 100644 --- a/python/cudf/cudf/_lib/text.pyx +++ b/python/cudf/cudf/_lib/text.pyx @@ -1,6 +1,6 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -from io import StringIO +from io import TextIOBase import cudf @@ -39,7 +39,7 @@ def read_text(object filepaths_or_buffers, cdef size_t c_byte_range_size cdef byte_range_info c_byte_range - if isinstance(filepaths_or_buffers, (StringIO)): + if isinstance(filepaths_or_buffers, TextIOBase): datasource = move(make_source(filepaths_or_buffers.read().encode())) else: datasource = move(make_source_from_file(filepaths_or_buffers.encode())) From f381414d345e8471a5c62d0e5530bac274f70559 Mon Sep 17 00:00:00 2001 From: Christopher Harris Date: Wed, 23 Mar 2022 11:02:21 -0500 Subject: [PATCH 7/7] inline the delimiter argument in test_text read_text test Co-authored-by: Bradley Dice --- python/cudf/cudf/tests/test_text.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index bb5e51636ab..c332924fd8b 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -834,12 +834,10 @@ def test_read_text_byte_range_large(datadir): def test_read_text_in_memory(datadir): - delimiter = "::" - # Since Python split removes the delimiter and read_text does # not we need to add it back to the 'content' expected = cudf.Series(["x::", "y::", "z"]) - actual = cudf.read_text(StringIO("x::y::z"), delimiter=delimiter) + actual = cudf.read_text(StringIO("x::y::z"), delimiter="::") assert_eq(expected, actual)