From 11781e8160086ef28324ad45c7897161ff844a7c Mon Sep 17 00:00:00 2001
From: Jeremy Dyer <jdye64@gmail.com>
Date: Fri, 17 Sep 2021 14:26:46 -0400
Subject: [PATCH] Python/Cython bindings for multibyte_split (#8998)

Provides the Python/Cython bindings for #8702 multibyte_split. This PR depends on #8702 being merged first.

Closes #8557

Authors:
  - Jeremy Dyer (https://github.com/jdye64)
  - Christopher Harris (https://github.com/cwharris)

Approvers:
  - https://github.com/nvdbaranec
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/8998
---
 python/cudf/cudf/__init__.py               |  1 +
 python/cudf/cudf/_lib/__init__.py          |  1 +
 python/cudf/cudf/_lib/cpp/io/text.pxd      | 27 +++++++++++++++
 python/cudf/cudf/_lib/text.pyx             | 39 ++++++++++++++++++++++
 python/cudf/cudf/io/__init__.py            |  1 +
 python/cudf/cudf/io/text.py                | 28 ++++++++++++++++
 python/cudf/cudf/tests/data/text/chess.pgn | 16 +++++++++
 python/cudf/cudf/tests/test_text.py        | 26 +++++++++++++++
 python/cudf/cudf/utils/ioutils.py          | 22 ++++++++++++
 9 files changed, 161 insertions(+)
 create mode 100644 python/cudf/cudf/_lib/cpp/io/text.pxd
 create mode 100644 python/cudf/cudf/_lib/text.pyx
 create mode 100644 python/cudf/cudf/io/text.py
 create mode 100644 python/cudf/cudf/tests/data/text/chess.pgn

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index 0b0fecb48a3..2e4c1ccebd5 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -96,6 +96,7 @@
     read_json,
     read_orc,
     read_parquet,
+    read_text,
 )
 from cudf.utils.dtypes import _NA_REP
 from cudf.utils.utils import set_allocator
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 02f0444e413..aa0f90fd713 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -36,6 +36,7 @@
     table,
     transpose,
     unary,
+    text,
 )
 
 MAX_COLUMN_SIZE = np.iinfo(np.int32).max
diff --git a/python/cudf/cudf/_lib/cpp/io/text.pxd b/python/cudf/cudf/_lib/cpp/io/text.pxd
new file mode 100644
index 00000000000..9ce0c68cb08
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/io/text.pxd
@@ -0,0 +1,27 @@
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+
+from cudf._lib.cpp.column.column cimport column
+
+
+cdef extern from "cudf/io/text/data_chunk_source.hpp" \
+        namespace "cudf::io::text" nogil:
+    # Only the default constructor is declared; no other members exposed.
+    cdef cppclass data_chunk_source:
+        data_chunk_source() except +
+
+cdef extern from "cudf/io/text/data_chunk_source_factories.hpp" \
+        namespace "cudf::io::text" nogil:
+    # Factories that return an owning pointer to a data_chunk_source.
+    unique_ptr[data_chunk_source] make_source(string data) except +
+    unique_ptr[data_chunk_source] \
+        make_source_from_file(string filename) except +
+
+
+cdef extern from "cudf/io/text/multibyte_split.hpp" \
+        namespace "cudf::io::text" nogil:
+    # NOTE(review): `source` is declared by value; confirm vs. the C++ header.
+    unique_ptr[column] multibyte_split(data_chunk_source source,
+                                       string delimiter) except +
diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx
new file mode 100644
index 00000000000..9f33f32bdaf
--- /dev/null
+++ b/python/cudf/cudf/_lib/text.pyx
@@ -0,0 +1,39 @@
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+
+import cudf
+
+from cython.operator cimport dereference
+from libcpp.memory cimport make_unique, unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+
+from cudf._lib.column cimport Column
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.io.text cimport (
+    data_chunk_source,
+    make_source,
+    make_source_from_file,
+    multibyte_split,
+)
+
+
+def read_text(object filepaths_or_buffers,
+              object delimiter=None):
+    """
+    Split a file into a column by calling libcudf's `multibyte_split`.
+
+    `filepaths_or_buffers` is passed to `make_source_from_file` as a file
+    name. Both arguments are ``.encode()``-ed, so ``delimiter=None`` (the
+    default) fails. Returns ``{None: Column}``; see cudf.io.text.read_text.
+    """
+    cdef string filename = filepaths_or_buffers.encode()
+    cdef string delim = delimiter.encode()
+
+    cdef unique_ptr[data_chunk_source] datasource
+    cdef unique_ptr[column] c_col
+
+    with nogil:
+        datasource = move(make_source_from_file(filename))
+        c_col = move(multibyte_split(dereference(datasource), delim))
+
+    return {None: Column.from_unique_ptr(move(c_col))}
diff --git a/python/cudf/cudf/io/__init__.py b/python/cudf/cudf/io/__init__.py
index 8db29bdde99..15404b26042 100644
--- a/python/cudf/cudf/io/__init__.py
+++ b/python/cudf/cudf/io/__init__.py
@@ -12,3 +12,4 @@
     read_parquet_metadata,
     write_to_dataset,
 )
+from cudf.io.text import read_text
diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py
new file mode 100644
index 00000000000..705645b8349
--- /dev/null
+++ b/python/cudf/cudf/io/text.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+
+from io import BytesIO, StringIO
+
+from nvtx import annotate
+
+import cudf
+from cudf._lib import text as libtext
+from cudf.utils import ioutils
+
+
+@annotate("READ_TEXT", color="purple", domain="cudf_python")
+@ioutils.doc_read_text()
+def read_text(filepath_or_buffer, delimiter=None, **kwargs):
+    """{docstring}"""
+    if delimiter is None:  # the Cython layer calls delimiter.encode()
+        raise ValueError("delimiter needs to be provided")
+
+    filepath_or_buffer, _ = ioutils.get_filepath_or_buffer(
+        path_or_data=filepath_or_buffer,
+        compression=None,
+        iotypes=(BytesIO, StringIO),
+        **kwargs,
+    )
+
+    return cudf.Series._from_data(
+        libtext.read_text(filepath_or_buffer, delimiter=delimiter,)
+    )
diff --git a/python/cudf/cudf/tests/data/text/chess.pgn b/python/cudf/cudf/tests/data/text/chess.pgn
new file mode 100644
index 00000000000..6f516e5c640
--- /dev/null
+++ b/python/cudf/cudf/tests/data/text/chess.pgn
@@ -0,0 +1,16 @@
+[Event "F/S Return Match"]
+[Site "Belgrade, Serbia JUG"]
+[Date "1992.11.04"]
+[Round "29"]
+[White "Fischer, Robert J."]
+[Black "Spassky, Boris V."]
+[Result "1/2-1/2"]
+
+1. e4 e5 2. Nf3 Nc6 3. Bb5 a6 {This opening is called the Ruy Lopez.}
+4. Ba4 Nf6 5. O-O Be7 6. Re1 b5 7. Bb3 d6 8. c3 O-O 9. h3 Nb8 10. d4 Nbd7
+11. c4 c6 12. cxb5 axb5 13. Nc3 Bb7 14. Bg5 b4 15. Nb1 h6 16. Bh4 c5 17. dxe5
+Nxe4 18. Bxe7 Qxe7 19. exd6 Qf6 20. Nbd2 Nxd6 21. Nc4 Nxc4 22. Bxc4 Nb6
+23. Ne5 Rae8 24. Bxf7+ Rxf7 25. Nxf7 Rxe1+ 26. Qxe1 Kxf7 27. Qe3 Qg5 28. Qxg5
+hxg5 29. b3 Ke6 30. a3 Kd6 31. axb4 cxb4 32. Ra5 Nd5 33. f3 Bc8 34. Kf2 Bf5
+35. Ra7 g6 36. Ra6+ Kc5 37. Ke1 Nf4 38. g3 Nxh3 39. Kd2 Kb5 40. Rd6 Kc5 41. Ra6
+Nf2 42. g4 Bd3 43. Re6 1/2-1/2
diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py
index d0b1ba0758e..6b81785c879 100644
--- a/python/cudf/cudf/tests/test_text.py
+++ b/python/cudf/cudf/tests/test_text.py
@@ -8,6 +8,11 @@
 from cudf.testing._utils import assert_eq
 
 
+@pytest.fixture(scope="module")
+def datadir(datadir):
+    return datadir / "text"  # use the "text" subdirectory of datadir
+
+
 def test_tokenize():
     strings = cudf.Series(
         [
@@ -877,3 +882,24 @@ def test_is_vowel_consonant():
     actual = strings.str.is_consonant(indices)
     assert type(expected) == type(actual)
     assert_eq(expected, actual)
+
+
+def test_read_text(datadir):
+    """Compare read_text with a pure-Python split of the same file."""
+    delimiter = "1."
+    chess_file = f"{datadir}/chess.pgn"
+
+    with open(chess_file, "r") as f:
+        pieces = f.read().split(delimiter)
+
+    # str.split drops the delimiter while read_text leaves it in place,
+    # so it has to be appended again to every piece except the final
+    # one before the two results can be compared.
+    *leading, last = pieces
+    expected = cudf.Series(
+        [piece + delimiter for piece in leading] + [last]
+    )
+
+    actual = cudf.read_text(chess_file, delimiter=delimiter)
+
+    assert_eq(expected, actual)
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index af91db6a9e6..15cf50af817 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -1012,6 +1012,28 @@
 doc_kafka_datasource = docfmt_partial(docstring=_docstring_kafka_datasource)
 
 
+_docstring_text_datasource = """
+Read a text file and split it into rows on a delimiter
+
+Parameters
+----------
+filepath_or_buffer : str, path object, or file-like object
+    Either a path to a file (a `str`, `pathlib.Path`, or
+    `py._path.local.LocalPath`), URL (including http, ftp, and S3 locations),
+    or any object with a `read()` method (such as builtin `open()` file handler
+    function or `StringIO`).
+delimiter : string, default None
+    The delimiter that should be used for splitting text chunks into
+    separate cudf column rows. Currently only a single delimiter is supported.
+
+Returns
+-------
+result : GPU ``Series``
+
+"""
+doc_read_text = docfmt_partial(docstring=_docstring_text_datasource)
+
+
 def is_url(url):
     """Check if a string is a valid URL to a network location.