From 11781e8160086ef28324ad45c7897161ff844a7c Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Fri, 17 Sep 2021 14:26:46 -0400 Subject: [PATCH] Python/Cython bindings for multibyte_split (#8998) Provides the Python/Cython bindings for #8702 multibyte_split. This PR depends on #8702 being merged first. Closes #8557 Authors: - Jeremy Dyer (https://github.com/jdye64) - Christopher Harris (https://github.com/cwharris) Approvers: - https://github.com/nvdbaranec - Vyas Ramasubramani (https://github.com/vyasr) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/8998 --- python/cudf/cudf/__init__.py | 1 + python/cudf/cudf/_lib/__init__.py | 1 + python/cudf/cudf/_lib/cpp/io/text.pxd | 27 +++++++++++++++ python/cudf/cudf/_lib/text.pyx | 39 ++++++++++++++++++++++ python/cudf/cudf/io/__init__.py | 1 + python/cudf/cudf/io/text.py | 28 ++++++++++++++++ python/cudf/cudf/tests/data/text/chess.pgn | 16 +++++++++ python/cudf/cudf/tests/test_text.py | 26 +++++++++++++++ python/cudf/cudf/utils/ioutils.py | 22 ++++++++++++ 9 files changed, 161 insertions(+) create mode 100644 python/cudf/cudf/_lib/cpp/io/text.pxd create mode 100644 python/cudf/cudf/_lib/text.pyx create mode 100644 python/cudf/cudf/io/text.py create mode 100644 python/cudf/cudf/tests/data/text/chess.pgn diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py index 0b0fecb48a3..2e4c1ccebd5 100644 --- a/python/cudf/cudf/__init__.py +++ b/python/cudf/cudf/__init__.py @@ -96,6 +96,7 @@ read_json, read_orc, read_parquet, + read_text, ) from cudf.utils.dtypes import _NA_REP from cudf.utils.utils import set_allocator diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 02f0444e413..aa0f90fd713 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -36,6 +36,7 @@ table, transpose, unary, + text, ) MAX_COLUMN_SIZE = np.iinfo(np.int32).max diff --git a/python/cudf/cudf/_lib/cpp/io/text.pxd 
b/python/cudf/cudf/_lib/cpp/io/text.pxd new file mode 100644 index 00000000000..9ce0c68cb08 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/io/text.pxd @@ -0,0 +1,27 @@ +# Copyright (c) 2020-2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +from cudf._lib.cpp.column.column cimport column + + +cdef extern from "cudf/io/text/data_chunk_source.hpp" \ + namespace "cudf::io::text" nogil: + + cdef cppclass data_chunk_source: + data_chunk_source() except + + +cdef extern from "cudf/io/text/data_chunk_source_factories.hpp" \ + namespace "cudf::io::text" nogil: + + unique_ptr[data_chunk_source] make_source(string data) except + + unique_ptr[data_chunk_source] \ + make_source_from_file(string filename) except + + + +cdef extern from "cudf/io/text/multibyte_split.hpp" \ + namespace "cudf::io::text" nogil: + + unique_ptr[column] multibyte_split(data_chunk_source source, + string delimiter) except + diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx new file mode 100644 index 00000000000..9f33f32bdaf --- /dev/null +++ b/python/cudf/cudf/_lib/text.pyx @@ -0,0 +1,39 @@ +# Copyright (c) 2020-2021, NVIDIA CORPORATION. + +import cudf + +from cython.operator cimport dereference +from libcpp.memory cimport make_unique, unique_ptr +from libcpp.string cimport string +from libcpp.utility cimport move + +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.io.text cimport ( + data_chunk_source, + make_source, + make_source_from_file, + multibyte_split, +) + + +def read_text(object filepaths_or_buffers, + object delimiter=None): + """ + Cython function to call into libcudf API, see `multibyte_split`. 
+ + See Also + -------- + cudf.io.text.read_text + """ + cdef string filename = filepaths_or_buffers.encode() + cdef string delim = delimiter.encode() + + cdef unique_ptr[data_chunk_source] datasource + cdef unique_ptr[column] c_col + + with nogil: + datasource = move(make_source_from_file(filename)) + c_col = move(multibyte_split(dereference(datasource), delim)) + + return {None: Column.from_unique_ptr(move(c_col))} diff --git a/python/cudf/cudf/io/__init__.py b/python/cudf/cudf/io/__init__.py index 8db29bdde99..15404b26042 100644 --- a/python/cudf/cudf/io/__init__.py +++ b/python/cudf/cudf/io/__init__.py @@ -12,3 +12,4 @@ read_parquet_metadata, write_to_dataset, ) +from cudf.io.text import read_text diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py new file mode 100644 index 00000000000..705645b8349 --- /dev/null +++ b/python/cudf/cudf/io/text.py @@ -0,0 +1,28 @@ +# Copyright (c) 2018-2021, NVIDIA CORPORATION. + +from io import BytesIO, StringIO + +from nvtx import annotate + +import cudf +from cudf._lib import text as libtext +from cudf.utils import ioutils + + +@annotate("READ_TEXT", color="purple", domain="cudf_python") +@ioutils.doc_read_text() +def read_text( + filepath_or_buffer, delimiter=None, **kwargs, +): + """{docstring}""" + + filepath_or_buffer, compression = ioutils.get_filepath_or_buffer( + path_or_data=filepath_or_buffer, + compression=None, + iotypes=(BytesIO, StringIO), + **kwargs, + ) + + return cudf.Series._from_data( + libtext.read_text(filepath_or_buffer, delimiter=delimiter,) + ) diff --git a/python/cudf/cudf/tests/data/text/chess.pgn b/python/cudf/cudf/tests/data/text/chess.pgn new file mode 100644 index 00000000000..6f516e5c640 --- /dev/null +++ b/python/cudf/cudf/tests/data/text/chess.pgn @@ -0,0 +1,16 @@ +[Event "F/S Return Match"] +[Site "Belgrade, Serbia JUG"] +[Date "1992.11.04"] +[Round "29"] +[White "Fischer, Robert J."] +[Black "Spassky, Boris V."] +[Result "1/2-1/2"] + +1. e4 e5 2. Nf3 Nc6 3. 
Bb5 a6 {This opening is called the Ruy Lopez.} +4. Ba4 Nf6 5. O-O Be7 6. Re1 b5 7. Bb3 d6 8. c3 O-O 9. h3 Nb8 10. d4 Nbd7 +11. c4 c6 12. cxb5 axb5 13. Nc3 Bb7 14. Bg5 b4 15. Nb1 h6 16. Bh4 c5 17. dxe5 +Nxe4 18. Bxe7 Qxe7 19. exd6 Qf6 20. Nbd2 Nxd6 21. Nc4 Nxc4 22. Bxc4 Nb6 +23. Ne5 Rae8 24. Bxf7+ Rxf7 25. Nxf7 Rxe1+ 26. Qxe1 Kxf7 27. Qe3 Qg5 28. Qxg5 +hxg5 29. b3 Ke6 30. a3 Kd6 31. axb4 cxb4 32. Ra5 Nd5 33. f3 Bc8 34. Kf2 Bf5 +35. Ra7 g6 36. Ra6+ Kc5 37. Ke1 Nf4 38. g3 Nxh3 39. Kd2 Kb5 40. Rd6 Kc5 41. Ra6 +Nf2 42. g4 Bd3 43. Re6 1/2-1/2 diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_text.py index d0b1ba0758e..6b81785c879 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/test_text.py @@ -8,6 +8,11 @@ from cudf.testing._utils import assert_eq +@pytest.fixture(scope="module") +def datadir(datadir): + return datadir / "text" + + def test_tokenize(): strings = cudf.Series( [ @@ -877,3 +882,24 @@ def test_is_vowel_consonant(): actual = strings.str.is_consonant(indices) assert type(expected) == type(actual) assert_eq(expected, actual) + + +def test_read_text(datadir): + chess_file = str(datadir) + "/chess.pgn" + delimiter = "1." 
+ + with open(chess_file, "r") as f: + content = f.read().split(delimiter) + + # Since Python split removes the delimiter and read_text does + # not, we need to add it back to the 'content' + expected = cudf.Series( + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] + ) + + actual = cudf.read_text(chess_file, delimiter=delimiter) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index af91db6a9e6..15cf50af817 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1012,6 +1012,28 @@ doc_kafka_datasource = docfmt_partial(docstring=_docstring_kafka_datasource) +_docstring_text_datasource = """ +Configuration object for a text Datasource + +Parameters +---------- +filepath_or_buffer : str, path object, or file-like object + Either a path to a file (a `str`, `pathlib.Path`, or + `py._path.local.LocalPath`), URL (including http, ftp, and S3 locations), + or any object with a `read()` method (such as builtin `open()` file handler + function or `StringIO`). +delimiter : string, default None, The delimiter that should be used + for splitting text chunks into separate cudf column rows. Currently + only a single delimiter is supported. + +Returns +------- +result : GPU ``Series`` + +""" +doc_read_text = docfmt_partial(docstring=_docstring_text_datasource) + + def is_url(url): """Check if a string is a valid URL to a network location.