From a09b62b7077dcc8e26e26aa77ef5030bc9cd3b64 Mon Sep 17 00:00:00 2001
From: David Wendt
Date: Wed, 24 May 2023 13:47:01 -0400
Subject: [PATCH 1/5] Separate io-text and nvtext pytests into different files

---
 .../tests/{test_text.py => test_nvtext.py}    | 166 +-----------------
 python/cudf/cudf/tests/test_text_io.py        | 164 +++++++++++++++++
 python/cudf/cudf/tests/text/__init__.py       |   0
 .../cudf/tests/text/test_subword_tokenizer.py |   1 -
 4 files changed, 168 insertions(+), 163 deletions(-)
 rename python/cudf/cudf/tests/{test_text.py => test_nvtext.py} (82%)
 create mode 100644 python/cudf/cudf/tests/test_text_io.py
 delete mode 100644 python/cudf/cudf/tests/text/__init__.py
 delete mode 100644 python/cudf/cudf/tests/text/test_subword_tokenizer.py

diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/test_nvtext.py
similarity index 82%
rename from python/cudf/cudf/tests/test_text.py
rename to python/cudf/cudf/tests/test_nvtext.py
index f0e0e52142f..d286c0036a2 100644
--- a/python/cudf/cudf/tests/test_text.py
+++ b/python/cudf/cudf/tests/test_nvtext.py
@@ -1,7 +1,5 @@
 # Copyright (c) 2019-2023, NVIDIA CORPORATION.
 
-from io import StringIO
-
 import numpy as np
 import pytest
 
@@ -9,11 +7,6 @@
 from cudf.testing._utils import assert_eq
 
 
-@pytest.fixture(scope="module")
-def datadir(datadir):
-    return datadir / "text"
-
-
 def test_tokenize():
     strings = cudf.Series(
         [
@@ -361,7 +354,7 @@ def test_character_tokenize_series():
             None,
             (
                 "goodbye, one-two:three~four+five_six@sev"
-                "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ"
+                "en#eight^nine heŒŽ'•™œ$µ¾ŤƠé DŽ"
             ),
         ]
     )
@@ -439,7 +432,7 @@ def test_character_tokenize_series():
             "e",
             "Œ",
             "Ž",
-            "‘",
+            "'",
             "•",
             "™",
             "œ",
@@ -480,7 +473,7 @@ def test_character_tokenize_index():
             None,
             (
                 "goodbye, one-two:three~four+five_six@sev"
-                "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ"
+                "en#eight^nine heŒŽ'•™œ$µ¾ŤƠé DŽ"
             ),
         ]
     )
@@ -558,7 +551,7 @@ def test_character_tokenize_index():
             "e",
             "Œ",
             "Ž",
-            "‘",
+            "'",
             "•",
             "™",
             "œ",
@@ -842,154 +835,3 @@ def test_minhash():
     with pytest.raises(ValueError):
         seeds = cudf.Series([0, 1, 2], dtype=np.int32)
         strings.str.minhash(seeds=seeds)
-
-
-def test_read_text(datadir):
-    chess_file = str(datadir) + "/chess.pgn"
-    delimiter = "1."
-
-    with open(chess_file) as f:
-        content = f.read().split(delimiter)
-
-    # Since Python split removes the delimiter and read_text does
-    # not we need to add it back to the 'content'
-    expected = cudf.Series(
-        [
-            c + delimiter if i < (len(content) - 1) else c
-            for i, c in enumerate(content)
-        ]
-    )
-
-    actual = cudf.read_text(chess_file, delimiter=delimiter)
-
-    assert_eq(expected, actual)
-
-
-def test_read_text_byte_range(datadir):
-    chess_file = str(datadir) + "/chess.pgn"
-    delimiter = "1."
-
-    with open(chess_file, "r") as f:
-        data = f.read()
-        content = data.split(delimiter)
-
-    # Since Python split removes the delimiter and read_text does
-    # not we need to add it back to the 'content'
-    expected = cudf.Series(
-        [
-            c + delimiter if i < (len(content) - 1) else c
-            for i, c in enumerate(content)
-        ]
-    )
-
-    byte_range_size = (len(data) // 3) + (len(data) % 3 != 0)
-
-    actual_0 = cudf.read_text(
-        chess_file,
-        delimiter=delimiter,
-        byte_range=[byte_range_size * 0, byte_range_size],
-    )
-    actual_1 = cudf.read_text(
-        chess_file,
-        delimiter=delimiter,
-        byte_range=[byte_range_size * 1, byte_range_size],
-    )
-    actual_2 = cudf.read_text(
-        chess_file,
-        delimiter=delimiter,
-        byte_range=[byte_range_size * 2, byte_range_size],
-    )
-
-    actual = cudf.concat([actual_0, actual_1, actual_2], ignore_index=True)
-
-    assert_eq(expected, actual)
-
-
-def test_read_text_byte_range_large(tmpdir):
-    content = "".join(("\n" if x % 5 == 4 else "x") for x in range(0, 3000))
-    delimiter = "\n"
-    temp_file = str(tmpdir) + "/temp.txt"
-
-    with open(temp_file, "w") as f:
-        f.write(content)
-
-    expected = cudf.Series(["xxxx\n" for i in range(0, 200)])
-
-    actual = cudf.read_text(
-        temp_file, delimiter=delimiter, byte_range=[1000, 1000]
-    )
-
-    assert_eq(expected, actual)
-
-
-def test_read_text_in_memory(datadir):
-    # Since Python split removes the delimiter and read_text does
-    # not we need to add it back to the 'content'
-    expected = cudf.Series(["x::", "y::", "z"])
-
-    actual = cudf.read_text(StringIO("x::y::z"), delimiter="::")
-
-    assert_eq(expected, actual)
-
-
-def test_read_text_in_memory_strip_delimiter(datadir):
-    # Since Python split removes the delimiter and read_text does
-    # not we need to add it back to the 'content'
-    expected = cudf.Series(["x", "y", "z"])
-
-    actual = cudf.read_text(
-        StringIO("x::y::z"), delimiter="::", strip_delimiters=True
-    )
-
-    assert_eq(expected, actual)
-
-
-def test_read_text_bgzip(datadir):
-    chess_file_compressed = str(datadir) + "/chess.pgn.gz"
-    chess_file = str(datadir) + "/chess.pgn"
-    delimiter = "1."
-
-    with open(chess_file) as f:
-        content = f.read().split(delimiter)
-
-    # Since Python split removes the delimiter and read_text does
-    # not we need to add it back to the 'content'
-    expected = cudf.Series(
-        [
-            c + delimiter if i < (len(content) - 1) else c
-            for i, c in enumerate(content)
-        ]
-    )
-
-    actual = cudf.read_text(
-        chess_file_compressed, compression="bgzip", delimiter=delimiter
-    )
-
-    assert_eq(expected, actual)
-
-
-def test_read_text_bgzip_offsets(datadir):
-    chess_file_compressed = str(datadir) + "/chess.pgn.gz"
-    chess_file = str(datadir) + "/chess.pgn"
-    delimiter = "1."
-
-    with open(chess_file) as f:
-        content = f.read()[29:695].split(delimiter)
-
-    # Since Python split removes the delimiter and read_text does
-    # not we need to add it back to the 'content'
-    expected = cudf.Series(
-        [
-            c + delimiter if i < (len(content) - 1) else c
-            for i, c in enumerate(content)
-        ]
-    )
-
-    actual = cudf.read_text(
-        chess_file_compressed,
-        compression="bgzip",
-        compression_offsets=[58 * 2**16 + 2, 781 * 2**16 + 7],
-        delimiter=delimiter,
-    )
-
-    assert_eq(expected, actual)
diff --git a/python/cudf/cudf/tests/test_text_io.py b/python/cudf/cudf/tests/test_text_io.py
new file mode 100644
index 00000000000..acba13bb5b0
--- /dev/null
+++ b/python/cudf/cudf/tests/test_text_io.py
@@ -0,0 +1,164 @@
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+
+from io import StringIO
+
+import pytest
+
+import cudf
+from cudf.testing._utils import assert_eq
+
+
+@pytest.fixture(scope="module")
+def datadir(datadir):
+    return datadir / "text"
+
+
+def test_read_text(datadir):
+    chess_file = str(datadir) + "/chess.pgn"
+    delimiter = "1."
+
+    with open(chess_file) as f:
+        content = f.read().split(delimiter)
+
+    # Since Python split removes the delimiter and read_text does
+    # not we need to add it back to the 'content'
+    expected = cudf.Series(
+        [
+            c + delimiter if i < (len(content) - 1) else c
+            for i, c in enumerate(content)
+        ]
+    )
+
+    actual = cudf.read_text(chess_file, delimiter=delimiter)
+
+    assert_eq(expected, actual)
+
+
+def test_read_text_byte_range(datadir):
+    chess_file = str(datadir) + "/chess.pgn"
+    delimiter = "1."
+
+    with open(chess_file, "r") as f:
+        data = f.read()
+        content = data.split(delimiter)
+
+    # Since Python split removes the delimiter and read_text does
+    # not we need to add it back to the 'content'
+    expected = cudf.Series(
+        [
+            c + delimiter if i < (len(content) - 1) else c
+            for i, c in enumerate(content)
+        ]
+    )
+
+    byte_range_size = (len(data) // 3) + (len(data) % 3 != 0)
+
+    actual_0 = cudf.read_text(
+        chess_file,
+        delimiter=delimiter,
+        byte_range=[byte_range_size * 0, byte_range_size],
+    )
+    actual_1 = cudf.read_text(
+        chess_file,
+        delimiter=delimiter,
+        byte_range=[byte_range_size * 1, byte_range_size],
+    )
+    actual_2 = cudf.read_text(
+        chess_file,
+        delimiter=delimiter,
+        byte_range=[byte_range_size * 2, byte_range_size],
+    )
+
+    actual = cudf.concat([actual_0, actual_1, actual_2], ignore_index=True)
+
+    assert_eq(expected, actual)
+
+
+def test_read_text_byte_range_large(tmpdir):
+    content = "".join(("\n" if x % 5 == 4 else "x") for x in range(0, 3000))
+    delimiter = "\n"
+    temp_file = str(tmpdir) + "/temp.txt"
+
+    with open(temp_file, "w") as f:
+        f.write(content)
+
+    expected = cudf.Series(["xxxx\n" for i in range(0, 200)])
+
+    actual = cudf.read_text(
+        temp_file, delimiter=delimiter, byte_range=[1000, 1000]
+    )
+
+    assert_eq(expected, actual)
+
+
+def test_read_text_in_memory(datadir):
+    # Since Python split removes the delimiter and read_text does
+    # not we need to add it back to the 'content'
+    expected = cudf.Series(["x::", "y::", "z"])
+
+    actual = cudf.read_text(StringIO("x::y::z"), delimiter="::")
+
+    assert_eq(expected, actual)
+
+
+def test_read_text_in_memory_strip_delimiter(datadir):
+    # Since Python split removes the delimiter and read_text does
+    # not we need to add it back to the 'content'
+    expected = cudf.Series(["x", "y", "z"])
+
+    actual = cudf.read_text(
+        StringIO("x::y::z"), delimiter="::", strip_delimiters=True
+    )
+
+    assert_eq(expected, actual)
+
+
+def test_read_text_bgzip(datadir):
+    chess_file_compressed = str(datadir) + "/chess.pgn.gz"
+    chess_file = str(datadir) + "/chess.pgn"
+    delimiter = "1."
+
+    with open(chess_file) as f:
+        content = f.read().split(delimiter)
+
+    # Since Python split removes the delimiter and read_text does
+    # not we need to add it back to the 'content'
+    expected = cudf.Series(
+        [
+            c + delimiter if i < (len(content) - 1) else c
+            for i, c in enumerate(content)
+        ]
+    )
+
+    actual = cudf.read_text(
+        chess_file_compressed, compression="bgzip", delimiter=delimiter
+    )
+
+    assert_eq(expected, actual)
+
+
+def test_read_text_bgzip_offsets(datadir):
+    chess_file_compressed = str(datadir) + "/chess.pgn.gz"
+    chess_file = str(datadir) + "/chess.pgn"
+    delimiter = "1."
+
+    with open(chess_file) as f:
+        content = f.read()[29:695].split(delimiter)
+
+    # Since Python split removes the delimiter and read_text does
+    # not we need to add it back to the 'content'
+    expected = cudf.Series(
+        [
+            c + delimiter if i < (len(content) - 1) else c
+            for i, c in enumerate(content)
+        ]
+    )
+
+    actual = cudf.read_text(
+        chess_file_compressed,
+        compression="bgzip",
+        compression_offsets=[58 * 2**16 + 2, 781 * 2**16 + 7],
+        delimiter=delimiter,
+    )
+
+    assert_eq(expected, actual)
diff --git a/python/cudf/cudf/tests/text/__init__.py b/python/cudf/cudf/tests/text/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/tests/text/test_subword_tokenizer.py b/python/cudf/cudf/tests/text/test_subword_tokenizer.py
deleted file mode 100644
index 06777c8e6af..00000000000
--- a/python/cudf/cudf/tests/text/test_subword_tokenizer.py
+++ /dev/null
@@ -1 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.

From 2c7eb000458215542523e2ee14b94fab3070177d Mon Sep 17 00:00:00 2001
From: David Wendt
Date: Wed, 24 May 2023 16:26:17 -0400
Subject: [PATCH 2/5] fix smartquote

---
 .pre-commit-config.yaml               | 2 +-
 python/cudf/cudf/tests/test_nvtext.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0ac54113278..ce531205a2a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -85,7 +85,7 @@ repos:
           (?x)^(
             ^cpp/include/cudf_test/cxxopts.hpp|
             ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*|
-            ^python/cudf/cudf/tests/test_text.py
+            ^python/cudf/cudf/tests/test_nvtext.py
           )
 - repo: local
   hooks:
diff --git a/python/cudf/cudf/tests/test_nvtext.py b/python/cudf/cudf/tests/test_nvtext.py
index d286c0036a2..d0124ff400f 100644
--- a/python/cudf/cudf/tests/test_nvtext.py
+++ b/python/cudf/cudf/tests/test_nvtext.py
@@ -354,7 +354,7 @@ def test_character_tokenize_series():
             None,
             (
                 "goodbye, one-two:three~four+five_six@sev"
-                "en#eight^nine heŒŽ'•™œ$µ¾ŤƠé DŽ"
+                "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ"
             ),
         ]
     )
@@ -432,7 +432,7 @@ def test_character_tokenize_series():
             "e",
             "Œ",
             "Ž",
-            "'",
+            "‘",
             "•",
             "™",
             "œ",
@@ -473,7 +473,7 @@ def test_character_tokenize_index():
             None,
             (
                 "goodbye, one-two:three~four+five_six@sev"
-                "en#eight^nine heŒŽ'•™œ$µ¾ŤƠé DŽ"
+                "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ"
             ),
         ]
     )

From 961a9281b37f2ad9c837bbb37b9b4b37211f2656 Mon Sep 17 00:00:00 2001
From: David Wendt
Date: Wed, 24 May 2023 16:28:07 -0400
Subject: [PATCH 3/5] missed a quote

---
 python/cudf/cudf/tests/test_nvtext.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_nvtext.py b/python/cudf/cudf/tests/test_nvtext.py
index d0124ff400f..ea789b99220 100644
--- a/python/cudf/cudf/tests/test_nvtext.py
+++ b/python/cudf/cudf/tests/test_nvtext.py
@@ -551,7 +551,7 @@ def test_character_tokenize_index():
             "e",
             "Œ",
             "Ž",
-            "'",
+            "‘",
             "•",
             "™",
             "œ",

From 39f56f9836c4cf0365331bac1572eaa9b145195d Mon Sep 17 00:00:00 2001
From: David Wendt
Date: Wed, 24 May 2023 16:55:35 -0400
Subject: [PATCH 4/5] move files to sub-directories

---
 .pre-commit-config.yaml                       |   2 +-
 .../cudf/cudf/tests/input_output/test_text.py | 165 +++-
 .../cudf/tests/strings/test_string_methods.py | 838 +++++++++++++++++-
 python/cudf/cudf/tests/test_nvtext.py         | 837 -----------------
 python/cudf/cudf/tests/test_text_io.py        | 164 ----
 5 files changed, 1002 insertions(+), 1004 deletions(-)
 delete mode 100644 python/cudf/cudf/tests/test_nvtext.py
 delete mode 100644 python/cudf/cudf/tests/test_text_io.py

diff --git 
a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ce531205a2a..10e68ea0757 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -85,7 +85,7 @@ repos: (?x)^( ^cpp/include/cudf_test/cxxopts.hpp| ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*| - ^python/cudf/cudf/tests/test_nvtext.py + ^python/cudf/cudf/tests/strings/test_string_methods.py ) - repo: local hooks: diff --git a/python/cudf/cudf/tests/input_output/test_text.py b/python/cudf/cudf/tests/input_output/test_text.py index 06777c8e6af..acba13bb5b0 100644 --- a/python/cudf/cudf/tests/input_output/test_text.py +++ b/python/cudf/cudf/tests/input_output/test_text.py @@ -1 +1,164 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. + +from io import StringIO + +import pytest + +import cudf +from cudf.testing._utils import assert_eq + + +@pytest.fixture(scope="module") +def datadir(datadir): + return datadir / "text" + + +def test_read_text(datadir): + chess_file = str(datadir) + "/chess.pgn" + delimiter = "1." + + with open(chess_file) as f: + content = f.read().split(delimiter) + + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series( + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] + ) + + actual = cudf.read_text(chess_file, delimiter=delimiter) + + assert_eq(expected, actual) + + +def test_read_text_byte_range(datadir): + chess_file = str(datadir) + "/chess.pgn" + delimiter = "1." + + with open(chess_file, "r") as f: + data = f.read() + content = data.split(delimiter) + + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series( + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] + ) + + byte_range_size = (len(data) // 3) + (len(data) % 3 != 0) + + actual_0 = cudf.read_text( + chess_file, + delimiter=delimiter, + byte_range=[byte_range_size * 0, byte_range_size], + ) + actual_1 = cudf.read_text( + chess_file, + delimiter=delimiter, + byte_range=[byte_range_size * 1, byte_range_size], + ) + actual_2 = cudf.read_text( + chess_file, + delimiter=delimiter, + byte_range=[byte_range_size * 2, byte_range_size], + ) + + actual = cudf.concat([actual_0, actual_1, actual_2], ignore_index=True) + + assert_eq(expected, actual) + + +def test_read_text_byte_range_large(tmpdir): + content = "".join(("\n" if x % 5 == 4 else "x") for x in range(0, 3000)) + delimiter = "\n" + temp_file = str(tmpdir) + "/temp.txt" + + with open(temp_file, "w") as f: + f.write(content) + + expected = cudf.Series(["xxxx\n" for i in range(0, 200)]) + + actual = cudf.read_text( + temp_file, delimiter=delimiter, byte_range=[1000, 1000] + ) + + assert_eq(expected, actual) + + +def test_read_text_in_memory(datadir): + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series(["x::", "y::", "z"]) + + actual = cudf.read_text(StringIO("x::y::z"), delimiter="::") + + assert_eq(expected, actual) + + +def test_read_text_in_memory_strip_delimiter(datadir): + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series(["x", "y", "z"]) + + actual = cudf.read_text( + StringIO("x::y::z"), delimiter="::", strip_delimiters=True + ) + + assert_eq(expected, actual) + + +def test_read_text_bgzip(datadir): + chess_file_compressed = 
str(datadir) + "/chess.pgn.gz" + chess_file = str(datadir) + "/chess.pgn" + delimiter = "1." + + with open(chess_file) as f: + content = f.read().split(delimiter) + + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series( + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] + ) + + actual = cudf.read_text( + chess_file_compressed, compression="bgzip", delimiter=delimiter + ) + + assert_eq(expected, actual) + + +def test_read_text_bgzip_offsets(datadir): + chess_file_compressed = str(datadir) + "/chess.pgn.gz" + chess_file = str(datadir) + "/chess.pgn" + delimiter = "1." + + with open(chess_file) as f: + content = f.read()[29:695].split(delimiter) + + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series( + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] + ) + + actual = cudf.read_text( + chess_file_compressed, + compression="bgzip", + compression_offsets=[58 * 2**16 + 2, 781 * 2**16 + 7], + delimiter=delimiter, + ) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/strings/test_string_methods.py b/python/cudf/cudf/tests/strings/test_string_methods.py index 06777c8e6af..ea789b99220 100644 --- a/python/cudf/cudf/tests/strings/test_string_methods.py +++ b/python/cudf/cudf/tests/strings/test_string_methods.py @@ -1 +1,837 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. + +import numpy as np +import pytest + +import cudf +from cudf.testing._utils import assert_eq + + +def test_tokenize(): + strings = cudf.Series( + [ + "the quick fox jumped over the lazy dog", + "the siamésé cat jumped under the sofa", + None, + "", + ] + ) + + expected_values = cudf.Series( + [ + "the", + "quick", + "fox", + "jumped", + "over", + "the", + "lazy", + "dog", + "the", + "siamésé", + "cat", + "jumped", + "under", + "the", + "sofa", + ] + ) + expected_index = strings.index.repeat(strings.str.token_count()) + expected = cudf.Series(expected_values, index=expected_index) + + actual = strings.str.tokenize() + + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +def test_tokenize_delimiter(): + strings = cudf.Series( + [ + "the quick fox jumped over the lazy dog", + "the siamésé cat jumped under the sofa", + None, + "", + ] + ) + + expected_values = cudf.Series( + [ + "the quick f", + "x jumped ", + "ver the lazy d", + "g", + "the siamésé cat jumped under the s", + "fa", + ] + ) + expected_index = strings.index.repeat(strings.str.token_count("o")) + expected = cudf.Series(expected_values, index=expected_index) + + actual = strings.str.tokenize(delimiter="o") + + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +def test_detokenize(): + strings = cudf.Series( + [ + "the", + "quick", + "fox", + "jumped", + "over", + "the", + "lazy", + "dog", + "the", + "siamésé", + "cat", + "jumped", + "under", + "the", + "sofa", + ] + ) + + indices = cudf.Series([0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]) + actual = strings.str.detokenize(indices) + expected = cudf.Series( + [ + "the quick fox", + "jumped over", + "the lazy dog", + "the siamésé cat jumped under the sofa", + ] + ) + assert type(expected) == type(actual) + assert_eq(expected, actual) + + indices = cudf.Series( + [4, 0, 0, 0, 0, 4, 1, 1, 4, 2, 2, 2, 2, 4, 3], dtype=np.int8 + ) + actual = strings.str.detokenize(indices, "+") + 
expected = cudf.Series( + [ + "quick+fox+jumped+over", + "lazy+dog", + "siamésé+cat+jumped+under", + "sofa", + "the+the+the+the", + ] + ) + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "delimiter, expected_token_counts", + [ + ("", [10, 9, 0, 0, 5]), + ("o", [6, 3, 0, 0, 1]), + (["a", "e", "i", "o", "u"], [13, 13, 0, 0, 6]), + (["a", "e", "i", "o"], [12, 11, 0, 0, 6]), + ], +) +def test_token_count(delimiter, expected_token_counts): + strings = cudf.Series( + [ + "the quick brown fox jumped over the lazy brown dog", + "the sable siamésé cat jumped under the brown sofa", + None, + "", + "test_str\x01test_str\x02test_str\x03test_str\x04test_str\x05", + ] + ) + + expected = cudf.Series(expected_token_counts) + + actual = strings.str.token_count(delimiter) + + assert type(expected) == type(actual) + assert_eq(expected, actual, check_dtype=False) + + +def test_normalize_spaces(): + strings = cudf.Series( + [ + " the\t quick fox jumped over the lazy dog", + "the siamésé cat\f jumped\t\tunder the sofa ", + None, + "", + ] + ) + expected = cudf.Series( + [ + "the quick fox jumped over the lazy dog", + "the siamésé cat jumped under the sofa", + None, + "", + ] + ) + + actual = strings.str.normalize_spaces() + + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +def test_normalize_characters(): + strings = cudf.Series( + ["乾 \t 乿", "ĂĆCĖÑTÜATE", "âscénd, Descend", "", None, "Stock^ $1"] + ) + expected = cudf.Series( + [ + " 乾 乿 ", + "accentuate", + "ascend , descend", + "", + None, + "stock ^ $ 1", + ] + ) + + actual = strings.str.normalize_characters() + assert type(expected) == type(actual) + assert_eq(expected, actual) + + expected = cudf.Series( + [ + " 乾 乿 ", + "ĂĆCĖÑTÜATE", + "âscénd , Descend", + "", + None, + "Stock ^ $ 1", + ] + ) + actual = strings.str.normalize_characters(do_lower=False) + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "n, separator, expected_values", + [ + ( + 2, + "_", + [ + "this_is", + "is_my", + "my_favorite", + "favorite_book", + "book_on", + "on_my", + "my_bookshelf", + ], + ), + ( + 3, + "-", + [ + "this-is-my", + "is-my-favorite", + "my-favorite-book", + "favorite-book-on", + "book-on-my", + "on-my-bookshelf", + ], + ), + ], +) +def test_ngrams(n, separator, expected_values): + strings = cudf.Series( + ["this", "is", "my", "favorite", "book", "on", "my", "bookshelf"] + ) + + expected = cudf.Series(expected_values) + + actual = strings.str.ngrams(n=n, separator=separator) + + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "n, expected_values, expected_index, as_list", + [ + ( + 2, + [ + "th", + "hi", + "is", + "is", + "my", + "bo", + "oo", + "ok", + "he", + "er", + "re", + cudf.NA, + ], + [1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5, 6], + False, + ), + ( + 3, + [ + "thi", + "his", + cudf.NA, + cudf.NA, + "boo", + "ook", + "her", + "ere", + cudf.NA, + ], + [1, 1, 2, 3, 4, 4, 5, 5, 6], + False, + ), + ( + 3, + [["thi", "his"], [], [], ["boo", "ook"], ["her", "ere"], []], + [1, 2, 3, 4, 5, 6], + True, + ), + ], +) +def test_character_ngrams(n, expected_values, expected_index, as_list): + strings = cudf.Series( + ["this", "is", "my", "book", "here", ""], index=[1, 2, 3, 4, 5, 6] + ) + + expected = cudf.Series(expected_values, index=expected_index) + + actual = strings.str.character_ngrams(n=n, as_list=as_list) + + assert type(expected) == type(actual) + assert_eq(expected, actual) + + 
+@pytest.mark.parametrize( + "n, separator, expected_values", + [ + ( + 2, + "_", + [ + "this_is", + "is_my", + "my_favorite", + "book_on", + "on_my", + "my_bookshelf", + ], + ), + ( + 3, + "-", + ["this-is-my", "is-my-favorite", "book-on-my", "on-my-bookshelf"], + ), + ], +) +def test_ngrams_tokenize(n, separator, expected_values): + strings = cudf.Series(["this is my favorite", "book on my bookshelf"]) + + expected = cudf.Series(expected_values) + + actual = strings.str.ngrams_tokenize(n=n, separator=separator) + + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +def test_character_tokenize_series(): + sr = cudf.Series( + [ + "hello world", + "sdf", + None, + ( + "goodbye, one-two:three~four+five_six@sev" + "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ" + ), + ] + ) + expected_values = cudf.Series( + [ + "h", + "e", + "l", + "l", + "o", + " ", + "w", + "o", + "r", + "l", + "d", + "s", + "d", + "f", + "g", + "o", + "o", + "d", + "b", + "y", + "e", + ",", + " ", + "o", + "n", + "e", + "-", + "t", + "w", + "o", + ":", + "t", + "h", + "r", + "e", + "e", + "~", + "f", + "o", + "u", + "r", + "+", + "f", + "i", + "v", + "e", + "_", + "s", + "i", + "x", + "@", + "s", + "e", + "v", + "e", + "n", + "#", + "e", + "i", + "g", + "h", + "t", + "^", + "n", + "i", + "n", + "e", + " ", + "h", + "e", + "Œ", + "Ž", + "‘", + "•", + "™", + "œ", + "$", + "µ", + "¾", + "Ť", + "Ơ", + "é", + " ", + "DŽ", + ] + ) + expected_index = sr.index.repeat(sr.str.len().fillna(0)) + expected = cudf.Series(expected_values, index=expected_index) + + actual = sr.str.character_tokenize() + assert_eq(expected, actual) + + sr = cudf.Series([""]) + expected = cudf.Series([], dtype="object") + + actual = sr.str.character_tokenize() + assert_eq(expected, actual) + + sr = cudf.Series(["a"]) + expected = cudf.Series(["a"]) + + actual = sr.str.character_tokenize() + assert_eq(expected, actual) + + +def test_character_tokenize_index(): + sr = cudf.core.index.as_index( + [ + "hello world", + "sdf", + None, + ( + "goodbye, one-two:three~four+five_six@sev" + "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ" + ), + ] + ) + expected = cudf.core.index.as_index( + [ + "h", + "e", + "l", + "l", + "o", + " ", + "w", + "o", + "r", + "l", + "d", + "s", + "d", + "f", + "g", + "o", + "o", + "d", + "b", + "y", + "e", + ",", + " ", + "o", + "n", + "e", + "-", + "t", + "w", + "o", + ":", + "t", + "h", + "r", + "e", + "e", + "~", + "f", + "o", + "u", + "r", + "+", + "f", + "i", + "v", + "e", + "_", + "s", + "i", + "x", + "@", + "s", + "e", + "v", + "e", + "n", + "#", + "e", + "i", + "g", + "h", + "t", + "^", + "n", + "i", + "n", + "e", + " ", + "h", + "e", + "Œ", + "Ž", + "‘", + "•", + "™", + "œ", + "$", + "µ", + "¾", + "Ť", + "Ơ", + "é", + " ", + "DŽ", + ] + ) + + actual = sr.str.character_tokenize() + assert_eq(expected, actual) + + sr = cudf.Index([""]) + expected = cudf.Index([], dtype="object") + + actual = sr.str.character_tokenize() + assert_eq(expected, actual) + + sr = cudf.core.index.as_index(["a"]) + expected = cudf.core.index.as_index(["a"]) + + actual = sr.str.character_tokenize() + assert_eq(expected, actual) + + +def test_text_replace_tokens(): + sr = cudf.Series(["this is me", "theme music", ""]) + targets = cudf.Series(["is", "me"]) + + expected = cudf.Series(["this _ _", "theme music", ""]) + actual = sr.str.replace_tokens(targets, "_") + + assert_eq(expected, actual) + + replacements = cudf.Series(["IS", "ME"]) + expected = cudf.Series(["this IS ME", "theme music", ""]) + actual = sr.str.replace_tokens(targets, replacements) + + 
assert_eq(expected, actual) + + sr = cudf.Series( + [ + "this is a small text ☕", + "this \t\t is ; ; - + a looooooooooonnnnnnnggggggg text \n\t", + "emptyme", + ], + ) + targets = cudf.Series( + ["a", "☕", "\t", "looooooooooonnnnnnnggggggg", "emptyme"] + ) + replacements = cudf.Series(["the", "🚒", "🚒🚒🚒🚒", "🔥🔥", ""]) + + expected = cudf.Series( + [ + "this is the small text 🚒", + "this \t\t is ; ; - + the 🔥🔥 text \n\t", + "", + ] + ) + actual = sr.str.replace_tokens(targets, replacements) + + assert_eq(expected, actual) + + sr = cudf.Series( + ["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"] + ) + targets = cudf.Series(["🌬", "🔥", "🌊"]) + replacements = "🚰" + + expected = cudf.Series( + ["All-we-need;is;🚰", "\tall-we-need0is;🚰", "all;we:need+is;🚰"] + ) + actual = sr.str.replace_tokens(targets, replacements, delimiter=";") + + assert_eq(expected, actual) + assert_eq(sr, sr.str.replace_tokens(targets, replacements)) + assert_eq(sr, sr.str.replace_tokens([""], [""])) + + +def test_text_replace_tokens_error_cases(): + sr = cudf.Series(["this is me", "theme music", ""]) + + with pytest.raises( + TypeError, + match="targets should be an array-like or a Series object, " + "found ", + ): + sr.str.replace_tokens("me", ["a"]) + + with pytest.raises( + ValueError, + match="targets and replacements should be same size" + " sequences unless replacements is a string.", + ): + sr.str.replace_tokens(["a"], ["me", "ki"]) + + with pytest.raises( + TypeError, + match="replacements should be an str, array-like or Series object," + " found ", + ): + sr.str.replace_tokens(["a"], {"s"}) + + with pytest.raises( + TypeError, + match="Type of delimiter should be a string, found ", + ): + sr.str.replace_tokens(["a"], ["s"], delimiter=["a", "b"]) + + +def test_text_filter_tokens(): + sr = cudf.Series(["the quick brown fox jumped", "over the lazy dog", ""]) + + expected = cudf.Series([" quick brown jumped", " ", ""]) + actual = sr.str.filter_tokens(5) + assert_eq(expected, actual) + + expected = cudf.Series(["🔥 quick brown 🔥 jumped", "🔥 🔥 🔥 🔥", ""]) + actual = sr.str.filter_tokens(5, "🔥") + assert_eq(expected, actual) + + sr = cudf.Series( + ["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"] + ) + expected = cudf.Series( + ["All-we-need;is;--", "\tall-we-need0is;--", "all;we:need+is;--"] + ) + actual = sr.str.filter_tokens(2, "--", ";") + assert_eq(expected, actual) + + assert_eq(sr, sr.str.filter_tokens(1)) + + +def test_text_filter_tokens_error_cases(): + sr = cudf.Series(["abc", "def", ""]) + + with pytest.raises( + TypeError, + match="Type of replacement should be a string, found ", + ): + sr.str.filter_tokens(3, replacement=["a", "b"]) + + with pytest.raises( + TypeError, + match="Type of delimiter should be a string, found ", + ): + sr.str.filter_tokens(3, delimiter=["a", "b"]) + + +def test_edit_distance(): + sr = cudf.Series(["kitten", "saturday", "address", "book"]) + tg = cudf.Series(["sitting", "sunday", "addressee", "back"]) + + expected = cudf.Series([3, 3, 2, 2], dtype=np.int32) + actual = sr.str.edit_distance(tg) + assert_eq(expected, actual) + + expected = cudf.Series([0, 7, 6, 6], dtype=np.int32) + actual = sr.str.edit_distance("kitten") + assert_eq(expected, actual) + + +def test_edit_distance_matrix(): + # normal + sr = cudf.Series(["rounded", "bounded", "bounce", "trounce", "ounce"]) + + expected = cudf.Series( + [ + [0, 1, 3, 3, 3], + [1, 0, 2, 4, 3], + [3, 2, 0, 2, 1], + [3, 4, 2, 0, 2], + [3, 3, 1, 2, 0], + ] + ) + got = sr.str.edit_distance_matrix() + + 
assert_eq(expected, got, check_dtype=False) + + # 1-row series + sr2 = cudf.Series(["x"]) + with pytest.raises(ValueError, match="Require size >= 2"): + sr2.str.edit_distance_matrix() + + # null rows + sr3 = cudf.Series(["rounded", None, "bounce", "trounce", "ounce"]) + with pytest.raises(ValueError, match="Cannot compute"): + sr3.str.edit_distance_matrix() + + +def test_porter_stemmer_measure(): + strings = cudf.Series( + [ + "tr", + "ee", + "tree", + "y", + "by", + "trouble", + "oats", + "trees", + "ivy", + "troubles", + "private", + "oaten", + "orrery", + None, + "", + ] + ) + expected = cudf.Series( + [0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, None, 0], dtype=np.int32 + ) + + actual = strings.str.porter_stemmer_measure() + + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +def test_is_vowel_consonant(): + strings = cudf.Series( + ["tr", "ee", "tree", "y", "by", "oats", "ivy", "orrery", None, ""] + ) + expected = cudf.Series( + [False, False, True, False, False, False, True, False, None, False] + ) + actual = strings.str.is_vowel(2) + assert type(expected) == type(actual) + assert_eq(expected, actual) + + expected = cudf.Series( + [True, False, True, False, False, False, True, True, None, False] + ) + actual = strings.str.is_consonant(1) + assert type(expected) == type(actual) + assert_eq(expected, actual) + + indices = cudf.Series([2, 1, 0, 0, 1, 2, 0, 3, 0, 0]) + expected = cudf.Series( + [False, True, False, False, True, False, True, True, None, False] + ) + actual = strings.str.is_vowel(indices) + assert type(expected) == type(actual) + assert_eq(expected, actual) + + expected = cudf.Series( + [False, False, True, True, False, True, False, False, None, False] + ) + actual = strings.str.is_consonant(indices) + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +def test_minhash(): + strings = cudf.Series(["this is my", "favorite book", None, ""]) + expected = cudf.Series([21141582, 962346254, None, 0], dtype=np.uint32) + actual = strings.str.minhash() + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + expected = cudf.Series( + [ + cudf.Series([1305480167, 668155704, 34311509], dtype=np.uint32), + cudf.Series([32665384, 3470118, 363147162], dtype=np.uint32), + None, + cudf.Series([0, 0, 0], dtype=np.uint32), + ] + ) + actual = strings.str.minhash(seeds=seeds, n=5) + assert_eq(expected, actual) + + with pytest.raises(ValueError): + strings.str.minhash(seeds=7) + with pytest.raises(ValueError): + strings.str.minhash(seeds=seeds, method="md5") + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.int32) + strings.str.minhash(seeds=seeds) diff --git a/python/cudf/cudf/tests/test_nvtext.py b/python/cudf/cudf/tests/test_nvtext.py deleted file mode 100644 index ea789b99220..00000000000 --- a/python/cudf/cudf/tests/test_nvtext.py +++ /dev/null @@ -1,837 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. 
- -import numpy as np -import pytest - -import cudf -from cudf.testing._utils import assert_eq - - -def test_tokenize(): - strings = cudf.Series( - [ - "the quick fox jumped over the lazy dog", - "the siamésé cat jumped under the sofa", - None, - "", - ] - ) - - expected_values = cudf.Series( - [ - "the", - "quick", - "fox", - "jumped", - "over", - "the", - "lazy", - "dog", - "the", - "siamésé", - "cat", - "jumped", - "under", - "the", - "sofa", - ] - ) - expected_index = strings.index.repeat(strings.str.token_count()) - expected = cudf.Series(expected_values, index=expected_index) - - actual = strings.str.tokenize() - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_tokenize_delimiter(): - strings = cudf.Series( - [ - "the quick fox jumped over the lazy dog", - "the siamésé cat jumped under the sofa", - None, - "", - ] - ) - - expected_values = cudf.Series( - [ - "the quick f", - "x jumped ", - "ver the lazy d", - "g", - "the siamésé cat jumped under the s", - "fa", - ] - ) - expected_index = strings.index.repeat(strings.str.token_count("o")) - expected = cudf.Series(expected_values, index=expected_index) - - actual = strings.str.tokenize(delimiter="o") - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_detokenize(): - strings = cudf.Series( - [ - "the", - "quick", - "fox", - "jumped", - "over", - "the", - "lazy", - "dog", - "the", - "siamésé", - "cat", - "jumped", - "under", - "the", - "sofa", - ] - ) - - indices = cudf.Series([0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]) - actual = strings.str.detokenize(indices) - expected = cudf.Series( - [ - "the quick fox", - "jumped over", - "the lazy dog", - "the siamésé cat jumped under the sofa", - ] - ) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - indices = cudf.Series( - [4, 0, 0, 0, 0, 4, 1, 1, 4, 2, 2, 2, 2, 4, 3], dtype=np.int8 - ) - actual = strings.str.detokenize(indices, "+") - expected = cudf.Series( - [ - "quick+fox+jumped+over", - "lazy+dog", - "siamésé+cat+jumped+under", - "sofa", - "the+the+the+the", - ] - ) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "delimiter, expected_token_counts", - [ - ("", [10, 9, 0, 0, 5]), - ("o", [6, 3, 0, 0, 1]), - (["a", "e", "i", "o", "u"], [13, 13, 0, 0, 6]), - (["a", "e", "i", "o"], [12, 11, 0, 0, 6]), - ], -) -def test_token_count(delimiter, expected_token_counts): - strings = cudf.Series( - [ - "the quick brown fox jumped over the lazy brown dog", - "the sable siamésé cat jumped under the brown sofa", - None, - "", - "test_str\x01test_str\x02test_str\x03test_str\x04test_str\x05", - ] - ) - - expected = cudf.Series(expected_token_counts) - - actual = strings.str.token_count(delimiter) - - assert type(expected) == type(actual) - assert_eq(expected, actual, check_dtype=False) - - -def test_normalize_spaces(): - strings = cudf.Series( - [ - " the\t quick fox jumped over the lazy dog", - "the siamésé cat\f jumped\t\tunder the sofa ", - None, - "", - ] - ) - expected = cudf.Series( - [ - "the quick fox jumped over the lazy dog", - "the siamésé cat jumped under the sofa", - None, - "", - ] - ) - - actual = strings.str.normalize_spaces() - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_normalize_characters(): - strings = cudf.Series( - ["乾 \t 乿", "ĂĆCĖÑTÜATE", "âscénd, Descend", "", None, "Stock^ $1"] - ) - expected = cudf.Series( - [ - " 乾 乿 ", - "accentuate", - "ascend , descend", - "", - None, - "stock ^ $ 1", 
- ] - ) - - actual = strings.str.normalize_characters() - assert type(expected) == type(actual) - assert_eq(expected, actual) - - expected = cudf.Series( - [ - " 乾 乿 ", - "ĂĆCĖÑTÜATE", - "âscénd , Descend", - "", - None, - "Stock ^ $ 1", - ] - ) - actual = strings.str.normalize_characters(do_lower=False) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "n, separator, expected_values", - [ - ( - 2, - "_", - [ - "this_is", - "is_my", - "my_favorite", - "favorite_book", - "book_on", - "on_my", - "my_bookshelf", - ], - ), - ( - 3, - "-", - [ - "this-is-my", - "is-my-favorite", - "my-favorite-book", - "favorite-book-on", - "book-on-my", - "on-my-bookshelf", - ], - ), - ], -) -def test_ngrams(n, separator, expected_values): - strings = cudf.Series( - ["this", "is", "my", "favorite", "book", "on", "my", "bookshelf"] - ) - - expected = cudf.Series(expected_values) - - actual = strings.str.ngrams(n=n, separator=separator) - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "n, expected_values, expected_index, as_list", - [ - ( - 2, - [ - "th", - "hi", - "is", - "is", - "my", - "bo", - "oo", - "ok", - "he", - "er", - "re", - cudf.NA, - ], - [1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5, 6], - False, - ), - ( - 3, - [ - "thi", - "his", - cudf.NA, - cudf.NA, - "boo", - "ook", - "her", - "ere", - cudf.NA, - ], - [1, 1, 2, 3, 4, 4, 5, 5, 6], - False, - ), - ( - 3, - [["thi", "his"], [], [], ["boo", "ook"], ["her", "ere"], []], - [1, 2, 3, 4, 5, 6], - True, - ), - ], -) -def test_character_ngrams(n, expected_values, expected_index, as_list): - strings = cudf.Series( - ["this", "is", "my", "book", "here", ""], index=[1, 2, 3, 4, 5, 6] - ) - - expected = cudf.Series(expected_values, index=expected_index) - - actual = strings.str.character_ngrams(n=n, as_list=as_list) - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "n, separator, expected_values", - [ - ( - 2, - "_", - [ - "this_is", - "is_my", - "my_favorite", - "book_on", - "on_my", - "my_bookshelf", - ], - ), - ( - 3, - "-", - ["this-is-my", "is-my-favorite", "book-on-my", "on-my-bookshelf"], - ), - ], -) -def test_ngrams_tokenize(n, separator, expected_values): - strings = cudf.Series(["this is my favorite", "book on my bookshelf"]) - - expected = cudf.Series(expected_values) - - actual = strings.str.ngrams_tokenize(n=n, separator=separator) - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_character_tokenize_series(): - sr = cudf.Series( - [ - "hello world", - "sdf", - None, - ( - "goodbye, one-two:three~four+five_six@sev" - "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ" - ), - ] - ) - expected_values = cudf.Series( - [ - "h", - "e", - "l", - "l", - "o", - " ", - "w", - "o", - "r", - "l", - "d", - "s", - "d", - "f", - "g", - "o", - "o", - "d", - "b", - "y", - "e", - ",", - " ", - "o", - "n", - "e", - "-", - "t", - "w", - "o", - ":", - "t", - "h", - "r", - "e", - "e", - "~", - "f", - "o", - "u", - "r", - "+", - "f", - "i", - "v", - "e", - "_", - "s", - "i", - "x", - "@", - "s", - "e", - "v", - "e", - "n", - "#", - "e", - "i", - "g", - "h", - "t", - "^", - "n", - "i", - "n", - "e", - " ", - "h", - "e", - "Œ", - "Ž", - "‘", - "•", - "™", - "œ", - "$", - "µ", - "¾", - "Ť", - "Ơ", - "é", - " ", - "DŽ", - ] - ) - expected_index = sr.index.repeat(sr.str.len().fillna(0)) - expected = cudf.Series(expected_values, index=expected_index) - - actual = sr.str.character_tokenize() - 
assert_eq(expected, actual) - - sr = cudf.Series([""]) - expected = cudf.Series([], dtype="object") - - actual = sr.str.character_tokenize() - assert_eq(expected, actual) - - sr = cudf.Series(["a"]) - expected = cudf.Series(["a"]) - - actual = sr.str.character_tokenize() - assert_eq(expected, actual) - - -def test_character_tokenize_index(): - sr = cudf.core.index.as_index( - [ - "hello world", - "sdf", - None, - ( - "goodbye, one-two:three~four+five_six@sev" - "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ" - ), - ] - ) - expected = cudf.core.index.as_index( - [ - "h", - "e", - "l", - "l", - "o", - " ", - "w", - "o", - "r", - "l", - "d", - "s", - "d", - "f", - "g", - "o", - "o", - "d", - "b", - "y", - "e", - ",", - " ", - "o", - "n", - "e", - "-", - "t", - "w", - "o", - ":", - "t", - "h", - "r", - "e", - "e", - "~", - "f", - "o", - "u", - "r", - "+", - "f", - "i", - "v", - "e", - "_", - "s", - "i", - "x", - "@", - "s", - "e", - "v", - "e", - "n", - "#", - "e", - "i", - "g", - "h", - "t", - "^", - "n", - "i", - "n", - "e", - " ", - "h", - "e", - "Œ", - "Ž", - "‘", - "•", - "™", - "œ", - "$", - "µ", - "¾", - "Ť", - "Ơ", - "é", - " ", - "DŽ", - ] - ) - - actual = sr.str.character_tokenize() - assert_eq(expected, actual) - - sr = cudf.Index([""]) - expected = cudf.Index([], dtype="object") - - actual = sr.str.character_tokenize() - assert_eq(expected, actual) - - sr = cudf.core.index.as_index(["a"]) - expected = cudf.core.index.as_index(["a"]) - - actual = sr.str.character_tokenize() - assert_eq(expected, actual) - - -def test_text_replace_tokens(): - sr = cudf.Series(["this is me", "theme music", ""]) - targets = cudf.Series(["is", "me"]) - - expected = cudf.Series(["this _ _", "theme music", ""]) - actual = sr.str.replace_tokens(targets, "_") - - assert_eq(expected, actual) - - replacements = cudf.Series(["IS", "ME"]) - expected = cudf.Series(["this IS ME", "theme music", ""]) - actual = sr.str.replace_tokens(targets, replacements) - - assert_eq(expected, actual) - - sr = cudf.Series( - [ - "this is a small text ☕", - "this \t\t is ; ; - + a looooooooooonnnnnnnggggggg text \n\t", - "emptyme", - ], - ) - targets = cudf.Series( - ["a", "☕", "\t", "looooooooooonnnnnnnggggggg", "emptyme"] - ) - replacements = cudf.Series(["the", "🚒", "🚒🚒🚒🚒", "🔥🔥", ""]) - - expected = cudf.Series( - [ - "this is the small text 🚒", - "this \t\t is ; ; - + the 🔥🔥 text \n\t", - "", - ] - ) - actual = sr.str.replace_tokens(targets, replacements) - - assert_eq(expected, actual) - - sr = cudf.Series( - ["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"] - ) - targets = cudf.Series(["🌬", "🔥", "🌊"]) - replacements = "🚰" - - expected = cudf.Series( - ["All-we-need;is;🚰", "\tall-we-need0is;🚰", "all;we:need+is;🚰"] - ) - actual = sr.str.replace_tokens(targets, replacements, delimiter=";") - - assert_eq(expected, actual) - assert_eq(sr, sr.str.replace_tokens(targets, replacements)) - assert_eq(sr, sr.str.replace_tokens([""], [""])) - - -def test_text_replace_tokens_error_cases(): - sr = cudf.Series(["this is me", "theme music", ""]) - - with pytest.raises( - TypeError, - match="targets should be an array-like or a Series object, " - "found ", - ): - sr.str.replace_tokens("me", ["a"]) - - with pytest.raises( - ValueError, - match="targets and replacements should be same size" - " sequences unless replacements is a string.", - ): - sr.str.replace_tokens(["a"], ["me", "ki"]) - - with pytest.raises( - TypeError, - match="replacements should be an str, array-like or Series object," - " found ", - ): - sr.str.replace_tokens(["a"], 
{"s"}) - - with pytest.raises( - TypeError, - match="Type of delimiter should be a string, found ", - ): - sr.str.replace_tokens(["a"], ["s"], delimiter=["a", "b"]) - - -def test_text_filter_tokens(): - sr = cudf.Series(["the quick brown fox jumped", "over the lazy dog", ""]) - - expected = cudf.Series([" quick brown jumped", " ", ""]) - actual = sr.str.filter_tokens(5) - assert_eq(expected, actual) - - expected = cudf.Series(["🔥 quick brown 🔥 jumped", "🔥 🔥 🔥 🔥", ""]) - actual = sr.str.filter_tokens(5, "🔥") - assert_eq(expected, actual) - - sr = cudf.Series( - ["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"] - ) - expected = cudf.Series( - ["All-we-need;is;--", "\tall-we-need0is;--", "all;we:need+is;--"] - ) - actual = sr.str.filter_tokens(2, "--", ";") - assert_eq(expected, actual) - - assert_eq(sr, sr.str.filter_tokens(1)) - - -def test_text_filter_tokens_error_cases(): - sr = cudf.Series(["abc", "def", ""]) - - with pytest.raises( - TypeError, - match="Type of replacement should be a string, found ", - ): - sr.str.filter_tokens(3, replacement=["a", "b"]) - - with pytest.raises( - TypeError, - match="Type of delimiter should be a string, found ", - ): - sr.str.filter_tokens(3, delimiter=["a", "b"]) - - -def test_edit_distance(): - sr = cudf.Series(["kitten", "saturday", "address", "book"]) - tg = cudf.Series(["sitting", "sunday", "addressee", "back"]) - - expected = cudf.Series([3, 3, 2, 2], dtype=np.int32) - actual = sr.str.edit_distance(tg) - assert_eq(expected, actual) - - expected = cudf.Series([0, 7, 6, 6], dtype=np.int32) - actual = sr.str.edit_distance("kitten") - assert_eq(expected, actual) - - -def test_edit_distance_matrix(): - # normal - sr = cudf.Series(["rounded", "bounded", "bounce", "trounce", "ounce"]) - - expected = cudf.Series( - [ - [0, 1, 3, 3, 3], - [1, 0, 2, 4, 3], - [3, 2, 0, 2, 1], - [3, 4, 2, 0, 2], - [3, 3, 1, 2, 0], - ] - ) - got = sr.str.edit_distance_matrix() - - assert_eq(expected, got, check_dtype=False) - - # 1-row series - sr2 = cudf.Series(["x"]) - with pytest.raises(ValueError, match="Require size >= 2"): - sr2.str.edit_distance_matrix() - - # null rows - sr3 = cudf.Series(["rounded", None, "bounce", "trounce", "ounce"]) - with pytest.raises(ValueError, match="Cannot compute"): - sr3.str.edit_distance_matrix() - - -def test_porter_stemmer_measure(): - strings = cudf.Series( - [ - "tr", - "ee", - "tree", - "y", - "by", - "trouble", - "oats", - "trees", - "ivy", - "troubles", - "private", - "oaten", - "orrery", - None, - "", - ] - ) - expected = cudf.Series( - [0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, None, 0], dtype=np.int32 - ) - - actual = strings.str.porter_stemmer_measure() - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_is_vowel_consonant(): - strings = cudf.Series( - ["tr", "ee", "tree", "y", "by", "oats", "ivy", "orrery", None, ""] - ) - expected = cudf.Series( - [False, False, True, False, False, False, True, False, None, False] - ) - actual = strings.str.is_vowel(2) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - expected = cudf.Series( - [True, False, True, False, False, False, True, True, None, False] - ) - actual = strings.str.is_consonant(1) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - indices = cudf.Series([2, 1, 0, 0, 1, 2, 0, 3, 0, 0]) - expected = cudf.Series( - [False, True, False, False, True, False, True, True, None, False] - ) - actual = strings.str.is_vowel(indices) - assert type(expected) == type(actual) - assert_eq(expected, 
actual) - - expected = cudf.Series( - [False, False, True, True, False, True, False, False, None, False] - ) - actual = strings.str.is_consonant(indices) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_minhash(): - strings = cudf.Series(["this is my", "favorite book", None, ""]) - expected = cudf.Series([21141582, 962346254, None, 0], dtype=np.uint32) - actual = strings.str.minhash() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - expected = cudf.Series( - [ - cudf.Series([1305480167, 668155704, 34311509], dtype=np.uint32), - cudf.Series([32665384, 3470118, 363147162], dtype=np.uint32), - None, - cudf.Series([0, 0, 0], dtype=np.uint32), - ] - ) - actual = strings.str.minhash(seeds=seeds, n=5) - assert_eq(expected, actual) - - with pytest.raises(ValueError): - strings.str.minhash(seeds=7) - with pytest.raises(ValueError): - strings.str.minhash(seeds=seeds, method="md5") - with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.int32) - strings.str.minhash(seeds=seeds) diff --git a/python/cudf/cudf/tests/test_text_io.py b/python/cudf/cudf/tests/test_text_io.py deleted file mode 100644 index acba13bb5b0..00000000000 --- a/python/cudf/cudf/tests/test_text_io.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION. - -from io import StringIO - -import pytest - -import cudf -from cudf.testing._utils import assert_eq - - -@pytest.fixture(scope="module") -def datadir(datadir): - return datadir / "text" - - -def test_read_text(datadir): - chess_file = str(datadir) + "/chess.pgn" - delimiter = "1." - - with open(chess_file) as f: - content = f.read().split(delimiter) - - # Since Python split removes the delimiter and read_text does - # not we need to add it back to the 'content' - expected = cudf.Series( - [ - c + delimiter if i < (len(content) - 1) else c - for i, c in enumerate(content) - ] - ) - - actual = cudf.read_text(chess_file, delimiter=delimiter) - - assert_eq(expected, actual) - - -def test_read_text_byte_range(datadir): - chess_file = str(datadir) + "/chess.pgn" - delimiter = "1." 
- - with open(chess_file, "r") as f: - data = f.read() - content = data.split(delimiter) - - # Since Python split removes the delimiter and read_text does - # not we need to add it back to the 'content' - expected = cudf.Series( - [ - c + delimiter if i < (len(content) - 1) else c - for i, c in enumerate(content) - ] - ) - - byte_range_size = (len(data) // 3) + (len(data) % 3 != 0) - - actual_0 = cudf.read_text( - chess_file, - delimiter=delimiter, - byte_range=[byte_range_size * 0, byte_range_size], - ) - actual_1 = cudf.read_text( - chess_file, - delimiter=delimiter, - byte_range=[byte_range_size * 1, byte_range_size], - ) - actual_2 = cudf.read_text( - chess_file, - delimiter=delimiter, - byte_range=[byte_range_size * 2, byte_range_size], - ) - - actual = cudf.concat([actual_0, actual_1, actual_2], ignore_index=True) - - assert_eq(expected, actual) - - -def test_read_text_byte_range_large(tmpdir): - content = "".join(("\n" if x % 5 == 4 else "x") for x in range(0, 3000)) - delimiter = "\n" - temp_file = str(tmpdir) + "/temp.txt" - - with open(temp_file, "w") as f: - f.write(content) - - expected = cudf.Series(["xxxx\n" for i in range(0, 200)]) - - actual = cudf.read_text( - temp_file, delimiter=delimiter, byte_range=[1000, 1000] - ) - - assert_eq(expected, actual) - - -def test_read_text_in_memory(datadir): - # Since Python split removes the delimiter and read_text does - # not we need to add it back to the 'content' - expected = cudf.Series(["x::", "y::", "z"]) - - actual = cudf.read_text(StringIO("x::y::z"), delimiter="::") - - assert_eq(expected, actual) - - -def test_read_text_in_memory_strip_delimiter(datadir): - # Since Python split removes the delimiter and read_text does - # not we need to add it back to the 'content' - expected = cudf.Series(["x", "y", "z"]) - - actual = cudf.read_text( - StringIO("x::y::z"), delimiter="::", strip_delimiters=True - ) - - assert_eq(expected, actual) - - -def test_read_text_bgzip(datadir): - chess_file_compressed = str(datadir) + "/chess.pgn.gz" - chess_file = str(datadir) + "/chess.pgn" - delimiter = "1." - - with open(chess_file) as f: - content = f.read().split(delimiter) - - # Since Python split removes the delimiter and read_text does - # not we need to add it back to the 'content' - expected = cudf.Series( - [ - c + delimiter if i < (len(content) - 1) else c - for i, c in enumerate(content) - ] - ) - - actual = cudf.read_text( - chess_file_compressed, compression="bgzip", delimiter=delimiter - ) - - assert_eq(expected, actual) - - -def test_read_text_bgzip_offsets(datadir): - chess_file_compressed = str(datadir) + "/chess.pgn.gz" - chess_file = str(datadir) + "/chess.pgn" - delimiter = "1." 
-
-    with open(chess_file) as f:
-        content = f.read()[29:695].split(delimiter)
-
-    # Since Python split removes the delimiter and read_text does
-    # not we need to add it back to the 'content'
-    expected = cudf.Series(
-        [
-            c + delimiter if i < (len(content) - 1) else c
-            for i, c in enumerate(content)
-        ]
-    )
-
-    actual = cudf.read_text(
-        chess_file_compressed,
-        compression="bgzip",
-        compression_offsets=[58 * 2**16 + 2, 781 * 2**16 + 7],
-        delimiter=delimiter,
-    )
-
-    assert_eq(expected, actual)

From 6523e646621a406448c8e3906eddaffdff409843 Mon Sep 17 00:00:00 2001
From: David Wendt
Date: Thu, 25 May 2023 08:40:14 -0400
Subject: [PATCH 5/5] create text/test_text_methods.py

---
 .pre-commit-config.yaml                       |   2 +-
 .../cudf/tests/strings/test_string_methods.py | 838 +-----------------
 python/cudf/cudf/tests/text/__init__.py       |   0
 .../{ => text}/test_subword_tokenizer.py      |   2 +-
 .../cudf/cudf/tests/text/test_text_methods.py | 837 +++++++++++++++++
 5 files changed, 840 insertions(+), 839 deletions(-)
 create mode 100644 python/cudf/cudf/tests/text/__init__.py
 rename python/cudf/cudf/tests/{ => text}/test_subword_tokenizer.py (99%)
 create mode 100644 python/cudf/cudf/tests/text/test_text_methods.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 10e68ea0757..ebdbf3e6db1 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -85,7 +85,7 @@ repos:
           (?x)^(
             ^cpp/include/cudf_test/cxxopts.hpp|
             ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*|
-            ^python/cudf/cudf/tests/strings/test_string_methods.py
+            ^python/cudf/cudf/tests/text/test_text_methods.py
           )
 - repo: local
   hooks:
diff --git a/python/cudf/cudf/tests/strings/test_string_methods.py b/python/cudf/cudf/tests/strings/test_string_methods.py
index ea789b99220..06777c8e6af 100644
--- a/python/cudf/cudf/tests/strings/test_string_methods.py
+++ b/python/cudf/cudf/tests/strings/test_string_methods.py
@@ -1,837 +1 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
- -import numpy as np -import pytest - -import cudf -from cudf.testing._utils import assert_eq - - -def test_tokenize(): - strings = cudf.Series( - [ - "the quick fox jumped over the lazy dog", - "the siamésé cat jumped under the sofa", - None, - "", - ] - ) - - expected_values = cudf.Series( - [ - "the", - "quick", - "fox", - "jumped", - "over", - "the", - "lazy", - "dog", - "the", - "siamésé", - "cat", - "jumped", - "under", - "the", - "sofa", - ] - ) - expected_index = strings.index.repeat(strings.str.token_count()) - expected = cudf.Series(expected_values, index=expected_index) - - actual = strings.str.tokenize() - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_tokenize_delimiter(): - strings = cudf.Series( - [ - "the quick fox jumped over the lazy dog", - "the siamésé cat jumped under the sofa", - None, - "", - ] - ) - - expected_values = cudf.Series( - [ - "the quick f", - "x jumped ", - "ver the lazy d", - "g", - "the siamésé cat jumped under the s", - "fa", - ] - ) - expected_index = strings.index.repeat(strings.str.token_count("o")) - expected = cudf.Series(expected_values, index=expected_index) - - actual = strings.str.tokenize(delimiter="o") - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_detokenize(): - strings = cudf.Series( - [ - "the", - "quick", - "fox", - "jumped", - "over", - "the", - "lazy", - "dog", - "the", - "siamésé", - "cat", - "jumped", - "under", - "the", - "sofa", - ] - ) - - indices = cudf.Series([0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]) - actual = strings.str.detokenize(indices) - expected = cudf.Series( - [ - "the quick fox", - "jumped over", - "the lazy dog", - "the siamésé cat jumped under the sofa", - ] - ) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - indices = cudf.Series( - [4, 0, 0, 0, 0, 4, 1, 1, 4, 2, 2, 2, 2, 4, 3], dtype=np.int8 - ) - actual = strings.str.detokenize(indices, "+") - expected = cudf.Series( - [ - "quick+fox+jumped+over", - "lazy+dog", - "siamésé+cat+jumped+under", - "sofa", - "the+the+the+the", - ] - ) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "delimiter, expected_token_counts", - [ - ("", [10, 9, 0, 0, 5]), - ("o", [6, 3, 0, 0, 1]), - (["a", "e", "i", "o", "u"], [13, 13, 0, 0, 6]), - (["a", "e", "i", "o"], [12, 11, 0, 0, 6]), - ], -) -def test_token_count(delimiter, expected_token_counts): - strings = cudf.Series( - [ - "the quick brown fox jumped over the lazy brown dog", - "the sable siamésé cat jumped under the brown sofa", - None, - "", - "test_str\x01test_str\x02test_str\x03test_str\x04test_str\x05", - ] - ) - - expected = cudf.Series(expected_token_counts) - - actual = strings.str.token_count(delimiter) - - assert type(expected) == type(actual) - assert_eq(expected, actual, check_dtype=False) - - -def test_normalize_spaces(): - strings = cudf.Series( - [ - " the\t quick fox jumped over the lazy dog", - "the siamésé cat\f jumped\t\tunder the sofa ", - None, - "", - ] - ) - expected = cudf.Series( - [ - "the quick fox jumped over the lazy dog", - "the siamésé cat jumped under the sofa", - None, - "", - ] - ) - - actual = strings.str.normalize_spaces() - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_normalize_characters(): - strings = cudf.Series( - ["乾 \t 乿", "ĂĆCĖÑTÜATE", "âscénd, Descend", "", None, "Stock^ $1"] - ) - expected = cudf.Series( - [ - " 乾 乿 ", - "accentuate", - "ascend , descend", - "", - None, - "stock ^ $ 1", 
- ] - ) - - actual = strings.str.normalize_characters() - assert type(expected) == type(actual) - assert_eq(expected, actual) - - expected = cudf.Series( - [ - " 乾 乿 ", - "ĂĆCĖÑTÜATE", - "âscénd , Descend", - "", - None, - "Stock ^ $ 1", - ] - ) - actual = strings.str.normalize_characters(do_lower=False) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "n, separator, expected_values", - [ - ( - 2, - "_", - [ - "this_is", - "is_my", - "my_favorite", - "favorite_book", - "book_on", - "on_my", - "my_bookshelf", - ], - ), - ( - 3, - "-", - [ - "this-is-my", - "is-my-favorite", - "my-favorite-book", - "favorite-book-on", - "book-on-my", - "on-my-bookshelf", - ], - ), - ], -) -def test_ngrams(n, separator, expected_values): - strings = cudf.Series( - ["this", "is", "my", "favorite", "book", "on", "my", "bookshelf"] - ) - - expected = cudf.Series(expected_values) - - actual = strings.str.ngrams(n=n, separator=separator) - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "n, expected_values, expected_index, as_list", - [ - ( - 2, - [ - "th", - "hi", - "is", - "is", - "my", - "bo", - "oo", - "ok", - "he", - "er", - "re", - cudf.NA, - ], - [1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5, 6], - False, - ), - ( - 3, - [ - "thi", - "his", - cudf.NA, - cudf.NA, - "boo", - "ook", - "her", - "ere", - cudf.NA, - ], - [1, 1, 2, 3, 4, 4, 5, 5, 6], - False, - ), - ( - 3, - [["thi", "his"], [], [], ["boo", "ook"], ["her", "ere"], []], - [1, 2, 3, 4, 5, 6], - True, - ), - ], -) -def test_character_ngrams(n, expected_values, expected_index, as_list): - strings = cudf.Series( - ["this", "is", "my", "book", "here", ""], index=[1, 2, 3, 4, 5, 6] - ) - - expected = cudf.Series(expected_values, index=expected_index) - - actual = strings.str.character_ngrams(n=n, as_list=as_list) - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -@pytest.mark.parametrize( - "n, separator, expected_values", - [ - ( - 2, - "_", - [ - "this_is", - "is_my", - "my_favorite", - "book_on", - "on_my", - "my_bookshelf", - ], - ), - ( - 3, - "-", - ["this-is-my", "is-my-favorite", "book-on-my", "on-my-bookshelf"], - ), - ], -) -def test_ngrams_tokenize(n, separator, expected_values): - strings = cudf.Series(["this is my favorite", "book on my bookshelf"]) - - expected = cudf.Series(expected_values) - - actual = strings.str.ngrams_tokenize(n=n, separator=separator) - - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_character_tokenize_series(): - sr = cudf.Series( - [ - "hello world", - "sdf", - None, - ( - "goodbye, one-two:three~four+five_six@sev" - "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ" - ), - ] - ) - expected_values = cudf.Series( - [ - "h", - "e", - "l", - "l", - "o", - " ", - "w", - "o", - "r", - "l", - "d", - "s", - "d", - "f", - "g", - "o", - "o", - "d", - "b", - "y", - "e", - ",", - " ", - "o", - "n", - "e", - "-", - "t", - "w", - "o", - ":", - "t", - "h", - "r", - "e", - "e", - "~", - "f", - "o", - "u", - "r", - "+", - "f", - "i", - "v", - "e", - "_", - "s", - "i", - "x", - "@", - "s", - "e", - "v", - "e", - "n", - "#", - "e", - "i", - "g", - "h", - "t", - "^", - "n", - "i", - "n", - "e", - " ", - "h", - "e", - "Œ", - "Ž", - "‘", - "•", - "™", - "œ", - "$", - "µ", - "¾", - "Ť", - "Ơ", - "é", - " ", - "DŽ", - ] - ) - expected_index = sr.index.repeat(sr.str.len().fillna(0)) - expected = cudf.Series(expected_values, index=expected_index) - - actual = sr.str.character_tokenize() - 
assert_eq(expected, actual)
-
-    sr = cudf.Series([""])
-    expected = cudf.Series([], dtype="object")
-
-    actual = sr.str.character_tokenize()
-    assert_eq(expected, actual)
-
-    sr = cudf.Series(["a"])
-    expected = cudf.Series(["a"])
-
-    actual = sr.str.character_tokenize()
-    assert_eq(expected, actual)
-
-
-def test_character_tokenize_index():
-    sr = cudf.core.index.as_index(
-        [
-            "hello world",
-            "sdf",
-            None,
-            (
-                "goodbye, one-two:three~four+five_six@sev"
-                "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ"
-            ),
-        ]
-    )
-    expected = cudf.core.index.as_index(
-        [
-            "h",
-            "e",
-            "l",
-            "l",
-            "o",
-            " ",
-            "w",
-            "o",
-            "r",
-            "l",
-            "d",
-            "s",
-            "d",
-            "f",
-            "g",
-            "o",
-            "o",
-            "d",
-            "b",
-            "y",
-            "e",
-            ",",
-            " ",
-            "o",
-            "n",
-            "e",
-            "-",
-            "t",
-            "w",
-            "o",
-            ":",
-            "t",
-            "h",
-            "r",
-            "e",
-            "e",
-            "~",
-            "f",
-            "o",
-            "u",
-            "r",
-            "+",
-            "f",
-            "i",
-            "v",
-            "e",
-            "_",
-            "s",
-            "i",
-            "x",
-            "@",
-            "s",
-            "e",
-            "v",
-            "e",
-            "n",
-            "#",
-            "e",
-            "i",
-            "g",
-            "h",
-            "t",
-            "^",
-            "n",
-            "i",
-            "n",
-            "e",
-            " ",
-            "h",
-            "e",
-            "Œ",
-            "Ž",
-            "‘",
-            "•",
-            "™",
-            "œ",
-            "$",
-            "µ",
-            "¾",
-            "Ť",
-            "Ơ",
-            "é",
-            " ",
-            "DŽ",
-        ]
-    )
-
-    actual = sr.str.character_tokenize()
-    assert_eq(expected, actual)
-
-    sr = cudf.Index([""])
-    expected = cudf.Index([], dtype="object")
-
-    actual = sr.str.character_tokenize()
-    assert_eq(expected, actual)
-
-    sr = cudf.core.index.as_index(["a"])
-    expected = cudf.core.index.as_index(["a"])
-
-    actual = sr.str.character_tokenize()
-    assert_eq(expected, actual)
-
-
-def test_text_replace_tokens():
-    sr = cudf.Series(["this is me", "theme music", ""])
-    targets = cudf.Series(["is", "me"])
-
-    expected = cudf.Series(["this _ _", "theme music", ""])
-    actual = sr.str.replace_tokens(targets, "_")
-
-    assert_eq(expected, actual)
-
-    replacements = cudf.Series(["IS", "ME"])
-    expected = cudf.Series(["this IS ME", "theme music", ""])
-    actual = sr.str.replace_tokens(targets, replacements)
-
-    assert_eq(expected, actual)
-
-    sr = cudf.Series(
-        [
-            "this is a small text ☕",
-            "this \t\t is ; ; - + a looooooooooonnnnnnnggggggg text \n\t",
-            "emptyme",
-        ],
-    )
-    targets = cudf.Series(
-        ["a", "☕", "\t", "looooooooooonnnnnnnggggggg", "emptyme"]
-    )
-    replacements = cudf.Series(["the", "🚒", "🚒🚒🚒🚒", "🔥🔥", ""])
-
-    expected = cudf.Series(
-        [
-            "this is the small text 🚒",
-            "this \t\t is ; ; - + the 🔥🔥 text \n\t",
-            "",
-        ]
-    )
-    actual = sr.str.replace_tokens(targets, replacements)
-
-    assert_eq(expected, actual)
-
-    sr = cudf.Series(
-        ["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"]
-    )
-    targets = cudf.Series(["🌬", "🔥", "🌊"])
-    replacements = "🚰"
-
-    expected = cudf.Series(
-        ["All-we-need;is;🚰", "\tall-we-need0is;🚰", "all;we:need+is;🚰"]
-    )
-    actual = sr.str.replace_tokens(targets, replacements, delimiter=";")
-
-    assert_eq(expected, actual)
-    assert_eq(sr, sr.str.replace_tokens(targets, replacements))
-    assert_eq(sr, sr.str.replace_tokens([""], [""]))
-
-
-def test_text_replace_tokens_error_cases():
-    sr = cudf.Series(["this is me", "theme music", ""])
-
-    with pytest.raises(
-        TypeError,
-        match="targets should be an array-like or a Series object, "
-        "found <class 'str'>",
-    ):
-        sr.str.replace_tokens("me", ["a"])
-
-    with pytest.raises(
-        ValueError,
-        match="targets and replacements should be same size"
-        " sequences unless replacements is a string.",
-    ):
-        sr.str.replace_tokens(["a"], ["me", "ki"])
-
-    with pytest.raises(
-        TypeError,
-        match="replacements should be an str, array-like or Series object,"
-        " found <class 'set'>",
-    ):
-        sr.str.replace_tokens(["a"], {"s"})
-
-    with pytest.raises(
-        TypeError,
-        match="Type of delimiter should be a string, found <class 'list'>",
-    ):
-        sr.str.replace_tokens(["a"], ["s"], delimiter=["a", "b"])
-
-
-def test_text_filter_tokens():
-    sr = cudf.Series(["the quick brown fox jumped", "over the lazy dog", ""])
-
-    expected = cudf.Series([" quick brown  jumped", "   ", ""])
-    actual = sr.str.filter_tokens(5)
-    assert_eq(expected, actual)
-
-    expected = cudf.Series(["🔥 quick brown 🔥 jumped", "🔥 🔥 🔥 🔥", ""])
-    actual = sr.str.filter_tokens(5, "🔥")
-    assert_eq(expected, actual)
-
-    sr = cudf.Series(
-        ["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"]
-    )
-    expected = cudf.Series(
-        ["All-we-need;is;--", "\tall-we-need0is;--", "all;we:need+is;--"]
-    )
-    actual = sr.str.filter_tokens(2, "--", ";")
-    assert_eq(expected, actual)
-
-    assert_eq(sr, sr.str.filter_tokens(1))
-
-
-def test_text_filter_tokens_error_cases():
-    sr = cudf.Series(["abc", "def", ""])
-
-    with pytest.raises(
-        TypeError,
-        match="Type of replacement should be a string, found <class 'list'>",
-    ):
-        sr.str.filter_tokens(3, replacement=["a", "b"])
-
-    with pytest.raises(
-        TypeError,
-        match="Type of delimiter should be a string, found <class 'list'>",
-    ):
-        sr.str.filter_tokens(3, delimiter=["a", "b"])
-
-
-def test_edit_distance():
-    sr = cudf.Series(["kitten", "saturday", "address", "book"])
-    tg = cudf.Series(["sitting", "sunday", "addressee", "back"])
-
-    expected = cudf.Series([3, 3, 2, 2], dtype=np.int32)
-    actual = sr.str.edit_distance(tg)
-    assert_eq(expected, actual)
-
-    expected = cudf.Series([0, 7, 6, 6], dtype=np.int32)
-    actual = sr.str.edit_distance("kitten")
-    assert_eq(expected, actual)
-
-
-def test_edit_distance_matrix():
-    # normal
-    sr = cudf.Series(["rounded", "bounded", "bounce", "trounce", "ounce"])
-
-    expected = cudf.Series(
-        [
-            [0, 1, 3, 3, 3],
-            [1, 0, 2, 4, 3],
-            [3, 2, 0, 2, 1],
-            [3, 4, 2, 0, 2],
-            [3, 3, 1, 2, 0],
-        ]
-    )
-    got = sr.str.edit_distance_matrix()
-
-    assert_eq(expected, got, check_dtype=False)
-
-    # 1-row series
-    sr2 = cudf.Series(["x"])
-    with pytest.raises(ValueError, match="Require size >= 2"):
-        sr2.str.edit_distance_matrix()
-
-    # null rows
-    sr3 = cudf.Series(["rounded", None, "bounce", "trounce", "ounce"])
-    with pytest.raises(ValueError, match="Cannot compute"):
-        sr3.str.edit_distance_matrix()
-
-
-def test_porter_stemmer_measure():
-    strings = cudf.Series(
-        [
-            "tr",
-            "ee",
-            "tree",
-            "y",
-            "by",
-            "trouble",
-            "oats",
-            "trees",
-            "ivy",
-            "troubles",
-            "private",
-            "oaten",
-            "orrery",
-            None,
-            "",
-        ]
-    )
-    expected = cudf.Series(
-        [0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, None, 0], dtype=np.int32
-    )
-
-    actual = strings.str.porter_stemmer_measure()
-
-    assert type(expected) == type(actual)
-    assert_eq(expected, actual)
-
-
-def test_is_vowel_consonant():
-    strings = cudf.Series(
-        ["tr", "ee", "tree", "y", "by", "oats", "ivy", "orrery", None, ""]
-    )
-    expected = cudf.Series(
-        [False, False, True, False, False, False, True, False, None, False]
-    )
-    actual = strings.str.is_vowel(2)
-    assert type(expected) == type(actual)
-    assert_eq(expected, actual)
-
-    expected = cudf.Series(
-        [True, False, True, False, False, False, True, True, None, False]
-    )
-    actual = strings.str.is_consonant(1)
-    assert type(expected) == type(actual)
-    assert_eq(expected, actual)
-
-    indices = cudf.Series([2, 1, 0, 0, 1, 2, 0, 3, 0, 0])
-    expected = cudf.Series(
-        [False, True, False, False, True, False, True, True, None, False]
-    )
-    actual = strings.str.is_vowel(indices)
-    assert type(expected) == type(actual)
-    assert_eq(expected, 
actual) - - expected = cudf.Series( - [False, False, True, True, False, True, False, False, None, False] - ) - actual = strings.str.is_consonant(indices) - assert type(expected) == type(actual) - assert_eq(expected, actual) - - -def test_minhash(): - strings = cudf.Series(["this is my", "favorite book", None, ""]) - expected = cudf.Series([21141582, 962346254, None, 0], dtype=np.uint32) - actual = strings.str.minhash() - assert_eq(expected, actual) - seeds = cudf.Series([0, 1, 2], dtype=np.uint32) - expected = cudf.Series( - [ - cudf.Series([1305480167, 668155704, 34311509], dtype=np.uint32), - cudf.Series([32665384, 3470118, 363147162], dtype=np.uint32), - None, - cudf.Series([0, 0, 0], dtype=np.uint32), - ] - ) - actual = strings.str.minhash(seeds=seeds, n=5) - assert_eq(expected, actual) - - with pytest.raises(ValueError): - strings.str.minhash(seeds=7) - with pytest.raises(ValueError): - strings.str.minhash(seeds=seeds, method="md5") - with pytest.raises(ValueError): - seeds = cudf.Series([0, 1, 2], dtype=np.int32) - strings.str.minhash(seeds=seeds) +# Copyright (c) 2023, NVIDIA CORPORATION. diff --git a/python/cudf/cudf/tests/text/__init__.py b/python/cudf/cudf/tests/text/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/cudf/cudf/tests/test_subword_tokenizer.py b/python/cudf/cudf/tests/text/test_subword_tokenizer.py similarity index 99% rename from python/cudf/cudf/tests/test_subword_tokenizer.py rename to python/cudf/cudf/tests/text/test_subword_tokenizer.py index 9084132243e..ac17daa8601 100644 --- a/python/cudf/cudf/tests/test_subword_tokenizer.py +++ b/python/cudf/cudf/tests/text/test_subword_tokenizer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. import os import cupy diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py new file mode 100644 index 00000000000..ea789b99220 --- /dev/null +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -0,0 +1,837 @@ +# Copyright (c) 2019-2023, NVIDIA CORPORATION. 
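+#
+# Tests for the nvtext-backed string methods, relocated unchanged from the
+# flat tests directory: tokenize/detokenize, token_count, space and character
+# normalization, word and character ngrams, ngrams_tokenize,
+# character_tokenize, replace_tokens/filter_tokens, the porter-stemmer
+# helpers, edit distance, and minhash.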
+ +import numpy as np +import pytest + +import cudf +from cudf.testing._utils import assert_eq + + +def test_tokenize(): + strings = cudf.Series( + [ + "the quick fox jumped over the lazy dog", + "the siamésé cat jumped under the sofa", + None, + "", + ] + ) + + expected_values = cudf.Series( + [ + "the", + "quick", + "fox", + "jumped", + "over", + "the", + "lazy", + "dog", + "the", + "siamésé", + "cat", + "jumped", + "under", + "the", + "sofa", + ] + ) + expected_index = strings.index.repeat(strings.str.token_count()) + expected = cudf.Series(expected_values, index=expected_index) + + actual = strings.str.tokenize() + + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +def test_tokenize_delimiter(): + strings = cudf.Series( + [ + "the quick fox jumped over the lazy dog", + "the siamésé cat jumped under the sofa", + None, + "", + ] + ) + + expected_values = cudf.Series( + [ + "the quick f", + "x jumped ", + "ver the lazy d", + "g", + "the siamésé cat jumped under the s", + "fa", + ] + ) + expected_index = strings.index.repeat(strings.str.token_count("o")) + expected = cudf.Series(expected_values, index=expected_index) + + actual = strings.str.tokenize(delimiter="o") + + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +def test_detokenize(): + strings = cudf.Series( + [ + "the", + "quick", + "fox", + "jumped", + "over", + "the", + "lazy", + "dog", + "the", + "siamésé", + "cat", + "jumped", + "under", + "the", + "sofa", + ] + ) + + indices = cudf.Series([0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3]) + actual = strings.str.detokenize(indices) + expected = cudf.Series( + [ + "the quick fox", + "jumped over", + "the lazy dog", + "the siamésé cat jumped under the sofa", + ] + ) + assert type(expected) == type(actual) + assert_eq(expected, actual) + + indices = cudf.Series( + [4, 0, 0, 0, 0, 4, 1, 1, 4, 2, 2, 2, 2, 4, 3], dtype=np.int8 + ) + actual = strings.str.detokenize(indices, "+") + expected = cudf.Series( + [ + "quick+fox+jumped+over", + "lazy+dog", + "siamésé+cat+jumped+under", + "sofa", + "the+the+the+the", + ] + ) + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "delimiter, expected_token_counts", + [ + ("", [10, 9, 0, 0, 5]), + ("o", [6, 3, 0, 0, 1]), + (["a", "e", "i", "o", "u"], [13, 13, 0, 0, 6]), + (["a", "e", "i", "o"], [12, 11, 0, 0, 6]), + ], +) +def test_token_count(delimiter, expected_token_counts): + strings = cudf.Series( + [ + "the quick brown fox jumped over the lazy brown dog", + "the sable siamésé cat jumped under the brown sofa", + None, + "", + "test_str\x01test_str\x02test_str\x03test_str\x04test_str\x05", + ] + ) + + expected = cudf.Series(expected_token_counts) + + actual = strings.str.token_count(delimiter) + + assert type(expected) == type(actual) + assert_eq(expected, actual, check_dtype=False) + + +def test_normalize_spaces(): + strings = cudf.Series( + [ + " the\t quick fox jumped over the lazy dog", + "the siamésé cat\f jumped\t\tunder the sofa ", + None, + "", + ] + ) + expected = cudf.Series( + [ + "the quick fox jumped over the lazy dog", + "the siamésé cat jumped under the sofa", + None, + "", + ] + ) + + actual = strings.str.normalize_spaces() + + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +def test_normalize_characters(): + strings = cudf.Series( + ["乾 \t 乿", "ĂĆCĖÑTÜATE", "âscénd, Descend", "", None, "Stock^ $1"] + ) + expected = cudf.Series( + [ + " 乾 乿 ", + "accentuate", + "ascend , descend", + "", + None, + "stock ^ $ 1", 
+ ] + ) + + actual = strings.str.normalize_characters() + assert type(expected) == type(actual) + assert_eq(expected, actual) + + expected = cudf.Series( + [ + " 乾 乿 ", + "ĂĆCĖÑTÜATE", + "âscénd , Descend", + "", + None, + "Stock ^ $ 1", + ] + ) + actual = strings.str.normalize_characters(do_lower=False) + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "n, separator, expected_values", + [ + ( + 2, + "_", + [ + "this_is", + "is_my", + "my_favorite", + "favorite_book", + "book_on", + "on_my", + "my_bookshelf", + ], + ), + ( + 3, + "-", + [ + "this-is-my", + "is-my-favorite", + "my-favorite-book", + "favorite-book-on", + "book-on-my", + "on-my-bookshelf", + ], + ), + ], +) +def test_ngrams(n, separator, expected_values): + strings = cudf.Series( + ["this", "is", "my", "favorite", "book", "on", "my", "bookshelf"] + ) + + expected = cudf.Series(expected_values) + + actual = strings.str.ngrams(n=n, separator=separator) + + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "n, expected_values, expected_index, as_list", + [ + ( + 2, + [ + "th", + "hi", + "is", + "is", + "my", + "bo", + "oo", + "ok", + "he", + "er", + "re", + cudf.NA, + ], + [1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5, 6], + False, + ), + ( + 3, + [ + "thi", + "his", + cudf.NA, + cudf.NA, + "boo", + "ook", + "her", + "ere", + cudf.NA, + ], + [1, 1, 2, 3, 4, 4, 5, 5, 6], + False, + ), + ( + 3, + [["thi", "his"], [], [], ["boo", "ook"], ["her", "ere"], []], + [1, 2, 3, 4, 5, 6], + True, + ), + ], +) +def test_character_ngrams(n, expected_values, expected_index, as_list): + strings = cudf.Series( + ["this", "is", "my", "book", "here", ""], index=[1, 2, 3, 4, 5, 6] + ) + + expected = cudf.Series(expected_values, index=expected_index) + + actual = strings.str.character_ngrams(n=n, as_list=as_list) + + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "n, separator, expected_values", + [ + ( + 2, + "_", + [ + "this_is", + "is_my", + "my_favorite", + "book_on", + "on_my", + "my_bookshelf", + ], + ), + ( + 3, + "-", + ["this-is-my", "is-my-favorite", "book-on-my", "on-my-bookshelf"], + ), + ], +) +def test_ngrams_tokenize(n, separator, expected_values): + strings = cudf.Series(["this is my favorite", "book on my bookshelf"]) + + expected = cudf.Series(expected_values) + + actual = strings.str.ngrams_tokenize(n=n, separator=separator) + + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +def test_character_tokenize_series(): + sr = cudf.Series( + [ + "hello world", + "sdf", + None, + ( + "goodbye, one-two:three~four+five_six@sev" + "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ" + ), + ] + ) + expected_values = cudf.Series( + [ + "h", + "e", + "l", + "l", + "o", + " ", + "w", + "o", + "r", + "l", + "d", + "s", + "d", + "f", + "g", + "o", + "o", + "d", + "b", + "y", + "e", + ",", + " ", + "o", + "n", + "e", + "-", + "t", + "w", + "o", + ":", + "t", + "h", + "r", + "e", + "e", + "~", + "f", + "o", + "u", + "r", + "+", + "f", + "i", + "v", + "e", + "_", + "s", + "i", + "x", + "@", + "s", + "e", + "v", + "e", + "n", + "#", + "e", + "i", + "g", + "h", + "t", + "^", + "n", + "i", + "n", + "e", + " ", + "h", + "e", + "Œ", + "Ž", + "‘", + "•", + "™", + "œ", + "$", + "µ", + "¾", + "Ť", + "Ơ", + "é", + " ", + "DŽ", + ] + ) + expected_index = sr.index.repeat(sr.str.len().fillna(0)) + expected = cudf.Series(expected_values, index=expected_index) + + actual = sr.str.character_tokenize() + 
assert_eq(expected, actual)
+
+    sr = cudf.Series([""])
+    expected = cudf.Series([], dtype="object")
+
+    actual = sr.str.character_tokenize()
+    assert_eq(expected, actual)
+
+    sr = cudf.Series(["a"])
+    expected = cudf.Series(["a"])
+
+    actual = sr.str.character_tokenize()
+    assert_eq(expected, actual)
+
+
+def test_character_tokenize_index():
+    sr = cudf.core.index.as_index(
+        [
+            "hello world",
+            "sdf",
+            None,
+            (
+                "goodbye, one-two:three~four+five_six@sev"
+                "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé DŽ"
+            ),
+        ]
+    )
+    expected = cudf.core.index.as_index(
+        [
+            "h",
+            "e",
+            "l",
+            "l",
+            "o",
+            " ",
+            "w",
+            "o",
+            "r",
+            "l",
+            "d",
+            "s",
+            "d",
+            "f",
+            "g",
+            "o",
+            "o",
+            "d",
+            "b",
+            "y",
+            "e",
+            ",",
+            " ",
+            "o",
+            "n",
+            "e",
+            "-",
+            "t",
+            "w",
+            "o",
+            ":",
+            "t",
+            "h",
+            "r",
+            "e",
+            "e",
+            "~",
+            "f",
+            "o",
+            "u",
+            "r",
+            "+",
+            "f",
+            "i",
+            "v",
+            "e",
+            "_",
+            "s",
+            "i",
+            "x",
+            "@",
+            "s",
+            "e",
+            "v",
+            "e",
+            "n",
+            "#",
+            "e",
+            "i",
+            "g",
+            "h",
+            "t",
+            "^",
+            "n",
+            "i",
+            "n",
+            "e",
+            " ",
+            "h",
+            "e",
+            "Œ",
+            "Ž",
+            "‘",
+            "•",
+            "™",
+            "œ",
+            "$",
+            "µ",
+            "¾",
+            "Ť",
+            "Ơ",
+            "é",
+            " ",
+            "DŽ",
+        ]
+    )
+
+    actual = sr.str.character_tokenize()
+    assert_eq(expected, actual)
+
+    sr = cudf.Index([""])
+    expected = cudf.Index([], dtype="object")
+
+    actual = sr.str.character_tokenize()
+    assert_eq(expected, actual)
+
+    sr = cudf.core.index.as_index(["a"])
+    expected = cudf.core.index.as_index(["a"])
+
+    actual = sr.str.character_tokenize()
+    assert_eq(expected, actual)
+
+
+def test_text_replace_tokens():
+    sr = cudf.Series(["this is me", "theme music", ""])
+    targets = cudf.Series(["is", "me"])
+
+    expected = cudf.Series(["this _ _", "theme music", ""])
+    actual = sr.str.replace_tokens(targets, "_")
+
+    assert_eq(expected, actual)
+
+    replacements = cudf.Series(["IS", "ME"])
+    expected = cudf.Series(["this IS ME", "theme music", ""])
+    actual = sr.str.replace_tokens(targets, replacements)
+
+    assert_eq(expected, actual)
+
+    sr = cudf.Series(
+        [
+            "this is a small text ☕",
+            "this \t\t is ; ; - + a looooooooooonnnnnnnggggggg text \n\t",
+            "emptyme",
+        ],
+    )
+    targets = cudf.Series(
+        ["a", "☕", "\t", "looooooooooonnnnnnnggggggg", "emptyme"]
+    )
+    replacements = cudf.Series(["the", "🚒", "🚒🚒🚒🚒", "🔥🔥", ""])
+
+    expected = cudf.Series(
+        [
+            "this is the small text 🚒",
+            "this \t\t is ; ; - + the 🔥🔥 text \n\t",
+            "",
+        ]
+    )
+    actual = sr.str.replace_tokens(targets, replacements)
+
+    assert_eq(expected, actual)
+
+    sr = cudf.Series(
+        ["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"]
+    )
+    targets = cudf.Series(["🌬", "🔥", "🌊"])
+    replacements = "🚰"
+
+    expected = cudf.Series(
+        ["All-we-need;is;🚰", "\tall-we-need0is;🚰", "all;we:need+is;🚰"]
+    )
+    actual = sr.str.replace_tokens(targets, replacements, delimiter=";")
+
+    assert_eq(expected, actual)
+    assert_eq(sr, sr.str.replace_tokens(targets, replacements))
+    assert_eq(sr, sr.str.replace_tokens([""], [""]))
+
+
+def test_text_replace_tokens_error_cases():
+    sr = cudf.Series(["this is me", "theme music", ""])
+
+    with pytest.raises(
+        TypeError,
+        match="targets should be an array-like or a Series object, "
+        "found <class 'str'>",
+    ):
+        sr.str.replace_tokens("me", ["a"])
+
+    with pytest.raises(
+        ValueError,
+        match="targets and replacements should be same size"
+        " sequences unless replacements is a string.",
+    ):
+        sr.str.replace_tokens(["a"], ["me", "ki"])
+
+    with pytest.raises(
+        TypeError,
+        match="replacements should be an str, array-like or Series object,"
+        " found <class 'set'>",
+    ):
+        sr.str.replace_tokens(["a"], {"s"})
+
+    with pytest.raises(
+        TypeError,
+        match="Type of delimiter should be a string, found <class 'list'>",
+    ):
+        sr.str.replace_tokens(["a"], ["s"], delimiter=["a", "b"])
+
+
+def test_text_filter_tokens():
+    sr = cudf.Series(["the quick brown fox jumped", "over the lazy dog", ""])
+
+    expected = cudf.Series([" quick brown  jumped", "   ", ""])
+    actual = sr.str.filter_tokens(5)
+    assert_eq(expected, actual)
+
+    expected = cudf.Series(["🔥 quick brown 🔥 jumped", "🔥 🔥 🔥 🔥", ""])
+    actual = sr.str.filter_tokens(5, "🔥")
+    assert_eq(expected, actual)
+
+    sr = cudf.Series(
+        ["All-we-need;is;🔥", "\tall-we-need0is;🌊", "all;we:need+is;🌬"]
+    )
+    expected = cudf.Series(
+        ["All-we-need;is;--", "\tall-we-need0is;--", "all;we:need+is;--"]
+    )
+    actual = sr.str.filter_tokens(2, "--", ";")
+    assert_eq(expected, actual)
+
+    assert_eq(sr, sr.str.filter_tokens(1))
+
+
+def test_text_filter_tokens_error_cases():
+    sr = cudf.Series(["abc", "def", ""])
+
+    with pytest.raises(
+        TypeError,
+        match="Type of replacement should be a string, found <class 'list'>",
+    ):
+        sr.str.filter_tokens(3, replacement=["a", "b"])
+
+    with pytest.raises(
+        TypeError,
+        match="Type of delimiter should be a string, found <class 'list'>",
+    ):
+        sr.str.filter_tokens(3, delimiter=["a", "b"])
+
+
+def test_edit_distance():
+    sr = cudf.Series(["kitten", "saturday", "address", "book"])
+    tg = cudf.Series(["sitting", "sunday", "addressee", "back"])
+
+    expected = cudf.Series([3, 3, 2, 2], dtype=np.int32)
+    actual = sr.str.edit_distance(tg)
+    assert_eq(expected, actual)
+
+    expected = cudf.Series([0, 7, 6, 6], dtype=np.int32)
+    actual = sr.str.edit_distance("kitten")
+    assert_eq(expected, actual)
+
+
+def test_edit_distance_matrix():
+    # normal
+    sr = cudf.Series(["rounded", "bounded", "bounce", "trounce", "ounce"])
+
+    expected = cudf.Series(
+        [
+            [0, 1, 3, 3, 3],
+            [1, 0, 2, 4, 3],
+            [3, 2, 0, 2, 1],
+            [3, 4, 2, 0, 2],
+            [3, 3, 1, 2, 0],
+        ]
+    )
+    got = sr.str.edit_distance_matrix()
+
+    assert_eq(expected, got, check_dtype=False)
+
+    # 1-row series
+    sr2 = cudf.Series(["x"])
+    with pytest.raises(ValueError, match="Require size >= 2"):
+        sr2.str.edit_distance_matrix()
+
+    # null rows
+    sr3 = cudf.Series(["rounded", None, "bounce", "trounce", "ounce"])
+    with pytest.raises(ValueError, match="Cannot compute"):
+        sr3.str.edit_distance_matrix()
+
+
+def test_porter_stemmer_measure():
+    strings = cudf.Series(
+        [
+            "tr",
+            "ee",
+            "tree",
+            "y",
+            "by",
+            "trouble",
+            "oats",
+            "trees",
+            "ivy",
+            "troubles",
+            "private",
+            "oaten",
+            "orrery",
+            None,
+            "",
+        ]
+    )
+    expected = cudf.Series(
+        [0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, None, 0], dtype=np.int32
+    )
+
+    actual = strings.str.porter_stemmer_measure()
+
+    assert type(expected) == type(actual)
+    assert_eq(expected, actual)
+
+
+def test_is_vowel_consonant():
+    strings = cudf.Series(
+        ["tr", "ee", "tree", "y", "by", "oats", "ivy", "orrery", None, ""]
+    )
+    expected = cudf.Series(
+        [False, False, True, False, False, False, True, False, None, False]
+    )
+    actual = strings.str.is_vowel(2)
+    assert type(expected) == type(actual)
+    assert_eq(expected, actual)
+
+    expected = cudf.Series(
+        [True, False, True, False, False, False, True, True, None, False]
+    )
+    actual = strings.str.is_consonant(1)
+    assert type(expected) == type(actual)
+    assert_eq(expected, actual)
+
+    indices = cudf.Series([2, 1, 0, 0, 1, 2, 0, 3, 0, 0])
+    expected = cudf.Series(
+        [False, True, False, False, True, False, True, True, None, False]
+    )
+    actual = strings.str.is_vowel(indices)
+    assert type(expected) == type(actual)
+    assert_eq(expected, 
actual) + + expected = cudf.Series( + [False, False, True, True, False, True, False, False, None, False] + ) + actual = strings.str.is_consonant(indices) + assert type(expected) == type(actual) + assert_eq(expected, actual) + + +def test_minhash(): + strings = cudf.Series(["this is my", "favorite book", None, ""]) + expected = cudf.Series([21141582, 962346254, None, 0], dtype=np.uint32) + actual = strings.str.minhash() + assert_eq(expected, actual) + seeds = cudf.Series([0, 1, 2], dtype=np.uint32) + expected = cudf.Series( + [ + cudf.Series([1305480167, 668155704, 34311509], dtype=np.uint32), + cudf.Series([32665384, 3470118, 363147162], dtype=np.uint32), + None, + cudf.Series([0, 0, 0], dtype=np.uint32), + ] + ) + actual = strings.str.minhash(seeds=seeds, n=5) + assert_eq(expected, actual) + + with pytest.raises(ValueError): + strings.str.minhash(seeds=7) + with pytest.raises(ValueError): + strings.str.minhash(seeds=seeds, method="md5") + with pytest.raises(ValueError): + seeds = cudf.Series([0, 1, 2], dtype=np.int32) + strings.str.minhash(seeds=seeds)
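
The expected_index values used throughout these tests encode the contract
that token_count() returns, for each row, exactly the number of rows that
tokenize() produces for it, with null and empty rows counting as zero. A
minimal standalone sketch of that relationship (not part of the patch;
assumes a CUDA-capable environment with cudf installed):

import cudf

s = cudf.Series(["the quick fox", None, ""])
tokens = s.str.tokenize()      # one output row per whitespace token
counts = s.str.token_count()   # [3, 0, 0]; null and empty rows yield 0
assert len(tokens) == counts.sum()
# tokens.index equals s.index.repeat(counts), which is exactly how the
# expected_index values in the tests above are constructed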