diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0ac54113278..ebdbf3e6db1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -85,7 +85,7 @@ repos: (?x)^( ^cpp/include/cudf_test/cxxopts.hpp| ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*| - ^python/cudf/cudf/tests/test_text.py + ^python/cudf/cudf/tests/text/test_text_methods.py ) - repo: local hooks: diff --git a/python/cudf/cudf/tests/input_output/test_text.py b/python/cudf/cudf/tests/input_output/test_text.py index 06777c8e6af..acba13bb5b0 100644 --- a/python/cudf/cudf/tests/input_output/test_text.py +++ b/python/cudf/cudf/tests/input_output/test_text.py @@ -1 +1,164 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. + +from io import StringIO + +import pytest + +import cudf +from cudf.testing._utils import assert_eq + + +@pytest.fixture(scope="module") +def datadir(datadir): + return datadir / "text" + + +def test_read_text(datadir): + chess_file = str(datadir) + "/chess.pgn" + delimiter = "1." + + with open(chess_file) as f: + content = f.read().split(delimiter) + + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series( + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] + ) + + actual = cudf.read_text(chess_file, delimiter=delimiter) + + assert_eq(expected, actual) + + +def test_read_text_byte_range(datadir): + chess_file = str(datadir) + "/chess.pgn" + delimiter = "1." + + with open(chess_file, "r") as f: + data = f.read() + content = data.split(delimiter) + + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series( + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] + ) + + byte_range_size = (len(data) // 3) + (len(data) % 3 != 0) + + actual_0 = cudf.read_text( + chess_file, + delimiter=delimiter, + byte_range=[byte_range_size * 0, byte_range_size], + ) + actual_1 = cudf.read_text( + chess_file, + delimiter=delimiter, + byte_range=[byte_range_size * 1, byte_range_size], + ) + actual_2 = cudf.read_text( + chess_file, + delimiter=delimiter, + byte_range=[byte_range_size * 2, byte_range_size], + ) + + actual = cudf.concat([actual_0, actual_1, actual_2], ignore_index=True) + + assert_eq(expected, actual) + + +def test_read_text_byte_range_large(tmpdir): + content = "".join(("\n" if x % 5 == 4 else "x") for x in range(0, 3000)) + delimiter = "\n" + temp_file = str(tmpdir) + "/temp.txt" + + with open(temp_file, "w") as f: + f.write(content) + + expected = cudf.Series(["xxxx\n" for i in range(0, 200)]) + + actual = cudf.read_text( + temp_file, delimiter=delimiter, byte_range=[1000, 1000] + ) + + assert_eq(expected, actual) + + +def test_read_text_in_memory(datadir): + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series(["x::", "y::", "z"]) + + actual = cudf.read_text(StringIO("x::y::z"), delimiter="::") + + assert_eq(expected, actual) + + +def test_read_text_in_memory_strip_delimiter(datadir): + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series(["x", "y", "z"]) + + actual = cudf.read_text( + StringIO("x::y::z"), delimiter="::", strip_delimiters=True + ) + + assert_eq(expected, actual) + + +def test_read_text_bgzip(datadir): + chess_file_compressed = str(datadir) + "/chess.pgn.gz" + chess_file = str(datadir) + "/chess.pgn" + delimiter = "1." + + with open(chess_file) as f: + content = f.read().split(delimiter) + + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series( + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] + ) + + actual = cudf.read_text( + chess_file_compressed, compression="bgzip", delimiter=delimiter + ) + + assert_eq(expected, actual) + + +def test_read_text_bgzip_offsets(datadir): + chess_file_compressed = str(datadir) + "/chess.pgn.gz" + chess_file = str(datadir) + "/chess.pgn" + delimiter = "1." + + with open(chess_file) as f: + content = f.read()[29:695].split(delimiter) + + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series( + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] + ) + + actual = cudf.read_text( + chess_file_compressed, + compression="bgzip", + compression_offsets=[58 * 2**16 + 2, 781 * 2**16 + 7], + delimiter=delimiter, + ) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_subword_tokenizer.py b/python/cudf/cudf/tests/test_subword_tokenizer.py deleted file mode 100644 index 9084132243e..00000000000 --- a/python/cudf/cudf/tests/test_subword_tokenizer.py +++ /dev/null @@ -1,238 +0,0 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. -import os - -import cupy -import numpy as np -import pytest - -import cudf -from cudf.core.subword_tokenizer import SubwordTokenizer -from cudf.testing._utils import assert_eq - - -@pytest.fixture(scope="module") -def datadir(datadir): - return os.path.join(datadir, "subword_tokenizer_data") - - -def assert_equal_tokenization_outputs(hf_output, cudf_output): - assert ( - np.sum(hf_output["input_ids"] != cudf_output["input_ids"].get()) == 0 - ) - assert ( - np.sum( - hf_output["attention_mask"] != cudf_output["attention_mask"].get() - ) - == 0 - ) - - -@pytest.mark.parametrize("seq_len", [32, 64]) -@pytest.mark.parametrize("stride", [0, 15, 30]) -@pytest.mark.parametrize("add_special_tokens", [True, False]) -@pytest.mark.parametrize("do_lower_case", [True, False]) -def test_subword_tokenize( - seq_len, stride, add_special_tokens, do_lower_case, datadir -): - with open( - os.path.join(datadir, "test_sentences.txt"), encoding="utf-8" - ) as file: - input_sentence_ls = [line.strip() for line in file] - - vocab_dir = os.path.join(datadir, "bert_base_cased_sampled") - - transformers = pytest.importorskip("transformers") - - hf_tokenizer = transformers.BertTokenizer.from_pretrained( - vocab_dir, do_lower_case=do_lower_case - ) - - hf_output = hf_tokenizer( - input_sentence_ls, - max_length=seq_len, - stride=stride, - padding="max_length", - return_tensors="np", - truncation=True, - add_special_tokens=add_special_tokens, - ) - - vocab_hash = os.path.join(vocab_dir, "vocab-hash.txt") - str_series = cudf.Series(input_sentence_ls) - cudf_tokenizer = SubwordTokenizer(vocab_hash, do_lower_case=do_lower_case) - cudf_output = cudf_tokenizer( - str_series, - max_length=seq_len, - max_num_rows=len(str_series), - stride=stride, - padding="max_length", - return_tensors="cp", - truncation=True, - add_special_tokens=add_special_tokens, - ) - assert_equal_tokenization_outputs(hf_output, cudf_output) - - -def test_subword_tokenize_with_truncation(datadir): - vocab_dir = os.path.join(datadir, "bert_base_cased_sampled") - vocab_hash = os.path.join(vocab_dir, "vocab-hash.txt") - str_series = cudf.Series(["Test error"]) - cudf_tokenizer = SubwordTokenizer(vocab_hash) - - error_msg = ( - "Adding special tokens is not supported with truncation = False. " - "Custom Cupy kernel can potentially " - "be used to add it. For reference " - "see: _bert_add_special_tokens" - ) - - with pytest.raises(NotImplementedError, match=error_msg): - cudf_tokenizer( - str_series, - max_length=64, - max_num_rows=len(str_series), - truncation=False, - add_special_tokens=True, - ) - - -def test_text_subword_tokenize(tmpdir): - sr = cudf.Series( - [ - "This is a test", - "A test this is", - "Is test a this", - "Test test", - "this This", - ] - ) - hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt") - content = "1\n0\n23\n" - coefficients = [65559] * 23 - for c in coefficients: - content = content + str(c) + " 0\n" - # based on values from the bert_hash_table.txt file for the - # test words used here: 'this' 'is' 'a' test' - table = [0] * 23 - table[0] = 3015668 - table[1] = 6205475701751155871 - table[5] = 6358029 - table[16] = 451412625363 - table[20] = 6206321707968235495 - content = content + "23\n" - for v in table: - content = content + str(v) + "\n" - content = content + "100\n101\n102\n\n" - hash_file.write(content) - - cudf_tokenizer = SubwordTokenizer(hash_file) - - token_d = cudf_tokenizer( - sr, 8, 8, add_special_tokens=False, truncation=True - ) - tokens, masks, metadata = ( - token_d["input_ids"], - token_d["attention_mask"], - token_d["metadata"], - ) - expected_tokens = cupy.asarray( - [ - 2023, - 2003, - 1037, - 3231, - 0, - 0, - 0, - 0, - 1037, - 3231, - 2023, - 2003, - 0, - 0, - 0, - 0, - 2003, - 3231, - 1037, - 2023, - 0, - 0, - 0, - 0, - 3231, - 3231, - 0, - 0, - 0, - 0, - 0, - 0, - 2023, - 2023, - 0, - 0, - 0, - 0, - 0, - 0, - ], - dtype=np.uint32, - ) - expected_tokens = expected_tokens.reshape(-1, 8) - assert_eq(expected_tokens, tokens) - - expected_masks = cupy.asarray( - [ - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 0, - ], - dtype=np.uint32, - ) - expected_masks = expected_masks.reshape(-1, 8) - assert_eq(expected_masks, masks) - - expected_metadata = cupy.asarray( - [0, 0, 3, 1, 0, 3, 2, 0, 3, 3, 0, 1, 4, 0, 1], dtype=np.uint32 - ) - expected_metadata = expected_metadata.reshape(-1, 3) - assert_eq(expected_metadata, metadata) diff --git a/python/cudf/cudf/tests/text/test_subword_tokenizer.py b/python/cudf/cudf/tests/text/test_subword_tokenizer.py index 06777c8e6af..ac17daa8601 100644 --- a/python/cudf/cudf/tests/text/test_subword_tokenizer.py +++ b/python/cudf/cudf/tests/text/test_subword_tokenizer.py @@ -1 +1,238 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +import os + +import cupy +import numpy as np +import pytest + +import cudf +from cudf.core.subword_tokenizer import SubwordTokenizer +from cudf.testing._utils import assert_eq + + +@pytest.fixture(scope="module") +def datadir(datadir): + return os.path.join(datadir, "subword_tokenizer_data") + + +def assert_equal_tokenization_outputs(hf_output, cudf_output): + assert ( + np.sum(hf_output["input_ids"] != cudf_output["input_ids"].get()) == 0 + ) + assert ( + np.sum( + hf_output["attention_mask"] != cudf_output["attention_mask"].get() + ) + == 0 + ) + + +@pytest.mark.parametrize("seq_len", [32, 64]) +@pytest.mark.parametrize("stride", [0, 15, 30]) +@pytest.mark.parametrize("add_special_tokens", [True, False]) +@pytest.mark.parametrize("do_lower_case", [True, False]) +def test_subword_tokenize( + seq_len, stride, add_special_tokens, do_lower_case, datadir +): + with open( + os.path.join(datadir, "test_sentences.txt"), encoding="utf-8" + ) as file: + input_sentence_ls = [line.strip() for line in file] + + vocab_dir = os.path.join(datadir, "bert_base_cased_sampled") + + transformers = pytest.importorskip("transformers") + + hf_tokenizer = transformers.BertTokenizer.from_pretrained( + vocab_dir, do_lower_case=do_lower_case + ) + + hf_output = hf_tokenizer( + input_sentence_ls, + max_length=seq_len, + stride=stride, + padding="max_length", + return_tensors="np", + truncation=True, + add_special_tokens=add_special_tokens, + ) + + vocab_hash = os.path.join(vocab_dir, "vocab-hash.txt") + str_series = cudf.Series(input_sentence_ls) + cudf_tokenizer = SubwordTokenizer(vocab_hash, do_lower_case=do_lower_case) + cudf_output = cudf_tokenizer( + str_series, + max_length=seq_len, + max_num_rows=len(str_series), + stride=stride, + padding="max_length", + return_tensors="cp", + truncation=True, + add_special_tokens=add_special_tokens, + ) + assert_equal_tokenization_outputs(hf_output, cudf_output) + + +def test_subword_tokenize_with_truncation(datadir): + vocab_dir = os.path.join(datadir, "bert_base_cased_sampled") + vocab_hash = os.path.join(vocab_dir, "vocab-hash.txt") + str_series = cudf.Series(["Test error"]) + cudf_tokenizer = SubwordTokenizer(vocab_hash) + + error_msg = ( + "Adding special tokens is not supported with truncation = False. " + "Custom Cupy kernel can potentially " + "be used to add it. For reference " + "see: _bert_add_special_tokens" + ) + + with pytest.raises(NotImplementedError, match=error_msg): + cudf_tokenizer( + str_series, + max_length=64, + max_num_rows=len(str_series), + truncation=False, + add_special_tokens=True, + ) + + +def test_text_subword_tokenize(tmpdir): + sr = cudf.Series( + [ + "This is a test", + "A test this is", + "Is test a this", + "Test test", + "this This", + ] + ) + hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt") + content = "1\n0\n23\n" + coefficients = [65559] * 23 + for c in coefficients: + content = content + str(c) + " 0\n" + # based on values from the bert_hash_table.txt file for the + # test words used here: 'this' 'is' 'a' test' + table = [0] * 23 + table[0] = 3015668 + table[1] = 6205475701751155871 + table[5] = 6358029 + table[16] = 451412625363 + table[20] = 6206321707968235495 + content = content + "23\n" + for v in table: + content = content + str(v) + "\n" + content = content + "100\n101\n102\n\n" + hash_file.write(content) + + cudf_tokenizer = SubwordTokenizer(hash_file) + + token_d = cudf_tokenizer( + sr, 8, 8, add_special_tokens=False, truncation=True + ) + tokens, masks, metadata = ( + token_d["input_ids"], + token_d["attention_mask"], + token_d["metadata"], + ) + expected_tokens = cupy.asarray( + [ + 2023, + 2003, + 1037, + 3231, + 0, + 0, + 0, + 0, + 1037, + 3231, + 2023, + 2003, + 0, + 0, + 0, + 0, + 2003, + 3231, + 1037, + 2023, + 0, + 0, + 0, + 0, + 3231, + 3231, + 0, + 0, + 0, + 0, + 0, + 0, + 2023, + 2023, + 0, + 0, + 0, + 0, + 0, + 0, + ], + dtype=np.uint32, + ) + expected_tokens = expected_tokens.reshape(-1, 8) + assert_eq(expected_tokens, tokens) + + expected_masks = cupy.asarray( + [ + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + ], + dtype=np.uint32, + ) + expected_masks = expected_masks.reshape(-1, 8) + assert_eq(expected_masks, masks) + + expected_metadata = cupy.asarray( + [0, 0, 3, 1, 0, 3, 2, 0, 3, 3, 0, 1, 4, 0, 1], dtype=np.uint32 + ) + expected_metadata = expected_metadata.reshape(-1, 3) + assert_eq(expected_metadata, metadata) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/text/test_text_methods.py similarity index 82% rename from python/cudf/cudf/tests/test_text.py rename to python/cudf/cudf/tests/text/test_text_methods.py index f0e0e52142f..ea789b99220 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -1,7 +1,5 @@ # Copyright (c) 2019-2023, NVIDIA CORPORATION. -from io import StringIO - import numpy as np import pytest @@ -9,11 +7,6 @@ from cudf.testing._utils import assert_eq -@pytest.fixture(scope="module") -def datadir(datadir): - return datadir / "text" - - def test_tokenize(): strings = cudf.Series( [ @@ -842,154 +835,3 @@ def test_minhash(): with pytest.raises(ValueError): seeds = cudf.Series([0, 1, 2], dtype=np.int32) strings.str.minhash(seeds=seeds) - - -def test_read_text(datadir): - chess_file = str(datadir) + "/chess.pgn" - delimiter = "1." - - with open(chess_file) as f: - content = f.read().split(delimiter) - - # Since Python split removes the delimiter and read_text does - # not we need to add it back to the 'content' - expected = cudf.Series( - [ - c + delimiter if i < (len(content) - 1) else c - for i, c in enumerate(content) - ] - ) - - actual = cudf.read_text(chess_file, delimiter=delimiter) - - assert_eq(expected, actual) - - -def test_read_text_byte_range(datadir): - chess_file = str(datadir) + "/chess.pgn" - delimiter = "1." - - with open(chess_file, "r") as f: - data = f.read() - content = data.split(delimiter) - - # Since Python split removes the delimiter and read_text does - # not we need to add it back to the 'content' - expected = cudf.Series( - [ - c + delimiter if i < (len(content) - 1) else c - for i, c in enumerate(content) - ] - ) - - byte_range_size = (len(data) // 3) + (len(data) % 3 != 0) - - actual_0 = cudf.read_text( - chess_file, - delimiter=delimiter, - byte_range=[byte_range_size * 0, byte_range_size], - ) - actual_1 = cudf.read_text( - chess_file, - delimiter=delimiter, - byte_range=[byte_range_size * 1, byte_range_size], - ) - actual_2 = cudf.read_text( - chess_file, - delimiter=delimiter, - byte_range=[byte_range_size * 2, byte_range_size], - ) - - actual = cudf.concat([actual_0, actual_1, actual_2], ignore_index=True) - - assert_eq(expected, actual) - - -def test_read_text_byte_range_large(tmpdir): - content = "".join(("\n" if x % 5 == 4 else "x") for x in range(0, 3000)) - delimiter = "\n" - temp_file = str(tmpdir) + "/temp.txt" - - with open(temp_file, "w") as f: - f.write(content) - - expected = cudf.Series(["xxxx\n" for i in range(0, 200)]) - - actual = cudf.read_text( - temp_file, delimiter=delimiter, byte_range=[1000, 1000] - ) - - assert_eq(expected, actual) - - -def test_read_text_in_memory(datadir): - # Since Python split removes the delimiter and read_text does - # not we need to add it back to the 'content' - expected = cudf.Series(["x::", "y::", "z"]) - - actual = cudf.read_text(StringIO("x::y::z"), delimiter="::") - - assert_eq(expected, actual) - - -def test_read_text_in_memory_strip_delimiter(datadir): - # Since Python split removes the delimiter and read_text does - # not we need to add it back to the 'content' - expected = cudf.Series(["x", "y", "z"]) - - actual = cudf.read_text( - StringIO("x::y::z"), delimiter="::", strip_delimiters=True - ) - - assert_eq(expected, actual) - - -def test_read_text_bgzip(datadir): - chess_file_compressed = str(datadir) + "/chess.pgn.gz" - chess_file = str(datadir) + "/chess.pgn" - delimiter = "1." - - with open(chess_file) as f: - content = f.read().split(delimiter) - - # Since Python split removes the delimiter and read_text does - # not we need to add it back to the 'content' - expected = cudf.Series( - [ - c + delimiter if i < (len(content) - 1) else c - for i, c in enumerate(content) - ] - ) - - actual = cudf.read_text( - chess_file_compressed, compression="bgzip", delimiter=delimiter - ) - - assert_eq(expected, actual) - - -def test_read_text_bgzip_offsets(datadir): - chess_file_compressed = str(datadir) + "/chess.pgn.gz" - chess_file = str(datadir) + "/chess.pgn" - delimiter = "1." - - with open(chess_file) as f: - content = f.read()[29:695].split(delimiter) - - # Since Python split removes the delimiter and read_text does - # not we need to add it back to the 'content' - expected = cudf.Series( - [ - c + delimiter if i < (len(content) - 1) else c - for i, c in enumerate(content) - ] - ) - - actual = cudf.read_text( - chess_file_compressed, - compression="bgzip", - compression_offsets=[58 * 2**16 + 2, 781 * 2**16 + 7], - delimiter=delimiter, - ) - - assert_eq(expected, actual)