From cc317edbe48923c4a71673e1ef294d4050a29418 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 26 May 2023 05:03:49 -0400 Subject: [PATCH] Separate io-text and nvtext pytests into different files (#13435) Cleans up source files for nvtext and io-text pytests. The pytests are placed into separate files: `test_io_text.py` for the io-text pytests and `test_nvtext.py` for the nvtext pytests. Also removed the `python/cudf/cudf/tests/text` folder which contained 2 empty `.py` files. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/13435 --- .pre-commit-config.yaml | 2 +- .../cudf/cudf/tests/input_output/test_text.py | 165 +++++++++++- .../cudf/cudf/tests/test_subword_tokenizer.py | 238 ----------------- .../cudf/tests/text/test_subword_tokenizer.py | 239 +++++++++++++++++- .../test_text_methods.py} | 158 ------------ 5 files changed, 403 insertions(+), 399 deletions(-) delete mode 100644 python/cudf/cudf/tests/test_subword_tokenizer.py rename python/cudf/cudf/tests/{test_text.py => text/test_text_methods.py} (82%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0ac54113278..ebdbf3e6db1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -85,7 +85,7 @@ repos: (?x)^( ^cpp/include/cudf_test/cxxopts.hpp| ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*| - ^python/cudf/cudf/tests/test_text.py + ^python/cudf/cudf/tests/text/test_text_methods.py ) - repo: local hooks: diff --git a/python/cudf/cudf/tests/input_output/test_text.py b/python/cudf/cudf/tests/input_output/test_text.py index 06777c8e6af..acba13bb5b0 100644 --- a/python/cudf/cudf/tests/input_output/test_text.py +++ b/python/cudf/cudf/tests/input_output/test_text.py @@ -1 +1,164 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2019-2023, NVIDIA CORPORATION. + +from io import StringIO + +import pytest + +import cudf +from cudf.testing._utils import assert_eq + + +@pytest.fixture(scope="module") +def datadir(datadir): + return datadir / "text" + + +def test_read_text(datadir): + chess_file = str(datadir) + "/chess.pgn" + delimiter = "1." + + with open(chess_file) as f: + content = f.read().split(delimiter) + + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series( + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] + ) + + actual = cudf.read_text(chess_file, delimiter=delimiter) + + assert_eq(expected, actual) + + +def test_read_text_byte_range(datadir): + chess_file = str(datadir) + "/chess.pgn" + delimiter = "1." 
+ + with open(chess_file, "r") as f: + data = f.read() + content = data.split(delimiter) + + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series( + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] + ) + + byte_range_size = (len(data) // 3) + (len(data) % 3 != 0) + + actual_0 = cudf.read_text( + chess_file, + delimiter=delimiter, + byte_range=[byte_range_size * 0, byte_range_size], + ) + actual_1 = cudf.read_text( + chess_file, + delimiter=delimiter, + byte_range=[byte_range_size * 1, byte_range_size], + ) + actual_2 = cudf.read_text( + chess_file, + delimiter=delimiter, + byte_range=[byte_range_size * 2, byte_range_size], + ) + + actual = cudf.concat([actual_0, actual_1, actual_2], ignore_index=True) + + assert_eq(expected, actual) + + +def test_read_text_byte_range_large(tmpdir): + content = "".join(("\n" if x % 5 == 4 else "x") for x in range(0, 3000)) + delimiter = "\n" + temp_file = str(tmpdir) + "/temp.txt" + + with open(temp_file, "w") as f: + f.write(content) + + expected = cudf.Series(["xxxx\n" for i in range(0, 200)]) + + actual = cudf.read_text( + temp_file, delimiter=delimiter, byte_range=[1000, 1000] + ) + + assert_eq(expected, actual) + + +def test_read_text_in_memory(datadir): + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series(["x::", "y::", "z"]) + + actual = cudf.read_text(StringIO("x::y::z"), delimiter="::") + + assert_eq(expected, actual) + + +def test_read_text_in_memory_strip_delimiter(datadir): + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series(["x", "y", "z"]) + + actual = cudf.read_text( + StringIO("x::y::z"), delimiter="::", strip_delimiters=True + ) + + assert_eq(expected, actual) + + +def test_read_text_bgzip(datadir): + chess_file_compressed = str(datadir) + "/chess.pgn.gz" + chess_file = str(datadir) + "/chess.pgn" + delimiter = "1." + + with open(chess_file) as f: + content = f.read().split(delimiter) + + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series( + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] + ) + + actual = cudf.read_text( + chess_file_compressed, compression="bgzip", delimiter=delimiter + ) + + assert_eq(expected, actual) + + +def test_read_text_bgzip_offsets(datadir): + chess_file_compressed = str(datadir) + "/chess.pgn.gz" + chess_file = str(datadir) + "/chess.pgn" + delimiter = "1." + + with open(chess_file) as f: + content = f.read()[29:695].split(delimiter) + + # Since Python split removes the delimiter and read_text does + # not we need to add it back to the 'content' + expected = cudf.Series( + [ + c + delimiter if i < (len(content) - 1) else c + for i, c in enumerate(content) + ] + ) + + actual = cudf.read_text( + chess_file_compressed, + compression="bgzip", + compression_offsets=[58 * 2**16 + 2, 781 * 2**16 + 7], + delimiter=delimiter, + ) + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_subword_tokenizer.py b/python/cudf/cudf/tests/test_subword_tokenizer.py deleted file mode 100644 index 9084132243e..00000000000 --- a/python/cudf/cudf/tests/test_subword_tokenizer.py +++ /dev/null @@ -1,238 +0,0 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
-import os - -import cupy -import numpy as np -import pytest - -import cudf -from cudf.core.subword_tokenizer import SubwordTokenizer -from cudf.testing._utils import assert_eq - - -@pytest.fixture(scope="module") -def datadir(datadir): - return os.path.join(datadir, "subword_tokenizer_data") - - -def assert_equal_tokenization_outputs(hf_output, cudf_output): - assert ( - np.sum(hf_output["input_ids"] != cudf_output["input_ids"].get()) == 0 - ) - assert ( - np.sum( - hf_output["attention_mask"] != cudf_output["attention_mask"].get() - ) - == 0 - ) - - -@pytest.mark.parametrize("seq_len", [32, 64]) -@pytest.mark.parametrize("stride", [0, 15, 30]) -@pytest.mark.parametrize("add_special_tokens", [True, False]) -@pytest.mark.parametrize("do_lower_case", [True, False]) -def test_subword_tokenize( - seq_len, stride, add_special_tokens, do_lower_case, datadir -): - with open( - os.path.join(datadir, "test_sentences.txt"), encoding="utf-8" - ) as file: - input_sentence_ls = [line.strip() for line in file] - - vocab_dir = os.path.join(datadir, "bert_base_cased_sampled") - - transformers = pytest.importorskip("transformers") - - hf_tokenizer = transformers.BertTokenizer.from_pretrained( - vocab_dir, do_lower_case=do_lower_case - ) - - hf_output = hf_tokenizer( - input_sentence_ls, - max_length=seq_len, - stride=stride, - padding="max_length", - return_tensors="np", - truncation=True, - add_special_tokens=add_special_tokens, - ) - - vocab_hash = os.path.join(vocab_dir, "vocab-hash.txt") - str_series = cudf.Series(input_sentence_ls) - cudf_tokenizer = SubwordTokenizer(vocab_hash, do_lower_case=do_lower_case) - cudf_output = cudf_tokenizer( - str_series, - max_length=seq_len, - max_num_rows=len(str_series), - stride=stride, - padding="max_length", - return_tensors="cp", - truncation=True, - add_special_tokens=add_special_tokens, - ) - assert_equal_tokenization_outputs(hf_output, cudf_output) - - -def test_subword_tokenize_with_truncation(datadir): - vocab_dir = os.path.join(datadir, "bert_base_cased_sampled") - vocab_hash = os.path.join(vocab_dir, "vocab-hash.txt") - str_series = cudf.Series(["Test error"]) - cudf_tokenizer = SubwordTokenizer(vocab_hash) - - error_msg = ( - "Adding special tokens is not supported with truncation = False. " - "Custom Cupy kernel can potentially " - "be used to add it. 
For reference " - "see: _bert_add_special_tokens" - ) - - with pytest.raises(NotImplementedError, match=error_msg): - cudf_tokenizer( - str_series, - max_length=64, - max_num_rows=len(str_series), - truncation=False, - add_special_tokens=True, - ) - - -def test_text_subword_tokenize(tmpdir): - sr = cudf.Series( - [ - "This is a test", - "A test this is", - "Is test a this", - "Test test", - "this This", - ] - ) - hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt") - content = "1\n0\n23\n" - coefficients = [65559] * 23 - for c in coefficients: - content = content + str(c) + " 0\n" - # based on values from the bert_hash_table.txt file for the - # test words used here: 'this' 'is' 'a' test' - table = [0] * 23 - table[0] = 3015668 - table[1] = 6205475701751155871 - table[5] = 6358029 - table[16] = 451412625363 - table[20] = 6206321707968235495 - content = content + "23\n" - for v in table: - content = content + str(v) + "\n" - content = content + "100\n101\n102\n\n" - hash_file.write(content) - - cudf_tokenizer = SubwordTokenizer(hash_file) - - token_d = cudf_tokenizer( - sr, 8, 8, add_special_tokens=False, truncation=True - ) - tokens, masks, metadata = ( - token_d["input_ids"], - token_d["attention_mask"], - token_d["metadata"], - ) - expected_tokens = cupy.asarray( - [ - 2023, - 2003, - 1037, - 3231, - 0, - 0, - 0, - 0, - 1037, - 3231, - 2023, - 2003, - 0, - 0, - 0, - 0, - 2003, - 3231, - 1037, - 2023, - 0, - 0, - 0, - 0, - 3231, - 3231, - 0, - 0, - 0, - 0, - 0, - 0, - 2023, - 2023, - 0, - 0, - 0, - 0, - 0, - 0, - ], - dtype=np.uint32, - ) - expected_tokens = expected_tokens.reshape(-1, 8) - assert_eq(expected_tokens, tokens) - - expected_masks = cupy.asarray( - [ - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 0, - ], - dtype=np.uint32, - ) - expected_masks = expected_masks.reshape(-1, 8) - assert_eq(expected_masks, masks) - - expected_metadata = cupy.asarray( - [0, 0, 3, 1, 0, 3, 2, 0, 3, 3, 0, 1, 4, 0, 1], dtype=np.uint32 - ) - expected_metadata = expected_metadata.reshape(-1, 3) - assert_eq(expected_metadata, metadata) diff --git a/python/cudf/cudf/tests/text/test_subword_tokenizer.py b/python/cudf/cudf/tests/text/test_subword_tokenizer.py index 06777c8e6af..ac17daa8601 100644 --- a/python/cudf/cudf/tests/text/test_subword_tokenizer.py +++ b/python/cudf/cudf/tests/text/test_subword_tokenizer.py @@ -1 +1,238 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
+import os + +import cupy +import numpy as np +import pytest + +import cudf +from cudf.core.subword_tokenizer import SubwordTokenizer +from cudf.testing._utils import assert_eq + + +@pytest.fixture(scope="module") +def datadir(datadir): + return os.path.join(datadir, "subword_tokenizer_data") + + +def assert_equal_tokenization_outputs(hf_output, cudf_output): + assert ( + np.sum(hf_output["input_ids"] != cudf_output["input_ids"].get()) == 0 + ) + assert ( + np.sum( + hf_output["attention_mask"] != cudf_output["attention_mask"].get() + ) + == 0 + ) + + +@pytest.mark.parametrize("seq_len", [32, 64]) +@pytest.mark.parametrize("stride", [0, 15, 30]) +@pytest.mark.parametrize("add_special_tokens", [True, False]) +@pytest.mark.parametrize("do_lower_case", [True, False]) +def test_subword_tokenize( + seq_len, stride, add_special_tokens, do_lower_case, datadir +): + with open( + os.path.join(datadir, "test_sentences.txt"), encoding="utf-8" + ) as file: + input_sentence_ls = [line.strip() for line in file] + + vocab_dir = os.path.join(datadir, "bert_base_cased_sampled") + + transformers = pytest.importorskip("transformers") + + hf_tokenizer = transformers.BertTokenizer.from_pretrained( + vocab_dir, do_lower_case=do_lower_case + ) + + hf_output = hf_tokenizer( + input_sentence_ls, + max_length=seq_len, + stride=stride, + padding="max_length", + return_tensors="np", + truncation=True, + add_special_tokens=add_special_tokens, + ) + + vocab_hash = os.path.join(vocab_dir, "vocab-hash.txt") + str_series = cudf.Series(input_sentence_ls) + cudf_tokenizer = SubwordTokenizer(vocab_hash, do_lower_case=do_lower_case) + cudf_output = cudf_tokenizer( + str_series, + max_length=seq_len, + max_num_rows=len(str_series), + stride=stride, + padding="max_length", + return_tensors="cp", + truncation=True, + add_special_tokens=add_special_tokens, + ) + assert_equal_tokenization_outputs(hf_output, cudf_output) + + +def test_subword_tokenize_with_truncation(datadir): + vocab_dir = os.path.join(datadir, "bert_base_cased_sampled") + vocab_hash = os.path.join(vocab_dir, "vocab-hash.txt") + str_series = cudf.Series(["Test error"]) + cudf_tokenizer = SubwordTokenizer(vocab_hash) + + error_msg = ( + "Adding special tokens is not supported with truncation = False. " + "Custom Cupy kernel can potentially " + "be used to add it. 
For reference " + "see: _bert_add_special_tokens" + ) + + with pytest.raises(NotImplementedError, match=error_msg): + cudf_tokenizer( + str_series, + max_length=64, + max_num_rows=len(str_series), + truncation=False, + add_special_tokens=True, + ) + + +def test_text_subword_tokenize(tmpdir): + sr = cudf.Series( + [ + "This is a test", + "A test this is", + "Is test a this", + "Test test", + "this This", + ] + ) + hash_file = tmpdir.mkdir("nvtext").join("tmp_hashed_vocab.txt") + content = "1\n0\n23\n" + coefficients = [65559] * 23 + for c in coefficients: + content = content + str(c) + " 0\n" + # based on values from the bert_hash_table.txt file for the + # test words used here: 'this' 'is' 'a' test' + table = [0] * 23 + table[0] = 3015668 + table[1] = 6205475701751155871 + table[5] = 6358029 + table[16] = 451412625363 + table[20] = 6206321707968235495 + content = content + "23\n" + for v in table: + content = content + str(v) + "\n" + content = content + "100\n101\n102\n\n" + hash_file.write(content) + + cudf_tokenizer = SubwordTokenizer(hash_file) + + token_d = cudf_tokenizer( + sr, 8, 8, add_special_tokens=False, truncation=True + ) + tokens, masks, metadata = ( + token_d["input_ids"], + token_d["attention_mask"], + token_d["metadata"], + ) + expected_tokens = cupy.asarray( + [ + 2023, + 2003, + 1037, + 3231, + 0, + 0, + 0, + 0, + 1037, + 3231, + 2023, + 2003, + 0, + 0, + 0, + 0, + 2003, + 3231, + 1037, + 2023, + 0, + 0, + 0, + 0, + 3231, + 3231, + 0, + 0, + 0, + 0, + 0, + 0, + 2023, + 2023, + 0, + 0, + 0, + 0, + 0, + 0, + ], + dtype=np.uint32, + ) + expected_tokens = expected_tokens.reshape(-1, 8) + assert_eq(expected_tokens, tokens) + + expected_masks = cupy.asarray( + [ + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 1, + 1, + 1, + 1, + 0, + 0, + 0, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + 1, + 1, + 0, + 0, + 0, + 0, + 0, + 0, + ], + dtype=np.uint32, + ) + expected_masks = expected_masks.reshape(-1, 8) + assert_eq(expected_masks, masks) + + expected_metadata = cupy.asarray( + [0, 0, 3, 1, 0, 3, 2, 0, 3, 3, 0, 1, 4, 0, 1], dtype=np.uint32 + ) + expected_metadata = expected_metadata.reshape(-1, 3) + assert_eq(expected_metadata, metadata) diff --git a/python/cudf/cudf/tests/test_text.py b/python/cudf/cudf/tests/text/test_text_methods.py similarity index 82% rename from python/cudf/cudf/tests/test_text.py rename to python/cudf/cudf/tests/text/test_text_methods.py index f0e0e52142f..ea789b99220 100644 --- a/python/cudf/cudf/tests/test_text.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -1,7 +1,5 @@ # Copyright (c) 2019-2023, NVIDIA CORPORATION. -from io import StringIO - import numpy as np import pytest @@ -9,11 +7,6 @@ from cudf.testing._utils import assert_eq -@pytest.fixture(scope="module") -def datadir(datadir): - return datadir / "text" - - def test_tokenize(): strings = cudf.Series( [ @@ -842,154 +835,3 @@ def test_minhash(): with pytest.raises(ValueError): seeds = cudf.Series([0, 1, 2], dtype=np.int32) strings.str.minhash(seeds=seeds) - - -def test_read_text(datadir): - chess_file = str(datadir) + "/chess.pgn" - delimiter = "1." 
- - with open(chess_file) as f: - content = f.read().split(delimiter) - - # Since Python split removes the delimiter and read_text does - # not we need to add it back to the 'content' - expected = cudf.Series( - [ - c + delimiter if i < (len(content) - 1) else c - for i, c in enumerate(content) - ] - ) - - actual = cudf.read_text(chess_file, delimiter=delimiter) - - assert_eq(expected, actual) - - -def test_read_text_byte_range(datadir): - chess_file = str(datadir) + "/chess.pgn" - delimiter = "1." - - with open(chess_file, "r") as f: - data = f.read() - content = data.split(delimiter) - - # Since Python split removes the delimiter and read_text does - # not we need to add it back to the 'content' - expected = cudf.Series( - [ - c + delimiter if i < (len(content) - 1) else c - for i, c in enumerate(content) - ] - ) - - byte_range_size = (len(data) // 3) + (len(data) % 3 != 0) - - actual_0 = cudf.read_text( - chess_file, - delimiter=delimiter, - byte_range=[byte_range_size * 0, byte_range_size], - ) - actual_1 = cudf.read_text( - chess_file, - delimiter=delimiter, - byte_range=[byte_range_size * 1, byte_range_size], - ) - actual_2 = cudf.read_text( - chess_file, - delimiter=delimiter, - byte_range=[byte_range_size * 2, byte_range_size], - ) - - actual = cudf.concat([actual_0, actual_1, actual_2], ignore_index=True) - - assert_eq(expected, actual) - - -def test_read_text_byte_range_large(tmpdir): - content = "".join(("\n" if x % 5 == 4 else "x") for x in range(0, 3000)) - delimiter = "\n" - temp_file = str(tmpdir) + "/temp.txt" - - with open(temp_file, "w") as f: - f.write(content) - - expected = cudf.Series(["xxxx\n" for i in range(0, 200)]) - - actual = cudf.read_text( - temp_file, delimiter=delimiter, byte_range=[1000, 1000] - ) - - assert_eq(expected, actual) - - -def test_read_text_in_memory(datadir): - # Since Python split removes the delimiter and read_text does - # not we need to add it back to the 'content' - expected = cudf.Series(["x::", "y::", "z"]) - - actual = cudf.read_text(StringIO("x::y::z"), delimiter="::") - - assert_eq(expected, actual) - - -def test_read_text_in_memory_strip_delimiter(datadir): - # Since Python split removes the delimiter and read_text does - # not we need to add it back to the 'content' - expected = cudf.Series(["x", "y", "z"]) - - actual = cudf.read_text( - StringIO("x::y::z"), delimiter="::", strip_delimiters=True - ) - - assert_eq(expected, actual) - - -def test_read_text_bgzip(datadir): - chess_file_compressed = str(datadir) + "/chess.pgn.gz" - chess_file = str(datadir) + "/chess.pgn" - delimiter = "1." - - with open(chess_file) as f: - content = f.read().split(delimiter) - - # Since Python split removes the delimiter and read_text does - # not we need to add it back to the 'content' - expected = cudf.Series( - [ - c + delimiter if i < (len(content) - 1) else c - for i, c in enumerate(content) - ] - ) - - actual = cudf.read_text( - chess_file_compressed, compression="bgzip", delimiter=delimiter - ) - - assert_eq(expected, actual) - - -def test_read_text_bgzip_offsets(datadir): - chess_file_compressed = str(datadir) + "/chess.pgn.gz" - chess_file = str(datadir) + "/chess.pgn" - delimiter = "1." 
- - with open(chess_file) as f: - content = f.read()[29:695].split(delimiter) - - # Since Python split removes the delimiter and read_text does - # not we need to add it back to the 'content' - expected = cudf.Series( - [ - c + delimiter if i < (len(content) - 1) else c - for i, c in enumerate(content) - ] - ) - - actual = cudf.read_text( - chess_file_compressed, - compression="bgzip", - compression_offsets=[58 * 2**16 + 2, 781 * 2**16 + 7], - delimiter=delimiter, - ) - - assert_eq(expected, actual)
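
For reference, a minimal sketch (not part of the patch) of the `cudf.read_text` behavior that the relocated io-text tests exercise, mirroring the in-memory tests above: unlike Python's `str.split`, `read_text` keeps the delimiter on each row unless `strip_delimiters=True` is passed.

```python
from io import StringIO

import cudf

# By default, read_text keeps the trailing delimiter on each row.
kept = cudf.read_text(StringIO("x::y::z"), delimiter="::")
# -> cudf.Series(["x::", "y::", "z"])

# With strip_delimiters=True the delimiter is dropped, matching str.split.
stripped = cudf.read_text(StringIO("x::y::z"), delimiter="::", strip_delimiters=True)
# -> cudf.Series(["x", "y", "z"])
```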