rapidsai · rapids-bot · May 26, 2023 · May 24, 2023 · May 24, 2023 · May 24, 2023
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -85,7 +85,7 @@ repos:
                   (?x)^(
                     ^cpp/include/cudf_test/cxxopts.hpp|
                     ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*|
-                    ^python/cudf/cudf/tests/test_text.py
+                    ^python/cudf/cudf/tests/test_nvtext.py
                   )
       - repo: local
         hooks:

diff --git a/python/cudf/cudf/tests/test_text.py → python/cudf/cudf/tests/test_nvtext.py b/python/cudf/cudf/tests/test_text.py → python/cudf/cudf/tests/test_nvtext.py
@@ -1,19 +1,12 @@
 # Copyright (c) 2019-2023, NVIDIA CORPORATION.
 
-from io import StringIO
-
 import numpy as np
 import pytest
 
 import cudf
 from cudf.testing._utils import assert_eq
 
 
-@pytest.fixture(scope="module")
-def datadir(datadir):
-    return datadir / "text"
-
-
 def test_tokenize():
     strings = cudf.Series(
         [
@@ -842,154 +835,3 @@ def test_minhash():
     with pytest.raises(ValueError):
         seeds = cudf.Series([0, 1, 2], dtype=np.int32)
         strings.str.minhash(seeds=seeds)
-
-
-def test_read_text(datadir):
-    chess_file = str(datadir) + "/chess.pgn"
-    delimiter = "1."
-
-    with open(chess_file) as f:
-        content = f.read().split(delimiter)
-
-    # Since Python split removes the delimiter and read_text does
-    # not we need to add it back to the 'content'
-    expected = cudf.Series(
-        [
-            c + delimiter if i < (len(content) - 1) else c
-            for i, c in enumerate(content)
-        ]
-    )
-
-    actual = cudf.read_text(chess_file, delimiter=delimiter)
-
-    assert_eq(expected, actual)
-
-
-def test_read_text_byte_range(datadir):
-    chess_file = str(datadir) + "/chess.pgn"
-    delimiter = "1."
-
-    with open(chess_file, "r") as f:
-        data = f.read()
-        content = data.split(delimiter)
-
-    # Since Python split removes the delimiter and read_text does
-    # not we need to add it back to the 'content'
-    expected = cudf.Series(
-        [
-            c + delimiter if i < (len(content) - 1) else c
-            for i, c in enumerate(content)
-        ]
-    )
-
-    byte_range_size = (len(data) // 3) + (len(data) % 3 != 0)
-
-    actual_0 = cudf.read_text(
-        chess_file,
-        delimiter=delimiter,
-        byte_range=[byte_range_size * 0, byte_range_size],
-    )
-    actual_1 = cudf.read_text(
-        chess_file,
-        delimiter=delimiter,
-        byte_range=[byte_range_size * 1, byte_range_size],
-    )
-    actual_2 = cudf.read_text(
-        chess_file,
-        delimiter=delimiter,
-        byte_range=[byte_range_size * 2, byte_range_size],
-    )
-
-    actual = cudf.concat([actual_0, actual_1, actual_2], ignore_index=True)
-
-    assert_eq(expected, actual)
-
-
-def test_read_text_byte_range_large(tmpdir):
-    content = "".join(("\n" if x % 5 == 4 else "x") for x in range(0, 3000))
-    delimiter = "\n"
-    temp_file = str(tmpdir) + "/temp.txt"
-
-    with open(temp_file, "w") as f:
-        f.write(content)
-
-    expected = cudf.Series(["xxxx\n" for i in range(0, 200)])
-
-    actual = cudf.read_text(
-        temp_file, delimiter=delimiter, byte_range=[1000, 1000]
-    )
-
-    assert_eq(expected, actual)
-
-
-def test_read_text_in_memory(datadir):
-    # Since Python split removes the delimiter and read_text does
-    # not we need to add it back to the 'content'
-    expected = cudf.Series(["x::", "y::", "z"])
-
-    actual = cudf.read_text(StringIO("x::y::z"), delimiter="::")
-
-    assert_eq(expected, actual)
-
-
-def test_read_text_in_memory_strip_delimiter(datadir):
-    # Since Python split removes the delimiter and read_text does
-    # not we need to add it back to the 'content'
-    expected = cudf.Series(["x", "y", "z"])
-
-    actual = cudf.read_text(
-        StringIO("x::y::z"), delimiter="::", strip_delimiters=True
-    )
-
-    assert_eq(expected, actual)
-
-
-def test_read_text_bgzip(datadir):
-    chess_file_compressed = str(datadir) + "/chess.pgn.gz"
-    chess_file = str(datadir) + "/chess.pgn"
-    delimiter = "1."
-
-    with open(chess_file) as f:
-        content = f.read().split(delimiter)
-
-    # Since Python split removes the delimiter and read_text does
-    # not we need to add it back to the 'content'
-    expected = cudf.Series(
-        [
-            c + delimiter if i < (len(content) - 1) else c
-            for i, c in enumerate(content)
-        ]
-    )
-
-    actual = cudf.read_text(
-        chess_file_compressed, compression="bgzip", delimiter=delimiter
-    )
-
-    assert_eq(expected, actual)
-
-
-def test_read_text_bgzip_offsets(datadir):
-    chess_file_compressed = str(datadir) + "/chess.pgn.gz"
-    chess_file = str(datadir) + "/chess.pgn"
-    delimiter = "1."
-
-    with open(chess_file) as f:
-        content = f.read()[29:695].split(delimiter)
-
-    # Since Python split removes the delimiter and read_text does
-    # not we need to add it back to the 'content'
-    expected = cudf.Series(
-        [
-            c + delimiter if i < (len(content) - 1) else c
-            for i, c in enumerate(content)
-        ]
-    )
-
-    actual = cudf.read_text(
-        chess_file_compressed,
-        compression="bgzip",
-        compression_offsets=[58 * 2**16 + 2, 781 * 2**16 + 7],
-        delimiter=delimiter,
-    )
-
-    assert_eq(expected, actual)
diff --git a/python/cudf/cudf/tests/test_text_io.py b/python/cudf/cudf/tests/test_text_io.py
@@ -0,0 +1,164 @@
+# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+
+from io import StringIO
+
+import pytest
+
+import cudf
+from cudf.testing._utils import assert_eq
+
+
+@pytest.fixture(scope="module")
+def datadir(datadir):
+    return datadir / "text"
+
+
+def test_read_text(datadir):
+    chess_file = str(datadir) + "/chess.pgn"
+    delimiter = "1."
+
+    with open(chess_file) as f:
+        content = f.read().split(delimiter)
+
+    # Since Python split removes the delimiter and read_text does
+    # not we need to add it back to the 'content'
+    expected = cudf.Series(
+        [
+            c + delimiter if i < (len(content) - 1) else c
+            for i, c in enumerate(content)
+        ]
+    )
+
+    actual = cudf.read_text(chess_file, delimiter=delimiter)
+
+    assert_eq(expected, actual)
+
+
+def test_read_text_byte_range(datadir):
+    chess_file = str(datadir) + "/chess.pgn"
+    delimiter = "1."
+
+    with open(chess_file, "r") as f:
+        data = f.read()
+        content = data.split(delimiter)
+
+    # Since Python split removes the delimiter and read_text does
+    # not we need to add it back to the 'content'
+    expected = cudf.Series(
+        [
+            c + delimiter if i < (len(content) - 1) else c
+            for i, c in enumerate(content)
+        ]
+    )
+
+    byte_range_size = (len(data) // 3) + (len(data) % 3 != 0)
+
+    actual_0 = cudf.read_text(
+        chess_file,
+        delimiter=delimiter,
+        byte_range=[byte_range_size * 0, byte_range_size],
+    )
+    actual_1 = cudf.read_text(
+        chess_file,
+        delimiter=delimiter,
+        byte_range=[byte_range_size * 1, byte_range_size],
+    )
+    actual_2 = cudf.read_text(
+        chess_file,
+        delimiter=delimiter,
+        byte_range=[byte_range_size * 2, byte_range_size],
+    )
+
+    actual = cudf.concat([actual_0, actual_1, actual_2], ignore_index=True)
+
+    assert_eq(expected, actual)
+
+
+def test_read_text_byte_range_large(tmpdir):
+    content = "".join(("\n" if x % 5 == 4 else "x") for x in range(0, 3000))
+    delimiter = "\n"
+    temp_file = str(tmpdir) + "/temp.txt"
+
+    with open(temp_file, "w") as f:
+        f.write(content)
+
+    expected = cudf.Series(["xxxx\n" for i in range(0, 200)])
+
+    actual = cudf.read_text(
+        temp_file, delimiter=delimiter, byte_range=[1000, 1000]
+    )
+
+    assert_eq(expected, actual)
+
+
+def test_read_text_in_memory(datadir):
+    # Since Python split removes the delimiter and read_text does
+    # not we need to add it back to the 'content'
+    expected = cudf.Series(["x::", "y::", "z"])
+
+    actual = cudf.read_text(StringIO("x::y::z"), delimiter="::")
+
+    assert_eq(expected, actual)
+
+
+def test_read_text_in_memory_strip_delimiter(datadir):
+    # Since Python split removes the delimiter and read_text does
+    # not we need to add it back to the 'content'
+    expected = cudf.Series(["x", "y", "z"])
+
+    actual = cudf.read_text(
+        StringIO("x::y::z"), delimiter="::", strip_delimiters=True
+    )
+
+    assert_eq(expected, actual)
+
+
+def test_read_text_bgzip(datadir):
+    chess_file_compressed = str(datadir) + "/chess.pgn.gz"
+    chess_file = str(datadir) + "/chess.pgn"
+    delimiter = "1."
+
+    with open(chess_file) as f:
+        content = f.read().split(delimiter)
+
+    # Since Python split removes the delimiter and read_text does
+    # not we need to add it back to the 'content'
+    expected = cudf.Series(
+        [
+            c + delimiter if i < (len(content) - 1) else c
+            for i, c in enumerate(content)
+        ]
+    )
+
+    actual = cudf.read_text(
+        chess_file_compressed, compression="bgzip", delimiter=delimiter
+    )
+
+    assert_eq(expected, actual)
+
+
+def test_read_text_bgzip_offsets(datadir):
+    chess_file_compressed = str(datadir) + "/chess.pgn.gz"
+    chess_file = str(datadir) + "/chess.pgn"
+    delimiter = "1."
+
+    with open(chess_file) as f:
+        content = f.read()[29:695].split(delimiter)
+
+    # Since Python split removes the delimiter and read_text does
+    # not we need to add it back to the 'content'
+    expected = cudf.Series(
+        [
+            c + delimiter if i < (len(content) - 1) else c
+            for i, c in enumerate(content)
+        ]
+    )
+
+    actual = cudf.read_text(
+        chess_file_compressed,
+        compression="bgzip",
+        compression_offsets=[58 * 2**16 + 2, 781 * 2**16 + 7],
+        delimiter=delimiter,
+    )
+
+    assert_eq(expected, actual)
diff --git a/python/cudf/cudf/tests/text/__init__.py b/python/cudf/cudf/tests/text/__init__.py