Skip to content

Commit

Permalink
Separate io-text and nvtext pytests into different files (#13435)
Browse files Browse the repository at this point in the history
Cleans up source files for nvtext and io-text pytests. The pytests are placed into separate files: `test_io_text.py` for the io-text pytests and `test_nvtext.py` for the nvtext pytests. Also removed the `python/cudf/cudf/tests/text` folder which contained 2 empty `.py` files.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #13435
  • Loading branch information
davidwendt authored May 26, 2023
1 parent 5b3e3ab commit cc317ed
Show file tree
Hide file tree
Showing 5 changed files with 403 additions and 399 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ repos:
(?x)^(
^cpp/include/cudf_test/cxxopts.hpp|
^python/cudf/cudf/tests/data/subword_tokenizer_data/.*|
^python/cudf/cudf/tests/test_text.py
^python/cudf/cudf/tests/text/test_text_methods.py
)
- repo: local
hooks:
Expand Down
165 changes: 164 additions & 1 deletion python/cudf/cudf/tests/input_output/test_text.py
Original file line number Diff line number Diff line change
@@ -1 +1,164 @@
# Copyright (c) 2023, NVIDIA CORPORATION.
# Copyright (c) 2019-2023, NVIDIA CORPORATION.

from io import StringIO

import pytest

import cudf
from cudf.testing._utils import assert_eq


@pytest.fixture(scope="module")
def datadir(datadir):
    """Narrow the inherited ``datadir`` fixture to its ``text`` subfolder."""
    text_dir = datadir / "text"
    return text_dir


def test_read_text(datadir):
    """Compare ``cudf.read_text`` on a whole file against ``str.split``."""
    delimiter = "1."
    chess_file = f"{datadir}/chess.pgn"

    with open(chess_file) as pgn:
        pieces = pgn.read().split(delimiter)

    # str.split discards the delimiter whereas read_text keeps it, so it
    # is re-attached to every piece except the final one.
    expected = cudf.Series(
        [piece + delimiter for piece in pieces[:-1]] + pieces[-1:]
    )

    actual = cudf.read_text(chess_file, delimiter=delimiter)

    assert_eq(expected, actual)


def test_read_text_byte_range(datadir):
    """Read a file as three consecutive byte ranges and stitch them up.

    The concatenation of the three partial reads must equal the rows
    obtained by splitting the whole file on the delimiter.
    """
    delimiter = "1."
    chess_file = f"{datadir}/chess.pgn"

    with open(chess_file) as pgn:
        data = pgn.read()
    pieces = data.split(delimiter)

    # str.split discards the delimiter whereas read_text keeps it, so it
    # is re-attached to every piece except the final one.
    expected = cudf.Series(
        [piece + delimiter for piece in pieces[:-1]] + pieces[-1:]
    )

    # A third of the data length, rounded up, so that three ranges of
    # this size span all of it.
    quotient, remainder = divmod(len(data), 3)
    byte_range_size = quotient + (1 if remainder else 0)

    # Each byte_range is [start, size]; the starts step by one full range.
    parts = [
        cudf.read_text(
            chess_file,
            delimiter=delimiter,
            byte_range=[byte_range_size * part, byte_range_size],
        )
        for part in range(3)
    ]

    actual = cudf.concat(parts, ignore_index=True)

    assert_eq(expected, actual)


def test_read_text_byte_range_large(tmpdir):
    """Read a byte range from the middle of a generated 3000-char file."""
    delimiter = "\n"
    row = "xxxx\n"
    temp_file = f"{tmpdir}/temp.txt"

    # 600 five-character rows: a newline at every index where
    # ``index % 5 == 4`` and "x" everywhere else.
    content = row * 600

    with open(temp_file, "w") as out:
        out.write(content)

    # The requested range [1000, 1000] lines up with row boundaries and
    # spans 1000 / 5 = 200 complete rows.
    expected = cudf.Series([row] * 200)

    actual = cudf.read_text(
        temp_file, delimiter=delimiter, byte_range=[1000, 1000]
    )

    assert_eq(expected, actual)


def test_read_text_in_memory(datadir):
    """Read from an in-memory ``StringIO`` buffer instead of a file path."""
    source = StringIO("x::y::z")

    # By default every row keeps its trailing delimiter; the last row has
    # none because the input does not end with one.
    expected = cudf.Series(["x::", "y::", "z"])

    actual = cudf.read_text(source, delimiter="::")

    assert_eq(expected, actual)


def test_read_text_in_memory_strip_delimiter(datadir):
    """Read from a ``StringIO`` buffer with ``strip_delimiters=True``."""
    source = StringIO("x::y::z")

    # With strip_delimiters=True the rows come back without the "::".
    expected = cudf.Series(["x", "y", "z"])

    actual = cudf.read_text(source, delimiter="::", strip_delimiters=True)

    assert_eq(expected, actual)


def test_read_text_bgzip(datadir):
    """Read the bgzip-compressed file and compare with the plain one."""
    delimiter = "1."
    chess_file = f"{datadir}/chess.pgn"
    chess_file_compressed = f"{datadir}/chess.pgn.gz"

    # The expected rows come from the uncompressed copy of the file.
    with open(chess_file) as pgn:
        pieces = pgn.read().split(delimiter)

    # str.split discards the delimiter whereas read_text keeps it, so it
    # is re-attached to every piece except the final one.
    expected = cudf.Series(
        [piece + delimiter for piece in pieces[:-1]] + pieces[-1:]
    )

    actual = cudf.read_text(
        chess_file_compressed, compression="bgzip", delimiter=delimiter
    )

    assert_eq(expected, actual)


def test_read_text_bgzip_offsets(datadir):
    """Read a sub-range of the bgzip file via ``compression_offsets``.

    The expected rows are built from characters 29 to 695 (end
    exclusive) of the uncompressed copy of the file.
    """
    delimiter = "1."
    chess_file = f"{datadir}/chess.pgn"
    chess_file_compressed = f"{datadir}/chess.pgn.gz"

    with open(chess_file) as pgn:
        window = pgn.read()[29:695]
    pieces = window.split(delimiter)

    # str.split discards the delimiter whereas read_text keeps it, so it
    # is re-attached to every piece except the final one.
    expected = cudf.Series(
        [piece + delimiter for piece in pieces[:-1]] + pieces[-1:]
    )

    # NOTE(review): these look like BGZF virtual offsets (compressed
    # block offset * 2**16 + offset inside the block) matching the
    # [29:695] window above -- confirm against the test data file.
    begin_offset = 58 * 2**16 + 2
    end_offset = 781 * 2**16 + 7

    actual = cudf.read_text(
        chess_file_compressed,
        compression="bgzip",
        compression_offsets=[begin_offset, end_offset],
        delimiter=delimiter,
    )

    assert_eq(expected, actual)
Loading

0 comments on commit cc317ed

Please sign in to comment.