Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Separate io-text and nvtext pytests into different files #13435

Merged
merged 9 commits into from
May 26, 2023
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ repos:
(?x)^(
^cpp/include/cudf_test/cxxopts.hpp|
^python/cudf/cudf/tests/data/subword_tokenizer_data/.*|
^python/cudf/cudf/tests/test_text.py
^python/cudf/cudf/tests/test_nvtext.py
)
- repo: local
hooks:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,19 +1,12 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION.

from io import StringIO

import numpy as np
import pytest

import cudf
from cudf.testing._utils import assert_eq


@pytest.fixture(scope="module")
def datadir(datadir):
    """Return the ``text`` subdirectory of the outer ``datadir`` fixture."""
    text_dir = datadir / "text"
    return text_dir


def test_tokenize():
strings = cudf.Series(
[
Expand Down Expand Up @@ -842,154 +835,3 @@ def test_minhash():
with pytest.raises(ValueError):
seeds = cudf.Series([0, 1, 2], dtype=np.int32)
strings.str.minhash(seeds=seeds)


davidwendt marked this conversation as resolved.
Show resolved Hide resolved
def test_read_text(datadir):
    """Check that ``cudf.read_text`` splits a file on a delimiter.

    The expected rows come from splitting the file in Python and then
    re-appending the delimiter to every piece but the last, because
    ``str.split`` drops the delimiter while ``read_text`` keeps it.
    """
    delimiter = "1."
    chess_file = str(datadir) + "/chess.pgn"

    with open(chess_file) as handle:
        pieces = handle.read().split(delimiter)

    # ``str.split`` always yields at least one piece, so the unpacking
    # below cannot fail; only the final piece stays without a delimiter.
    *leading, tail = pieces
    expected = cudf.Series([piece + delimiter for piece in leading] + [tail])

    actual = cudf.read_text(chess_file, delimiter=delimiter)

    assert_eq(expected, actual)


def test_read_text_byte_range(datadir):
    """Check that three consecutive byte-range reads rebuild the file.

    The file is read through three equally sized ``byte_range`` windows
    and the concatenated rows are compared with the rows produced by a
    plain Python split of the whole file.
    """
    chess_file = str(datadir) + "/chess.pgn"
    delimiter = "1."

    # ``byte_range`` is passed as [offset, size]; cudf documents both as
    # byte counts, so the window size is derived from the raw byte
    # length. ``len()`` of the decoded text counts characters and would
    # leave the tail of the file unread if it held multi-byte characters.
    with open(chess_file, "rb") as f:
        raw = f.read()
    content = raw.decode("utf-8").split(delimiter)

    # Since Python split removes the delimiter and read_text does
    # not we need to add it back to the 'content'
    expected = cudf.Series(
        [
            c + delimiter if i < (len(content) - 1) else c
            for i, c in enumerate(content)
        ]
    )

    # Ceiling division: three windows of this size always cover the file.
    byte_range_size = -(-len(raw) // 3)

    windows = [
        cudf.read_text(
            chess_file,
            delimiter=delimiter,
            byte_range=[byte_range_size * i, byte_range_size],
        )
        for i in range(3)
    ]
    actual = cudf.concat(windows, ignore_index=True)

    assert_eq(expected, actual)


def test_read_text_byte_range_large(tmpdir):
    """Check a byte-range read from the middle of a generated file.

    The file holds 600 identical five-character rows (3000 characters).
    The window ``[1000, 1000]`` starts on a row boundary and spans
    exactly 200 rows.
    """
    delimiter = "\n"
    row = "xxxx\n"
    temp_file = str(tmpdir) + "/temp.txt"

    with open(temp_file, "w") as out:
        out.write(row * 600)

    actual = cudf.read_text(
        temp_file, delimiter=delimiter, byte_range=[1000, 1000]
    )
    expected = cudf.Series([row] * 200)

    assert_eq(expected, actual)


def test_read_text_in_memory(datadir):
    """Check ``cudf.read_text`` on an in-memory ``StringIO`` source.

    Without ``strip_delimiters`` every expected row except the last keeps
    its trailing ``"::"`` delimiter.
    """
    source = StringIO("x::y::z")

    actual = cudf.read_text(source, delimiter="::")
    expected = cudf.Series(["x::", "y::", "z"])

    assert_eq(expected, actual)


def test_read_text_in_memory_strip_delimiter(datadir):
    """Check ``strip_delimiters=True`` on an in-memory ``StringIO`` source.

    With stripping enabled the expected rows carry no ``"::"`` delimiter
    at all.
    """
    source = StringIO("x::y::z")

    actual = cudf.read_text(source, delimiter="::", strip_delimiters=True)
    expected = cudf.Series(["x", "y", "z"])

    assert_eq(expected, actual)


def test_read_text_bgzip(datadir):
    """Check that a bgzip-compressed file reads like its plain twin.

    The expected rows are built from the uncompressed ``chess.pgn``; the
    actual rows come from ``chess.pgn.gz`` read with
    ``compression="bgzip"``.
    """
    delimiter = "1."
    chess_file = str(datadir) + "/chess.pgn"
    chess_file_compressed = str(datadir) + "/chess.pgn.gz"

    with open(chess_file) as handle:
        pieces = handle.read().split(delimiter)

    # ``str.split`` drops the delimiter while ``read_text`` keeps it, so
    # it is re-appended to every piece except the final one.
    *leading, tail = pieces
    expected = cudf.Series([piece + delimiter for piece in leading] + [tail])

    actual = cudf.read_text(
        chess_file_compressed, compression="bgzip", delimiter=delimiter
    )

    assert_eq(expected, actual)


def test_read_text_bgzip_offsets(datadir):
    """Check a bgzip read restricted by ``compression_offsets``.

    The expected rows are built from characters ``[29:695]`` of the
    uncompressed file; the actual rows come from the compressed file
    read between the two offsets below.
    """
    delimiter = "1."
    chess_file = str(datadir) + "/chess.pgn"
    chess_file_compressed = str(datadir) + "/chess.pgn.gz"

    with open(chess_file) as handle:
        pieces = handle.read()[29:695].split(delimiter)

    # ``str.split`` drops the delimiter while ``read_text`` keeps it, so
    # it is re-appended to every piece except the final one.
    *leading, tail = pieces
    expected = cudf.Series([piece + delimiter for piece in leading] + [tail])

    # NOTE(review): these look like BGZF virtual offsets (compressed
    # block offset in the high bits, offset inside the block in the low
    # 16 bits) matching the [29:695] slice above - confirm against the
    # cudf.read_text documentation.
    begin_offset = 58 * 2**16 + 2
    end_offset = 781 * 2**16 + 7

    actual = cudf.read_text(
        chess_file_compressed,
        compression="bgzip",
        compression_offsets=[begin_offset, end_offset],
        delimiter=delimiter,
    )

    assert_eq(expected, actual)
164 changes: 164 additions & 0 deletions python/cudf/cudf/tests/test_text_io.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# Copyright (c) 2019-2023, NVIDIA CORPORATION.

from io import StringIO

import pytest

import cudf
from cudf.testing._utils import assert_eq


@pytest.fixture(scope="module")
def datadir(datadir):
    """Return the ``text`` subdirectory of the outer ``datadir`` fixture."""
    text_dir = datadir / "text"
    return text_dir


def test_read_text(datadir):
    """Check that ``cudf.read_text`` splits a file on a delimiter.

    The expected rows come from splitting the file in Python and then
    re-appending the delimiter to every piece but the last, because
    ``str.split`` drops the delimiter while ``read_text`` keeps it.
    """
    delimiter = "1."
    chess_file = str(datadir) + "/chess.pgn"

    with open(chess_file) as handle:
        pieces = handle.read().split(delimiter)

    # ``str.split`` always yields at least one piece, so the unpacking
    # below cannot fail; only the final piece stays without a delimiter.
    *leading, tail = pieces
    expected = cudf.Series([piece + delimiter for piece in leading] + [tail])

    actual = cudf.read_text(chess_file, delimiter=delimiter)

    assert_eq(expected, actual)


def test_read_text_byte_range(datadir):
    """Check that three consecutive byte-range reads rebuild the file.

    The file is read through three equally sized ``byte_range`` windows
    and the concatenated rows are compared with the rows produced by a
    plain Python split of the whole file.
    """
    chess_file = str(datadir) + "/chess.pgn"
    delimiter = "1."

    # ``byte_range`` is passed as [offset, size]; cudf documents both as
    # byte counts, so the window size is derived from the raw byte
    # length. ``len()`` of the decoded text counts characters and would
    # leave the tail of the file unread if it held multi-byte characters.
    with open(chess_file, "rb") as f:
        raw = f.read()
    content = raw.decode("utf-8").split(delimiter)

    # Since Python split removes the delimiter and read_text does
    # not we need to add it back to the 'content'
    expected = cudf.Series(
        [
            c + delimiter if i < (len(content) - 1) else c
            for i, c in enumerate(content)
        ]
    )

    # Ceiling division: three windows of this size always cover the file.
    byte_range_size = -(-len(raw) // 3)

    windows = [
        cudf.read_text(
            chess_file,
            delimiter=delimiter,
            byte_range=[byte_range_size * i, byte_range_size],
        )
        for i in range(3)
    ]
    actual = cudf.concat(windows, ignore_index=True)

    assert_eq(expected, actual)


def test_read_text_byte_range_large(tmpdir):
    """Check a byte-range read from the middle of a generated file.

    The file holds 600 identical five-character rows (3000 characters).
    The window ``[1000, 1000]`` starts on a row boundary and spans
    exactly 200 rows.
    """
    delimiter = "\n"
    row = "xxxx\n"
    temp_file = str(tmpdir) + "/temp.txt"

    with open(temp_file, "w") as out:
        out.write(row * 600)

    actual = cudf.read_text(
        temp_file, delimiter=delimiter, byte_range=[1000, 1000]
    )
    expected = cudf.Series([row] * 200)

    assert_eq(expected, actual)


def test_read_text_in_memory(datadir):
    """Check ``cudf.read_text`` on an in-memory ``StringIO`` source.

    Without ``strip_delimiters`` every expected row except the last keeps
    its trailing ``"::"`` delimiter.
    """
    source = StringIO("x::y::z")

    actual = cudf.read_text(source, delimiter="::")
    expected = cudf.Series(["x::", "y::", "z"])

    assert_eq(expected, actual)


def test_read_text_in_memory_strip_delimiter(datadir):
    """Check ``strip_delimiters=True`` on an in-memory ``StringIO`` source.

    With stripping enabled the expected rows carry no ``"::"`` delimiter
    at all.
    """
    source = StringIO("x::y::z")

    actual = cudf.read_text(source, delimiter="::", strip_delimiters=True)
    expected = cudf.Series(["x", "y", "z"])

    assert_eq(expected, actual)


def test_read_text_bgzip(datadir):
    """Check that a bgzip-compressed file reads like its plain twin.

    The expected rows are built from the uncompressed ``chess.pgn``; the
    actual rows come from ``chess.pgn.gz`` read with
    ``compression="bgzip"``.
    """
    delimiter = "1."
    chess_file = str(datadir) + "/chess.pgn"
    chess_file_compressed = str(datadir) + "/chess.pgn.gz"

    with open(chess_file) as handle:
        pieces = handle.read().split(delimiter)

    # ``str.split`` drops the delimiter while ``read_text`` keeps it, so
    # it is re-appended to every piece except the final one.
    *leading, tail = pieces
    expected = cudf.Series([piece + delimiter for piece in leading] + [tail])

    actual = cudf.read_text(
        chess_file_compressed, compression="bgzip", delimiter=delimiter
    )

    assert_eq(expected, actual)


def test_read_text_bgzip_offsets(datadir):
    """Check a bgzip read restricted by ``compression_offsets``.

    The expected rows are built from characters ``[29:695]`` of the
    uncompressed file; the actual rows come from the compressed file
    read between the two offsets below.
    """
    delimiter = "1."
    chess_file = str(datadir) + "/chess.pgn"
    chess_file_compressed = str(datadir) + "/chess.pgn.gz"

    with open(chess_file) as handle:
        pieces = handle.read()[29:695].split(delimiter)

    # ``str.split`` drops the delimiter while ``read_text`` keeps it, so
    # it is re-appended to every piece except the final one.
    *leading, tail = pieces
    expected = cudf.Series([piece + delimiter for piece in leading] + [tail])

    # NOTE(review): these look like BGZF virtual offsets (compressed
    # block offset in the high bits, offset inside the block in the low
    # 16 bits) matching the [29:695] slice above - confirm against the
    # cudf.read_text documentation.
    begin_offset = 58 * 2**16 + 2
    end_offset = 781 * 2**16 + 7

    actual = cudf.read_text(
        chess_file_compressed,
        compression="bgzip",
        compression_offsets=[begin_offset, end_offset],
        delimiter=delimiter,
    )

    assert_eq(expected, actual)
Empty file.
1 change: 0 additions & 1 deletion python/cudf/cudf/tests/text/test_subword_tokenizer.py

This file was deleted.