Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add BGZIP reader to python read_text #11802

Merged
merged 5 commits into from
Oct 7, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions python/cudf/cudf/_lib/cpp/io/text.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.

from libc.stdint cimport uint64_t
from libcpp.memory cimport unique_ptr
from libcpp.string cimport string

Expand All @@ -25,6 +26,12 @@ cdef extern from "cudf/io/text/data_chunk_source_factories.hpp" \
unique_ptr[data_chunk_source] make_source(string data) except +
unique_ptr[data_chunk_source] \
make_source_from_file(string filename) except +
unique_ptr[data_chunk_source] \
make_source_from_bgzip_file(string filename) except +
unique_ptr[data_chunk_source] \
make_source_from_bgzip_file(string filename,
uint64_t virtual_begin,
uint64_t virtual_end) except +


cdef extern from "cudf/io/text/multibyte_split.hpp" \
Expand Down
27 changes: 27 additions & 0 deletions python/cudf/cudf/_lib/text.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ from io import TextIOBase
import cudf

from cython.operator cimport dereference
from libc.stdint cimport uint64_t
from libcpp.memory cimport make_unique, unique_ptr
from libcpp.string cimport string
from libcpp.utility cimport move
Expand All @@ -15,11 +16,24 @@ from cudf._lib.cpp.io.text cimport (
byte_range_info,
data_chunk_source,
make_source,
make_source_from_bgzip_file,
make_source_from_file,
multibyte_split,
)


class BGZIPFile:
    """Describe a BGZIP-compressed file that ``read_text`` should read.

    Parameters
    ----------
    filename : str
        Path of the BGZIP-compressed file.
    compression_offsets : sequence of two elements, or None
        The begin and end offset of the range to read, in that order.
        ``None`` means no range was requested.

    Attributes
    ----------
    filename
        The path that was passed in, stored unchanged.
    has_offsets : bool
        ``True`` when ``compression_offsets`` was not ``None``.
    begin_offset, end_offset
        The two elements of ``compression_offsets``; both are ``None``
        when no offsets were given.

    Raises
    ------
    ValueError
        If ``compression_offsets`` is given but does not contain exactly
        two elements.
    """

    def __init__(self, filename, compression_offsets):
        self.filename = filename
        self.has_offsets = compression_offsets is not None
        # Always define both attributes so that inspecting an instance
        # created without offsets never raises AttributeError.
        self.begin_offset = None
        self.end_offset = None
        if self.has_offsets:
            if len(compression_offsets) != 2:
                raise ValueError(
                    "compression offsets need to consist of two elements")
            self.begin_offset = compression_offsets[0]
            self.end_offset = compression_offsets[1]


def read_text(object filepaths_or_buffers,
object delimiter=None,
object byte_range=None):
Expand All @@ -38,9 +52,22 @@ def read_text(object filepaths_or_buffers,
cdef size_t c_byte_range_offset
cdef size_t c_byte_range_size
cdef byte_range_info c_byte_range
cdef uint64_t c_compression_begin_offset
cdef uint64_t c_compression_end_offset

if isinstance(filepaths_or_buffers, TextIOBase):
datasource = move(make_source(filepaths_or_buffers.read().encode()))
elif isinstance(filepaths_or_buffers, BGZIPFile):
if filepaths_or_buffers.has_offsets:
c_compression_begin_offset = filepaths_or_buffers.begin_offset
c_compression_end_offset = filepaths_or_buffers.end_offset
datasource = move(make_source_from_bgzip_file(
filepaths_or_buffers.filename.encode(),
c_compression_begin_offset,
c_compression_end_offset))
else:
datasource = move(make_source_from_bgzip_file(
filepaths_or_buffers.filename.encode()))
else:
datasource = move(make_source_from_file(filepaths_or_buffers.encode()))

Expand Down
20 changes: 18 additions & 2 deletions python/cudf/cudf/io/text.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Copyright (c) 2018-2022, NVIDIA CORPORATION.

from io import BytesIO, StringIO
from io import BytesIO, StringIO, TextIOBase

import cudf
from cudf._lib import text as libtext
Expand All @@ -14,17 +14,33 @@ def read_text(
filepath_or_buffer,
delimiter=None,
byte_range=None,
compression=None,
compression_offsets=None,
**kwargs,
):
"""{docstring}"""

filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer(
if delimiter is None:
raise ValueError("delimiter needs to be provided")

filepath_or_buffer, _ = ioutils.get_reader_filepath_or_buffer(
path_or_data=filepath_or_buffer,
compression=None,
iotypes=(BytesIO, StringIO),
**kwargs,
)

if compression == "bgzip":
if isinstance(filepath_or_buffer, TextIOBase):
raise ValueError("bgzip compression requires a file path")
filepath_or_buffer = libtext.BGZIPFile(
upsj marked this conversation as resolved.
Show resolved Hide resolved
filepath_or_buffer, compression_offsets
)
elif compression is not None:
raise ValueError("Only bgzip compression is supported at the moment")
elif compression_offsets is not None:
raise ValueError("compression_offsets requires compression to be set")

return cudf.Series._from_data(
libtext.read_text(
filepath_or_buffer, delimiter=delimiter, byte_range=byte_range
Expand Down
Binary file added python/cudf/cudf/tests/data/text/chess.pgn.gz
Binary file not shown.
51 changes: 51 additions & 0 deletions python/cudf/cudf/tests/test_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -845,3 +845,54 @@ def test_read_text_in_memory(datadir):
actual = cudf.read_text(StringIO("x::y::z"), delimiter="::")

assert_eq(expected, actual)


def test_read_text_bgzip(datadir):
    """Reading the compressed chess file must match the plain-text file."""
    delimiter = "1."
    compressed_path = str(datadir) + "/chess.pgn.gz"
    plain_path = str(datadir) + "/chess.pgn"

    with open(plain_path) as f:
        pieces = f.read().split(delimiter)

    # str.split drops the delimiter while read_text keeps it, so re-append
    # it to every piece except the final one.
    expected = cudf.Series(
        [piece + delimiter for piece in pieces[:-1]] + pieces[-1:]
    )

    actual = cudf.read_text(
        compressed_path, delimiter=delimiter, compression="bgzip"
    )

    assert_eq(expected, actual)


def test_read_text_bgzip_offsets(datadir):
    """Reading a virtual-offset range must match a slice of the plain text."""
    delimiter = "1."
    compressed_path = str(datadir) + "/chess.pgn.gz"
    plain_path = str(datadir) + "/chess.pgn"

    # The expected data is the [29, 695) slice of the uncompressed text.
    with open(plain_path) as f:
        pieces = f.read()[29:695].split(delimiter)

    # str.split drops the delimiter while read_text keeps it, so re-append
    # it to every piece except the final one.
    expected = cudf.Series(
        [piece + delimiter for piece in pieces[:-1]] + pieces[-1:]
    )

    # Begin and end offsets handed to the reader as compression_offsets.
    virtual_begin = 58 * 2**16 + 2
    virtual_end = 781 * 2**16 + 7

    actual = cudf.read_text(
        compressed_path,
        delimiter=delimiter,
        compression="bgzip",
        compression_offsets=[virtual_begin, virtual_end],
    )

    assert_eq(expected, actual)
10 changes: 10 additions & 0 deletions python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1172,6 +1172,16 @@
The output contains all rows that start inside the byte range
(i.e. at or after the offset, and before the end at `offset + size`),
which may include rows that continue past the end.
compression : string, default None
The compression type the input is compressed with.
Currently supports only `bgzip`, and requires the path to a file as input.
compression_offsets : list or tuple, default None
The virtual begin and end offsets associated with the provided compression.
For `bgzip`, they are composed of a local uncompressed offset inside a
BGZIP block (lower 16 bits) and the start offset of this BGZIP block in the
compressed file (upper 48 bits).
The start offset points to the first byte to be read, the end offset points
one past the last byte to be read.

Returns
-------
Expand Down