rapidsai · rapids-bot · Oct 7, 2022 · Sep 28, 2022 · Sep 28, 2022 · Sep 28, 2022
@@ -1,5 +1,6 @@
 # Copyright (c) 2020-2022, NVIDIA CORPORATION.
 
+from libc.stdint cimport uint64_t
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
@@ -25,6 +26,12 @@ cdef extern from "cudf/io/text/data_chunk_source_factories.hpp" \
     unique_ptr[data_chunk_source] make_source(string data) except +
     unique_ptr[data_chunk_source] \
         make_source_from_file(string filename) except +
+    unique_ptr[data_chunk_source] \
+        make_source_from_bgzip_file(string filename) except +  # overload without offsets
+    unique_ptr[data_chunk_source] \
+        make_source_from_bgzip_file(string filename,
+                                    uint64_t virtual_begin,
+                                    uint64_t virtual_end) except +  # overload with a virtual offset pair
 
 
 cdef extern from "cudf/io/text/multibyte_split.hpp" \

@@ -5,6 +5,7 @@ from io import TextIOBase
 import cudf
 
 from cython.operator cimport dereference
+from libc.stdint cimport uint64_t
 from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
@@ -15,11 +16,24 @@ from cudf._lib.cpp.io.text cimport (
     byte_range_info,
     data_chunk_source,
     make_source,
+    make_source_from_bgzip_file,
     make_source_from_file,
     multibyte_split,
 )
 
 
+class BGZIPFile:  # holds a bgzip file name plus an optional (begin, end) offset pair
+    def __init__(self, filename, compression_offsets):
+        self.filename = filename
+        self.has_offsets = compression_offsets is not None  # None means no offsets were supplied
+        if self.has_offsets:
+            if len(compression_offsets) != 2:  # exactly two elements are accepted
+                raise ValueError(
+                    "compression offsets need to consist of two elements")
+            self.begin_offset = compression_offsets[0]
+            self.end_offset = compression_offsets[1]  # NOTE: begin/end attributes exist only if has_offsets
+
+
 def read_text(object filepaths_or_buffers,
               object delimiter=None,
               object byte_range=None):
@@ -38,9 +52,22 @@ def read_text(object filepaths_or_buffers,
     cdef size_t c_byte_range_offset
     cdef size_t c_byte_range_size
     cdef byte_range_info c_byte_range
+    cdef uint64_t c_compression_begin_offset
+    cdef uint64_t c_compression_end_offset
 
     if isinstance(filepaths_or_buffers, TextIOBase):
         datasource = move(make_source(filepaths_or_buffers.read().encode()))
+    elif isinstance(filepaths_or_buffers, BGZIPFile):
+        if filepaths_or_buffers.has_offsets:
+            c_compression_begin_offset = filepaths_or_buffers.begin_offset
+            c_compression_end_offset = filepaths_or_buffers.end_offset
+            datasource = move(make_source_from_bgzip_file(
+                filepaths_or_buffers.filename.encode(),
+                c_compression_begin_offset,
+                c_compression_end_offset))
+        else:
+            datasource = move(make_source_from_bgzip_file(
+                filepaths_or_buffers.filename.encode()))
     else:
         datasource = move(make_source_from_file(filepaths_or_buffers.encode()))
 

@@ -1,6 +1,6 @@
 # Copyright (c) 2018-2022, NVIDIA CORPORATION.
 
-from io import BytesIO, StringIO
+from io import BytesIO, StringIO, TextIOBase
 
 import cudf
 from cudf._lib import text as libtext
@@ -14,17 +14,33 @@ def read_text(
     filepath_or_buffer,
     delimiter=None,
     byte_range=None,
+    compression=None,
+    compression_offsets=None,
     **kwargs,
 ):
     """{docstring}"""
 
-    filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer(
+    if delimiter is None:
+        raise ValueError("delimiter needs to be provided")
+
+    filepath_or_buffer, _ = ioutils.get_reader_filepath_or_buffer(
         path_or_data=filepath_or_buffer,
         compression=None,
         iotypes=(BytesIO, StringIO),
         **kwargs,
     )
 
+    if compression == "bgzip":
+        if isinstance(filepath_or_buffer, TextIOBase):
+            raise ValueError("bgzip compression requires a file path")
+        filepath_or_buffer = libtext.BGZIPFile(
+            filepath_or_buffer, compression_offsets
+        )
+    elif compression is not None:
+        raise ValueError("Only bgzip compression is supported at the moment")
+    elif compression_offsets is not None:
+        raise ValueError("compression_offsets requires compression to be set")
+
     return cudf.Series._from_data(
         libtext.read_text(
             filepath_or_buffer, delimiter=delimiter, byte_range=byte_range

@@ -845,3 +845,54 @@ def test_read_text_in_memory(datadir):
     actual = cudf.read_text(StringIO("x::y::z"), delimiter="::")
 
     assert_eq(expected, actual)
+
+
+def test_read_text_bgzip(datadir):  # bgzip read is compared against the plain-text file
+    chess_file_compressed = str(datadir) + "/chess.pgn.gz"
+    chess_file = str(datadir) + "/chess.pgn"  # presumably the uncompressed counterpart -- confirm
+    delimiter = "1."
+
+    with open(chess_file) as f:
+        content = f.read().split(delimiter)
+
+    # Python's str.split drops the delimiter while read_text keeps it,
+    # so re-append it to every piece of 'content' except the last one.
+    expected = cudf.Series(
+        [
+            c + delimiter if i < (len(content) - 1) else c
+            for i, c in enumerate(content)
+        ]
+    )
+
+    actual = cudf.read_text(
+        chess_file_compressed, compression="bgzip", delimiter=delimiter
+    )
+
+    assert_eq(expected, actual)
+
+
+def test_read_text_bgzip_offsets(datadir):  # bgzip read restricted by compression_offsets
+    chess_file_compressed = str(datadir) + "/chess.pgn.gz"
+    chess_file = str(datadir) + "/chess.pgn"  # presumably the uncompressed counterpart -- confirm
+    delimiter = "1."
+
+    with open(chess_file) as f:
+        content = f.read()[29:695].split(delimiter)  # presumably the span the offsets below select -- confirm
+
+    # Python's str.split drops the delimiter while read_text keeps it,
+    # so re-append it to every piece of 'content' except the last one.
+    expected = cudf.Series(
+        [
+            c + delimiter if i < (len(content) - 1) else c
+            for i, c in enumerate(content)
+        ]
+    )
+
+    actual = cudf.read_text(
+        chess_file_compressed,
+        compression="bgzip",
+        compression_offsets=[58 * 2**16 + 2, 781 * 2**16 + 7],  # block start * 2**16 + offset inside block
+        delimiter=delimiter,
+    )
+
+    assert_eq(expected, actual)
@@ -1172,6 +1172,16 @@
     The output contains all rows that start inside the byte range
     (i.e. at or after the offset, and before the end at `offset + size`),
     which may include rows that continue past the end.
+compression : string, default None
+    The compression format of the input.
+    Currently only `bgzip` is supported, and it requires a file path as input.
+compression_offsets : list or tuple, default None
+    The virtual begin and end offsets associated with the provided compression.
+    For `bgzip`, each offset is composed of a local uncompressed offset inside
+    a BGZIP block (lower 16 bits) and the start offset of this BGZIP block in
+    the compressed file (upper 48 bits).
+    The begin offset points to the first byte to be read; the end offset points
+    one past the last byte to be read.
 
 Returns
 -------