From c4d46bddd3b2f98b7318e8e81c9429c58885b83f Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 7 Nov 2022 08:39:44 -0800 Subject: [PATCH 1/3] Fix IO of GzipFile types --- python/cudf/cudf/tests/test_json.py | 12 ++++++++++++ python/cudf/cudf/utils/ioutils.py | 10 +++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 34aff2c34fe..586131a1642 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -1,6 +1,7 @@ # Copyright (c) 2018-2022, NVIDIA CORPORATION. import copy +import gzip import itertools import os from io import BytesIO, StringIO @@ -943,3 +944,14 @@ def test_order_nested_json_reader(tag, data): ) assert_eq(expected, target, check_dtype=True) + + +def test_json_round_trip_gzip(): + df = cudf.DataFrame({"a": [1, 2, 3], "b": ["abc", "def", "ghi"]}) + bytes = BytesIO() + with gzip.open(bytes, mode="wb") as fo: + df.to_json(fo, orient="records", lines=True) + bytes.seek(0) + with gzip.open(bytes, mode="rb") as fo: + written_df = cudf.read_json(fo, orient="records", lines=True) + assert_eq(written_df, df) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 9146405c6ed..f1d34cf618c 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1953,7 +1953,15 @@ def _fsspec_data_transfer( # Calculate total file size if file_like: - file_size = path_or_fob.size + try: + file_size = path_or_fob.size + except AttributeError: + # Find file size of there is no `size` + # attribute + old_file_position = path_or_fob.tell() + path_or_fob.seek(0, os.SEEK_END) + file_size = path_or_fob.tell() + path_or_fob.seek(old_file_position, os.SEEK_SET) file_size = file_size or fs.size(path_or_fob) # Check if a direct read makes the most sense From c0587e01afd02cf6afa356acc94744b07192428a Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 9 Nov 2022 06:53:16 -0800 Subject: [PATCH 2/3] address review --- python/cudf/cudf/tests/test_json.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index 586131a1642..14238be7bc1 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -955,3 +955,15 @@ def test_json_round_trip_gzip(): with gzip.open(bytes, mode="rb") as fo: written_df = cudf.read_json(fo, orient="records", lines=True) assert_eq(written_df, df) + + # Testing writing from middle of the file. + loc = bytes.tell() + + with gzip.open(bytes, mode="wb") as fo: + fo.seek(loc) + df.to_json(fo, orient="records", lines=True) + bytes.seek(loc) + with gzip.open(bytes, mode="rb") as fo: + fo.seek(loc) + written_df = cudf.read_json(fo, orient="records", lines=True) + assert_eq(written_df, df) From bbdfe90f1935223738305640e9b6fec41260f07e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 9 Nov 2022 10:36:18 -0600 Subject: [PATCH 3/3] Update python/cudf/cudf/utils/ioutils.py Co-authored-by: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> --- python/cudf/cudf/utils/ioutils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index f1d34cf618c..2c4b73666a5 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1956,7 +1956,7 @@ def _fsspec_data_transfer( try: file_size = path_or_fob.size except AttributeError: - # Find file size of there is no `size` + # Find file size if there is no `size` # attribute old_file_position = path_or_fob.tell() path_or_fob.seek(0, os.SEEK_END)