diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index 34aff2c34fe..14238be7bc1 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2018-2022, NVIDIA CORPORATION.
 
 import copy
+import gzip
 import itertools
 import os
 from io import BytesIO, StringIO
@@ -943,3 +944,27 @@ def test_order_nested_json_reader(tag, data):
     )
 
     assert_eq(expected, target, check_dtype=True)
+
+
+def test_json_round_trip_gzip():
+    """Round-trip a DataFrame through gzip-compressed JSON lines."""
+    df = cudf.DataFrame({"a": [1, 2, 3], "b": ["abc", "def", "ghi"]})
+    buffer = BytesIO()
+    with gzip.open(buffer, mode="wb") as fo:
+        df.to_json(fo, orient="records", lines=True)
+    buffer.seek(0)
+    with gzip.open(buffer, mode="rb") as fo:
+        written_df = cudf.read_json(fo, orient="records", lines=True)
+    assert_eq(written_df, df)
+
+    # Testing writing from middle of the file.
+    loc = buffer.tell()
+
+    with gzip.open(buffer, mode="wb") as fo:
+        fo.seek(loc)
+        df.to_json(fo, orient="records", lines=True)
+    buffer.seek(loc)
+    with gzip.open(buffer, mode="rb") as fo:
+        fo.seek(loc)
+        written_df = cudf.read_json(fo, orient="records", lines=True)
+    assert_eq(written_df, df)
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 9146405c6ed..2c4b73666a5 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -1953,7 +1953,15 @@ def _fsspec_data_transfer(
 
     # Calculate total file size
     if file_like:
-        file_size = path_or_fob.size
+        try:
+            file_size = path_or_fob.size
+        except AttributeError:
+            # No `size` attribute: measure the length by seeking to
+            # the end, then restore the original position.
+            old_file_position = path_or_fob.tell()
+            path_or_fob.seek(0, os.SEEK_END)
+            file_size = path_or_fob.tell()
+            path_or_fob.seek(old_file_position, os.SEEK_SET)
     file_size = file_size or fs.size(path_or_fob)
 
     # Check if a direct read makes the most sense