Skip to content

Commit

Permalink
Fix an error in IO with GzipFile type (#12085)
Browse files Browse the repository at this point in the history
Fixes: #10590 

This PR fixes an error raised when a file-like object (for example, a `GzipFile`) doesn't have a `size` attribute; in that case, the size of the file is now computed manually.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - https://github.com/brandon-b-miller
  - Vukasin Milovanovic (https://github.com/vuule)

URL: #12085
  • Loading branch information
galipremsagar authored Nov 9, 2022
1 parent 74053f4 commit a2c428c
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 1 deletion.
24 changes: 24 additions & 0 deletions python/cudf/cudf/tests/test_json.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2018-2022, NVIDIA CORPORATION.

import copy
import gzip
import itertools
import os
from io import BytesIO, StringIO
Expand Down Expand Up @@ -943,3 +944,26 @@ def test_order_nested_json_reader(tag, data):
)

assert_eq(expected, target, check_dtype=True)


def test_json_round_trip_gzip():
    """Round-trip a DataFrame through gzip-wrapped JSON lines in memory.

    The frame is written with ``to_json`` into a gzip file object layered
    on a ``BytesIO`` and read back with ``cudf.read_json`` from a gzip
    file object over the same buffer. This is done twice: once from the
    start of the buffer, and once with both the writer and the reader
    first seeked to a non-trivial offset.
    """
    df = cudf.DataFrame({"a": [1, 2, 3], "b": ["abc", "def", "ghi"]})

    # Named `buffer` rather than `bytes` so the builtin type is not shadowed.
    buffer = BytesIO()
    with gzip.open(buffer, mode="wb") as fo:
        df.to_json(fo, orient="records", lines=True)

    # Rewind the underlying buffer so the reader starts at the gzip header.
    buffer.seek(0)
    with gzip.open(buffer, mode="rb") as fo:
        written_df = cudf.read_json(fo, orient="records", lines=True)
    assert_eq(written_df, df)

    # Testing writing from middle of the file.
    # `loc` is wherever the first read left the underlying buffer; it is
    # reused below both as the start of the second gzip stream in `buffer`
    # and as the seek target inside the gzip file objects.
    loc = buffer.tell()

    with gzip.open(buffer, mode="wb") as fo:
        # Move the gzip writer away from position 0 before writing.
        fo.seek(loc)
        df.to_json(fo, orient="records", lines=True)

    # Point the underlying buffer back at the start of the second stream.
    buffer.seek(loc)
    with gzip.open(buffer, mode="rb") as fo:
        # Mirror the writer's seek so reading starts at the JSON payload.
        fo.seek(loc)
        written_df = cudf.read_json(fo, orient="records", lines=True)
    assert_eq(written_df, df)
10 changes: 9 additions & 1 deletion python/cudf/cudf/utils/ioutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1953,7 +1953,15 @@ def _fsspec_data_transfer(

# Calculate total file size
if file_like:
file_size = path_or_fob.size
try:
file_size = path_or_fob.size
except AttributeError:
# Find file size if there is no `size`
# attribute
old_file_position = path_or_fob.tell()
path_or_fob.seek(0, os.SEEK_END)
file_size = path_or_fob.tell()
path_or_fob.seek(old_file_position, os.SEEK_SET)
file_size = file_size or fs.size(path_or_fob)

# Check if a direct read makes the most sense
Expand Down

0 comments on commit a2c428c

Please sign in to comment.