
Commit

Added documentation, extra test cases, and a change to ioutils for when an unknown file protocol is encountered
jdye64 committed Jun 16, 2021
1 parent bd9e59b commit b89f085
Showing 3 changed files with 50 additions and 3 deletions.
14 changes: 14 additions & 0 deletions python/cudf/cudf/io/json.py
@@ -3,13 +3,22 @@
from io import BytesIO, StringIO

import pandas as pd
from fsspec.core import get_fs_token_paths

import cudf
from cudf._lib import json as libjson
from cudf.utils import ioutils
from cudf.utils.dtypes import is_list_like


def _ensure_filesystem(passed_filesystem, path):
if passed_filesystem is None:
return get_fs_token_paths(path[0] if isinstance(path, list) else path)[
0
]
return passed_filesystem


@ioutils.doc_read_json()
def read_json(
path_or_buf,
@@ -35,6 +44,11 @@ def read_json(

filepaths_or_buffers = []
for source in path_or_buf:
if ioutils.is_directory(source, **kwargs):
fs = _ensure_filesystem(passed_filesystem=None, path=source)
source = ioutils.stringify_pathlike(source)
source = fs.sep.join([source, "*.json"])

tmp_source, compression = ioutils.get_filepath_or_buffer(
path_or_data=source,
compression=compression,
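
For context, here is a minimal sketch of what the new directory branch in read_json does, assuming a hypothetical local directory json_dir/ containing line-delimited *.json files that share one schema:

import cudf
from fsspec.core import get_fs_token_paths

source = "json_dir"                         # hypothetical directory of JSON Lines files
fs = get_fs_token_paths(source)[0]          # what _ensure_filesystem resolves when no fs is passed
pattern = fs.sep.join([source, "*.json"])   # read_json now globs "json_dir/*.json"

# End-user call enabled by this change: pass the directory itself.
df = cudf.read_json(source, engine="cudf", lines=True)
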
31 changes: 31 additions & 0 deletions python/cudf/cudf/tests/test_json.py
@@ -212,6 +212,37 @@ def test_json_lines_multiple(tmpdir, json_input, engine):
np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_array())


@pytest.mark.parametrize("engine", ["auto", "cudf"])
def test_json_read_directory(tmpdir, json_input, engine):
pdf = pd.read_json(json_input, lines=True)
pdf.to_json(
tmpdir.join("MultiInputs1.json"),
compression="infer",
lines=True,
orient="records",
)
pdf.to_json(
tmpdir.join("MultiInputs2.json"),
compression="infer",
lines=True,
orient="records",
)
pdf.to_json(
tmpdir.join("MultiInputs3.json"),
compression="infer",
lines=True,
orient="records",
)

cu_df = cudf.read_json(tmpdir, engine=engine, lines=True)
pd_df = pd.concat([pdf, pdf, pdf])

assert all(cu_df.dtypes == ["int64", "int64", "int64"])
for cu_col, pd_col in zip(cu_df.columns, pd_df.columns):
assert str(cu_col) == str(pd_col)
np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_array())


def test_json_lines_byte_range(json_input):
# include the first row and half of the second row
# should parse the first two rows
8 changes: 5 additions & 3 deletions python/cudf/cudf/utils/ioutils.py
@@ -390,11 +390,13 @@
Parameters
----------
-path_or_buf : str, path object, or file-like object
+path_or_buf : list, str, path object, or file-like object
Either JSON data in a `str`, path to a file (a `str`, `pathlib.Path`, or
`py._path.local.LocalPath`), URL (including http, ftp, and S3 locations),
or any object with a `read()` method (such as builtin `open()` file handler
-function or `StringIO`).
+function or `StringIO`). Multiple inputs may be provided as a list. If a
+list is specified each list entry may be of a different input type as long
+as each input is of a valid type and all input JSON schema(s) match.
engine : {{ 'auto', 'cudf', 'pandas' }}, default 'auto'
Parser engine to use. If 'auto' is passed, the engine will be
automatically selected based on the other parameters.
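
A brief usage sketch of the list form described above; the file name and the inline JSON string are made up for illustration, and every entry must share the same schema:

import cudf

sources = [
    "part1.json",                               # hypothetical path to a JSON Lines file
    '{"a": 3, "b": 30}\n{"a": 4, "b": 40}\n',   # inline JSON Lines data
]
df = cudf.read_json(sources, engine="cudf", lines=True)
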
@@ -1086,7 +1088,7 @@ def is_directory(path_or_data, **kwargs):
)
except ValueError as e:
if str(e).startswith("Protocol not known"):
-return True
+return False
else:
raise e

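The behaviour change above means that when fsspec cannot resolve a protocol for a string input, the input is no longer reported as a directory and instead falls through to the regular file/buffer handling. A simplified stand-in for that logic (not the full cudf helper; the function name is made up):

from fsspec.core import get_fs_token_paths


def looks_like_directory(path):
    # Hypothetical, simplified version of ioutils.is_directory after this commit.
    try:
        fs, _, paths = get_fs_token_paths(path, mode="rb")
    except ValueError as e:
        if str(e).startswith("Protocol not known"):
            return False  # the commit flips this from True to False
        raise
    return fs.isdir(paths[0])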
