From b89f08545b7a4b9d6c062d9a9e37362187ac17f1 Mon Sep 17 00:00:00 2001
From: Jeremy Dyer
Date: Wed, 16 Jun 2021 21:48:39 +0000
Subject: [PATCH] Added documentation, extra test cases, and a change to
 ioutils for when an unknown file protocol is encountered

---
 python/cudf/cudf/io/json.py         | 14 +++++++++++++
 python/cudf/cudf/tests/test_json.py | 31 +++++++++++++++++++++++++++++
 python/cudf/cudf/utils/ioutils.py   |  8 +++++---
 3 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index 6d8fcad7364..a6a6c05a54e 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -3,6 +3,7 @@
 from io import BytesIO, StringIO
 
 import pandas as pd
+from fsspec.core import get_fs_token_paths
 
 import cudf
 from cudf._lib import json as libjson
@@ -10,6 +11,14 @@
 from cudf.utils.dtypes import is_list_like
 
 
+def _ensure_filesystem(passed_filesystem, path):
+    if passed_filesystem is None:
+        return get_fs_token_paths(path[0] if isinstance(path, list) else path)[
+            0
+        ]
+    return passed_filesystem
+
+
 @ioutils.doc_read_json()
 def read_json(
     path_or_buf,
@@ -35,6 +44,11 @@ def read_json(
 
         filepaths_or_buffers = []
         for source in path_or_buf:
+            if ioutils.is_directory(source, **kwargs):
+                fs = _ensure_filesystem(passed_filesystem=None, path=source)
+                source = ioutils.stringify_pathlike(source)
+                source = fs.sep.join([source, "*.json"])
+
             tmp_source, compression = ioutils.get_filepath_or_buffer(
                 path_or_data=source,
                 compression=compression,
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index 4cd906377d6..2da2cea164f 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -212,6 +212,37 @@ def test_json_lines_multiple(tmpdir, json_input, engine):
         np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_array())
 
 
+@pytest.mark.parametrize("engine", ["auto", "cudf"])
+def test_json_read_directory(tmpdir, json_input, engine):
+    pdf = pd.read_json(json_input, lines=True)
+    pdf.to_json(
+        tmpdir.join("MultiInputs1.json"),
+        compression="infer",
+        lines=True,
+        orient="records",
+    )
+    pdf.to_json(
+        tmpdir.join("MultiInputs2.json"),
+        compression="infer",
+        lines=True,
+        orient="records",
+    )
+    pdf.to_json(
+        tmpdir.join("MultiInputs3.json"),
+        compression="infer",
+        lines=True,
+        orient="records",
+    )
+
+    cu_df = cudf.read_json(tmpdir, engine=engine, lines=True)
+    pd_df = pd.concat([pdf, pdf, pdf])
+
+    assert all(cu_df.dtypes == ["int64", "int64", "int64"])
+    for cu_col, pd_col in zip(cu_df.columns, pd_df.columns):
+        assert str(cu_col) == str(pd_col)
+        np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_array())
+
+
 def test_json_lines_byte_range(json_input):
     # include the first row and half of the second row
     # should parse the first two rows
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 15120fd8fab..d77ba93f9e8 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -390,11 +390,13 @@
 
 Parameters
 ----------
-path_or_buf : str, path object, or file-like object
+path_or_buf : list, str, path object, or file-like object
     Either JSON data in a `str`, path to a file (a `str`, `pathlib.Path`, or
     `py._path.local.LocalPath`), URL (including http, ftp, and S3 locations),
     or any object with a `read()` method (such as builtin `open()` file handler
-    function or `StringIO`).
+    function or `StringIO`). Multiple inputs may be provided as a list. If a
+    list is given, each entry may be of a different input type, as long as
+    every entry is a valid type and all inputs share the same JSON schema.
 engine : {{ 'auto', 'cudf', 'pandas' }}, default 'auto'
     Parser engine to use. If 'auto' is passed, the engine will be
     automatically selected based on the other parameters.
@@ -1086,7 +1088,7 @@ def is_directory(path_or_data, **kwargs):
         )
     except ValueError as e:
         if str(e).startswith("Protocol not known"):
-            return True
+            return False
         else:
             raise e
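
A usage note (not part of the patch): the directory handling added to read_json
above can be exercised end to end with a local directory of JSON Lines files.
The snippet below is a minimal sketch; the directory name and sample data are
made up for illustration.

    import os

    import pandas as pd

    import cudf

    # Build a small directory of JSON Lines files (illustrative paths/data).
    os.makedirs("json_dir", exist_ok=True)
    pdf = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
    pdf.to_json("json_dir/part1.json", orient="records", lines=True)
    pdf.to_json("json_dir/part2.json", orient="records", lines=True)

    # With this patch a directory path is expanded to "<dir>/*.json", so every
    # matching file is read and concatenated into a single DataFrame.
    gdf = cudf.read_json("json_dir", engine="cudf", lines=True)
    assert len(gdf) == 2 * len(pdf)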
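
The _ensure_filesystem helper and the fs.sep.join call rely only on public
fsspec behavior: get_fs_token_paths resolves a filesystem object for a path,
and that filesystem's separator is used to build the glob pattern. The sketch
below assumes a plain local path named "json_dir" and simply shows what
read_json constructs internally for the directory case.

    from fsspec.core import get_fs_token_paths

    # get_fs_token_paths returns (filesystem, token, paths); _ensure_filesystem
    # only needs the filesystem object, hence the trailing [0].
    fs = get_fs_token_paths("json_dir")[0]

    # The same "<dir>/*.json" pattern the new directory branch builds.
    pattern = fs.sep.join(["json_dir", "*.json"])
    print(pattern)           # json_dir/*.json
    print(fs.glob(pattern))  # the concrete files that would be read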
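
The is_directory change at the end of the patch covers inputs for which fsspec
cannot resolve a protocol at all; reporting them as "not a directory" lets them
fall through to the regular path/buffer handling instead of being globbed. The
sketch below uses a deliberately unregistered protocol name to show the
ValueError that the except branch catches.

    from fsspec.core import get_fs_token_paths

    try:
        # "notaprotocol" is a made-up, unregistered protocol name.
        get_fs_token_paths("notaprotocol://bucket/data.json")
    except ValueError as err:
        print(err)  # fsspec reports roughly: "Protocol not known: notaprotocol"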