
Commit

Added documentation, extra test cases, and a change to ioutils for when an unknown file protocol is encountered
jdye64 committed Jun 16, 2021
1 parent bd9e59b commit b89f085
Showing 3 changed files with 50 additions and 3 deletions.
14 changes: 14 additions & 0 deletions python/cudf/cudf/io/json.py
@@ -3,13 +3,22 @@
from io import BytesIO, StringIO

import pandas as pd
from fsspec.core import get_fs_token_paths

import cudf
from cudf._lib import json as libjson
from cudf.utils import ioutils
from cudf.utils.dtypes import is_list_like


def _ensure_filesystem(passed_filesystem, path):
if passed_filesystem is None:
return get_fs_token_paths(path[0] if isinstance(path, list) else path)[
0
]
return passed_filesystem


@ioutils.doc_read_json()
def read_json(
path_or_buf,
@@ -35,6 +44,11 @@ def read_json(

filepaths_or_buffers = []
for source in path_or_buf:
if ioutils.is_directory(source, **kwargs):
fs = _ensure_filesystem(passed_filesystem=None, path=source)
source = ioutils.stringify_pathlike(source)
source = fs.sep.join([source, "*.json"])

tmp_source, compression = ioutils.get_filepath_or_buffer(
path_or_data=source,
compression=compression,
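
For context, here is a minimal sketch of what the new directory branch in read_json does, assuming a hypothetical local directory json_dir/ containing line-delimited *.json files that share one schema:

import cudf
from fsspec.core import get_fs_token_paths

source = "json_dir"                         # hypothetical directory of JSON Lines files
fs = get_fs_token_paths(source)[0]          # what _ensure_filesystem resolves when no fs is passed
pattern = fs.sep.join([source, "*.json"])   # read_json now globs "json_dir/*.json"

# End-user call enabled by this change: pass the directory itself.
df = cudf.read_json(source, engine="cudf", lines=True)
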
31 changes: 31 additions & 0 deletions python/cudf/cudf/tests/test_json.py
@@ -212,6 +212,37 @@ def test_json_lines_multiple(tmpdir, json_input, engine):
np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_array())


@pytest.mark.parametrize("engine", ["auto", "cudf"])
def test_json_read_directory(tmpdir, json_input, engine):
pdf = pd.read_json(json_input, lines=True)
pdf.to_json(
tmpdir.join("MultiInputs1.json"),
compression="infer",
lines=True,
orient="records",
)
pdf.to_json(
tmpdir.join("MultiInputs2.json"),
compression="infer",
lines=True,
orient="records",
)
pdf.to_json(
tmpdir.join("MultiInputs3.json"),
compression="infer",
lines=True,
orient="records",
)

cu_df = cudf.read_json(tmpdir, engine=engine, lines=True)
pd_df = pd.concat([pdf, pdf, pdf])

assert all(cu_df.dtypes == ["int64", "int64", "int64"])
for cu_col, pd_col in zip(cu_df.columns, pd_df.columns):
assert str(cu_col) == str(pd_col)
np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_array())


def test_json_lines_byte_range(json_input):
# include the first row and half of the second row
# should parse the first two rows
8 changes: 5 additions & 3 deletions python/cudf/cudf/utils/ioutils.py
@@ -390,11 +390,13 @@
Parameters
----------
-path_or_buf : str, path object, or file-like object
+path_or_buf : list, str, path object, or file-like object
Either JSON data in a `str`, path to a file (a `str`, `pathlib.Path`, or
`py._path.local.LocalPath`), URL (including http, ftp, and S3 locations),
or any object with a `read()` method (such as builtin `open()` file handler
-function or `StringIO`).
+function or `StringIO`). Multiple inputs may be provided as a list. If a
+list is specified each list entry may be of a different input type as long
+as each input is of a valid type and all input JSON schema(s) match.
engine : {{ 'auto', 'cudf', 'pandas' }}, default 'auto'
Parser engine to use. If 'auto' is passed, the engine will be
automatically selected based on the other parameters.
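
A brief usage sketch of the list form described above; the file name and the inline JSON string are made up for illustration, and every entry must share the same schema:

import cudf

sources = [
    "part1.json",                               # hypothetical path to a JSON Lines file
    '{"a": 3, "b": 30}\n{"a": 4, "b": 40}\n',   # inline JSON Lines data
]
df = cudf.read_json(sources, engine="cudf", lines=True)
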
@@ -1086,7 +1088,7 @@ def is_directory(path_or_data, **kwargs):
)
except ValueError as e:
if str(e).startswith("Protocol not known"):
-return True
+return False
else:
raise e

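The behaviour change above means that when fsspec cannot resolve a protocol for a string input, the input is no longer reported as a directory and instead falls through to the regular file/buffer handling. A simplified stand-in for that logic (not the full cudf helper; the function name is made up):

from fsspec.core import get_fs_token_paths


def looks_like_directory(path):
    # Hypothetical, simplified version of ioutils.is_directory after this commit.
    try:
        fs, _, paths = get_fs_token_paths(path, mode="rb")
    except ValueError as e:
        if str(e).startswith("Protocol not known"):
            return False  # the commit flips this from True to False
        raise
    return fs.isdir(paths[0])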
