From 072cbeef5ae3054ddbefba614d71ac4325ece14d Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Wed, 19 May 2021 00:31:38 -0700 Subject: [PATCH] Update io util to convert path like object to string (#8275) Fixes failing dask-cudf s3 test introduced from changes to `fsspec.stringify_path` in their 2021.5.0 release. Problem description: While reading remote files with dask-cudf the `path_or_object` param is populated with remote file objects such as `s3fs.S3File` or `gcsfs.GCSFile`. Updates to the `fsspec.stringify_path` util result in returning the string path for such objects, resulting in cudf looking for those file paths locally. Fix: Fix uses one of the branches from the original `stringify_path` util that checks if a `path like` object implements the `fspath` protocol and gets the string representation from that. Authors: - Ayush Dattagupta (https://github.com/ayushdg) Approvers: - Keith Kraus (https://github.com/kkraus14) URL: https://github.com/rapidsai/cudf/pull/8275 --- python/cudf/cudf/io/parquet.py | 3 +-- python/cudf/cudf/utils/ioutils.py | 27 ++++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py index c17630d1227..5ace108a72d 100644 --- a/python/cudf/cudf/io/parquet.py +++ b/python/cudf/cudf/io/parquet.py @@ -5,7 +5,6 @@ from uuid import uuid4 from fsspec.core import get_fs_token_paths -from fsspec.utils import stringify_path from pyarrow import dataset as ds, parquet as pq import cudf @@ -203,7 +202,7 @@ def read_parquet( for source in filepath_or_buffer: if ioutils.is_directory(source, **kwargs): fs = _ensure_filesystem(passed_filesystem=None, path=source) - source = stringify_path(source) + source = ioutils.stringify_pathlike(source) source = fs.sep.join([source, "*.parquet"]) tmp_source, compression = ioutils.get_filepath_or_buffer( diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 16511627aa2..15120fd8fab 
100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1051,7 +1051,7 @@ def _is_local_filesystem(fs): def ensure_single_filepath_or_buffer(path_or_data, **kwargs): """Return False if `path_or_data` resolves to multiple filepaths or buffers """ - path_or_data = fsspec.utils.stringify_path(path_or_data) + path_or_data = stringify_pathlike(path_or_data) if isinstance(path_or_data, str): storage_options = kwargs.get("storage_options") path_or_data = os.path.expanduser(path_or_data) @@ -1076,7 +1076,7 @@ def ensure_single_filepath_or_buffer(path_or_data, **kwargs): def is_directory(path_or_data, **kwargs): """Returns True if the provided filepath is a directory """ - path_or_data = fsspec.utils.stringify_path(path_or_data) + path_or_data = stringify_pathlike(path_or_data) if isinstance(path_or_data, str): storage_options = kwargs.get("storage_options") path_or_data = os.path.expanduser(path_or_data) @@ -1121,7 +1121,7 @@ def get_filepath_or_buffer( compression : str Type of compression algorithm for the content """ - path_or_data = fsspec.utils.stringify_path(path_or_data) + path_or_data = stringify_pathlike(path_or_data) if isinstance(path_or_data, str): storage_options = kwargs.get("storage_options") @@ -1223,6 +1223,27 @@ def is_fsspec_open_file(file_obj): return False +def stringify_pathlike(pathlike): + """ + Convert any object that implements the fspath protocol + to a string. Leaves other objects unchanged + Parameters + ---------- + pathlike + Pathlike object that implements the fspath protocol + + Returns + ------- + maybe_pathlike_str + String version of the object if possible + """ + maybe_pathlike_str = ( + pathlike.__fspath__() if hasattr(pathlike, "__fspath__") else pathlike + ) + + return maybe_pathlike_str + + def buffer_write_lines(buf, lines): """ Appends lines to a buffer.