From 74a6360efa2b7934e66d5ca28d007c926089ee25 Mon Sep 17 00:00:00 2001
From: mvashishtha
Date: Wed, 1 Mar 2023 09:39:35 -0600
Subject: [PATCH] FIX-#5723: Attempt to read list of parquet files in one go.

Signed-off-by: mvashishtha
---
 .../io/column_stores/parquet_dispatcher.py | 133 ++++++++++--------
 modin/pandas/test/test_io.py               |  14 ++
 2 files changed, 92 insertions(+), 55 deletions(-)

diff --git a/modin/core/io/column_stores/parquet_dispatcher.py b/modin/core/io/column_stores/parquet_dispatcher.py
index e9e95d842a2..e5e3478ec45 100644
--- a/modin/core/io/column_stores/parquet_dispatcher.py
+++ b/modin/core/io/column_stores/parquet_dispatcher.py
@@ -22,6 +22,8 @@
 from fsspec.spec import AbstractBufferedFile
 import numpy as np
 from packaging import version
+from pandas.api.types import is_list_like
+import s3fs

 from modin.core.storage_formats.pandas.utils import compute_chunksize
 from modin.config import NPartitions
@@ -40,7 +42,7 @@ class ColumnStoreDataset:

     Attributes
     ----------
-    path : str, path object or file-like object
-        The filepath of the parquet file in local filesystem or hdfs.
+    paths : list-like of str, path object or file-like object
+        The filepaths of the parquet files in local filesystem or hdfs.
     storage_options : dict
         Parameters for specific storage engine.
@@ -60,10 +62,13 @@ class ColumnStoreDataset:
         List that contains the full paths of the parquet files in the dataset.
     """

-    def __init__(self, path, storage_options):  # noqa : PR01
-        self.path = path.__fspath__() if isinstance(path, os.PathLike) else path
+    def __init__(self, paths, storage_options):  # noqa : PR01
+        self.paths = [
+            path.__fspath__() if isinstance(path, os.PathLike) else path
+            for path in paths
+        ]
         self.storage_options = storage_options
-        self._fs_path = None
+        self._fs_paths = None
         self._fs = None
         self.dataset = self._init_dataset()
         self._row_groups_per_file = None
@@ -107,14 +112,18 @@ def fs(self):
             Filesystem object.
         """
         if self._fs is None:
-            if isinstance(self.path, AbstractBufferedFile):
-                self._fs = self.path.fs
+            if isinstance(self.paths[0], AbstractBufferedFile):
+                self._fs = self.paths[0].fs
             else:
-                self._fs, self._fs_path = url_to_fs(self.path, **self.storage_options)
+                self._fs_paths = [
+                    url_to_fs(path, **self.storage_options)[1] for path in self.paths
+                ]
+                # If given list of paths, can assume they are all on the same filesystem
+                self._fs = url_to_fs(self.paths[0], **self.storage_options)[0]
         return self._fs

     @property
-    def fs_path(self):
+    def fs_paths(self):
         """
-        Return the filesystem-specific path or file handle.
+        Return the filesystem-specific paths or file handles.

         Returns
         -------
-        fs_path : str, path object or file-like object
-            String path specific to filesystem or a file handle.
+        fs_paths : list of str, path objects or file-like objects
+            String paths specific to filesystem or file handles.
""" - if self._fs_path is None: - if isinstance(self.path, AbstractBufferedFile): - self._fs_path = self.path + if self._fs_paths is None: + if len(self.paths) > 1: + self._fs_paths = self.paths + elif isinstance(self.paths[0], AbstractBufferedFile): + self._fs_paths = [self.paths[0]] else: - self._fs, self._fs_path = url_to_fs(self.path, **self.storage_options) - return self._fs_path + self._fs_paths = [ + url_to_fs(path, **self.storage_options)[1] for path in self.paths + ] + # If given list of paths, can assume they are all on the same filesystem + self._fs = url_to_fs(self.paths[0])[0] + return self._fs_paths def to_pandas_dataframe(self, columns): """ @@ -166,8 +181,8 @@ def _unstrip_protocol(protocol, path): return path return f"{protos[0]}://{path}" - if isinstance(self.path, AbstractBufferedFile): - return [self.path] + if len(self.paths) == 1 and isinstance(self.paths[0], AbstractBufferedFile): + return [self.paths[0]] # version.parse() is expensive, so we can split this into two separate loops if version.parse(fsspec.__version__) < version.parse("2022.5.0"): fs_files = [_unstrip_protocol(self.fs.protocol, fpath) for fpath in files] @@ -183,7 +198,7 @@ def _init_dataset(self): # noqa: GL08 from pyarrow.parquet import ParquetDataset return ParquetDataset( - self.fs_path, filesystem=self.fs, use_legacy_dataset=False + self.fs_paths, filesystem=self.fs, use_legacy_dataset=False ) @property @@ -231,7 +246,7 @@ def to_pandas_dataframe( from pyarrow.parquet import read_table return read_table( - self._fs_path, columns=columns, filesystem=self.fs + self._fs_paths, columns=columns, filesystem=self.fs ).to_pandas() @@ -240,7 +255,7 @@ class FastParquetDataset(ColumnStoreDataset): def _init_dataset(self): # noqa: GL08 from fastparquet import ParquetFile - return ParquetFile(self.fs_path, fs=self.fs) + return ParquetFile(self.fs_paths, fs=self.fs) @property def pandas_metadata(self): @@ -285,12 +300,13 @@ def _get_fastparquet_files(self): # noqa: GL08 # have to copy some of their logic here while we work on getting # an easier method to get a list of valid files. # See: https://github.com/dask/fastparquet/issues/795 - if "*" in self.path: + if "*" in self.paths: files = self.fs.glob(self.path) else: files = [ f - for f in self.fs.find(self.path) + for path in self.paths + for f in self.fs.find(path) if f.endswith(".parquet") or f.endswith(".parq") ] return files @@ -612,41 +628,49 @@ def _read(cls, path, engine, columns, **kwargs): ParquetFile API is used. Please refer to the documentation here https://arrow.apache.org/docs/python/parquet.html """ - if isinstance(path, str): - if os.path.isdir(path): - path_generator = os.walk(path) - else: - storage_options = kwargs.get("storage_options") - if storage_options is not None: - fs, fs_path = url_to_fs(path, **storage_options) + # s3fs.core.S3File is considered list-like + path_list = ( + path + if is_list_like(path) and not isinstance(path, s3fs.core.S3File) + else [path] + ) + for path in path_list: + if isinstance(path, str): + if os.path.isdir(path): + path_generator = os.walk(path) else: - fs, fs_path = url_to_fs(path) - path_generator = fs.walk(fs_path) - partitioned_columns = set() - # We do a tree walk of the path directory because partitioned - # parquet directories have a unique column at each directory level. 
-            # Thus, we can use os.walk(), which does a dfs search, to walk
-            # through the different columns that the data is partitioned on
-            for _, dir_names, files in path_generator:
-                if dir_names:
-                    partitioned_columns.add(dir_names[0].split("=")[0])
-                if files:
-                    # Metadata files, git files, .DSStore
-                    # TODO: fix conditional for column partitioning, see issue #4637
-                    if len(files[0]) > 0 and files[0][0] == ".":
-                        continue
-                    break
-            partitioned_columns = list(partitioned_columns)
-            if len(partitioned_columns):
-                return cls.single_worker_read(
-                    path,
-                    engine=engine,
-                    columns=columns,
-                    reason="Mixed partitioning columns in Parquet",
-                    **kwargs,
-                )
-
-        dataset = cls.get_dataset(path, engine, kwargs.get("storage_options") or {})
+                    storage_options = kwargs.get("storage_options")
+                    if storage_options is not None:
+                        fs, fs_path = url_to_fs(path, **storage_options)
+                    else:
+                        fs, fs_path = url_to_fs(path)
+                    path_generator = fs.walk(fs_path)
+                partitioned_columns = set()
+                # We do a tree walk of the path directory because partitioned
+                # parquet directories have a unique column at each directory level.
+                # Thus, we can use os.walk(), which does a dfs search, to walk
+                # through the different columns that the data is partitioned on
+                for _, dir_names, files in path_generator:
+                    if dir_names:
+                        partitioned_columns.add(dir_names[0].split("=")[0])
+                    if files:
+                        # Metadata files, git files, .DSStore
+                        # TODO: fix conditional for column partitioning, see issue #4637
+                        if len(files[0]) > 0 and files[0][0] == ".":
+                            continue
+                        break
+                partitioned_columns = list(partitioned_columns)
+                if len(partitioned_columns):
+                    return cls.single_worker_read(
+                        path,
+                        engine=engine,
+                        columns=columns,
+                        reason="Mixed partitioning columns in Parquet",
+                        **kwargs,
+                    )
+        dataset = cls.get_dataset(
+            path_list, engine, kwargs.get("storage_options") or {}
+        )
         index_columns = (
             dataset.pandas_metadata.get("index_columns", [])
             if dataset.pandas_metadata
@@ -659,5 +683,4 @@ def _read(cls, path, engine, columns, **kwargs):
             for c in column_names
             if c not in index_columns and not cls.index_regex.match(c)
         ]
-
         return cls.build_query_compiler(dataset, columns, index_columns, **kwargs)
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
index 6e20e3b18bb..0035d655e6f 100644
--- a/modin/pandas/test/test_io.py
+++ b/modin/pandas/test/test_io.py
@@ -1356,6 +1356,20 @@ def test_read_parquet(
             columns=columns,
         )

+    def test_read_parquet_list_of_files_5698(self, engine, make_parquet_file):
+        with ensure_clean(".parquet") as f1, ensure_clean(
+            ".parquet"
+        ) as f2, ensure_clean(".parquet") as f3:
+            for f in [f1, f2, f3]:
+                make_parquet_file(filename=f)
+            eval_io(fn_name="read_parquet", path=[f1, f2, f3], engine=engine)
+
+    def test_empty_list(self, engine):
+        eval_io(fn_name="read_parquet", path=[], engine=engine)
+
+    def test_list_of_s3_file(self, engine):
+        raise NotImplementedError
+
     @pytest.mark.xfail(
         condition="config.getoption('--simulate-cloud').lower() != 'off'",
         reason="The reason of tests fail in `cloud` mode is unknown for now - issue #3264",
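
Usage sketch (not part of the diff above): a minimal, self-contained example of
the list-of-paths read this patch enables. It assumes the patch is applied and
that modin plus a parquet engine such as pyarrow are installed; the file names
and the small example frame are made up for illustration.

    import pandas
    import modin.pandas as pd

    # Write three small parquet files with the same schema so the example
    # does not depend on pre-existing data. The names are hypothetical.
    files = ["part-0.parquet", "part-1.parquet", "part-2.parquet"]
    for i, f in enumerate(files):
        pandas.DataFrame({"a": range(3), "part": i}).to_parquet(f)

    # With the patch applied, the whole list is handed to the parquet
    # dispatcher and read in one go, as the commit subject describes.
    df = pd.read_parquet(files, engine="pyarrow")
    print(df.shape)  # (9, 2)

This mirrors what test_read_parquet_list_of_files_5698 exercises via eval_io,
without the pandas comparison.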