From 19d791cea7abb8ccbcf3f0cd8037644f3176166f Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Tue, 24 Oct 2023 15:45:41 -0500
Subject: [PATCH] Avoid `pyarrow.fs` import for local storage (#14321)

This is not a resolution, but may help mitigate problems from
https://github.com/aws/aws-sdk-cpp/issues/2681

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)
  - Lawrence Mitchell (https://github.com/wence-)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14321
---
 python/cudf/cudf/io/orc.py        |  5 ++++-
 python/cudf/cudf/io/parquet.py    | 11 +++++++++--
 python/cudf/cudf/tests/test_s3.py | 14 ++++++++++++++
 python/cudf/cudf/utils/ioutils.py | 10 +++++++++-
 4 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index f51952d23bf..d135a31438e 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -5,7 +5,6 @@
 
 import pyarrow as pa
 from fsspec.utils import stringify_path
-from pyarrow import orc as orc
 
 import cudf
 from cudf._lib import orc as liborc
@@ -17,6 +16,8 @@
 
 
 def _make_empty_df(filepath_or_buffer, columns):
+    from pyarrow import orc
+
     orc_file = orc.ORCFile(filepath_or_buffer)
     schema = orc_file.schema
     col_names = schema.names if columns is None else columns
@@ -150,6 +151,7 @@ def _parse_column_statistics(cs, column_statistics_blob):
 @ioutils.doc_read_orc_metadata()
 def read_orc_metadata(path):
     """{docstring}"""
+    from pyarrow import orc
 
     orc_file = orc.ORCFile(path)
 
@@ -380,6 +382,7 @@ def read_orc(
             )
         )
     else:
+        from pyarrow import orc
 
         def read_orc_stripe(orc_file, stripe, columns):
             pa_table = orc_file.read_stripe(stripe, columns)
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index d84aff66d7b..1f346578d70 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -15,7 +15,7 @@
 
 import numpy as np
 import pandas as pd
-from pyarrow import dataset as ds, parquet as pq
+from pyarrow import dataset as ds
 
 import cudf
 from cudf._lib import parquet as libparquet
@@ -266,6 +266,7 @@ def write_to_dataset(
 @_cudf_nvtx_annotate
 def read_parquet_metadata(path):
     """{docstring}"""
+    import pyarrow.parquet as pq
 
     pq_file = pq.ParquetFile(path)
 
@@ -303,7 +304,9 @@ def _process_dataset(
 
     # Convert filters to ds.Expression
     if filters is not None:
-        filters = pq.filters_to_expression(filters)
+        from pyarrow.parquet import filters_to_expression
+
+        filters = filters_to_expression(filters)
 
     # Initialize ds.FilesystemDataset
     # TODO: Remove the if len(paths) workaround after following bug is fixed:
@@ -825,6 +828,8 @@ def _read_parquet(
             use_pandas_metadata=use_pandas_metadata,
         )
     else:
+        import pyarrow.parquet as pq
+
         return cudf.DataFrame.from_arrow(
             pq.ParquetDataset(filepaths_or_buffers).read_pandas(
                 columns=columns, *args, **kwargs
@@ -930,6 +935,8 @@ def to_parquet(
         )
 
     else:
+        import pyarrow.parquet as pq
+
         if partition_offsets is not None:
             warnings.warn(
                 "partition_offsets will be ignored when engine is not cudf"
diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py
index d54a2eabf22..d16cbd2377a 100644
--- a/python/cudf/cudf/tests/test_s3.py
+++ b/python/cudf/cudf/tests/test_s3.py
@@ -533,3 +533,17 @@ def test_write_chunked_parquet(s3_base, s3so):
         actual.sort_values(["b"]).reset_index(drop=True),
         cudf.concat([df1, df2]).sort_values(["b"]).reset_index(drop=True),
     )
+
+
+def test_no_s3fs_on_cudf_import():
+    import subprocess
+    import sys
+
+    output = subprocess.check_output(
+        [
+            sys.executable,
+            "-c",
+            "import cudf; import sys; print('pyarrow._s3fs' in sys.modules)",
+        ]
+    )
+    assert output.strip() == b"False"
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 91925bf3c0c..d2739b35049 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -13,7 +13,6 @@
 import pandas as pd
 from fsspec.core import get_fs_token_paths
 from pyarrow import PythonFile as ArrowPythonFile
-from pyarrow.fs import FSSpecHandler, PyFileSystem
 from pyarrow.lib import NativeFile
 
 from cudf.utils.docutils import docfmt_partial
@@ -1630,6 +1629,15 @@ def _open_remote_files(
             for path, rgs in zip(paths, row_groups)
         ]
 
+    # Avoid top-level pyarrow.fs import.
+    # Importing pyarrow.fs initializes a S3 SDK with a finalizer
+    # that runs atexit. In some circumstances it appears this
+    # runs a call into a logging system that is already shutdown.
+    # To avoid this, we only import this subsystem if it is
+    # really needed.
+    # See https://github.com/aws/aws-sdk-cpp/issues/2681
+    from pyarrow.fs import FSSpecHandler, PyFileSystem
+
     # Default open - Use pyarrow filesystem API
     pa_fs = PyFileSystem(FSSpecHandler(fs))
     return [
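
Note: the whole patch applies one pattern, namely deferring the `pyarrow.fs` (and `pyarrow.orc`/`pyarrow.parquet`) imports into the code paths that actually need them, so that `import cudf` alone never initializes the AWS S3 SDK or registers its atexit finalizer. A minimal standalone sketch of that pattern follows; the helper name `_open_with_pyarrow_fs` is hypothetical and not part of cudf's API.

# Sketch only: illustrates the deferred-import pattern this patch applies;
# the helper name is hypothetical (cudf's real code path is
# _open_remote_files in python/cudf/cudf/utils/ioutils.py).
def _open_with_pyarrow_fs(fs, path):
    # pyarrow.fs is imported lazily, so the S3 SDK is only initialized
    # the first time this helper runs, not when the enclosing package
    # is imported (see aws-sdk-cpp issue 2681).
    from pyarrow.fs import FSSpecHandler, PyFileSystem

    # Wrap the fsspec filesystem so pyarrow can read through it.
    pa_fs = PyFileSystem(FSSpecHandler(fs))
    return pa_fs.open_input_file(path)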