[REVIEW] Enable writing to s3 storage in chunked parquet writer #10769

Merged · 18 commits · May 10, 2022
Changes from 10 commits
7 changes: 7 additions & 0 deletions conda/environments/cudf_dev_cuda11.5.yml
@@ -67,6 +67,13 @@ dependencies:
- pydata-sphinx-theme
- librdkafka=1.7.0
- python-confluent-kafka=1.7.0
- moto>=3.1.6
- boto3>=1.21.21
- botocore>=1.24.21
- aiobotocore>=2.2.0
- s3fs>=2022.3.0
- flask
- flask_cors
- pip:
- git+https://github.com/python-streamz/streamz.git@master
- pyorc
4 changes: 4 additions & 0 deletions docs/cudf/source/api_docs/io.rst
@@ -36,6 +36,10 @@ Parquet
read_parquet
DataFrame.to_parquet
cudf.io.parquet.read_parquet_metadata
:template: autosummary/class_with_autosummary.rst

cudf.io.parquet.ParquetDatasetWriter


ORC
~~~
3 changes: 2 additions & 1 deletion python/cudf/cudf/io/__init__.py
@@ -1,4 +1,4 @@
# Copyright (c) 2018, NVIDIA CORPORATION.
# Copyright (c) 2018-2022, NVIDIA CORPORATION.
from cudf.io.avro import read_avro
from cudf.io.csv import read_csv, to_csv
from cudf.io.dlpack import from_dlpack
@@ -9,6 +9,7 @@
from cudf.io.parquet import (
merge_parquet_filemetadata,
read_parquet,
ParquetDatasetWriter,
read_parquet_metadata,
write_to_dataset,
)
151 changes: 87 additions & 64 deletions python/cudf/cudf/io/parquet.py
@@ -1,5 +1,7 @@
# Copyright (c) 2019-2022, NVIDIA CORPORATION.

import shutil
import tempfile
import warnings
from collections import defaultdict
from contextlib import ExitStack
@@ -206,12 +208,15 @@ def _process_dataset(
filters = pq._filters_to_expression(filters)

# Initialize ds.FilesystemDataset
# TODO: Remove the if len(paths) workaround after following bug is fixed:
# https://issues.apache.org/jira/browse/ARROW-16438
dataset = ds.dataset(
paths,
source=paths[0] if len(paths) == 1 else paths,
filesystem=fs,
format="parquet",
partitioning="hive",
)

file_list = dataset.files
if len(file_list) == 0:
raise FileNotFoundError(f"{paths} could not be resolved to any files")
@@ -716,6 +721,58 @@ def _get_partitioned(


class ParquetDatasetWriter:
"""
Write a parquet file or dataset incrementally

Parameters
----------
path : str or s3 URL
    A local directory path or s3 URL. Will be used as Root Directory
    path while writing a partitioned dataset.
partition_cols : list
Column names by which to partition the dataset
Columns are partitioned in the order they are given
index : bool, default None
If ``True``, include the dataframe’s index(es) in the file output.
If ``False``, they will not be written to the file. If ``None``,
index(es) other than RangeIndex will be saved as columns.
compression : {'snappy', None}, default 'snappy'
Name of the compression to use. Use ``None`` for no compression.
statistics : {'ROWGROUP', 'PAGE', 'NONE'}, default 'ROWGROUP'
Level at which column statistics should be included in file.
Comment on lines +860 to +863

Contributor: Why does compression accept None while statistics accepts 'NONE'? Is this a repeated pattern across cuDF? Can we adopt None instead of string 'NONE' values universally?

galipremsagar (Contributor Author), May 9, 2022: Yea, I'll open a separate PR to standardize this.



Examples
--------
Using a context

>>> df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]})
>>> df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]})
>>> with ParquetDatasetWriter("./dataset", partition_cols=["a"]) as cw:
... cw.write_table(df1)
... cw.write_table(df2)

By manually calling ``close()``

>>> cw = ParquetDatasetWriter("./dataset", partition_cols=["a"])
>>> cw.write_table(df1)
>>> cw.write_table(df2)
>>> cw.close()

Both the methods will generate the same directory structure

.. code-block:: bash

dataset/
a=1
<filename>.parquet
a=2
<filename>.parquet
a=3
<filename>.parquet

"""

@_cudf_nvtx_annotate
def __init__(
self,
@@ -724,59 +781,14 @@ def __init__(
index=None,
compression=None,
statistics="ROWGROUP",
**kwargs,
) -> None:
"""
Write a parquet file or dataset incrementally

Parameters
----------
path : str
File path or Root Directory path. Will be used as Root Directory
path while writing a partitioned dataset.
partition_cols : list
Column names by which to partition the dataset
Columns are partitioned in the order they are given
index : bool, default None
If ``True``, include the dataframe’s index(es) in the file output.
If ``False``, they will not be written to the file. If ``None``,
index(es) other than RangeIndex will be saved as columns.
compression : {'snappy', None}, default 'snappy'
Name of the compression to use. Use ``None`` for no compression.
statistics : {'ROWGROUP', 'PAGE', 'NONE'}, default 'ROWGROUP'
Level at which column statistics should be included in file.


Examples
________
Using a context

>>> df1 = cudf.DataFrame({"a": [1, 1, 2, 2, 1], "b": [9, 8, 7, 6, 5]})
>>> df2 = cudf.DataFrame({"a": [1, 3, 3, 1, 3], "b": [4, 3, 2, 1, 0]})
>>> with ParquetDatasetWriter("./dataset", partition_cols=["a"]) as cw:
... cw.write_table(df1)
... cw.write_table(df2)

By manually calling ``close()``

>>> cw = ParquetDatasetWriter("./dataset", partition_cols=["a"])
>>> cw.write_table(df1)
>>> cw.write_table(df2)
>>> cw.close()

Both the methods will generate the same directory structure

.. code-block:: bash

dataset/
a=1
<filename>.parquet
a=2
<filename>.parquet
a=3
<filename>.parquet

"""
self.path = path
if isinstance(path, str) and path.startswith("s3://"):
self.fs_meta = {"is_s3": True, "actual_path": path}
self.path = tempfile.TemporaryDirectory().name
else:
self.fs_meta = {}
self.path = path
self.common_args = {
"index": index,
"compression": compression,
@@ -792,6 +804,7 @@ def __init__(
# in self._chunked_writers for reverse lookup
self.path_cw_map: Dict[str, int] = {}
self.filename = None
self.kwargs = kwargs

@_cudf_nvtx_annotate
def write_table(self, df):
@@ -837,18 +850,19 @@ def write_table(self, df):
]
cw.write_table(grouped_df, this_cw_part_info)

# Create new cw for unhandled paths encountered in this write_table
new_paths, part_info, meta_paths = zip(*new_cw_paths)
self._chunked_writers.append(
(
ParquetWriter(new_paths, **self.common_args),
new_paths,
meta_paths,
if new_cw_paths:
# Create new cw for unhandled paths encountered in this write_table
new_paths, part_info, meta_paths = zip(*new_cw_paths)
self._chunked_writers.append(
(
ParquetWriter(new_paths, **self.common_args),
new_paths,
meta_paths,
)
Comment on lines -1054 to +1076

Member: Do you think it would be possible to wrap these ParquetWriter objects in a file-opening shim? This way, for non-local storage, we could open the necessary paths with fsspec before passing the open "file-like" objects down to ParquetWriter. The shim would need to keep track of the open files and close them after the underlying ParquetWriter object is closed.

If I understand correctly (which I may not), we should be able to write directly to s3 without the file-copy workaround.

Contributor Author: This was my initial go-to approach (same as df.to_parquet), but after implementing it I ran into issues. The way ParquetWriter is written right now, writing to non-local storage won't fully create and flush the data: even when we manually flush after writing a table, only a 0-byte file is created and no data is written. If, to form a correct parquet file on the same open file connection, I then write the parquet metadata footer during the close operation, the libraries end up overwriting the existing file in s3 storage, losing all the past data and keeping only the footer metadata. After repeated tries of different combinations of (write table, flush, write metadata, flush, close), none of which succeeded, I concluded that s3 (and other cloud file systems) are designed as object stores rather than true file systems that allow appending to existing files. At least for s3 this is the limitation, while ParquetWriter is basically built around the concept of appending.

Member: Okay - I still have a suspicion that we can make this work, but I do think the approach you are using here is reasonable - so I can experiment and follow up separately. My question here is definitely not a blocker :)

Contributor Author: Sure, will be happy to chat about the findings.

)
)
new_cw_idx = len(self._chunked_writers) - 1
self.path_cw_map.update({k: new_cw_idx for k in new_paths})
self._chunked_writers[-1][0].write_table(grouped_df, part_info)
new_cw_idx = len(self._chunked_writers) - 1
self.path_cw_map.update({k: new_cw_idx for k in new_paths})
self._chunked_writers[-1][0].write_table(grouped_df, part_info)
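
For reference on the review thread above, here is a rough sketch of the file-opening shim idea. It is hypothetical: the names (FSSpecWriterShim, writer_cls) are illustrative only, and per the author's findings this pattern does not produce valid parquet output on s3, because s3 objects cannot be appended to.

    import fsspec

    class FSSpecWriterShim:
        """Hypothetical shim: open remote paths with fsspec, pass the open
        file-like objects to a writer, and close them after the writer closes."""

        def __init__(self, paths, writer_cls, storage_options=None, **writer_kwargs):
            # Keep one writable handle per path so they can be closed only
            # after the wrapped writer has flushed its footer.
            self._files = [
                fsspec.open(path, mode="wb", **(storage_options or {})).open()
                for path in paths
            ]
            self._writer = writer_cls(self._files, **writer_kwargs)

        def write_table(self, df, partitions_info=None):
            self._writer.write_table(df, partitions_info)

        def close(self):
            metadata = self._writer.close()
            for f in self._files:
                f.close()
            return metadata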

@_cudf_nvtx_annotate
def close(self, return_metadata=False):
@@ -862,6 +876,15 @@
for cw, _, meta_path in self._chunked_writers
]

if self.fs_meta.get("is_s3", False):
local_path = self.path
s3_path = self.fs_meta["actual_path"]
s3_file, _ = ioutils._get_filesystem_and_paths(
s3_path, **self.kwargs
)
s3_file.put(local_path, s3_path, recursive=True)
shutil.rmtree(self.path)

if return_metadata:
return (
merge_parquet_filemetadata(metadata)
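
Condensed, the staging flow added by the __init__ and close() changes above looks roughly like the sketch below. It is a sketch only: the bucket name is hypothetical, fsspec is called directly where the real code goes through ioutils._get_filesystem_and_paths, and tempfile.mkdtemp stands in for the TemporaryDirectory used in the diff.

    import shutil
    import tempfile
    import fsspec

    s3_path = "s3://my-bucket/dataset"           # hypothetical destination URL
    local_path = tempfile.mkdtemp()              # local staging directory

    # ... the chunked writers write partition files under local_path
    # while write_table() is being called ...

    fs, _ = fsspec.core.url_to_fs(s3_path)       # resolve an s3fs filesystem
    fs.put(local_path, s3_path, recursive=True)  # upload the staged tree
    shutil.rmtree(local_path)                    # drop the local staging copy

From the caller's side this stays transparent: as test_write_chunked_parquet below exercises, ParquetDatasetWriter("s3://<bucket>/<dir>", partition_cols=["a"], storage_options=...) behaves the same as the local-directory case.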
88 changes: 72 additions & 16 deletions python/cudf/cudf/tests/test_s3.py
@@ -2,6 +2,7 @@

import os
import shlex
import socket
import subprocess
import time
from contextlib import contextmanager
@@ -18,10 +19,20 @@
import cudf
from cudf.testing._utils import assert_eq

moto = pytest.importorskip("moto", minversion="1.3.14")
moto = pytest.importorskip("moto", minversion="3.1.6")
boto3 = pytest.importorskip("boto3")
requests = pytest.importorskip("requests")
s3fs = pytest.importorskip("s3fs")
flask = pytest.importorskip("flask")
flask_cors = pytest.importorskip("flask_cors")


@pytest.fixture(scope="session")
def endpoint_port():
# Return a free port per worker session.
sock = socket.socket()
sock.bind(("", 0))
return sock.getsockname()[1]
Contributor:

1. Does this resource get closed cleanly?

   https://docs.python.org/3/library/socket.html#socket.socket.close

   "Sockets are automatically closed when they are garbage-collected, but it is recommended to close() them explicitly, or to use a with statement around them."

2. We are binding the socket to listen on "", which maps to listening on all addresses (including public addresses, which is a bad safety practice). As far as I can tell, we only access it from 127.0.0.1. The socket should be bound to 127.0.0.1 instead.

Suggested change:

    @pytest.fixture(scope="session")
    def endpoint_port():
        # Return a free port per worker session.
        with socket.socket() as sock:
            sock.bind(("127.0.0.1", 0))
            yield sock.getsockname()[1]

Contributor Author: I did make the above change locally but it seems to error consistently, did it work for you?

    During handling of the above exception, another exception occurred:

    s3_base = 'http://127.0.0.1:47913/', s3so = {'client_kwargs': {'endpoint_url': 'http://127.0.0.1:47913/'}}
    pdf =    Integer  Float  Integer2 String  Boolean
    0     2345  9.001      2345  Alpha     True
    1    11987  8.343       106   Beta    False
    2     9027  6.000      2088  Gamma     True
    3     9027  2.781    789277  Delta    False
    bytes_per_thread = 32

        @pytest.mark.parametrize("bytes_per_thread", [32, 1024])
        def test_read_csv(s3_base, s3so, pdf, bytes_per_thread):
            # Write to buffer
            fname = "test_csv_reader.csv"
            bname = "csv"
            buffer = pdf.to_csv(index=False)

            # Use fsspec file object
    >       with s3_context(s3_base=s3_base, bucket=bname, files={fname: buffer}):

    python/cudf/cudf/tests/test_s3.py:150:
    _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
    ../envs/cudfdev/lib/python3.8/contextlib.py:113: in __enter__
        return next(self.gen)
    python/cudf/cudf/tests/test_s3.py:105: in s3_context
        client.create_bucket(Bucket=bucket, ACL="public-read-write")
    ../envs/cudfdev/lib/python3.8/site-packages/botocore/client.py:395: in _api_call
        return self._make_api_call(operation_name, kwargs)
    ...
    ../envs/cudfdev/lib/python3.8/site-packages/botocore/endpoint.py:321: in _send
        return self.http_session.send(request)
    ...
    E           botocore.exceptions.EndpointConnectionError: Could not connect to the endpoint URL: "http://127.0.0.1:47913/csv"

    ../envs/cudfdev/lib/python3.8/site-packages/botocore/httpsession.py:434: EndpointConnectionError

Contributor Author: FWIW, I've seen it close connections to the ports very consistently with the existing approach too.

Contributor: I didn't test this before suggesting, but I'm not sure what's going wrong. I wonder if the context manager is yielding the port and immediately exiting the context / closing the resource? Might need to refresh myself on how pytest fixtures interact with context managers, or add some print statements to check what's happening. Alternatively, you can try this instead:

    @pytest.fixture(scope="session")
    def endpoint_port():
        # Return a free port per worker session.
        sock = socket.socket()
        sock.bind(("127.0.0.1", 0))
        yield sock.getsockname()[1]
        sock.close()

Contributor Author:

    @pytest.fixture(scope="session")
    def endpoint_port():
        # Return a free port per worker session.
        sock = socket.socket()
        sock.bind(("127.0.0.1", 0))
        yield sock.getsockname()[1]
        sock.close()

⬆️ This too gives the same error, but this works:

    @pytest.fixture(scope="session")
    def endpoint_port():
        # Return a free port per worker session.
        sock = socket.socket()
        sock.bind(("127.0.0.1", 0))
        port = sock.getsockname()[1]
        sock.close()
        return port

I've pushed this change, let me know if that looks good to you.



@contextmanager
@@ -40,7 +51,7 @@ def ensure_safe_environment_variables():


@pytest.fixture(scope="session")
def s3_base(worker_id):
def s3_base(endpoint_port):
"""
Fixture to set up moto server in separate process
"""
@@ -49,15 +60,11 @@
# system aws credentials, https://github.com/spulec/moto/issues/1793
os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key")
os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret")
os.environ.setdefault("S3FS_LOGGING_LEVEL", "DEBUG")

# Launching moto in server mode, i.e., as a separate process
# with an S3 endpoint on localhost

endpoint_port = (
5000
if worker_id == "master"
else 5550 + int(worker_id.lstrip("gw"))
)
endpoint_uri = f"http://127.0.0.1:{endpoint_port}/"

proc = subprocess.Popen(
bdice (Contributor), May 10, 2022: It looks like we're using moto in Stand-alone server mode. Is it possible to use the Python API for this server instead of calling subprocess? http://docs.getmoto.org/en/latest/docs/server_mode.html#start-within-python

I think it'd look like this:

    from moto.server import ThreadedMotoServer
    server = ThreadedMotoServer(ip_address="127.0.0.1", port=endpoint_port)
    server.start()
    yield endpoint_uri
    # run tests
    server.stop()
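
A slightly fuller sketch of that suggestion, assuming the ThreadedMotoServer API linked above and reusing this file's endpoint_port fixture and ensure_safe_environment_variables helper; this is a hypothetical alternative, not what the PR merges.

    @pytest.fixture(scope="session")
    def s3_base(endpoint_port):
        # Hypothetical: run moto in-process instead of via subprocess.
        from moto.server import ThreadedMotoServer

        with ensure_safe_environment_variables():
            os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key")
            os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret")

            server = ThreadedMotoServer(ip_address="127.0.0.1", port=endpoint_port)
            server.start()
            yield f"http://127.0.0.1:{endpoint_port}/"
            server.stop()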

@@ -82,13 +89,10 @@


@pytest.fixture()
def s3so(worker_id):
def s3so(endpoint_port):
"""
Returns s3 storage options to pass to fsspec
"""
endpoint_port = (
5000 if worker_id == "master" else 5550 + int(worker_id.lstrip("gw"))
)
endpoint_uri = f"http://127.0.0.1:{endpoint_port}/"
Contributor: We're hard-coding the endpoint URI in a few locations in this file. Can we re-use the host/port/endpoint values instead? (Possibly with a fixture.)

return {"client_kwargs": {"endpoint_url": endpoint_uri}}
@@ -360,20 +364,33 @@ def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache):

@pytest.mark.parametrize("partition_cols", [None, ["String"]])
def test_write_parquet(s3_base, s3so, pdf, partition_cols):
fname = "test_parquet_writer.parquet"
fname_cudf = "test_parquet_writer_cudf"
fname_pandas = "test_parquet_writer_pandas"
bname = "parquet"
gdf = cudf.from_pandas(pdf)

with s3_context(s3_base=s3_base, bucket=bname) as s3fs:
gdf.to_parquet(
f"s3://{bname}/{fname}",
f"s3://{bname}/{fname_cudf}",
partition_cols=partition_cols,
storage_options=s3so,
)
assert s3fs.exists(f"s3://{bname}/{fname}")
assert s3fs.exists(f"s3://{bname}/{fname_cudf}")
pdf.to_parquet(
f"s3://{bname}/{fname_pandas}",
partition_cols=partition_cols,
storage_options=s3so,
)
assert s3fs.exists(f"s3://{bname}/{fname_pandas}")

got = pd.read_parquet(s3fs.open(f"s3://{bname}/{fname}"))
got = pd.read_parquet(
f"s3://{bname}/{fname_pandas}", storage_options=s3so
)
expect = cudf.read_parquet(
f"s3://{bname}/{fname_cudf}", storage_options=s3so
)

assert_eq(pdf, got)
assert_eq(expect, got)


def test_read_json(s3_base, s3so):
@@ -457,3 +474,42 @@ def test_write_orc(s3_base, s3so, pdf):
got = pa.orc.ORCFile(f).read().to_pandas()

assert_eq(pdf, got)


def test_write_chunked_parquet(s3_base, s3so):
df1 = cudf.DataFrame({"b": [10, 11, 12], "a": [1, 2, 3]})
df2 = cudf.DataFrame({"b": [20, 30, 50], "a": [3, 2, 1]})
dirname = "chunked_writer_directory"
bname = "parquet"
from cudf.io.parquet import ParquetDatasetWriter

with s3_context(
s3_base=s3_base, bucket=bname, files={dirname: BytesIO()}
) as s3fs:
cw = ParquetDatasetWriter(
f"s3://{bname}/{dirname}",
partition_cols=["a"],
storage_options=s3so,
)
cw.write_table(df1)
cw.write_table(df2)
cw.close()

# TODO: Replace following workaround with:
# expect = cudf.read_parquet(f"s3://{bname}/{dirname}/",
# storage_options=s3so)
# after the following bug is fixed:
# https://issues.apache.org/jira/browse/ARROW-16438

dfs = []
for folder in {"a=1", "a=2", "a=3"}:
assert s3fs.exists(f"s3://{bname}/{dirname}/{folder}")
for file in s3fs.ls(f"s3://{bname}/{dirname}/{folder}"):
df = cudf.read_parquet("s3://" + file, storage_options=s3so)
dfs.append(df)

actual = cudf.concat(dfs).astype("int64")
assert_eq(
actual.sort_values(["b"]).reset_index(drop=True),
cudf.concat([df1, df2]).sort_values(["b"]).reset_index(drop=True),
)