From 262715390f739075f0bdac01ff8c92206a1c2fb5 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 14 Dec 2021 13:58:28 -0600 Subject: [PATCH 01/13] Change default `dtype` of all nulls column from `float` to `object` (#9803) Fixes: #9337 - [x] This PR changes the default `dtype` of `all-nulls` column to `object` dtype from `float64` dtype. - [x] To make `np.nan` values read as `float` column `nan_as_null` has to be passed as `False` in `cudf.DataFrame` constructor - This change is in-line with what is already supported by `cudf.Series` constructor. - [x] Added `has_nans` & `nan_count` property which is needed for some of the checks. - [x] Cached the `nan_count` since it is repeatedly used in math operations and clearing the cache in the regular `_clear_cache` call. - [x] Fixes pytests that are going to break due to this breaking change of types. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - https://github.com/brandon-b-miller - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9803 --- python/cudf/cudf/_lib/column.pyi | 3 +- python/cudf/cudf/_lib/column.pyx | 3 +- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/column/column.py | 34 ++++++---- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/numerical.py | 64 +++++++++++++++++-- .../cudf/cudf/core/column/numerical_base.py | 9 ++- python/cudf/cudf/core/column/string.py | 15 +---- python/cudf/cudf/core/dataframe.py | 44 +++++++++---- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/core/tools/datetimes.py | 2 +- python/cudf/cudf/core/tools/numeric.py | 2 +- python/cudf/cudf/core/window/rolling.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 54 +++++++++++++--- python/cudf/cudf/tests/test_interpolate.py | 8 ++- python/cudf/cudf/tests/test_list.py | 15 +++-- python/cudf/cudf/tests/test_onehot.py | 12 +++- python/cudf/cudf/tests/test_repr.py | 6 +- python/cudf/cudf/tests/test_series.py | 2 +- python/cudf/cudf/tests/test_stats.py | 5 +- 23 files changed, 210 insertions(+), 82 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index dafaa8f4d1d..235cb4fd973 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -70,8 +70,7 @@ class Column: def nullable(self) -> bool: ... - @property - def has_nulls(self) -> bool: + def has_nulls(self, include_nan: bool=False) -> bool: ... @property diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index ff3f3050e63..5e0ee3136b7 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -146,8 +146,7 @@ cdef class Column: def nullable(self): return self.base_mask is not None - @property - def has_nulls(self): + def has_nulls(self, include_nan=False): return self.null_count != 0 @property diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 2fcc976d8e1..ac5e152d011 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -147,7 +147,7 @@ def _clean_nulls_from_index(self): methods using this method to replace or handle representation of the actual types correctly. 
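A minimal sketch of the behavior change described above, assuming the cudf Python API as of this PR (the asserts reflect the intent stated in the description, not a verified run):

    import cudf
    import numpy as np

    # An all-null column now defaults to `object` dtype instead of `float64`.
    df = cudf.DataFrame({"a": [None, None]})
    assert df["a"].dtype == np.dtype("object")

    # Passing nan_as_null=False keeps np.nan values, so the column stays float.
    df_nan = cudf.DataFrame({"a": [np.nan, np.nan]}, nan_as_null=False)
    assert df_nan["a"].dtype == np.dtype("float64")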
""" - if self._values.has_nulls: + if self._values.has_nulls(): return cudf.Index( self._values.astype("str").fillna(cudf._NA_REP), name=self.name ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1d113f6e159..a98052ce906 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -139,7 +139,7 @@ def values_host(self) -> "np.ndarray": if len(self) == 0: return np.array([], dtype=self.dtype) - if self.has_nulls: + if self.has_nulls(): raise ValueError("Column must have no nulls.") return self.data_array_view.copy_to_host() @@ -152,7 +152,7 @@ def values(self) -> "cupy.ndarray": if len(self) == 0: return cupy.array([], dtype=self.dtype) - if self.has_nulls: + if self.has_nulls(): raise ValueError("Column must have no nulls.") return cupy.asarray(self.data_array_view) @@ -193,7 +193,7 @@ def all(self, skipna: bool = True) -> bool: def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. result_col = self.nans_to_nulls() if skipna else self - if not skipna and result_col.has_nulls: + if not skipna and result_col.has_nulls(): return True elif skipna and result_col.null_count == result_col.size: return False @@ -786,7 +786,7 @@ def as_mask(self) -> Buffer: Buffer """ - if self.has_nulls: + if self.has_nulls(): raise ValueError("Column must have no nulls.") return bools_to_mask(self) @@ -797,13 +797,13 @@ def is_unique(self) -> bool: @property def is_monotonic_increasing(self) -> bool: - return not self.has_nulls and self.as_frame()._is_sorted( + return not self.has_nulls() and self.as_frame()._is_sorted( ascending=None, null_position=None ) @property def is_monotonic_decreasing(self) -> bool: - return not self.has_nulls and self.as_frame()._is_sorted( + return not self.has_nulls() and self.as_frame()._is_sorted( ascending=[False], null_position=None ) @@ -942,7 +942,7 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase: ) # columns include null index in factorization; remove: - if self.has_nulls: + if self.has_nulls(): cats = cats._column.dropna(drop_nan=False) min_type = min_unsigned_type(len(cats), 8) labels = labels - 1 @@ -1216,10 +1216,10 @@ def _process_for_reduction( if skipna: result_col = self.nans_to_nulls() - if result_col.has_nulls: + if result_col.has_nulls(): result_col = result_col.dropna() else: - if self.has_nulls: + if self.has_nulls(): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) result_col = self @@ -1766,12 +1766,20 @@ def as_column( "https://issues.apache.org/jira/browse/ARROW-3802" ) col = ColumnBase.from_arrow(arbitrary) + if isinstance(arbitrary, pa.NullArray): - if type(dtype) == str and dtype == "empty": - new_dtype = cudf.dtype(arbitrary.type.to_pandas_dtype()) + new_dtype = cudf.dtype(arbitrary.type.to_pandas_dtype()) + if dtype is not None: + # Cast the column to the `dtype` if specified. + col = col.astype(dtype) + elif len(arbitrary) == 0: + # If the column is empty, it has to be + # a `float64` dtype. + col = col.astype("float64") else: - new_dtype = cudf.dtype(dtype) - col = col.astype(new_dtype) + # If the null column is not empty, it has to + # be of `object` dtype. 
+ col = col.astype(new_dtype) return col diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 08d72f1c6ee..24ec25acbbb 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -292,7 +292,7 @@ def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]: "version": 1, } - if self.nullable and self.has_nulls: + if self.nullable and self.has_nulls(): # Create a simple Python object that exposes the # `__cuda_array_interface__` attribute here since we need to modify diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index becb303feeb..c947440edb1 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,7 +3,16 @@ from __future__ import annotations from types import SimpleNamespace -from typing import Any, Callable, Mapping, Sequence, Tuple, Union, cast +from typing import ( + Any, + Callable, + Mapping, + Optional, + Sequence, + Tuple, + Union, + cast, +) import cupy import numpy as np @@ -47,6 +56,8 @@ class NumericalColumn(NumericalBaseColumn): mask : Buffer, optional """ + _nan_count: Optional[int] + def __init__( self, data: Buffer, @@ -62,7 +73,7 @@ def __init__( raise ValueError("Buffer size must be divisible by element size") if size is None: size = (data.size // dtype.itemsize) - offset - + self._nan_count = None super().__init__( data, size=size, @@ -72,6 +83,10 @@ def __init__( null_count=null_count, ) + def _clear_cache(self): + super()._clear_cache() + self._nan_count = None + def __contains__(self, item: ScalarLike) -> bool: """ Returns True if column contains item, else False. @@ -90,6 +105,11 @@ def __contains__(self, item: ScalarLike) -> bool: self, column.as_column([item], dtype=self.dtype) ).any() + def has_nulls(self, include_nan=False): + return self.null_count != 0 or ( + self.nan_count != 0 if include_nan else False + ) + @property def __cuda_array_interface__(self) -> Mapping[str, Any]: output = { @@ -100,7 +120,7 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: "version": 1, } - if self.nullable and self.has_nulls: + if self.nullable and self.has_nulls(): # Create a simple Python object that exposes the # `__cuda_array_interface__` attribute here since we need to modify @@ -280,6 +300,15 @@ def as_numerical_column(self, dtype: Dtype, **kwargs) -> NumericalColumn: return self return libcudf.unary.cast(self, dtype) + @property + def nan_count(self) -> int: + if self.dtype.kind != "f": + self._nan_count = 0 + elif self._nan_count is None: + nan_col = libcudf.unary.is_nan(self) + self._nan_count = nan_col.sum() + return self._nan_count + def _process_values_for_isin( self, values: Sequence ) -> Tuple[ColumnBase, ColumnBase]: @@ -296,6 +325,20 @@ def _process_values_for_isin( return lhs, rhs + def _can_return_nan(self, skipna: bool = None) -> bool: + return not skipna and self.has_nulls(include_nan=True) + + def _process_for_reduction( + self, skipna: bool = None, min_count: int = 0 + ) -> Union[ColumnBase, ScalarLike]: + skipna = True if skipna is None else skipna + + if self._can_return_nan(skipna=skipna): + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + return super()._process_for_reduction( + skipna=skipna, min_count=min_count + ) + def _default_na_value(self) -> ScalarLike: """Returns the default NA value for this column""" dkind = self.dtype.kind @@ -319,8 +362,19 @@ def find_and_replace( """ Return col with *to_replace* replaced with *value*. 
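A minimal sketch of the new NaN-aware helpers and reduction behavior added in this file, assuming the cudf Python API as of this PR (`Series._column` is internal API):

    import cudf
    import numpy as np

    s = cudf.Series([1.0, 2.0, np.nan], nan_as_null=False)

    # New column-level helpers introduced by this PR (internal API).
    assert s._column.nan_count == 1
    assert s._column.has_nulls(include_nan=True)
    assert not s._column.has_nulls()  # no actual nulls, only a NaN

    # Reductions treat NaNs like nulls: skipped by default, propagated otherwise.
    assert s.sum() == 3.0
    assert np.isnan(s.sum(skipna=False))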
""" + + # If all of `to_replace`/`replacement` are `None`, + # dtype of `to_replace_col`/`replacement_col` + # is inferred as `string`, but this is a valid + # float64 column too, Hence we will need to type-cast + # to self.dtype. to_replace_col = column.as_column(to_replace) + if to_replace_col.null_count == len(to_replace_col): + to_replace_col = to_replace_col.astype(self.dtype) + replacement_col = column.as_column(replacement) + if replacement_col.null_count == len(replacement_col): + replacement_col = replacement_col.astype(self.dtype) if type(to_replace_col) != type(replacement_col): raise TypeError( @@ -578,7 +632,7 @@ def to_pandas( arrow_array = self.to_arrow() pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array) pd_series = pd.Series(pandas_array, copy=False) - elif str(self.dtype) in NUMERIC_TYPES and not self.has_nulls: + elif str(self.dtype) in NUMERIC_TYPES and not self.has_nulls(): pd_series = pd.Series(cupy.asnumpy(self.values), copy=False) else: pd_series = self.to_arrow().to_pandas(**kwargs) @@ -597,6 +651,8 @@ def _normalize_find_and_replace_input( ) col_to_normalize_dtype = normalized_column.dtype if isinstance(col_to_normalize, list): + if normalized_column.null_count == len(normalized_column): + normalized_column = normalized_column.astype(input_column_dtype) col_to_normalize_dtype = min_column_type( normalized_column, input_column_dtype ) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 853fb360c50..1f84cb88e37 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -77,10 +77,13 @@ def sum_of_squares( "sum_of_squares", skipna=skipna, dtype=dtype, min_count=min_count ) + def _can_return_nan(self, skipna: bool = None) -> bool: + return not skipna and self.has_nulls() + def kurtosis(self, skipna: bool = None) -> float: skipna = True if skipna is None else skipna - if len(self) == 0 or (not skipna and self.has_nulls): + if len(self) == 0 or self._can_return_nan(skipna=skipna): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) self = self.nans_to_nulls().dropna() # type: ignore @@ -105,7 +108,7 @@ def kurtosis(self, skipna: bool = None) -> float: def skew(self, skipna: bool = None) -> ScalarLike: skipna = True if skipna is None else skipna - if len(self) == 0 or (not skipna and self.has_nulls): + if len(self) == 0 or self._can_return_nan(skipna=skipna): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) self = self.nans_to_nulls().dropna() # type: ignore @@ -148,7 +151,7 @@ def quantile( def median(self, skipna: bool = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna - if not skipna and self.has_nulls: + if self._can_return_nan(skipna=skipna): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) # enforce linear in case the default ever changes diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 2a91abc5701..1c9a013810a 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5018,7 +5018,7 @@ def edit_distance_matrix(self) -> ParentType: raise ValueError( "Require size >= 2 to compute edit distance matrix." ) - if self._column.has_nulls: + if self._column.has_nulls(): raise ValueError( "Cannot compute edit distance between null strings. " "Consider removing them using `dropna` or fill with `fillna`." 
@@ -5440,20 +5440,7 @@ def find_and_replace( """ to_replace_col = column.as_column(to_replace) - if to_replace_col.null_count == len(to_replace_col): - # If all of `to_replace` are `None`, dtype of `to_replace_col` - # is inferred as `float64`, but this is a valid - # string column too, Hence we will need to type-cast - # to self.dtype. - to_replace_col = to_replace_col.astype(self.dtype) - replacement_col = column.as_column(replacement) - if replacement_col.null_count == len(replacement_col): - # If all of `replacement` are `None`, dtype of `replacement_col` - # is inferred as `float64`, but this is a valid - # string column too, Hence we will need to type-cast - # to self.dtype. - replacement_col = replacement_col.astype(self.dtype) if type(to_replace_col) != type(replacement_col): raise TypeError( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bbe691595e7..88c8aaebd9e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -440,6 +440,11 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): Data type to force. Only a single dtype is allowed. If None, infer. + nan_as_null : bool, Default True + If ``None``/``True``, converts ``np.nan`` values to + ``null`` values. + If ``False``, leaves ``np.nan`` values as is. + Examples -------- @@ -514,7 +519,9 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): _iloc_indexer_type = _DataFrameIlocIndexer @annotate("DATAFRAME_INIT", color="blue", domain="cudf_python") - def __init__(self, data=None, index=None, columns=None, dtype=None): + def __init__( + self, data=None, index=None, columns=None, dtype=None, nan_as_null=True + ): super().__init__() @@ -523,7 +530,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): if isinstance(data, (DataFrame, pd.DataFrame)): if isinstance(data, pd.DataFrame): - data = self.from_pandas(data) + data = self.from_pandas(data, nan_as_null=nan_as_null) if index is not None: if not data.index.equals(index): @@ -546,11 +553,14 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): self.columns = data.columns elif isinstance(data, (cudf.Series, pd.Series)): if isinstance(data, pd.Series): - data = cudf.Series.from_pandas(data) + data = cudf.Series.from_pandas(data, nan_as_null=nan_as_null) name = data.name or 0 self._init_from_dict_like( - {name: data}, index=index, columns=columns + {name: data}, + index=index, + columns=columns, + nan_as_null=nan_as_null, ) elif data is None: if index is None: @@ -620,7 +630,9 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): if not is_dict_like(data): raise TypeError("data must be list or dict-like") - self._init_from_dict_like(data, index=index, columns=columns) + self._init_from_dict_like( + data, index=index, columns=columns, nan_as_null=nan_as_null + ) if dtype: self._data = self.astype(dtype)._data @@ -759,7 +771,9 @@ def _init_from_list_like(self, data, index=None, columns=None): self.columns = columns - def _init_from_dict_like(self, data, index=None, columns=None): + def _init_from_dict_like( + self, data, index=None, columns=None, nan_as_null=None + ): if columns is not None: # remove all entries in `data` that are # not in `columns` @@ -794,7 +808,9 @@ def _init_from_dict_like(self, data, index=None, columns=None): if is_scalar(data[col_name]): num_rows = num_rows or 1 else: - data[col_name] = column.as_column(data[col_name]) + data[col_name] = column.as_column( + data[col_name], nan_as_null=nan_as_null + ) 
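    # A minimal sketch of how the new `nan_as_null` constructor argument is
    # expected to behave when building a DataFrame from pandas data, assuming
    # the cudf Python API as of this PR:
    import cudf
    import numpy as np
    import pandas as pd

    pdf = pd.DataFrame({"a": [1.0, np.nan]})
    # Default (nan_as_null=True): the NaN is converted to a null.
    assert cudf.DataFrame(pdf)["a"].null_count == 1
    # nan_as_null=False: the NaN is kept as a floating-point NaN, not a null.
    assert cudf.DataFrame(pdf, nan_as_null=False)["a"].null_count == 0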
num_rows = len(data[col_name]) self._index = RangeIndex(0, num_rows) else: @@ -806,7 +822,9 @@ def _init_from_dict_like(self, data, index=None, columns=None): self._data.multiindex = self._data.multiindex and isinstance( col_name, tuple ) - self.insert(i, col_name, data[col_name]) + self.insert( + i, col_name, data[col_name], nan_as_null=nan_as_null + ) if columns is not None: self.columns = columns @@ -1747,7 +1765,7 @@ def _clean_nulls_from_dataframe(self, df): if is_list_dtype(df._data[col]) or is_struct_dtype(df._data[col]): # TODO we need to handle this pass - elif df._data[col].has_nulls: + elif df._data[col].has_nulls(): df[col] = df._data[col].astype("str").fillna(cudf._NA_REP) else: df[col] = df._data[col] @@ -2582,7 +2600,7 @@ def take(self, indices, axis=0, keep_index=None): return out @annotate("INSERT", color="green", domain="cudf_python") - def insert(self, loc, name, value): + def insert(self, loc, name, value, nan_as_null=None): """Add a column to DataFrame at the index specified by loc. Parameters @@ -2625,11 +2643,11 @@ def insert(self, loc, name, value): ) self._data = new_data elif isinstance(value, (pd.Series, Series)): - value = Series(value)._align_to_index( + value = Series(value, nan_as_null=nan_as_null)._align_to_index( self._index, how="right", sort=False ) - value = column.as_column(value) + value = column.as_column(value, nan_as_null=nan_as_null) self._data.insert(name, value, loc=loc) @@ -3081,7 +3099,7 @@ def as_gpu_matrix(self, columns=None, order="F"): dtype = find_common_type([col.dtype for col in cols]) for k, c in self._data.items(): - if c.has_nulls: + if c.has_nulls(): raise ValueError( f"column '{k}' has null values. " f"hint: use .fillna() to replace null values" diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 61ce64e7d6b..c85ed0c8555 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -4755,7 +4755,7 @@ def _scan(self, op, axis=None, skipna=True, cast_to_int=False): result_col = self._data[name].nans_to_nulls() else: result_col = self._data[name].copy() - if result_col.has_nulls: + if result_col.has_nulls(include_nan=True): # Workaround as find_first_value doesn't seem to work # incase of bools. first_index = int( diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0002aaf38c5..29e0d17bc39 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2515,7 +2515,7 @@ def _clean_nulls_from_index(self): Convert all na values(if any) in Index object to `` as a preprocessing step to `__repr__` methods. 
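A minimal sketch of the corresponding Index behavior, mirroring the updated repr tests later in this patch and assuming the cudf Python API as of this PR:

    import cudf

    # An all-None Index now defaults to object dtype (a StringIndex).
    idx = cudf.Index([None, None], name="x")
    assert idx.dtype == "object"

    # The previous float behavior is still available by requesting it explicitly.
    fidx = cudf.Index([None, None], dtype="float64", name="x")
    assert fidx.dtype == "float64"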
""" - if self._values.has_nulls: + if self._values.has_nulls(): return self.fillna(cudf._NA_REP) else: return self diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 65c79b4cf59..c403c697e3d 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -386,7 +386,7 @@ def __repr__(self): else: preprocess = self.copy(deep=False) - if any(col.has_nulls for col in preprocess._data.columns): + if any(col.has_nulls() for col in preprocess._data.columns): preprocess_df = preprocess.to_frame(index=False) for name, col in preprocess._data.items(): if isinstance( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index bbeae1adc5e..036c8c1ee00 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1474,7 +1474,7 @@ def has_nulls(self): >>> series.dropna().has_nulls False """ - return self._column.has_nulls + return self._column.has_nulls() def dropna(self, axis=0, inplace=False, how=None): """ diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 34d62ffc048..3efbd982b53 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -330,7 +330,7 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): col = col.as_datetime_column(dtype=_unit_dtype_map[unit]) elif col.dtype.kind in ("O"): - if unit not in (None, "ns"): + if unit not in (None, "ns") or col.null_count == len(col): try: col = col.astype(dtype="int64") except ValueError: diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 7c688b92009..bd1b505c57f 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -165,7 +165,7 @@ def to_numeric(arg, errors="raise", downcast=None): if isinstance(arg, (cudf.Series, pd.Series)): return cudf.Series(col) else: - if col.has_nulls: + if col.has_nulls(): # To match pandas, always return a floating type filled with nan. 
col = col.astype(float).fillna(np.nan) return col.values diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 617dbdeaea5..0f4256e49a6 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -326,7 +326,7 @@ def apply(self, func, *args, **kwargs): """ has_nulls = False if isinstance(self.obj, cudf.Series): - if self.obj._column.has_nulls: + if self.obj._column.has_nulls(): has_nulls = True else: for col in self.obj._data: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index c40f9f0b0a5..ab0856fad1e 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -246,7 +246,7 @@ def test_series_init_none(): sr1 = cudf.Series() got = sr1.to_string() - expect = "Series([], dtype: float64)" + expect = sr1.to_pandas().__repr__() # values should match despite whitespace difference assert got.split() == expect.split() @@ -254,7 +254,7 @@ def test_series_init_none(): sr2 = cudf.Series(None) got = sr2.to_string() - expect = "Series([], dtype: float64)" + expect = sr2.to_pandas().__repr__() # values should match despite whitespace difference assert got.split() == expect.split() @@ -1308,7 +1308,7 @@ def test_concat_empty_dataframe(df_1, df_2): [ {"a": [1, 2], "b": [1, 2], "c": ["s1", "s2"], "d": [1.0, 2.0]}, {"b": [1.9, 10.9], "c": ["s1", "s2"]}, - {"c": ["s1"], "b": [None], "a": [False]}, + {"c": ["s1"], "b": pd.Series([None], dtype="float"), "a": [False]}, ], ) @pytest.mark.parametrize( @@ -2008,8 +2008,8 @@ def test_dataframe_count_reduction(data, func): {"x": [np.nan, 2, 3, 4, 100, np.nan], "y": [4, 5, 6, 88, 99, np.nan]}, {"x": [1, 2, 3], "y": [4, 5, 6]}, {"x": [np.nan, np.nan, np.nan], "y": [np.nan, np.nan, np.nan]}, - {"x": [], "y": []}, - {"x": []}, + {"x": pd.Series([], dtype="float"), "y": pd.Series([], dtype="float")}, + {"x": pd.Series([], dtype="int")}, ], ) @pytest.mark.parametrize("ops", ["sum", "product", "prod"]) @@ -2017,7 +2017,7 @@ def test_dataframe_count_reduction(data, func): @pytest.mark.parametrize("min_count", [-10, -1, 0, 1, 2, 3, 10]) def test_dataframe_min_count_ops(data, ops, skipna, min_count): psr = pd.DataFrame(data) - gsr = cudf.DataFrame(data) + gsr = cudf.from_pandas(psr) assert_eq( getattr(psr, ops)(skipna=skipna, min_count=min_count), @@ -2498,7 +2498,7 @@ def test_series_all_null(num_elements, null_type): # Typecast Pandas because None will return `object` dtype expect = pd.Series(data, dtype="float64") - got = cudf.Series(data) + got = cudf.Series(data, dtype="float64") assert_eq(expect, got) @@ -8480,10 +8480,10 @@ def test_agg_for_dataframe_with_string_columns(aggs): ) def test_update_for_dataframes(data, data2, join, overwrite, errors): pdf = pd.DataFrame(data) - gdf = cudf.DataFrame(data) + gdf = cudf.DataFrame(data, nan_as_null=False) other_pd = pd.DataFrame(data2) - other_gd = cudf.DataFrame(data2) + other_gd = cudf.DataFrame(data2, nan_as_null=False) pdf.update(other=other_pd, join=join, overwrite=overwrite, errors=errors) gdf.update(other=other_gd, join=join, overwrite=overwrite, errors=errors) @@ -8949,7 +8949,9 @@ def test_frame_series_where_other(data): ( { "id": ["a", "a", "b", "b", "c", "c"], - "val": [None, None, None, None, None, None], + "val": cudf.Series( + [None, None, None, None, None, None], dtype="float64" + ), }, ["id"], ), @@ -9041,6 +9043,38 @@ def test_pearson_corr_multiindex_dataframe(): assert_eq(expected, actual) +@pytest.mark.parametrize( + "data", + 
[ + {"a": [np.nan, 1, 2], "b": [None, None, None]}, + {"a": [1, 2, np.nan, 2], "b": [np.nan, np.nan, np.nan, np.nan]}, + { + "a": [1, 2, np.nan, 2, None], + "b": [np.nan, np.nan, None, np.nan, np.nan], + }, + {"a": [1, 2, 2, None, 1.1], "b": [1, 2.2, 3, None, 5]}, + ], +) +@pytest.mark.parametrize("nan_as_null", [True, False]) +def test_dataframe_constructor_nan_as_null(data, nan_as_null): + actual = cudf.DataFrame(data, nan_as_null=nan_as_null) + + if nan_as_null: + assert ( + not ( + actual.astype("float").replace( + cudf.Series([np.nan], nan_as_null=False), cudf.Series([-1]) + ) + == -1 + ) + .any() + .any() + ) + else: + actual = actual.select_dtypes(exclude=["object"]) + assert (actual.replace(np.nan, -1) == -1).any().any() + + def test_dataframe_add_prefix(): cdf = cudf.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}) pdf = cdf.to_pandas() diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index 66556c48828..2c544dfc17c 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -50,7 +50,9 @@ def test_interpolate_series(data, method, axis): expect = psr.interpolate(method=method, axis=axis) got = gsr.interpolate(method=method, axis=axis) - assert_eq(expect, got) + assert_eq( + expect, got, check_dtype=False if psr.dtype == "object" else True + ) @pytest.mark.parametrize( @@ -88,7 +90,9 @@ def test_interpolate_series_values_or_index(data, index, method): expect = psr.interpolate(method=method) got = gsr.interpolate(method=method) - assert_eq(expect, got) + assert_eq( + expect, got, check_dtype=False if psr.dtype == "object" else True + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 2b71ca7ac36..b898222d7d7 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -76,10 +76,14 @@ def test_leaves(data): pa_array = pa.array(data) while hasattr(pa_array, "flatten"): pa_array = pa_array.flatten() - dtype = "int8" if isinstance(pa_array, pa.NullArray) else None - expect = cudf.Series(pa_array, dtype=dtype) + + expect = cudf.Series(pa_array) got = cudf.Series(data).list.leaves - assert_eq(expect, got) + assert_eq( + expect, + got, + check_dtype=False if isinstance(pa_array, pa.NullArray) else True, + ) def test_list_to_pandas_nullable_true(): @@ -269,7 +273,10 @@ def test_get(data, index, expect): sr = cudf.Series(data) expect = cudf.Series(expect) got = sr.list.get(index) - assert_eq(expect, got) + + assert_eq( + expect, got, check_dtype=False if expect.isnull().all() else True + ) def test_get_nested_lists(): diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index ed55fb86820..f2a20a73b63 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -113,10 +113,18 @@ def test_get_dummies(data): encoded_expected = pd.get_dummies(pdf, prefix="test") encoded_actual = cudf.get_dummies(gdf, prefix="test") - utils.assert_eq(encoded_expected, encoded_actual) + utils.assert_eq( + encoded_expected, + encoded_actual, + check_dtype=False if len(data) == 0 else True, + ) encoded_actual = cudf.get_dummies(gdf, prefix="test", dtype=np.uint8) - utils.assert_eq(encoded_expected, encoded_actual) + utils.assert_eq( + encoded_expected, + encoded_actual, + check_dtype=False if len(data) == 0 else True, + ) @pytest.mark.parametrize("n_cols", [5, 10, 20]) diff --git a/python/cudf/cudf/tests/test_repr.py 
b/python/cudf/cudf/tests/test_repr.py index 736bcf131cc..fe95b2930df 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -328,10 +328,14 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): ), ( cudf.Index([None, None, None], name="hello"), + "StringIndex([None None None], dtype='object', name='hello')", + ), + ( + cudf.Index([None, None, None], dtype="float", name="hello"), "Float64Index([, , ], dtype='float64', name='hello')", ), ( - cudf.Index([None], name="hello"), + cudf.Index([None], dtype="float64", name="hello"), "Float64Index([], dtype='float64', name='hello')", ), ( diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 1e11e862329..583d2c7a8dd 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -971,7 +971,7 @@ def test_series_update(data, other): @pytest.mark.parametrize("nan_as_null", [True, False]) @pytest.mark.parametrize("fill_value", [1.2, 332, np.nan]) def test_fillna_with_nan(data, nan_as_null, fill_value): - gs = cudf.Series(data, nan_as_null=nan_as_null) + gs = cudf.Series(data, dtype="float64", nan_as_null=nan_as_null) ps = gs.to_pandas() expected = ps.fillna(fill_value) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index ebe78d56c3f..142ca6c6831 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -460,7 +460,8 @@ def test_df_corr(): @pytest.mark.parametrize("skipna", [True, False, None]) def test_nans_stats(data, ops, skipna): psr = cudf.utils.utils._create_pandas_series(data=data) - gsr = cudf.Series(data) + gsr = cudf.Series(data, nan_as_null=False) + assert_eq( getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) ) @@ -486,7 +487,7 @@ def test_nans_stats(data, ops, skipna): @pytest.mark.parametrize("min_count", [-10, -1, 0, 1, 2, 3, 5, 10]) def test_min_count_ops(data, ops, skipna, min_count): psr = pd.Series(data) - gsr = cudf.Series(data) + gsr = cudf.Series(data, nan_as_null=False) assert_eq( getattr(psr, ops)(skipna=skipna, min_count=min_count), From 7a23f1a01547648db7ad684fa3dc0482b7ac813f Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 14 Dec 2021 15:28:59 -0500 Subject: [PATCH 02/13] Add utility to format ninja-log build times (#9631) Reference: https://github.com/rapidsai/ops/issues/1896 Generate build times log from formatted, sorted `.ninja_log` file. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Robert Maynard (https://github.com/robertmaynard) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/9631 --- build.sh | 18 +++++ ci/gpu/build.sh | 12 +++- cpp/scripts/sort_ninja_log.py | 121 ++++++++++++++++++++++++++++++++++ 3 files changed, 150 insertions(+), 1 deletion(-) create mode 100755 cpp/scripts/sort_ninja_log.py diff --git a/build.sh b/build.sh index d0ccd4821e0..adf6e220744 100755 --- a/build.sh +++ b/build.sh @@ -172,6 +172,12 @@ if buildAll || hasArg libcudf; then echo "Building for *ALL* supported GPU architectures..." 
fi + # get the current count before the compile starts + FILES_IN_CCACHE="" + if [ -x "$(command -v ccache)" ]; then + FILES_IN_CCACHE=$(ccache -s | grep "files in cache") + fi + cmake -S $REPODIR/cpp -B ${LIB_BUILD_DIR} \ -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ ${CUDF_CMAKE_CUDA_ARCHITECTURES} \ @@ -185,7 +191,19 @@ if buildAll || hasArg libcudf; then cd ${LIB_BUILD_DIR} + compile_start=$(date +%s) cmake --build . -j${PARALLEL_LEVEL} ${VERBOSE_FLAG} + compile_end=$(date +%s) + compile_total=$(( compile_end - compile_start )) + + # Record build times + if [[ -f "${LIB_BUILD_DIR}/.ninja_log" ]]; then + echo "Formatting build times" + python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt xml > ${LIB_BUILD_DIR}/ninja_log.xml + message="$FILES_IN_CCACHE
<br/>
$PARALLEL_LEVEL parallel build time is $compile_total seconds" + echo "$message" + python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$message" > ${LIB_BUILD_DIR}/ninja_log.html + fi if [[ ${INSTALL_TARGET} != "" ]]; then cmake --build . -j${PARALLEL_LEVEL} --target install ${VERBOSE_FLAG} diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index d8b5cc7ba4c..00ad6bf812d 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -166,16 +166,26 @@ else gpuci_logger "Check GPU usage" nvidia-smi - gpuci_logger "GoogleTests" set -x cd $LIB_BUILD_DIR + gpuci_logger "GoogleTests" + for gt in gtests/* ; do test_name=$(basename ${gt}) echo "Running GoogleTest $test_name" ${gt} --gtest_output=xml:"$WORKSPACE/test-results/" done + # Copy libcudf build time results + echo "Checking for build time log $LIB_BUILD_DIR/ninja_log.html" + if [[ -f "$LIB_BUILD_DIR/ninja_log.html" ]]; then + gpuci_logger "Copying build time results" + cp "$LIB_BUILD_DIR/ninja_log.xml" "$WORKSPACE/test-results/buildtimes-junit.xml" + mkdir -p "$WORKSPACE/build-metrics" + cp "$LIB_BUILD_DIR/ninja_log.html" "$WORKSPACE/build-metrics/BuildMetrics.html" + fi + ################################################################################ # MEMCHECK - Run compute-sanitizer on GoogleTest (only in nightly builds) ################################################################################ diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py new file mode 100755 index 00000000000..5eada13aea2 --- /dev/null +++ b/cpp/scripts/sort_ninja_log.py @@ -0,0 +1,121 @@ +# +# Copyright (c) 2021, NVIDIA CORPORATION. +# +import argparse +import os +import sys +import xml.etree.ElementTree as ET +from xml.dom import minidom + +parser = argparse.ArgumentParser() +parser.add_argument( + "log_file", type=str, default=".ninja_log", help=".ninja_log file" +) +parser.add_argument( + "--fmt", + type=str, + default="csv", + choices=["csv", "xml", "html"], + help="output format (to stdout)", +) +parser.add_argument( + "--msg", + type=str, + default=None, + help="optional message to include in html output", +) +args = parser.parse_args() + +log_file = args.log_file +log_path = os.path.dirname(os.path.abspath(log_file)) + +output_fmt = args.fmt + +# build a map of the log entries +entries = {} +with open(log_file, "r") as log: + for line in log: + entry = line.split() + if len(entry) > 4: + elapsed = int(entry[1]) - int(entry[0]) + obj_file = entry[3] + file_size = ( + os.path.getsize(os.path.join(log_path, obj_file)) + if os.path.exists(obj_file) + else 0 + ) + entries[entry[3]] = (elapsed, file_size) + +# check file could be loaded +if len(entries) == 0: + print("Could not parse", log_file) + exit() + +# sort the keys by build time (descending order) +keys = list(entries.keys()) +sl = sorted(keys, key=lambda k: entries[k][0], reverse=True) + +if output_fmt == "xml": + # output results in XML format + root = ET.Element("testsuites") + testsuite = ET.Element( + "testsuite", + attrib={ + "name": "build-time", + "tests": str(len(keys)), + "failures": str(0), + "errors": str(0), + }, + ) + root.append(testsuite) + for key in sl: + entry = entries[key] + elapsed = float(entry[0]) / 1000 + item = ET.Element( + "testcase", + attrib={ + "classname": "BuildTime", + "name": key, + "time": str(elapsed), + }, + ) + testsuite.append(item) + + tree = ET.ElementTree(root) + xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ") + print(xmlstr) + +elif output_fmt == "html": + # output results in 
HTML format
+    print("<html><head><title>Sorted Ninja Build Times</title>")
+    print("</head>")
+    print("<body>")
+    if args.msg is not None:
+        print("<p>", args.msg, "</p>")
+    print("<table>")
+    print(
+        "<tr><th>File</th>",
+        "<th>Compile time (ms)</th>",
+        "<th>Size (bytes)</th></tr>",
+        sep="",
+    )
+    for key in sl:
+        result = entries[key]
+        print(
+            "<tr><td>",
+            key,
+            "</td><td>",
+            result[0],
+            "</td><td>",
+            result[1],
+            "</td></tr>",
+            sep="",
+        )
+    print("</table></body></html>
") + +else: + # output results in CSV format + print("time,size,file") + for key in sl: + result = entries[key] + print(result[0], result[1], key, sep=",") From 61794aaef64d1eb918910817799e66d51d2d6cda Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 14 Dec 2021 13:13:12 -0800 Subject: [PATCH 03/13] Fix a memcheck error in ORC writer (#9896) Follow up of https://github.com/rapidsai/cudf/pull/9808 Skips some kernels when input columns are empty to avoid OOB memory access. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Devavret Makkar (https://github.com/devavret) - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/9896 --- cpp/src/io/orc/writer_impl.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index a7a767585e6..c1eb9891229 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1018,6 +1018,7 @@ std::vector writer::impl::gather_stripes( hostdevice_2dvector* enc_streams, hostdevice_2dvector* strm_desc) { + if (segmentation.num_stripes() == 0) { return {}; } std::vector stripes(segmentation.num_stripes()); for (auto const& stripe : segmentation.stripes) { for (size_t col_idx = 0; col_idx < enc_streams->size().first; col_idx++) { From 41f99565d9d85d6dfb63ccd29a9717ce1dbb7eb5 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Wed, 15 Dec 2021 03:39:41 +0530 Subject: [PATCH 04/13] Add partitioning support in parquet writer (#9810) Contributes to https://github.com/rapidsai/cudf/issues/5059 Adds libcudf support for writing partitioned datasets in parquet writer. With the new API, one can specify a vector of `{start_row, num_rows}` structs along with a table st slices of the input table gets written to the corresponding sink. Adds Multi-sink support in `sink_info` Authors: - Devavret Makkar (https://github.com/devavret) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/9810 --- cpp/include/cudf/io/data_sink.hpp | 16 + cpp/include/cudf/io/detail/parquet.hpp | 15 +- cpp/include/cudf/io/orc.hpp | 59 +++ cpp/include/cudf/io/parquet.hpp | 153 ++++++- cpp/include/cudf/io/types.hpp | 87 +++- cpp/src/io/functions.cpp | 88 ++-- cpp/src/io/orc/writer_impl.cu | 12 +- cpp/src/io/orc/writer_impl.hpp | 2 + cpp/src/io/parquet/chunk_dict.cu | 141 ++----- cpp/src/io/parquet/page_enc.cu | 161 ++----- cpp/src/io/parquet/parquet_gpu.hpp | 46 +- cpp/src/io/parquet/writer_impl.cu | 511 +++++++++++++++-------- cpp/src/io/parquet/writer_impl.hpp | 33 +- cpp/tests/io/parquet_test.cpp | 102 ++++- python/cudf/cudf/_lib/cpp/io/orc.pxd | 9 + python/cudf/cudf/_lib/cpp/io/parquet.pxd | 24 +- python/cudf/cudf/_lib/cpp/io/types.pxd | 15 +- python/cudf/cudf/_lib/orc.pyx | 9 +- python/cudf/cudf/_lib/parquet.pyx | 29 +- 19 files changed, 962 insertions(+), 550 deletions(-) diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 42421aed716..2c1966ee6ba 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ b/cpp/include/cudf/io/data_sink.hpp @@ -69,6 +69,22 @@ class data_sink { */ static std::unique_ptr create(cudf::io::data_sink* const user_sink); + /** + * @brief Creates a vector of data sinks, one per element in the input vector. 
+ * + * @param[in] args vector of parameters + */ + template + static std::vector> create(std::vector const& args) + { + std::vector> sinks; + sinks.reserve(args.size()); + std::transform(args.cbegin(), args.cend(), std::back_inserter(sinks), [](auto const& arg) { + return data_sink::create(arg); + }); + return sinks; + } + /** * @brief Base class destructor */ diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index a18bd450640..9af2e3f278d 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -89,13 +89,13 @@ class writer { /** * @brief Constructor for output to a file. * - * @param sink The data sink to write the data to + * @param sinks The data sinks to write the data to * @param options Settings for controlling writing behavior * @param mode Option to write at once or in chunks * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit writer(std::unique_ptr sink, + explicit writer(std::vector> sinks, parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, @@ -104,7 +104,7 @@ class writer { /** * @brief Constructor for writer to handle chunked parquet options. * - * @param sink The data sink to write the data to + * @param sinks The data sinks to write the data to * @param options Settings for controlling writing behavior for chunked writer * @param mode Option to write at once or in chunks * @param stream CUDA stream used for device memory operations and kernel launches @@ -112,7 +112,7 @@ class writer { * * @return A parquet-compatible blob that contains the data for all rowgroups in the list */ - explicit writer(std::unique_ptr sink, + explicit writer(std::vector> sinks, chunked_parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, @@ -127,8 +127,10 @@ class writer { * @brief Writes a single subtable as part of a larger parquet file/table write. * * @param[in] table The table information to be written + * @param[in] partitions Optional partitions to divide the table into. If specified, must be same + * size as number of sinks. */ - void write(table_view const& table); + void write(table_view const& table, std::vector const& partitions = {}); /** * @brief Finishes the chunked/streamed write process. @@ -138,7 +140,8 @@ class writer { * @return A parquet-compatible blob that contains the data for all rowgroups in the list only if * `column_chunks_file_path` is provided, else null. */ - std::unique_ptr> close(std::string const& column_chunks_file_path = ""); + std::unique_ptr> close( + std::vector const& column_chunks_file_path = {}); /** * @brief Merges multiple metadata blobs returned by write_all into a single metadata blob diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 16588185f3d..b3a2f6bcbbb 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -454,6 +454,8 @@ class orc_writer_options { table_view _table; // Optional associated metadata const table_input_metadata* _metadata = nullptr; + // Optional footer key_value_metadata + std::map _user_data; friend orc_writer_options_builder; @@ -530,6 +532,11 @@ class orc_writer_options { */ table_input_metadata const* get_metadata() const { return _metadata; } + /** + * @brief Returns Key-Value footer metadata information. 
+ */ + std::map const& get_key_value_metadata() const { return _user_data; } + // Setters /** @@ -591,6 +598,16 @@ class orc_writer_options { * @param meta Associated metadata. */ void set_metadata(table_input_metadata const* meta) { _metadata = meta; } + + /** + * @brief Sets metadata. + * + * @param metadata Key-Value footer metadata + */ + void set_key_value_metadata(std::map metadata) + { + _user_data = std::move(metadata); + } }; class orc_writer_options_builder { @@ -698,6 +715,18 @@ class orc_writer_options_builder { return *this; } + /** + * @brief Sets Key-Value footer metadata. + * + * @param metadata Key-Value footer metadata + * @return this for chaining. + */ + orc_writer_options_builder& key_value_metadata(std::map metadata) + { + options._user_data = std::move(metadata); + return *this; + } + /** * @brief move orc_writer_options member once it's built. */ @@ -753,6 +782,8 @@ class chunked_orc_writer_options { size_type _row_index_stride = default_row_index_stride; // Optional associated metadata const table_input_metadata* _metadata = nullptr; + // Optional footer key_value_metadata + std::map _user_data; friend chunked_orc_writer_options_builder; @@ -819,6 +850,11 @@ class chunked_orc_writer_options { */ table_input_metadata const* get_metadata() const { return _metadata; } + /** + * @brief Returns Key-Value footer metadata information. + */ + std::map const& get_key_value_metadata() const { return _user_data; } + // Setters /** @@ -873,6 +909,16 @@ class chunked_orc_writer_options { * @param meta Associated metadata. */ void metadata(table_input_metadata const* meta) { _metadata = meta; } + + /** + * @brief Sets Key-Value footer metadata. + * + * @param metadata Key-Value footer metadata + */ + void set_key_value_metadata(std::map metadata) + { + _user_data = std::move(metadata); + } }; class chunked_orc_writer_options_builder { @@ -965,6 +1011,19 @@ class chunked_orc_writer_options_builder { return *this; } + /** + * @brief Sets Key-Value footer metadata. + * + * @param metadata Key-Value footer metadata + * @return this for chaining. + */ + chunked_orc_writer_options_builder& key_value_metadata( + std::map metadata) + { + options._user_data = std::move(metadata); + return *this; + } + /** * @brief move chunked_orc_writer_options member once it's built. */ diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 2215f24b550..740f7a8b2db 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -364,13 +364,17 @@ class parquet_writer_options { statistics_freq _stats_level = statistics_freq::STATISTICS_ROWGROUP; // Sets of columns to output table_view _table; + // Partitions described as {start_row, num_rows} pairs + std::vector _partitions; // Optional associated metadata table_input_metadata const* _metadata = nullptr; + // Optional footer key_value_metadata + std::vector> _user_data; // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. // If true then overrides any per-column setting in _metadata. bool _write_timestamps_as_int96 = false; - // Column chunks file path to be set in the raw output metadata - std::string _column_chunks_file_path; + // Column chunks file paths to be set in the raw output metadata. 
One per output file + std::vector _column_chunks_file_paths; // Maximum size of each row group (unless smaller than a single page) size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) @@ -434,20 +438,36 @@ class parquet_writer_options { */ table_view get_table() const { return _table; } + /** + * @brief Returns partitions. + */ + std::vector const& get_partitions() const { return _partitions; } + /** * @brief Returns associated metadata. */ table_input_metadata const* get_metadata() const { return _metadata; } + /** + * @brief Returns Key-Value footer metadata information. + */ + std::vector> const& get_key_value_metadata() const + { + return _user_data; + } + /** * @brief Returns `true` if timestamps will be written as INT96 */ bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } /** - * @brief Returns Column chunks file path to be set in the raw output metadata. + * @brief Returns Column chunks file paths to be set in the raw output metadata. */ - std::string get_column_chunks_file_path() const { return _column_chunks_file_path; } + std::vector const& get_column_chunks_file_paths() const + { + return _column_chunks_file_paths; + } /** * @brief Returns maximum row group size, in bytes. @@ -459,6 +479,19 @@ class parquet_writer_options { */ auto get_row_group_size_rows() const { return _row_group_size_rows; } + /** + * @brief Sets partitions. + * + * @param partitions Partitions of input table in {start_row, num_rows} pairs. If specified, must + * be same size as number of sinks in sink_info + */ + void set_partitions(std::vector partitions) + { + CUDF_EXPECTS(partitions.size() == _sink.num_sinks(), + "Mismatch between number of sinks and number of partitions"); + _partitions = std::move(partitions); + } + /** * @brief Sets metadata. * @@ -466,6 +499,18 @@ class parquet_writer_options { */ void set_metadata(table_input_metadata const* metadata) { _metadata = metadata; } + /** + * @brief Sets metadata. + * + * @param metadata Key-Value footer metadata + */ + void set_key_value_metadata(std::vector> metadata) + { + CUDF_EXPECTS(metadata.size() == _sink.num_sinks(), + "Mismatch between number of sinks and number of metadata maps"); + _user_data = std::move(metadata); + } + /** * @brief Sets the level of statistics. * @@ -491,11 +536,14 @@ class parquet_writer_options { /** * @brief Sets column chunks file path to be set in the raw output metadata. * - * @param file_path String which indicates file path. + * @param file_paths Vector of Strings which indicates file path. Must be same size as number of + * data sinks in sink info */ - void set_column_chunks_file_path(std::string file_path) + void set_column_chunks_file_paths(std::vector file_paths) { - _column_chunks_file_path.assign(file_path); + CUDF_EXPECTS(file_paths.size() == _sink.num_sinks(), + "Mismatch between number of sinks and number of chunk paths to set"); + _column_chunks_file_paths = std::move(file_paths); } /** @@ -543,6 +591,21 @@ class parquet_writer_options_builder { { } + /** + * @brief Sets partitions in parquet_writer_options. + * + * @param partitions Partitions of input table in {start_row, num_rows} pairs. If specified, must + * be same size as number of sinks in sink_info + * @return this for chaining. 
+ */ + parquet_writer_options_builder& partitions(std::vector partitions) + { + CUDF_EXPECTS(partitions.size() == options._sink.num_sinks(), + "Mismatch between number of sinks and number of partitions"); + options.set_partitions(std::move(partitions)); + return *this; + } + /** * @brief Sets metadata in parquet_writer_options. * @@ -555,6 +618,21 @@ class parquet_writer_options_builder { return *this; } + /** + * @brief Sets Key-Value footer metadata in parquet_writer_options. + * + * @param metadata Key-Value footer metadata + * @return this for chaining. + */ + parquet_writer_options_builder& key_value_metadata( + std::vector> metadata) + { + CUDF_EXPECTS(metadata.size() == options._sink.num_sinks(), + "Mismatch between number of sinks and number of metadata maps"); + options._user_data = std::move(metadata); + return *this; + } + /** * @brief Sets the level of statistics in parquet_writer_options. * @@ -582,12 +660,15 @@ class parquet_writer_options_builder { /** * @brief Sets column chunks file path to be set in the raw output metadata. * - * @param file_path String which indicates file path. + * @param file_paths Vector of Strings which indicates file path. Must be same size as number of + * data sinks * @return this for chaining. */ - parquet_writer_options_builder& column_chunks_file_path(std::string file_path) + parquet_writer_options_builder& column_chunks_file_paths(std::vector file_paths) { - options._column_chunks_file_path.assign(file_path); + CUDF_EXPECTS(file_paths.size() == options._sink.num_sinks(), + "Mismatch between number of sinks and number of chunk paths to set"); + options.set_column_chunks_file_paths(std::move(file_paths)); return *this; } @@ -690,6 +771,8 @@ class chunked_parquet_writer_options { statistics_freq _stats_level = statistics_freq::STATISTICS_ROWGROUP; // Optional associated metadata. table_input_metadata const* _metadata = nullptr; + // Optional footer key_value_metadata + std::vector> _user_data; // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. // If true then overrides any per-column setting in _metadata. bool _write_timestamps_as_int96 = false; @@ -735,6 +818,14 @@ class chunked_parquet_writer_options { */ table_input_metadata const* get_metadata() const { return _metadata; } + /** + * @brief Returns Key-Value footer metadata information. + */ + std::vector> const& get_key_value_metadata() const + { + return _user_data; + } + /** * @brief Returns `true` if timestamps will be written as INT96 */ @@ -757,6 +848,18 @@ class chunked_parquet_writer_options { */ void set_metadata(table_input_metadata const* metadata) { _metadata = metadata; } + /** + * @brief Sets Key-Value footer metadata. + * + * @param metadata Key-Value footer metadata + */ + void set_key_value_metadata(std::vector> metadata) + { + CUDF_EXPECTS(metadata.size() == _sink.num_sinks(), + "Mismatch between number of sinks and number of metadata maps"); + _user_data = std::move(metadata); + } + /** * @brief Sets the level of statistics in parquet_writer_options. * @@ -841,6 +944,21 @@ class chunked_parquet_writer_options_builder { return *this; } + /** + * @brief Sets Key-Value footer metadata in parquet_writer_options. + * + * @param metadata Key-Value footer metadata + * @return this for chaining. 
+ */ + chunked_parquet_writer_options_builder& key_value_metadata( + std::vector> metadata) + { + CUDF_EXPECTS(metadata.size() == options._sink.num_sinks(), + "Mismatch between number of sinks and number of metadata maps"); + options.set_key_value_metadata(std::move(metadata)); + return *this; + } + /** * @brief Sets Sets the level of statistics in chunked_parquet_writer_options. * @@ -958,18 +1076,25 @@ class parquet_chunked_writer { * @brief Writes table to output. * * @param[in] table Table that needs to be written + * @param[in] partitions Optional partitions to divide the table into. If specified, must be same + * size as number of sinks. + * + * @throws cudf::logic_error If the number of partitions is not the smae as number of sinks * @return returns reference of the class object */ - parquet_chunked_writer& write(table_view const& table); + parquet_chunked_writer& write(table_view const& table, + std::vector const& partitions = {}); /** * @brief Finishes the chunked/streamed write process. * - * @param[in] column_chunks_file_path Column chunks file path to be set in the raw output metadata + * @param[in] column_chunks_file_paths Column chunks file path to be set in the raw output + * metadata * @return A parquet-compatible blob that contains the data for all rowgroups in the list only if - * `column_chunks_file_path` is provided, else null. + * `column_chunks_file_paths` is provided, else null. */ - std::unique_ptr> close(std::string const& column_chunks_file_path = ""); + std::unique_ptr> close( + std::vector const& column_chunks_file_paths = {}); // Unique pointer to impl writer class std::unique_ptr writer; diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index cf6be8a20af..512a90b3249 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -151,61 +151,93 @@ struct host_buffer { * @brief Source information for read interfaces */ struct source_info { - io_type type = io_type::FILEPATH; - std::vector filepaths; - std::vector buffers; - std::vector> files; - std::vector user_sources; + std::vector> _files; source_info() = default; explicit source_info(std::vector const& file_paths) - : type(io_type::FILEPATH), filepaths(file_paths) + : _type(io_type::FILEPATH), _filepaths(file_paths) { } explicit source_info(std::string const& file_path) - : type(io_type::FILEPATH), filepaths({file_path}) + : _type(io_type::FILEPATH), _filepaths({file_path}) { } explicit source_info(std::vector const& host_buffers) - : type(io_type::HOST_BUFFER), buffers(host_buffers) + : _type(io_type::HOST_BUFFER), _buffers(host_buffers) { } explicit source_info(const char* host_data, size_t size) - : type(io_type::HOST_BUFFER), buffers({{host_data, size}}) + : _type(io_type::HOST_BUFFER), _buffers({{host_data, size}}) { } explicit source_info(std::vector const& sources) - : type(io_type::USER_IMPLEMENTED), user_sources(sources) + : _type(io_type::USER_IMPLEMENTED), _user_sources(sources) { } explicit source_info(cudf::io::datasource* source) - : type(io_type::USER_IMPLEMENTED), user_sources({source}) + : _type(io_type::USER_IMPLEMENTED), _user_sources({source}) { } + + auto type() const { return _type; } + auto const& filepaths() const { return _filepaths; } + auto const& buffers() const { return _buffers; } + auto const& files() const { return _files; } + auto const& user_sources() const { return _user_sources; } + + private: + io_type _type = io_type::FILEPATH; + std::vector _filepaths; + std::vector _buffers; + std::vector _user_sources; }; /** * @brief 
Destination information for write interfaces */ struct sink_info { - io_type type = io_type::VOID; - std::string filepath; - std::vector* buffer = nullptr; - cudf::io::data_sink* user_sink = nullptr; - sink_info() = default; + sink_info(size_t num_sinks) : _type(io_type::VOID), _num_sinks(num_sinks) {} - explicit sink_info(const std::string& file_path) : type(io_type::FILEPATH), filepath(file_path) {} + explicit sink_info(std::vector const& file_paths) + : _type(io_type::FILEPATH), _num_sinks(file_paths.size()), _filepaths(file_paths) + { + } + explicit sink_info(std::string const& file_path) + : _type(io_type::FILEPATH), _filepaths({file_path}) + { + } - explicit sink_info(std::vector* buffer) : type(io_type::HOST_BUFFER), buffer(buffer) {} + explicit sink_info(std::vector*> const& buffers) + : _type(io_type::HOST_BUFFER), _num_sinks(buffers.size()), _buffers(buffers) + { + } + explicit sink_info(std::vector* buffer) : _type(io_type::HOST_BUFFER), _buffers({buffer}) {} - explicit sink_info(class cudf::io::data_sink* user_sink_) - : type(io_type::USER_IMPLEMENTED), user_sink(user_sink_) + explicit sink_info(std::vector const& user_sinks) + : _type(io_type::USER_IMPLEMENTED), _num_sinks(user_sinks.size()), _user_sinks(user_sinks) { } + explicit sink_info(class cudf::io::data_sink* user_sink) + : _type(io_type::USER_IMPLEMENTED), _user_sinks({user_sink}) + { + } + + auto type() const { return _type; } + auto num_sinks() const { return _num_sinks; } + auto const& filepaths() const { return _filepaths; } + auto const& buffers() const { return _buffers; } + auto const& user_sinks() const { return _user_sinks; } + + private: + io_type _type = io_type::VOID; + size_t _num_sinks = 1; + std::vector _filepaths; + std::vector*> _buffers; + std::vector _user_sinks; }; class table_input_metadata; @@ -369,12 +401,21 @@ class table_input_metadata { * The constructed table_input_metadata has the same structure as the passed table_view * * @param table The table_view to construct metadata for - * @param user_data Optional Additional metadata to encode, as key-value pairs */ - table_input_metadata(table_view const& table, std::map user_data = {}); + table_input_metadata(table_view const& table); std::vector column_metadata; - std::map user_data; //!< Format-dependent metadata as key-values pairs +}; + +/** + * @brief Information used while writing partitioned datasets + * + * This information defines the slice of an input table to write to file. 
In partitioned dataset + * writing, one partition_info struct defines one partition and corresponds to one output file + */ +struct partition_info { + size_type start_row; + size_type num_rows; }; } // namespace io diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 768d6b25690..04638d3eca9 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -111,27 +111,33 @@ std::vector> make_datasources(source_info size_t range_offset = 0, size_t range_size = 0) { - switch (info.type) { + switch (info.type()) { case io_type::FILEPATH: { auto sources = std::vector>(); - for (auto const& filepath : info.filepaths) { + for (auto const& filepath : info.filepaths()) { sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, range_size)); } return sources; } - case io_type::HOST_BUFFER: return cudf::io::datasource::create(info.buffers); - case io_type::USER_IMPLEMENTED: return cudf::io::datasource::create(info.user_sources); + case io_type::HOST_BUFFER: return cudf::io::datasource::create(info.buffers()); + case io_type::USER_IMPLEMENTED: return cudf::io::datasource::create(info.user_sources()); default: CUDF_FAIL("Unsupported source type"); } } -std::unique_ptr make_datasink(sink_info const& info) +std::vector> make_datasinks(sink_info const& info) { - switch (info.type) { - case io_type::FILEPATH: return cudf::io::data_sink::create(info.filepath); - case io_type::HOST_BUFFER: return cudf::io::data_sink::create(info.buffer); - case io_type::VOID: return cudf::io::data_sink::create(); - case io_type::USER_IMPLEMENTED: return cudf::io::data_sink::create(info.user_sink); + switch (info.type()) { + case io_type::FILEPATH: return cudf::io::data_sink::create(info.filepaths()); + case io_type::HOST_BUFFER: return cudf::io::data_sink::create(info.buffers()); + case io_type::VOID: { + std::vector> sinks; + for (size_t i = 0; i < info.num_sinks(); ++i) { + sinks.push_back(cudf::io::data_sink::create()); + } + return sinks; + } + case io_type::USER_IMPLEMENTED: return cudf::io::data_sink::create(info.user_sinks()); default: CUDF_FAIL("Unsupported sink type"); } } @@ -156,9 +162,9 @@ compression_type infer_compression_type(compression_type compression, source_inf { if (compression != compression_type::AUTO) { return compression; } - if (info.type != io_type::FILEPATH) { return compression_type::NONE; } + if (info.type() != io_type::FILEPATH) { return compression_type::NONE; } - auto filepath = info.filepaths[0]; + auto filepath = info.filepaths()[0]; // Attempt to infer from the file extension const auto pos = filepath.find_last_of('.'); @@ -218,10 +224,11 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc { using namespace cudf::io::detail; - auto sink = make_datasink(options.get_sink()); + auto sinks = make_datasinks(options.get_sink()); + CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for CSV writing"); return csv::write_csv( // - sink.get(), + sinks[0].get(), options.get_table(), options.get_metadata(), options, @@ -235,15 +242,16 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info) { // Get source to read statistics from std::unique_ptr source; - if (src_info.type == io_type::FILEPATH) { - CUDF_EXPECTS(src_info.filepaths.size() == 1, "Only a single source is currently supported."); - source = cudf::io::datasource::create(src_info.filepaths[0]); - } else if (src_info.type == io_type::HOST_BUFFER) { - CUDF_EXPECTS(src_info.buffers.size() == 1, "Only a single source is currently supported."); - 
source = cudf::io::datasource::create(src_info.buffers[0]); - } else if (src_info.type == io_type::USER_IMPLEMENTED) { - CUDF_EXPECTS(src_info.user_sources.size() == 1, "Only a single source is currently supported."); - source = cudf::io::datasource::create(src_info.user_sources[0]); + if (src_info.type() == io_type::FILEPATH) { + CUDF_EXPECTS(src_info.filepaths().size() == 1, "Only a single source is currently supported."); + source = cudf::io::datasource::create(src_info.filepaths()[0]); + } else if (src_info.type() == io_type::HOST_BUFFER) { + CUDF_EXPECTS(src_info.buffers().size() == 1, "Only a single source is currently supported."); + source = cudf::io::datasource::create(src_info.buffers()[0]); + } else if (src_info.type() == io_type::USER_IMPLEMENTED) { + CUDF_EXPECTS(src_info.user_sources().size() == 1, + "Only a single source is currently supported."); + source = cudf::io::datasource::create(src_info.user_sources()[0]); } else { CUDF_FAIL("Unsupported source type"); } @@ -350,9 +358,11 @@ void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resourc CUDF_FUNC_RANGE(); - auto sink = make_datasink(options.get_sink()); + auto sinks = make_datasinks(options.get_sink()); + CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for ORC writing"); + auto writer = std::make_unique( - std::move(sink), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); + std::move(sinks[0]), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); writer->write(options.get_table()); } @@ -365,10 +375,11 @@ orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options { namespace io_detail = cudf::io::detail; - auto sink = make_datasink(options.get_sink()); + auto sinks = make_datasinks(options.get_sink()); + CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for ORC writing"); writer = std::make_unique( - std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); + std::move(sinks[0]), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); } /** @@ -417,9 +428,7 @@ std::unique_ptr> merge_row_group_metadata( return detail_parquet::writer::merge_row_group_metadata(metadata_list); } -table_input_metadata::table_input_metadata(table_view const& table, - std::map user_data) - : user_data{std::move(user_data)} +table_input_metadata::table_input_metadata(table_view const& table) { // Create a metadata hierarchy using `table` std::function get_children = [&](column_view const& col) { @@ -443,13 +452,13 @@ std::unique_ptr> write_parquet(parquet_writer_options const CUDF_FUNC_RANGE(); - auto sink = make_datasink(options.get_sink()); + auto sinks = make_datasinks(options.get_sink()); auto writer = std::make_unique( - std::move(sink), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); + std::move(sinks), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); - writer->write(options.get_table()); + writer->write(options.get_table(), options.get_partitions()); - return writer->close(options.get_column_chunks_file_path()); + return writer->close(options.get_column_chunks_file_paths()); } /** @@ -460,20 +469,21 @@ parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options co { namespace io_detail = cudf::io::detail; - auto sink = make_datasink(options.get_sink()); + auto sinks = make_datasinks(options.get_sink()); writer = std::make_unique( - std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, 
mr); + std::move(sinks), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); } /** * @copydoc cudf::io::parquet_chunked_writer::write */ -parquet_chunked_writer& parquet_chunked_writer::write(table_view const& table) +parquet_chunked_writer& parquet_chunked_writer::write(table_view const& table, + std::vector const& partitions) { CUDF_FUNC_RANGE(); - writer->write(table); + writer->write(table, partitions); return *this; } @@ -482,7 +492,7 @@ parquet_chunked_writer& parquet_chunked_writer::write(table_view const& table) * @copydoc cudf::io::parquet_chunked_writer::close */ std::unique_ptr> parquet_chunked_writer::close( - std::string const& column_chunks_file_path) + std::vector const& column_chunks_file_path) { CUDF_FUNC_RANGE(); return writer->close(column_chunks_file_path); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index c1eb9891229..b0e674c206f 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1313,6 +1313,7 @@ writer::impl::impl(std::unique_ptr sink, compression_kind_(to_orc_compression(options.get_compression())), enable_statistics_(options.is_enabled_statistics()), single_write_mode(mode == SingleWriteMode::YES), + kv_meta(options.get_key_value_metadata()), out_sink_(std::move(sink)) { if (options.get_metadata()) { @@ -1333,6 +1334,7 @@ writer::impl::impl(std::unique_ptr sink, compression_kind_(to_orc_compression(options.get_compression())), enable_statistics_(options.is_enabled_statistics()), single_write_mode(mode == SingleWriteMode::YES), + kv_meta(options.get_key_value_metadata()), out_sink_(std::move(sink)) { if (options.get_metadata()) { @@ -2069,12 +2071,10 @@ void writer::impl::close() PostScript ps; ff.contentLength = out_sink_->bytes_written(); - std::transform(table_meta->user_data.begin(), - table_meta->user_data.end(), - std::back_inserter(ff.metadata), - [&](auto const& udata) { - return UserMetadataItem{udata.first, udata.second}; - }); + std::transform( + kv_meta.begin(), kv_meta.end(), std::back_inserter(ff.metadata), [&](auto const& udata) { + return UserMetadataItem{udata.first, udata.second}; + }); // Write statistics metadata if (md.stripeStats.size() != 0) { diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 68622d17b28..80c22b09927 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -369,6 +369,8 @@ class writer::impl { bool const single_write_mode; // optional user metadata std::unique_ptr table_meta; + // optional user metadata + std::map kv_meta; // to track if the output has been written to sink bool closed = false; diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 42d27dadd1a..5589f87e57e 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -95,69 +95,41 @@ struct map_find_fn { template __global__ void __launch_bounds__(block_size, 1) populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan chunks, - size_type num_rows) + cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; auto t = threadIdx.x; + auto frag = frags[col_idx][block_x]; + auto chunk = frag.chunk; + auto col = chunk->col_desc; - auto start_row = - block_x * - max_page_fragment_size; // This is fragment size. all chunks are multiple of these many rows. 
- size_type end_row = min(start_row + max_page_fragment_size, num_rows); + size_type start_row = frag.start_row; + size_type end_row = frag.start_row + frag.num_rows; - __shared__ EncColumnChunk* s_chunk; - __shared__ parquet_column_device_view s_col; __shared__ size_type s_start_value_idx; __shared__ size_type s_num_values; - if (t == 0) { - // Find the chunk this block is a part of - size_type num_rowgroups = chunks.size().first; - size_type rg_idx = 0; - while (rg_idx < num_rowgroups) { - if (auto ck = chunks[rg_idx][col_idx]; - start_row >= ck.start_row and start_row < ck.start_row + ck.num_rows) { - break; - } - ++rg_idx; - } - s_chunk = &chunks[rg_idx][col_idx]; - s_col = *(s_chunk->col_desc); - } - __syncthreads(); - if (not s_chunk->use_dictionary) { return; } + + if (not chunk->use_dictionary) { return; } if (t == 0) { // Find the bounds of values in leaf column to be inserted into the map for current chunk - auto col = *(s_col.parent_column); - auto start_value_idx = start_row; - auto end_value_idx = end_row; - while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { - if (col.type().id() == type_id::STRUCT) { - start_value_idx += col.offset(); - end_value_idx += col.offset(); - col = col.child(0); - } else { - auto offset_col = col.child(lists_column_view::offsets_column_index); - start_value_idx = offset_col.element(start_value_idx + col.offset()); - end_value_idx = offset_col.element(end_value_idx + col.offset()); - col = col.child(lists_column_view::child_column_index); - } - } - s_start_value_idx = start_value_idx; - s_num_values = end_value_idx - start_value_idx; + auto cudf_col = *(col->parent_column); + s_start_value_idx = row_to_value_idx(start_row, cudf_col); + auto end_value_idx = row_to_value_idx(end_row, cudf_col); + s_num_values = end_value_idx - s_start_value_idx; } __syncthreads(); - column_device_view const& data_col = *s_col.leaf_column; + column_device_view const& data_col = *col->leaf_column; using block_reduce = cub::BlockReduce; __shared__ typename block_reduce::TempStorage reduce_storage; // Make a view of the hash map auto hash_map_mutable = map_type::device_mutable_view( - s_chunk->dict_map_slots, s_chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); + chunk->dict_map_slots, chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); auto hash_map = map_type::device_view( - s_chunk->dict_map_slots, s_chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); + chunk->dict_map_slots, chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); __shared__ int total_num_dict_entries; for (size_type i = 0; i < s_num_values; i += block_size) { @@ -176,7 +148,7 @@ __global__ void __launch_bounds__(block_size, 1) type_dispatcher(data_col.type(), map_insert_fn{hash_map_mutable}, data_col, val_idx); uniq_elem_size = [&]() -> size_type { if (not is_unique) { return 0; } - switch (s_col.physical_type) { + switch (col->physical_type) { case Type::INT32: return 4; case Type::INT64: return 8; case Type::INT96: return 12; @@ -200,9 +172,9 @@ __global__ void __launch_bounds__(block_size, 1) __syncthreads(); auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); if (t == 0) { - total_num_dict_entries = atomicAdd(&s_chunk->num_dict_entries, num_unique); + total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique); total_num_dict_entries += num_unique; - atomicAdd(&s_chunk->uniq_data_size, uniq_data_size); + atomicAdd(&chunk->uniq_data_size, uniq_data_size); } __syncthreads(); @@ -246,67 +218,38 @@ __global__ void 
__launch_bounds__(block_size, 1) template __global__ void __launch_bounds__(block_size, 1) get_dictionary_indices_kernel(cudf::detail::device_2dspan chunks, - size_type num_rows) + cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; auto t = threadIdx.x; + auto frag = frags[col_idx][block_x]; + auto chunk = frag.chunk; + auto col = chunk->col_desc; - size_type start_row = block_x * max_page_fragment_size; - size_type end_row = min(start_row + max_page_fragment_size, num_rows); + size_type start_row = frag.start_row; + size_type end_row = frag.start_row + frag.num_rows; - __shared__ EncColumnChunk s_chunk; - __shared__ parquet_column_device_view s_col; __shared__ size_type s_start_value_idx; __shared__ size_type s_ck_start_val_idx; __shared__ size_type s_num_values; if (t == 0) { - // Find the chunk this block is a part of - size_type num_rowgroups = chunks.size().first; - size_type rg_idx = 0; - while (rg_idx < num_rowgroups) { - if (auto ck = chunks[rg_idx][col_idx]; - start_row >= ck.start_row and start_row < ck.start_row + ck.num_rows) { - break; - } - ++rg_idx; - } - s_chunk = chunks[rg_idx][col_idx]; - s_col = *(s_chunk.col_desc); - - // Find the bounds of values in leaf column to be inserted into the map for current chunk - - auto col = *(s_col.parent_column); - auto start_value_idx = start_row; - auto end_value_idx = end_row; - auto chunk_start_val_idx = s_chunk.start_row; - while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { - if (col.type().id() == type_id::STRUCT) { - start_value_idx += col.offset(); - chunk_start_val_idx += col.offset(); - end_value_idx += col.offset(); - col = col.child(0); - } else { - auto offset_col = col.child(lists_column_view::offsets_column_index); - start_value_idx = offset_col.element(start_value_idx + col.offset()); - chunk_start_val_idx = offset_col.element(chunk_start_val_idx + col.offset()); - end_value_idx = offset_col.element(end_value_idx + col.offset()); - col = col.child(lists_column_view::child_column_index); - } - } - s_start_value_idx = start_value_idx; - s_ck_start_val_idx = chunk_start_val_idx; - s_num_values = end_value_idx - start_value_idx; + // Find the bounds of values in leaf column to be searched in the map for current chunk + auto cudf_col = *(col->parent_column); + s_start_value_idx = row_to_value_idx(start_row, cudf_col); + s_ck_start_val_idx = row_to_value_idx(chunk->start_row, cudf_col); + auto end_value_idx = row_to_value_idx(end_row, cudf_col); + s_num_values = end_value_idx - s_start_value_idx; } __syncthreads(); - if (not s_chunk.use_dictionary) { return; } + if (not chunk->use_dictionary) { return; } - column_device_view const& data_col = *s_col.leaf_column; + column_device_view const& data_col = *col->leaf_column; auto map = map_type::device_view( - s_chunk.dict_map_slots, s_chunk.dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); + chunk->dict_map_slots, chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); for (size_t i = 0; i < s_num_values; i += block_size) { if (t + i < s_num_values) { @@ -321,7 +264,7 @@ __global__ void __launch_bounds__(block_size, 1) if (found_slot != map.end()) { // No need for atomic as this is not going to be modified by any other thread auto* val_ptr = reinterpret_cast(&found_slot->second); - s_chunk.dict_index[val_idx - s_ck_start_val_idx] = *val_ptr; + chunk->dict_index[val_idx - s_ck_start_val_idx] = *val_ptr; } } } @@ -336,16 +279,14 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st } void 
populate_chunk_hash_maps(cudf::detail::device_2dspan chunks, - size_type num_rows, + cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { constexpr int block_size = 256; - auto const grid_x = cudf::detail::grid_1d(num_rows, max_page_fragment_size); - auto const num_columns = chunks.size().second; - dim3 const dim_grid(grid_x.num_blocks, num_columns); + dim3 const dim_grid(frags.size().second, frags.size().first); populate_chunk_hash_maps_kernel - <<>>(chunks, num_rows); + <<>>(chunks, frags); } void collect_map_entries(device_span chunks, rmm::cuda_stream_view stream) @@ -355,16 +296,14 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi } void get_dictionary_indices(cudf::detail::device_2dspan chunks, - size_type num_rows, + cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { constexpr int block_size = 256; - auto const grid_x = cudf::detail::grid_1d(num_rows, max_page_fragment_size); - auto const num_columns = chunks.size().second; - dim3 const dim_grid(grid_x.num_blocks, num_columns); + dim3 const dim_grid(frags.size().second, frags.size().first); get_dictionary_indices_kernel - <<>>(chunks, num_rows); + <<>>(chunks, frags); } } // namespace gpu } // namespace parquet diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 33647ff626c..ec6b24b3b4e 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -49,7 +50,6 @@ constexpr uint32_t rle_buffer_size = (1 << 9); struct frag_init_state_s { parquet_column_device_view col; PageFragment frag; - size_type start_value_idx; }; struct page_enc_state_s { @@ -114,24 +114,14 @@ inline __device__ uint32_t uint64_init_hash(uint64_t v) return uint32_init_hash(static_cast(v + (v >> 32))); } -/** - * @brief Initializes encoder page fragments - * - * Based on the number of rows in each fragment, populates the value count, the size of data in the - * fragment, the number of unique values, and the data size of unique values. - * - * @param[in] frag Fragment array [fragment_id][column_id] - * @param[in] col_desc Column description array [column_id] - * @param[in] num_fragments Number of fragments per column - * @param[in] num_columns Number of columns - */ // blockDim {512,1,1} template __global__ void __launch_bounds__(block_size) gpuInitPageFragments(device_2dspan frag, device_span col_desc, - uint32_t fragment_size, - uint32_t max_num_rows) + device_span partitions, + device_span part_frag_offset, + uint32_t fragment_size) { __shared__ __align__(16) frag_init_state_s state_g; @@ -140,53 +130,36 @@ __global__ void __launch_bounds__(block_size) frag_init_state_s* const s = &state_g; uint32_t t = threadIdx.x; + int frag_y = blockIdx.y; if (t == 0) s->col = col_desc[blockIdx.x]; __syncthreads(); - uint32_t const start_row = blockIdx.y * fragment_size; if (!t) { - // frag.num_rows = fragment_size except for the last page fragment which can be smaller. + // Find which partition this fragment came from + auto it = + thrust::upper_bound(thrust::seq, part_frag_offset.begin(), part_frag_offset.end(), frag_y); + int p = it - part_frag_offset.begin() - 1; + int part_end_row = partitions[p].start_row + partitions[p].num_rows; + s->frag.start_row = (frag_y - part_frag_offset[p]) * fragment_size + partitions[p].start_row; + + // frag.num_rows = fragment_size except for the last fragment in partition which can be smaller. 
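+ // Illustration: with fragment_size = 5000, a partition that starts at global row 3000 and has 12000 rows gets fragments covering rows [3000, 8000), [8000, 13000) and [13000, 15000), so only its last fragment is short.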
// num_rows is fixed but fragment size could be larger if the data is strings or nested. - s->frag.num_rows = min(fragment_size, max_num_rows - min(start_row, max_num_rows)); + s->frag.num_rows = min(fragment_size, part_end_row - s->frag.start_row); s->frag.num_dict_vals = 0; s->frag.fragment_data_size = 0; s->frag.dict_data_size = 0; - // To use num_vals instead of num_rows, we need to calculate num_vals on the fly. - // For list>, values between i and i+50 can be calculated by - // off_11 = off[i], off_12 = off[i+50] - // off_21 = child.off[off_11], off_22 = child.off[off_12] - // etc... - size_type end_value_idx = start_row + s->frag.num_rows; - if (s->col.parent_column == nullptr) { - s->start_value_idx = start_row; - } else { - auto col = *(s->col.parent_column); - auto current_start_value_idx = start_row; - while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { - if (col.type().id() == type_id::STRUCT) { - current_start_value_idx += col.offset(); - end_value_idx += col.offset(); - col = col.child(0); - } else { - auto offset_col = col.child(lists_column_view::offsets_column_index); - current_start_value_idx = - offset_col.element(current_start_value_idx + col.offset()); - end_value_idx = offset_col.element(end_value_idx + col.offset()); - col = col.child(lists_column_view::child_column_index); - } - } - s->start_value_idx = current_start_value_idx; - } - s->frag.start_value_idx = s->start_value_idx; - s->frag.num_leaf_values = end_value_idx - s->start_value_idx; + auto col = *(s->col.parent_column); + s->frag.start_value_idx = row_to_value_idx(s->frag.start_row, col); + size_type end_value_idx = row_to_value_idx(s->frag.start_row + s->frag.num_rows, col); + s->frag.num_leaf_values = end_value_idx - s->frag.start_value_idx; if (s->col.level_offsets != nullptr) { // For nested schemas, the number of values in a fragment is not directly related to the // number of encoded data elements or the number of rows. It is simply the number of // repetition/definition values which together encode validity and nesting information. - size_type first_level_val_idx = s->col.level_offsets[start_row]; - size_type last_level_val_idx = s->col.level_offsets[start_row + s->frag.num_rows]; + size_type first_level_val_idx = s->col.level_offsets[s->frag.start_row]; + size_type last_level_val_idx = s->col.level_offsets[s->frag.start_row + s->frag.num_rows]; s->frag.num_values = last_level_val_idx - first_level_val_idx; } else { s->frag.num_values = s->frag.num_rows; @@ -197,7 +170,7 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); size_type nvals = s->frag.num_leaf_values; - size_type start_value_idx = s->start_value_idx; + size_type start_value_idx = s->frag.start_value_idx; for (uint32_t i = 0; i < nvals; i += block_size) { uint32_t val_idx = start_value_idx + i + t; @@ -912,28 +885,9 @@ __global__ void __launch_bounds__(128, 8) dst[0] = dict_bits; s->rle_out = dst + 1; } - s->page_start_val = s->page.start_row; // Dictionary page's start row is chunk's start row - auto chunk_start_val = s->ck.start_row; - if (s->col.parent_column != nullptr) { // TODO: remove this check. parent is now never nullptr - auto col = *(s->col.parent_column); - auto current_page_start_val = s->page_start_val; - // TODO: We do this so much. 
Add a global function that converts row idx to val idx - while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { - if (col.type().id() == type_id::STRUCT) { - current_page_start_val += col.offset(); - chunk_start_val += col.offset(); - col = col.child(0); - } else { - auto offset_col = col.child(lists_column_view::offsets_column_index); - current_page_start_val = - offset_col.element(current_page_start_val + col.offset()); - chunk_start_val = offset_col.element(chunk_start_val + col.offset()); - col = col.child(lists_column_view::child_column_index); - } - } - s->page_start_val = current_page_start_val; - s->chunk_start_val = chunk_start_val; - } + auto col = *(s->col.parent_column); + s->page_start_val = row_to_value_idx(s->page.start_row, col); + s->chunk_start_val = row_to_value_idx(s->ck.start_row, col); } __syncthreads(); for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { @@ -1944,36 +1898,20 @@ dremel_data get_dremel_data(column_view h_col, std::move(new_offsets), std::move(rep_level), std::move(def_level), leaf_data_size}; } -/** - * @brief Launches kernel for initializing encoder page fragments - * - * @param[in,out] frag Fragment array [column_id][fragment_id] - * @param[in] col_desc Column description array [column_id] - * @param[in] num_fragments Number of fragments per column - * @param[in] num_columns Number of columns - * @param[in] stream CUDA stream to use, default 0 - */ void InitPageFragments(device_2dspan frag, device_span col_desc, + device_span partitions, + device_span part_frag_offset, uint32_t fragment_size, - uint32_t num_rows, rmm::cuda_stream_view stream) { auto num_columns = frag.size().first; auto num_fragments_per_column = frag.size().second; dim3 dim_grid(num_columns, num_fragments_per_column); // 1 threadblock per fragment - gpuInitPageFragments<512> - <<>>(frag, col_desc, fragment_size, num_rows); + gpuInitPageFragments<512><<>>( + frag, col_desc, partitions, part_frag_offset, fragment_size); } -/** - * @brief Launches kernel for initializing fragment statistics groups - * - * @param[out] groups Statistics groups [num_columns x num_fragments] - * @param[in] fragments Page fragments [num_columns x num_fragments] - * @param[in] col_desc Column description [num_columns] - * @param[in] stream CUDA stream to use, default 0 - */ void InitFragmentStatistics(device_2dspan groups, device_2dspan fragments, device_span col_desc, @@ -1986,19 +1924,6 @@ void InitFragmentStatistics(device_2dspan groups, gpuInitFragmentStats<<>>(groups, fragments, col_desc); } -/** - * @brief Launches kernel for initializing encoder data pages - * - * @param[in,out] chunks Column chunks [rowgroup][column] - * @param[out] pages Encode page array (null if just counting pages) - * @param[in] col_desc Column description array [column_id] - * @param[in] num_rowgroups Number of fragments per column - * @param[in] num_columns Number of columns - * @param[out] page_grstats Setup for page-level stats - * @param[out] chunk_grstats Setup for chunk-level stats - * @param[in] max_page_comp_data_size Calculated maximum compressed data size of pages - * @param[in] stream CUDA stream to use, default 0 - */ void InitEncoderPages(device_2dspan chunks, device_span pages, device_span col_desc, @@ -2014,14 +1939,6 @@ void InitEncoderPages(device_2dspan chunks, chunks, pages, col_desc, page_grstats, chunk_grstats, max_page_comp_data_size, num_columns); } -/** - * @brief Launches kernel for packing column data into parquet pages - * - * @param[in,out] pages Device 
array of EncPages (unordered) - * @param[out] comp_in Optionally initializes compressor input params - * @param[out] comp_stat Optionally initializes compressor status - * @param[in] stream CUDA stream to use, default 0 - */ void EncodePages(device_span pages, device_span comp_in, device_span comp_stat, @@ -2033,26 +1950,11 @@ void EncodePages(device_span pages, gpuEncodePages<128><<>>(pages, comp_in, comp_stat); } -/** - * @brief Launches kernel to make the compressed vs uncompressed chunk-level decision - * - * @param[in,out] chunks Column chunks - * @param[in] stream CUDA stream to use, default 0 - */ void DecideCompression(device_span chunks, rmm::cuda_stream_view stream) { gpuDecideCompression<<>>(chunks); } -/** - * @brief Launches kernel to encode page headers - * - * @param[in,out] pages Device array of EncPages - * @param[in] comp_stat Compressor status or nullptr if no compression - * @param[in] page_stats Optional page-level statistics to be included in page header - * @param[in] chunk_stats Optional chunk-level statistics to be encoded - * @param[in] stream CUDA stream to use, default 0 - */ void EncodePageHeaders(device_span pages, device_span comp_stat, device_span page_stats, @@ -2065,13 +1967,6 @@ void EncodePageHeaders(device_span pages, pages, comp_stat, page_stats, chunk_stats); } -/** - * @brief Launches kernel to gather pages to a single contiguous block per chunk - * - * @param[in,out] chunks Column chunks - * @param[in] pages Device array of EncPages - * @param[in] stream CUDA stream to use, default 0 - */ void GatherPages(device_span chunks, device_span pages, rmm::cuda_stream_view stream) diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index a0cbc28bc8d..53bb11c8b70 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -252,6 +252,8 @@ struct parquet_column_device_view : stats_column_desc { constexpr int max_page_fragment_size = 5000; //!< Max number of rows in a page fragment +struct EncColumnChunk; + /** * @brief Struct describing an encoder page fragment */ @@ -262,8 +264,10 @@ struct PageFragment { uint32_t start_value_idx; uint32_t num_leaf_values; //!< Number of leaf values in fragment. Does not include nulls at //!< non-leaf level + size_type start_row; //!< First row in fragment uint16_t num_rows; //!< Number of rows in fragment uint16_t num_dict_vals; //!< Number of unique dictionary entries + EncColumnChunk* chunk; //!< The chunk that this fragment belongs to }; /// Size of hash used for building dictionaries @@ -284,6 +288,27 @@ inline uint32_t __device__ int32_logical_len(type_id id) } } +/** + * @brief Translate the row index of a parent column_device_view into the index of the first value + * in the leaf child. + * Only works in the context of parquet writer where struct columns are previously modified s.t. + * they only have one immediate child. 
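+ *
+ * @param idx Row index into the top-level (parent) column
+ * @param col Parent column device view
+ * @return Index of the first value in the leaf child that corresponds to `idx`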
+ */ +inline size_type __device__ row_to_value_idx(size_type idx, column_device_view col) +{ + while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { + if (col.type().id() == type_id::STRUCT) { + idx += col.offset(); + col = col.child(0); + } else { + auto offset_col = col.child(lists_column_view::offsets_column_index); + idx = offset_col.element(idx + col.offset()); + col = col.child(lists_column_view::child_column_index); + } + } + return idx; +} + /** * @brief Return worst-case compressed size of compressed data given the uncompressed size */ @@ -309,7 +334,7 @@ struct EncColumnChunk { uint32_t compressed_size; //!< Compressed buffer size uint32_t max_page_data_size; //!< Max data size (excluding header) of any page in this chunk uint32_t page_headers_size; //!< Sum of size of all page headers - uint32_t start_row; //!< First row of chunk + size_type start_row; //!< First row of chunk uint32_t num_rows; //!< Number of rows in chunk size_type num_values; //!< Number of values in chunk. Different from num_rows for nested types uint32_t first_fragment; //!< First fragment of chunk @@ -459,18 +484,21 @@ dremel_data get_dremel_data(column_view h_col, /** * @brief Launches kernel for initializing encoder page fragments * + * Based on the number of rows in each fragment, populates the value count, the size of data in the + * fragment, the number of unique values, and the data size of unique values. + * * @param[out] frag Fragment array [column_id][fragment_id] * @param[in] col_desc Column description array [column_id] - * @param[in] num_fragments Number of fragments per column - * @param[in] num_columns Number of columns + * @param[in] partitions Information about partitioning of table + * @param[in] first_frag_in_part A Partition's offset into fragment array * @param[in] fragment_size Number of rows per fragment - * @param[in] num_rows Number of rows per column * @param[in] stream CUDA stream to use */ void InitPageFragments(cudf::detail::device_2dspan frag, device_span col_desc, + device_span partitions, + device_span first_frag_in_part, uint32_t fragment_size, - uint32_t num_rows, rmm::cuda_stream_view stream); /** @@ -498,11 +526,11 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st * @brief Insert chunk values into their respective hash maps * * @param chunks Column chunks [rowgroup][column] - * @param num_rows Number of rows per column + * @param frags Column fragments * @param stream CUDA stream to use */ void populate_chunk_hash_maps(cudf::detail::device_2dspan chunks, - size_type num_rows, + cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -523,11 +551,11 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi * col[row] == col[dict_data[dict_index[row - chunk.start_row]]] * * @param chunks Column chunks [rowgroup][column] - * @param num_rows Number of rows per column + * @param frags Column fragments * @param stream CUDA stream to use */ void get_dictionary_indices(cudf::detail::device_2dspan chunks, - size_type num_rows, + cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index e04c8371df8..aceb3bfbec1 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -40,6 +41,8 @@ #include +#include + #include #include #include @@ -76,6 +79,113 @@ parquet::Compression to_parquet_compression(compression_type 
compression) } // namespace +struct aggregate_metadata { + aggregate_metadata(std::vector const& partitions, + size_type num_columns, + std::vector schema, + statistics_freq stats_granularity, + std::vector> const& kv_md) + : version(1), schema(std::move(schema)), files(partitions.size()) + { + for (size_t i = 0; i < partitions.size(); ++i) { + this->files[i].num_rows = partitions[i].num_rows; + } + this->column_order_listsize = + (stats_granularity != statistics_freq::STATISTICS_NONE) ? num_columns : 0; + + for (size_t p = 0; p < kv_md.size(); ++p) { + std::transform(kv_md[p].begin(), + kv_md[p].end(), + std::back_inserter(this->files[p].key_value_metadata), + [](auto const& kv) { + return KeyValue{kv.first, kv.second}; + }); + } + } + + void update_files(std::vector const& partitions) + { + CUDF_EXPECTS(partitions.size() == this->files.size(), + "New partitions must be same size as previously passed number of partitions"); + for (size_t i = 0; i < partitions.size(); ++i) { + this->files[i].num_rows += partitions[i].num_rows; + } + } + + FileMetaData get_metadata(size_t part) + { + CUDF_EXPECTS(part < files.size(), "Invalid part index queried"); + FileMetaData meta{}; + meta.version = this->version; + meta.schema = this->schema; + meta.num_rows = this->files[part].num_rows; + meta.row_groups = this->files[part].row_groups; + meta.key_value_metadata = this->files[part].key_value_metadata; + meta.created_by = this->created_by; + meta.column_order_listsize = this->column_order_listsize; + return meta; + } + + void set_file_paths(std::vector const& column_chunks_file_path) + { + for (size_t p = 0; p < this->files.size(); ++p) { + auto& file = this->files[p]; + auto const& file_path = column_chunks_file_path[p]; + for (auto& rowgroup : file.row_groups) { + for (auto& col : rowgroup.columns) { + col.file_path = file_path; + } + } + } + } + + FileMetaData get_merged_metadata() + { + FileMetaData merged_md; + for (size_t p = 0; p < this->files.size(); ++p) { + auto& file = this->files[p]; + if (p == 0) { + merged_md = this->get_metadata(0); + } else { + merged_md.row_groups.insert(merged_md.row_groups.end(), + std::make_move_iterator(file.row_groups.begin()), + std::make_move_iterator(file.row_groups.end())); + merged_md.num_rows += file.num_rows; + } + } + return merged_md; + } + + std::vector num_row_groups_per_file() + { + std::vector global_rowgroup_base; + std::transform(this->files.begin(), + this->files.end(), + std::back_inserter(global_rowgroup_base), + [](auto const& part) { return part.row_groups.size(); }); + return global_rowgroup_base; + } + + bool schema_matches(std::vector const& schema) const + { + return this->schema == schema; + } + auto& file(size_t p) { return files[p]; } + size_t num_files() const { return files.size(); } + + private: + int32_t version = 0; + std::vector schema; + struct per_file_metadata { + int64_t num_rows = 0; + std::vector row_groups; + std::vector key_value_metadata; + }; + std::vector files; + std::string created_by = ""; + uint32_t column_order_listsize = 0; +}; + struct linked_column_view; using LinkedColPtr = std::shared_ptr; @@ -736,10 +846,12 @@ gpu::parquet_column_device_view parquet_column_view::get_device_view( void writer::impl::init_page_fragments(cudf::detail::hostdevice_2dvector& frag, device_span col_desc, - uint32_t num_rows, + host_span partitions, + device_span part_frag_offset, uint32_t fragment_size) { - gpu::InitPageFragments(frag, col_desc, fragment_size, num_rows, stream); + auto d_partitions = 
cudf::detail::make_device_uvector_async(partitions, stream); + gpu::InitPageFragments(frag, col_desc, d_partitions, part_frag_offset, fragment_size, stream); frag.device_to_host(stream, true); } @@ -771,7 +883,7 @@ void writer::impl::init_page_sizes(hostdevice_2dvector& chu auto build_chunk_dictionaries(hostdevice_2dvector& chunks, host_span col_desc, - uint32_t num_rows, + device_2dspan frags, rmm::cuda_stream_view stream) { // At this point, we know all chunks and their sizes. We want to allocate dictionaries for each @@ -801,7 +913,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, chunks.host_to_device(stream); gpu::initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); - gpu::populate_chunk_hash_maps(chunks, num_rows, stream); + gpu::populate_chunk_hash_maps(chunks, frags, stream); chunks.device_to_host(stream, true); @@ -850,7 +962,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, } chunks.host_to_device(stream); gpu::collect_map_entries(chunks.device_view().flat_view(), stream); - gpu::get_dictionary_indices(chunks.device_view(), num_rows, stream); + gpu::get_dictionary_indices(chunks.device_view(), frags, stream); return std::make_pair(std::move(dict_data), std::move(dict_index)); } @@ -1016,7 +1128,7 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks stream.synchronize(); } -writer::impl::impl(std::unique_ptr sink, +writer::impl::impl(std::vector> sinks, parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, @@ -1028,8 +1140,9 @@ writer::impl::impl(std::unique_ptr sink, compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), - out_sink_(std::move(sink)), - single_write_mode(mode == SingleWriteMode::YES) + kv_md(options.get_key_value_metadata()), + single_write_mode(mode == SingleWriteMode::YES), + out_sink_(std::move(sinks)) { if (options.get_metadata()) { table_meta = std::make_unique(*options.get_metadata()); @@ -1037,7 +1150,7 @@ writer::impl::impl(std::unique_ptr sink, init_state(); } -writer::impl::impl(std::unique_ptr sink, +writer::impl::impl(std::vector> sinks, chunked_parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, @@ -1049,8 +1162,9 @@ writer::impl::impl(std::unique_ptr sink, compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), + kv_md(options.get_key_value_metadata()), single_write_mode(mode == SingleWriteMode::YES), - out_sink_(std::move(sink)) + out_sink_(std::move(sinks)) { if (options.get_metadata()) { table_meta = std::make_unique(*options.get_metadata()); @@ -1062,19 +1176,21 @@ writer::impl::~impl() { close(); } void writer::impl::init_state() { + current_chunk_offset.resize(out_sink_.size()); // Write file header file_header_s fhdr; fhdr.magic = parquet_magic; - out_sink_->host_write(&fhdr, sizeof(fhdr)); - current_chunk_offset = sizeof(file_header_s); + for (auto& sink : out_sink_) { + sink->host_write(&fhdr, sizeof(fhdr)); + } + std::fill_n(current_chunk_offset.begin(), current_chunk_offset.size(), sizeof(file_header_s)); } -void writer::impl::write(table_view const& table) +void writer::impl::write(table_view const& table, std::vector const& partitions) { + last_write_successful = false; CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); - size_type num_rows = table.num_rows(); 
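// Note: with multiple sinks there is no single table-wide row count to track here; per-file row counts are accumulated from the partition sizes in aggregate_metadata::update_files.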
- if (not table_meta) { table_meta = std::make_unique(table); } // Fill unnamed columns' names in table_meta @@ -1109,25 +1225,15 @@ void writer::impl::write(table_view const& table) std::vector this_table_schema(schema_tree.begin(), schema_tree.end()); - if (md.version == 0) { - md.version = 1; - md.num_rows = num_rows; - md.column_order_listsize = - (stats_granularity_ != statistics_freq::STATISTICS_NONE) ? num_columns : 0; - std::transform(table_meta->user_data.begin(), - table_meta->user_data.end(), - std::back_inserter(md.key_value_metadata), - [](auto const& kv) { - return KeyValue{kv.first, kv.second}; - }); - md.schema = this_table_schema; + if (!md) { + md = std::make_unique( + partitions, num_columns, std::move(this_table_schema), stats_granularity_, kv_md); } else { // verify the user isn't passing mismatched tables - CUDF_EXPECTS(md.schema == this_table_schema, + CUDF_EXPECTS(md->schema_matches(this_table_schema), "Mismatch in schema between multiple calls to write_chunk"); - // increment num rows - md.num_rows += num_rows; + md->update_files(partitions); } // Create table_device_view so that corresponding column_device_view data // can be written into col_desc members @@ -1149,7 +1255,22 @@ void writer::impl::write(table_view const& table) // compression/decompression performance). using cudf::io::parquet::gpu::max_page_fragment_size; - size_type const num_fragments = (num_rows + max_page_fragment_size - 1) / max_page_fragment_size; + std::vector num_frag_in_part; + std::transform(partitions.begin(), + partitions.end(), + std::back_inserter(num_frag_in_part), + [](auto const& part) { + return util::div_rounding_up_unsafe(part.num_rows, max_page_fragment_size); + }); + + size_type num_fragments = std::reduce(num_frag_in_part.begin(), num_frag_in_part.end()); + + std::vector part_frag_offset; // Store the idx of the first fragment in each partition + std::exclusive_scan( + num_frag_in_part.begin(), num_frag_in_part.end(), std::back_inserter(part_frag_offset), 0); + part_frag_offset.push_back(part_frag_offset.back() + num_frag_in_part.back()); + + auto d_part_frag_offset = cudf::detail::make_device_uvector_async(part_frag_offset, stream); cudf::detail::hostdevice_2dvector fragments( num_columns, num_fragments, stream); @@ -1159,36 +1280,50 @@ void writer::impl::write(table_view const& table) leaf_column_views = create_leaf_column_device_views( col_desc, *parent_column_table_device_view, stream); - init_page_fragments(fragments, col_desc, num_rows, max_page_fragment_size); + init_page_fragments( + fragments, col_desc, partitions, d_part_frag_offset, max_page_fragment_size); } - auto const global_rowgroup_base = static_cast(md.row_groups.size()); + std::vector const global_rowgroup_base = md->num_row_groups_per_file(); // Decide row group boundaries based on uncompressed data size - auto rowgroup_size = 0ul; - auto num_rowgroups = 0; - for (auto f = 0, global_r = global_rowgroup_base, rowgroup_start = 0; f < num_fragments; f++) { - auto fragment_data_size = 0ul; - // Replace with STL algorithm to transform and sum - for (auto i = 0; i < num_columns; i++) { - fragment_data_size += fragments[i][f].fragment_data_size; - } - if (f > rowgroup_start && - (rowgroup_size + fragment_data_size > max_row_group_size || - (f + 1 - rowgroup_start) * max_page_fragment_size > max_row_group_rows)) { - // update schema - md.row_groups.resize(md.row_groups.size() + 1); - md.row_groups[global_r++].num_rows = (f - rowgroup_start) * max_page_fragment_size; - num_rowgroups++; - rowgroup_start = f; - 
rowgroup_size = 0; - } - rowgroup_size += fragment_data_size; - if (f + 1 == num_fragments) { - // update schema - md.row_groups.resize(md.row_groups.size() + 1); - md.row_groups[global_r++].num_rows = num_rows - rowgroup_start * max_page_fragment_size; - num_rowgroups++; + int num_rowgroups = 0; + + std::vector num_rg_in_part(partitions.size()); + for (size_t p = 0; p < partitions.size(); ++p) { + size_type curr_rg_num_rows = 0; + size_t curr_rg_data_size = 0; + int first_frag_in_rg = part_frag_offset[p]; + int last_frag_in_part = part_frag_offset[p + 1] - 1; + for (auto f = first_frag_in_rg; f <= last_frag_in_part; ++f) { + size_t fragment_data_size = 0; + for (auto c = 0; c < num_columns; c++) { + fragment_data_size += fragments[c][f].fragment_data_size; + } + size_type fragment_num_rows = fragments[0][f].num_rows; + + // If the fragment size gets larger than rg limit then break off a rg + if (f > first_frag_in_rg && // There has to be at least one fragment in row group + (curr_rg_data_size + fragment_data_size > max_row_group_size || + curr_rg_num_rows + fragment_num_rows > max_row_group_rows)) { + auto& rg = md->file(p).row_groups.emplace_back(); + rg.num_rows = curr_rg_num_rows; + num_rowgroups++; + num_rg_in_part[p]++; + curr_rg_num_rows = 0; + curr_rg_data_size = 0; + first_frag_in_rg = f; + } + curr_rg_num_rows += fragment_num_rows; + curr_rg_data_size += fragment_data_size; + + // TODO: (wishful) refactor to consolidate with above if block + if (f == last_frag_in_part) { + auto& rg = md->file(p).row_groups.emplace_back(); + rg.num_rows = curr_rg_num_rows; + num_rowgroups++; + num_rg_in_part[p]++; + } } } @@ -1196,58 +1331,79 @@ void writer::impl::write(table_view const& table) rmm::device_uvector frag_stats(0, stream); if (stats_granularity_ != statistics_freq::STATISTICS_NONE) { frag_stats.resize(num_fragments * num_columns, stream); - if (frag_stats.size() != 0) { + if (not frag_stats.is_empty()) { auto frag_stats_2dview = device_2dspan(frag_stats.data(), num_columns, num_fragments); gather_fragment_statistics(frag_stats_2dview, fragments, col_desc, num_fragments); } } + + std::vector first_rg_in_part; + std::exclusive_scan( + num_rg_in_part.begin(), num_rg_in_part.end(), std::back_inserter(first_rg_in_part), 0); + // Initialize row groups and column chunks auto const num_chunks = num_rowgroups * num_columns; hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); - for (auto r = 0, global_r = global_rowgroup_base, f = 0, start_row = 0; r < num_rowgroups; - r++, global_r++) { - size_type const fragments_in_chunk = - (md.row_groups[global_r].num_rows + max_page_fragment_size - 1) / max_page_fragment_size; - md.row_groups[global_r].total_byte_size = 0; - md.row_groups[global_r].columns.resize(num_columns); - for (auto i = 0; i < num_columns; i++) { - gpu::EncColumnChunk* ck = &chunks[r][i]; - - *ck = {}; - ck->col_desc = col_desc.device_ptr() + i; - ck->col_desc_id = i; - ck->fragments = &fragments.device_view()[i][f]; - ck->stats = (frag_stats.size() != 0) ? 
frag_stats.data() + i * num_fragments + f : nullptr; - ck->start_row = start_row; - ck->num_rows = (uint32_t)md.row_groups[global_r].num_rows; - ck->first_fragment = i * num_fragments + f; - auto chunk_fragments = fragments[i].subspan(f, fragments_in_chunk); - ck->num_values = - std::accumulate(chunk_fragments.begin(), chunk_fragments.end(), 0, [](uint32_t l, auto r) { - return l + r.num_values; - }); - ck->plain_data_size = std::accumulate( - chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, gpu::PageFragment frag) { - return sum + frag.fragment_data_size; - }); - md.row_groups[global_r].columns[i].meta_data.type = parquet_columns[i].physical_type(); - md.row_groups[global_r].columns[i].meta_data.encodings = {Encoding::PLAIN, Encoding::RLE}; - md.row_groups[global_r].columns[i].meta_data.path_in_schema = - parquet_columns[i].get_path_in_schema(); - md.row_groups[global_r].columns[i].meta_data.codec = UNCOMPRESSED; - md.row_groups[global_r].columns[i].meta_data.num_values = ck->num_values; + + for (size_t p = 0; p < partitions.size(); ++p) { + int f = part_frag_offset[p]; + size_type start_row = partitions[p].start_row; + for (int r = 0; r < num_rg_in_part[p]; r++) { + size_t global_r = global_rowgroup_base[p] + r; // Number of rowgroups already in file/part + auto& row_group = md->file(p).row_groups[global_r]; + uint32_t fragments_in_chunk = + util::div_rounding_up_unsafe(row_group.num_rows, max_page_fragment_size); + row_group.total_byte_size = 0; + row_group.columns.resize(num_columns); + for (int c = 0; c < num_columns; c++) { + gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + + ck = {}; + ck.col_desc = col_desc.device_ptr() + c; + ck.col_desc_id = c; + ck.fragments = &fragments.device_view()[c][f]; + ck.stats = + (not frag_stats.is_empty()) ? frag_stats.data() + c * num_fragments + f : nullptr; + ck.start_row = start_row; + ck.num_rows = (uint32_t)row_group.num_rows; + ck.first_fragment = c * num_fragments + f; + auto chunk_fragments = fragments[c].subspan(f, fragments_in_chunk); + // In fragment struct, add a pointer to the chunk it belongs to + // In each fragment in chunk_fragments, update the chunk pointer here. 
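+ // frag.chunk must point at the device copy of the chunk (chunks.device_view()), because populate_chunk_hash_maps and get_dictionary_indices dereference it in device code.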
+ for (auto& frag : chunk_fragments) { + frag.chunk = &chunks.device_view()[r + first_rg_in_part[p]][c]; + } + ck.num_values = std::accumulate( + chunk_fragments.begin(), chunk_fragments.end(), 0, [](uint32_t l, auto r) { + return l + r.num_values; + }); + ck.plain_data_size = std::accumulate( + chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, gpu::PageFragment frag) { + return sum + frag.fragment_data_size; + }); + auto& column_chunk_meta = row_group.columns[c].meta_data; + column_chunk_meta.type = parquet_columns[c].physical_type(); + column_chunk_meta.encodings = {Encoding::PLAIN, Encoding::RLE}; + column_chunk_meta.path_in_schema = parquet_columns[c].get_path_in_schema(); + column_chunk_meta.codec = UNCOMPRESSED; + column_chunk_meta.num_values = ck.num_values; + } + f += fragments_in_chunk; + start_row += (uint32_t)row_group.num_rows; } - f += fragments_in_chunk; - start_row += (uint32_t)md.row_groups[global_r].num_rows; } - auto dict_info_owner = build_chunk_dictionaries(chunks, col_desc, num_rows, stream); - for (auto rg = 0, global_rg = global_rowgroup_base; rg < num_rowgroups; rg++, global_rg++) { - for (auto col = 0; col < num_columns; col++) { - if (chunks.host_view()[rg][col].use_dictionary) { - md.row_groups[global_rg].columns[col].meta_data.encodings.push_back( - Encoding::PLAIN_DICTIONARY); + fragments.host_to_device(stream); + auto dict_info_owner = build_chunk_dictionaries(chunks, col_desc, fragments, stream); + for (size_t p = 0; p < partitions.size(); p++) { + for (int rg = 0; rg < num_rg_in_part[p]; rg++) { + size_t global_rg = global_rowgroup_base[p] + rg; + for (int col = 0; col < num_columns; col++) { + if (chunks.host_view()[rg][col].use_dictionary) { + md->file(p).row_groups[global_rg].columns[col].meta_data.encodings.push_back( + Encoding::PLAIN_DICTIONARY); + } } } } @@ -1272,6 +1428,12 @@ void writer::impl::write(table_view const& table) "Error in getting compressed size from nvcomp"); } + // Find which partition a rg belongs to + std::vector rg_to_part; + for (size_t p = 0; p < num_rg_in_part.size(); ++p) { + std::fill_n(std::back_inserter(rg_to_part), num_rg_in_part[p], p); + } + // Initialize batches of rowgroups to encode (mainly to limit peak memory usage) std::vector batch_list; size_type num_pages = 0; @@ -1335,11 +1497,11 @@ void writer::impl::write(table_view const& table) auto bfr_c = static_cast(comp_bfr.data()); for (auto j = 0; j < batch_list[b]; j++, r++) { for (auto i = 0; i < num_columns; i++) { - gpu::EncColumnChunk* ck = &chunks[r][i]; - ck->uncompressed_bfr = bfr; - ck->compressed_bfr = bfr_c; - bfr += ck->bfr_size; - bfr_c += ck->compressed_size; + gpu::EncColumnChunk& ck = chunks[r][i]; + ck.uncompressed_bfr = bfr; + ck.compressed_bfr = bfr_c; + bfr += ck.bfr_size; + bfr_c += ck.compressed_size; } } } @@ -1359,9 +1521,7 @@ void writer::impl::write(table_view const& table) pinned_buffer host_bfr{nullptr, cudaFreeHost}; // Encode row groups in batches - for (auto b = 0, r = 0, global_r = global_rowgroup_base; - b < static_cast(batch_list.size()); - b++) { + for (auto b = 0, r = 0; b < static_cast(batch_list.size()); b++) { // Count pages in this batch auto const rnext = r + batch_list[b]; auto const first_page_in_batch = chunks[r][0].first_page; @@ -1381,30 +1541,33 @@ void writer::impl::write(table_view const& table) (stats_granularity_ != statistics_freq::STATISTICS_NONE) ? 
page_stats.data() + num_pages : nullptr); std::vector> write_tasks; - for (; r < rnext; r++, global_r++) { + for (; r < rnext; r++) { + int p = rg_to_part[r]; + int global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; + auto& row_group = md->file(p).row_groups[global_r]; for (auto i = 0; i < num_columns; i++) { - gpu::EncColumnChunk* ck = &chunks[r][i]; + gpu::EncColumnChunk& ck = chunks[r][i]; + auto& column_chunk_meta = row_group.columns[i].meta_data; uint8_t* dev_bfr; - if (ck->is_compressed) { - md.row_groups[global_r].columns[i].meta_data.codec = compression_; - dev_bfr = ck->compressed_bfr; + if (ck.is_compressed) { + column_chunk_meta.codec = compression_; + dev_bfr = ck.compressed_bfr; } else { - dev_bfr = ck->uncompressed_bfr; + dev_bfr = ck.uncompressed_bfr; } - if (out_sink_->is_device_write_preferred(ck->compressed_size)) { + if (out_sink_[p]->is_device_write_preferred(ck.compressed_size)) { // let the writer do what it wants to retrieve the data from the gpu. - write_tasks.push_back( - out_sink_->device_write_async(dev_bfr + ck->ck_stat_size, ck->compressed_size, stream)); + write_tasks.push_back(out_sink_[p]->device_write_async( + dev_bfr + ck.ck_stat_size, ck.compressed_size, stream)); // we still need to do a (much smaller) memcpy for the statistics. - if (ck->ck_stat_size != 0) { - md.row_groups[global_r].columns[i].meta_data.statistics_blob.resize(ck->ck_stat_size); - CUDA_TRY( - cudaMemcpyAsync(md.row_groups[global_r].columns[i].meta_data.statistics_blob.data(), - dev_bfr, - ck->ck_stat_size, - cudaMemcpyDeviceToHost, - stream.value())); + if (ck.ck_stat_size != 0) { + column_chunk_meta.statistics_blob.resize(ck.ck_stat_size); + CUDA_TRY(cudaMemcpyAsync(column_chunk_meta.statistics_blob.data(), + dev_bfr, + ck.ck_stat_size, + cudaMemcpyDeviceToHost, + stream.value())); stream.synchronize(); } } else { @@ -1419,86 +1582,91 @@ void writer::impl::write(table_view const& table) // copy the full data CUDA_TRY(cudaMemcpyAsync(host_bfr.get(), dev_bfr, - ck->ck_stat_size + ck->compressed_size, + ck.ck_stat_size + ck.compressed_size, cudaMemcpyDeviceToHost, stream.value())); stream.synchronize(); - out_sink_->host_write(host_bfr.get() + ck->ck_stat_size, ck->compressed_size); - if (ck->ck_stat_size != 0) { - md.row_groups[global_r].columns[i].meta_data.statistics_blob.resize(ck->ck_stat_size); - memcpy(md.row_groups[global_r].columns[i].meta_data.statistics_blob.data(), - host_bfr.get(), - ck->ck_stat_size); + out_sink_[p]->host_write(host_bfr.get() + ck.ck_stat_size, ck.compressed_size); + if (ck.ck_stat_size != 0) { + column_chunk_meta.statistics_blob.resize(ck.ck_stat_size); + memcpy(column_chunk_meta.statistics_blob.data(), host_bfr.get(), ck.ck_stat_size); } } - md.row_groups[global_r].total_byte_size += ck->compressed_size; - md.row_groups[global_r].columns[i].meta_data.data_page_offset = - current_chunk_offset + ((ck->use_dictionary) ? ck->dictionary_size : 0); - md.row_groups[global_r].columns[i].meta_data.dictionary_page_offset = - (ck->use_dictionary) ? current_chunk_offset : 0; - md.row_groups[global_r].columns[i].meta_data.total_uncompressed_size = ck->bfr_size; - md.row_groups[global_r].columns[i].meta_data.total_compressed_size = ck->compressed_size; - current_chunk_offset += ck->compressed_size; + row_group.total_byte_size += ck.compressed_size; + column_chunk_meta.data_page_offset = + current_chunk_offset[p] + ((ck.use_dictionary) ? ck.dictionary_size : 0); + column_chunk_meta.dictionary_page_offset = + (ck.use_dictionary) ? 
current_chunk_offset[p] : 0; + column_chunk_meta.total_uncompressed_size = ck.bfr_size; + column_chunk_meta.total_compressed_size = ck.compressed_size; + current_chunk_offset[p] += ck.compressed_size; } } for (auto const& task : write_tasks) { task.wait(); } } + last_write_successful = true; } std::unique_ptr> writer::impl::close( - std::string const& column_chunks_file_path) + std::vector const& column_chunks_file_path) { if (closed) { return nullptr; } closed = true; - CompactProtocolWriter cpw(&buffer_); - file_ender_s fendr; - buffer_.resize(0); - fendr.footer_len = static_cast(cpw.write(md)); - fendr.magic = parquet_magic; - out_sink_->host_write(buffer_.data(), buffer_.size()); - out_sink_->host_write(&fendr, sizeof(fendr)); - out_sink_->flush(); + if (not last_write_successful) { return nullptr; } + for (size_t p = 0; p < out_sink_.size(); p++) { + std::vector buffer; + CompactProtocolWriter cpw(&buffer); + file_ender_s fendr; + buffer.resize(0); + fendr.footer_len = static_cast(cpw.write(md->get_metadata(p))); + fendr.magic = parquet_magic; + out_sink_[p]->host_write(buffer.data(), buffer.size()); + out_sink_[p]->host_write(&fendr, sizeof(fendr)); + out_sink_[p]->flush(); + } // Optionally output raw file metadata with the specified column chunk file path - if (column_chunks_file_path.length() > 0) { + if (column_chunks_file_path.size() > 0) { + CUDF_EXPECTS(column_chunks_file_path.size() == md->num_files(), + "Expected one column chunk path per output file"); + md->set_file_paths(column_chunks_file_path); file_header_s fhdr = {parquet_magic}; - buffer_.resize(0); - buffer_.insert(buffer_.end(), - reinterpret_cast(&fhdr), - reinterpret_cast(&fhdr) + sizeof(fhdr)); - for (auto& rowgroup : md.row_groups) { - for (auto& col : rowgroup.columns) { - col.file_path = column_chunks_file_path; - } - } - fendr.footer_len = static_cast(cpw.write(md)); - buffer_.insert(buffer_.end(), - reinterpret_cast(&fendr), - reinterpret_cast(&fendr) + sizeof(fendr)); - return std::make_unique>(std::move(buffer_)); + std::vector buffer; + CompactProtocolWriter cpw(&buffer); + buffer.insert(buffer.end(), + reinterpret_cast(&fhdr), + reinterpret_cast(&fhdr) + sizeof(fhdr)); + file_ender_s fendr; + fendr.magic = parquet_magic; + fendr.footer_len = static_cast(cpw.write(md->get_merged_metadata())); + buffer.insert(buffer.end(), + reinterpret_cast(&fendr), + reinterpret_cast(&fendr) + sizeof(fendr)); + return std::make_unique>(std::move(buffer)); } else { return {nullptr}; } + return nullptr; } // Forward to implementation -writer::writer(std::unique_ptr sink, +writer::writer(std::vector> sinks, parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : _impl(std::make_unique(std::move(sink), options, mode, stream, mr)) + : _impl(std::make_unique(std::move(sinks), options, mode, stream, mr)) { } -writer::writer(std::unique_ptr sink, +writer::writer(std::vector> sinks, chunked_parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : _impl(std::make_unique(std::move(sink), options, mode, stream, mr)) + : _impl(std::make_unique(std::move(sinks), options, mode, stream, mr)) { } @@ -1506,16 +1674,21 @@ writer::writer(std::unique_ptr sink, writer::~writer() = default; // Forward to implementation -void writer::write(table_view const& table) { _impl->write(table); } +void writer::write(table_view const& table, std::vector const& partitions) +{ + _impl->write( + table, 
partitions.empty() ? std::vector{{0, table.num_rows()}} : partitions); +} // Forward to implementation -std::unique_ptr> writer::close(std::string const& column_chunks_file_path) +std::unique_ptr> writer::close( + std::vector const& column_chunks_file_path) { return _impl->close(column_chunks_file_path); } std::unique_ptr> writer::merge_row_group_metadata( - const std::vector>>& metadata_list) + std::vector>> const& metadata_list) { std::vector output; CompactProtocolWriter cpw(&output); diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 9188218f607..1cefb91c904 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -45,6 +45,7 @@ namespace detail { namespace parquet { // Forward internal classes struct parquet_column_view; +struct aggregate_metadata; using namespace cudf::io::parquet; using namespace cudf::io; @@ -60,13 +61,13 @@ class writer::impl { /** * @brief Constructor with writer options. * - * @param sink data_sink for storing dataset + * @param sink data_sink's for storing dataset * @param options Settings for controlling behavior * @param mode Option to write at once or in chunks * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit impl(std::unique_ptr sink, + explicit impl(std::vector> sinks, parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, @@ -75,13 +76,13 @@ class writer::impl { /** * @brief Constructor with chunked writer options. * - * @param sink data_sink for storing dataset + * @param sink data_sink's for storing dataset * @param options Settings for controlling behavior * @param mode Option to write at once or in chunks * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit impl(std::unique_ptr sink, + explicit impl(std::vector> sinks, chunked_parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, @@ -102,8 +103,10 @@ class writer::impl { * normally used for chunked writing. * * @param[in] table The table information to be written + * @param[in] partitions Optional partitions to divide the table into. If specified, must be same + * size as number of sinks. */ - void write(table_view const& table); + void write(table_view const& table, std::vector const& partitions); /** * @brief Finishes the chunked/streamed write process. @@ -112,7 +115,8 @@ class writer::impl { * @return A parquet-compatible blob that contains the data for all rowgroups in the list only if * `column_chunks_file_path` is provided, else null. 
*/ - std::unique_ptr> close(std::string const& column_chunks_file_path = ""); + std::unique_ptr> close( + std::vector const& column_chunks_file_path = {}); private: /** @@ -120,12 +124,14 @@ class writer::impl { * * @param frag Destination page fragments * @param col_desc column description array - * @param num_rows Total number of rows + * @param[in] partitions Information about partitioning of table + * @param[in] part_frag_offset A Partition's offset into fragment array * @param fragment_size Number of rows per fragment */ void init_page_fragments(hostdevice_2dvector& frag, device_span col_desc, - uint32_t num_rows, + host_span partitions, + device_span part_frag_offset, uint32_t fragment_size); /** @@ -208,19 +214,22 @@ class writer::impl { statistics_freq stats_granularity_ = statistics_freq::STATISTICS_NONE; bool int96_timestamps = false; // Overall file metadata. Filled in during the process and written during write_chunked_end() - cudf::io::parquet::FileMetaData md; + std::unique_ptr md; + // File footer key-value metadata. Written during write_chunked_end() + std::vector> kv_md; // optional user metadata std::unique_ptr table_meta; // to track if the output has been written to sink bool closed = false; + // To track if the last write(table) call completed successfully + bool last_write_successful = false; // current write position for rowgroups/chunks - std::size_t current_chunk_offset; + std::vector current_chunk_offset; // special parameter only used by detail::write() to indicate that we are guaranteeing // a single table write. this enables some internal optimizations. bool const single_write_mode = true; - std::vector buffer_; - std::unique_ptr out_sink_; + std::vector> out_sink_; }; } // namespace parquet diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index c376accd1ff..75ff39cbe70 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -838,13 +838,13 @@ TEST_F(ParquetWriterTest, MultiIndex) expected_metadata.column_metadata[2].set_name("int32s"); expected_metadata.column_metadata[3].set_name("floats"); expected_metadata.column_metadata[4].set_name("doubles"); - expected_metadata.user_data.insert( - {"pandas", "\"index_columns\": [\"int8s\", \"int16s\"], \"column1\": [\"int32s\"]"}); auto filepath = temp_env->get_temp_filepath("MultiIndex.parquet"); cudf_io::parquet_writer_options out_opts = cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) - .metadata(&expected_metadata); + .metadata(&expected_metadata) + .key_value_metadata( + {{{"pandas", "\"index_columns\": [\"int8s\", \"int16s\"], \"column1\": [\"int32s\"]"}}}); cudf_io::write_parquet(out_opts); cudf_io::parquet_reader_options in_opts = @@ -1174,6 +1174,100 @@ TEST_F(ParquetWriterTest, DeviceWriteLargeishFile) auto custom_tbl = cudf_io::read_parquet(custom_args); CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); } + +TEST_F(ParquetWriterTest, PartitionedWrite) +{ + auto source = create_compressible_fixed_table(16, 4 * 1024 * 1024, 1000, false); + + auto filepath1 = temp_env->get_temp_filepath("PartitionedWrite1.parquet"); + auto filepath2 = temp_env->get_temp_filepath("PartitionedWrite2.parquet"); + + auto partition1 = cudf::io::partition_info{10, 1024 * 1024}; + auto partition2 = cudf::io::partition_info{20 * 1024 + 7, 3 * 1024 * 1024}; + + auto expected1 = + cudf::slice(*source, {partition1.start_row, partition1.start_row + partition1.num_rows}); + auto expected2 = + cudf::slice(*source, 
{partition2.start_row, partition2.start_row + partition2.num_rows}); + + cudf_io::parquet_writer_options args = + cudf_io::parquet_writer_options::builder( + cudf_io::sink_info(std::vector{filepath1, filepath2}), *source) + .partitions({partition1, partition2}) + .compression(cudf_io::compression_type::NONE); + cudf_io::write_parquet(args); + + auto result1 = cudf_io::read_parquet( + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath1))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected1, result1.tbl->view()); + + auto result2 = cudf_io::read_parquet( + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath2))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected2, result2.tbl->view()); +} + +TEST_F(ParquetWriterTest, PartitionedWriteEmptyPartitions) +{ + auto source = create_random_fixed_table(4, 4, false); + + auto filepath1 = temp_env->get_temp_filepath("PartitionedWrite1.parquet"); + auto filepath2 = temp_env->get_temp_filepath("PartitionedWrite2.parquet"); + + auto partition1 = cudf::io::partition_info{1, 0}; + auto partition2 = cudf::io::partition_info{1, 0}; + + auto expected1 = + cudf::slice(*source, {partition1.start_row, partition1.start_row + partition1.num_rows}); + auto expected2 = + cudf::slice(*source, {partition2.start_row, partition2.start_row + partition2.num_rows}); + + cudf_io::parquet_writer_options args = + cudf_io::parquet_writer_options::builder( + cudf_io::sink_info(std::vector{filepath1, filepath2}), *source) + .partitions({partition1, partition2}) + .compression(cudf_io::compression_type::NONE); + cudf_io::write_parquet(args); + + auto result1 = cudf_io::read_parquet( + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath1))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected1, result1.tbl->view()); + + auto result2 = cudf_io::read_parquet( + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath2))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected2, result2.tbl->view()); +} + +TEST_F(ParquetWriterTest, PartitionedWriteEmptyColumns) +{ + auto source = create_random_fixed_table(0, 4, false); + + auto filepath1 = temp_env->get_temp_filepath("PartitionedWrite1.parquet"); + auto filepath2 = temp_env->get_temp_filepath("PartitionedWrite2.parquet"); + + auto partition1 = cudf::io::partition_info{1, 0}; + auto partition2 = cudf::io::partition_info{1, 0}; + + auto expected1 = + cudf::slice(*source, {partition1.start_row, partition1.start_row + partition1.num_rows}); + auto expected2 = + cudf::slice(*source, {partition2.start_row, partition2.start_row + partition2.num_rows}); + + cudf_io::parquet_writer_options args = + cudf_io::parquet_writer_options::builder( + cudf_io::sink_info(std::vector{filepath1, filepath2}), *source) + .partitions({partition1, partition2}) + .compression(cudf_io::compression_type::NONE); + cudf_io::write_parquet(args); + + auto result1 = cudf_io::read_parquet( + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath1))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected1, result1.tbl->view()); + + auto result2 = cudf_io::read_parquet( + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath2))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected2, result2.tbl->view()); +} + template std::string create_parquet_file(int num_cols) { @@ -1305,7 +1399,7 @@ TEST_F(ParquetChunkedWriterTest, ManyTables) std::for_each(table_views.begin(), table_views.end(), [&writer](table_view const& tbl) { writer.write(tbl); }); - auto md = writer.close("dummy/path"); + auto md = writer.close({"dummy/path"}); 
CUDF_EXPECTS(md, "The returned metadata should not be null."); cudf_io::parquet_reader_options read_opts = diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index 2fc71f64df1..e5a8bb926c1 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport uint8_t from libcpp cimport bool +from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector @@ -85,6 +86,7 @@ cdef extern from "cudf/io/orc.hpp" \ void set_row_index_stride(size_type val) except+ void set_table(cudf_table_view.table_view tbl) except+ void set_metadata(cudf_io_types.table_input_metadata* meta) except+ + void set_key_value_metadata(map[string, string] kvm) except + @staticmethod orc_writer_options_builder builder( @@ -107,6 +109,9 @@ cdef extern from "cudf/io/orc.hpp" \ orc_writer_options_builder& metadata( cudf_io_types.table_input_metadata *meta ) except+ + orc_writer_options_builder& key_value_metadata( + map[string, string] kvm + ) except+ orc_writer_options build() except+ @@ -134,6 +139,7 @@ cdef extern from "cudf/io/orc.hpp" \ void set_metadata( cudf_io_types.table_input_metadata* meta ) except+ + void set_key_value_metadata(map[string, string] kvm) except + @staticmethod chunked_orc_writer_options_builder builder( @@ -155,6 +161,9 @@ cdef extern from "cudf/io/orc.hpp" \ chunked_orc_writer_options_builder& metadata( cudf_io_types.table_input_metadata *meta ) except+ + chunked_orc_writer_options_builder& key_value_metadata( + map[string, string] kvm + ) except+ chunked_orc_writer_options build() except+ diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index 9d95dce83bc..60be608d997 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -73,21 +73,24 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_io_types.statistics_freq get_stats_level() except + cudf_table_view.table_view get_table() except + const cudf_io_types.table_input_metadata get_metadata() except + - string get_column_chunks_file_path() except+ + string get_column_chunks_file_paths() except+ size_t get_row_group_size_bytes() except+ size_type get_row_group_size_rows() except+ void set_metadata( cudf_io_types.table_input_metadata *m ) except + + void set_key_value_metadata( + vector[map[string, string]] kvm + ) except + void set_stats_level( cudf_io_types.statistics_freq sf ) except + void set_compression( cudf_io_types.compression_type compression ) except + - void set_column_chunks_file_path( - string column_chunks_file_path + void set_column_chunks_file_paths( + vector[string] column_chunks_file_paths ) except + void set_row_group_size_bytes(size_t val) except+ void set_row_group_size_rows(size_type val) except+ @@ -108,14 +111,17 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_writer_options_builder& metadata( cudf_io_types.table_input_metadata *m ) except + + parquet_writer_options_builder& key_value_metadata( + vector[map[string, string]] kvm + ) except + parquet_writer_options_builder& stats_level( cudf_io_types.statistics_freq sf ) except + parquet_writer_options_builder& compression( cudf_io_types.compression_type compression ) except + - parquet_writer_options_builder& column_chunks_file_path( - string column_chunks_file_path + parquet_writer_options_builder& column_chunks_file_paths( + vector[string] 
column_chunks_file_paths ) except + parquet_writer_options_builder& int96_timestamps( bool enabled @@ -146,6 +152,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_metadata( cudf_io_types.table_input_metadata *m ) except + + void set_key_value_metadata( + vector[map[string, string]] kvm + ) except + void set_stats_level( cudf_io_types.statistics_freq sf ) except + @@ -168,6 +177,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: chunked_parquet_writer_options_builder& metadata( cudf_io_types.table_input_metadata *m ) except + + chunked_parquet_writer_options_builder& key_value_metadata( + vector[map[string, string]] kvm + ) except + chunked_parquet_writer_options_builder& stats_level( cudf_io_types.statistics_freq sf ) except + @@ -190,7 +202,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_table_view.table_view table_, ) except+ unique_ptr[vector[uint8_t]] close( - string column_chunks_file_path, + vector[string] column_chunks_file_paths, ) except+ cdef unique_ptr[vector[uint8_t]] merge_row_group_metadata( diff --git a/python/cudf/cudf/_lib/cpp/io/types.pxd b/python/cudf/cudf/_lib/cpp/io/types.pxd index 6b68902d22f..40a056b46e0 100644 --- a/python/cudf/cudf/_lib/cpp/io/types.pxd +++ b/python/cudf/cudf/_lib/cpp/io/types.pxd @@ -70,13 +70,8 @@ cdef extern from "cudf/io/types.hpp" \ cdef cppclass table_input_metadata: table_input_metadata() except + table_input_metadata(const cudf_table_view.table_view& table) except + - table_input_metadata( - const cudf_table_view.table_view& table, - map[string, string] user_data - ) except + vector[column_in_metadata] column_metadata - map[string, string] user_data cdef cppclass host_buffer: const char* data @@ -87,8 +82,8 @@ cdef extern from "cudf/io/types.hpp" \ cdef cppclass source_info: io_type type - vector[string] filepaths - vector[host_buffer] buffers + const vector[string]& filepaths() except + + const vector[host_buffer]& buffers() except + vector[shared_ptr[CRandomAccessFile]] files source_info() except + @@ -98,9 +93,9 @@ cdef extern from "cudf/io/types.hpp" \ cdef cppclass sink_info: io_type type - string filepath - vector[char] * buffer - data_sink * user_sink + const vector[string]& filepaths() + const vector[vector[char] *]& buffers() + const vector[data_sink *]& user_sinks() sink_info() except + sink_info(string file_path) except + diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 9a4bd8652da..bf761c30bc8 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -3,6 +3,7 @@ import cudf from libcpp cimport bool, int +from libcpp.map cimport map from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -311,10 +312,9 @@ cdef class ORCWriter: chunked_orc_writer_options anb creates a writer""" cdef table_view tv - # Set the table_metadata num_index_cols_meta = 0 self.tbl_meta = make_unique[table_input_metadata]( - table_view_from_table(table, ignore_index=True) + table_view_from_table(table, ignore_index=True), ) if self.index is not False: if isinstance(table._index, cudf.core.multiindex.MultiIndex): @@ -340,15 +340,16 @@ cdef class ORCWriter: table[name]._column, self.tbl_meta.get().column_metadata[i] ) + cdef map[string, string] user_data pandas_metadata = generate_pandas_metadata(table, self.index) - self.tbl_meta.get().user_data[str.encode("pandas")] = \ - str.encode(pandas_metadata) + user_data[str.encode("pandas")] = 
str.encode(pandas_metadata) cdef chunked_orc_writer_options args with nogil: args = move( chunked_orc_writer_options.builder(self.sink) .metadata(self.tbl_meta.get()) + .key_value_metadata(move(user_data)) .compression(self.comp_type) .enable_statistics(self.enable_stats) .build() diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d17184685fa..955324778fd 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -296,7 +296,7 @@ cpdef write_parquet( # Create the write options cdef unique_ptr[table_input_metadata] tbl_meta - cdef map[string, string] user_data + cdef vector[map[string, string]] user_data cdef table_view tv cdef unique_ptr[cudf_io_types.data_sink] _data_sink cdef cudf_io_types.sink_info sink = make_sink_info(path, _data_sink) @@ -328,30 +328,29 @@ cpdef write_parquet( ) pandas_metadata = generate_pandas_metadata(table, index) - user_data[str.encode("pandas")] = str.encode(pandas_metadata) - - # Set the table_metadata - tbl_meta.get().user_data = user_data + user_data.resize(1) + user_data.back()[str.encode("pandas")] = str.encode(pandas_metadata) cdef cudf_io_types.compression_type comp_type = _get_comp_type(compression) cdef cudf_io_types.statistics_freq stat_freq = _get_stat_freq(statistics) cdef unique_ptr[vector[uint8_t]] out_metadata_c - cdef string c_column_chunks_file_path + cdef vector[string] c_column_chunks_file_paths cdef bool _int96_timestamps = int96_timestamps - if metadata_file_path is not None: - c_column_chunks_file_path = str.encode(metadata_file_path) # Perform write cdef parquet_writer_options args = move( parquet_writer_options.builder(sink, tv) .metadata(tbl_meta.get()) + .key_value_metadata(move(user_data)) .compression(comp_type) .stats_level(stat_freq) - .column_chunks_file_path(c_column_chunks_file_path) .int96_timestamps(_int96_timestamps) .build() ) + if metadata_file_path is not None: + c_column_chunks_file_paths.push_back(str.encode(metadata_file_path)) + args.set_column_chunks_file_paths(move(c_column_chunks_file_paths)) if row_group_size_bytes is not None: args.set_row_group_size_bytes(row_group_size_bytes) if row_group_size_rows is not None: @@ -413,18 +412,18 @@ cdef class ParquetWriter: def close(self, object metadata_file_path=None): cdef unique_ptr[vector[uint8_t]] out_metadata_c - cdef string column_chunks_file_path + cdef vector[string] column_chunks_file_paths if not self.initialized: return None # Update metadata-collection options if metadata_file_path is not None: - column_chunks_file_path = str.encode(metadata_file_path) + column_chunks_file_paths.push_back(str.encode(metadata_file_path)) with nogil: out_metadata_c = move( - self.writer.get()[0].close(column_chunks_file_path) + self.writer.get()[0].close(column_chunks_file_paths) ) if metadata_file_path is not None: @@ -471,14 +470,16 @@ cdef class ParquetWriter: ) pandas_metadata = generate_pandas_metadata(table, self.index) - self.tbl_meta.get().user_data[str.encode("pandas")] = \ - str.encode(pandas_metadata) + cdef vector[map[string, string]] user_data + user_data.resize(1) + user_data.back()[str.encode("pandas")] = str.encode(pandas_metadata) cdef chunked_parquet_writer_options args with nogil: args = move( chunked_parquet_writer_options.builder(self.sink) .metadata(self.tbl_meta.get()) + .key_value_metadata(move(user_data)) .compression(self.comp_type) .stats_level(self.stat_freq) .build() From fc2a32a1576d97a48b0c1c983ef4b31285267e96 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 14 Dec 2021 
16:59:24 -0600 Subject: [PATCH 05/13] Introduce `nan_as_null` parameter for `cudf.Index` (#9893) Fixes: #9822 This PR introduces `nan_as_null` parameter to `cudf.Index` constructor which is similar to the one present in `cudf.Series` constructor. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9893 --- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/column/categorical.py | 4 +++- python/cudf/cudf/core/index.py | 25 ++++++++++++++++----- python/cudf/cudf/tests/test_index.py | 19 ++++++++++++++++ 4 files changed, 43 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index ac5e152d011..ed1cc74db71 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -829,7 +829,7 @@ def is_floating(self): >>> idx = cudf.Index([1.0, 2.0, np.nan, 4.0]) >>> idx.is_floating() True - >>> idx = cudf.Index([1, 2, 3, 4, np.nan]) + >>> idx = cudf.Index([1, 2, 3, 4, np.nan], nan_as_null=False) >>> idx.is_floating() True >>> idx = cudf.Index([1, 2, 3, 4]) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index a2c1f04b2f2..4be7a422de0 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -809,7 +809,9 @@ def __setitem__(self, key, value): to_add_categories = 0 else: to_add_categories = len( - cudf.Index(value).difference(self.categories) + cudf.Index(value, nan_as_null=False).difference( + self.categories + ) ) if to_add_categories > 0: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 29e0d17bc39..362c96ebbeb 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2527,7 +2527,7 @@ def is_object(self): return True -def as_index(arbitrary, **kwargs) -> BaseIndex: +def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex: """Create an Index from an arbitrary object Currently supported inputs are: @@ -2560,7 +2560,7 @@ def as_index(arbitrary, **kwargs) -> BaseIndex: elif isinstance(arbitrary, ColumnBase): return _index_from_data({kwargs.get("name", None): arbitrary}) elif isinstance(arbitrary, cudf.Series): - return as_index(arbitrary._column, **kwargs) + return as_index(arbitrary._column, nan_as_null=nan_as_null, **kwargs) elif isinstance(arbitrary, (pd.RangeIndex, range)): return RangeIndex( start=arbitrary.start, @@ -2569,11 +2569,14 @@ def as_index(arbitrary, **kwargs) -> BaseIndex: **kwargs, ) elif isinstance(arbitrary, pd.MultiIndex): - return cudf.MultiIndex.from_pandas(arbitrary) + return cudf.MultiIndex.from_pandas(arbitrary, nan_as_null=nan_as_null) elif isinstance(arbitrary, cudf.DataFrame): return cudf.MultiIndex.from_frame(arbitrary) return as_index( - column.as_column(arbitrary, dtype=kwargs.get("dtype", None)), **kwargs + column.as_column( + arbitrary, dtype=kwargs.get("dtype", None), nan_as_null=nan_as_null + ), + **kwargs, ) @@ -2623,6 +2626,10 @@ class Index(BaseIndex, metaclass=IndexMeta): tupleize_cols : bool (default: True) When True, attempt to create a MultiIndex if possible. tupleize_cols == False is not yet supported. + nan_as_null : bool, Default True + If ``None``/``True``, converts ``np.nan`` values to + ``null`` values. + If ``False``, leaves ``np.nan`` values as is. 
Returns ------- @@ -2655,6 +2662,7 @@ def __new__( copy=False, name=None, tupleize_cols=True, + nan_as_null=True, **kwargs, ): assert ( @@ -2665,7 +2673,14 @@ def __new__( "tupleize_cols != True is not yet supported" ) - return as_index(data, copy=copy, dtype=dtype, name=name, **kwargs) + return as_index( + data, + copy=copy, + dtype=dtype, + name=name, + nan_as_null=nan_as_null, + **kwargs, + ) @classmethod def from_arrow(cls, obj): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index c7fca2075f5..6679725ae9a 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2509,3 +2509,22 @@ def test_index_datetime_round(resolution): cuidx_floor = cuidx.round(resolution) assert_eq(pidx_floor, cuidx_floor) + + +@pytest.mark.parametrize( + "data,nan_idx,NA_idx", + [([1, 2, 3, None], None, 3), ([2, 3, np.nan, None], 2, 3)], +) +@pytest.mark.parametrize("nan_as_null", [True, False]) +def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): + idx = cudf.Index(data, nan_as_null=nan_as_null) + + if nan_as_null: + if nan_idx is not None: + assert idx[nan_idx] is cudf.NA + else: + if nan_idx is not None: + assert np.isnan(idx[nan_idx]) + + if NA_idx is not None: + assert idx[NA_idx] is cudf.NA From 44fce8bb201ae818ec73ed563e4ab0232ceb751e Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 14 Dec 2021 22:02:27 -0600 Subject: [PATCH 06/13] Fix cudf.Scalar string datetime construction (#9875) Closes https://github.com/rapidsai/cudf/issues/9874 Authors: - https://github.com/brandon-b-miller Approvers: - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/9875 --- python/cudf/cudf/tests/test_scalar.py | 12 ++++++++++++ python/cudf/cudf/utils/dtypes.py | 6 +++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index a9919900256..a8b62710e0e 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -369,3 +369,15 @@ def test_construct_from_scalar(value): x._is_host_value_current == y._is_host_value_current x._is_device_value_current == y._is_device_value_current + + +@pytest.mark.parametrize( + "data", ["20000101", "2000-01-01", "2000-01-01T00:00:00.000000000", "2000"] +) +@pytest.mark.parametrize("dtype", DATETIME_TYPES) +def test_datetime_scalar_from_string(data, dtype): + slr = cudf.Scalar(data, dtype) + + expected = np.datetime64(datetime.datetime(2000, 1, 1)).astype(dtype) + + assert expected == slr.value diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 2eb38c0f77e..7142d0d710e 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -257,7 +257,11 @@ def to_cudf_compatible_scalar(val, dtype=None): val = cudf.api.types.pandas_dtype(type(val)).type(val) if dtype is not None: - val = val.astype(dtype) + if isinstance(val, str) and np.dtype(dtype).kind == "M": + # pd.Timestamp can handle str, but not np.str_ + val = pd.Timestamp(str(val)).to_datetime64().astype(dtype) + else: + val = val.astype(dtype) if val.dtype.type is np.datetime64: time_unit, _ = np.datetime_data(val.dtype) From 3428f7f7b123851ee580c29f7c4fdc28b8384e98 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 14 Dec 2021 22:08:54 -0600 Subject: [PATCH 07/13] Fix compilation of benchmark for parquet writer. 
(#9905) This fixes a compilation error introduced in #9810. Tagging @devavret @vuule for review. Feel free to push to this PR with any fixes. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/9905 --- cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp index b4c11179c35..5c3c53fee8e 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp @@ -85,7 +85,7 @@ void BM_parq_write_varying_options(benchmark::State& state) cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view) .compression(compression) .stats_level(enable_stats) - .column_chunks_file_path(file_path); + .column_chunks_file_paths({file_path}); cudf_io::write_parquet(options); } From 78d12bb20501770839c2a062a2cc611349dc4120 Mon Sep 17 00:00:00 2001 From: Jordan Jacobelli Date: Wed, 15 Dec 2021 10:05:12 +0100 Subject: [PATCH 08/13] Update ucx-py version on release using rvc (#9897) Update `ucx-py` version on release using `rvc` Authors: - Jordan Jacobelli (https://github.com/Ethyling) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/9897 --- ci/gpu/build.sh | 5 ++++- ci/gpu/java.sh | 5 ++++- ci/release/update-version.sh | 5 +++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 00ad6bf812d..5646c268301 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -33,6 +33,9 @@ export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` # Dask & Distributed git tag export DASK_DISTRIBUTED_GIT_TAG='2021.11.2' +# ucx-py version +export UCX_PY_VERSION='0.24.*' + ################################################################################ # TRAP - Setup trap for removing jitify cache ################################################################################ @@ -83,7 +86,7 @@ gpuci_mamba_retry install -y \ "rapids-notebook-env=$MINOR_VERSION.*" \ "dask-cuda=${MINOR_VERSION}" \ "rmm=$MINOR_VERSION.*" \ - "ucx-py=0.24.*" + "ucx-py=${UCX_PY_VERSION}" # https://docs.rapids.ai/maintainers/depmgmt/ # gpuci_mamba_retry remove --force rapids-build-env rapids-notebook-env diff --git a/ci/gpu/java.sh b/ci/gpu/java.sh index bada16bd40e..6f7038d21d7 100755 --- a/ci/gpu/java.sh +++ b/ci/gpu/java.sh @@ -30,6 +30,9 @@ export CONDA_ARTIFACT_PATH="$WORKSPACE/ci/artifacts/cudf/cpu/.conda-bld/" export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` +# ucx-py version +export UCX_PY_VERSION='0.24.*' + ################################################################################ # TRAP - Setup trap for removing jitify cache ################################################################################ @@ -80,7 +83,7 @@ gpuci_conda_retry install -y \ "rapids-notebook-env=$MINOR_VERSION.*" \ "dask-cuda=${MINOR_VERSION}" \ "rmm=$MINOR_VERSION.*" \ - "ucx-py=0.24.*" \ + "ucx-py=${UCX_PY_VERSION}" \ "openjdk=8.*" \ "maven" diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 86432a92128..1105b9c194d 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh 
@@ -21,6 +21,7 @@ CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} +NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*" echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" @@ -62,3 +63,7 @@ sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md # Libcudf examples update sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/basic/CMakeLists.txt + +# ucx-py version update +sed_runner "s/export UCX_PY_VERSION=.*/export UCX_PY_VERSION='${NEXT_UCX_PY_VERSION}'/g" ci/gpu/build.sh +sed_runner "s/export UCX_PY_VERSION=.*/export UCX_PY_VERSION='${NEXT_UCX_PY_VERSION}'/g" ci/gpu/java.sh From 38631a635fbfe05f69fd243df03868ec1f23d3c5 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 15 Dec 2021 08:29:05 -0600 Subject: [PATCH 09/13] Fix the java build after parquet partitioning support (#9908) This fixes the java build after #9810 went in. There is a lot of copy/paste in this first draft, because I just wanted to get something to work. Not sure if it is worth going back to make it common everywhere. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/9908 --- java/src/main/native/src/TableJni.cpp | 63 +++++++++++++++++++-------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 5bae4f5f399..0914c8a23f7 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -705,15 +705,12 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata, void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_names, jintArray &j_children, jbooleanArray &j_col_nullability, - jobjectArray &j_metadata_keys, jobjectArray &j_metadata_values, jbooleanArray &j_is_int96, jintArray &j_precisions, jbooleanArray &j_is_map, cudf::io::table_input_metadata &metadata) { cudf::jni::auto_set_device(env); cudf::jni::native_jstringArray col_names(env, j_col_names); cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); cudf::jni::native_jbooleanArray is_int96(env, j_is_int96); - cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); - cudf::jni::native_jstringArray meta_values(env, j_metadata_values); cudf::jni::native_jintArray precisions(env, j_precisions); cudf::jni::native_jintArray children(env, j_children); cudf::jni::native_jbooleanArray is_map(env, j_is_map); @@ -742,9 +739,6 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam is_int96, precisions, is_map, children, childs_children, read_index); } } - for (auto i = 0; i < meta_keys.size(); ++i) { - metadata.user_data[meta_keys[i].get()] = meta_values[i].get(); - } } // Check that window parameters are valid. 
@@ -1364,15 +1358,23 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( using namespace cudf::jni; sink_info sink{data_sink.get()}; table_input_metadata metadata; - createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, - j_metadata_keys, j_metadata_values, j_isInt96, j_precisions, j_is_map, - metadata); + createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96, + j_precisions, j_is_map, metadata); + + cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); + cudf::jni::native_jstringArray meta_values(env, j_metadata_values); + + std::map kv_metadata; + for (auto i = 0; i < meta_keys.size(); ++i) { + kv_metadata[meta_keys[i].get()] = meta_values[i].get(); + } chunked_parquet_writer_options opts = chunked_parquet_writer_options::builder(sink) .metadata(&metadata) .compression(static_cast(j_compression)) .stats_level(static_cast(j_stats_freq)) + .key_value_metadata({kv_metadata}) .build(); auto writer_ptr = std::make_unique(opts); cudf::jni::native_parquet_writer_handle *ret = @@ -1398,15 +1400,24 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( using namespace cudf::io; using namespace cudf::jni; table_input_metadata metadata; - createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, - j_metadata_keys, j_metadata_values, j_isInt96, j_precisions, j_is_map, - metadata); + createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96, + j_precisions, j_is_map, metadata); + + cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); + cudf::jni::native_jstringArray meta_values(env, j_metadata_values); + + std::map kv_metadata; + for (auto i = 0; i < meta_keys.size(); ++i) { + kv_metadata[meta_keys[i].get()] = meta_values[i].get(); + } + sink_info sink{output_path.get()}; chunked_parquet_writer_options opts = chunked_parquet_writer_options::builder(sink) .metadata(&metadata) .compression(static_cast(j_compression)) .stats_level(static_cast(j_stats_freq)) + .key_value_metadata({kv_metadata}) .build(); auto writer_ptr = std::make_unique(opts); @@ -1519,9 +1530,16 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( table_input_metadata metadata; // ORC has no `j_is_int96`, but `createTableMetaData` needs a lvalue. 
jbooleanArray j_is_int96 = NULL; - createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, - j_metadata_keys, j_metadata_values, j_is_int96, j_precisions, j_is_map, - metadata); + createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96, + j_precisions, j_is_map, metadata); + + cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); + cudf::jni::native_jstringArray meta_values(env, j_metadata_values); + + std::map kv_metadata; + for (auto i = 0; i < meta_keys.size(); ++i) { + kv_metadata[meta_keys[i].get()] = meta_values[i].get(); + } std::unique_ptr data_sink( new cudf::jni::jni_writer_data_sink(env, consumer)); @@ -1530,6 +1548,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( .metadata(&metadata) .compression(static_cast(j_compression)) .enable_statistics(true) + .key_value_metadata(kv_metadata) .build(); auto writer_ptr = std::make_unique(opts); cudf::jni::native_orc_writer_handle *ret = @@ -1556,15 +1575,23 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin( table_input_metadata metadata; // ORC has no `j_is_int96`, but `createTableMetaData` needs a lvalue. jbooleanArray j_is_int96 = NULL; - createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, - j_metadata_keys, j_metadata_values, j_is_int96, j_precisions, j_is_map, - metadata); + createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96, + j_precisions, j_is_map, metadata); + + cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); + cudf::jni::native_jstringArray meta_values(env, j_metadata_values); + + std::map kv_metadata; + for (auto i = 0; i < meta_keys.size(); ++i) { + kv_metadata[meta_keys[i].get()] = meta_values[i].get(); + } sink_info sink{output_path.get()}; chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink) .metadata(&metadata) .compression(static_cast(j_compression)) .enable_statistics(true) + .key_value_metadata(kv_metadata) .build(); auto writer_ptr = std::make_unique(opts); cudf::jni::native_orc_writer_handle *ret = From db9aef8181c400d707d512a2449cc9927d4a3bc5 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 15 Dec 2021 09:49:57 -0500 Subject: [PATCH 10/13] Add regex_flags parameter to strings replace_re functions (#9878) Closes #9845 Adds a `cudf::strings::regex_flags` parameter to the `cudf::strings::replace_re` functions so the matching logic will be the same as for `cudf::strings::contains_re` which already has this parameter. This is a breaking change since it adds this new parameter and changes the default behavior. The previous default behavior is equivalent to specifying the `regex_flags::MULTILINE` flag now to be consistent with the default behavior of `contains_re`. 
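For reference, a minimal usage sketch of the updated `replace_re` signature (the input column, pattern, and replacement below are illustrative only, not part of this change; they mirror the tests added in this PR):

```cpp
#include <cudf/strings/replace_re.hpp>
#include <cudf/strings/strings_column_view.hpp>

// sv is a cudf::strings_column_view over an existing strings column.
// Passing regex_flags::MULTILINE reproduces the previous default behavior,
// where ^ and $ also match at line boundaries inside each string.
auto replaced = cudf::strings::replace_re(
  sv,
  "^aba$",                                 // pattern
  cudf::string_scalar("_"),                // replacement
  std::nullopt,                            // max_replace_count: replace every match
  cudf::strings::regex_flags::MULTILINE);  // new flags parameter
```

Omitting the flags argument now uses `regex_flags::DEFAULT`, consistent with `contains_re`.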
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/9878 --- cpp/include/cudf/strings/replace_re.hpp | 24 ++++--- cpp/src/strings/replace/backref_re.cu | 16 +++-- cpp/src/strings/replace/multi_re.cu | 84 +++++++++-------------- cpp/src/strings/replace/replace_re.cu | 68 +++++++++--------- cpp/tests/strings/replace_regex_tests.cpp | 52 ++++++++++++++ 5 files changed, 143 insertions(+), 101 deletions(-) diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index 087d1a94603..a2c4eba1636 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -17,6 +17,7 @@ #include #include +#include #include namespace cudf { @@ -37,22 +38,25 @@ namespace strings { * * @param strings Strings instance for this operation. * @param pattern The regular expression pattern to search within each string. - * @param repl The string used to replace the matched sequence in each string. + * @param replacement The string used to replace the matched sequence in each string. * Default is an empty string. - * @param maxrepl The maximum number of times to replace the matched pattern within each string. + * @param max_replace_count The maximum number of times to replace the matched pattern + * within each string. Default replaces every substring that is matched. + * @param flags Regex flags for interpreting special characters in the pattern. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ std::unique_ptr replace_re( strings_column_view const& strings, std::string const& pattern, - string_scalar const& repl = string_scalar(""), - size_type maxrepl = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + string_scalar const& replacement = string_scalar(""), + std::optional max_replace_count = std::nullopt, + regex_flags const flags = regex_flags::DEFAULT, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief For each string, replaces any character sequence matching the given patterns - * with the corresponding string in the repls column. + * with the corresponding string in the `replacements` column. * * Any null string entries return corresponding null output column entries. * @@ -60,14 +64,16 @@ std::unique_ptr replace_re( * * @param strings Strings instance for this operation. * @param patterns The regular expression patterns to search within each string. - * @param repls The strings used for replacement. + * @param replacements The strings used for replacement. + * @param flags Regex flags for interpreting special characters in the patterns. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ std::unique_ptr replace_re( strings_column_view const& strings, std::vector const& patterns, - strings_column_view const& repls, + strings_column_view const& replacements, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -83,6 +89,7 @@ std::unique_ptr replace_re( * @param strings Strings instance for this operation. * @param pattern The regular expression patterns to search within each string. * @param replacement The replacement template for creating the output string. 
+ * @param flags Regex flags for interpreting special characters in the pattern. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ @@ -90,6 +97,7 @@ std::unique_ptr replace_with_backrefs( strings_column_view const& strings, std::string const& pattern, std::string const& replacement, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace strings diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 99c55998fb9..ff86d7aa552 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -101,22 +101,24 @@ std::pair> parse_backrefs(std::string con std::unique_ptr replace_with_backrefs( strings_column_view const& strings, std::string const& pattern, - std::string const& repl, + std::string const& replacement, + regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { if (strings.is_empty()) return make_empty_column(type_id::STRING); CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); - CUDF_EXPECTS(!repl.empty(), "Parameter repl must not be empty"); + CUDF_EXPECTS(!replacement.empty(), "Parameter replacement must not be empty"); auto d_strings = column_device_view::create(strings.parent(), stream); // compile regex into device object - auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings.size(), stream); + auto d_prog = + reprog_device::create(pattern, flags, get_character_flags_table(), strings.size(), stream); auto const regex_insts = d_prog->insts_counts(); // parse the repl string for back-ref indicators - auto const parse_result = parse_backrefs(repl); + auto const parse_result = parse_backrefs(replacement); rmm::device_uvector backrefs = cudf::detail::make_device_uvector_async(parse_result.second, stream); string_scalar repl_scalar(parse_result.first, true, stream); @@ -170,11 +172,13 @@ std::unique_ptr replace_with_backrefs( std::unique_ptr replace_with_backrefs(strings_column_view const& strings, std::string const& pattern, - std::string const& repl, + std::string const& replacement, + regex_flags const flags, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_with_backrefs(strings, pattern, repl, rmm::cuda_stream_default, mr); + return detail::replace_with_backrefs( + strings, pattern, replacement, flags, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 25417909c89..2b5380b76dd 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -53,12 +54,11 @@ using found_range = thrust::pair; template struct replace_multi_regex_fn { column_device_view const d_strings; - reprog_device* progs; // array of regex progs - size_type number_of_patterns; - found_range* d_found_ranges; // working array matched (begin,end) values - column_device_view const d_repls; // replacement strings - int32_t* d_offsets{}; // these are null when - char* d_chars{}; // only computing size + device_span progs; // array of regex progs + found_range* d_found_ranges; // working array matched (begin,end) values + column_device_view const d_repls; // replacement strings + int32_t* d_offsets{}; + char* d_chars{}; 
__device__ void operator()(size_type idx) { @@ -66,6 +66,9 @@ struct replace_multi_regex_fn { if (!d_chars) d_offsets[idx] = 0; return; } + + auto const number_of_patterns = static_cast(progs.size()); + auto const d_str = d_strings.element(idx); auto const nchars = d_str.length(); // number of characters in input string auto nbytes = d_str.size_bytes(); // number of bytes in input string @@ -129,7 +132,8 @@ struct replace_multi_regex_fn { std::unique_ptr replace_re( strings_column_view const& strings, std::vector const& patterns, - strings_column_view const& repls, + strings_column_view const& replacements, + regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { @@ -138,31 +142,25 @@ std::unique_ptr replace_re( if (patterns.empty()) // no patterns; just return a copy return std::make_unique(strings.parent(), stream, mr); - CUDF_EXPECTS(!repls.has_nulls(), "Parameter repls must not have any nulls"); + CUDF_EXPECTS(!replacements.has_nulls(), "Parameter replacements must not have any nulls"); - auto d_strings = column_device_view::create(strings.parent(), stream); - auto d_repls = column_device_view::create(repls.parent(), stream); - auto d_flags = get_character_flags_table(); + auto d_strings = column_device_view::create(strings.parent(), stream); + auto d_repls = column_device_view::create(replacements.parent(), stream); + auto d_char_table = get_character_flags_table(); // compile regexes into device objects size_type regex_insts = 0; std::vector>> h_progs; - thrust::host_vector progs; + std::vector progs; for (auto itr = patterns.begin(); itr != patterns.end(); ++itr) { - auto prog = reprog_device::create(*itr, d_flags, strings_count, stream); + auto prog = reprog_device::create(*itr, flags, d_char_table, strings_count, stream); regex_insts = std::max(regex_insts, prog->insts_counts()); progs.push_back(*prog); h_progs.emplace_back(std::move(prog)); } // copy all the reprog_device instances to a device memory array - rmm::device_buffer progs_buffer{sizeof(reprog_device) * progs.size(), stream}; - CUDA_TRY(cudaMemcpyAsync(progs_buffer.data(), - progs.data(), - progs.size() * sizeof(reprog_device), - cudaMemcpyHostToDevice, - stream.value())); - reprog_device* d_progs = reinterpret_cast(progs_buffer.data()); + auto d_progs = cudf::detail::make_device_uvector_async(progs, stream); // create working buffer for ranges pairs rmm::device_uvector found_ranges(patterns.size() * strings_count, stream); @@ -172,34 +170,19 @@ std::unique_ptr replace_re( auto children = [&] { // Each invocation is predicated on the stack size which is dependent on the number of regex // instructions - if (regex_insts <= RX_SMALL_INSTS) - return make_strings_children( - replace_multi_regex_fn{ - *d_strings, d_progs, static_cast(progs.size()), d_found_ranges, *d_repls}, - strings_count, - stream, - mr); - else if (regex_insts <= RX_MEDIUM_INSTS) - return make_strings_children( - replace_multi_regex_fn{ - *d_strings, d_progs, static_cast(progs.size()), d_found_ranges, *d_repls}, - strings_count, - stream, - mr); - else if (regex_insts <= RX_LARGE_INSTS) - return make_strings_children( - replace_multi_regex_fn{ - *d_strings, d_progs, static_cast(progs.size()), d_found_ranges, *d_repls}, - strings_count, - stream, - mr); - else - return make_strings_children( - replace_multi_regex_fn{ - *d_strings, d_progs, static_cast(progs.size()), d_found_ranges, *d_repls}, - strings_count, - stream, - mr); + if (regex_insts <= RX_SMALL_INSTS) { + 
replace_multi_regex_fn fn{*d_strings, d_progs, d_found_ranges, *d_repls}; + return make_strings_children(fn, strings_count, stream, mr); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + replace_multi_regex_fn fn{*d_strings, d_progs, d_found_ranges, *d_repls}; + return make_strings_children(fn, strings_count, stream, mr); + } else if (regex_insts <= RX_LARGE_INSTS) { + replace_multi_regex_fn fn{*d_strings, d_progs, d_found_ranges, *d_repls}; + return make_strings_children(fn, strings_count, stream, mr); + } else { + replace_multi_regex_fn fn{*d_strings, d_progs, d_found_ranges, *d_repls}; + return make_strings_children(fn, strings_count, stream, mr); + } }(); return make_strings_column(strings_count, @@ -215,11 +198,12 @@ std::unique_ptr replace_re( std::unique_ptr replace_re(strings_column_view const& strings, std::vector const& patterns, - strings_column_view const& repls, + strings_column_view const& replacements, + regex_flags const flags, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_re(strings, patterns, repls, rmm::cuda_stream_default, mr); + return detail::replace_re(strings, patterns, replacements, flags, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index b940944c186..9fd1768453a 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -52,7 +52,7 @@ struct replace_regex_fn { column_device_view const d_strings; reprog_device prog; string_view const d_repl; - size_type maxrepl; + size_type const maxrepl; int32_t* d_offsets{}; char* d_chars{}; @@ -102,56 +102,48 @@ struct replace_regex_fn { std::unique_ptr replace_re( strings_column_view const& strings, std::string const& pattern, - string_scalar const& repl = string_scalar(""), - size_type maxrepl = -1, + string_scalar const& replacement, + std::optional max_replace_count, + regex_flags const flags, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); if (strings_count == 0) return make_empty_column(type_id::STRING); - CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid"); - string_view d_repl(repl.data(), repl.size()); + CUDF_EXPECTS(replacement.is_valid(stream), "Parameter replacement must be valid"); + string_view d_repl(replacement.data(), replacement.size()); auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // compile regex into device object - auto prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); - auto d_prog = *prog; - auto regex_insts = d_prog.insts_counts(); + auto prog = + reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); + auto d_prog = *prog; + auto const regex_insts = d_prog.insts_counts(); // copy null mask - auto null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); - auto null_count = strings.null_count(); + auto null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); + auto const null_count = strings.null_count(); + auto const maxrepl = max_replace_count.value_or(-1); // create child columns auto children = [&] { // Each invocation is predicated on the stack size which is dependent on the number of regex // instructions - if (regex_insts <= RX_SMALL_INSTS) - return make_strings_children( - replace_regex_fn{d_strings, d_prog, d_repl, 
maxrepl}, - strings_count, - stream, - mr); - else if (regex_insts <= RX_MEDIUM_INSTS) - return make_strings_children( - replace_regex_fn{d_strings, d_prog, d_repl, maxrepl}, - strings_count, - stream, - mr); - else if (regex_insts <= RX_LARGE_INSTS) - return make_strings_children( - replace_regex_fn{d_strings, d_prog, d_repl, maxrepl}, - strings_count, - stream, - mr); - else - return make_strings_children( - replace_regex_fn{d_strings, d_prog, d_repl, maxrepl}, - strings_count, - stream, - mr); + if (regex_insts <= RX_SMALL_INSTS) { + replace_regex_fn fn{d_strings, d_prog, d_repl, maxrepl}; + return make_strings_children(fn, strings_count, stream, mr); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + replace_regex_fn fn{d_strings, d_prog, d_repl, maxrepl}; + return make_strings_children(fn, strings_count, stream, mr); + } else if (regex_insts <= RX_LARGE_INSTS) { + replace_regex_fn fn{d_strings, d_prog, d_repl, maxrepl}; + return make_strings_children(fn, strings_count, stream, mr); + } else { + replace_regex_fn fn{d_strings, d_prog, d_repl, maxrepl}; + return make_strings_children(fn, strings_count, stream, mr); + } }(); return make_strings_column(strings_count, @@ -167,12 +159,14 @@ std::unique_ptr replace_re( std::unique_ptr replace_re(strings_column_view const& strings, std::string const& pattern, - string_scalar const& repl, - size_type maxrepl, + string_scalar const& replacement, + std::optional max_replace_count, + regex_flags const flags, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_re(strings, pattern, repl, maxrepl, rmm::cuda_stream_default, mr); + return detail::replace_re( + strings, pattern, replacement, max_replace_count, flags, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 16308265a9b..eac06fa4588 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -133,6 +133,58 @@ TEST_F(StringsReplaceRegexTest, WithEmptyPattern) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); } +TEST_F(StringsReplaceRegexTest, MultiReplacement) +{ + cudf::test::strings_column_wrapper input({"aba bcd aba", "abababa abababa"}); + auto results = + cudf::strings::replace_re(cudf::strings_column_view(input), "aba", cudf::string_scalar("_"), 2); + cudf::test::strings_column_wrapper expected({"_ bcd _", "_b_ abababa"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + results = + cudf::strings::replace_re(cudf::strings_column_view(input), "aba", cudf::string_scalar(""), 0); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, input); +} + +TEST_F(StringsReplaceRegexTest, Multiline) +{ + auto const multiline = cudf::strings::regex_flags::MULTILINE; + + cudf::test::strings_column_wrapper input({"bcd\naba\nefg", "aba\naba abab\naba", "aba"}); + auto sv = cudf::strings_column_view(input); + + // single-replace + auto results = + cudf::strings::replace_re(sv, "^aba$", cudf::string_scalar("_"), std::nullopt, multiline); + cudf::test::strings_column_wrapper expected_ml({"bcd\n_\nefg", "_\naba abab\n_", "_"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_ml); + + results = cudf::strings::replace_re(sv, "^aba$", cudf::string_scalar("_")); + cudf::test::strings_column_wrapper expected({"bcd\naba\nefg", "aba\naba abab\naba", "_"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + // multi-replace + std::vector patterns({"aba$", "^aba"}); + cudf::test::strings_column_wrapper 
repls({">", "<"}); + results = cudf::strings::replace_re(sv, patterns, cudf::strings_column_view(repls), multiline); + cudf::test::strings_column_wrapper multi_expected_ml({"bcd\n>\nefg", ">\n< abab\n>", ">"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, multi_expected_ml); + + results = cudf::strings::replace_re(sv, patterns, cudf::strings_column_view(repls)); + cudf::test::strings_column_wrapper multi_expected({"bcd\naba\nefg", "<\naba abab\n>", ">"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, multi_expected); + + // backref-replace + results = cudf::strings::replace_with_backrefs(sv, "(^aba)", "[\\1]", multiline); + cudf::test::strings_column_wrapper br_expected_ml( + {"bcd\n[aba]\nefg", "[aba]\n[aba] abab\n[aba]", "[aba]"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected_ml); + + results = cudf::strings::replace_with_backrefs(sv, "(^aba)", "[\\1]"); + cudf::test::strings_column_wrapper br_expected( + {"bcd\naba\nefg", "[aba]\naba abab\naba", "[aba]"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected); +} + TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", From 0c3f7356e0afe391dc874b55898029275a23db1c Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 15 Dec 2021 10:03:22 -0500 Subject: [PATCH 11/13] Add dictionary support to cudf::copy_if_else (#9887) Close #9885 Adds support for dictionary column types to `cudf::copy_if_else`. The column/scalar versions of this API will accept a scalar type that matches the dictionary's key type. The column/column version will accept 2 dictionary columns with matching key types. The result of the function will be a dictionary that incorporates both sets of keys or the scalar value as appropriate. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/9887 --- .../dictionary/dictionary_column_view.hpp | 5 ++ cpp/src/copying/copy.cu | 37 +++++++- cpp/src/dictionary/dictionary_column_view.cpp | 8 +- cpp/tests/copying/copy_tests.cpp | 84 +++++++++++++++++++ 4 files changed, 130 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp index 1da52e67e06..42f8310040e 100644 --- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp +++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp @@ -77,6 +77,11 @@ class dictionary_column_view : private column_view { */ column_view keys() const noexcept; + /** + * @brief Returns the `data_type` of the keys child column. + */ + data_type keys_type() const noexcept; + /** * @brief Returns the number of rows in the keys column. 
*/ diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index 10af2ffb614..91fc5f02989 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -267,6 +268,22 @@ struct copy_if_else_functor_impl { } }; +template <> +struct copy_if_else_functor_impl { + template + std::unique_ptr operator()(Left const& lhs, + Right const& rhs, + size_type size, + bool, + bool, + Filter filter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + return scatter_gather_based_if_else(lhs, rhs, size, filter, stream, mr); + } +}; + /** * @brief Functor called by the `type_dispatcher` to invoke copy_if_else on combinations * of column_view and scalar @@ -297,7 +314,6 @@ std::unique_ptr copy_if_else(Left const& lhs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(lhs.type() == rhs.type(), "Both inputs must be of the same type"); CUDF_EXPECTS(boolean_mask.type() == data_type(type_id::BOOL8), "Boolean mask column must be of type type_id::BOOL8"); @@ -311,7 +327,11 @@ std::unique_ptr copy_if_else(Left const& lhs, return (!has_nulls || bool_mask_device.is_valid_nocheck(i)) and bool_mask_device.element(i); }; - return cudf::type_dispatcher(lhs.type(), + + // always dispatch on dictionary-type if either input is a dictionary + auto dispatch_type = cudf::is_dictionary(rhs.type()) ? rhs.type() : lhs.type(); + + return cudf::type_dispatcher(dispatch_type, copy_if_else_functor{}, lhs, rhs, @@ -334,6 +354,8 @@ std::unique_ptr copy_if_else(column_view const& lhs, CUDF_EXPECTS(boolean_mask.size() == lhs.size(), "Boolean mask column must be the same size as lhs and rhs columns"); CUDF_EXPECTS(lhs.size() == rhs.size(), "Both columns must be of the size"); + CUDF_EXPECTS(lhs.type() == rhs.type(), "Both inputs must be of the same type"); + return copy_if_else(lhs, rhs, lhs.has_nulls(), rhs.has_nulls(), boolean_mask, stream, mr); } @@ -345,6 +367,11 @@ std::unique_ptr copy_if_else(scalar const& lhs, { CUDF_EXPECTS(boolean_mask.size() == rhs.size(), "Boolean mask column must be the same size as rhs column"); + + auto rhs_type = + cudf::is_dictionary(rhs.type()) ? cudf::dictionary_column_view(rhs).keys_type() : rhs.type(); + CUDF_EXPECTS(lhs.type() == rhs_type, "Both inputs must be of the same type"); + return copy_if_else(lhs, rhs, !lhs.is_valid(stream), rhs.has_nulls(), boolean_mask, stream, mr); } @@ -356,6 +383,11 @@ std::unique_ptr copy_if_else(column_view const& lhs, { CUDF_EXPECTS(boolean_mask.size() == lhs.size(), "Boolean mask column must be the same size as lhs column"); + + auto lhs_type = + cudf::is_dictionary(lhs.type()) ? 
cudf::dictionary_column_view(lhs).keys_type() : lhs.type(); + CUDF_EXPECTS(lhs_type == rhs.type(), "Both inputs must be of the same type"); + return copy_if_else(lhs, rhs, lhs.has_nulls(), !rhs.is_valid(stream), boolean_mask, stream, mr); } @@ -365,6 +397,7 @@ std::unique_ptr copy_if_else(scalar const& lhs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(lhs.type() == rhs.type(), "Both inputs must be of the same type"); return copy_if_else( lhs, rhs, !lhs.is_valid(stream), !rhs.is_valid(stream), boolean_mask, stream, mr); } diff --git a/cpp/src/dictionary/dictionary_column_view.cpp b/cpp/src/dictionary/dictionary_column_view.cpp index d33fd6c548f..4906e5b4f9c 100644 --- a/cpp/src/dictionary/dictionary_column_view.cpp +++ b/cpp/src/dictionary/dictionary_column_view.cpp @@ -44,8 +44,12 @@ column_view dictionary_column_view::keys() const noexcept { return child(1); } size_type dictionary_column_view::keys_size() const noexcept { - if (size() == 0) return 0; - return keys().size(); + return (size() == 0) ? 0 : keys().size(); +} + +data_type dictionary_column_view::keys_type() const noexcept +{ + return (size() == 0) ? data_type{type_id::EMPTY} : keys().type(); } } // namespace cudf diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp index 651a977050c..4468bc69640 100644 --- a/cpp/tests/copying/copy_tests.cpp +++ b/cpp/tests/copying/copy_tests.cpp @@ -18,11 +18,13 @@ #include #include #include +#include #include #include #include #include +#include #include template @@ -633,3 +635,85 @@ TYPED_TEST(FixedPointTypes, FixedPointScaleMismatch) EXPECT_THROW(cudf::copy_if_else(a, b, mask), cudf::logic_error); } + +struct DictionaryCopyIfElseTest : public cudf::test::BaseFixture { +}; + +TEST_F(DictionaryCopyIfElseTest, ColumnColumn) +{ + auto valids = cudf::test::iterators::null_at(2); + std::vector h_strings1{"eee", "bb", "", "aa", "bb", "ééé"}; + cudf::test::dictionary_column_wrapper input1( + h_strings1.begin(), h_strings1.end(), valids); + std::vector h_strings2{"zz", "bb", "", "aa", "ééé", "ooo"}; + cudf::test::dictionary_column_wrapper input2( + h_strings2.begin(), h_strings2.end(), valids); + + bool mask[] = {1, 1, 0, 1, 0, 1}; + bool mask_v[] = {1, 1, 1, 1, 1, 0}; + cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); + + auto results = cudf::copy_if_else(input1, input2, mask_w); + auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view())); + + std::vector h_expected; + for (cudf::size_type idx = 0; idx < static_cast(h_strings1.size()); ++idx) { + if (mask[idx] and mask_v[idx]) + h_expected.push_back(h_strings1[idx]); + else + h_expected.push_back(h_strings2[idx]); + } + cudf::test::strings_column_wrapper expected(h_expected.begin(), h_expected.end(), valids); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(decoded->view(), expected); +} + +TEST_F(DictionaryCopyIfElseTest, ColumnScalar) +{ + std::string h_string{"eee"}; + cudf::string_scalar input1{h_string}; + std::vector h_strings{"zz", "", "yyy", "w", "ééé", "ooo"}; + auto valids = cudf::test::iterators::null_at(1); + cudf::test::dictionary_column_wrapper input2( + h_strings.begin(), h_strings.end(), valids); + + bool mask[] = {0, 1, 1, 1, 0, 1}; + cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6); + + auto results = cudf::copy_if_else(input2, input1, mask_w); + auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view())); + + std::vector h_expected1; + std::vector h_expected2; + for (cudf::size_type idx 
= 0; idx < static_cast(h_strings.size()); ++idx) { + if (mask[idx]) { + h_expected1.push_back(h_strings[idx]); + h_expected2.push_back(h_string.c_str()); + } else { + h_expected1.push_back(h_string.c_str()); + h_expected2.push_back(h_strings[idx]); + } + } + + cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end(), valids); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(decoded->view(), expected1); + + results = cudf::copy_if_else(input1, input2, mask_w); + decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view())); + + cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(decoded->view(), expected2); +} + +TEST_F(DictionaryCopyIfElseTest, TypeMismatch) +{ + cudf::test::dictionary_column_wrapper input1({1, 1, 1, 1}); + cudf::test::dictionary_column_wrapper input2({1.0, 1.0, 1.0, 1.0}); + cudf::test::fixed_width_column_wrapper mask({1, 0, 0, 1}); + + EXPECT_THROW(cudf::copy_if_else(input1, input2, mask), cudf::logic_error); + + cudf::string_scalar input3{"1"}; + EXPECT_THROW(cudf::copy_if_else(input1, input3, mask), cudf::logic_error); + EXPECT_THROW(cudf::copy_if_else(input3, input2, mask), cudf::logic_error); + EXPECT_THROW(cudf::copy_if_else(input2, input3, mask), cudf::logic_error); +} From 967f3397fb486368d74916ae344c0e1d9eb0a1a8 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 15 Dec 2021 13:28:11 -0600 Subject: [PATCH 12/13] Remove conda envs for CUDA 11.0 and 11.2. (#9910) I think the development environments for CUDA 11.0 and 11.2 can be safely removed now that we require CUDA 11.5 to build. I also updated the default CUDA version in the conda recipes from 10.1 to 11.5. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/9910 --- conda/environments/cudf_dev_cuda11.0.yml | 69 ------------------------ conda/environments/cudf_dev_cuda11.2.yml | 69 ------------------------ conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/cudf_kafka/meta.yaml | 2 +- conda/recipes/custreamz/meta.yaml | 4 +- conda/recipes/dask-cudf/meta.yaml | 2 +- conda/recipes/libcudf/meta.yaml | 2 +- 7 files changed, 6 insertions(+), 144 deletions(-) delete mode 100644 conda/environments/cudf_dev_cuda11.0.yml delete mode 100644 conda/environments/cudf_dev_cuda11.2.yml diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml deleted file mode 100644 index e7b92eddd9e..00000000000 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. 
- -name: cudf_dev -channels: - - rapidsai - - nvidia - - rapidsai-nightly - - conda-forge -dependencies: - - clang=11.1.0 - - clang-tools=11.1.0 - - cupy>=9.5.0,<10.0.0a0 - - rmm=22.02.* - - cmake>=3.20.1 - - cmake_setuptools>=0.1.3 - - python>=3.7,<3.9 - - numba>=0.54 - - numpy - - pandas>=1.0,<1.4.0dev0 - - pyarrow=5.0.0=*cuda - - fastavro>=0.22.9 - - python-snappy>=0.6.0 - - notebook>=0.5.0 - - cython>=0.29,<0.30 - - fsspec>=0.6.0 - - pytest - - pytest-benchmark - - pytest-xdist - - sphinx - - sphinxcontrib-websupport - - nbsphinx - - numpydoc - - ipython - - pandoc=<2.0.0 - - cudatoolkit=11.0 - - pip - - flake8=3.8.3 - - black=19.10 - - isort=5.6.4 - - mypy=0.782 - - pydocstyle=6.1.1 - - typing_extensions - - pre-commit - - dask>=2021.11.1,<=2021.11.2 - - distributed>=2021.11.1,<=2021.11.2 - - streamz - - arrow-cpp=5.0.0 - - dlpack>=0.5,<0.6.0a0 - - arrow-cpp-proc * cuda - - double-conversion - - rapidjson - - hypothesis - - sphinx-markdown-tables - - sphinx-copybutton - - mimesis<4.1 - - packaging - - protobuf - - nvtx>=0.2.1 - - cachetools - - transformers<=4.10.3 - - pydata-sphinx-theme - - librdkafka=1.7.0 - - python-confluent-kafka=1.7.0 - - pip: - - git+https://github.com/dask/dask.git@main - - git+https://github.com/dask/distributed.git@main - - git+https://github.com/python-streamz/streamz.git@master - - pyorc - - ptxcompiler # [linux64] diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml deleted file mode 100644 index 6fe8ed0fafe..00000000000 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -name: cudf_dev -channels: - - rapidsai - - nvidia - - rapidsai-nightly - - conda-forge -dependencies: - - clang=11.1.0 - - clang-tools=11.1.0 - - cupy>=9.5.0,<10.0.0a0 - - rmm=22.02.* - - cmake>=3.20.1 - - cmake_setuptools>=0.1.3 - - python>=3.7,<3.9 - - numba>=0.54 - - numpy - - pandas>=1.0,<1.4.0dev0 - - pyarrow=5.0.0=*cuda - - fastavro>=0.22.9 - - python-snappy>=0.6.0 - - notebook>=0.5.0 - - cython>=0.29,<0.30 - - fsspec>=0.6.0 - - pytest - - pytest-benchmark - - pytest-xdist - - sphinx - - sphinxcontrib-websupport - - nbsphinx - - numpydoc - - ipython - - pandoc=<2.0.0 - - cudatoolkit=11.2 - - pip - - flake8=3.8.3 - - black=19.10 - - isort=5.6.4 - - mypy=0.782 - - pydocstyle=6.1.1 - - typing_extensions - - pre-commit - - dask>=2021.11.1,<=2021.11.2 - - distributed>=2021.11.1,<=2021.11.2 - - streamz - - arrow-cpp=5.0.0 - - dlpack>=0.5,<0.6.0a0 - - arrow-cpp-proc * cuda - - double-conversion - - rapidjson - - hypothesis - - sphinx-markdown-tables - - sphinx-copybutton - - mimesis<4.1 - - packaging - - protobuf - - nvtx>=0.2.1 - - cachetools - - transformers<=4.10.3 - - pydata-sphinx-theme - - librdkafka=1.7.0 - - python-confluent-kafka=1.7.0 - - pip: - - git+https://github.com/dask/dask.git@main - - git+https://github.com/dask/distributed.git@main - - git+https://github.com/python-streamz/streamz.git@master - - pyorc - - ptxcompiler # [linux64] diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 46eefbc825f..2600ab358cc 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} {% set cuda_major=cuda_version.split('.')[0] %} package: diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index af27d888b46..e450d306cbe 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} package: name: cudf_kafka diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index db8af9b0bed..a8b096d4892 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} package: name: custreamz @@ -29,7 +29,7 @@ requirements: - cudf_kafka {{ version }} run: - python - - streamz + - streamz - cudf {{ version }} - dask>=2021.11.1,<=2021.11.2 - distributed>=2021.11.1,<=2021.11.2 diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index d90de2d628c..ed3309056cf 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} {% set cuda_major=cuda_version.split('.')[0] %} package: diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index e78110f3233..bd9b76e4890 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -2,7 +2,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} {% set cuda_major=cuda_version.split('.')[0] %} package: From 0faf2afc2a12b8dad5e3d1fd823b6a8c98c28bcc Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 15 Dec 2021 12:32:11 -0700 Subject: [PATCH 13/13] Implement JNI for `cudf::scatter` APIs (#9903) This PR adds Java binding for both `cudf::scatter` APIs: ``` std::unique_ptr scatter( table_view const& source, column_view const& scatter_map, table_view const& target, ...) ``` and ``` std::unique_ptr
scatter( std::vector> const& source, column_view const& indices, table_view const& target, ...) ``` Closes #9892. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/9903 --- java/src/main/java/ai/rapids/cudf/Table.java | 68 ++++++++++++++++++- java/src/main/native/src/TableJni.cpp | 40 +++++++++++ .../test/java/ai/rapids/cudf/TableTest.java | 47 ++++++++++++- 3 files changed, 153 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 887a125e083..00c98c4fef8 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -649,6 +649,13 @@ private static native long[] dropDuplicates(long nativeHandle, int[] keyColumns, private static native long[] gather(long tableHandle, long gatherView, boolean checkBounds); + private static native long[] scatterTable(long srcTableHandle, long scatterView, + long targetTableHandle, boolean checkBounds) + throws CudfException; + private static native long[] scatterScalars(long[] srcScalarHandles, long scatterView, + long targetTableHandle, boolean checkBounds) + throws CudfException; + private static native long[] convertToRows(long nativeHandle); private static native long[] convertFromRows(long nativeColumnView, int[] types, int[] scale); @@ -2047,7 +2054,7 @@ public Table gather(ColumnView gatherMap) { * `n` is the number of rows in this table. * * @param gatherMap the map of indexes. Must be non-nullable and integral type. - * @param outOfBoundsPolicy policy to use when an out-of-range value is in `gatherMap` + * @param outOfBoundsPolicy policy to use when an out-of-range value is in `gatherMap`. * @return the resulting Table. */ public Table gather(ColumnView gatherMap, OutOfBoundsPolicy outOfBoundsPolicy) { @@ -2055,6 +2062,65 @@ public Table gather(ColumnView gatherMap, OutOfBoundsPolicy outOfBoundsPolicy) { return new Table(gather(nativeHandle, gatherMap.getNativeView(), checkBounds)); } + /** + * Scatters values from the source table into the target table out-of-place, returning a new + * result table. The scatter is performed according to a scatter map such that row `scatterMap[i]` + * of the destination table gets row `i` of the source table. All other rows of the destination + * table equal corresponding rows of the target table. + * + * The number of columns in source must match the number of columns in target and their + * corresponding data types must be the same. + * + * If the same index appears more than once in the scatter map, the result is undefined. + * + * A negative value `i` in the `scatterMap` is interpreted as `i + n`, where `n` is the number of + * rows in the `target` table. + * + * @param scatterMap The map of indexes. Must be non-nullable and integral type. + * @param target The table into which rows from the current table are to be scattered out-of-place. + * @param checkBounds Optionally perform bounds checking on the values of`scatterMap` and throw + * an exception if any of its values are out of bounds. + * @return A new table which is the result of out-of-place scattering the source table into the + * target table. 
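+   *         For example, with a 5-row target and `scatterMap = [0, 2, 4, -2]`, rows 0, 1, 2 and 3
+   *         of this table replace target rows 0, 2, 4 and 3 (the -2 wraps around to row 3), while
+   *         target row 1 is left unchanged.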
+ */ + public Table scatter(ColumnView scatterMap, Table target, boolean checkBounds) { + return new Table(scatterTable(nativeHandle, scatterMap.getNativeView(), target.getNativeView(), + checkBounds)); + } + + /** + * Scatters values from the source rows into the target table out-of-place, returning a new result + * table. The scatter is performed according to a scatter map such that row `scatterMap[i]` of the + * destination table is replaced by the source row `i`. All other rows of the destination table + * equal corresponding rows of the target table. + * + * The number of elements in source must match the number of columns in target and their + * corresponding data types must be the same. + * + * If the same index appears more than once in the scatter map, the result is undefined. + * + * A negative value `i` in the `scatterMap` is interpreted as `i + n`, where `n` is the number of + * rows in the `target` table. + * + * @param source The input scalars containing values to be scattered into the target table. + * @param scatterMap The map of indexes. Must be non-nullable and integral type. + * @param target The table into which the values from source are to be scattered out-of-place. + * @param checkBounds Optionally perform bounds checking on the values of`scatterMap` and throw + * an exception if any of its values are out of bounds. + * @return A new table which is the result of out-of-place scattering the source values into the + * target table. + */ + public static Table scatter(Scalar[] source, ColumnView scatterMap, Table target, + boolean checkBounds) { + long[] srcScalarHandles = new long[source.length]; + for(int i = 0; i < source.length; ++i) { + assert source[i] != null : "Scalar vectors passed in should not contain null"; + srcScalarHandles[i] = source[i].getScalarHandle(); + } + return new Table(scatterScalars(srcScalarHandles, scatterMap.getNativeView(), + target.getNativeView(), checkBounds)); + } + private GatherMap[] buildJoinGatherMaps(long[] gatherMapData) { long bufferSize = gatherMapData[0]; long leftAddr = gatherMapData[1]; diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 0914c8a23f7..0e6425ea7a2 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2746,6 +2746,46 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterTable(JNIEnv *env, jclass, + jlong j_input, jlong j_map, + jlong j_target, + jboolean check_bounds) { + JNI_NULL_CHECK(env, j_input, "input table is null", 0); + JNI_NULL_CHECK(env, j_map, "map column is null", 0); + JNI_NULL_CHECK(env, j_target, "target table is null", 0); + try { + cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(j_input); + auto const map = reinterpret_cast(j_map); + auto const target = reinterpret_cast(j_target); + auto result = cudf::scatter(*input, *map, *target, check_bounds); + return cudf::jni::convert_table_for_return(env, result); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterScalars(JNIEnv *env, jclass, + jlongArray j_input, + jlong j_map, jlong j_target, + jboolean check_bounds) { + JNI_NULL_CHECK(env, j_input, "input scalars array is null", 0); + JNI_NULL_CHECK(env, j_map, "map column is null", 0); + JNI_NULL_CHECK(env, j_target, "target table is null", 0); + try { + cudf::jni::auto_set_device(env); + auto const scalars_array = 
cudf::jni::native_jpointerArray(env, j_input); + std::vector> input; + for (int i = 0; i < scalars_array.size(); ++i) { + input.emplace_back(*scalars_array[i]); + } + auto const map = reinterpret_cast(j_map); + auto const target = reinterpret_cast(j_target); + auto result = cudf::scatter(input, *map, *target, check_bounds); + return cudf::jni::convert_table_for_return(env, result); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env, jclass, jlong input_table) { JNI_NULL_CHECK(env, input_table, "input table is null", 0); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index eeed8224425..86c55e19776 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -56,12 +56,12 @@ import java.util.function.Function; import java.util.stream.Collectors; -import static ai.rapids.cudf.ColumnWriterOptions.mapColumn; import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static ai.rapids.cudf.AssertUtils.assertPartialColumnsAreEqual; import static ai.rapids.cudf.AssertUtils.assertPartialTablesAreEqual; import static ai.rapids.cudf.AssertUtils.assertTableTypes; import static ai.rapids.cudf.AssertUtils.assertTablesAreEqual; +import static ai.rapids.cudf.ColumnWriterOptions.mapColumn; import static ai.rapids.cudf.ParquetWriterOptions.listBuilder; import static ai.rapids.cudf.ParquetWriterOptions.structBuilder; import static ai.rapids.cudf.Table.TestBuilder; @@ -6338,6 +6338,51 @@ void testBoundsCheckedGather() { } } + + @Test + void testScatterTable() { + try (Table srcTable = new Table.TestBuilder() + .column(1, 2, 3, 4, 5) + .column("A", "AA", "AAA", "AAAA", "AAAAA") + .decimal32Column(-3, 1, 2, 3, 4, 5) + .decimal64Column(-8, 100001L, 200002L, 300003L, 400004L, 500005L) + .build(); + ColumnVector scatterMap = ColumnVector.fromInts(0, 2, 4, -2); + Table targetTable = new Table.TestBuilder() + .column(-1, -2, -3, -4, -5) + .column("B", "BB", "BBB", "BBBB", "BBBBB") + .decimal32Column(-3, -1, -2, -3, -4, -5) + .decimal64Column(-8, -100001L, -200002L, -300003L, -400004L, -500005L) + .build(); + Table expected = new Table.TestBuilder() + .column(1, -2, 2, 4, 3) + .column("A", "BB", "AA", "AAAA", "AAA") + .decimal32Column(-3, 1, -2, 2, 4, 3) + .decimal64Column(-8, 100001L, -200002L, 200002L, 400004L, 300003L) + .build(); + Table result = srcTable.scatter(scatterMap, targetTable, false)) { + assertTablesAreEqual(expected, result); + } + } + + @Test + void testScatterScalars() { + try (Scalar s1 = Scalar.fromInt(0); + Scalar s2 = Scalar.fromString("A"); + ColumnVector scatterMap = ColumnVector.fromInts(0, 2, -1); + Table targetTable = new Table.TestBuilder() + .column(-1, -2, -3, -4, -5) + .column("B", "BB", "BBB", "BBBB", "BBBBB") + .build(); + Table expected = new Table.TestBuilder() + .column(0, -2, 0, -4, 0) + .column("A", "BB", "A", "BBBB", "A") + .build(); + Table result = Table.scatter(new Scalar[] { s1, s2 }, scatterMap, targetTable, false)) { + assertTablesAreEqual(expected, result); + } + } + @Test void testMaskWithoutValidity() { try (ColumnVector mask = ColumnVector.fromBoxedBooleans(true, false, true, false, true);