From 262715390f739075f0bdac01ff8c92206a1c2fb5 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 14 Dec 2021 13:58:28 -0600 Subject: [PATCH 01/13] Change default `dtype` of all nulls column from `float` to `object` (#9803) Fixes: #9337 - [x] This PR changes the default `dtype` of `all-nulls` column to `object` dtype from `float64` dtype. - [x] To make `np.nan` values read as `float` column `nan_as_null` has to be passed as `False` in `cudf.DataFrame` constructor - This change is in-line with what is already supported by `cudf.Series` constructor. - [x] Added `has_nans` & `nan_count` property which is needed for some of the checks. - [x] Cached the `nan_count` since it is repeatedly used in math operations and clearing the cache in the regular `_clear_cache` call. - [x] Fixes pytests that are going to break due to this breaking change of types. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - https://github.com/brandon-b-miller - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9803 --- python/cudf/cudf/_lib/column.pyi | 3 +- python/cudf/cudf/_lib/column.pyx | 3 +- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/column/column.py | 34 ++++++---- python/cudf/cudf/core/column/datetime.py | 2 +- python/cudf/cudf/core/column/numerical.py | 64 +++++++++++++++++-- .../cudf/cudf/core/column/numerical_base.py | 9 ++- python/cudf/cudf/core/column/string.py | 15 +---- python/cudf/cudf/core/dataframe.py | 44 +++++++++---- python/cudf/cudf/core/frame.py | 2 +- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/series.py | 2 +- python/cudf/cudf/core/tools/datetimes.py | 2 +- python/cudf/cudf/core/tools/numeric.py | 2 +- python/cudf/cudf/core/window/rolling.py | 2 +- python/cudf/cudf/tests/test_dataframe.py | 54 +++++++++++++--- python/cudf/cudf/tests/test_interpolate.py | 8 ++- python/cudf/cudf/tests/test_list.py | 15 +++-- python/cudf/cudf/tests/test_onehot.py | 12 +++- python/cudf/cudf/tests/test_repr.py | 6 +- python/cudf/cudf/tests/test_series.py | 2 +- python/cudf/cudf/tests/test_stats.py | 5 +- 23 files changed, 210 insertions(+), 82 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index dafaa8f4d1d..235cb4fd973 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -70,8 +70,7 @@ class Column: def nullable(self) -> bool: ... - @property - def has_nulls(self) -> bool: + def has_nulls(self, include_nan: bool=False) -> bool: ... @property diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index ff3f3050e63..5e0ee3136b7 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -146,8 +146,7 @@ cdef class Column: def nullable(self): return self.base_mask is not None - @property - def has_nulls(self): + def has_nulls(self, include_nan=False): return self.null_count != 0 @property diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 2fcc976d8e1..ac5e152d011 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -147,7 +147,7 @@ def _clean_nulls_from_index(self): methods using this method to replace or handle representation of the actual types correctly. 
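A minimal sketch of the behavior change described above, assuming the cudf Python API as of this PR (the asserts reflect the intent stated in the description, not a verified run):

    import cudf
    import numpy as np

    # An all-null column now defaults to `object` dtype instead of `float64`.
    df = cudf.DataFrame({"a": [None, None]})
    assert df["a"].dtype == np.dtype("object")

    # Passing nan_as_null=False keeps np.nan values, so the column stays float.
    df_nan = cudf.DataFrame({"a": [np.nan, np.nan]}, nan_as_null=False)
    assert df_nan["a"].dtype == np.dtype("float64")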
""" - if self._values.has_nulls: + if self._values.has_nulls(): return cudf.Index( self._values.astype("str").fillna(cudf._NA_REP), name=self.name ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1d113f6e159..a98052ce906 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -139,7 +139,7 @@ def values_host(self) -> "np.ndarray": if len(self) == 0: return np.array([], dtype=self.dtype) - if self.has_nulls: + if self.has_nulls(): raise ValueError("Column must have no nulls.") return self.data_array_view.copy_to_host() @@ -152,7 +152,7 @@ def values(self) -> "cupy.ndarray": if len(self) == 0: return cupy.array([], dtype=self.dtype) - if self.has_nulls: + if self.has_nulls(): raise ValueError("Column must have no nulls.") return cupy.asarray(self.data_array_view) @@ -193,7 +193,7 @@ def all(self, skipna: bool = True) -> bool: def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. result_col = self.nans_to_nulls() if skipna else self - if not skipna and result_col.has_nulls: + if not skipna and result_col.has_nulls(): return True elif skipna and result_col.null_count == result_col.size: return False @@ -786,7 +786,7 @@ def as_mask(self) -> Buffer: Buffer """ - if self.has_nulls: + if self.has_nulls(): raise ValueError("Column must have no nulls.") return bools_to_mask(self) @@ -797,13 +797,13 @@ def is_unique(self) -> bool: @property def is_monotonic_increasing(self) -> bool: - return not self.has_nulls and self.as_frame()._is_sorted( + return not self.has_nulls() and self.as_frame()._is_sorted( ascending=None, null_position=None ) @property def is_monotonic_decreasing(self) -> bool: - return not self.has_nulls and self.as_frame()._is_sorted( + return not self.has_nulls() and self.as_frame()._is_sorted( ascending=[False], null_position=None ) @@ -942,7 +942,7 @@ def as_categorical_column(self, dtype, **kwargs) -> ColumnBase: ) # columns include null index in factorization; remove: - if self.has_nulls: + if self.has_nulls(): cats = cats._column.dropna(drop_nan=False) min_type = min_unsigned_type(len(cats), 8) labels = labels - 1 @@ -1216,10 +1216,10 @@ def _process_for_reduction( if skipna: result_col = self.nans_to_nulls() - if result_col.has_nulls: + if result_col.has_nulls(): result_col = result_col.dropna() else: - if self.has_nulls: + if self.has_nulls(): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) result_col = self @@ -1766,12 +1766,20 @@ def as_column( "https://issues.apache.org/jira/browse/ARROW-3802" ) col = ColumnBase.from_arrow(arbitrary) + if isinstance(arbitrary, pa.NullArray): - if type(dtype) == str and dtype == "empty": - new_dtype = cudf.dtype(arbitrary.type.to_pandas_dtype()) + new_dtype = cudf.dtype(arbitrary.type.to_pandas_dtype()) + if dtype is not None: + # Cast the column to the `dtype` if specified. + col = col.astype(dtype) + elif len(arbitrary) == 0: + # If the column is empty, it has to be + # a `float64` dtype. + col = col.astype("float64") else: - new_dtype = cudf.dtype(dtype) - col = col.astype(new_dtype) + # If the null column is not empty, it has to + # be of `object` dtype. 
+ col = col.astype(new_dtype) return col diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 08d72f1c6ee..24ec25acbbb 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -292,7 +292,7 @@ def __cuda_array_interface__(self) -> Mapping[builtins.str, Any]: "version": 1, } - if self.nullable and self.has_nulls: + if self.nullable and self.has_nulls(): # Create a simple Python object that exposes the # `__cuda_array_interface__` attribute here since we need to modify diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index becb303feeb..c947440edb1 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -3,7 +3,16 @@ from __future__ import annotations from types import SimpleNamespace -from typing import Any, Callable, Mapping, Sequence, Tuple, Union, cast +from typing import ( + Any, + Callable, + Mapping, + Optional, + Sequence, + Tuple, + Union, + cast, +) import cupy import numpy as np @@ -47,6 +56,8 @@ class NumericalColumn(NumericalBaseColumn): mask : Buffer, optional """ + _nan_count: Optional[int] + def __init__( self, data: Buffer, @@ -62,7 +73,7 @@ def __init__( raise ValueError("Buffer size must be divisible by element size") if size is None: size = (data.size // dtype.itemsize) - offset - + self._nan_count = None super().__init__( data, size=size, @@ -72,6 +83,10 @@ def __init__( null_count=null_count, ) + def _clear_cache(self): + super()._clear_cache() + self._nan_count = None + def __contains__(self, item: ScalarLike) -> bool: """ Returns True if column contains item, else False. @@ -90,6 +105,11 @@ def __contains__(self, item: ScalarLike) -> bool: self, column.as_column([item], dtype=self.dtype) ).any() + def has_nulls(self, include_nan=False): + return self.null_count != 0 or ( + self.nan_count != 0 if include_nan else False + ) + @property def __cuda_array_interface__(self) -> Mapping[str, Any]: output = { @@ -100,7 +120,7 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]: "version": 1, } - if self.nullable and self.has_nulls: + if self.nullable and self.has_nulls(): # Create a simple Python object that exposes the # `__cuda_array_interface__` attribute here since we need to modify @@ -280,6 +300,15 @@ def as_numerical_column(self, dtype: Dtype, **kwargs) -> NumericalColumn: return self return libcudf.unary.cast(self, dtype) + @property + def nan_count(self) -> int: + if self.dtype.kind != "f": + self._nan_count = 0 + elif self._nan_count is None: + nan_col = libcudf.unary.is_nan(self) + self._nan_count = nan_col.sum() + return self._nan_count + def _process_values_for_isin( self, values: Sequence ) -> Tuple[ColumnBase, ColumnBase]: @@ -296,6 +325,20 @@ def _process_values_for_isin( return lhs, rhs + def _can_return_nan(self, skipna: bool = None) -> bool: + return not skipna and self.has_nulls(include_nan=True) + + def _process_for_reduction( + self, skipna: bool = None, min_count: int = 0 + ) -> Union[ColumnBase, ScalarLike]: + skipna = True if skipna is None else skipna + + if self._can_return_nan(skipna=skipna): + return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) + return super()._process_for_reduction( + skipna=skipna, min_count=min_count + ) + def _default_na_value(self) -> ScalarLike: """Returns the default NA value for this column""" dkind = self.dtype.kind @@ -319,8 +362,19 @@ def find_and_replace( """ Return col with *to_replace* replaced with *value*. 
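A minimal sketch of the new NaN-aware helpers and reduction behavior added in this file, assuming the cudf Python API as of this PR (`Series._column` is internal API):

    import cudf
    import numpy as np

    s = cudf.Series([1.0, 2.0, np.nan], nan_as_null=False)

    # New column-level helpers introduced by this PR (internal API).
    assert s._column.nan_count == 1
    assert s._column.has_nulls(include_nan=True)
    assert not s._column.has_nulls()  # no actual nulls, only a NaN

    # Reductions treat NaNs like nulls: skipped by default, propagated otherwise.
    assert s.sum() == 3.0
    assert np.isnan(s.sum(skipna=False))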
""" + + # If all of `to_replace`/`replacement` are `None`, + # dtype of `to_replace_col`/`replacement_col` + # is inferred as `string`, but this is a valid + # float64 column too, Hence we will need to type-cast + # to self.dtype. to_replace_col = column.as_column(to_replace) + if to_replace_col.null_count == len(to_replace_col): + to_replace_col = to_replace_col.astype(self.dtype) + replacement_col = column.as_column(replacement) + if replacement_col.null_count == len(replacement_col): + replacement_col = replacement_col.astype(self.dtype) if type(to_replace_col) != type(replacement_col): raise TypeError( @@ -578,7 +632,7 @@ def to_pandas( arrow_array = self.to_arrow() pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array) pd_series = pd.Series(pandas_array, copy=False) - elif str(self.dtype) in NUMERIC_TYPES and not self.has_nulls: + elif str(self.dtype) in NUMERIC_TYPES and not self.has_nulls(): pd_series = pd.Series(cupy.asnumpy(self.values), copy=False) else: pd_series = self.to_arrow().to_pandas(**kwargs) @@ -597,6 +651,8 @@ def _normalize_find_and_replace_input( ) col_to_normalize_dtype = normalized_column.dtype if isinstance(col_to_normalize, list): + if normalized_column.null_count == len(normalized_column): + normalized_column = normalized_column.astype(input_column_dtype) col_to_normalize_dtype = min_column_type( normalized_column, input_column_dtype ) diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py index 853fb360c50..1f84cb88e37 100644 --- a/python/cudf/cudf/core/column/numerical_base.py +++ b/python/cudf/cudf/core/column/numerical_base.py @@ -77,10 +77,13 @@ def sum_of_squares( "sum_of_squares", skipna=skipna, dtype=dtype, min_count=min_count ) + def _can_return_nan(self, skipna: bool = None) -> bool: + return not skipna and self.has_nulls() + def kurtosis(self, skipna: bool = None) -> float: skipna = True if skipna is None else skipna - if len(self) == 0 or (not skipna and self.has_nulls): + if len(self) == 0 or self._can_return_nan(skipna=skipna): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) self = self.nans_to_nulls().dropna() # type: ignore @@ -105,7 +108,7 @@ def kurtosis(self, skipna: bool = None) -> float: def skew(self, skipna: bool = None) -> ScalarLike: skipna = True if skipna is None else skipna - if len(self) == 0 or (not skipna and self.has_nulls): + if len(self) == 0 or self._can_return_nan(skipna=skipna): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) self = self.nans_to_nulls().dropna() # type: ignore @@ -148,7 +151,7 @@ def quantile( def median(self, skipna: bool = None) -> NumericalBaseColumn: skipna = True if skipna is None else skipna - if not skipna and self.has_nulls: + if self._can_return_nan(skipna=skipna): return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) # enforce linear in case the default ever changes diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 2a91abc5701..1c9a013810a 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5018,7 +5018,7 @@ def edit_distance_matrix(self) -> ParentType: raise ValueError( "Require size >= 2 to compute edit distance matrix." ) - if self._column.has_nulls: + if self._column.has_nulls(): raise ValueError( "Cannot compute edit distance between null strings. " "Consider removing them using `dropna` or fill with `fillna`." 
@@ -5440,20 +5440,7 @@ def find_and_replace( """ to_replace_col = column.as_column(to_replace) - if to_replace_col.null_count == len(to_replace_col): - # If all of `to_replace` are `None`, dtype of `to_replace_col` - # is inferred as `float64`, but this is a valid - # string column too, Hence we will need to type-cast - # to self.dtype. - to_replace_col = to_replace_col.astype(self.dtype) - replacement_col = column.as_column(replacement) - if replacement_col.null_count == len(replacement_col): - # If all of `replacement` are `None`, dtype of `replacement_col` - # is inferred as `float64`, but this is a valid - # string column too, Hence we will need to type-cast - # to self.dtype. - replacement_col = replacement_col.astype(self.dtype) if type(to_replace_col) != type(replacement_col): raise TypeError( diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bbe691595e7..88c8aaebd9e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -440,6 +440,11 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): Data type to force. Only a single dtype is allowed. If None, infer. + nan_as_null : bool, Default True + If ``None``/``True``, converts ``np.nan`` values to + ``null`` values. + If ``False``, leaves ``np.nan`` values as is. + Examples -------- @@ -514,7 +519,9 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin): _iloc_indexer_type = _DataFrameIlocIndexer @annotate("DATAFRAME_INIT", color="blue", domain="cudf_python") - def __init__(self, data=None, index=None, columns=None, dtype=None): + def __init__( + self, data=None, index=None, columns=None, dtype=None, nan_as_null=True + ): super().__init__() @@ -523,7 +530,7 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): if isinstance(data, (DataFrame, pd.DataFrame)): if isinstance(data, pd.DataFrame): - data = self.from_pandas(data) + data = self.from_pandas(data, nan_as_null=nan_as_null) if index is not None: if not data.index.equals(index): @@ -546,11 +553,14 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): self.columns = data.columns elif isinstance(data, (cudf.Series, pd.Series)): if isinstance(data, pd.Series): - data = cudf.Series.from_pandas(data) + data = cudf.Series.from_pandas(data, nan_as_null=nan_as_null) name = data.name or 0 self._init_from_dict_like( - {name: data}, index=index, columns=columns + {name: data}, + index=index, + columns=columns, + nan_as_null=nan_as_null, ) elif data is None: if index is None: @@ -620,7 +630,9 @@ def __init__(self, data=None, index=None, columns=None, dtype=None): if not is_dict_like(data): raise TypeError("data must be list or dict-like") - self._init_from_dict_like(data, index=index, columns=columns) + self._init_from_dict_like( + data, index=index, columns=columns, nan_as_null=nan_as_null + ) if dtype: self._data = self.astype(dtype)._data @@ -759,7 +771,9 @@ def _init_from_list_like(self, data, index=None, columns=None): self.columns = columns - def _init_from_dict_like(self, data, index=None, columns=None): + def _init_from_dict_like( + self, data, index=None, columns=None, nan_as_null=None + ): if columns is not None: # remove all entries in `data` that are # not in `columns` @@ -794,7 +808,9 @@ def _init_from_dict_like(self, data, index=None, columns=None): if is_scalar(data[col_name]): num_rows = num_rows or 1 else: - data[col_name] = column.as_column(data[col_name]) + data[col_name] = column.as_column( + data[col_name], nan_as_null=nan_as_null + ) 
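    # A minimal sketch of how the new `nan_as_null` constructor argument is
    # expected to behave when building a DataFrame from pandas data, assuming
    # the cudf Python API as of this PR:
    import cudf
    import numpy as np
    import pandas as pd

    pdf = pd.DataFrame({"a": [1.0, np.nan]})
    # Default (nan_as_null=True): the NaN is converted to a null.
    assert cudf.DataFrame(pdf)["a"].null_count == 1
    # nan_as_null=False: the NaN is kept as a floating-point NaN, not a null.
    assert cudf.DataFrame(pdf, nan_as_null=False)["a"].null_count == 0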
num_rows = len(data[col_name]) self._index = RangeIndex(0, num_rows) else: @@ -806,7 +822,9 @@ def _init_from_dict_like(self, data, index=None, columns=None): self._data.multiindex = self._data.multiindex and isinstance( col_name, tuple ) - self.insert(i, col_name, data[col_name]) + self.insert( + i, col_name, data[col_name], nan_as_null=nan_as_null + ) if columns is not None: self.columns = columns @@ -1747,7 +1765,7 @@ def _clean_nulls_from_dataframe(self, df): if is_list_dtype(df._data[col]) or is_struct_dtype(df._data[col]): # TODO we need to handle this pass - elif df._data[col].has_nulls: + elif df._data[col].has_nulls(): df[col] = df._data[col].astype("str").fillna(cudf._NA_REP) else: df[col] = df._data[col] @@ -2582,7 +2600,7 @@ def take(self, indices, axis=0, keep_index=None): return out @annotate("INSERT", color="green", domain="cudf_python") - def insert(self, loc, name, value): + def insert(self, loc, name, value, nan_as_null=None): """Add a column to DataFrame at the index specified by loc. Parameters @@ -2625,11 +2643,11 @@ def insert(self, loc, name, value): ) self._data = new_data elif isinstance(value, (pd.Series, Series)): - value = Series(value)._align_to_index( + value = Series(value, nan_as_null=nan_as_null)._align_to_index( self._index, how="right", sort=False ) - value = column.as_column(value) + value = column.as_column(value, nan_as_null=nan_as_null) self._data.insert(name, value, loc=loc) @@ -3081,7 +3099,7 @@ def as_gpu_matrix(self, columns=None, order="F"): dtype = find_common_type([col.dtype for col in cols]) for k, c in self._data.items(): - if c.has_nulls: + if c.has_nulls(): raise ValueError( f"column '{k}' has null values. " f"hint: use .fillna() to replace null values" diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 61ce64e7d6b..c85ed0c8555 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -4755,7 +4755,7 @@ def _scan(self, op, axis=None, skipna=True, cast_to_int=False): result_col = self._data[name].nans_to_nulls() else: result_col = self._data[name].copy() - if result_col.has_nulls: + if result_col.has_nulls(include_nan=True): # Workaround as find_first_value doesn't seem to work # incase of bools. first_index = int( diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 0002aaf38c5..29e0d17bc39 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2515,7 +2515,7 @@ def _clean_nulls_from_index(self): Convert all na values(if any) in Index object to `` as a preprocessing step to `__repr__` methods. 
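A minimal sketch of the corresponding Index behavior, mirroring the updated repr tests later in this patch and assuming the cudf Python API as of this PR:

    import cudf

    # An all-None Index now defaults to object dtype (a StringIndex).
    idx = cudf.Index([None, None], name="x")
    assert idx.dtype == "object"

    # The previous float behavior is still available by requesting it explicitly.
    fidx = cudf.Index([None, None], dtype="float64", name="x")
    assert fidx.dtype == "float64"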
""" - if self._values.has_nulls: + if self._values.has_nulls(): return self.fillna(cudf._NA_REP) else: return self diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 65c79b4cf59..c403c697e3d 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -386,7 +386,7 @@ def __repr__(self): else: preprocess = self.copy(deep=False) - if any(col.has_nulls for col in preprocess._data.columns): + if any(col.has_nulls() for col in preprocess._data.columns): preprocess_df = preprocess.to_frame(index=False) for name, col in preprocess._data.items(): if isinstance( diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index bbeae1adc5e..036c8c1ee00 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1474,7 +1474,7 @@ def has_nulls(self): >>> series.dropna().has_nulls False """ - return self._column.has_nulls + return self._column.has_nulls() def dropna(self, axis=0, inplace=False, how=None): """ diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 34d62ffc048..3efbd982b53 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -330,7 +330,7 @@ def _process_col(col, unit, dayfirst, infer_datetime_format, format): col = col.as_datetime_column(dtype=_unit_dtype_map[unit]) elif col.dtype.kind in ("O"): - if unit not in (None, "ns"): + if unit not in (None, "ns") or col.null_count == len(col): try: col = col.astype(dtype="int64") except ValueError: diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 7c688b92009..bd1b505c57f 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -165,7 +165,7 @@ def to_numeric(arg, errors="raise", downcast=None): if isinstance(arg, (cudf.Series, pd.Series)): return cudf.Series(col) else: - if col.has_nulls: + if col.has_nulls(): # To match pandas, always return a floating type filled with nan. 
col = col.astype(float).fillna(np.nan) return col.values diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 617dbdeaea5..0f4256e49a6 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -326,7 +326,7 @@ def apply(self, func, *args, **kwargs): """ has_nulls = False if isinstance(self.obj, cudf.Series): - if self.obj._column.has_nulls: + if self.obj._column.has_nulls(): has_nulls = True else: for col in self.obj._data: diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index c40f9f0b0a5..ab0856fad1e 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -246,7 +246,7 @@ def test_series_init_none(): sr1 = cudf.Series() got = sr1.to_string() - expect = "Series([], dtype: float64)" + expect = sr1.to_pandas().__repr__() # values should match despite whitespace difference assert got.split() == expect.split() @@ -254,7 +254,7 @@ def test_series_init_none(): sr2 = cudf.Series(None) got = sr2.to_string() - expect = "Series([], dtype: float64)" + expect = sr2.to_pandas().__repr__() # values should match despite whitespace difference assert got.split() == expect.split() @@ -1308,7 +1308,7 @@ def test_concat_empty_dataframe(df_1, df_2): [ {"a": [1, 2], "b": [1, 2], "c": ["s1", "s2"], "d": [1.0, 2.0]}, {"b": [1.9, 10.9], "c": ["s1", "s2"]}, - {"c": ["s1"], "b": [None], "a": [False]}, + {"c": ["s1"], "b": pd.Series([None], dtype="float"), "a": [False]}, ], ) @pytest.mark.parametrize( @@ -2008,8 +2008,8 @@ def test_dataframe_count_reduction(data, func): {"x": [np.nan, 2, 3, 4, 100, np.nan], "y": [4, 5, 6, 88, 99, np.nan]}, {"x": [1, 2, 3], "y": [4, 5, 6]}, {"x": [np.nan, np.nan, np.nan], "y": [np.nan, np.nan, np.nan]}, - {"x": [], "y": []}, - {"x": []}, + {"x": pd.Series([], dtype="float"), "y": pd.Series([], dtype="float")}, + {"x": pd.Series([], dtype="int")}, ], ) @pytest.mark.parametrize("ops", ["sum", "product", "prod"]) @@ -2017,7 +2017,7 @@ def test_dataframe_count_reduction(data, func): @pytest.mark.parametrize("min_count", [-10, -1, 0, 1, 2, 3, 10]) def test_dataframe_min_count_ops(data, ops, skipna, min_count): psr = pd.DataFrame(data) - gsr = cudf.DataFrame(data) + gsr = cudf.from_pandas(psr) assert_eq( getattr(psr, ops)(skipna=skipna, min_count=min_count), @@ -2498,7 +2498,7 @@ def test_series_all_null(num_elements, null_type): # Typecast Pandas because None will return `object` dtype expect = pd.Series(data, dtype="float64") - got = cudf.Series(data) + got = cudf.Series(data, dtype="float64") assert_eq(expect, got) @@ -8480,10 +8480,10 @@ def test_agg_for_dataframe_with_string_columns(aggs): ) def test_update_for_dataframes(data, data2, join, overwrite, errors): pdf = pd.DataFrame(data) - gdf = cudf.DataFrame(data) + gdf = cudf.DataFrame(data, nan_as_null=False) other_pd = pd.DataFrame(data2) - other_gd = cudf.DataFrame(data2) + other_gd = cudf.DataFrame(data2, nan_as_null=False) pdf.update(other=other_pd, join=join, overwrite=overwrite, errors=errors) gdf.update(other=other_gd, join=join, overwrite=overwrite, errors=errors) @@ -8949,7 +8949,9 @@ def test_frame_series_where_other(data): ( { "id": ["a", "a", "b", "b", "c", "c"], - "val": [None, None, None, None, None, None], + "val": cudf.Series( + [None, None, None, None, None, None], dtype="float64" + ), }, ["id"], ), @@ -9041,6 +9043,38 @@ def test_pearson_corr_multiindex_dataframe(): assert_eq(expected, actual) +@pytest.mark.parametrize( + "data", + 
[ + {"a": [np.nan, 1, 2], "b": [None, None, None]}, + {"a": [1, 2, np.nan, 2], "b": [np.nan, np.nan, np.nan, np.nan]}, + { + "a": [1, 2, np.nan, 2, None], + "b": [np.nan, np.nan, None, np.nan, np.nan], + }, + {"a": [1, 2, 2, None, 1.1], "b": [1, 2.2, 3, None, 5]}, + ], +) +@pytest.mark.parametrize("nan_as_null", [True, False]) +def test_dataframe_constructor_nan_as_null(data, nan_as_null): + actual = cudf.DataFrame(data, nan_as_null=nan_as_null) + + if nan_as_null: + assert ( + not ( + actual.astype("float").replace( + cudf.Series([np.nan], nan_as_null=False), cudf.Series([-1]) + ) + == -1 + ) + .any() + .any() + ) + else: + actual = actual.select_dtypes(exclude=["object"]) + assert (actual.replace(np.nan, -1) == -1).any().any() + + def test_dataframe_add_prefix(): cdf = cudf.DataFrame({"A": [1, 2, 3, 4], "B": [3, 4, 5, 6]}) pdf = cdf.to_pandas() diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py index 66556c48828..2c544dfc17c 100644 --- a/python/cudf/cudf/tests/test_interpolate.py +++ b/python/cudf/cudf/tests/test_interpolate.py @@ -50,7 +50,9 @@ def test_interpolate_series(data, method, axis): expect = psr.interpolate(method=method, axis=axis) got = gsr.interpolate(method=method, axis=axis) - assert_eq(expect, got) + assert_eq( + expect, got, check_dtype=False if psr.dtype == "object" else True + ) @pytest.mark.parametrize( @@ -88,7 +90,9 @@ def test_interpolate_series_values_or_index(data, index, method): expect = psr.interpolate(method=method) got = gsr.interpolate(method=method) - assert_eq(expect, got) + assert_eq( + expect, got, check_dtype=False if psr.dtype == "object" else True + ) @pytest.mark.parametrize( diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 2b71ca7ac36..b898222d7d7 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -76,10 +76,14 @@ def test_leaves(data): pa_array = pa.array(data) while hasattr(pa_array, "flatten"): pa_array = pa_array.flatten() - dtype = "int8" if isinstance(pa_array, pa.NullArray) else None - expect = cudf.Series(pa_array, dtype=dtype) + + expect = cudf.Series(pa_array) got = cudf.Series(data).list.leaves - assert_eq(expect, got) + assert_eq( + expect, + got, + check_dtype=False if isinstance(pa_array, pa.NullArray) else True, + ) def test_list_to_pandas_nullable_true(): @@ -269,7 +273,10 @@ def test_get(data, index, expect): sr = cudf.Series(data) expect = cudf.Series(expect) got = sr.list.get(index) - assert_eq(expect, got) + + assert_eq( + expect, got, check_dtype=False if expect.isnull().all() else True + ) def test_get_nested_lists(): diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py index ed55fb86820..f2a20a73b63 100644 --- a/python/cudf/cudf/tests/test_onehot.py +++ b/python/cudf/cudf/tests/test_onehot.py @@ -113,10 +113,18 @@ def test_get_dummies(data): encoded_expected = pd.get_dummies(pdf, prefix="test") encoded_actual = cudf.get_dummies(gdf, prefix="test") - utils.assert_eq(encoded_expected, encoded_actual) + utils.assert_eq( + encoded_expected, + encoded_actual, + check_dtype=False if len(data) == 0 else True, + ) encoded_actual = cudf.get_dummies(gdf, prefix="test", dtype=np.uint8) - utils.assert_eq(encoded_expected, encoded_actual) + utils.assert_eq( + encoded_expected, + encoded_actual, + check_dtype=False if len(data) == 0 else True, + ) @pytest.mark.parametrize("n_cols", [5, 10, 20]) diff --git a/python/cudf/cudf/tests/test_repr.py 
b/python/cudf/cudf/tests/test_repr.py index 736bcf131cc..fe95b2930df 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -328,10 +328,14 @@ def test_dataframe_sliced(gdf, slice, max_seq_items, max_rows): ), ( cudf.Index([None, None, None], name="hello"), + "StringIndex([None None None], dtype='object', name='hello')", + ), + ( + cudf.Index([None, None, None], dtype="float", name="hello"), "Float64Index([, , ], dtype='float64', name='hello')", ), ( - cudf.Index([None], name="hello"), + cudf.Index([None], dtype="float64", name="hello"), "Float64Index([], dtype='float64', name='hello')", ), ( diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 1e11e862329..583d2c7a8dd 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -971,7 +971,7 @@ def test_series_update(data, other): @pytest.mark.parametrize("nan_as_null", [True, False]) @pytest.mark.parametrize("fill_value", [1.2, 332, np.nan]) def test_fillna_with_nan(data, nan_as_null, fill_value): - gs = cudf.Series(data, nan_as_null=nan_as_null) + gs = cudf.Series(data, dtype="float64", nan_as_null=nan_as_null) ps = gs.to_pandas() expected = ps.fillna(fill_value) diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py index ebe78d56c3f..142ca6c6831 100644 --- a/python/cudf/cudf/tests/test_stats.py +++ b/python/cudf/cudf/tests/test_stats.py @@ -460,7 +460,8 @@ def test_df_corr(): @pytest.mark.parametrize("skipna", [True, False, None]) def test_nans_stats(data, ops, skipna): psr = cudf.utils.utils._create_pandas_series(data=data) - gsr = cudf.Series(data) + gsr = cudf.Series(data, nan_as_null=False) + assert_eq( getattr(psr, ops)(skipna=skipna), getattr(gsr, ops)(skipna=skipna) ) @@ -486,7 +487,7 @@ def test_nans_stats(data, ops, skipna): @pytest.mark.parametrize("min_count", [-10, -1, 0, 1, 2, 3, 5, 10]) def test_min_count_ops(data, ops, skipna, min_count): psr = pd.Series(data) - gsr = cudf.Series(data) + gsr = cudf.Series(data, nan_as_null=False) assert_eq( getattr(psr, ops)(skipna=skipna, min_count=min_count), From 7a23f1a01547648db7ad684fa3dc0482b7ac813f Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 14 Dec 2021 15:28:59 -0500 Subject: [PATCH 02/13] Add utility to format ninja-log build times (#9631) Reference: https://github.com/rapidsai/ops/issues/1896 Generate build times log from formatted, sorted `.ninja_log` file. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - Robert Maynard (https://github.com/robertmaynard) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/9631 --- build.sh | 18 +++++ ci/gpu/build.sh | 12 +++- cpp/scripts/sort_ninja_log.py | 121 ++++++++++++++++++++++++++++++++++ 3 files changed, 150 insertions(+), 1 deletion(-) create mode 100755 cpp/scripts/sort_ninja_log.py diff --git a/build.sh b/build.sh index d0ccd4821e0..adf6e220744 100755 --- a/build.sh +++ b/build.sh @@ -172,6 +172,12 @@ if buildAll || hasArg libcudf; then echo "Building for *ALL* supported GPU architectures..." 
fi + # get the current count before the compile starts + FILES_IN_CCACHE="" + if [ -x "$(command -v ccache)" ]; then + FILES_IN_CCACHE=$(ccache -s | grep "files in cache") + fi + cmake -S $REPODIR/cpp -B ${LIB_BUILD_DIR} \ -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ ${CUDF_CMAKE_CUDA_ARCHITECTURES} \ @@ -185,7 +191,19 @@ if buildAll || hasArg libcudf; then cd ${LIB_BUILD_DIR} + compile_start=$(date +%s) cmake --build . -j${PARALLEL_LEVEL} ${VERBOSE_FLAG} + compile_end=$(date +%s) + compile_total=$(( compile_end - compile_start )) + + # Record build times + if [[ -f "${LIB_BUILD_DIR}/.ninja_log" ]]; then + echo "Formatting build times" + python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt xml > ${LIB_BUILD_DIR}/ninja_log.xml + message="$FILES_IN_CCACHE
<br/>
$PARALLEL_LEVEL parallel build time is $compile_total seconds" + echo "$message" + python ${REPODIR}/cpp/scripts/sort_ninja_log.py ${LIB_BUILD_DIR}/.ninja_log --fmt html --msg "$message" > ${LIB_BUILD_DIR}/ninja_log.html + fi if [[ ${INSTALL_TARGET} != "" ]]; then cmake --build . -j${PARALLEL_LEVEL} --target install ${VERBOSE_FLAG} diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index d8b5cc7ba4c..00ad6bf812d 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -166,16 +166,26 @@ else gpuci_logger "Check GPU usage" nvidia-smi - gpuci_logger "GoogleTests" set -x cd $LIB_BUILD_DIR + gpuci_logger "GoogleTests" + for gt in gtests/* ; do test_name=$(basename ${gt}) echo "Running GoogleTest $test_name" ${gt} --gtest_output=xml:"$WORKSPACE/test-results/" done + # Copy libcudf build time results + echo "Checking for build time log $LIB_BUILD_DIR/ninja_log.html" + if [[ -f "$LIB_BUILD_DIR/ninja_log.html" ]]; then + gpuci_logger "Copying build time results" + cp "$LIB_BUILD_DIR/ninja_log.xml" "$WORKSPACE/test-results/buildtimes-junit.xml" + mkdir -p "$WORKSPACE/build-metrics" + cp "$LIB_BUILD_DIR/ninja_log.html" "$WORKSPACE/build-metrics/BuildMetrics.html" + fi + ################################################################################ # MEMCHECK - Run compute-sanitizer on GoogleTest (only in nightly builds) ################################################################################ diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py new file mode 100755 index 00000000000..5eada13aea2 --- /dev/null +++ b/cpp/scripts/sort_ninja_log.py @@ -0,0 +1,121 @@ +# +# Copyright (c) 2021, NVIDIA CORPORATION. +# +import argparse +import os +import sys +import xml.etree.ElementTree as ET +from xml.dom import minidom + +parser = argparse.ArgumentParser() +parser.add_argument( + "log_file", type=str, default=".ninja_log", help=".ninja_log file" +) +parser.add_argument( + "--fmt", + type=str, + default="csv", + choices=["csv", "xml", "html"], + help="output format (to stdout)", +) +parser.add_argument( + "--msg", + type=str, + default=None, + help="optional message to include in html output", +) +args = parser.parse_args() + +log_file = args.log_file +log_path = os.path.dirname(os.path.abspath(log_file)) + +output_fmt = args.fmt + +# build a map of the log entries +entries = {} +with open(log_file, "r") as log: + for line in log: + entry = line.split() + if len(entry) > 4: + elapsed = int(entry[1]) - int(entry[0]) + obj_file = entry[3] + file_size = ( + os.path.getsize(os.path.join(log_path, obj_file)) + if os.path.exists(obj_file) + else 0 + ) + entries[entry[3]] = (elapsed, file_size) + +# check file could be loaded +if len(entries) == 0: + print("Could not parse", log_file) + exit() + +# sort the keys by build time (descending order) +keys = list(entries.keys()) +sl = sorted(keys, key=lambda k: entries[k][0], reverse=True) + +if output_fmt == "xml": + # output results in XML format + root = ET.Element("testsuites") + testsuite = ET.Element( + "testsuite", + attrib={ + "name": "build-time", + "tests": str(len(keys)), + "failures": str(0), + "errors": str(0), + }, + ) + root.append(testsuite) + for key in sl: + entry = entries[key] + elapsed = float(entry[0]) / 1000 + item = ET.Element( + "testcase", + attrib={ + "classname": "BuildTime", + "name": key, + "time": str(elapsed), + }, + ) + testsuite.append(item) + + tree = ET.ElementTree(root) + xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent=" ") + print(xmlstr) + +elif output_fmt == "html": + # output results in 
HTML format
+    print("<html><head><title>Sorted Ninja Build Times</title>")
+    print("</head>")
+    print("<body>")
+    if args.msg is not None:
+        print("<p>", args.msg, "</p>")
+    print("<table>")
+    print(
+        "<tr><th>File</th>",
+        "<th>Compile time (ms)</th>",
+        "<th>Size (bytes)</th></tr>",
+        sep="",
+    )
+    for key in sl:
+        result = entries[key]
+        print(
+            "<tr><td>",
+            key,
+            "</td><td>",
+            result[0],
+            "</td><td>",
+            result[1],
+            "</td></tr>",
+            sep="",
+        )
+    print("</table></body></html>
") + +else: + # output results in CSV format + print("time,size,file") + for key in sl: + result = entries[key] + print(result[0], result[1], key, sep=",") From 61794aaef64d1eb918910817799e66d51d2d6cda Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 14 Dec 2021 13:13:12 -0800 Subject: [PATCH 03/13] Fix a memcheck error in ORC writer (#9896) Follow up of https://github.com/rapidsai/cudf/pull/9808 Skips some kernels when input columns are empty to avoid OOB memory access. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Devavret Makkar (https://github.com/devavret) - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/9896 --- cpp/src/io/orc/writer_impl.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index a7a767585e6..c1eb9891229 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1018,6 +1018,7 @@ std::vector writer::impl::gather_stripes( hostdevice_2dvector* enc_streams, hostdevice_2dvector* strm_desc) { + if (segmentation.num_stripes() == 0) { return {}; } std::vector stripes(segmentation.num_stripes()); for (auto const& stripe : segmentation.stripes) { for (size_t col_idx = 0; col_idx < enc_streams->size().first; col_idx++) { From 41f99565d9d85d6dfb63ccd29a9717ce1dbb7eb5 Mon Sep 17 00:00:00 2001 From: Devavret Makkar Date: Wed, 15 Dec 2021 03:39:41 +0530 Subject: [PATCH 04/13] Add partitioning support in parquet writer (#9810) Contributes to https://github.com/rapidsai/cudf/issues/5059 Adds libcudf support for writing partitioned datasets in parquet writer. With the new API, one can specify a vector of `{start_row, num_rows}` structs along with a table st slices of the input table gets written to the corresponding sink. Adds Multi-sink support in `sink_info` Authors: - Devavret Makkar (https://github.com/devavret) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/9810 --- cpp/include/cudf/io/data_sink.hpp | 16 + cpp/include/cudf/io/detail/parquet.hpp | 15 +- cpp/include/cudf/io/orc.hpp | 59 +++ cpp/include/cudf/io/parquet.hpp | 153 ++++++- cpp/include/cudf/io/types.hpp | 87 +++- cpp/src/io/functions.cpp | 88 ++-- cpp/src/io/orc/writer_impl.cu | 12 +- cpp/src/io/orc/writer_impl.hpp | 2 + cpp/src/io/parquet/chunk_dict.cu | 141 ++----- cpp/src/io/parquet/page_enc.cu | 161 ++----- cpp/src/io/parquet/parquet_gpu.hpp | 46 +- cpp/src/io/parquet/writer_impl.cu | 511 +++++++++++++++-------- cpp/src/io/parquet/writer_impl.hpp | 33 +- cpp/tests/io/parquet_test.cpp | 102 ++++- python/cudf/cudf/_lib/cpp/io/orc.pxd | 9 + python/cudf/cudf/_lib/cpp/io/parquet.pxd | 24 +- python/cudf/cudf/_lib/cpp/io/types.pxd | 15 +- python/cudf/cudf/_lib/orc.pyx | 9 +- python/cudf/cudf/_lib/parquet.pyx | 29 +- 19 files changed, 962 insertions(+), 550 deletions(-) diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp index 42421aed716..2c1966ee6ba 100644 --- a/cpp/include/cudf/io/data_sink.hpp +++ b/cpp/include/cudf/io/data_sink.hpp @@ -69,6 +69,22 @@ class data_sink { */ static std::unique_ptr create(cudf::io::data_sink* const user_sink); + /** + * @brief Creates a vector of data sinks, one per element in the input vector. 
+ * + * @param[in] args vector of parameters + */ + template + static std::vector> create(std::vector const& args) + { + std::vector> sinks; + sinks.reserve(args.size()); + std::transform(args.cbegin(), args.cend(), std::back_inserter(sinks), [](auto const& arg) { + return data_sink::create(arg); + }); + return sinks; + } + /** * @brief Base class destructor */ diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index a18bd450640..9af2e3f278d 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -89,13 +89,13 @@ class writer { /** * @brief Constructor for output to a file. * - * @param sink The data sink to write the data to + * @param sinks The data sinks to write the data to * @param options Settings for controlling writing behavior * @param mode Option to write at once or in chunks * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit writer(std::unique_ptr sink, + explicit writer(std::vector> sinks, parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, @@ -104,7 +104,7 @@ class writer { /** * @brief Constructor for writer to handle chunked parquet options. * - * @param sink The data sink to write the data to + * @param sinks The data sinks to write the data to * @param options Settings for controlling writing behavior for chunked writer * @param mode Option to write at once or in chunks * @param stream CUDA stream used for device memory operations and kernel launches @@ -112,7 +112,7 @@ class writer { * * @return A parquet-compatible blob that contains the data for all rowgroups in the list */ - explicit writer(std::unique_ptr sink, + explicit writer(std::vector> sinks, chunked_parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, @@ -127,8 +127,10 @@ class writer { * @brief Writes a single subtable as part of a larger parquet file/table write. * * @param[in] table The table information to be written + * @param[in] partitions Optional partitions to divide the table into. If specified, must be same + * size as number of sinks. */ - void write(table_view const& table); + void write(table_view const& table, std::vector const& partitions = {}); /** * @brief Finishes the chunked/streamed write process. @@ -138,7 +140,8 @@ class writer { * @return A parquet-compatible blob that contains the data for all rowgroups in the list only if * `column_chunks_file_path` is provided, else null. */ - std::unique_ptr> close(std::string const& column_chunks_file_path = ""); + std::unique_ptr> close( + std::vector const& column_chunks_file_path = {}); /** * @brief Merges multiple metadata blobs returned by write_all into a single metadata blob diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 16588185f3d..b3a2f6bcbbb 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -454,6 +454,8 @@ class orc_writer_options { table_view _table; // Optional associated metadata const table_input_metadata* _metadata = nullptr; + // Optional footer key_value_metadata + std::map _user_data; friend orc_writer_options_builder; @@ -530,6 +532,11 @@ class orc_writer_options { */ table_input_metadata const* get_metadata() const { return _metadata; } + /** + * @brief Returns Key-Value footer metadata information. 
+ */ + std::map const& get_key_value_metadata() const { return _user_data; } + // Setters /** @@ -591,6 +598,16 @@ class orc_writer_options { * @param meta Associated metadata. */ void set_metadata(table_input_metadata const* meta) { _metadata = meta; } + + /** + * @brief Sets metadata. + * + * @param metadata Key-Value footer metadata + */ + void set_key_value_metadata(std::map metadata) + { + _user_data = std::move(metadata); + } }; class orc_writer_options_builder { @@ -698,6 +715,18 @@ class orc_writer_options_builder { return *this; } + /** + * @brief Sets Key-Value footer metadata. + * + * @param metadata Key-Value footer metadata + * @return this for chaining. + */ + orc_writer_options_builder& key_value_metadata(std::map metadata) + { + options._user_data = std::move(metadata); + return *this; + } + /** * @brief move orc_writer_options member once it's built. */ @@ -753,6 +782,8 @@ class chunked_orc_writer_options { size_type _row_index_stride = default_row_index_stride; // Optional associated metadata const table_input_metadata* _metadata = nullptr; + // Optional footer key_value_metadata + std::map _user_data; friend chunked_orc_writer_options_builder; @@ -819,6 +850,11 @@ class chunked_orc_writer_options { */ table_input_metadata const* get_metadata() const { return _metadata; } + /** + * @brief Returns Key-Value footer metadata information. + */ + std::map const& get_key_value_metadata() const { return _user_data; } + // Setters /** @@ -873,6 +909,16 @@ class chunked_orc_writer_options { * @param meta Associated metadata. */ void metadata(table_input_metadata const* meta) { _metadata = meta; } + + /** + * @brief Sets Key-Value footer metadata. + * + * @param metadata Key-Value footer metadata + */ + void set_key_value_metadata(std::map metadata) + { + _user_data = std::move(metadata); + } }; class chunked_orc_writer_options_builder { @@ -965,6 +1011,19 @@ class chunked_orc_writer_options_builder { return *this; } + /** + * @brief Sets Key-Value footer metadata. + * + * @param metadata Key-Value footer metadata + * @return this for chaining. + */ + chunked_orc_writer_options_builder& key_value_metadata( + std::map metadata) + { + options._user_data = std::move(metadata); + return *this; + } + /** * @brief move chunked_orc_writer_options member once it's built. */ diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp index 2215f24b550..740f7a8b2db 100644 --- a/cpp/include/cudf/io/parquet.hpp +++ b/cpp/include/cudf/io/parquet.hpp @@ -364,13 +364,17 @@ class parquet_writer_options { statistics_freq _stats_level = statistics_freq::STATISTICS_ROWGROUP; // Sets of columns to output table_view _table; + // Partitions described as {start_row, num_rows} pairs + std::vector _partitions; // Optional associated metadata table_input_metadata const* _metadata = nullptr; + // Optional footer key_value_metadata + std::vector> _user_data; // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. // If true then overrides any per-column setting in _metadata. bool _write_timestamps_as_int96 = false; - // Column chunks file path to be set in the raw output metadata - std::string _column_chunks_file_path; + // Column chunks file paths to be set in the raw output metadata. 
One per output file + std::vector _column_chunks_file_paths; // Maximum size of each row group (unless smaller than a single page) size_t _row_group_size_bytes = default_row_group_size_bytes; // Maximum number of rows in row group (unless smaller than a single page) @@ -434,20 +438,36 @@ class parquet_writer_options { */ table_view get_table() const { return _table; } + /** + * @brief Returns partitions. + */ + std::vector const& get_partitions() const { return _partitions; } + /** * @brief Returns associated metadata. */ table_input_metadata const* get_metadata() const { return _metadata; } + /** + * @brief Returns Key-Value footer metadata information. + */ + std::vector> const& get_key_value_metadata() const + { + return _user_data; + } + /** * @brief Returns `true` if timestamps will be written as INT96 */ bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; } /** - * @brief Returns Column chunks file path to be set in the raw output metadata. + * @brief Returns Column chunks file paths to be set in the raw output metadata. */ - std::string get_column_chunks_file_path() const { return _column_chunks_file_path; } + std::vector const& get_column_chunks_file_paths() const + { + return _column_chunks_file_paths; + } /** * @brief Returns maximum row group size, in bytes. @@ -459,6 +479,19 @@ class parquet_writer_options { */ auto get_row_group_size_rows() const { return _row_group_size_rows; } + /** + * @brief Sets partitions. + * + * @param partitions Partitions of input table in {start_row, num_rows} pairs. If specified, must + * be same size as number of sinks in sink_info + */ + void set_partitions(std::vector partitions) + { + CUDF_EXPECTS(partitions.size() == _sink.num_sinks(), + "Mismatch between number of sinks and number of partitions"); + _partitions = std::move(partitions); + } + /** * @brief Sets metadata. * @@ -466,6 +499,18 @@ class parquet_writer_options { */ void set_metadata(table_input_metadata const* metadata) { _metadata = metadata; } + /** + * @brief Sets metadata. + * + * @param metadata Key-Value footer metadata + */ + void set_key_value_metadata(std::vector> metadata) + { + CUDF_EXPECTS(metadata.size() == _sink.num_sinks(), + "Mismatch between number of sinks and number of metadata maps"); + _user_data = std::move(metadata); + } + /** * @brief Sets the level of statistics. * @@ -491,11 +536,14 @@ class parquet_writer_options { /** * @brief Sets column chunks file path to be set in the raw output metadata. * - * @param file_path String which indicates file path. + * @param file_paths Vector of Strings which indicates file path. Must be same size as number of + * data sinks in sink info */ - void set_column_chunks_file_path(std::string file_path) + void set_column_chunks_file_paths(std::vector file_paths) { - _column_chunks_file_path.assign(file_path); + CUDF_EXPECTS(file_paths.size() == _sink.num_sinks(), + "Mismatch between number of sinks and number of chunk paths to set"); + _column_chunks_file_paths = std::move(file_paths); } /** @@ -543,6 +591,21 @@ class parquet_writer_options_builder { { } + /** + * @brief Sets partitions in parquet_writer_options. + * + * @param partitions Partitions of input table in {start_row, num_rows} pairs. If specified, must + * be same size as number of sinks in sink_info + * @return this for chaining. 
+ */ + parquet_writer_options_builder& partitions(std::vector partitions) + { + CUDF_EXPECTS(partitions.size() == options._sink.num_sinks(), + "Mismatch between number of sinks and number of partitions"); + options.set_partitions(std::move(partitions)); + return *this; + } + /** * @brief Sets metadata in parquet_writer_options. * @@ -555,6 +618,21 @@ class parquet_writer_options_builder { return *this; } + /** + * @brief Sets Key-Value footer metadata in parquet_writer_options. + * + * @param metadata Key-Value footer metadata + * @return this for chaining. + */ + parquet_writer_options_builder& key_value_metadata( + std::vector> metadata) + { + CUDF_EXPECTS(metadata.size() == options._sink.num_sinks(), + "Mismatch between number of sinks and number of metadata maps"); + options._user_data = std::move(metadata); + return *this; + } + /** * @brief Sets the level of statistics in parquet_writer_options. * @@ -582,12 +660,15 @@ class parquet_writer_options_builder { /** * @brief Sets column chunks file path to be set in the raw output metadata. * - * @param file_path String which indicates file path. + * @param file_paths Vector of Strings which indicates file path. Must be same size as number of + * data sinks * @return this for chaining. */ - parquet_writer_options_builder& column_chunks_file_path(std::string file_path) + parquet_writer_options_builder& column_chunks_file_paths(std::vector file_paths) { - options._column_chunks_file_path.assign(file_path); + CUDF_EXPECTS(file_paths.size() == options._sink.num_sinks(), + "Mismatch between number of sinks and number of chunk paths to set"); + options.set_column_chunks_file_paths(std::move(file_paths)); return *this; } @@ -690,6 +771,8 @@ class chunked_parquet_writer_options { statistics_freq _stats_level = statistics_freq::STATISTICS_ROWGROUP; // Optional associated metadata. table_input_metadata const* _metadata = nullptr; + // Optional footer key_value_metadata + std::vector> _user_data; // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS. // If true then overrides any per-column setting in _metadata. bool _write_timestamps_as_int96 = false; @@ -735,6 +818,14 @@ class chunked_parquet_writer_options { */ table_input_metadata const* get_metadata() const { return _metadata; } + /** + * @brief Returns Key-Value footer metadata information. + */ + std::vector> const& get_key_value_metadata() const + { + return _user_data; + } + /** * @brief Returns `true` if timestamps will be written as INT96 */ @@ -757,6 +848,18 @@ class chunked_parquet_writer_options { */ void set_metadata(table_input_metadata const* metadata) { _metadata = metadata; } + /** + * @brief Sets Key-Value footer metadata. + * + * @param metadata Key-Value footer metadata + */ + void set_key_value_metadata(std::vector> metadata) + { + CUDF_EXPECTS(metadata.size() == _sink.num_sinks(), + "Mismatch between number of sinks and number of metadata maps"); + _user_data = std::move(metadata); + } + /** * @brief Sets the level of statistics in parquet_writer_options. * @@ -841,6 +944,21 @@ class chunked_parquet_writer_options_builder { return *this; } + /** + * @brief Sets Key-Value footer metadata in parquet_writer_options. + * + * @param metadata Key-Value footer metadata + * @return this for chaining. 
+ */ + chunked_parquet_writer_options_builder& key_value_metadata( + std::vector> metadata) + { + CUDF_EXPECTS(metadata.size() == options._sink.num_sinks(), + "Mismatch between number of sinks and number of metadata maps"); + options.set_key_value_metadata(std::move(metadata)); + return *this; + } + /** * @brief Sets Sets the level of statistics in chunked_parquet_writer_options. * @@ -958,18 +1076,25 @@ class parquet_chunked_writer { * @brief Writes table to output. * * @param[in] table Table that needs to be written + * @param[in] partitions Optional partitions to divide the table into. If specified, must be same + * size as number of sinks. + * + * @throws cudf::logic_error If the number of partitions is not the smae as number of sinks * @return returns reference of the class object */ - parquet_chunked_writer& write(table_view const& table); + parquet_chunked_writer& write(table_view const& table, + std::vector const& partitions = {}); /** * @brief Finishes the chunked/streamed write process. * - * @param[in] column_chunks_file_path Column chunks file path to be set in the raw output metadata + * @param[in] column_chunks_file_paths Column chunks file path to be set in the raw output + * metadata * @return A parquet-compatible blob that contains the data for all rowgroups in the list only if - * `column_chunks_file_path` is provided, else null. + * `column_chunks_file_paths` is provided, else null. */ - std::unique_ptr> close(std::string const& column_chunks_file_path = ""); + std::unique_ptr> close( + std::vector const& column_chunks_file_paths = {}); // Unique pointer to impl writer class std::unique_ptr writer; diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index cf6be8a20af..512a90b3249 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -151,61 +151,93 @@ struct host_buffer { * @brief Source information for read interfaces */ struct source_info { - io_type type = io_type::FILEPATH; - std::vector filepaths; - std::vector buffers; - std::vector> files; - std::vector user_sources; + std::vector> _files; source_info() = default; explicit source_info(std::vector const& file_paths) - : type(io_type::FILEPATH), filepaths(file_paths) + : _type(io_type::FILEPATH), _filepaths(file_paths) { } explicit source_info(std::string const& file_path) - : type(io_type::FILEPATH), filepaths({file_path}) + : _type(io_type::FILEPATH), _filepaths({file_path}) { } explicit source_info(std::vector const& host_buffers) - : type(io_type::HOST_BUFFER), buffers(host_buffers) + : _type(io_type::HOST_BUFFER), _buffers(host_buffers) { } explicit source_info(const char* host_data, size_t size) - : type(io_type::HOST_BUFFER), buffers({{host_data, size}}) + : _type(io_type::HOST_BUFFER), _buffers({{host_data, size}}) { } explicit source_info(std::vector const& sources) - : type(io_type::USER_IMPLEMENTED), user_sources(sources) + : _type(io_type::USER_IMPLEMENTED), _user_sources(sources) { } explicit source_info(cudf::io::datasource* source) - : type(io_type::USER_IMPLEMENTED), user_sources({source}) + : _type(io_type::USER_IMPLEMENTED), _user_sources({source}) { } + + auto type() const { return _type; } + auto const& filepaths() const { return _filepaths; } + auto const& buffers() const { return _buffers; } + auto const& files() const { return _files; } + auto const& user_sources() const { return _user_sources; } + + private: + io_type _type = io_type::FILEPATH; + std::vector _filepaths; + std::vector _buffers; + std::vector _user_sources; }; /** * @brief 
Destination information for write interfaces */ struct sink_info { - io_type type = io_type::VOID; - std::string filepath; - std::vector* buffer = nullptr; - cudf::io::data_sink* user_sink = nullptr; - sink_info() = default; + sink_info(size_t num_sinks) : _type(io_type::VOID), _num_sinks(num_sinks) {} - explicit sink_info(const std::string& file_path) : type(io_type::FILEPATH), filepath(file_path) {} + explicit sink_info(std::vector const& file_paths) + : _type(io_type::FILEPATH), _num_sinks(file_paths.size()), _filepaths(file_paths) + { + } + explicit sink_info(std::string const& file_path) + : _type(io_type::FILEPATH), _filepaths({file_path}) + { + } - explicit sink_info(std::vector* buffer) : type(io_type::HOST_BUFFER), buffer(buffer) {} + explicit sink_info(std::vector*> const& buffers) + : _type(io_type::HOST_BUFFER), _num_sinks(buffers.size()), _buffers(buffers) + { + } + explicit sink_info(std::vector* buffer) : _type(io_type::HOST_BUFFER), _buffers({buffer}) {} - explicit sink_info(class cudf::io::data_sink* user_sink_) - : type(io_type::USER_IMPLEMENTED), user_sink(user_sink_) + explicit sink_info(std::vector const& user_sinks) + : _type(io_type::USER_IMPLEMENTED), _num_sinks(user_sinks.size()), _user_sinks(user_sinks) { } + explicit sink_info(class cudf::io::data_sink* user_sink) + : _type(io_type::USER_IMPLEMENTED), _user_sinks({user_sink}) + { + } + + auto type() const { return _type; } + auto num_sinks() const { return _num_sinks; } + auto const& filepaths() const { return _filepaths; } + auto const& buffers() const { return _buffers; } + auto const& user_sinks() const { return _user_sinks; } + + private: + io_type _type = io_type::VOID; + size_t _num_sinks = 1; + std::vector _filepaths; + std::vector*> _buffers; + std::vector _user_sinks; }; class table_input_metadata; @@ -369,12 +401,21 @@ class table_input_metadata { * The constructed table_input_metadata has the same structure as the passed table_view * * @param table The table_view to construct metadata for - * @param user_data Optional Additional metadata to encode, as key-value pairs */ - table_input_metadata(table_view const& table, std::map user_data = {}); + table_input_metadata(table_view const& table); std::vector column_metadata; - std::map user_data; //!< Format-dependent metadata as key-values pairs +}; + +/** + * @brief Information used while writing partitioned datasets + * + * This information defines the slice of an input table to write to file. 
In partitioned dataset + * writing, one partition_info struct defines one partition and corresponds to one output file + */ +struct partition_info { + size_type start_row; + size_type num_rows; }; } // namespace io diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 768d6b25690..04638d3eca9 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -111,27 +111,33 @@ std::vector> make_datasources(source_info size_t range_offset = 0, size_t range_size = 0) { - switch (info.type) { + switch (info.type()) { case io_type::FILEPATH: { auto sources = std::vector>(); - for (auto const& filepath : info.filepaths) { + for (auto const& filepath : info.filepaths()) { sources.emplace_back(cudf::io::datasource::create(filepath, range_offset, range_size)); } return sources; } - case io_type::HOST_BUFFER: return cudf::io::datasource::create(info.buffers); - case io_type::USER_IMPLEMENTED: return cudf::io::datasource::create(info.user_sources); + case io_type::HOST_BUFFER: return cudf::io::datasource::create(info.buffers()); + case io_type::USER_IMPLEMENTED: return cudf::io::datasource::create(info.user_sources()); default: CUDF_FAIL("Unsupported source type"); } } -std::unique_ptr make_datasink(sink_info const& info) +std::vector> make_datasinks(sink_info const& info) { - switch (info.type) { - case io_type::FILEPATH: return cudf::io::data_sink::create(info.filepath); - case io_type::HOST_BUFFER: return cudf::io::data_sink::create(info.buffer); - case io_type::VOID: return cudf::io::data_sink::create(); - case io_type::USER_IMPLEMENTED: return cudf::io::data_sink::create(info.user_sink); + switch (info.type()) { + case io_type::FILEPATH: return cudf::io::data_sink::create(info.filepaths()); + case io_type::HOST_BUFFER: return cudf::io::data_sink::create(info.buffers()); + case io_type::VOID: { + std::vector> sinks; + for (size_t i = 0; i < info.num_sinks(); ++i) { + sinks.push_back(cudf::io::data_sink::create()); + } + return sinks; + } + case io_type::USER_IMPLEMENTED: return cudf::io::data_sink::create(info.user_sinks()); default: CUDF_FAIL("Unsupported sink type"); } } @@ -156,9 +162,9 @@ compression_type infer_compression_type(compression_type compression, source_inf { if (compression != compression_type::AUTO) { return compression; } - if (info.type != io_type::FILEPATH) { return compression_type::NONE; } + if (info.type() != io_type::FILEPATH) { return compression_type::NONE; } - auto filepath = info.filepaths[0]; + auto filepath = info.filepaths()[0]; // Attempt to infer from the file extension const auto pos = filepath.find_last_of('.'); @@ -218,10 +224,11 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc { using namespace cudf::io::detail; - auto sink = make_datasink(options.get_sink()); + auto sinks = make_datasinks(options.get_sink()); + CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for CSV writing"); return csv::write_csv( // - sink.get(), + sinks[0].get(), options.get_table(), options.get_metadata(), options, @@ -235,15 +242,16 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info) { // Get source to read statistics from std::unique_ptr source; - if (src_info.type == io_type::FILEPATH) { - CUDF_EXPECTS(src_info.filepaths.size() == 1, "Only a single source is currently supported."); - source = cudf::io::datasource::create(src_info.filepaths[0]); - } else if (src_info.type == io_type::HOST_BUFFER) { - CUDF_EXPECTS(src_info.buffers.size() == 1, "Only a single source is currently supported."); - 
source = cudf::io::datasource::create(src_info.buffers[0]); - } else if (src_info.type == io_type::USER_IMPLEMENTED) { - CUDF_EXPECTS(src_info.user_sources.size() == 1, "Only a single source is currently supported."); - source = cudf::io::datasource::create(src_info.user_sources[0]); + if (src_info.type() == io_type::FILEPATH) { + CUDF_EXPECTS(src_info.filepaths().size() == 1, "Only a single source is currently supported."); + source = cudf::io::datasource::create(src_info.filepaths()[0]); + } else if (src_info.type() == io_type::HOST_BUFFER) { + CUDF_EXPECTS(src_info.buffers().size() == 1, "Only a single source is currently supported."); + source = cudf::io::datasource::create(src_info.buffers()[0]); + } else if (src_info.type() == io_type::USER_IMPLEMENTED) { + CUDF_EXPECTS(src_info.user_sources().size() == 1, + "Only a single source is currently supported."); + source = cudf::io::datasource::create(src_info.user_sources()[0]); } else { CUDF_FAIL("Unsupported source type"); } @@ -350,9 +358,11 @@ void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resourc CUDF_FUNC_RANGE(); - auto sink = make_datasink(options.get_sink()); + auto sinks = make_datasinks(options.get_sink()); + CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for ORC writing"); + auto writer = std::make_unique( - std::move(sink), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); + std::move(sinks[0]), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); writer->write(options.get_table()); } @@ -365,10 +375,11 @@ orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options { namespace io_detail = cudf::io::detail; - auto sink = make_datasink(options.get_sink()); + auto sinks = make_datasinks(options.get_sink()); + CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for ORC writing"); writer = std::make_unique( - std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); + std::move(sinks[0]), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); } /** @@ -417,9 +428,7 @@ std::unique_ptr> merge_row_group_metadata( return detail_parquet::writer::merge_row_group_metadata(metadata_list); } -table_input_metadata::table_input_metadata(table_view const& table, - std::map user_data) - : user_data{std::move(user_data)} +table_input_metadata::table_input_metadata(table_view const& table) { // Create a metadata hierarchy using `table` std::function get_children = [&](column_view const& col) { @@ -443,13 +452,13 @@ std::unique_ptr> write_parquet(parquet_writer_options const CUDF_FUNC_RANGE(); - auto sink = make_datasink(options.get_sink()); + auto sinks = make_datasinks(options.get_sink()); auto writer = std::make_unique( - std::move(sink), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); + std::move(sinks), options, io_detail::SingleWriteMode::YES, rmm::cuda_stream_default, mr); - writer->write(options.get_table()); + writer->write(options.get_table(), options.get_partitions()); - return writer->close(options.get_column_chunks_file_path()); + return writer->close(options.get_column_chunks_file_paths()); } /** @@ -460,20 +469,21 @@ parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options co { namespace io_detail = cudf::io::detail; - auto sink = make_datasink(options.get_sink()); + auto sinks = make_datasinks(options.get_sink()); writer = std::make_unique( - std::move(sink), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, 
mr); + std::move(sinks), options, io_detail::SingleWriteMode::NO, rmm::cuda_stream_default, mr); } /** * @copydoc cudf::io::parquet_chunked_writer::write */ -parquet_chunked_writer& parquet_chunked_writer::write(table_view const& table) +parquet_chunked_writer& parquet_chunked_writer::write(table_view const& table, + std::vector const& partitions) { CUDF_FUNC_RANGE(); - writer->write(table); + writer->write(table, partitions); return *this; } @@ -482,7 +492,7 @@ parquet_chunked_writer& parquet_chunked_writer::write(table_view const& table) * @copydoc cudf::io::parquet_chunked_writer::close */ std::unique_ptr> parquet_chunked_writer::close( - std::string const& column_chunks_file_path) + std::vector const& column_chunks_file_path) { CUDF_FUNC_RANGE(); return writer->close(column_chunks_file_path); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index c1eb9891229..b0e674c206f 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1313,6 +1313,7 @@ writer::impl::impl(std::unique_ptr sink, compression_kind_(to_orc_compression(options.get_compression())), enable_statistics_(options.is_enabled_statistics()), single_write_mode(mode == SingleWriteMode::YES), + kv_meta(options.get_key_value_metadata()), out_sink_(std::move(sink)) { if (options.get_metadata()) { @@ -1333,6 +1334,7 @@ writer::impl::impl(std::unique_ptr sink, compression_kind_(to_orc_compression(options.get_compression())), enable_statistics_(options.is_enabled_statistics()), single_write_mode(mode == SingleWriteMode::YES), + kv_meta(options.get_key_value_metadata()), out_sink_(std::move(sink)) { if (options.get_metadata()) { @@ -2069,12 +2071,10 @@ void writer::impl::close() PostScript ps; ff.contentLength = out_sink_->bytes_written(); - std::transform(table_meta->user_data.begin(), - table_meta->user_data.end(), - std::back_inserter(ff.metadata), - [&](auto const& udata) { - return UserMetadataItem{udata.first, udata.second}; - }); + std::transform( + kv_meta.begin(), kv_meta.end(), std::back_inserter(ff.metadata), [&](auto const& udata) { + return UserMetadataItem{udata.first, udata.second}; + }); // Write statistics metadata if (md.stripeStats.size() != 0) { diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 68622d17b28..80c22b09927 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -369,6 +369,8 @@ class writer::impl { bool const single_write_mode; // optional user metadata std::unique_ptr table_meta; + // optional user metadata + std::map kv_meta; // to track if the output has been written to sink bool closed = false; diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 42d27dadd1a..5589f87e57e 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -95,69 +95,41 @@ struct map_find_fn { template __global__ void __launch_bounds__(block_size, 1) populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan chunks, - size_type num_rows) + cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; auto t = threadIdx.x; + auto frag = frags[col_idx][block_x]; + auto chunk = frag.chunk; + auto col = chunk->col_desc; - auto start_row = - block_x * - max_page_fragment_size; // This is fragment size. all chunks are multiple of these many rows. 
- size_type end_row = min(start_row + max_page_fragment_size, num_rows); + size_type start_row = frag.start_row; + size_type end_row = frag.start_row + frag.num_rows; - __shared__ EncColumnChunk* s_chunk; - __shared__ parquet_column_device_view s_col; __shared__ size_type s_start_value_idx; __shared__ size_type s_num_values; - if (t == 0) { - // Find the chunk this block is a part of - size_type num_rowgroups = chunks.size().first; - size_type rg_idx = 0; - while (rg_idx < num_rowgroups) { - if (auto ck = chunks[rg_idx][col_idx]; - start_row >= ck.start_row and start_row < ck.start_row + ck.num_rows) { - break; - } - ++rg_idx; - } - s_chunk = &chunks[rg_idx][col_idx]; - s_col = *(s_chunk->col_desc); - } - __syncthreads(); - if (not s_chunk->use_dictionary) { return; } + + if (not chunk->use_dictionary) { return; } if (t == 0) { // Find the bounds of values in leaf column to be inserted into the map for current chunk - auto col = *(s_col.parent_column); - auto start_value_idx = start_row; - auto end_value_idx = end_row; - while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { - if (col.type().id() == type_id::STRUCT) { - start_value_idx += col.offset(); - end_value_idx += col.offset(); - col = col.child(0); - } else { - auto offset_col = col.child(lists_column_view::offsets_column_index); - start_value_idx = offset_col.element(start_value_idx + col.offset()); - end_value_idx = offset_col.element(end_value_idx + col.offset()); - col = col.child(lists_column_view::child_column_index); - } - } - s_start_value_idx = start_value_idx; - s_num_values = end_value_idx - start_value_idx; + auto cudf_col = *(col->parent_column); + s_start_value_idx = row_to_value_idx(start_row, cudf_col); + auto end_value_idx = row_to_value_idx(end_row, cudf_col); + s_num_values = end_value_idx - s_start_value_idx; } __syncthreads(); - column_device_view const& data_col = *s_col.leaf_column; + column_device_view const& data_col = *col->leaf_column; using block_reduce = cub::BlockReduce; __shared__ typename block_reduce::TempStorage reduce_storage; // Make a view of the hash map auto hash_map_mutable = map_type::device_mutable_view( - s_chunk->dict_map_slots, s_chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); + chunk->dict_map_slots, chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); auto hash_map = map_type::device_view( - s_chunk->dict_map_slots, s_chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); + chunk->dict_map_slots, chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); __shared__ int total_num_dict_entries; for (size_type i = 0; i < s_num_values; i += block_size) { @@ -176,7 +148,7 @@ __global__ void __launch_bounds__(block_size, 1) type_dispatcher(data_col.type(), map_insert_fn{hash_map_mutable}, data_col, val_idx); uniq_elem_size = [&]() -> size_type { if (not is_unique) { return 0; } - switch (s_col.physical_type) { + switch (col->physical_type) { case Type::INT32: return 4; case Type::INT64: return 8; case Type::INT96: return 12; @@ -200,9 +172,9 @@ __global__ void __launch_bounds__(block_size, 1) __syncthreads(); auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size); if (t == 0) { - total_num_dict_entries = atomicAdd(&s_chunk->num_dict_entries, num_unique); + total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique); total_num_dict_entries += num_unique; - atomicAdd(&s_chunk->uniq_data_size, uniq_data_size); + atomicAdd(&chunk->uniq_data_size, uniq_data_size); } __syncthreads(); @@ -246,67 +218,38 @@ __global__ void 
__launch_bounds__(block_size, 1) template __global__ void __launch_bounds__(block_size, 1) get_dictionary_indices_kernel(cudf::detail::device_2dspan chunks, - size_type num_rows) + cudf::detail::device_2dspan frags) { auto col_idx = blockIdx.y; auto block_x = blockIdx.x; auto t = threadIdx.x; + auto frag = frags[col_idx][block_x]; + auto chunk = frag.chunk; + auto col = chunk->col_desc; - size_type start_row = block_x * max_page_fragment_size; - size_type end_row = min(start_row + max_page_fragment_size, num_rows); + size_type start_row = frag.start_row; + size_type end_row = frag.start_row + frag.num_rows; - __shared__ EncColumnChunk s_chunk; - __shared__ parquet_column_device_view s_col; __shared__ size_type s_start_value_idx; __shared__ size_type s_ck_start_val_idx; __shared__ size_type s_num_values; if (t == 0) { - // Find the chunk this block is a part of - size_type num_rowgroups = chunks.size().first; - size_type rg_idx = 0; - while (rg_idx < num_rowgroups) { - if (auto ck = chunks[rg_idx][col_idx]; - start_row >= ck.start_row and start_row < ck.start_row + ck.num_rows) { - break; - } - ++rg_idx; - } - s_chunk = chunks[rg_idx][col_idx]; - s_col = *(s_chunk.col_desc); - - // Find the bounds of values in leaf column to be inserted into the map for current chunk - - auto col = *(s_col.parent_column); - auto start_value_idx = start_row; - auto end_value_idx = end_row; - auto chunk_start_val_idx = s_chunk.start_row; - while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { - if (col.type().id() == type_id::STRUCT) { - start_value_idx += col.offset(); - chunk_start_val_idx += col.offset(); - end_value_idx += col.offset(); - col = col.child(0); - } else { - auto offset_col = col.child(lists_column_view::offsets_column_index); - start_value_idx = offset_col.element(start_value_idx + col.offset()); - chunk_start_val_idx = offset_col.element(chunk_start_val_idx + col.offset()); - end_value_idx = offset_col.element(end_value_idx + col.offset()); - col = col.child(lists_column_view::child_column_index); - } - } - s_start_value_idx = start_value_idx; - s_ck_start_val_idx = chunk_start_val_idx; - s_num_values = end_value_idx - start_value_idx; + // Find the bounds of values in leaf column to be searched in the map for current chunk + auto cudf_col = *(col->parent_column); + s_start_value_idx = row_to_value_idx(start_row, cudf_col); + s_ck_start_val_idx = row_to_value_idx(chunk->start_row, cudf_col); + auto end_value_idx = row_to_value_idx(end_row, cudf_col); + s_num_values = end_value_idx - s_start_value_idx; } __syncthreads(); - if (not s_chunk.use_dictionary) { return; } + if (not chunk->use_dictionary) { return; } - column_device_view const& data_col = *s_col.leaf_column; + column_device_view const& data_col = *col->leaf_column; auto map = map_type::device_view( - s_chunk.dict_map_slots, s_chunk.dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); + chunk->dict_map_slots, chunk->dict_map_size, KEY_SENTINEL, VALUE_SENTINEL); for (size_t i = 0; i < s_num_values; i += block_size) { if (t + i < s_num_values) { @@ -321,7 +264,7 @@ __global__ void __launch_bounds__(block_size, 1) if (found_slot != map.end()) { // No need for atomic as this is not going to be modified by any other thread auto* val_ptr = reinterpret_cast(&found_slot->second); - s_chunk.dict_index[val_idx - s_ck_start_val_idx] = *val_ptr; + chunk->dict_index[val_idx - s_ck_start_val_idx] = *val_ptr; } } } @@ -336,16 +279,14 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st } void 
populate_chunk_hash_maps(cudf::detail::device_2dspan chunks, - size_type num_rows, + cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { constexpr int block_size = 256; - auto const grid_x = cudf::detail::grid_1d(num_rows, max_page_fragment_size); - auto const num_columns = chunks.size().second; - dim3 const dim_grid(grid_x.num_blocks, num_columns); + dim3 const dim_grid(frags.size().second, frags.size().first); populate_chunk_hash_maps_kernel - <<>>(chunks, num_rows); + <<>>(chunks, frags); } void collect_map_entries(device_span chunks, rmm::cuda_stream_view stream) @@ -355,16 +296,14 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi } void get_dictionary_indices(cudf::detail::device_2dspan chunks, - size_type num_rows, + cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream) { constexpr int block_size = 256; - auto const grid_x = cudf::detail::grid_1d(num_rows, max_page_fragment_size); - auto const num_columns = chunks.size().second; - dim3 const dim_grid(grid_x.num_blocks, num_columns); + dim3 const dim_grid(frags.size().second, frags.size().first); get_dictionary_indices_kernel - <<>>(chunks, num_rows); + <<>>(chunks, frags); } } // namespace gpu } // namespace parquet diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 33647ff626c..ec6b24b3b4e 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -49,7 +50,6 @@ constexpr uint32_t rle_buffer_size = (1 << 9); struct frag_init_state_s { parquet_column_device_view col; PageFragment frag; - size_type start_value_idx; }; struct page_enc_state_s { @@ -114,24 +114,14 @@ inline __device__ uint32_t uint64_init_hash(uint64_t v) return uint32_init_hash(static_cast(v + (v >> 32))); } -/** - * @brief Initializes encoder page fragments - * - * Based on the number of rows in each fragment, populates the value count, the size of data in the - * fragment, the number of unique values, and the data size of unique values. - * - * @param[in] frag Fragment array [fragment_id][column_id] - * @param[in] col_desc Column description array [column_id] - * @param[in] num_fragments Number of fragments per column - * @param[in] num_columns Number of columns - */ // blockDim {512,1,1} template __global__ void __launch_bounds__(block_size) gpuInitPageFragments(device_2dspan frag, device_span col_desc, - uint32_t fragment_size, - uint32_t max_num_rows) + device_span partitions, + device_span part_frag_offset, + uint32_t fragment_size) { __shared__ __align__(16) frag_init_state_s state_g; @@ -140,53 +130,36 @@ __global__ void __launch_bounds__(block_size) frag_init_state_s* const s = &state_g; uint32_t t = threadIdx.x; + int frag_y = blockIdx.y; if (t == 0) s->col = col_desc[blockIdx.x]; __syncthreads(); - uint32_t const start_row = blockIdx.y * fragment_size; if (!t) { - // frag.num_rows = fragment_size except for the last page fragment which can be smaller. + // Find which partition this fragment came from + auto it = + thrust::upper_bound(thrust::seq, part_frag_offset.begin(), part_frag_offset.end(), frag_y); + int p = it - part_frag_offset.begin() - 1; + int part_end_row = partitions[p].start_row + partitions[p].num_rows; + s->frag.start_row = (frag_y - part_frag_offset[p]) * fragment_size + partitions[p].start_row; + + // frag.num_rows = fragment_size except for the last fragment in partition which can be smaller. 
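+ // Illustration: with fragment_size = 5000, a partition that starts at global row 3000 and has 12000 rows gets fragments covering rows [3000, 8000), [8000, 13000) and [13000, 15000), so only its last fragment is short.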
// num_rows is fixed but fragment size could be larger if the data is strings or nested. - s->frag.num_rows = min(fragment_size, max_num_rows - min(start_row, max_num_rows)); + s->frag.num_rows = min(fragment_size, part_end_row - s->frag.start_row); s->frag.num_dict_vals = 0; s->frag.fragment_data_size = 0; s->frag.dict_data_size = 0; - // To use num_vals instead of num_rows, we need to calculate num_vals on the fly. - // For list>, values between i and i+50 can be calculated by - // off_11 = off[i], off_12 = off[i+50] - // off_21 = child.off[off_11], off_22 = child.off[off_12] - // etc... - size_type end_value_idx = start_row + s->frag.num_rows; - if (s->col.parent_column == nullptr) { - s->start_value_idx = start_row; - } else { - auto col = *(s->col.parent_column); - auto current_start_value_idx = start_row; - while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { - if (col.type().id() == type_id::STRUCT) { - current_start_value_idx += col.offset(); - end_value_idx += col.offset(); - col = col.child(0); - } else { - auto offset_col = col.child(lists_column_view::offsets_column_index); - current_start_value_idx = - offset_col.element(current_start_value_idx + col.offset()); - end_value_idx = offset_col.element(end_value_idx + col.offset()); - col = col.child(lists_column_view::child_column_index); - } - } - s->start_value_idx = current_start_value_idx; - } - s->frag.start_value_idx = s->start_value_idx; - s->frag.num_leaf_values = end_value_idx - s->start_value_idx; + auto col = *(s->col.parent_column); + s->frag.start_value_idx = row_to_value_idx(s->frag.start_row, col); + size_type end_value_idx = row_to_value_idx(s->frag.start_row + s->frag.num_rows, col); + s->frag.num_leaf_values = end_value_idx - s->frag.start_value_idx; if (s->col.level_offsets != nullptr) { // For nested schemas, the number of values in a fragment is not directly related to the // number of encoded data elements or the number of rows. It is simply the number of // repetition/definition values which together encode validity and nesting information. - size_type first_level_val_idx = s->col.level_offsets[start_row]; - size_type last_level_val_idx = s->col.level_offsets[start_row + s->frag.num_rows]; + size_type first_level_val_idx = s->col.level_offsets[s->frag.start_row]; + size_type last_level_val_idx = s->col.level_offsets[s->frag.start_row + s->frag.num_rows]; s->frag.num_values = last_level_val_idx - first_level_val_idx; } else { s->frag.num_values = s->frag.num_rows; @@ -197,7 +170,7 @@ __global__ void __launch_bounds__(block_size) __syncthreads(); size_type nvals = s->frag.num_leaf_values; - size_type start_value_idx = s->start_value_idx; + size_type start_value_idx = s->frag.start_value_idx; for (uint32_t i = 0; i < nvals; i += block_size) { uint32_t val_idx = start_value_idx + i + t; @@ -912,28 +885,9 @@ __global__ void __launch_bounds__(128, 8) dst[0] = dict_bits; s->rle_out = dst + 1; } - s->page_start_val = s->page.start_row; // Dictionary page's start row is chunk's start row - auto chunk_start_val = s->ck.start_row; - if (s->col.parent_column != nullptr) { // TODO: remove this check. parent is now never nullptr - auto col = *(s->col.parent_column); - auto current_page_start_val = s->page_start_val; - // TODO: We do this so much. 
Add a global function that converts row idx to val idx - while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { - if (col.type().id() == type_id::STRUCT) { - current_page_start_val += col.offset(); - chunk_start_val += col.offset(); - col = col.child(0); - } else { - auto offset_col = col.child(lists_column_view::offsets_column_index); - current_page_start_val = - offset_col.element(current_page_start_val + col.offset()); - chunk_start_val = offset_col.element(chunk_start_val + col.offset()); - col = col.child(lists_column_view::child_column_index); - } - } - s->page_start_val = current_page_start_val; - s->chunk_start_val = chunk_start_val; - } + auto col = *(s->col.parent_column); + s->page_start_val = row_to_value_idx(s->page.start_row, col); + s->chunk_start_val = row_to_value_idx(s->ck.start_row, col); } __syncthreads(); for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) { @@ -1944,36 +1898,20 @@ dremel_data get_dremel_data(column_view h_col, std::move(new_offsets), std::move(rep_level), std::move(def_level), leaf_data_size}; } -/** - * @brief Launches kernel for initializing encoder page fragments - * - * @param[in,out] frag Fragment array [column_id][fragment_id] - * @param[in] col_desc Column description array [column_id] - * @param[in] num_fragments Number of fragments per column - * @param[in] num_columns Number of columns - * @param[in] stream CUDA stream to use, default 0 - */ void InitPageFragments(device_2dspan frag, device_span col_desc, + device_span partitions, + device_span part_frag_offset, uint32_t fragment_size, - uint32_t num_rows, rmm::cuda_stream_view stream) { auto num_columns = frag.size().first; auto num_fragments_per_column = frag.size().second; dim3 dim_grid(num_columns, num_fragments_per_column); // 1 threadblock per fragment - gpuInitPageFragments<512> - <<>>(frag, col_desc, fragment_size, num_rows); + gpuInitPageFragments<512><<>>( + frag, col_desc, partitions, part_frag_offset, fragment_size); } -/** - * @brief Launches kernel for initializing fragment statistics groups - * - * @param[out] groups Statistics groups [num_columns x num_fragments] - * @param[in] fragments Page fragments [num_columns x num_fragments] - * @param[in] col_desc Column description [num_columns] - * @param[in] stream CUDA stream to use, default 0 - */ void InitFragmentStatistics(device_2dspan groups, device_2dspan fragments, device_span col_desc, @@ -1986,19 +1924,6 @@ void InitFragmentStatistics(device_2dspan groups, gpuInitFragmentStats<<>>(groups, fragments, col_desc); } -/** - * @brief Launches kernel for initializing encoder data pages - * - * @param[in,out] chunks Column chunks [rowgroup][column] - * @param[out] pages Encode page array (null if just counting pages) - * @param[in] col_desc Column description array [column_id] - * @param[in] num_rowgroups Number of fragments per column - * @param[in] num_columns Number of columns - * @param[out] page_grstats Setup for page-level stats - * @param[out] chunk_grstats Setup for chunk-level stats - * @param[in] max_page_comp_data_size Calculated maximum compressed data size of pages - * @param[in] stream CUDA stream to use, default 0 - */ void InitEncoderPages(device_2dspan chunks, device_span pages, device_span col_desc, @@ -2014,14 +1939,6 @@ void InitEncoderPages(device_2dspan chunks, chunks, pages, col_desc, page_grstats, chunk_grstats, max_page_comp_data_size, num_columns); } -/** - * @brief Launches kernel for packing column data into parquet pages - * - * @param[in,out] pages Device 
array of EncPages (unordered) - * @param[out] comp_in Optionally initializes compressor input params - * @param[out] comp_stat Optionally initializes compressor status - * @param[in] stream CUDA stream to use, default 0 - */ void EncodePages(device_span pages, device_span comp_in, device_span comp_stat, @@ -2033,26 +1950,11 @@ void EncodePages(device_span pages, gpuEncodePages<128><<>>(pages, comp_in, comp_stat); } -/** - * @brief Launches kernel to make the compressed vs uncompressed chunk-level decision - * - * @param[in,out] chunks Column chunks - * @param[in] stream CUDA stream to use, default 0 - */ void DecideCompression(device_span chunks, rmm::cuda_stream_view stream) { gpuDecideCompression<<>>(chunks); } -/** - * @brief Launches kernel to encode page headers - * - * @param[in,out] pages Device array of EncPages - * @param[in] comp_stat Compressor status or nullptr if no compression - * @param[in] page_stats Optional page-level statistics to be included in page header - * @param[in] chunk_stats Optional chunk-level statistics to be encoded - * @param[in] stream CUDA stream to use, default 0 - */ void EncodePageHeaders(device_span pages, device_span comp_stat, device_span page_stats, @@ -2065,13 +1967,6 @@ void EncodePageHeaders(device_span pages, pages, comp_stat, page_stats, chunk_stats); } -/** - * @brief Launches kernel to gather pages to a single contiguous block per chunk - * - * @param[in,out] chunks Column chunks - * @param[in] pages Device array of EncPages - * @param[in] stream CUDA stream to use, default 0 - */ void GatherPages(device_span chunks, device_span pages, rmm::cuda_stream_view stream) diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index a0cbc28bc8d..53bb11c8b70 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -252,6 +252,8 @@ struct parquet_column_device_view : stats_column_desc { constexpr int max_page_fragment_size = 5000; //!< Max number of rows in a page fragment +struct EncColumnChunk; + /** * @brief Struct describing an encoder page fragment */ @@ -262,8 +264,10 @@ struct PageFragment { uint32_t start_value_idx; uint32_t num_leaf_values; //!< Number of leaf values in fragment. Does not include nulls at //!< non-leaf level + size_type start_row; //!< First row in fragment uint16_t num_rows; //!< Number of rows in fragment uint16_t num_dict_vals; //!< Number of unique dictionary entries + EncColumnChunk* chunk; //!< The chunk that this fragment belongs to }; /// Size of hash used for building dictionaries @@ -284,6 +288,27 @@ inline uint32_t __device__ int32_logical_len(type_id id) } } +/** + * @brief Translate the row index of a parent column_device_view into the index of the first value + * in the leaf child. + * Only works in the context of parquet writer where struct columns are previously modified s.t. + * they only have one immediate child. 
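+ *
+ * @param idx Row index into the top-level (parent) column
+ * @param col Parent column device view
+ * @return Index of the first value in the leaf child that corresponds to `idx`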
+ */ +inline size_type __device__ row_to_value_idx(size_type idx, column_device_view col) +{ + while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { + if (col.type().id() == type_id::STRUCT) { + idx += col.offset(); + col = col.child(0); + } else { + auto offset_col = col.child(lists_column_view::offsets_column_index); + idx = offset_col.element(idx + col.offset()); + col = col.child(lists_column_view::child_column_index); + } + } + return idx; +} + /** * @brief Return worst-case compressed size of compressed data given the uncompressed size */ @@ -309,7 +334,7 @@ struct EncColumnChunk { uint32_t compressed_size; //!< Compressed buffer size uint32_t max_page_data_size; //!< Max data size (excluding header) of any page in this chunk uint32_t page_headers_size; //!< Sum of size of all page headers - uint32_t start_row; //!< First row of chunk + size_type start_row; //!< First row of chunk uint32_t num_rows; //!< Number of rows in chunk size_type num_values; //!< Number of values in chunk. Different from num_rows for nested types uint32_t first_fragment; //!< First fragment of chunk @@ -459,18 +484,21 @@ dremel_data get_dremel_data(column_view h_col, /** * @brief Launches kernel for initializing encoder page fragments * + * Based on the number of rows in each fragment, populates the value count, the size of data in the + * fragment, the number of unique values, and the data size of unique values. + * * @param[out] frag Fragment array [column_id][fragment_id] * @param[in] col_desc Column description array [column_id] - * @param[in] num_fragments Number of fragments per column - * @param[in] num_columns Number of columns + * @param[in] partitions Information about partitioning of table + * @param[in] first_frag_in_part A Partition's offset into fragment array * @param[in] fragment_size Number of rows per fragment - * @param[in] num_rows Number of rows per column * @param[in] stream CUDA stream to use */ void InitPageFragments(cudf::detail::device_2dspan frag, device_span col_desc, + device_span partitions, + device_span first_frag_in_part, uint32_t fragment_size, - uint32_t num_rows, rmm::cuda_stream_view stream); /** @@ -498,11 +526,11 @@ void initialize_chunk_hash_maps(device_span chunks, rmm::cuda_st * @brief Insert chunk values into their respective hash maps * * @param chunks Column chunks [rowgroup][column] - * @param num_rows Number of rows per column + * @param frags Column fragments * @param stream CUDA stream to use */ void populate_chunk_hash_maps(cudf::detail::device_2dspan chunks, - size_type num_rows, + cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** @@ -523,11 +551,11 @@ void collect_map_entries(device_span chunks, rmm::cuda_stream_vi * col[row] == col[dict_data[dict_index[row - chunk.start_row]]] * * @param chunks Column chunks [rowgroup][column] - * @param num_rows Number of rows per column + * @param frags Column fragments * @param stream CUDA stream to use */ void get_dictionary_indices(cudf::detail::device_2dspan chunks, - size_type num_rows, + cudf::detail::device_2dspan frags, rmm::cuda_stream_view stream); /** diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index e04c8371df8..aceb3bfbec1 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -40,6 +41,8 @@ #include +#include + #include #include #include @@ -76,6 +79,113 @@ parquet::Compression to_parquet_compression(compression_type 
compression) } // namespace +struct aggregate_metadata { + aggregate_metadata(std::vector const& partitions, + size_type num_columns, + std::vector schema, + statistics_freq stats_granularity, + std::vector> const& kv_md) + : version(1), schema(std::move(schema)), files(partitions.size()) + { + for (size_t i = 0; i < partitions.size(); ++i) { + this->files[i].num_rows = partitions[i].num_rows; + } + this->column_order_listsize = + (stats_granularity != statistics_freq::STATISTICS_NONE) ? num_columns : 0; + + for (size_t p = 0; p < kv_md.size(); ++p) { + std::transform(kv_md[p].begin(), + kv_md[p].end(), + std::back_inserter(this->files[p].key_value_metadata), + [](auto const& kv) { + return KeyValue{kv.first, kv.second}; + }); + } + } + + void update_files(std::vector const& partitions) + { + CUDF_EXPECTS(partitions.size() == this->files.size(), + "New partitions must be same size as previously passed number of partitions"); + for (size_t i = 0; i < partitions.size(); ++i) { + this->files[i].num_rows += partitions[i].num_rows; + } + } + + FileMetaData get_metadata(size_t part) + { + CUDF_EXPECTS(part < files.size(), "Invalid part index queried"); + FileMetaData meta{}; + meta.version = this->version; + meta.schema = this->schema; + meta.num_rows = this->files[part].num_rows; + meta.row_groups = this->files[part].row_groups; + meta.key_value_metadata = this->files[part].key_value_metadata; + meta.created_by = this->created_by; + meta.column_order_listsize = this->column_order_listsize; + return meta; + } + + void set_file_paths(std::vector const& column_chunks_file_path) + { + for (size_t p = 0; p < this->files.size(); ++p) { + auto& file = this->files[p]; + auto const& file_path = column_chunks_file_path[p]; + for (auto& rowgroup : file.row_groups) { + for (auto& col : rowgroup.columns) { + col.file_path = file_path; + } + } + } + } + + FileMetaData get_merged_metadata() + { + FileMetaData merged_md; + for (size_t p = 0; p < this->files.size(); ++p) { + auto& file = this->files[p]; + if (p == 0) { + merged_md = this->get_metadata(0); + } else { + merged_md.row_groups.insert(merged_md.row_groups.end(), + std::make_move_iterator(file.row_groups.begin()), + std::make_move_iterator(file.row_groups.end())); + merged_md.num_rows += file.num_rows; + } + } + return merged_md; + } + + std::vector num_row_groups_per_file() + { + std::vector global_rowgroup_base; + std::transform(this->files.begin(), + this->files.end(), + std::back_inserter(global_rowgroup_base), + [](auto const& part) { return part.row_groups.size(); }); + return global_rowgroup_base; + } + + bool schema_matches(std::vector const& schema) const + { + return this->schema == schema; + } + auto& file(size_t p) { return files[p]; } + size_t num_files() const { return files.size(); } + + private: + int32_t version = 0; + std::vector schema; + struct per_file_metadata { + int64_t num_rows = 0; + std::vector row_groups; + std::vector key_value_metadata; + }; + std::vector files; + std::string created_by = ""; + uint32_t column_order_listsize = 0; +}; + struct linked_column_view; using LinkedColPtr = std::shared_ptr; @@ -736,10 +846,12 @@ gpu::parquet_column_device_view parquet_column_view::get_device_view( void writer::impl::init_page_fragments(cudf::detail::hostdevice_2dvector& frag, device_span col_desc, - uint32_t num_rows, + host_span partitions, + device_span part_frag_offset, uint32_t fragment_size) { - gpu::InitPageFragments(frag, col_desc, fragment_size, num_rows, stream); + auto d_partitions = 
cudf::detail::make_device_uvector_async(partitions, stream); + gpu::InitPageFragments(frag, col_desc, d_partitions, part_frag_offset, fragment_size, stream); frag.device_to_host(stream, true); } @@ -771,7 +883,7 @@ void writer::impl::init_page_sizes(hostdevice_2dvector& chu auto build_chunk_dictionaries(hostdevice_2dvector& chunks, host_span col_desc, - uint32_t num_rows, + device_2dspan frags, rmm::cuda_stream_view stream) { // At this point, we know all chunks and their sizes. We want to allocate dictionaries for each @@ -801,7 +913,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, chunks.host_to_device(stream); gpu::initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream); - gpu::populate_chunk_hash_maps(chunks, num_rows, stream); + gpu::populate_chunk_hash_maps(chunks, frags, stream); chunks.device_to_host(stream, true); @@ -850,7 +962,7 @@ auto build_chunk_dictionaries(hostdevice_2dvector& chunks, } chunks.host_to_device(stream); gpu::collect_map_entries(chunks.device_view().flat_view(), stream); - gpu::get_dictionary_indices(chunks.device_view(), num_rows, stream); + gpu::get_dictionary_indices(chunks.device_view(), frags, stream); return std::make_pair(std::move(dict_data), std::move(dict_index)); } @@ -1016,7 +1128,7 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks stream.synchronize(); } -writer::impl::impl(std::unique_ptr sink, +writer::impl::impl(std::vector> sinks, parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, @@ -1028,8 +1140,9 @@ writer::impl::impl(std::unique_ptr sink, compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), - out_sink_(std::move(sink)), - single_write_mode(mode == SingleWriteMode::YES) + kv_md(options.get_key_value_metadata()), + single_write_mode(mode == SingleWriteMode::YES), + out_sink_(std::move(sinks)) { if (options.get_metadata()) { table_meta = std::make_unique(*options.get_metadata()); @@ -1037,7 +1150,7 @@ writer::impl::impl(std::unique_ptr sink, init_state(); } -writer::impl::impl(std::unique_ptr sink, +writer::impl::impl(std::vector> sinks, chunked_parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, @@ -1049,8 +1162,9 @@ writer::impl::impl(std::unique_ptr sink, compression_(to_parquet_compression(options.get_compression())), stats_granularity_(options.get_stats_level()), int96_timestamps(options.is_enabled_int96_timestamps()), + kv_md(options.get_key_value_metadata()), single_write_mode(mode == SingleWriteMode::YES), - out_sink_(std::move(sink)) + out_sink_(std::move(sinks)) { if (options.get_metadata()) { table_meta = std::make_unique(*options.get_metadata()); @@ -1062,19 +1176,21 @@ writer::impl::~impl() { close(); } void writer::impl::init_state() { + current_chunk_offset.resize(out_sink_.size()); // Write file header file_header_s fhdr; fhdr.magic = parquet_magic; - out_sink_->host_write(&fhdr, sizeof(fhdr)); - current_chunk_offset = sizeof(file_header_s); + for (auto& sink : out_sink_) { + sink->host_write(&fhdr, sizeof(fhdr)); + } + std::fill_n(current_chunk_offset.begin(), current_chunk_offset.size(), sizeof(file_header_s)); } -void writer::impl::write(table_view const& table) +void writer::impl::write(table_view const& table, std::vector const& partitions) { + last_write_successful = false; CUDF_EXPECTS(not closed, "Data has already been flushed to out and closed"); - size_type num_rows = table.num_rows(); 
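// Note: with multiple sinks there is no single table-wide row count to track here; per-file row counts are accumulated from the partition sizes in aggregate_metadata::update_files.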
- if (not table_meta) { table_meta = std::make_unique(table); } // Fill unnamed columns' names in table_meta @@ -1109,25 +1225,15 @@ void writer::impl::write(table_view const& table) std::vector this_table_schema(schema_tree.begin(), schema_tree.end()); - if (md.version == 0) { - md.version = 1; - md.num_rows = num_rows; - md.column_order_listsize = - (stats_granularity_ != statistics_freq::STATISTICS_NONE) ? num_columns : 0; - std::transform(table_meta->user_data.begin(), - table_meta->user_data.end(), - std::back_inserter(md.key_value_metadata), - [](auto const& kv) { - return KeyValue{kv.first, kv.second}; - }); - md.schema = this_table_schema; + if (!md) { + md = std::make_unique( + partitions, num_columns, std::move(this_table_schema), stats_granularity_, kv_md); } else { // verify the user isn't passing mismatched tables - CUDF_EXPECTS(md.schema == this_table_schema, + CUDF_EXPECTS(md->schema_matches(this_table_schema), "Mismatch in schema between multiple calls to write_chunk"); - // increment num rows - md.num_rows += num_rows; + md->update_files(partitions); } // Create table_device_view so that corresponding column_device_view data // can be written into col_desc members @@ -1149,7 +1255,22 @@ void writer::impl::write(table_view const& table) // compression/decompression performance). using cudf::io::parquet::gpu::max_page_fragment_size; - size_type const num_fragments = (num_rows + max_page_fragment_size - 1) / max_page_fragment_size; + std::vector num_frag_in_part; + std::transform(partitions.begin(), + partitions.end(), + std::back_inserter(num_frag_in_part), + [](auto const& part) { + return util::div_rounding_up_unsafe(part.num_rows, max_page_fragment_size); + }); + + size_type num_fragments = std::reduce(num_frag_in_part.begin(), num_frag_in_part.end()); + + std::vector part_frag_offset; // Store the idx of the first fragment in each partition + std::exclusive_scan( + num_frag_in_part.begin(), num_frag_in_part.end(), std::back_inserter(part_frag_offset), 0); + part_frag_offset.push_back(part_frag_offset.back() + num_frag_in_part.back()); + + auto d_part_frag_offset = cudf::detail::make_device_uvector_async(part_frag_offset, stream); cudf::detail::hostdevice_2dvector fragments( num_columns, num_fragments, stream); @@ -1159,36 +1280,50 @@ void writer::impl::write(table_view const& table) leaf_column_views = create_leaf_column_device_views( col_desc, *parent_column_table_device_view, stream); - init_page_fragments(fragments, col_desc, num_rows, max_page_fragment_size); + init_page_fragments( + fragments, col_desc, partitions, d_part_frag_offset, max_page_fragment_size); } - auto const global_rowgroup_base = static_cast(md.row_groups.size()); + std::vector const global_rowgroup_base = md->num_row_groups_per_file(); // Decide row group boundaries based on uncompressed data size - auto rowgroup_size = 0ul; - auto num_rowgroups = 0; - for (auto f = 0, global_r = global_rowgroup_base, rowgroup_start = 0; f < num_fragments; f++) { - auto fragment_data_size = 0ul; - // Replace with STL algorithm to transform and sum - for (auto i = 0; i < num_columns; i++) { - fragment_data_size += fragments[i][f].fragment_data_size; - } - if (f > rowgroup_start && - (rowgroup_size + fragment_data_size > max_row_group_size || - (f + 1 - rowgroup_start) * max_page_fragment_size > max_row_group_rows)) { - // update schema - md.row_groups.resize(md.row_groups.size() + 1); - md.row_groups[global_r++].num_rows = (f - rowgroup_start) * max_page_fragment_size; - num_rowgroups++; - rowgroup_start = f; - 
rowgroup_size = 0; - } - rowgroup_size += fragment_data_size; - if (f + 1 == num_fragments) { - // update schema - md.row_groups.resize(md.row_groups.size() + 1); - md.row_groups[global_r++].num_rows = num_rows - rowgroup_start * max_page_fragment_size; - num_rowgroups++; + int num_rowgroups = 0; + + std::vector num_rg_in_part(partitions.size()); + for (size_t p = 0; p < partitions.size(); ++p) { + size_type curr_rg_num_rows = 0; + size_t curr_rg_data_size = 0; + int first_frag_in_rg = part_frag_offset[p]; + int last_frag_in_part = part_frag_offset[p + 1] - 1; + for (auto f = first_frag_in_rg; f <= last_frag_in_part; ++f) { + size_t fragment_data_size = 0; + for (auto c = 0; c < num_columns; c++) { + fragment_data_size += fragments[c][f].fragment_data_size; + } + size_type fragment_num_rows = fragments[0][f].num_rows; + + // If the fragment size gets larger than rg limit then break off a rg + if (f > first_frag_in_rg && // There has to be at least one fragment in row group + (curr_rg_data_size + fragment_data_size > max_row_group_size || + curr_rg_num_rows + fragment_num_rows > max_row_group_rows)) { + auto& rg = md->file(p).row_groups.emplace_back(); + rg.num_rows = curr_rg_num_rows; + num_rowgroups++; + num_rg_in_part[p]++; + curr_rg_num_rows = 0; + curr_rg_data_size = 0; + first_frag_in_rg = f; + } + curr_rg_num_rows += fragment_num_rows; + curr_rg_data_size += fragment_data_size; + + // TODO: (wishful) refactor to consolidate with above if block + if (f == last_frag_in_part) { + auto& rg = md->file(p).row_groups.emplace_back(); + rg.num_rows = curr_rg_num_rows; + num_rowgroups++; + num_rg_in_part[p]++; + } } } @@ -1196,58 +1331,79 @@ void writer::impl::write(table_view const& table) rmm::device_uvector frag_stats(0, stream); if (stats_granularity_ != statistics_freq::STATISTICS_NONE) { frag_stats.resize(num_fragments * num_columns, stream); - if (frag_stats.size() != 0) { + if (not frag_stats.is_empty()) { auto frag_stats_2dview = device_2dspan(frag_stats.data(), num_columns, num_fragments); gather_fragment_statistics(frag_stats_2dview, fragments, col_desc, num_fragments); } } + + std::vector first_rg_in_part; + std::exclusive_scan( + num_rg_in_part.begin(), num_rg_in_part.end(), std::back_inserter(first_rg_in_part), 0); + // Initialize row groups and column chunks auto const num_chunks = num_rowgroups * num_columns; hostdevice_2dvector chunks(num_rowgroups, num_columns, stream); - for (auto r = 0, global_r = global_rowgroup_base, f = 0, start_row = 0; r < num_rowgroups; - r++, global_r++) { - size_type const fragments_in_chunk = - (md.row_groups[global_r].num_rows + max_page_fragment_size - 1) / max_page_fragment_size; - md.row_groups[global_r].total_byte_size = 0; - md.row_groups[global_r].columns.resize(num_columns); - for (auto i = 0; i < num_columns; i++) { - gpu::EncColumnChunk* ck = &chunks[r][i]; - - *ck = {}; - ck->col_desc = col_desc.device_ptr() + i; - ck->col_desc_id = i; - ck->fragments = &fragments.device_view()[i][f]; - ck->stats = (frag_stats.size() != 0) ? 
frag_stats.data() + i * num_fragments + f : nullptr; - ck->start_row = start_row; - ck->num_rows = (uint32_t)md.row_groups[global_r].num_rows; - ck->first_fragment = i * num_fragments + f; - auto chunk_fragments = fragments[i].subspan(f, fragments_in_chunk); - ck->num_values = - std::accumulate(chunk_fragments.begin(), chunk_fragments.end(), 0, [](uint32_t l, auto r) { - return l + r.num_values; - }); - ck->plain_data_size = std::accumulate( - chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, gpu::PageFragment frag) { - return sum + frag.fragment_data_size; - }); - md.row_groups[global_r].columns[i].meta_data.type = parquet_columns[i].physical_type(); - md.row_groups[global_r].columns[i].meta_data.encodings = {Encoding::PLAIN, Encoding::RLE}; - md.row_groups[global_r].columns[i].meta_data.path_in_schema = - parquet_columns[i].get_path_in_schema(); - md.row_groups[global_r].columns[i].meta_data.codec = UNCOMPRESSED; - md.row_groups[global_r].columns[i].meta_data.num_values = ck->num_values; + + for (size_t p = 0; p < partitions.size(); ++p) { + int f = part_frag_offset[p]; + size_type start_row = partitions[p].start_row; + for (int r = 0; r < num_rg_in_part[p]; r++) { + size_t global_r = global_rowgroup_base[p] + r; // Number of rowgroups already in file/part + auto& row_group = md->file(p).row_groups[global_r]; + uint32_t fragments_in_chunk = + util::div_rounding_up_unsafe(row_group.num_rows, max_page_fragment_size); + row_group.total_byte_size = 0; + row_group.columns.resize(num_columns); + for (int c = 0; c < num_columns; c++) { + gpu::EncColumnChunk& ck = chunks[r + first_rg_in_part[p]][c]; + + ck = {}; + ck.col_desc = col_desc.device_ptr() + c; + ck.col_desc_id = c; + ck.fragments = &fragments.device_view()[c][f]; + ck.stats = + (not frag_stats.is_empty()) ? frag_stats.data() + c * num_fragments + f : nullptr; + ck.start_row = start_row; + ck.num_rows = (uint32_t)row_group.num_rows; + ck.first_fragment = c * num_fragments + f; + auto chunk_fragments = fragments[c].subspan(f, fragments_in_chunk); + // In fragment struct, add a pointer to the chunk it belongs to + // In each fragment in chunk_fragments, update the chunk pointer here. 
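+ // frag.chunk must point at the device copy of the chunk (chunks.device_view()), because populate_chunk_hash_maps and get_dictionary_indices dereference it in device code.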
+ for (auto& frag : chunk_fragments) { + frag.chunk = &chunks.device_view()[r + first_rg_in_part[p]][c]; + } + ck.num_values = std::accumulate( + chunk_fragments.begin(), chunk_fragments.end(), 0, [](uint32_t l, auto r) { + return l + r.num_values; + }); + ck.plain_data_size = std::accumulate( + chunk_fragments.begin(), chunk_fragments.end(), 0, [](int sum, gpu::PageFragment frag) { + return sum + frag.fragment_data_size; + }); + auto& column_chunk_meta = row_group.columns[c].meta_data; + column_chunk_meta.type = parquet_columns[c].physical_type(); + column_chunk_meta.encodings = {Encoding::PLAIN, Encoding::RLE}; + column_chunk_meta.path_in_schema = parquet_columns[c].get_path_in_schema(); + column_chunk_meta.codec = UNCOMPRESSED; + column_chunk_meta.num_values = ck.num_values; + } + f += fragments_in_chunk; + start_row += (uint32_t)row_group.num_rows; } - f += fragments_in_chunk; - start_row += (uint32_t)md.row_groups[global_r].num_rows; } - auto dict_info_owner = build_chunk_dictionaries(chunks, col_desc, num_rows, stream); - for (auto rg = 0, global_rg = global_rowgroup_base; rg < num_rowgroups; rg++, global_rg++) { - for (auto col = 0; col < num_columns; col++) { - if (chunks.host_view()[rg][col].use_dictionary) { - md.row_groups[global_rg].columns[col].meta_data.encodings.push_back( - Encoding::PLAIN_DICTIONARY); + fragments.host_to_device(stream); + auto dict_info_owner = build_chunk_dictionaries(chunks, col_desc, fragments, stream); + for (size_t p = 0; p < partitions.size(); p++) { + for (int rg = 0; rg < num_rg_in_part[p]; rg++) { + size_t global_rg = global_rowgroup_base[p] + rg; + for (int col = 0; col < num_columns; col++) { + if (chunks.host_view()[rg][col].use_dictionary) { + md->file(p).row_groups[global_rg].columns[col].meta_data.encodings.push_back( + Encoding::PLAIN_DICTIONARY); + } } } } @@ -1272,6 +1428,12 @@ void writer::impl::write(table_view const& table) "Error in getting compressed size from nvcomp"); } + // Find which partition a rg belongs to + std::vector rg_to_part; + for (size_t p = 0; p < num_rg_in_part.size(); ++p) { + std::fill_n(std::back_inserter(rg_to_part), num_rg_in_part[p], p); + } + // Initialize batches of rowgroups to encode (mainly to limit peak memory usage) std::vector batch_list; size_type num_pages = 0; @@ -1335,11 +1497,11 @@ void writer::impl::write(table_view const& table) auto bfr_c = static_cast(comp_bfr.data()); for (auto j = 0; j < batch_list[b]; j++, r++) { for (auto i = 0; i < num_columns; i++) { - gpu::EncColumnChunk* ck = &chunks[r][i]; - ck->uncompressed_bfr = bfr; - ck->compressed_bfr = bfr_c; - bfr += ck->bfr_size; - bfr_c += ck->compressed_size; + gpu::EncColumnChunk& ck = chunks[r][i]; + ck.uncompressed_bfr = bfr; + ck.compressed_bfr = bfr_c; + bfr += ck.bfr_size; + bfr_c += ck.compressed_size; } } } @@ -1359,9 +1521,7 @@ void writer::impl::write(table_view const& table) pinned_buffer host_bfr{nullptr, cudaFreeHost}; // Encode row groups in batches - for (auto b = 0, r = 0, global_r = global_rowgroup_base; - b < static_cast(batch_list.size()); - b++) { + for (auto b = 0, r = 0; b < static_cast(batch_list.size()); b++) { // Count pages in this batch auto const rnext = r + batch_list[b]; auto const first_page_in_batch = chunks[r][0].first_page; @@ -1381,30 +1541,33 @@ void writer::impl::write(table_view const& table) (stats_granularity_ != statistics_freq::STATISTICS_NONE) ? 
page_stats.data() + num_pages : nullptr); std::vector> write_tasks; - for (; r < rnext; r++, global_r++) { + for (; r < rnext; r++) { + int p = rg_to_part[r]; + int global_r = global_rowgroup_base[p] + r - first_rg_in_part[p]; + auto& row_group = md->file(p).row_groups[global_r]; for (auto i = 0; i < num_columns; i++) { - gpu::EncColumnChunk* ck = &chunks[r][i]; + gpu::EncColumnChunk& ck = chunks[r][i]; + auto& column_chunk_meta = row_group.columns[i].meta_data; uint8_t* dev_bfr; - if (ck->is_compressed) { - md.row_groups[global_r].columns[i].meta_data.codec = compression_; - dev_bfr = ck->compressed_bfr; + if (ck.is_compressed) { + column_chunk_meta.codec = compression_; + dev_bfr = ck.compressed_bfr; } else { - dev_bfr = ck->uncompressed_bfr; + dev_bfr = ck.uncompressed_bfr; } - if (out_sink_->is_device_write_preferred(ck->compressed_size)) { + if (out_sink_[p]->is_device_write_preferred(ck.compressed_size)) { // let the writer do what it wants to retrieve the data from the gpu. - write_tasks.push_back( - out_sink_->device_write_async(dev_bfr + ck->ck_stat_size, ck->compressed_size, stream)); + write_tasks.push_back(out_sink_[p]->device_write_async( + dev_bfr + ck.ck_stat_size, ck.compressed_size, stream)); // we still need to do a (much smaller) memcpy for the statistics. - if (ck->ck_stat_size != 0) { - md.row_groups[global_r].columns[i].meta_data.statistics_blob.resize(ck->ck_stat_size); - CUDA_TRY( - cudaMemcpyAsync(md.row_groups[global_r].columns[i].meta_data.statistics_blob.data(), - dev_bfr, - ck->ck_stat_size, - cudaMemcpyDeviceToHost, - stream.value())); + if (ck.ck_stat_size != 0) { + column_chunk_meta.statistics_blob.resize(ck.ck_stat_size); + CUDA_TRY(cudaMemcpyAsync(column_chunk_meta.statistics_blob.data(), + dev_bfr, + ck.ck_stat_size, + cudaMemcpyDeviceToHost, + stream.value())); stream.synchronize(); } } else { @@ -1419,86 +1582,91 @@ void writer::impl::write(table_view const& table) // copy the full data CUDA_TRY(cudaMemcpyAsync(host_bfr.get(), dev_bfr, - ck->ck_stat_size + ck->compressed_size, + ck.ck_stat_size + ck.compressed_size, cudaMemcpyDeviceToHost, stream.value())); stream.synchronize(); - out_sink_->host_write(host_bfr.get() + ck->ck_stat_size, ck->compressed_size); - if (ck->ck_stat_size != 0) { - md.row_groups[global_r].columns[i].meta_data.statistics_blob.resize(ck->ck_stat_size); - memcpy(md.row_groups[global_r].columns[i].meta_data.statistics_blob.data(), - host_bfr.get(), - ck->ck_stat_size); + out_sink_[p]->host_write(host_bfr.get() + ck.ck_stat_size, ck.compressed_size); + if (ck.ck_stat_size != 0) { + column_chunk_meta.statistics_blob.resize(ck.ck_stat_size); + memcpy(column_chunk_meta.statistics_blob.data(), host_bfr.get(), ck.ck_stat_size); } } - md.row_groups[global_r].total_byte_size += ck->compressed_size; - md.row_groups[global_r].columns[i].meta_data.data_page_offset = - current_chunk_offset + ((ck->use_dictionary) ? ck->dictionary_size : 0); - md.row_groups[global_r].columns[i].meta_data.dictionary_page_offset = - (ck->use_dictionary) ? current_chunk_offset : 0; - md.row_groups[global_r].columns[i].meta_data.total_uncompressed_size = ck->bfr_size; - md.row_groups[global_r].columns[i].meta_data.total_compressed_size = ck->compressed_size; - current_chunk_offset += ck->compressed_size; + row_group.total_byte_size += ck.compressed_size; + column_chunk_meta.data_page_offset = + current_chunk_offset[p] + ((ck.use_dictionary) ? ck.dictionary_size : 0); + column_chunk_meta.dictionary_page_offset = + (ck.use_dictionary) ? 
current_chunk_offset[p] : 0; + column_chunk_meta.total_uncompressed_size = ck.bfr_size; + column_chunk_meta.total_compressed_size = ck.compressed_size; + current_chunk_offset[p] += ck.compressed_size; } } for (auto const& task : write_tasks) { task.wait(); } } + last_write_successful = true; } std::unique_ptr> writer::impl::close( - std::string const& column_chunks_file_path) + std::vector const& column_chunks_file_path) { if (closed) { return nullptr; } closed = true; - CompactProtocolWriter cpw(&buffer_); - file_ender_s fendr; - buffer_.resize(0); - fendr.footer_len = static_cast(cpw.write(md)); - fendr.magic = parquet_magic; - out_sink_->host_write(buffer_.data(), buffer_.size()); - out_sink_->host_write(&fendr, sizeof(fendr)); - out_sink_->flush(); + if (not last_write_successful) { return nullptr; } + for (size_t p = 0; p < out_sink_.size(); p++) { + std::vector buffer; + CompactProtocolWriter cpw(&buffer); + file_ender_s fendr; + buffer.resize(0); + fendr.footer_len = static_cast(cpw.write(md->get_metadata(p))); + fendr.magic = parquet_magic; + out_sink_[p]->host_write(buffer.data(), buffer.size()); + out_sink_[p]->host_write(&fendr, sizeof(fendr)); + out_sink_[p]->flush(); + } // Optionally output raw file metadata with the specified column chunk file path - if (column_chunks_file_path.length() > 0) { + if (column_chunks_file_path.size() > 0) { + CUDF_EXPECTS(column_chunks_file_path.size() == md->num_files(), + "Expected one column chunk path per output file"); + md->set_file_paths(column_chunks_file_path); file_header_s fhdr = {parquet_magic}; - buffer_.resize(0); - buffer_.insert(buffer_.end(), - reinterpret_cast(&fhdr), - reinterpret_cast(&fhdr) + sizeof(fhdr)); - for (auto& rowgroup : md.row_groups) { - for (auto& col : rowgroup.columns) { - col.file_path = column_chunks_file_path; - } - } - fendr.footer_len = static_cast(cpw.write(md)); - buffer_.insert(buffer_.end(), - reinterpret_cast(&fendr), - reinterpret_cast(&fendr) + sizeof(fendr)); - return std::make_unique>(std::move(buffer_)); + std::vector buffer; + CompactProtocolWriter cpw(&buffer); + buffer.insert(buffer.end(), + reinterpret_cast(&fhdr), + reinterpret_cast(&fhdr) + sizeof(fhdr)); + file_ender_s fendr; + fendr.magic = parquet_magic; + fendr.footer_len = static_cast(cpw.write(md->get_merged_metadata())); + buffer.insert(buffer.end(), + reinterpret_cast(&fendr), + reinterpret_cast(&fendr) + sizeof(fendr)); + return std::make_unique>(std::move(buffer)); } else { return {nullptr}; } + return nullptr; } // Forward to implementation -writer::writer(std::unique_ptr sink, +writer::writer(std::vector> sinks, parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : _impl(std::make_unique(std::move(sink), options, mode, stream, mr)) + : _impl(std::make_unique(std::move(sinks), options, mode, stream, mr)) { } -writer::writer(std::unique_ptr sink, +writer::writer(std::vector> sinks, chunked_parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) - : _impl(std::make_unique(std::move(sink), options, mode, stream, mr)) + : _impl(std::make_unique(std::move(sinks), options, mode, stream, mr)) { } @@ -1506,16 +1674,21 @@ writer::writer(std::unique_ptr sink, writer::~writer() = default; // Forward to implementation -void writer::write(table_view const& table) { _impl->write(table); } +void writer::write(table_view const& table, std::vector const& partitions) +{ + _impl->write( + table, 
partitions.empty() ? std::vector{{0, table.num_rows()}} : partitions); +} // Forward to implementation -std::unique_ptr> writer::close(std::string const& column_chunks_file_path) +std::unique_ptr> writer::close( + std::vector const& column_chunks_file_path) { return _impl->close(column_chunks_file_path); } std::unique_ptr> writer::merge_row_group_metadata( - const std::vector>>& metadata_list) + std::vector>> const& metadata_list) { std::vector output; CompactProtocolWriter cpw(&output); diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 9188218f607..1cefb91c904 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -45,6 +45,7 @@ namespace detail { namespace parquet { // Forward internal classes struct parquet_column_view; +struct aggregate_metadata; using namespace cudf::io::parquet; using namespace cudf::io; @@ -60,13 +61,13 @@ class writer::impl { /** * @brief Constructor with writer options. * - * @param sink data_sink for storing dataset + * @param sink data_sink's for storing dataset * @param options Settings for controlling behavior * @param mode Option to write at once or in chunks * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit impl(std::unique_ptr sink, + explicit impl(std::vector> sinks, parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, @@ -75,13 +76,13 @@ class writer::impl { /** * @brief Constructor with chunked writer options. * - * @param sink data_sink for storing dataset + * @param sink data_sink's for storing dataset * @param options Settings for controlling behavior * @param mode Option to write at once or in chunks * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ - explicit impl(std::unique_ptr sink, + explicit impl(std::vector> sinks, chunked_parquet_writer_options const& options, SingleWriteMode mode, rmm::cuda_stream_view stream, @@ -102,8 +103,10 @@ class writer::impl { * normally used for chunked writing. * * @param[in] table The table information to be written + * @param[in] partitions Optional partitions to divide the table into. If specified, must be same + * size as number of sinks. */ - void write(table_view const& table); + void write(table_view const& table, std::vector const& partitions); /** * @brief Finishes the chunked/streamed write process. @@ -112,7 +115,8 @@ class writer::impl { * @return A parquet-compatible blob that contains the data for all rowgroups in the list only if * `column_chunks_file_path` is provided, else null. 
*/ - std::unique_ptr> close(std::string const& column_chunks_file_path = ""); + std::unique_ptr> close( + std::vector const& column_chunks_file_path = {}); private: /** @@ -120,12 +124,14 @@ class writer::impl { * * @param frag Destination page fragments * @param col_desc column description array - * @param num_rows Total number of rows + * @param[in] partitions Information about partitioning of table + * @param[in] part_frag_offset A Partition's offset into fragment array * @param fragment_size Number of rows per fragment */ void init_page_fragments(hostdevice_2dvector& frag, device_span col_desc, - uint32_t num_rows, + host_span partitions, + device_span part_frag_offset, uint32_t fragment_size); /** @@ -208,19 +214,22 @@ class writer::impl { statistics_freq stats_granularity_ = statistics_freq::STATISTICS_NONE; bool int96_timestamps = false; // Overall file metadata. Filled in during the process and written during write_chunked_end() - cudf::io::parquet::FileMetaData md; + std::unique_ptr md; + // File footer key-value metadata. Written during write_chunked_end() + std::vector> kv_md; // optional user metadata std::unique_ptr table_meta; // to track if the output has been written to sink bool closed = false; + // To track if the last write(table) call completed successfully + bool last_write_successful = false; // current write position for rowgroups/chunks - std::size_t current_chunk_offset; + std::vector current_chunk_offset; // special parameter only used by detail::write() to indicate that we are guaranteeing // a single table write. this enables some internal optimizations. bool const single_write_mode = true; - std::vector buffer_; - std::unique_ptr out_sink_; + std::vector> out_sink_; }; } // namespace parquet diff --git a/cpp/tests/io/parquet_test.cpp b/cpp/tests/io/parquet_test.cpp index c376accd1ff..75ff39cbe70 100644 --- a/cpp/tests/io/parquet_test.cpp +++ b/cpp/tests/io/parquet_test.cpp @@ -838,13 +838,13 @@ TEST_F(ParquetWriterTest, MultiIndex) expected_metadata.column_metadata[2].set_name("int32s"); expected_metadata.column_metadata[3].set_name("floats"); expected_metadata.column_metadata[4].set_name("doubles"); - expected_metadata.user_data.insert( - {"pandas", "\"index_columns\": [\"int8s\", \"int16s\"], \"column1\": [\"int32s\"]"}); auto filepath = temp_env->get_temp_filepath("MultiIndex.parquet"); cudf_io::parquet_writer_options out_opts = cudf_io::parquet_writer_options::builder(cudf_io::sink_info{filepath}, expected->view()) - .metadata(&expected_metadata); + .metadata(&expected_metadata) + .key_value_metadata( + {{{"pandas", "\"index_columns\": [\"int8s\", \"int16s\"], \"column1\": [\"int32s\"]"}}}); cudf_io::write_parquet(out_opts); cudf_io::parquet_reader_options in_opts = @@ -1174,6 +1174,100 @@ TEST_F(ParquetWriterTest, DeviceWriteLargeishFile) auto custom_tbl = cudf_io::read_parquet(custom_args); CUDF_TEST_EXPECT_TABLES_EQUAL(custom_tbl.tbl->view(), expected->view()); } + +TEST_F(ParquetWriterTest, PartitionedWrite) +{ + auto source = create_compressible_fixed_table(16, 4 * 1024 * 1024, 1000, false); + + auto filepath1 = temp_env->get_temp_filepath("PartitionedWrite1.parquet"); + auto filepath2 = temp_env->get_temp_filepath("PartitionedWrite2.parquet"); + + auto partition1 = cudf::io::partition_info{10, 1024 * 1024}; + auto partition2 = cudf::io::partition_info{20 * 1024 + 7, 3 * 1024 * 1024}; + + auto expected1 = + cudf::slice(*source, {partition1.start_row, partition1.start_row + partition1.num_rows}); + auto expected2 = + cudf::slice(*source, 
{partition2.start_row, partition2.start_row + partition2.num_rows}); + + cudf_io::parquet_writer_options args = + cudf_io::parquet_writer_options::builder( + cudf_io::sink_info(std::vector{filepath1, filepath2}), *source) + .partitions({partition1, partition2}) + .compression(cudf_io::compression_type::NONE); + cudf_io::write_parquet(args); + + auto result1 = cudf_io::read_parquet( + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath1))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected1, result1.tbl->view()); + + auto result2 = cudf_io::read_parquet( + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath2))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected2, result2.tbl->view()); +} + +TEST_F(ParquetWriterTest, PartitionedWriteEmptyPartitions) +{ + auto source = create_random_fixed_table(4, 4, false); + + auto filepath1 = temp_env->get_temp_filepath("PartitionedWrite1.parquet"); + auto filepath2 = temp_env->get_temp_filepath("PartitionedWrite2.parquet"); + + auto partition1 = cudf::io::partition_info{1, 0}; + auto partition2 = cudf::io::partition_info{1, 0}; + + auto expected1 = + cudf::slice(*source, {partition1.start_row, partition1.start_row + partition1.num_rows}); + auto expected2 = + cudf::slice(*source, {partition2.start_row, partition2.start_row + partition2.num_rows}); + + cudf_io::parquet_writer_options args = + cudf_io::parquet_writer_options::builder( + cudf_io::sink_info(std::vector{filepath1, filepath2}), *source) + .partitions({partition1, partition2}) + .compression(cudf_io::compression_type::NONE); + cudf_io::write_parquet(args); + + auto result1 = cudf_io::read_parquet( + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath1))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected1, result1.tbl->view()); + + auto result2 = cudf_io::read_parquet( + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath2))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected2, result2.tbl->view()); +} + +TEST_F(ParquetWriterTest, PartitionedWriteEmptyColumns) +{ + auto source = create_random_fixed_table(0, 4, false); + + auto filepath1 = temp_env->get_temp_filepath("PartitionedWrite1.parquet"); + auto filepath2 = temp_env->get_temp_filepath("PartitionedWrite2.parquet"); + + auto partition1 = cudf::io::partition_info{1, 0}; + auto partition2 = cudf::io::partition_info{1, 0}; + + auto expected1 = + cudf::slice(*source, {partition1.start_row, partition1.start_row + partition1.num_rows}); + auto expected2 = + cudf::slice(*source, {partition2.start_row, partition2.start_row + partition2.num_rows}); + + cudf_io::parquet_writer_options args = + cudf_io::parquet_writer_options::builder( + cudf_io::sink_info(std::vector{filepath1, filepath2}), *source) + .partitions({partition1, partition2}) + .compression(cudf_io::compression_type::NONE); + cudf_io::write_parquet(args); + + auto result1 = cudf_io::read_parquet( + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath1))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected1, result1.tbl->view()); + + auto result2 = cudf_io::read_parquet( + cudf_io::parquet_reader_options::builder(cudf_io::source_info(filepath2))); + CUDF_TEST_EXPECT_TABLES_EQUAL(expected2, result2.tbl->view()); +} + template std::string create_parquet_file(int num_cols) { @@ -1305,7 +1399,7 @@ TEST_F(ParquetChunkedWriterTest, ManyTables) std::for_each(table_views.begin(), table_views.end(), [&writer](table_view const& tbl) { writer.write(tbl); }); - auto md = writer.close("dummy/path"); + auto md = writer.close({"dummy/path"}); 
CUDF_EXPECTS(md, "The returned metadata should not be null."); cudf_io::parquet_reader_options read_opts = diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index 2fc71f64df1..e5a8bb926c1 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport uint8_t from libcpp cimport bool +from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector @@ -85,6 +86,7 @@ cdef extern from "cudf/io/orc.hpp" \ void set_row_index_stride(size_type val) except+ void set_table(cudf_table_view.table_view tbl) except+ void set_metadata(cudf_io_types.table_input_metadata* meta) except+ + void set_key_value_metadata(map[string, string] kvm) except + @staticmethod orc_writer_options_builder builder( @@ -107,6 +109,9 @@ cdef extern from "cudf/io/orc.hpp" \ orc_writer_options_builder& metadata( cudf_io_types.table_input_metadata *meta ) except+ + orc_writer_options_builder& key_value_metadata( + map[string, string] kvm + ) except+ orc_writer_options build() except+ @@ -134,6 +139,7 @@ cdef extern from "cudf/io/orc.hpp" \ void set_metadata( cudf_io_types.table_input_metadata* meta ) except+ + void set_key_value_metadata(map[string, string] kvm) except + @staticmethod chunked_orc_writer_options_builder builder( @@ -155,6 +161,9 @@ cdef extern from "cudf/io/orc.hpp" \ chunked_orc_writer_options_builder& metadata( cudf_io_types.table_input_metadata *meta ) except+ + chunked_orc_writer_options_builder& key_value_metadata( + map[string, string] kvm + ) except+ chunked_orc_writer_options build() except+ diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index 9d95dce83bc..60be608d997 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -73,21 +73,24 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_io_types.statistics_freq get_stats_level() except + cudf_table_view.table_view get_table() except + const cudf_io_types.table_input_metadata get_metadata() except + - string get_column_chunks_file_path() except+ + string get_column_chunks_file_paths() except+ size_t get_row_group_size_bytes() except+ size_type get_row_group_size_rows() except+ void set_metadata( cudf_io_types.table_input_metadata *m ) except + + void set_key_value_metadata( + vector[map[string, string]] kvm + ) except + void set_stats_level( cudf_io_types.statistics_freq sf ) except + void set_compression( cudf_io_types.compression_type compression ) except + - void set_column_chunks_file_path( - string column_chunks_file_path + void set_column_chunks_file_paths( + vector[string] column_chunks_file_paths ) except + void set_row_group_size_bytes(size_t val) except+ void set_row_group_size_rows(size_type val) except+ @@ -108,14 +111,17 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: parquet_writer_options_builder& metadata( cudf_io_types.table_input_metadata *m ) except + + parquet_writer_options_builder& key_value_metadata( + vector[map[string, string]] kvm + ) except + parquet_writer_options_builder& stats_level( cudf_io_types.statistics_freq sf ) except + parquet_writer_options_builder& compression( cudf_io_types.compression_type compression ) except + - parquet_writer_options_builder& column_chunks_file_path( - string column_chunks_file_path + parquet_writer_options_builder& column_chunks_file_paths( + vector[string] 
column_chunks_file_paths ) except + parquet_writer_options_builder& int96_timestamps( bool enabled @@ -146,6 +152,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: void set_metadata( cudf_io_types.table_input_metadata *m ) except + + void set_key_value_metadata( + vector[map[string, string]] kvm + ) except + void set_stats_level( cudf_io_types.statistics_freq sf ) except + @@ -168,6 +177,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: chunked_parquet_writer_options_builder& metadata( cudf_io_types.table_input_metadata *m ) except + + chunked_parquet_writer_options_builder& key_value_metadata( + vector[map[string, string]] kvm + ) except + chunked_parquet_writer_options_builder& stats_level( cudf_io_types.statistics_freq sf ) except + @@ -190,7 +202,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cudf_table_view.table_view table_, ) except+ unique_ptr[vector[uint8_t]] close( - string column_chunks_file_path, + vector[string] column_chunks_file_paths, ) except+ cdef unique_ptr[vector[uint8_t]] merge_row_group_metadata( diff --git a/python/cudf/cudf/_lib/cpp/io/types.pxd b/python/cudf/cudf/_lib/cpp/io/types.pxd index 6b68902d22f..40a056b46e0 100644 --- a/python/cudf/cudf/_lib/cpp/io/types.pxd +++ b/python/cudf/cudf/_lib/cpp/io/types.pxd @@ -70,13 +70,8 @@ cdef extern from "cudf/io/types.hpp" \ cdef cppclass table_input_metadata: table_input_metadata() except + table_input_metadata(const cudf_table_view.table_view& table) except + - table_input_metadata( - const cudf_table_view.table_view& table, - map[string, string] user_data - ) except + vector[column_in_metadata] column_metadata - map[string, string] user_data cdef cppclass host_buffer: const char* data @@ -87,8 +82,8 @@ cdef extern from "cudf/io/types.hpp" \ cdef cppclass source_info: io_type type - vector[string] filepaths - vector[host_buffer] buffers + const vector[string]& filepaths() except + + const vector[host_buffer]& buffers() except + vector[shared_ptr[CRandomAccessFile]] files source_info() except + @@ -98,9 +93,9 @@ cdef extern from "cudf/io/types.hpp" \ cdef cppclass sink_info: io_type type - string filepath - vector[char] * buffer - data_sink * user_sink + const vector[string]& filepaths() + const vector[vector[char] *]& buffers() + const vector[data_sink *]& user_sinks() sink_info() except + sink_info(string file_path) except + diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index 9a4bd8652da..bf761c30bc8 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -3,6 +3,7 @@ import cudf from libcpp cimport bool, int +from libcpp.map cimport map from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.utility cimport move @@ -311,10 +312,9 @@ cdef class ORCWriter: chunked_orc_writer_options anb creates a writer""" cdef table_view tv - # Set the table_metadata num_index_cols_meta = 0 self.tbl_meta = make_unique[table_input_metadata]( - table_view_from_table(table, ignore_index=True) + table_view_from_table(table, ignore_index=True), ) if self.index is not False: if isinstance(table._index, cudf.core.multiindex.MultiIndex): @@ -340,15 +340,16 @@ cdef class ORCWriter: table[name]._column, self.tbl_meta.get().column_metadata[i] ) + cdef map[string, string] user_data pandas_metadata = generate_pandas_metadata(table, self.index) - self.tbl_meta.get().user_data[str.encode("pandas")] = \ - str.encode(pandas_metadata) + user_data[str.encode("pandas")] = 
str.encode(pandas_metadata) cdef chunked_orc_writer_options args with nogil: args = move( chunked_orc_writer_options.builder(self.sink) .metadata(self.tbl_meta.get()) + .key_value_metadata(move(user_data)) .compression(self.comp_type) .enable_statistics(self.enable_stats) .build() diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d17184685fa..955324778fd 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -296,7 +296,7 @@ cpdef write_parquet( # Create the write options cdef unique_ptr[table_input_metadata] tbl_meta - cdef map[string, string] user_data + cdef vector[map[string, string]] user_data cdef table_view tv cdef unique_ptr[cudf_io_types.data_sink] _data_sink cdef cudf_io_types.sink_info sink = make_sink_info(path, _data_sink) @@ -328,30 +328,29 @@ cpdef write_parquet( ) pandas_metadata = generate_pandas_metadata(table, index) - user_data[str.encode("pandas")] = str.encode(pandas_metadata) - - # Set the table_metadata - tbl_meta.get().user_data = user_data + user_data.resize(1) + user_data.back()[str.encode("pandas")] = str.encode(pandas_metadata) cdef cudf_io_types.compression_type comp_type = _get_comp_type(compression) cdef cudf_io_types.statistics_freq stat_freq = _get_stat_freq(statistics) cdef unique_ptr[vector[uint8_t]] out_metadata_c - cdef string c_column_chunks_file_path + cdef vector[string] c_column_chunks_file_paths cdef bool _int96_timestamps = int96_timestamps - if metadata_file_path is not None: - c_column_chunks_file_path = str.encode(metadata_file_path) # Perform write cdef parquet_writer_options args = move( parquet_writer_options.builder(sink, tv) .metadata(tbl_meta.get()) + .key_value_metadata(move(user_data)) .compression(comp_type) .stats_level(stat_freq) - .column_chunks_file_path(c_column_chunks_file_path) .int96_timestamps(_int96_timestamps) .build() ) + if metadata_file_path is not None: + c_column_chunks_file_paths.push_back(str.encode(metadata_file_path)) + args.set_column_chunks_file_paths(move(c_column_chunks_file_paths)) if row_group_size_bytes is not None: args.set_row_group_size_bytes(row_group_size_bytes) if row_group_size_rows is not None: @@ -413,18 +412,18 @@ cdef class ParquetWriter: def close(self, object metadata_file_path=None): cdef unique_ptr[vector[uint8_t]] out_metadata_c - cdef string column_chunks_file_path + cdef vector[string] column_chunks_file_paths if not self.initialized: return None # Update metadata-collection options if metadata_file_path is not None: - column_chunks_file_path = str.encode(metadata_file_path) + column_chunks_file_paths.push_back(str.encode(metadata_file_path)) with nogil: out_metadata_c = move( - self.writer.get()[0].close(column_chunks_file_path) + self.writer.get()[0].close(column_chunks_file_paths) ) if metadata_file_path is not None: @@ -471,14 +470,16 @@ cdef class ParquetWriter: ) pandas_metadata = generate_pandas_metadata(table, self.index) - self.tbl_meta.get().user_data[str.encode("pandas")] = \ - str.encode(pandas_metadata) + cdef vector[map[string, string]] user_data + user_data.resize(1) + user_data.back()[str.encode("pandas")] = str.encode(pandas_metadata) cdef chunked_parquet_writer_options args with nogil: args = move( chunked_parquet_writer_options.builder(self.sink) .metadata(self.tbl_meta.get()) + .key_value_metadata(move(user_data)) .compression(self.comp_type) .stats_level(self.stat_freq) .build() From fc2a32a1576d97a48b0c1c983ef4b31285267e96 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 14 Dec 2021 
16:59:24 -0600 Subject: [PATCH 05/13] Introduce `nan_as_null` parameter for `cudf.Index` (#9893) Fixes: #9822 This PR introduces `nan_as_null` parameter to `cudf.Index` constructor which is similar to the one present in `cudf.Series` constructor. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: https://github.com/rapidsai/cudf/pull/9893 --- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/column/categorical.py | 4 +++- python/cudf/cudf/core/index.py | 25 ++++++++++++++++----- python/cudf/cudf/tests/test_index.py | 19 ++++++++++++++++ 4 files changed, 43 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index ac5e152d011..ed1cc74db71 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -829,7 +829,7 @@ def is_floating(self): >>> idx = cudf.Index([1.0, 2.0, np.nan, 4.0]) >>> idx.is_floating() True - >>> idx = cudf.Index([1, 2, 3, 4, np.nan]) + >>> idx = cudf.Index([1, 2, 3, 4, np.nan], nan_as_null=False) >>> idx.is_floating() True >>> idx = cudf.Index([1, 2, 3, 4]) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index a2c1f04b2f2..4be7a422de0 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -809,7 +809,9 @@ def __setitem__(self, key, value): to_add_categories = 0 else: to_add_categories = len( - cudf.Index(value).difference(self.categories) + cudf.Index(value, nan_as_null=False).difference( + self.categories + ) ) if to_add_categories > 0: diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 29e0d17bc39..362c96ebbeb 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2527,7 +2527,7 @@ def is_object(self): return True -def as_index(arbitrary, **kwargs) -> BaseIndex: +def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex: """Create an Index from an arbitrary object Currently supported inputs are: @@ -2560,7 +2560,7 @@ def as_index(arbitrary, **kwargs) -> BaseIndex: elif isinstance(arbitrary, ColumnBase): return _index_from_data({kwargs.get("name", None): arbitrary}) elif isinstance(arbitrary, cudf.Series): - return as_index(arbitrary._column, **kwargs) + return as_index(arbitrary._column, nan_as_null=nan_as_null, **kwargs) elif isinstance(arbitrary, (pd.RangeIndex, range)): return RangeIndex( start=arbitrary.start, @@ -2569,11 +2569,14 @@ def as_index(arbitrary, **kwargs) -> BaseIndex: **kwargs, ) elif isinstance(arbitrary, pd.MultiIndex): - return cudf.MultiIndex.from_pandas(arbitrary) + return cudf.MultiIndex.from_pandas(arbitrary, nan_as_null=nan_as_null) elif isinstance(arbitrary, cudf.DataFrame): return cudf.MultiIndex.from_frame(arbitrary) return as_index( - column.as_column(arbitrary, dtype=kwargs.get("dtype", None)), **kwargs + column.as_column( + arbitrary, dtype=kwargs.get("dtype", None), nan_as_null=nan_as_null + ), + **kwargs, ) @@ -2623,6 +2626,10 @@ class Index(BaseIndex, metaclass=IndexMeta): tupleize_cols : bool (default: True) When True, attempt to create a MultiIndex if possible. tupleize_cols == False is not yet supported. + nan_as_null : bool, Default True + If ``None``/``True``, converts ``np.nan`` values to + ``null`` values. + If ``False``, leaves ``np.nan`` values as is. 
Returns ------- @@ -2655,6 +2662,7 @@ def __new__( copy=False, name=None, tupleize_cols=True, + nan_as_null=True, **kwargs, ): assert ( @@ -2665,7 +2673,14 @@ def __new__( "tupleize_cols != True is not yet supported" ) - return as_index(data, copy=copy, dtype=dtype, name=name, **kwargs) + return as_index( + data, + copy=copy, + dtype=dtype, + name=name, + nan_as_null=nan_as_null, + **kwargs, + ) @classmethod def from_arrow(cls, obj): diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index c7fca2075f5..6679725ae9a 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2509,3 +2509,22 @@ def test_index_datetime_round(resolution): cuidx_floor = cuidx.round(resolution) assert_eq(pidx_floor, cuidx_floor) + + +@pytest.mark.parametrize( + "data,nan_idx,NA_idx", + [([1, 2, 3, None], None, 3), ([2, 3, np.nan, None], 2, 3)], +) +@pytest.mark.parametrize("nan_as_null", [True, False]) +def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null): + idx = cudf.Index(data, nan_as_null=nan_as_null) + + if nan_as_null: + if nan_idx is not None: + assert idx[nan_idx] is cudf.NA + else: + if nan_idx is not None: + assert np.isnan(idx[nan_idx]) + + if NA_idx is not None: + assert idx[NA_idx] is cudf.NA From 44fce8bb201ae818ec73ed563e4ab0232ceb751e Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Tue, 14 Dec 2021 22:02:27 -0600 Subject: [PATCH 06/13] Fix cudf.Scalar string datetime construction (#9875) Closes https://github.com/rapidsai/cudf/issues/9874 Authors: - https://github.com/brandon-b-miller Approvers: - Michael Wang (https://github.com/isVoid) URL: https://github.com/rapidsai/cudf/pull/9875 --- python/cudf/cudf/tests/test_scalar.py | 12 ++++++++++++ python/cudf/cudf/utils/dtypes.py | 6 +++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py index a9919900256..a8b62710e0e 100644 --- a/python/cudf/cudf/tests/test_scalar.py +++ b/python/cudf/cudf/tests/test_scalar.py @@ -369,3 +369,15 @@ def test_construct_from_scalar(value): x._is_host_value_current == y._is_host_value_current x._is_device_value_current == y._is_device_value_current + + +@pytest.mark.parametrize( + "data", ["20000101", "2000-01-01", "2000-01-01T00:00:00.000000000", "2000"] +) +@pytest.mark.parametrize("dtype", DATETIME_TYPES) +def test_datetime_scalar_from_string(data, dtype): + slr = cudf.Scalar(data, dtype) + + expected = np.datetime64(datetime.datetime(2000, 1, 1)).astype(dtype) + + assert expected == slr.value diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 2eb38c0f77e..7142d0d710e 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -257,7 +257,11 @@ def to_cudf_compatible_scalar(val, dtype=None): val = cudf.api.types.pandas_dtype(type(val)).type(val) if dtype is not None: - val = val.astype(dtype) + if isinstance(val, str) and np.dtype(dtype).kind == "M": + # pd.Timestamp can handle str, but not np.str_ + val = pd.Timestamp(str(val)).to_datetime64().astype(dtype) + else: + val = val.astype(dtype) if val.dtype.type is np.datetime64: time_unit, _ = np.datetime_data(val.dtype) From 3428f7f7b123851ee580c29f7c4fdc28b8384e98 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 14 Dec 2021 22:08:54 -0600 Subject: [PATCH 07/13] Fix compilation of benchmark for parquet writer. 
(#9905) This fixes a compilation error introduced in #9810. Tagging @devavret @vuule for review. Feel free to push to this PR with any fixes. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/9905 --- cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp index b4c11179c35..5c3c53fee8e 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_benchmark.cpp @@ -85,7 +85,7 @@ void BM_parq_write_varying_options(benchmark::State& state) cudf_io::parquet_writer_options::builder(source_sink.make_sink_info(), view) .compression(compression) .stats_level(enable_stats) - .column_chunks_file_path(file_path); + .column_chunks_file_paths({file_path}); cudf_io::write_parquet(options); } From 78d12bb20501770839c2a062a2cc611349dc4120 Mon Sep 17 00:00:00 2001 From: Jordan Jacobelli Date: Wed, 15 Dec 2021 10:05:12 +0100 Subject: [PATCH 08/13] Update ucx-py version on release using rvc (#9897) Update `ucx-py` version on release using `rvc` Authors: - Jordan Jacobelli (https://github.com/Ethyling) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/9897 --- ci/gpu/build.sh | 5 ++++- ci/gpu/java.sh | 5 ++++- ci/release/update-version.sh | 5 +++++ 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index 00ad6bf812d..5646c268301 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -33,6 +33,9 @@ export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` # Dask & Distributed git tag export DASK_DISTRIBUTED_GIT_TAG='2021.11.2' +# ucx-py version +export UCX_PY_VERSION='0.24.*' + ################################################################################ # TRAP - Setup trap for removing jitify cache ################################################################################ @@ -83,7 +86,7 @@ gpuci_mamba_retry install -y \ "rapids-notebook-env=$MINOR_VERSION.*" \ "dask-cuda=${MINOR_VERSION}" \ "rmm=$MINOR_VERSION.*" \ - "ucx-py=0.24.*" + "ucx-py=${UCX_PY_VERSION}" # https://docs.rapids.ai/maintainers/depmgmt/ # gpuci_mamba_retry remove --force rapids-build-env rapids-notebook-env diff --git a/ci/gpu/java.sh b/ci/gpu/java.sh index bada16bd40e..6f7038d21d7 100755 --- a/ci/gpu/java.sh +++ b/ci/gpu/java.sh @@ -30,6 +30,9 @@ export CONDA_ARTIFACT_PATH="$WORKSPACE/ci/artifacts/cudf/cpu/.conda-bld/" export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` +# ucx-py version +export UCX_PY_VERSION='0.24.*' + ################################################################################ # TRAP - Setup trap for removing jitify cache ################################################################################ @@ -80,7 +83,7 @@ gpuci_conda_retry install -y \ "rapids-notebook-env=$MINOR_VERSION.*" \ "dask-cuda=${MINOR_VERSION}" \ "rmm=$MINOR_VERSION.*" \ - "ucx-py=0.24.*" \ + "ucx-py=${UCX_PY_VERSION}" \ "openjdk=8.*" \ "maven" diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 86432a92128..1105b9c194d 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh 
@@ -21,6 +21,7 @@ CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR} NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}') NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}') NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR} +NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*" echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG" @@ -62,3 +63,7 @@ sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md # Libcudf examples update sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/basic/CMakeLists.txt + +# ucx-py version update +sed_runner "s/export UCX_PY_VERSION=.*/export UCX_PY_VERSION='${NEXT_UCX_PY_VERSION}'/g" ci/gpu/build.sh +sed_runner "s/export UCX_PY_VERSION=.*/export UCX_PY_VERSION='${NEXT_UCX_PY_VERSION}'/g" ci/gpu/java.sh From 38631a635fbfe05f69fd243df03868ec1f23d3c5 Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Wed, 15 Dec 2021 08:29:05 -0600 Subject: [PATCH 09/13] Fix the java build after parquet partitioning support (#9908) This fixes the java build after #9810 went in. There is a lot of copy/paste in this first draft, because I just wanted to get something to work. Not sure if it is worth going back to make it common everywhere. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/9908 --- java/src/main/native/src/TableJni.cpp | 63 +++++++++++++++++++-------- 1 file changed, 45 insertions(+), 18 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 5bae4f5f399..0914c8a23f7 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -705,15 +705,12 @@ int set_column_metadata(cudf::io::column_in_metadata &column_metadata, void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_names, jintArray &j_children, jbooleanArray &j_col_nullability, - jobjectArray &j_metadata_keys, jobjectArray &j_metadata_values, jbooleanArray &j_is_int96, jintArray &j_precisions, jbooleanArray &j_is_map, cudf::io::table_input_metadata &metadata) { cudf::jni::auto_set_device(env); cudf::jni::native_jstringArray col_names(env, j_col_names); cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability); cudf::jni::native_jbooleanArray is_int96(env, j_is_int96); - cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); - cudf::jni::native_jstringArray meta_values(env, j_metadata_values); cudf::jni::native_jintArray precisions(env, j_precisions); cudf::jni::native_jintArray children(env, j_children); cudf::jni::native_jbooleanArray is_map(env, j_is_map); @@ -742,9 +739,6 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam is_int96, precisions, is_map, children, childs_children, read_index); } } - for (auto i = 0; i < meta_keys.size(); ++i) { - metadata.user_data[meta_keys[i].get()] = meta_values[i].get(); - } } // Check that window parameters are valid. 
@@ -1364,15 +1358,23 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin( using namespace cudf::jni; sink_info sink{data_sink.get()}; table_input_metadata metadata; - createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, - j_metadata_keys, j_metadata_values, j_isInt96, j_precisions, j_is_map, - metadata); + createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96, + j_precisions, j_is_map, metadata); + + cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); + cudf::jni::native_jstringArray meta_values(env, j_metadata_values); + + std::map kv_metadata; + for (auto i = 0; i < meta_keys.size(); ++i) { + kv_metadata[meta_keys[i].get()] = meta_values[i].get(); + } chunked_parquet_writer_options opts = chunked_parquet_writer_options::builder(sink) .metadata(&metadata) .compression(static_cast(j_compression)) .stats_level(static_cast(j_stats_freq)) + .key_value_metadata({kv_metadata}) .build(); auto writer_ptr = std::make_unique(opts); cudf::jni::native_parquet_writer_handle *ret = @@ -1398,15 +1400,24 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin( using namespace cudf::io; using namespace cudf::jni; table_input_metadata metadata; - createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, - j_metadata_keys, j_metadata_values, j_isInt96, j_precisions, j_is_map, - metadata); + createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96, + j_precisions, j_is_map, metadata); + + cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); + cudf::jni::native_jstringArray meta_values(env, j_metadata_values); + + std::map kv_metadata; + for (auto i = 0; i < meta_keys.size(); ++i) { + kv_metadata[meta_keys[i].get()] = meta_values[i].get(); + } + sink_info sink{output_path.get()}; chunked_parquet_writer_options opts = chunked_parquet_writer_options::builder(sink) .metadata(&metadata) .compression(static_cast(j_compression)) .stats_level(static_cast(j_stats_freq)) + .key_value_metadata({kv_metadata}) .build(); auto writer_ptr = std::make_unique(opts); @@ -1519,9 +1530,16 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( table_input_metadata metadata; // ORC has no `j_is_int96`, but `createTableMetaData` needs a lvalue. 
jbooleanArray j_is_int96 = NULL; - createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, - j_metadata_keys, j_metadata_values, j_is_int96, j_precisions, j_is_map, - metadata); + createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96, + j_precisions, j_is_map, metadata); + + cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); + cudf::jni::native_jstringArray meta_values(env, j_metadata_values); + + std::map kv_metadata; + for (auto i = 0; i < meta_keys.size(); ++i) { + kv_metadata[meta_keys[i].get()] = meta_values[i].get(); + } std::unique_ptr data_sink( new cudf::jni::jni_writer_data_sink(env, consumer)); @@ -1530,6 +1548,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin( .metadata(&metadata) .compression(static_cast(j_compression)) .enable_statistics(true) + .key_value_metadata(kv_metadata) .build(); auto writer_ptr = std::make_unique(opts); cudf::jni::native_orc_writer_handle *ret = @@ -1556,15 +1575,23 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin( table_input_metadata metadata; // ORC has no `j_is_int96`, but `createTableMetaData` needs a lvalue. jbooleanArray j_is_int96 = NULL; - createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, - j_metadata_keys, j_metadata_values, j_is_int96, j_precisions, j_is_map, - metadata); + createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96, + j_precisions, j_is_map, metadata); + + cudf::jni::native_jstringArray meta_keys(env, j_metadata_keys); + cudf::jni::native_jstringArray meta_values(env, j_metadata_values); + + std::map kv_metadata; + for (auto i = 0; i < meta_keys.size(); ++i) { + kv_metadata[meta_keys[i].get()] = meta_values[i].get(); + } sink_info sink{output_path.get()}; chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink) .metadata(&metadata) .compression(static_cast(j_compression)) .enable_statistics(true) + .key_value_metadata(kv_metadata) .build(); auto writer_ptr = std::make_unique(opts); cudf::jni::native_orc_writer_handle *ret = From db9aef8181c400d707d512a2449cc9927d4a3bc5 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 15 Dec 2021 09:49:57 -0500 Subject: [PATCH 10/13] Add regex_flags parameter to strings replace_re functions (#9878) Closes #9845 Adds a `cudf::strings::regex_flags` parameter to the `cudf::strings::replace_re` functions so the matching logic will be the same as for `cudf::strings::contains_re` which already has this parameter. This is a breaking change since it adds this new parameter and changes the default behavior. The previous default behavior is equivalent to specifying the `regex_flags::MULTILINE` flag now to be consistent with the default behavior of `contains_re`. 
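For reference, a minimal usage sketch of the updated `replace_re` signature (the input column, pattern, and replacement below are illustrative only, not part of this change; they mirror the tests added in this PR):

```cpp
#include <cudf/strings/replace_re.hpp>
#include <cudf/strings/strings_column_view.hpp>

// sv is a cudf::strings_column_view over an existing strings column.
// Passing regex_flags::MULTILINE reproduces the previous default behavior,
// where ^ and $ also match at line boundaries inside each string.
auto replaced = cudf::strings::replace_re(
  sv,
  "^aba$",                                 // pattern
  cudf::string_scalar("_"),                // replacement
  std::nullopt,                            // max_replace_count: replace every match
  cudf::strings::regex_flags::MULTILINE);  // new flags parameter
```

Omitting the flags argument now uses `regex_flags::DEFAULT`, consistent with `contains_re`.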
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Mike Wilson (https://github.com/hyperbolic2346) URL: https://github.com/rapidsai/cudf/pull/9878 --- cpp/include/cudf/strings/replace_re.hpp | 24 ++++--- cpp/src/strings/replace/backref_re.cu | 16 +++-- cpp/src/strings/replace/multi_re.cu | 84 +++++++++-------------- cpp/src/strings/replace/replace_re.cu | 68 +++++++++--------- cpp/tests/strings/replace_regex_tests.cpp | 52 ++++++++++++++ 5 files changed, 143 insertions(+), 101 deletions(-) diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index 087d1a94603..a2c4eba1636 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -17,6 +17,7 @@ #include #include +#include #include namespace cudf { @@ -37,22 +38,25 @@ namespace strings { * * @param strings Strings instance for this operation. * @param pattern The regular expression pattern to search within each string. - * @param repl The string used to replace the matched sequence in each string. + * @param replacement The string used to replace the matched sequence in each string. * Default is an empty string. - * @param maxrepl The maximum number of times to replace the matched pattern within each string. + * @param max_replace_count The maximum number of times to replace the matched pattern + * within each string. Default replaces every substring that is matched. + * @param flags Regex flags for interpreting special characters in the pattern. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ std::unique_ptr replace_re( strings_column_view const& strings, std::string const& pattern, - string_scalar const& repl = string_scalar(""), - size_type maxrepl = -1, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + string_scalar const& replacement = string_scalar(""), + std::optional max_replace_count = std::nullopt, + regex_flags const flags = regex_flags::DEFAULT, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief For each string, replaces any character sequence matching the given patterns - * with the corresponding string in the repls column. + * with the corresponding string in the `replacements` column. * * Any null string entries return corresponding null output column entries. * @@ -60,14 +64,16 @@ std::unique_ptr replace_re( * * @param strings Strings instance for this operation. * @param patterns The regular expression patterns to search within each string. - * @param repls The strings used for replacement. + * @param replacements The strings used for replacement. + * @param flags Regex flags for interpreting special characters in the patterns. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ std::unique_ptr replace_re( strings_column_view const& strings, std::vector const& patterns, - strings_column_view const& repls, + strings_column_view const& replacements, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -83,6 +89,7 @@ std::unique_ptr replace_re( * @param strings Strings instance for this operation. * @param pattern The regular expression patterns to search within each string. * @param replacement The replacement template for creating the output string. 
+ * @param flags Regex flags for interpreting special characters in the pattern. * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ @@ -90,6 +97,7 @@ std::unique_ptr replace_with_backrefs( strings_column_view const& strings, std::string const& pattern, std::string const& replacement, + regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace strings diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu index 99c55998fb9..ff86d7aa552 100644 --- a/cpp/src/strings/replace/backref_re.cu +++ b/cpp/src/strings/replace/backref_re.cu @@ -101,22 +101,24 @@ std::pair> parse_backrefs(std::string con std::unique_ptr replace_with_backrefs( strings_column_view const& strings, std::string const& pattern, - std::string const& repl, + std::string const& replacement, + regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { if (strings.is_empty()) return make_empty_column(type_id::STRING); CUDF_EXPECTS(!pattern.empty(), "Parameter pattern must not be empty"); - CUDF_EXPECTS(!repl.empty(), "Parameter repl must not be empty"); + CUDF_EXPECTS(!replacement.empty(), "Parameter replacement must not be empty"); auto d_strings = column_device_view::create(strings.parent(), stream); // compile regex into device object - auto d_prog = reprog_device::create(pattern, get_character_flags_table(), strings.size(), stream); + auto d_prog = + reprog_device::create(pattern, flags, get_character_flags_table(), strings.size(), stream); auto const regex_insts = d_prog->insts_counts(); // parse the repl string for back-ref indicators - auto const parse_result = parse_backrefs(repl); + auto const parse_result = parse_backrefs(replacement); rmm::device_uvector backrefs = cudf::detail::make_device_uvector_async(parse_result.second, stream); string_scalar repl_scalar(parse_result.first, true, stream); @@ -170,11 +172,13 @@ std::unique_ptr replace_with_backrefs( std::unique_ptr replace_with_backrefs(strings_column_view const& strings, std::string const& pattern, - std::string const& repl, + std::string const& replacement, + regex_flags const flags, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_with_backrefs(strings, pattern, repl, rmm::cuda_stream_default, mr); + return detail::replace_with_backrefs( + strings, pattern, replacement, flags, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 25417909c89..2b5380b76dd 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -53,12 +54,11 @@ using found_range = thrust::pair; template struct replace_multi_regex_fn { column_device_view const d_strings; - reprog_device* progs; // array of regex progs - size_type number_of_patterns; - found_range* d_found_ranges; // working array matched (begin,end) values - column_device_view const d_repls; // replacement strings - int32_t* d_offsets{}; // these are null when - char* d_chars{}; // only computing size + device_span progs; // array of regex progs + found_range* d_found_ranges; // working array matched (begin,end) values + column_device_view const d_repls; // replacement strings + int32_t* d_offsets{}; + char* d_chars{}; 
__device__ void operator()(size_type idx) { @@ -66,6 +66,9 @@ struct replace_multi_regex_fn { if (!d_chars) d_offsets[idx] = 0; return; } + + auto const number_of_patterns = static_cast(progs.size()); + auto const d_str = d_strings.element(idx); auto const nchars = d_str.length(); // number of characters in input string auto nbytes = d_str.size_bytes(); // number of bytes in input string @@ -129,7 +132,8 @@ struct replace_multi_regex_fn { std::unique_ptr replace_re( strings_column_view const& strings, std::vector const& patterns, - strings_column_view const& repls, + strings_column_view const& replacements, + regex_flags const flags, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { @@ -138,31 +142,25 @@ std::unique_ptr replace_re( if (patterns.empty()) // no patterns; just return a copy return std::make_unique(strings.parent(), stream, mr); - CUDF_EXPECTS(!repls.has_nulls(), "Parameter repls must not have any nulls"); + CUDF_EXPECTS(!replacements.has_nulls(), "Parameter replacements must not have any nulls"); - auto d_strings = column_device_view::create(strings.parent(), stream); - auto d_repls = column_device_view::create(repls.parent(), stream); - auto d_flags = get_character_flags_table(); + auto d_strings = column_device_view::create(strings.parent(), stream); + auto d_repls = column_device_view::create(replacements.parent(), stream); + auto d_char_table = get_character_flags_table(); // compile regexes into device objects size_type regex_insts = 0; std::vector>> h_progs; - thrust::host_vector progs; + std::vector progs; for (auto itr = patterns.begin(); itr != patterns.end(); ++itr) { - auto prog = reprog_device::create(*itr, d_flags, strings_count, stream); + auto prog = reprog_device::create(*itr, flags, d_char_table, strings_count, stream); regex_insts = std::max(regex_insts, prog->insts_counts()); progs.push_back(*prog); h_progs.emplace_back(std::move(prog)); } // copy all the reprog_device instances to a device memory array - rmm::device_buffer progs_buffer{sizeof(reprog_device) * progs.size(), stream}; - CUDA_TRY(cudaMemcpyAsync(progs_buffer.data(), - progs.data(), - progs.size() * sizeof(reprog_device), - cudaMemcpyHostToDevice, - stream.value())); - reprog_device* d_progs = reinterpret_cast(progs_buffer.data()); + auto d_progs = cudf::detail::make_device_uvector_async(progs, stream); // create working buffer for ranges pairs rmm::device_uvector found_ranges(patterns.size() * strings_count, stream); @@ -172,34 +170,19 @@ std::unique_ptr replace_re( auto children = [&] { // Each invocation is predicated on the stack size which is dependent on the number of regex // instructions - if (regex_insts <= RX_SMALL_INSTS) - return make_strings_children( - replace_multi_regex_fn{ - *d_strings, d_progs, static_cast(progs.size()), d_found_ranges, *d_repls}, - strings_count, - stream, - mr); - else if (regex_insts <= RX_MEDIUM_INSTS) - return make_strings_children( - replace_multi_regex_fn{ - *d_strings, d_progs, static_cast(progs.size()), d_found_ranges, *d_repls}, - strings_count, - stream, - mr); - else if (regex_insts <= RX_LARGE_INSTS) - return make_strings_children( - replace_multi_regex_fn{ - *d_strings, d_progs, static_cast(progs.size()), d_found_ranges, *d_repls}, - strings_count, - stream, - mr); - else - return make_strings_children( - replace_multi_regex_fn{ - *d_strings, d_progs, static_cast(progs.size()), d_found_ranges, *d_repls}, - strings_count, - stream, - mr); + if (regex_insts <= RX_SMALL_INSTS) { + 
replace_multi_regex_fn fn{*d_strings, d_progs, d_found_ranges, *d_repls}; + return make_strings_children(fn, strings_count, stream, mr); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + replace_multi_regex_fn fn{*d_strings, d_progs, d_found_ranges, *d_repls}; + return make_strings_children(fn, strings_count, stream, mr); + } else if (regex_insts <= RX_LARGE_INSTS) { + replace_multi_regex_fn fn{*d_strings, d_progs, d_found_ranges, *d_repls}; + return make_strings_children(fn, strings_count, stream, mr); + } else { + replace_multi_regex_fn fn{*d_strings, d_progs, d_found_ranges, *d_repls}; + return make_strings_children(fn, strings_count, stream, mr); + } }(); return make_strings_column(strings_count, @@ -215,11 +198,12 @@ std::unique_ptr replace_re( std::unique_ptr replace_re(strings_column_view const& strings, std::vector const& patterns, - strings_column_view const& repls, + strings_column_view const& replacements, + regex_flags const flags, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_re(strings, patterns, repls, rmm::cuda_stream_default, mr); + return detail::replace_re(strings, patterns, replacements, flags, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu index b940944c186..9fd1768453a 100644 --- a/cpp/src/strings/replace/replace_re.cu +++ b/cpp/src/strings/replace/replace_re.cu @@ -52,7 +52,7 @@ struct replace_regex_fn { column_device_view const d_strings; reprog_device prog; string_view const d_repl; - size_type maxrepl; + size_type const maxrepl; int32_t* d_offsets{}; char* d_chars{}; @@ -102,56 +102,48 @@ struct replace_regex_fn { std::unique_ptr replace_re( strings_column_view const& strings, std::string const& pattern, - string_scalar const& repl = string_scalar(""), - size_type maxrepl = -1, + string_scalar const& replacement, + std::optional max_replace_count, + regex_flags const flags, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto strings_count = strings.size(); if (strings_count == 0) return make_empty_column(type_id::STRING); - CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid"); - string_view d_repl(repl.data(), repl.size()); + CUDF_EXPECTS(replacement.is_valid(stream), "Parameter replacement must be valid"); + string_view d_repl(replacement.data(), replacement.size()); auto strings_column = column_device_view::create(strings.parent(), stream); auto d_strings = *strings_column; // compile regex into device object - auto prog = reprog_device::create(pattern, get_character_flags_table(), strings_count, stream); - auto d_prog = *prog; - auto regex_insts = d_prog.insts_counts(); + auto prog = + reprog_device::create(pattern, flags, get_character_flags_table(), strings_count, stream); + auto d_prog = *prog; + auto const regex_insts = d_prog.insts_counts(); // copy null mask - auto null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); - auto null_count = strings.null_count(); + auto null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); + auto const null_count = strings.null_count(); + auto const maxrepl = max_replace_count.value_or(-1); // create child columns auto children = [&] { // Each invocation is predicated on the stack size which is dependent on the number of regex // instructions - if (regex_insts <= RX_SMALL_INSTS) - return make_strings_children( - replace_regex_fn{d_strings, d_prog, d_repl, 
maxrepl}, - strings_count, - stream, - mr); - else if (regex_insts <= RX_MEDIUM_INSTS) - return make_strings_children( - replace_regex_fn{d_strings, d_prog, d_repl, maxrepl}, - strings_count, - stream, - mr); - else if (regex_insts <= RX_LARGE_INSTS) - return make_strings_children( - replace_regex_fn{d_strings, d_prog, d_repl, maxrepl}, - strings_count, - stream, - mr); - else - return make_strings_children( - replace_regex_fn{d_strings, d_prog, d_repl, maxrepl}, - strings_count, - stream, - mr); + if (regex_insts <= RX_SMALL_INSTS) { + replace_regex_fn fn{d_strings, d_prog, d_repl, maxrepl}; + return make_strings_children(fn, strings_count, stream, mr); + } else if (regex_insts <= RX_MEDIUM_INSTS) { + replace_regex_fn fn{d_strings, d_prog, d_repl, maxrepl}; + return make_strings_children(fn, strings_count, stream, mr); + } else if (regex_insts <= RX_LARGE_INSTS) { + replace_regex_fn fn{d_strings, d_prog, d_repl, maxrepl}; + return make_strings_children(fn, strings_count, stream, mr); + } else { + replace_regex_fn fn{d_strings, d_prog, d_repl, maxrepl}; + return make_strings_children(fn, strings_count, stream, mr); + } }(); return make_strings_column(strings_count, @@ -167,12 +159,14 @@ std::unique_ptr replace_re( std::unique_ptr replace_re(strings_column_view const& strings, std::string const& pattern, - string_scalar const& repl, - size_type maxrepl, + string_scalar const& replacement, + std::optional max_replace_count, + regex_flags const flags, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::replace_re(strings, pattern, repl, maxrepl, rmm::cuda_stream_default, mr); + return detail::replace_re( + strings, pattern, replacement, max_replace_count, flags, rmm::cuda_stream_default, mr); } } // namespace strings diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp index 16308265a9b..eac06fa4588 100644 --- a/cpp/tests/strings/replace_regex_tests.cpp +++ b/cpp/tests/strings/replace_regex_tests.cpp @@ -133,6 +133,58 @@ TEST_F(StringsReplaceRegexTest, WithEmptyPattern) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); } +TEST_F(StringsReplaceRegexTest, MultiReplacement) +{ + cudf::test::strings_column_wrapper input({"aba bcd aba", "abababa abababa"}); + auto results = + cudf::strings::replace_re(cudf::strings_column_view(input), "aba", cudf::string_scalar("_"), 2); + cudf::test::strings_column_wrapper expected({"_ bcd _", "_b_ abababa"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + results = + cudf::strings::replace_re(cudf::strings_column_view(input), "aba", cudf::string_scalar(""), 0); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, input); +} + +TEST_F(StringsReplaceRegexTest, Multiline) +{ + auto const multiline = cudf::strings::regex_flags::MULTILINE; + + cudf::test::strings_column_wrapper input({"bcd\naba\nefg", "aba\naba abab\naba", "aba"}); + auto sv = cudf::strings_column_view(input); + + // single-replace + auto results = + cudf::strings::replace_re(sv, "^aba$", cudf::string_scalar("_"), std::nullopt, multiline); + cudf::test::strings_column_wrapper expected_ml({"bcd\n_\nefg", "_\naba abab\n_", "_"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_ml); + + results = cudf::strings::replace_re(sv, "^aba$", cudf::string_scalar("_")); + cudf::test::strings_column_wrapper expected({"bcd\naba\nefg", "aba\naba abab\naba", "_"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected); + + // multi-replace + std::vector patterns({"aba$", "^aba"}); + cudf::test::strings_column_wrapper 
repls({">", "<"}); + results = cudf::strings::replace_re(sv, patterns, cudf::strings_column_view(repls), multiline); + cudf::test::strings_column_wrapper multi_expected_ml({"bcd\n>\nefg", ">\n< abab\n>", ">"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, multi_expected_ml); + + results = cudf::strings::replace_re(sv, patterns, cudf::strings_column_view(repls)); + cudf::test::strings_column_wrapper multi_expected({"bcd\naba\nefg", "<\naba abab\n>", ">"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, multi_expected); + + // backref-replace + results = cudf::strings::replace_with_backrefs(sv, "(^aba)", "[\\1]", multiline); + cudf::test::strings_column_wrapper br_expected_ml( + {"bcd\n[aba]\nefg", "[aba]\n[aba] abab\n[aba]", "[aba]"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected_ml); + + results = cudf::strings::replace_with_backrefs(sv, "(^aba)", "[\\1]"); + cudf::test::strings_column_wrapper br_expected( + {"bcd\naba\nefg", "[aba]\naba abab\naba", "[aba]"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected); +} + TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest) { std::vector h_strings{"the quick brown fox jumps over the lazy dog", From 0c3f7356e0afe391dc874b55898029275a23db1c Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 15 Dec 2021 10:03:22 -0500 Subject: [PATCH 11/13] Add dictionary support to cudf::copy_if_else (#9887) Close #9885 Adds support for dictionary column types to `cudf::copy_if_else`. The column/scalar versions of this API will accept a scalar type that matches the dictionary's key type. The column/column version will accept 2 dictionary columns with matching key types. The result of the function will be a dictionary that incorporates both sets of keys or the scalar value as appropriate. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Mark Harris (https://github.com/harrism) URL: https://github.com/rapidsai/cudf/pull/9887 --- .../dictionary/dictionary_column_view.hpp | 5 ++ cpp/src/copying/copy.cu | 37 +++++++- cpp/src/dictionary/dictionary_column_view.cpp | 8 +- cpp/tests/copying/copy_tests.cpp | 84 +++++++++++++++++++ 4 files changed, 130 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp index 1da52e67e06..42f8310040e 100644 --- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp +++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp @@ -77,6 +77,11 @@ class dictionary_column_view : private column_view { */ column_view keys() const noexcept; + /** + * @brief Returns the `data_type` of the keys child column. + */ + data_type keys_type() const noexcept; + /** * @brief Returns the number of rows in the keys column. 
*/ diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu index 10af2ffb614..91fc5f02989 100644 --- a/cpp/src/copying/copy.cu +++ b/cpp/src/copying/copy.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -267,6 +268,22 @@ struct copy_if_else_functor_impl { } }; +template <> +struct copy_if_else_functor_impl { + template + std::unique_ptr operator()(Left const& lhs, + Right const& rhs, + size_type size, + bool, + bool, + Filter filter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + { + return scatter_gather_based_if_else(lhs, rhs, size, filter, stream, mr); + } +}; + /** * @brief Functor called by the `type_dispatcher` to invoke copy_if_else on combinations * of column_view and scalar @@ -297,7 +314,6 @@ std::unique_ptr copy_if_else(Left const& lhs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(lhs.type() == rhs.type(), "Both inputs must be of the same type"); CUDF_EXPECTS(boolean_mask.type() == data_type(type_id::BOOL8), "Boolean mask column must be of type type_id::BOOL8"); @@ -311,7 +327,11 @@ std::unique_ptr copy_if_else(Left const& lhs, return (!has_nulls || bool_mask_device.is_valid_nocheck(i)) and bool_mask_device.element(i); }; - return cudf::type_dispatcher(lhs.type(), + + // always dispatch on dictionary-type if either input is a dictionary + auto dispatch_type = cudf::is_dictionary(rhs.type()) ? rhs.type() : lhs.type(); + + return cudf::type_dispatcher(dispatch_type, copy_if_else_functor{}, lhs, rhs, @@ -334,6 +354,8 @@ std::unique_ptr copy_if_else(column_view const& lhs, CUDF_EXPECTS(boolean_mask.size() == lhs.size(), "Boolean mask column must be the same size as lhs and rhs columns"); CUDF_EXPECTS(lhs.size() == rhs.size(), "Both columns must be of the size"); + CUDF_EXPECTS(lhs.type() == rhs.type(), "Both inputs must be of the same type"); + return copy_if_else(lhs, rhs, lhs.has_nulls(), rhs.has_nulls(), boolean_mask, stream, mr); } @@ -345,6 +367,11 @@ std::unique_ptr copy_if_else(scalar const& lhs, { CUDF_EXPECTS(boolean_mask.size() == rhs.size(), "Boolean mask column must be the same size as rhs column"); + + auto rhs_type = + cudf::is_dictionary(rhs.type()) ? cudf::dictionary_column_view(rhs).keys_type() : rhs.type(); + CUDF_EXPECTS(lhs.type() == rhs_type, "Both inputs must be of the same type"); + return copy_if_else(lhs, rhs, !lhs.is_valid(stream), rhs.has_nulls(), boolean_mask, stream, mr); } @@ -356,6 +383,11 @@ std::unique_ptr copy_if_else(column_view const& lhs, { CUDF_EXPECTS(boolean_mask.size() == lhs.size(), "Boolean mask column must be the same size as lhs column"); + + auto lhs_type = + cudf::is_dictionary(lhs.type()) ? 
cudf::dictionary_column_view(lhs).keys_type() : lhs.type(); + CUDF_EXPECTS(lhs_type == rhs.type(), "Both inputs must be of the same type"); + return copy_if_else(lhs, rhs, lhs.has_nulls(), !rhs.is_valid(stream), boolean_mask, stream, mr); } @@ -365,6 +397,7 @@ std::unique_ptr copy_if_else(scalar const& lhs, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_EXPECTS(lhs.type() == rhs.type(), "Both inputs must be of the same type"); return copy_if_else( lhs, rhs, !lhs.is_valid(stream), !rhs.is_valid(stream), boolean_mask, stream, mr); } diff --git a/cpp/src/dictionary/dictionary_column_view.cpp b/cpp/src/dictionary/dictionary_column_view.cpp index d33fd6c548f..4906e5b4f9c 100644 --- a/cpp/src/dictionary/dictionary_column_view.cpp +++ b/cpp/src/dictionary/dictionary_column_view.cpp @@ -44,8 +44,12 @@ column_view dictionary_column_view::keys() const noexcept { return child(1); } size_type dictionary_column_view::keys_size() const noexcept { - if (size() == 0) return 0; - return keys().size(); + return (size() == 0) ? 0 : keys().size(); +} + +data_type dictionary_column_view::keys_type() const noexcept +{ + return (size() == 0) ? data_type{type_id::EMPTY} : keys().type(); } } // namespace cudf diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp index 651a977050c..4468bc69640 100644 --- a/cpp/tests/copying/copy_tests.cpp +++ b/cpp/tests/copying/copy_tests.cpp @@ -18,11 +18,13 @@ #include #include #include +#include #include #include #include #include +#include #include template @@ -633,3 +635,85 @@ TYPED_TEST(FixedPointTypes, FixedPointScaleMismatch) EXPECT_THROW(cudf::copy_if_else(a, b, mask), cudf::logic_error); } + +struct DictionaryCopyIfElseTest : public cudf::test::BaseFixture { +}; + +TEST_F(DictionaryCopyIfElseTest, ColumnColumn) +{ + auto valids = cudf::test::iterators::null_at(2); + std::vector h_strings1{"eee", "bb", "", "aa", "bb", "ééé"}; + cudf::test::dictionary_column_wrapper input1( + h_strings1.begin(), h_strings1.end(), valids); + std::vector h_strings2{"zz", "bb", "", "aa", "ééé", "ooo"}; + cudf::test::dictionary_column_wrapper input2( + h_strings2.begin(), h_strings2.end(), valids); + + bool mask[] = {1, 1, 0, 1, 0, 1}; + bool mask_v[] = {1, 1, 1, 1, 1, 0}; + cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6, mask_v); + + auto results = cudf::copy_if_else(input1, input2, mask_w); + auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view())); + + std::vector h_expected; + for (cudf::size_type idx = 0; idx < static_cast(h_strings1.size()); ++idx) { + if (mask[idx] and mask_v[idx]) + h_expected.push_back(h_strings1[idx]); + else + h_expected.push_back(h_strings2[idx]); + } + cudf::test::strings_column_wrapper expected(h_expected.begin(), h_expected.end(), valids); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(decoded->view(), expected); +} + +TEST_F(DictionaryCopyIfElseTest, ColumnScalar) +{ + std::string h_string{"eee"}; + cudf::string_scalar input1{h_string}; + std::vector h_strings{"zz", "", "yyy", "w", "ééé", "ooo"}; + auto valids = cudf::test::iterators::null_at(1); + cudf::test::dictionary_column_wrapper input2( + h_strings.begin(), h_strings.end(), valids); + + bool mask[] = {0, 1, 1, 1, 0, 1}; + cudf::test::fixed_width_column_wrapper mask_w(mask, mask + 6); + + auto results = cudf::copy_if_else(input2, input1, mask_w); + auto decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view())); + + std::vector h_expected1; + std::vector h_expected2; + for (cudf::size_type idx 
= 0; idx < static_cast(h_strings.size()); ++idx) { + if (mask[idx]) { + h_expected1.push_back(h_strings[idx]); + h_expected2.push_back(h_string.c_str()); + } else { + h_expected1.push_back(h_string.c_str()); + h_expected2.push_back(h_strings[idx]); + } + } + + cudf::test::strings_column_wrapper expected1(h_expected1.begin(), h_expected1.end(), valids); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(decoded->view(), expected1); + + results = cudf::copy_if_else(input1, input2, mask_w); + decoded = cudf::dictionary::decode(cudf::dictionary_column_view(results->view())); + + cudf::test::strings_column_wrapper expected2(h_expected2.begin(), h_expected2.end()); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(decoded->view(), expected2); +} + +TEST_F(DictionaryCopyIfElseTest, TypeMismatch) +{ + cudf::test::dictionary_column_wrapper input1({1, 1, 1, 1}); + cudf::test::dictionary_column_wrapper input2({1.0, 1.0, 1.0, 1.0}); + cudf::test::fixed_width_column_wrapper mask({1, 0, 0, 1}); + + EXPECT_THROW(cudf::copy_if_else(input1, input2, mask), cudf::logic_error); + + cudf::string_scalar input3{"1"}; + EXPECT_THROW(cudf::copy_if_else(input1, input3, mask), cudf::logic_error); + EXPECT_THROW(cudf::copy_if_else(input3, input2, mask), cudf::logic_error); + EXPECT_THROW(cudf::copy_if_else(input2, input3, mask), cudf::logic_error); +} From 967f3397fb486368d74916ae344c0e1d9eb0a1a8 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 15 Dec 2021 13:28:11 -0600 Subject: [PATCH 12/13] Remove conda envs for CUDA 11.0 and 11.2. (#9910) I think the development environments for CUDA 11.0 and 11.2 can be safely removed now that we require CUDA 11.5 to build. I also updated the default CUDA version in the conda recipes from 10.1 to 11.5. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/9910 --- conda/environments/cudf_dev_cuda11.0.yml | 69 ------------------------ conda/environments/cudf_dev_cuda11.2.yml | 69 ------------------------ conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/cudf_kafka/meta.yaml | 2 +- conda/recipes/custreamz/meta.yaml | 4 +- conda/recipes/dask-cudf/meta.yaml | 2 +- conda/recipes/libcudf/meta.yaml | 2 +- 7 files changed, 6 insertions(+), 144 deletions(-) delete mode 100644 conda/environments/cudf_dev_cuda11.0.yml delete mode 100644 conda/environments/cudf_dev_cuda11.2.yml diff --git a/conda/environments/cudf_dev_cuda11.0.yml b/conda/environments/cudf_dev_cuda11.0.yml deleted file mode 100644 index e7b92eddd9e..00000000000 --- a/conda/environments/cudf_dev_cuda11.0.yml +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. 
- -name: cudf_dev -channels: - - rapidsai - - nvidia - - rapidsai-nightly - - conda-forge -dependencies: - - clang=11.1.0 - - clang-tools=11.1.0 - - cupy>=9.5.0,<10.0.0a0 - - rmm=22.02.* - - cmake>=3.20.1 - - cmake_setuptools>=0.1.3 - - python>=3.7,<3.9 - - numba>=0.54 - - numpy - - pandas>=1.0,<1.4.0dev0 - - pyarrow=5.0.0=*cuda - - fastavro>=0.22.9 - - python-snappy>=0.6.0 - - notebook>=0.5.0 - - cython>=0.29,<0.30 - - fsspec>=0.6.0 - - pytest - - pytest-benchmark - - pytest-xdist - - sphinx - - sphinxcontrib-websupport - - nbsphinx - - numpydoc - - ipython - - pandoc=<2.0.0 - - cudatoolkit=11.0 - - pip - - flake8=3.8.3 - - black=19.10 - - isort=5.6.4 - - mypy=0.782 - - pydocstyle=6.1.1 - - typing_extensions - - pre-commit - - dask>=2021.11.1,<=2021.11.2 - - distributed>=2021.11.1,<=2021.11.2 - - streamz - - arrow-cpp=5.0.0 - - dlpack>=0.5,<0.6.0a0 - - arrow-cpp-proc * cuda - - double-conversion - - rapidjson - - hypothesis - - sphinx-markdown-tables - - sphinx-copybutton - - mimesis<4.1 - - packaging - - protobuf - - nvtx>=0.2.1 - - cachetools - - transformers<=4.10.3 - - pydata-sphinx-theme - - librdkafka=1.7.0 - - python-confluent-kafka=1.7.0 - - pip: - - git+https://github.com/dask/dask.git@main - - git+https://github.com/dask/distributed.git@main - - git+https://github.com/python-streamz/streamz.git@master - - pyorc - - ptxcompiler # [linux64] diff --git a/conda/environments/cudf_dev_cuda11.2.yml b/conda/environments/cudf_dev_cuda11.2.yml deleted file mode 100644 index 6fe8ed0fafe..00000000000 --- a/conda/environments/cudf_dev_cuda11.2.yml +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. - -name: cudf_dev -channels: - - rapidsai - - nvidia - - rapidsai-nightly - - conda-forge -dependencies: - - clang=11.1.0 - - clang-tools=11.1.0 - - cupy>=9.5.0,<10.0.0a0 - - rmm=22.02.* - - cmake>=3.20.1 - - cmake_setuptools>=0.1.3 - - python>=3.7,<3.9 - - numba>=0.54 - - numpy - - pandas>=1.0,<1.4.0dev0 - - pyarrow=5.0.0=*cuda - - fastavro>=0.22.9 - - python-snappy>=0.6.0 - - notebook>=0.5.0 - - cython>=0.29,<0.30 - - fsspec>=0.6.0 - - pytest - - pytest-benchmark - - pytest-xdist - - sphinx - - sphinxcontrib-websupport - - nbsphinx - - numpydoc - - ipython - - pandoc=<2.0.0 - - cudatoolkit=11.2 - - pip - - flake8=3.8.3 - - black=19.10 - - isort=5.6.4 - - mypy=0.782 - - pydocstyle=6.1.1 - - typing_extensions - - pre-commit - - dask>=2021.11.1,<=2021.11.2 - - distributed>=2021.11.1,<=2021.11.2 - - streamz - - arrow-cpp=5.0.0 - - dlpack>=0.5,<0.6.0a0 - - arrow-cpp-proc * cuda - - double-conversion - - rapidjson - - hypothesis - - sphinx-markdown-tables - - sphinx-copybutton - - mimesis<4.1 - - packaging - - protobuf - - nvtx>=0.2.1 - - cachetools - - transformers<=4.10.3 - - pydata-sphinx-theme - - librdkafka=1.7.0 - - python-confluent-kafka=1.7.0 - - pip: - - git+https://github.com/dask/dask.git@main - - git+https://github.com/dask/distributed.git@main - - git+https://github.com/python-streamz/streamz.git@master - - pyorc - - ptxcompiler # [linux64] diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 46eefbc825f..2600ab358cc 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} {% set cuda_major=cuda_version.split('.')[0] %} package: diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index af27d888b46..e450d306cbe 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} package: name: cudf_kafka diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index db8af9b0bed..a8b096d4892 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} package: name: custreamz @@ -29,7 +29,7 @@ requirements: - cudf_kafka {{ version }} run: - python - - streamz + - streamz - cudf {{ version }} - dask>=2021.11.1,<=2021.11.2 - distributed>=2021.11.1,<=2021.11.2 diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index d90de2d628c..ed3309056cf 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -3,7 +3,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} {% set py_version=environ.get('CONDA_PY', 36) %} -{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} {% set cuda_major=cuda_version.split('.')[0] %} package: diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index e78110f3233..bd9b76e4890 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -2,7 +2,7 @@ {% set version = environ.get('GIT_DESCRIBE_TAG', '0.0.0.dev').lstrip('v') + environ.get('VERSION_SUFFIX', '') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set cuda_version='.'.join(environ.get('CUDA', '10.1').split('.')[:2]) %} +{% set cuda_version='.'.join(environ.get('CUDA', '11.5').split('.')[:2]) %} {% set cuda_major=cuda_version.split('.')[0] %} package: From 0faf2afc2a12b8dad5e3d1fd823b6a8c98c28bcc Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 15 Dec 2021 12:32:11 -0700 Subject: [PATCH 13/13] Implement JNI for `cudf::scatter` APIs (#9903) This PR adds Java binding for both `cudf::scatter` APIs: ``` std::unique_ptr scatter( table_view const& source, column_view const& scatter_map, table_view const& target, ...) ``` and ``` std::unique_ptr
scatter( std::vector> const& source, column_view const& indices, table_view const& target, ...) ``` Closes #9892. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Jason Lowe (https://github.com/jlowe) URL: https://github.com/rapidsai/cudf/pull/9903 --- java/src/main/java/ai/rapids/cudf/Table.java | 68 ++++++++++++++++++- java/src/main/native/src/TableJni.cpp | 40 +++++++++++ .../test/java/ai/rapids/cudf/TableTest.java | 47 ++++++++++++- 3 files changed, 153 insertions(+), 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index 887a125e083..00c98c4fef8 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -649,6 +649,13 @@ private static native long[] dropDuplicates(long nativeHandle, int[] keyColumns, private static native long[] gather(long tableHandle, long gatherView, boolean checkBounds); + private static native long[] scatterTable(long srcTableHandle, long scatterView, + long targetTableHandle, boolean checkBounds) + throws CudfException; + private static native long[] scatterScalars(long[] srcScalarHandles, long scatterView, + long targetTableHandle, boolean checkBounds) + throws CudfException; + private static native long[] convertToRows(long nativeHandle); private static native long[] convertFromRows(long nativeColumnView, int[] types, int[] scale); @@ -2047,7 +2054,7 @@ public Table gather(ColumnView gatherMap) { * `n` is the number of rows in this table. * * @param gatherMap the map of indexes. Must be non-nullable and integral type. - * @param outOfBoundsPolicy policy to use when an out-of-range value is in `gatherMap` + * @param outOfBoundsPolicy policy to use when an out-of-range value is in `gatherMap`. * @return the resulting Table. */ public Table gather(ColumnView gatherMap, OutOfBoundsPolicy outOfBoundsPolicy) { @@ -2055,6 +2062,65 @@ public Table gather(ColumnView gatherMap, OutOfBoundsPolicy outOfBoundsPolicy) { return new Table(gather(nativeHandle, gatherMap.getNativeView(), checkBounds)); } + /** + * Scatters values from the source table into the target table out-of-place, returning a new + * result table. The scatter is performed according to a scatter map such that row `scatterMap[i]` + * of the destination table gets row `i` of the source table. All other rows of the destination + * table equal corresponding rows of the target table. + * + * The number of columns in source must match the number of columns in target and their + * corresponding data types must be the same. + * + * If the same index appears more than once in the scatter map, the result is undefined. + * + * A negative value `i` in the `scatterMap` is interpreted as `i + n`, where `n` is the number of + * rows in the `target` table. + * + * @param scatterMap The map of indexes. Must be non-nullable and integral type. + * @param target The table into which rows from the current table are to be scattered out-of-place. + * @param checkBounds Optionally perform bounds checking on the values of`scatterMap` and throw + * an exception if any of its values are out of bounds. + * @return A new table which is the result of out-of-place scattering the source table into the + * target table. 
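+   *         For example, with a 5-row target and `scatterMap = [0, 2, 4, -2]`, rows 0, 1, 2 and 3
+   *         of this table replace target rows 0, 2, 4 and 3 (the -2 wraps around to row 3), while
+   *         target row 1 is left unchanged.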
+ */ + public Table scatter(ColumnView scatterMap, Table target, boolean checkBounds) { + return new Table(scatterTable(nativeHandle, scatterMap.getNativeView(), target.getNativeView(), + checkBounds)); + } + + /** + * Scatters values from the source rows into the target table out-of-place, returning a new result + * table. The scatter is performed according to a scatter map such that row `scatterMap[i]` of the + * destination table is replaced by the source row `i`. All other rows of the destination table + * equal corresponding rows of the target table. + * + * The number of elements in source must match the number of columns in target and their + * corresponding data types must be the same. + * + * If the same index appears more than once in the scatter map, the result is undefined. + * + * A negative value `i` in the `scatterMap` is interpreted as `i + n`, where `n` is the number of + * rows in the `target` table. + * + * @param source The input scalars containing values to be scattered into the target table. + * @param scatterMap The map of indexes. Must be non-nullable and integral type. + * @param target The table into which the values from source are to be scattered out-of-place. + * @param checkBounds Optionally perform bounds checking on the values of`scatterMap` and throw + * an exception if any of its values are out of bounds. + * @return A new table which is the result of out-of-place scattering the source values into the + * target table. + */ + public static Table scatter(Scalar[] source, ColumnView scatterMap, Table target, + boolean checkBounds) { + long[] srcScalarHandles = new long[source.length]; + for(int i = 0; i < source.length; ++i) { + assert source[i] != null : "Scalar vectors passed in should not contain null"; + srcScalarHandles[i] = source[i].getScalarHandle(); + } + return new Table(scatterScalars(srcScalarHandles, scatterMap.getNativeView(), + target.getNativeView(), checkBounds)); + } + private GatherMap[] buildJoinGatherMaps(long[] gatherMapData) { long bufferSize = gatherMapData[0]; long leftAddr = gatherMapData[1]; diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 0914c8a23f7..0e6425ea7a2 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -2746,6 +2746,46 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclas CATCH_STD(env, 0); } +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterTable(JNIEnv *env, jclass, + jlong j_input, jlong j_map, + jlong j_target, + jboolean check_bounds) { + JNI_NULL_CHECK(env, j_input, "input table is null", 0); + JNI_NULL_CHECK(env, j_map, "map column is null", 0); + JNI_NULL_CHECK(env, j_target, "target table is null", 0); + try { + cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(j_input); + auto const map = reinterpret_cast(j_map); + auto const target = reinterpret_cast(j_target); + auto result = cudf::scatter(*input, *map, *target, check_bounds); + return cudf::jni::convert_table_for_return(env, result); + } + CATCH_STD(env, 0); +} + +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterScalars(JNIEnv *env, jclass, + jlongArray j_input, + jlong j_map, jlong j_target, + jboolean check_bounds) { + JNI_NULL_CHECK(env, j_input, "input scalars array is null", 0); + JNI_NULL_CHECK(env, j_map, "map column is null", 0); + JNI_NULL_CHECK(env, j_target, "target table is null", 0); + try { + cudf::jni::auto_set_device(env); + auto const scalars_array = 
cudf::jni::native_jpointerArray(env, j_input); + std::vector> input; + for (int i = 0; i < scalars_array.size(); ++i) { + input.emplace_back(*scalars_array[i]); + } + auto const map = reinterpret_cast(j_map); + auto const target = reinterpret_cast(j_target); + auto result = cudf::scatter(input, *map, *target, check_bounds); + return cudf::jni::convert_table_for_return(env, result); + } + CATCH_STD(env, 0); +} + JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_convertToRows(JNIEnv *env, jclass, jlong input_table) { JNI_NULL_CHECK(env, input_table, "input table is null", 0); diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index eeed8224425..86c55e19776 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -56,12 +56,12 @@ import java.util.function.Function; import java.util.stream.Collectors; -import static ai.rapids.cudf.ColumnWriterOptions.mapColumn; import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; import static ai.rapids.cudf.AssertUtils.assertPartialColumnsAreEqual; import static ai.rapids.cudf.AssertUtils.assertPartialTablesAreEqual; import static ai.rapids.cudf.AssertUtils.assertTableTypes; import static ai.rapids.cudf.AssertUtils.assertTablesAreEqual; +import static ai.rapids.cudf.ColumnWriterOptions.mapColumn; import static ai.rapids.cudf.ParquetWriterOptions.listBuilder; import static ai.rapids.cudf.ParquetWriterOptions.structBuilder; import static ai.rapids.cudf.Table.TestBuilder; @@ -6338,6 +6338,51 @@ void testBoundsCheckedGather() { } } + + @Test + void testScatterTable() { + try (Table srcTable = new Table.TestBuilder() + .column(1, 2, 3, 4, 5) + .column("A", "AA", "AAA", "AAAA", "AAAAA") + .decimal32Column(-3, 1, 2, 3, 4, 5) + .decimal64Column(-8, 100001L, 200002L, 300003L, 400004L, 500005L) + .build(); + ColumnVector scatterMap = ColumnVector.fromInts(0, 2, 4, -2); + Table targetTable = new Table.TestBuilder() + .column(-1, -2, -3, -4, -5) + .column("B", "BB", "BBB", "BBBB", "BBBBB") + .decimal32Column(-3, -1, -2, -3, -4, -5) + .decimal64Column(-8, -100001L, -200002L, -300003L, -400004L, -500005L) + .build(); + Table expected = new Table.TestBuilder() + .column(1, -2, 2, 4, 3) + .column("A", "BB", "AA", "AAAA", "AAA") + .decimal32Column(-3, 1, -2, 2, 4, 3) + .decimal64Column(-8, 100001L, -200002L, 200002L, 400004L, 300003L) + .build(); + Table result = srcTable.scatter(scatterMap, targetTable, false)) { + assertTablesAreEqual(expected, result); + } + } + + @Test + void testScatterScalars() { + try (Scalar s1 = Scalar.fromInt(0); + Scalar s2 = Scalar.fromString("A"); + ColumnVector scatterMap = ColumnVector.fromInts(0, 2, -1); + Table targetTable = new Table.TestBuilder() + .column(-1, -2, -3, -4, -5) + .column("B", "BB", "BBB", "BBBB", "BBBBB") + .build(); + Table expected = new Table.TestBuilder() + .column(0, -2, 0, -4, 0) + .column("A", "BB", "A", "BBBB", "A") + .build(); + Table result = Table.scatter(new Scalar[] { s1, s2 }, scatterMap, targetTable, false)) { + assertTablesAreEqual(expected, result); + } + } + @Test void testMaskWithoutValidity() { try (ColumnVector mask = ColumnVector.fromBoxedBooleans(true, false, true, false, true);