Merge remote-tracking branch 'upstream/branch-22.04' into nvtx

rapidsai · Feb 3, 2022 · e363967 · e363967
2 parents 9867675 + 511aa28
commit e363967
Show file tree

Hide file tree

Showing 13 changed files with 451 additions and 96 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
diff --git a/cpp/benchmarks/io/orc/orc_reader.cpp b/cpp/benchmarks/io/orc/orc_reader.cpp
@@ -88,13 +88,13 @@ void BM_orc_read_varying_options(benchmark::State& state)
   auto const use_np_dtypes = (flags & 2) != 0;
   auto const ts_type       = cudf::data_type{static_cast<cudf::type_id>(state.range(state_idx++))};
 
+  // skip_rows is not supported on nested types
   auto const data_types =
     dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
                                                    int32_t(type_group_id::FLOATING_POINT),
                                                    int32_t(type_group_id::FIXED_POINT),
                                                    int32_t(type_group_id::TIMESTAMP),
-                                                   int32_t(cudf::type_id::STRING),
-                                                   int32_t(cudf::type_id::LIST)}),
+                                                   int32_t(cudf::type_id::STRING)}),
                                 col_sel);
   auto const tbl  = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
   auto const view = tbl->view();
@@ -181,11 +181,12 @@ BENCHMARK_REGISTER_F(OrcRead, column_selection)
   ->Unit(benchmark::kMillisecond)
   ->UseManualTime();
 
+// Need an API to get the number of stripes to enable row_selection::STRIPES here
 BENCHMARK_DEFINE_F(OrcRead, row_selection)
 (::benchmark::State& state) { BM_orc_read_varying_options(state); }
 BENCHMARK_REGISTER_F(OrcRead, row_selection)
   ->ArgsProduct({{int32_t(column_selection::ALL)},
-                 {int32_t(row_selection::STRIPES), int32_t(row_selection::NROWS)},
+                 {int32_t(row_selection::NROWS)},
                  {1, 8},
                  {0b11},  // defaults
                  {int32_t(cudf::type_id::EMPTY)}})

diff --git a/cpp/benchmarks/io/parquet/parquet_reader.cpp b/cpp/benchmarks/io/parquet/parquet_reader.cpp
@@ -88,13 +88,13 @@ void BM_parq_read_varying_options(benchmark::State& state)
   auto const use_pandas_metadata = (flags & 2) != 0;
   auto const ts_type = cudf::data_type{static_cast<cudf::type_id>(state.range(state_idx++))};
 
+  // No nested types here, because of https://github.com/rapidsai/cudf/issues/9970
   auto const data_types = dtypes_for_column_selection(
     get_type_or_group({static_cast<int32_t>(type_group_id::INTEGRAL),
                        static_cast<int32_t>(type_group_id::FLOATING_POINT),
                        static_cast<int32_t>(type_group_id::FIXED_POINT),
                        static_cast<int32_t>(type_group_id::TIMESTAMP),
-                       static_cast<int32_t>(cudf::type_id::STRING),
-                       static_cast<int32_t>(cudf::type_id::LIST)}),
+                       static_cast<int32_t>(cudf::type_id::STRING)}),
     col_sel);
   auto const tbl  = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
   auto const view = tbl->view();
@@ -181,20 +181,18 @@ BENCHMARK_REGISTER_F(ParquetRead, column_selection)
   ->Unit(benchmark::kMillisecond)
   ->UseManualTime();
 
-// Disabled until we add an API to read metadata from a parquet file and determine num row groups.
-// https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863
-/*
+// row_selection::ROW_GROUPS disabled until we add an API to read metadata from a parquet file and
+// determine num row groups. https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863
 BENCHMARK_DEFINE_F(ParquetRead, row_selection)
 (::benchmark::State& state) { BM_parq_read_varying_options(state); }
 BENCHMARK_REGISTER_F(ParquetRead, row_selection)
   ->ArgsProduct({{int32_t(column_selection::ALL)},
-                 {int32_t(row_selection::ROW_GROUPS), int32_t(row_selection::NROWS)},
+                 {int32_t(row_selection::NROWS)},
                  {1, 4},
                  {0b01},  // defaults
                  {int32_t(cudf::type_id::EMPTY)}})
   ->Unit(benchmark::kMillisecond)
   ->UseManualTime();
-*/
 
 BENCHMARK_DEFINE_F(ParquetRead, misc_options)
 (::benchmark::State& state) { BM_parq_read_varying_options(state); }

diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -387,9 +387,9 @@ class fixed_point {
   /**
    * @brief operator + (for adding two `fixed_point` numbers)
    *
-   * If `_scale`s are equal, `_value`s are added <br>
-   * If `_scale`s are not equal, number with smaller `_scale` is shifted to the
-   * greater `_scale`, and then `_value`s are added
+   * If `_scale`s are equal, `_value`s are added.
+   * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
+   * smaller `_scale`, and then the `_value`s are added.
    *
    * @tparam Rep1 Representation type of number being added to `this`
    * @tparam Rad1 Radix (base) type of number being added to `this`
@@ -402,9 +402,9 @@ class fixed_point {
   /**
    * @brief operator - (for subtracting two `fixed_point` numbers)
    *
-   * If `_scale`s are equal, `_value`s are subtracted <br>
-   * If `_scale`s are not equal, number with smaller `_scale` is shifted to the
-   * greater `_scale`, and then `_value`s are subtracted
+   * If `_scale`s are equal, `_value`s are subtracted.
+   * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
+   * smaller `_scale`, and then the `_value`s are subtracted.
    *
    * @tparam Rep1 Representation type of number being added to `this`
    * @tparam Rad1 Radix (base) type of number being added to `this`
@@ -417,7 +417,7 @@ class fixed_point {
   /**
    * @brief operator * (for multiplying two `fixed_point` numbers)
    *
-   * `_scale`s are added and `_value`s are multiplied
+   * `_scale`s are added and `_value`s are multiplied.
    *
    * @tparam Rep1 Representation type of number being added to `this`
    * @tparam Rad1 Radix (base) type of number being added to `this`
@@ -430,7 +430,7 @@ class fixed_point {
   /**
    * @brief operator / (for dividing two `fixed_point` numbers)
    *
-   * `_scale`s are subtracted and `_value`s are divided
+   * `_scale`s are subtracted and `_value`s are divided.
    *
    * @tparam Rep1 Representation type of number being added to `this`
    * @tparam Rad1 Radix (base) type of number being added to `this`
@@ -443,9 +443,9 @@ class fixed_point {
   /**
    * @brief operator == (for comparing two `fixed_point` numbers)
    *
-   * If `_scale`s are equal, `_value`s are compared <br>
-   * If `_scale`s are not equal, number with smaller `_scale` is shifted to the
-   * greater `_scale`, and then `_value`s are compared
+   * If `_scale`s are equal, `_value`s are compared.
+   * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
+   * smaller `_scale`, and then the `_value`s are compared.
    *
    * @tparam Rep1 Representation type of number being added to `this`
    * @tparam Rad1 Radix (base) type of number being added to `this`
@@ -458,9 +458,9 @@ class fixed_point {
   /**
    * @brief operator != (for comparing two `fixed_point` numbers)
    *
-   * If `_scale`s are equal, `_value`s are compared <br>
-   * If `_scale`s are not equal, number with smaller `_scale` is shifted to the
-   * greater `_scale`, and then `_value`s are compared
+   * If `_scale`s are equal, `_value`s are compared.
+   * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
+   * smaller `_scale`, and then the `_value`s are compared.
    *
    * @tparam Rep1 Representation type of number being added to `this`
    * @tparam Rad1 Radix (base) type of number being added to `this`
@@ -473,9 +473,9 @@ class fixed_point {
   /**
    * @brief operator <= (for comparing two `fixed_point` numbers)
    *
-   * If `_scale`s are equal, `_value`s are compared <br>
-   * If `_scale`s are not equal, number with smaller `_scale` is shifted to the
-   * greater `_scale`, and then `_value`s are compared
+   * If `_scale`s are equal, `_value`s are compared.
+   * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
+   * smaller `_scale`, and then the `_value`s are compared.
    *
    * @tparam Rep1 Representation type of number being added to `this`
    * @tparam Rad1 Radix (base) type of number being added to `this`
@@ -488,9 +488,9 @@ class fixed_point {
   /**
    * @brief operator >= (for comparing two `fixed_point` numbers)
    *
-   * If `_scale`s are equal, `_value`s are compared <br>
-   * If `_scale`s are not equal, number with smaller `_scale` is shifted to the
-   * greater `_scale`, and then `_value`s are compared
+   * If `_scale`s are equal, `_value`s are compared.
+   * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
+   * smaller `_scale`, and then the `_value`s are compared.
    *
    * @tparam Rep1 Representation type of number being added to `this`
    * @tparam Rad1 Radix (base) type of number being added to `this`
@@ -503,9 +503,9 @@ class fixed_point {
   /**
    * @brief operator < (for comparing two `fixed_point` numbers)
    *
-   * If `_scale`s are equal, `_value`s are compared <br>
-   * If `_scale`s are not equal, number with smaller `_scale` is shifted to the
-   * greater `_scale`, and then `_value`s are compared
+   * If `_scale`s are equal, `_value`s are compared.
+   * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
+   * smaller `_scale`, and then the `_value`s are compared.
    *
    * @tparam Rep1 Representation type of number being added to `this`
    * @tparam Rad1 Radix (base) type of number being added to `this`
@@ -518,9 +518,9 @@ class fixed_point {
   /**
    * @brief operator > (for comparing two `fixed_point` numbers)
    *
-   * If `_scale`s are equal, `_value`s are compared <br>
-   * If `_scale`s are not equal, number with smaller `_scale` is shifted to the
-   * greater `_scale`, and then `_value`s are compared
+   * If `_scale`s are equal, `_value`s are compared.
+   * If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
+   * smaller `_scale`, and then the `_value`s are compared.
    *
    * @tparam Rep1 Representation type of number being added to `this`
    * @tparam Rad1 Radix (base) type of number being added to `this`
@@ -534,7 +534,7 @@ class fixed_point {
    * @brief Method for creating a `fixed_point` number with a new `scale`
    *
    * The `fixed_point` number returned will have the same value, underlying representation and
-   * radix as `this`, the only thing changed is the scale
+   * radix as `this`; the only thing changed is the scale.
    *
    * @param scale The `scale` of the returned `fixed_point` number
    * @return `fixed_point` number with a new `scale`

diff --git a/cpp/src/join/hash_join.cuh b/cpp/src/join/hash_join.cuh
@@ -68,7 +68,7 @@ class make_pair_function {
   {
     // Compute the hash value of row `i`
     auto row_hash_value = remap_sentinel_hash(_hash(i), _empty_key_sentinel);
-    return cuco::make_pair<hash_value_type, size_type>(std::move(row_hash_value), std::move(i));
+    return cuco::make_pair(row_hash_value, i);
   }
 
  private:

diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu
@@ -45,7 +45,7 @@ struct make_pair_function_semi {
   {
     // The value is irrelevant since we only ever use the hash map to check for
     // membership of a particular row index.
-    return cuco::make_pair<hash_value_type, size_type>(i, 0);
+    return cuco::make_pair(static_cast<hash_value_type>(i), 0);
   }
 };
 

diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu
@@ -49,7 +49,7 @@ struct make_pair_function {
   {
     // The value is irrelevant since we only ever use the hash map to check for
     // membership of a particular row index.
-    return cuco::make_pair<hash_value_type, size_type>(i, 0);
+    return cuco::make_pair(static_cast<hash_value_type>(i), 0);
   }
 };
 

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -6120,6 +6120,37 @@ def __dataframe__(
             self, nan_as_null=nan_as_null, allow_copy=allow_copy
         )
 
+    def nunique(self, axis=0, dropna=True):
+        """
+        Count the number of distinct elements in each column.
+
+        Parameters
+        ----------
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            The axis to use. Only 0 or 'index' (one count per column) is
+            supported; any other value raises NotImplementedError.
+        dropna : bool, default True
+            Don't include NaN in the counts.
+
+        Returns
+        -------
+        Series
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]})
+        >>> df.nunique()
+        A    3
+        B    2
+        dtype: int64
+        """
+        # 'index' is the documented alias of axis 0, so accept both.
+        if axis not in (0, "index"):
+            raise NotImplementedError("axis parameter is not supported yet.")
+
+        return cudf.Series(super().nunique(method="sort", dropna=dropna))
+
 
 def from_dataframe(df, allow_copy=False):
     return df_protocol.from_dataframe(df, allow_copy=allow_copy)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import builtins
 import copy
 import pickle
 import warnings
@@ -6080,12 +6081,12 @@ def eq(self, other, axis="columns", level=None, fill_value=None):
         ...     'd': [10, 12, 12]}
         ... )
         >>> left.eq(right)
-        a     b     c     d
+              a     b     c     d
         0  True  True  <NA>  <NA>
         1  True  True  <NA>  <NA>
         2  True  True  <NA>  <NA>
         >>> left.eq(right, fill_value=7)
-        a     b      c      d
+              a     b      c      d
         0  True  True   True  False
         1  True  True  False  False
         2  True  True  False  False
@@ -6156,12 +6157,12 @@ def ne(self, other, axis="columns", level=None, fill_value=None):
         ...     'd': [10, 12, 12]}
         ... )
         >>> left.ne(right)
-        a      b     c     d
+               a      b     c     d
         0  False  False  <NA>  <NA>
         1  False  False  <NA>  <NA>
         2  False  False  <NA>  <NA>
         >>> left.ne(right, fill_value=7)
-        a      b      c     d
+               a      b      c     d
         0  False  False  False  True
         1  False  False   True  True
         2  False  False   True  True
@@ -6232,12 +6233,12 @@ def lt(self, other, axis="columns", level=None, fill_value=None):
         ...     'd': [10, 12, 12]}
         ... )
         >>> left.lt(right)
-        a      b     c     d
+               a      b     c     d
         0  False  False  <NA>  <NA>
         1  False  False  <NA>  <NA>
         2  False  False  <NA>  <NA>
         >>> left.lt(right, fill_value=7)
-        a      b      c     d
+               a      b      c     d
         0  False  False  False  True
         1  False  False  False  True
         2  False  False  False  True
@@ -6308,12 +6309,12 @@ def le(self, other, axis="columns", level=None, fill_value=None):
         ...     'd': [10, 12, 12]}
         ... )
         >>> left.le(right)
-        a     b     c     d
+              a     b     c     d
         0  True  True  <NA>  <NA>
         1  True  True  <NA>  <NA>
         2  True  True  <NA>  <NA>
         >>> left.le(right, fill_value=7)
-        a     b      c     d
+              a     b      c     d
         0  True  True   True  True
         1  True  True  False  True
         2  True  True  False  True
@@ -6384,12 +6385,12 @@ def gt(self, other, axis="columns", level=None, fill_value=None):
         ...     'd': [10, 12, 12]}
         ... )
         >>> left.gt(right)
-        a      b     c     d
+               a      b     c     d
         0  False  False  <NA>  <NA>
         1  False  False  <NA>  <NA>
         2  False  False  <NA>  <NA>
         >>> left.gt(right, fill_value=7)
-        a      b      c      d
+               a      b      c      d
         0  False  False  False  False
         1  False  False   True  False
         2  False  False   True  False
@@ -6460,12 +6461,12 @@ def ge(self, other, axis="columns", level=None, fill_value=None):
         ...     'd': [10, 12, 12]}
         ... )
         >>> left.ge(right)
-        a     b     c     d
+              a     b     c     d
         0  True  True  <NA>  <NA>
         1  True  True  <NA>  <NA>
         2  True  True  <NA>  <NA>
         >>> left.ge(right, fill_value=7)
-        a     b     c      d
+              a     b     c      d
         0  True  True  True  False
         1  True  True  True  False
         2  True  True  True  False
@@ -6505,6 +6506,28 @@ def ge(self, other, axis="columns", level=None, fill_value=None):
             other=other, fn="ge", fill_value=fill_value, can_reindex=True
         )
 
+    def nunique(self, method: builtins.str = "sort", dropna: bool = True):
+        """
+        Count the distinct values held by every column of this frame and
+        report the counts keyed by column name.
+
+        Parameters
+        ----------
+        method : builtins.str, default "sort"
+            Forwarded to each column's ``distinct_count``.
+        dropna : bool, default True
+            When True, NaN is left out of the counts.
+
+        Returns
+        -------
+        dict
+            Maps each column name to that column's distinct-value count.
+        """
+        counts = {}
+        for label, column in self._data.items():
+            counts[label] = column.distinct_count(method=method, dropna=dropna)
+        return counts
+
 
 @annotate(
     "FRAME_GET_REPLACEMENT_VALUES_FOR_COLUMNS",