Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-22.04' into nvtx
Browse files Browse the repository at this point in the history
  • Loading branch information
galipremsagar committed Feb 3, 2022
2 parents 9867675 + 511aa28 commit e363967
Show file tree
Hide file tree
Showing 13 changed files with 451 additions and 96 deletions.
238 changes: 237 additions & 1 deletion CHANGELOG.md

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions cpp/benchmarks/io/orc/orc_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,13 @@ void BM_orc_read_varying_options(benchmark::State& state)
auto const use_np_dtypes = (flags & 2) != 0;
auto const ts_type = cudf::data_type{static_cast<cudf::type_id>(state.range(state_idx++))};

// skip_rows is not supported on nested types
auto const data_types =
dtypes_for_column_selection(get_type_or_group({int32_t(type_group_id::INTEGRAL_SIGNED),
int32_t(type_group_id::FLOATING_POINT),
int32_t(type_group_id::FIXED_POINT),
int32_t(type_group_id::TIMESTAMP),
int32_t(cudf::type_id::STRING),
int32_t(cudf::type_id::LIST)}),
int32_t(cudf::type_id::STRING)}),
col_sel);
auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();
Expand Down Expand Up @@ -181,11 +181,12 @@ BENCHMARK_REGISTER_F(OrcRead, column_selection)
->Unit(benchmark::kMillisecond)
->UseManualTime();

// Need an API to get the number of stripes to enable row_selection::STRIPES here
BENCHMARK_DEFINE_F(OrcRead, row_selection)
(::benchmark::State& state) { BM_orc_read_varying_options(state); }
BENCHMARK_REGISTER_F(OrcRead, row_selection)
->ArgsProduct({{int32_t(column_selection::ALL)},
{int32_t(row_selection::STRIPES), int32_t(row_selection::NROWS)},
{int32_t(row_selection::NROWS)},
{1, 8},
{0b11}, // defaults
{int32_t(cudf::type_id::EMPTY)}})
Expand Down
12 changes: 5 additions & 7 deletions cpp/benchmarks/io/parquet/parquet_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,13 +88,13 @@ void BM_parq_read_varying_options(benchmark::State& state)
auto const use_pandas_metadata = (flags & 2) != 0;
auto const ts_type = cudf::data_type{static_cast<cudf::type_id>(state.range(state_idx++))};

// No nested types here, because of https://github.com/rapidsai/cudf/issues/9970
auto const data_types = dtypes_for_column_selection(
get_type_or_group({static_cast<int32_t>(type_group_id::INTEGRAL),
static_cast<int32_t>(type_group_id::FLOATING_POINT),
static_cast<int32_t>(type_group_id::FIXED_POINT),
static_cast<int32_t>(type_group_id::TIMESTAMP),
static_cast<int32_t>(cudf::type_id::STRING),
static_cast<int32_t>(cudf::type_id::LIST)}),
static_cast<int32_t>(cudf::type_id::STRING)}),
col_sel);
auto const tbl = create_random_table(data_types, data_types.size(), table_size_bytes{data_size});
auto const view = tbl->view();
Expand Down Expand Up @@ -181,20 +181,18 @@ BENCHMARK_REGISTER_F(ParquetRead, column_selection)
->Unit(benchmark::kMillisecond)
->UseManualTime();

// Disabled until we add an API to read metadata from a parquet file and determine num row groups.
// https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863
/*
// row_selection::ROW_GROUPS disabled until we add an API to read metadata from a parquet file and
// determine num row groups. https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863
BENCHMARK_DEFINE_F(ParquetRead, row_selection)
(::benchmark::State& state) { BM_parq_read_varying_options(state); }
BENCHMARK_REGISTER_F(ParquetRead, row_selection)
->ArgsProduct({{int32_t(column_selection::ALL)},
{int32_t(row_selection::ROW_GROUPS), int32_t(row_selection::NROWS)},
{int32_t(row_selection::NROWS)},
{1, 4},
{0b01}, // defaults
{int32_t(cudf::type_id::EMPTY)}})
->Unit(benchmark::kMillisecond)
->UseManualTime();
*/

BENCHMARK_DEFINE_F(ParquetRead, misc_options)
(::benchmark::State& state) { BM_parq_read_varying_options(state); }
Expand Down
54 changes: 27 additions & 27 deletions cpp/include/cudf/fixed_point/fixed_point.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -387,9 +387,9 @@ class fixed_point {
/**
* @brief operator + (for adding two `fixed_point` numbers)
*
* If `_scale`s are equal, `_value`s are added <br>
* If `_scale`s are not equal, number with smaller `_scale` is shifted to the
* greater `_scale`, and then `_value`s are added
* If `_scale`s are equal, `_value`s are added.
* If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
* smaller `_scale`, and then the `_value`s are added.
*
* @tparam Rep1 Representation type of number being added to `this`
* @tparam Rad1 Radix (base) type of number being added to `this`
Expand All @@ -402,9 +402,9 @@ class fixed_point {
/**
* @brief operator - (for subtracting two `fixed_point` numbers)
*
* If `_scale`s are equal, `_value`s are subtracted <br>
* If `_scale`s are not equal, number with smaller `_scale` is shifted to the
* greater `_scale`, and then `_value`s are subtracted
* If `_scale`s are equal, `_value`s are subtracted.
* If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
* smaller `_scale`, and then the `_value`s are subtracted.
*
* @tparam Rep1 Representation type of number being subtracted from `this`
* @tparam Rad1 Radix (base) type of number being subtracted from `this`
Expand All @@ -417,7 +417,7 @@ class fixed_point {
/**
* @brief operator * (for multiplying two `fixed_point` numbers)
*
* `_scale`s are added and `_value`s are multiplied
* `_scale`s are added and `_value`s are multiplied.
*
* @tparam Rep1 Representation type of number being multiplied with `this`
* @tparam Rad1 Radix (base) type of number being multiplied with `this`
Expand All @@ -430,7 +430,7 @@ class fixed_point {
/**
* @brief operator / (for dividing two `fixed_point` numbers)
*
* `_scale`s are subtracted and `_value`s are divided
* `_scale`s are subtracted and `_value`s are divided.
*
* @tparam Rep1 Representation type of number dividing `this`
* @tparam Rad1 Radix (base) type of number dividing `this`
Expand All @@ -443,9 +443,9 @@ class fixed_point {
/**
* @brief operator == (for comparing two `fixed_point` numbers)
*
* If `_scale`s are equal, `_value`s are compared <br>
* If `_scale`s are not equal, number with smaller `_scale` is shifted to the
* greater `_scale`, and then `_value`s are compared
* If `_scale`s are equal, `_value`s are compared.
* If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
* smaller `_scale`, and then the `_value`s are compared.
*
* @tparam Rep1 Representation type of number being compared with `this`
* @tparam Rad1 Radix (base) type of number being compared with `this`
Expand All @@ -458,9 +458,9 @@ class fixed_point {
/**
* @brief operator != (for comparing two `fixed_point` numbers)
*
* If `_scale`s are equal, `_value`s are compared <br>
* If `_scale`s are not equal, number with smaller `_scale` is shifted to the
* greater `_scale`, and then `_value`s are compared
* If `_scale`s are equal, `_value`s are compared.
* If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
* smaller `_scale`, and then the `_value`s are compared.
*
* @tparam Rep1 Representation type of number being compared with `this`
* @tparam Rad1 Radix (base) type of number being compared with `this`
Expand All @@ -473,9 +473,9 @@ class fixed_point {
/**
* @brief operator <= (for comparing two `fixed_point` numbers)
*
* If `_scale`s are equal, `_value`s are compared <br>
* If `_scale`s are not equal, number with smaller `_scale` is shifted to the
* greater `_scale`, and then `_value`s are compared
* If `_scale`s are equal, `_value`s are compared.
* If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
* smaller `_scale`, and then the `_value`s are compared.
*
* @tparam Rep1 Representation type of number being compared with `this`
* @tparam Rad1 Radix (base) type of number being compared with `this`
Expand All @@ -488,9 +488,9 @@ class fixed_point {
/**
* @brief operator >= (for comparing two `fixed_point` numbers)
*
* If `_scale`s are equal, `_value`s are compared <br>
* If `_scale`s are not equal, number with smaller `_scale` is shifted to the
* greater `_scale`, and then `_value`s are compared
* If `_scale`s are equal, `_value`s are compared.
* If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
* smaller `_scale`, and then the `_value`s are compared.
*
* @tparam Rep1 Representation type of number being compared with `this`
* @tparam Rad1 Radix (base) type of number being compared with `this`
Expand All @@ -503,9 +503,9 @@ class fixed_point {
/**
* @brief operator < (for comparing two `fixed_point` numbers)
*
* If `_scale`s are equal, `_value`s are compared <br>
* If `_scale`s are not equal, number with smaller `_scale` is shifted to the
* greater `_scale`, and then `_value`s are compared
* If `_scale`s are equal, `_value`s are compared.
* If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
* smaller `_scale`, and then the `_value`s are compared.
*
* @tparam Rep1 Representation type of number being compared with `this`
* @tparam Rad1 Radix (base) type of number being compared with `this`
Expand All @@ -518,9 +518,9 @@ class fixed_point {
/**
* @brief operator > (for comparing two `fixed_point` numbers)
*
* If `_scale`s are equal, `_value`s are compared <br>
* If `_scale`s are not equal, number with smaller `_scale` is shifted to the
* greater `_scale`, and then `_value`s are compared
* If `_scale`s are equal, `_value`s are compared.
* If `_scale`s are not equal, the number with the larger `_scale` is shifted to the
* smaller `_scale`, and then the `_value`s are compared.
*
* @tparam Rep1 Representation type of number being compared with `this`
* @tparam Rad1 Radix (base) type of number being compared with `this`
Expand All @@ -534,7 +534,7 @@ class fixed_point {
* @brief Method for creating a `fixed_point` number with a new `scale`
*
* The `fixed_point` number returned will have the same value, underlying representation and
* radix as `this`, the only thing changed is the scale
* radix as `this`, the only thing changed is the scale.
*
* @param scale The `scale` of the returned `fixed_point` number
* @return `fixed_point` number with a new `scale`
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/join/hash_join.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ class make_pair_function {
{
// Compute the hash value of row `i`
auto row_hash_value = remap_sentinel_hash(_hash(i), _empty_key_sentinel);
return cuco::make_pair<hash_value_type, size_type>(std::move(row_hash_value), std::move(i));
return cuco::make_pair(row_hash_value, i);
}

private:
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/join/mixed_join_semi.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ struct make_pair_function_semi {
{
// The value is irrelevant since we only ever use the hash map to check for
// membership of a particular row index.
return cuco::make_pair<hash_value_type, size_type>(i, 0);
return cuco::make_pair(static_cast<hash_value_type>(i), 0);
}
};

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/join/semi_join.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ struct make_pair_function {
{
// The value is irrelevant since we only ever use the hash map to check for
// membership of a particular row index.
return cuco::make_pair<hash_value_type, size_type>(i, 0);
return cuco::make_pair(static_cast<hash_value_type>(i), 0);
}
};

Expand Down
31 changes: 31 additions & 0 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6120,6 +6120,37 @@ def __dataframe__(
self, nan_as_null=nan_as_null, allow_copy=allow_copy
)

def nunique(self, axis=0, dropna=True):
    """
    Count number of distinct elements in specified axis.

    Return Series with number of distinct elements. Can ignore NaN values.

    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
        column-wise. Only 0 / 'index' is currently supported.
    dropna : bool, default True
        Don't include NaN in the counts.

    Returns
    -------
    Series

    Raises
    ------
    NotImplementedError
        If ``axis`` is anything other than 0 or 'index'.

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]})
    >>> df.nunique()
    A    3
    B    2
    dtype: int64
    """
    # 'index' is the documented alias of axis 0, so it must not be rejected.
    if axis not in (0, "index"):
        raise NotImplementedError("axis parameter is not supported yet.")

    # The parent implementation returns a per-column mapping of counts;
    # wrap it in a Series to match the documented return type.
    return cudf.Series(super().nunique(method="sort", dropna=dropna))


def from_dataframe(df, allow_copy=False):
return df_protocol.from_dataframe(df, allow_copy=allow_copy)
Expand Down
47 changes: 35 additions & 12 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import builtins
import copy
import pickle
import warnings
Expand Down Expand Up @@ -6080,12 +6081,12 @@ def eq(self, other, axis="columns", level=None, fill_value=None):
... 'd': [10, 12, 12]}
... )
>>> left.eq(right)
a b c d
a b c d
0 True True <NA> <NA>
1 True True <NA> <NA>
2 True True <NA> <NA>
>>> left.eq(right, fill_value=7)
a b c d
a b c d
0 True True True False
1 True True False False
2 True True False False
Expand Down Expand Up @@ -6156,12 +6157,12 @@ def ne(self, other, axis="columns", level=None, fill_value=None):
... 'd': [10, 12, 12]}
... )
>>> left.ne(right)
a b c d
a b c d
0 False False <NA> <NA>
1 False False <NA> <NA>
2 False False <NA> <NA>
>>> left.ne(right, fill_value=7)
a b c d
a b c d
0 False False False True
1 False False True True
2 False False True True
Expand Down Expand Up @@ -6232,12 +6233,12 @@ def lt(self, other, axis="columns", level=None, fill_value=None):
... 'd': [10, 12, 12]}
... )
>>> left.lt(right)
a b c d
a b c d
0 False False <NA> <NA>
1 False False <NA> <NA>
2 False False <NA> <NA>
>>> left.lt(right, fill_value=7)
a b c d
a b c d
0 False False False True
1 False False False True
2 False False False True
Expand Down Expand Up @@ -6308,12 +6309,12 @@ def le(self, other, axis="columns", level=None, fill_value=None):
... 'd': [10, 12, 12]}
... )
>>> left.le(right)
a b c d
a b c d
0 True True <NA> <NA>
1 True True <NA> <NA>
2 True True <NA> <NA>
>>> left.le(right, fill_value=7)
a b c d
a b c d
0 True True True True
1 True True False True
2 True True False True
Expand Down Expand Up @@ -6384,12 +6385,12 @@ def gt(self, other, axis="columns", level=None, fill_value=None):
... 'd': [10, 12, 12]}
... )
>>> left.gt(right)
a b c d
a b c d
0 False False <NA> <NA>
1 False False <NA> <NA>
2 False False <NA> <NA>
>>> left.gt(right, fill_value=7)
a b c d
a b c d
0 False False False False
1 False False True False
2 False False True False
Expand Down Expand Up @@ -6460,12 +6461,12 @@ def ge(self, other, axis="columns", level=None, fill_value=None):
... 'd': [10, 12, 12]}
... )
>>> left.ge(right)
a b c d
a b c d
0 True True <NA> <NA>
1 True True <NA> <NA>
2 True True <NA> <NA>
>>> left.ge(right, fill_value=7)
a b c d
a b c d
0 True True True False
1 True True True False
2 True True True False
Expand Down Expand Up @@ -6505,6 +6506,28 @@ def ge(self, other, axis="columns", level=None, fill_value=None):
other=other, fn="ge", fill_value=fill_value, can_reindex=True
)

def nunique(self, method: builtins.str = "sort", dropna: bool = True):
    """
    Compute the number of distinct values held by every column.

    Parameters
    ----------
    method : builtins.str, default "sort"
        Forwarded unchanged to each column's ``distinct_count``.
    dropna : bool, default True
        Forwarded unchanged to each column's ``distinct_count``;
        don't include NaN in the counts.

    Returns
    -------
    dict
        Maps each column name to that column's distinct-value count,
        in the iteration order of ``self._data``.
    """
    counts = {}
    for label, column in self._data.items():
        counts[label] = column.distinct_count(method=method, dropna=dropna)
    return counts


@annotate(
"FRAME_GET_REPLACEMENT_VALUES_FOR_COLUMNS",
Expand Down
Loading

0 comments on commit e363967

Please sign in to comment.