diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 073f0d62c0a..6fe9e2616e1 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -182,6 +182,7 @@ ConfigureBench(TYPE_DISPATCHER_BENCH "${TD_BENCH_SRC}")
 
 set(REDUCTION_BENCH_SRC
   "${CMAKE_CURRENT_SOURCE_DIR}/reduction/anyall_benchmark.cpp"
+  "${CMAKE_CURRENT_SOURCE_DIR}/reduction/dictionary_benchmark.cpp"
   "${CMAKE_CURRENT_SOURCE_DIR}/reduction/reduce_benchmark.cpp"
   "${CMAKE_CURRENT_SOURCE_DIR}/reduction/minmax_benchmark.cpp")
 
diff --git a/cpp/benchmarks/reduction/anyall_benchmark.cpp b/cpp/benchmarks/reduction/anyall_benchmark.cpp
index 72379725abf..97d66585f8c 100644
--- a/cpp/benchmarks/reduction/anyall_benchmark.cpp
+++ b/cpp/benchmarks/reduction/anyall_benchmark.cpp
@@ -22,7 +22,6 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 
-#include <memory>
 #include <random>
 
 class Reduction : public cudf::benchmark {
@@ -33,18 +32,18 @@ void BM_reduction_anyall(benchmark::State& state, std::unique_ptr<cudf::aggregat
 {
   const cudf::size_type column_size{static_cast<cudf::size_type>(state.range(0))};
 
-  cudf::test::UniformRandomGenerator<long> rand_gen(0, 100);
+  cudf::test::UniformRandomGenerator<long> rand_gen(
+    (agg->kind == cudf::aggregation::ALL ? 1 : 0), (agg->kind == cudf::aggregation::ANY ? 0 : 100));
   auto data_it = cudf::detail::make_counting_transform_iterator(
     0, [&rand_gen](cudf::size_type row) { return rand_gen.generate(); });
   cudf::test::fixed_width_column_wrapper<type, typename decltype(data_it)::value_type> values(
     data_it, data_it + column_size);
 
-  auto input_column = cudf::column_view(values);
   cudf::data_type output_dtype{cudf::type_id::BOOL8};
 
   for (auto _ : state) {
     cuda_event_timer timer(state, true);
-    auto result = cudf::reduce(input_column, agg, output_dtype);
+    auto result = cudf::reduce(values, agg, output_dtype);
   }
 }
 
diff --git a/cpp/benchmarks/reduction/dictionary_benchmark.cpp b/cpp/benchmarks/reduction/dictionary_benchmark.cpp
new file mode 100644
index 00000000000..3622b36eb66
--- /dev/null
+++ b/cpp/benchmarks/reduction/dictionary_benchmark.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/reduction.hpp>
+#include <cudf/types.hpp>
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <random>
+
+class ReductionDictionary : public cudf::benchmark {  // fixture used by the *_F macros below
+};
+
+template <typename T>
+void BM_reduction_dictionary(benchmark::State& state, std::unique_ptr<cudf::aggregation> const& agg)
+{
+  auto const column_size = static_cast<cudf::size_type>(state.range(0));
+  auto const is_any      = agg->kind == cudf::aggregation::ANY;
+  auto const is_all      = agg->kind == cudf::aggregation::ALL;
+  // ALL draws from [1,100] and ANY from [0,0]; presumably so neither can be decided early
+  cudf::test::UniformRandomGenerator<long> rand_gen(is_all ? 1 : 0, is_any ? 0 : 100);
+  auto data_it = cudf::detail::make_counting_transform_iterator(
+    0, [&rand_gen](cudf::size_type) { return rand_gen.generate(); });
+  cudf::test::dictionary_column_wrapper<T, typename decltype(data_it)::value_type> values(
+    data_it, data_it + column_size);
+
+  // ANY/ALL produce BOOL8, MEAN produces FLOAT64, every other aggregation keeps the key type
+  auto const output_dtype = (is_any || is_all) ? cudf::data_type{cudf::type_id::BOOL8}
+                            : (agg->kind == cudf::aggregation::MEAN)
+                              ? cudf::data_type{cudf::type_id::FLOAT64}
+                              : cudf::data_type{cudf::type_to_id<T>()};
+
+  for (auto _ : state) {
+    cuda_event_timer timer(state, true);
+    auto result = cudf::reduce(values, agg, output_dtype);
+  }
+}
+
+#define concat(a, b, c) a##b##c  // pastes a, b and c together
+#define get_agg(op) concat(cudf::make_, op, _aggregation())  // cudf::make_<op>_aggregation()
+
+// Defines and registers one fixture benchmark: BM_reduction_dictionary<type> with `aggregation`.
+#define RBM_BENCHMARK_DEFINE(name, type, aggregation)                       \
+  BENCHMARK_DEFINE_F(ReductionDictionary, name)(::benchmark::State & state) \
+  {                                                                         \
+    BM_reduction_dictionary<type>(state, get_agg(aggregation));             \
+  }                                                                         \
+  BENCHMARK_REGISTER_F(ReductionDictionary, name)                           \
+    ->UseManualTime()                                                       \
+    ->Arg(10000)      /* 10k */                                             \
+    ->Arg(100000)     /* 100k */                                            \
+    ->Arg(1000000)    /* 1M */                                              \
+    ->Arg(10000000)   /* 10M */                                             \
+    ->Arg(100000000); /* 100M */
+
+#define REDUCE_BENCHMARK_DEFINE(type, aggregation) \
+  RBM_BENCHMARK_DEFINE(concat(type, _, aggregation), type, aggregation)
+
+REDUCE_BENCHMARK_DEFINE(int32_t, all);
+REDUCE_BENCHMARK_DEFINE(float, all);
+REDUCE_BENCHMARK_DEFINE(int32_t, any);
+REDUCE_BENCHMARK_DEFINE(float, any);
+REDUCE_BENCHMARK_DEFINE(int32_t, min);
+REDUCE_BENCHMARK_DEFINE(float, min);
+REDUCE_BENCHMARK_DEFINE(int32_t, max);
+REDUCE_BENCHMARK_DEFINE(float, max);
+REDUCE_BENCHMARK_DEFINE(int32_t, mean);
+REDUCE_BENCHMARK_DEFINE(float, mean);
diff --git a/cpp/include/cudf/dictionary/detail/iterator.cuh b/cpp/include/cudf/dictionary/detail/iterator.cuh
index 88563f2334b..5c9abfa69ce 100644
--- a/cpp/include/cudf/dictionary/detail/iterator.cuh
+++ b/cpp/include/cudf/dictionary/detail/iterator.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -66,13 +66,13 @@ auto make_dictionary_iterator(column_device_view const& dictionary_column)
  * @brief Accessor functor for returning a dictionary pair iterator.
  *
  * @tparam KeyType The type of the dictionary's key element.
- * @tparam has_nulls Set to `true` if `d_dictionary` has nulls.
  *
  * @throw cudf::logic_error if `has_nulls==true` and `d_dictionary` is not nullable.
  */
-template <typename KeyType, bool has_nulls>
+template <typename KeyType>
 struct dictionary_access_pair_fn {
-  dictionary_access_pair_fn(column_device_view const& d_dictionary) : d_dictionary{d_dictionary}
+  dictionary_access_pair_fn(column_device_view const& d_dictionary, bool has_nulls = true)
+    : d_dictionary{d_dictionary}, has_nulls{has_nulls}
   {
     if (has_nulls) { CUDF_EXPECTS(d_dictionary.nullable(), "unexpected non-nullable column"); }
   }
@@ -80,13 +80,14 @@ struct dictionary_access_pair_fn {
   __device__ thrust::pair<KeyType, bool> operator()(size_type idx) const
   {
     if (has_nulls && d_dictionary.is_null(idx)) return {KeyType{}, false};
-    auto keys = d_dictionary.child(1);
+    auto keys = d_dictionary.child(dictionary_column_view::keys_column_index);
     return {keys.element<KeyType>(static_cast<size_type>(d_dictionary.element<dictionary32>(idx))),
             true};
   };
 
  private:
   column_device_view const d_dictionary;
+  bool has_nulls;
 };
 
 /**
@@ -100,19 +101,20 @@ struct dictionary_access_pair_fn {
  * @throw cudf::logic_error if `dictionary_column` is not a dictionary column.
  *
  * @tparam KeyType The type of the dictionary's key element.
- * @tparam has_nulls Set to `true` if the dictionary_column has nulls.
  *
  * @param dictionary_column The dictionary device view to iterate.
+ * @param has_nulls Set to `true` if the `dictionary_column` has nulls.
  * @return Pair iterator with `{value,valid}`
  */
-template <typename KeyType, bool has_nulls>
-auto make_dictionary_pair_iterator(column_device_view const& dictionary_column)
+template <typename KeyType>
+auto make_dictionary_pair_iterator(column_device_view const& dictionary_column,
+                                   bool has_nulls = true)
 {
   CUDF_EXPECTS(is_dictionary(dictionary_column.type()),
                "Dictionary iterator is only for dictionary columns");
   return thrust::make_transform_iterator(
     thrust::make_counting_iterator<size_type>(0),
-    dictionary_access_pair_fn<KeyType, has_nulls>{dictionary_column});
+    dictionary_access_pair_fn<KeyType>{dictionary_column, has_nulls});
 }
 
 }  // namespace detail
diff --git a/cpp/src/reductions/all.cu b/cpp/src/reductions/all.cu
index 79390441e0a..81e63278e4e 100644
--- a/cpp/src/reductions/all.cu
+++ b/cpp/src/reductions/all.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,11 +15,65 @@
  */
 
 #include <cudf/detail/reduction_functions.hpp>
+#include <cudf/detail/utilities/device_atomics.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <reductions/simple.cuh>
 
 namespace cudf {
 namespace reduction {
+namespace detail {
+namespace {
+
+/**
+ * @brief Compute reduction all() for dictionary columns.
+ *
+ * The original note states this compiles 10x faster than thrust::reduce or the
+ * `detail::reduce` utility used in reductions/simple.cuh: both use the CUB
+ * DeviceReduce, which aggressively inlines the input iterator logic.
+ */
+struct all_fn {
+  template <typename Iterator>
+  struct all_true_fn {
+    __device__ void operator()(size_type idx)
+    {
+      // read the flag once: only a `false` row can change it, and only while it is still set
+      if (*d_result && !iter[idx]) atomicAnd(d_result, false);
+    }
+    Iterator iter;   ///< row iterator whose elements are compared as bool
+    bool* d_result;  ///< device flag; the caller initializes it to `true`
+  };
+
+  template <typename T, std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
+  std::unique_ptr<scalar> operator()(column_view const& input,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
+  {
+    auto const d_dict = cudf::column_device_view::create(input, stream);
+    // {value, valid} pairs from the dictionary, passed through op::min's null-replacing transformer
+    auto const pair_iter =
+      cudf::dictionary::detail::make_dictionary_pair_iterator<T>(*d_dict, input.has_nulls());
+    auto const iter = thrust::make_transform_iterator(
+      pair_iter,
+      cudf::reduction::op::min{}.template get_null_replacing_element_transformer<bool>());
+    auto result = std::make_unique<numeric_scalar<bool>>(true, true, stream, mr);
+    thrust::for_each_n(rmm::exec_policy(stream),
+                       thrust::make_counting_iterator<size_type>(0),
+                       input.size(),
+                       all_true_fn<decltype(iter)>{iter, result->data()});
+    return result;
+  }
+  template <typename T, std::enable_if_t<!std::is_arithmetic<T>::value>* = nullptr>
+  std::unique_ptr<scalar> operator()(column_view const&,
+                                     rmm::cuda_stream_view,
+                                     rmm::mr::device_memory_resource*)
+  {
+    CUDF_FAIL("Unexpected key type for dictionary in reduction all()");
+  }
+};
+
+}  // namespace
+}  // namespace detail
 
 std::unique_ptr<cudf::scalar> all(column_view const& col,
                                   cudf::data_type const output_dtype,
@@ -28,9 +82,13 @@ std::unique_ptr<cudf::scalar> all(column_view const& col,
 {
   CUDF_EXPECTS(output_dtype == cudf::data_type(cudf::type_id::BOOL8),
                "all() operation can be applied with output type `BOOL8` only");
-  auto const dispatch_type =
-    cudf::is_dictionary(col.type()) ? dictionary_column_view(col).keys().type() : col.type();
-  return cudf::type_dispatcher(dispatch_type,
+
+  if (cudf::is_dictionary(col.type())) {
+    return cudf::type_dispatcher(
+      dictionary_column_view(col).keys().type(), detail::all_fn{}, col, stream, mr);
+  }
+  // dispatch for non-dictionary types
+  return cudf::type_dispatcher(col.type(),
                                simple::bool_result_element_dispatcher<cudf::reduction::op::min>{},
                                col,
                                stream,
diff --git a/cpp/src/reductions/any.cu b/cpp/src/reductions/any.cu
index 78a1e4635e7..ceeef017768 100644
--- a/cpp/src/reductions/any.cu
+++ b/cpp/src/reductions/any.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,11 +15,65 @@
  */
 
 #include <cudf/detail/reduction_functions.hpp>
+#include <cudf/detail/utilities/device_atomics.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <reductions/simple.cuh>
 
 namespace cudf {
 namespace reduction {
+namespace detail {
+namespace {
+
+/**
+ * @brief Compute reduction any() for dictionary columns.
+ *
+ * The original note states this compiles 10x faster than thrust::reduce or the
+ * `detail::reduce` utility used in reductions/simple.cuh: both use the CUB
+ * DeviceReduce, which aggressively inlines the input iterator logic.
+ */
+struct any_fn {
+  template <typename Iterator>
+  struct any_true_fn {
+    __device__ void operator()(size_type idx)
+    {
+      // read the flag once: only a `true` row can change it, and only while it is still clear
+      if (!*d_result && iter[idx]) atomicOr(d_result, true);
+    }
+    Iterator iter;   ///< row iterator whose elements are compared as bool
+    bool* d_result;  ///< device flag; the caller initializes it to `false`
+  };
+
+  template <typename T, std::enable_if_t<std::is_arithmetic<T>::value>* = nullptr>
+  std::unique_ptr<scalar> operator()(column_view const& input,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::mr::device_memory_resource* mr)
+  {
+    auto const d_dict = cudf::column_device_view::create(input, stream);
+    // {value, valid} pairs from the dictionary, passed through op::max's null-replacing transformer
+    auto const pair_iter =
+      cudf::dictionary::detail::make_dictionary_pair_iterator<T>(*d_dict, input.has_nulls());
+    auto const iter = thrust::make_transform_iterator(
+      pair_iter,
+      cudf::reduction::op::max{}.template get_null_replacing_element_transformer<bool>());
+    auto result = std::make_unique<numeric_scalar<bool>>(false, true, stream, mr);
+    thrust::for_each_n(rmm::exec_policy(stream),
+                       thrust::make_counting_iterator<size_type>(0),
+                       input.size(),
+                       any_true_fn<decltype(iter)>{iter, result->data()});
+    return result;
+  }
+  template <typename T, std::enable_if_t<!std::is_arithmetic<T>::value>* = nullptr>
+  std::unique_ptr<scalar> operator()(column_view const&,
+                                     rmm::cuda_stream_view,
+                                     rmm::mr::device_memory_resource*)
+  {
+    CUDF_FAIL("Unexpected key type for dictionary in reduction any()");
+  }
+};
+
+}  // namespace
+}  // namespace detail
 
 std::unique_ptr<cudf::scalar> any(column_view const& col,
                                   cudf::data_type const output_dtype,
@@ -28,9 +82,13 @@ std::unique_ptr<cudf::scalar> any(column_view const& col,
 {
   CUDF_EXPECTS(output_dtype == cudf::data_type(cudf::type_id::BOOL8),
                "any() operation can be applied with output type `bool8` only");
-  auto const dispatch_type =
-    cudf::is_dictionary(col.type()) ? dictionary_column_view(col).keys().type() : col.type();
-  return cudf::type_dispatcher(dispatch_type,
+
+  if (cudf::is_dictionary(col.type())) {
+    return cudf::type_dispatcher(
+      dictionary_column_view(col).keys().type(), detail::any_fn{}, col, stream, mr);
+  }
+  // dispatch for non-dictionary types
+  return cudf::type_dispatcher(col.type(),
                                simple::bool_result_element_dispatcher<cudf::reduction::op::max>{},
                                col,
                                stream,
diff --git a/cpp/src/reductions/compound.cuh b/cpp/src/reductions/compound.cuh
index 011af214d45..09d812e5d94 100644
--- a/cpp/src/reductions/compound.cuh
+++ b/cpp/src/reductions/compound.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -70,19 +70,11 @@ std::unique_ptr<scalar> compound_reduction(column_view const& col,
         it, col.size(), compound_op, valid_count, ddof, stream, mr);
     }
   } else {
-    if (col.has_nulls()) {
-      auto it = thrust::make_transform_iterator(
-        cudf::dictionary::detail::make_dictionary_pair_iterator<ElementType, true>(*dcol),
-        compound_op.template get_null_replacing_element_transformer<ResultType>());
-      result = detail::reduce<Op, decltype(it), ResultType>(
-        it, col.size(), compound_op, valid_count, ddof, stream, mr);
-    } else {
-      auto it = thrust::make_transform_iterator(
-        cudf::dictionary::detail::make_dictionary_iterator<ElementType>(*dcol),
-        compound_op.template get_element_transformer<ResultType>());
-      result = detail::reduce<Op, decltype(it), ResultType>(
-        it, col.size(), compound_op, valid_count, ddof, stream, mr);
-    }
+    auto it = thrust::make_transform_iterator(
+      cudf::dictionary::detail::make_dictionary_pair_iterator<ElementType>(*dcol, col.has_nulls()),
+      compound_op.template get_null_replacing_element_transformer<ResultType>());
+    result = detail::reduce<Op, decltype(it), ResultType>(
+      it, col.size(), compound_op, valid_count, ddof, stream, mr);
   }
 
   // set scalar is valid
diff --git a/cpp/src/reductions/simple.cuh b/cpp/src/reductions/simple.cuh
index f1c68e30dc9..baaedda7d63 100644
--- a/cpp/src/reductions/simple.cuh
+++ b/cpp/src/reductions/simple.cuh
@@ -139,17 +139,11 @@ std::unique_ptr<scalar> dictionary_reduction(column_view const& col,
   auto simple_op = Op{};
 
   auto result = [&] {
-    if (col.has_nulls()) {
-      auto f  = simple_op.template get_null_replacing_element_transformer<ResultType>();
-      auto p  = cudf::dictionary::detail::make_dictionary_pair_iterator<ElementType, true>(*dcol);
-      auto it = thrust::make_transform_iterator(p, f);
-      return detail::reduce(it, col.size(), simple_op, stream, mr);
-    } else {
-      auto f  = simple_op.template get_element_transformer<ResultType>();
-      auto p  = cudf::dictionary::detail::make_dictionary_iterator<ElementType>(*dcol);
-      auto it = thrust::make_transform_iterator(p, f);
-      return detail::reduce(it, col.size(), simple_op, stream, mr);
-    }
+    auto f = simple_op.template get_null_replacing_element_transformer<ResultType>();
+    auto p =
+      cudf::dictionary::detail::make_dictionary_pair_iterator<ElementType>(*dcol, col.has_nulls());
+    auto it = thrust::make_transform_iterator(p, f);
+    return detail::reduce(it, col.size(), simple_op, stream, mr);
   }();
 
   // set scalar is valid
@@ -232,9 +226,7 @@ struct bool_result_element_dispatcher {
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
   {
-    return cudf::is_dictionary(col.type())
-             ? dictionary_reduction<ElementType, bool, Op>(col, stream, mr)
-             : simple_reduction<ElementType, bool, Op>(col, stream, mr);
+    return simple_reduction<ElementType, bool, Op>(col, stream, mr);
   }
 
   template <typename ElementType,