diff --git a/BUILD.md b/BUILD.md
index 1bf3783fae..457ee85aad 100644
--- a/BUILD.md
+++ b/BUILD.md
@@ -26,39 +26,52 @@ The recommended way to build and install RAFT is to use the `build.sh` script in
 ### Header-only C++
 
-RAFT depends on many different core libraries such as `thrust`, `cub`, `cucollections`, and `rmm`, which will be downloaded automatically by `cmake` even when only installing the headers. It's important to note that while all the headers will be installed and available, some parts of the RAFT API depend on libraries like `FAISS`, which can also be downloaded in the RAFT build but will need to be told to do so.
+RAFT depends on many different core libraries such as `thrust`, `cub`, `cucollections`, and `rmm`, which will be downloaded automatically by `cmake` even when only installing the headers. It's important to note that while all the headers will be installed and available, some parts of the RAFT API depend on libraries like `FAISS`, which can also be downloaded in the RAFT build, but the build needs to be told explicitly to do so.
 
 The following example builds and installs raft in header-only mode:
 ```bash
-./build.sh libraft --nogtest
+./build.sh libraft
 ```
 
 ###C++ Shared Libraries (optional)
 
-Shared libraries are provided to speed up compile times for larger libraries which may heavily utilize some of the APIs. These shared libraries can also significantly improve re-compile times while developing against the APIs.
+Shared libraries are provided to speed up compile times for larger libraries which may heavily utilize some of the APIs. These shared libraries can also significantly improve re-compile times while developing against the APIs.
 
 Build all the shared libraries by passing `--compile-libs` flag to `build.sh`:
 ```bash
-./build.sh libraft --compile-libs --nogtest
+./build.sh libraft --compile-libs
 ```
-
+
 To remain flexible, the individual shared libraries have their own flags and multiple can be used (though currently only the `nn` and `distance` packages contain shared libraries):
 ```bash
-./build.sh libraft --compile-nn --compile-dist --nogtest
+./build.sh libraft --compile-nn --compile-dist
 ```
 
 ###Googletests
 
-Compile the Googletests by removing the `--nogtest` flag from `build.sh`:
+Compile the Googletests using the `tests` target in `build.sh`:
 ```bash
-./build.sh libraft --compile-nn --compile-dist
+./build.sh libraft tests --compile-nn --compile-dist
 ```
 
 To run C++ tests:
 
 ```bash
-./test_raft
+./cpp/build/test_raft
+```
+
+###Benchmarks
+
+Compile the benchmarks using the `bench` target in `build.sh`:
+```bash
+./build.sh libraft bench --compile-nn --compile-dist
+```
+
+To run the benchmarks:
+
+```bash
+./cpp/build/bench_raft
 ```
 
 ### C++ Using Cmake
@@ -77,15 +90,16 @@ RAFT's cmake has the following configurable flags available:.
 | Flag | Possible Values | Default Value | Behavior |
 | --- | --- | --- | --- |
-| BUILD_TESTS | ON, OFF | ON | Compile Googletests |
+| BUILD_TESTS | ON, OFF | ON | Compile Googletests |
+| BUILD_BENCH | ON, OFF | ON | Compile benchmarks |
 | RAFT_COMPILE_LIBRARIES | ON, OFF | OFF | Compiles all `libraft` shared libraries (these are required for Googletests) |
-| RAFT_COMPILE_NN_LIBRARY | ON, OFF | ON | Compiles the `libraft-nn` shared library |
-| RAFT_COMPILE_DIST_LIBRARY | ON, OFF | ON | Compiles the `libraft-distance` shared library |
+| RAFT_COMPILE_NN_LIBRARY | ON, OFF | ON | Compiles the `libraft-nn` shared library |
+| RAFT_COMPILE_DIST_LIBRARY | ON, OFF | ON | Compiles the `libraft-distance` shared library |
 | RAFT_ENABLE_NN_DEPENDENCIES | ON, OFF | OFF | Searches for dependencies of nearest neighbors API, such as FAISS, and compiles them if not found. |
-| RAFT_USE_FAISS_STATIC | ON, OFF | OFF | Statically link FAISS into `libraft-nn` |
+| RAFT_USE_FAISS_STATIC | ON, OFF | OFF | Statically link FAISS into `libraft-nn` |
 | DETECT_CONDA_ENV | ON, OFF | ON | Enable detection of conda environment for dependencies |
 | NVTX | ON, OFF | OFF | Enable NVTX Markers |
-| CUDA_ENABLE_KERNELINFO | ON, OFF | OFF | Enables `kernelinfo` in nvcc. This is useful for `compute-sanitizer` |
+| CUDA_ENABLE_KERNELINFO | ON, OFF | OFF | Enables `kernelinfo` in nvcc. This is useful for `compute-sanitizer` |
 | CUDA_ENABLE_LINEINFO | ON, OFF | OFF | Enable the -lineinfo option for nvcc |
 | CUDA_STATIC_RUNTIME | ON, OFF | OFF | Statically link the CUDA runtime |
 
@@ -115,8 +129,8 @@ python setup.py install
 ```
 
 To run the Python tests:
-```bash
-cd python
+```bash
+cd python
 python -m pytest raft
 ```
 
@@ -142,14 +156,14 @@ The following example shows how to use the `libraft-distance` API with the pre-c
 RAFT uses the [RAPIDS cmake](https://github.com/rapidsai/rapids-cmake) library, so it can be easily included into downstream projects. RAPIDS cmake provides a convenience layer around the [Cmake Package Manager (CPM)](https://github.com/cpm-cmake/CPM.cmake). The following example is similar to building RAFT itself from source but allows it to be done in cmake, providing the `raft::raft` link target and `RAFT_INCLUDE_DIR` for includes. The `COMPILE_LIBRARIES` option enables the building of the shared libraries.
 
-The following `cmake` snippet enables a flexible configuration of RAFT:
+The following `cmake` snippet enables a flexible configuration of RAFT:
 
 ```cmake
 
 set(RAFT_VERSION "22.04")
 
 function(find_and_configure_raft)
-    set(oneValueArgs VERSION FORK PINNED_TAG USE_FAISS_STATIC
+    set(oneValueArgs VERSION FORK PINNED_TAG USE_FAISS_STATIC
         COMPILE_LIBRARIES ENABLE_NN_DEPENDENCIES CLONE_ON_PIN USE_NN_LIBRARY USE_DISTANCE_LIBRARY)
 
     cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
                           "${multiValueArgs}" ${ARGN} )
 
@@ -165,14 +179,14 @@ function(find_and_configure_raft)
     endif()
 
     #-----------------------------------------------------
-    # Add components
+    # Add components
     #-----------------------------------------------------
     string(APPEND RAFT_COMPONENTS "")
     if(PKG_USE_NN_LIBRARY)
         string(APPEND RAFT_COMPONENTS " nn")
     endif()
-
+
     if(PKG_USE_DISTANCE_LIBRARY)
         string(APPEND RAFT_COMPONENTS " distance")
     endif()
 
@@ -221,4 +235,4 @@ find_and_configure_raft(VERSION ${RAFT_VERSION}.00
 
 ### Python/Cython Integration
 
-Once installed, RAFT's Python library can be imported and used directly.
\ No newline at end of file
+Once installed, RAFT's Python library can be imported and used directly.
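Both binaries produced above are plain googletest and google-benchmark executables, so the standard command-line options of those two frameworks can be used to select a subset of cases or export results. The filter patterns and output file name below are only illustrative, not names taken from the RAFT test or benchmark suites:

```bash
# Run only benchmarks whose names match a regex and write the results to a
# JSON file (standard google-benchmark flags).
./cpp/build/bench_raft --benchmark_filter=reduce --benchmark_out_format=json --benchmark_out=reduce.json

# Run only a subset of the googletest cases (standard googletest flag).
./cpp/build/test_raft --gtest_filter='*Reduce*'
```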
diff --git a/build.sh b/build.sh
index 9d3a796c65..eb5fa0a250 100755
--- a/build.sh
+++ b/build.sh
@@ -18,25 +18,26 @@ ARGS=$*
 # script, and that this script resides in the repo dir!
 REPODIR=$(cd $(dirname $0); pwd)
 
-VALIDARGS="clean libraft pyraft docs -v -g --noinstall --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --nogtest --buildfaiss"
+VALIDARGS="clean libraft pyraft docs tests bench -v -g --noinstall --compile-libs --compile-nn --compile-dist --allgpuarch --nvtx --show_depr_warn -h --buildfaiss"
 HELP="$0 [<target> ...] [<flag> ...]
  where <target> is:
    clean            - remove all existing build artifacts and configuration (start over)
    libraft          - build the raft C++ code only. Also builds the C-wrapper library around the C++ code.
-   pyraft           - build the cuml Python package
+   pyraft           - build the pyraft Python package
    docs             - build the documentation
+   tests            - build the tests
+   bench            - build the benchmarks
  and <flag> is:
    -v               - verbose build mode
    -g               - build for debug
-   --compile-libs   - compile shared libraries for all components
+   --compile-libs   - compile shared libraries for all components
    --compile-nn     - compile shared library for nn component
    --compile-dist   - compile shared library for distance component
    --allgpuarch     - build for all supported GPU architectures
    --buildfaiss     - build faiss statically into raft
-   --nogtest        - do not build google tests for libraft
-   --noinstall      - do not install cmake targets
+   --noinstall      - do not install cmake targets
    --nvtx           - Enable nvtx for profiling support
    --show_depr_warn - show cmake deprecation warnings
    -h               - print this text
@@ -53,12 +54,13 @@ BUILD_DIRS="${LIBRAFT_BUILD_DIR} ${PY_RAFT_BUILD_DIR} ${PYTHON_DEPS_CLONE}"
 CMAKE_LOG_LEVEL=""
 VERBOSE_FLAG=""
 BUILD_ALL_GPU_ARCH=0
-BUILD_TESTS=YES
+BUILD_TESTS=OFF
+BUILD_BENCH=OFF
 BUILD_STATIC_FAISS=OFF
 COMPILE_LIBRARIES=OFF
 COMPILE_NN_LIBRARY=OFF
 COMPILE_DIST_LIBRARY=OFF
-ENABLE_NN_DEPENDENCIES=${BUILD_TESTS}
+ENABLE_NN_DEPENDENCIES=OFF
 NVTX=OFF
 CLEAN=0
 DISABLE_DEPRECATION_WARNINGS=ON
@@ -110,11 +112,6 @@ fi
 if hasArg --allgpuarch; then
     BUILD_ALL_GPU_ARCH=1
 fi
-if hasArg --nogtest; then
-    BUILD_TESTS=OFF
-    COMPILE_LIBRARIES=OFF
-    ENABLE_NN_DEPENDENCIES=OFF
-fi
 
 if hasArg --compile-libs || (( ${NUMARGS} == 0 )); then
     COMPILE_LIBRARIES=ON
@@ -123,11 +120,24 @@ fi
 if hasArg --compile-nn || hasArg --compile-libs || (( ${NUMARGS} == 0 )); then
     ENABLE_NN_DEPENDENCIES=ON
     COMPILE_NN_LIBRARY=ON
-    CMAKE_TARGET="raft_nn_lib;${CMAKE_TARGET}"
+    CMAKE_TARGET="${CMAKE_TARGET};raft_nn_lib"
 fi
+
 if hasArg --compile-dist || hasArg --compile-libs || (( ${NUMARGS} == 0 )); then
     COMPILE_DIST_LIBRARY=ON
-    CMAKE_TARGET="raft_distance_lib;${CMAKE_TARGET}"
+    CMAKE_TARGET="${CMAKE_TARGET};raft_distance_lib"
+fi
+
+if hasArg tests || (( ${NUMARGS} == 0 )); then
+    BUILD_TESTS=ON
+    ENABLE_NN_DEPENDENCIES=ON
+    CMAKE_TARGET="${CMAKE_TARGET};test_raft"
+fi
+
+if hasArg bench || (( ${NUMARGS} == 0 )); then
+    BUILD_BENCH=ON
+    ENABLE_NN_DEPENDENCIES=ON
+    CMAKE_TARGET="${CMAKE_TARGET};bench_raft"
 fi
 
 if hasArg --buildfaiss; then
@@ -165,7 +175,7 @@ fi
 ################################################################################
 # Configure for building all C++ targets
-if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs; then
+if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || hasArg bench; then
     if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then
         RAFT_CMAKE_CUDA_ARCHITECTURES="NATIVE"
         echo "Building for the architecture of the GPU in the system..."
@@ -184,17 +194,13 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs; then
           -DNVTX=${NVTX} \
           -DDISABLE_DEPRECATION_WARNINGS=${DISABLE_DEPRECATION_WARNINGS} \
           -DBUILD_TESTS=${BUILD_TESTS} \
+          -DBUILD_BENCH=${BUILD_BENCH} \
          -DCMAKE_MESSAGE_LOG_LEVEL=${CMAKE_LOG_LEVEL} \
           -DRAFT_COMPILE_NN_LIBRARY=${COMPILE_NN_LIBRARY} \
           -DRAFT_COMPILE_DIST_LIBRARY=${COMPILE_DIST_LIBRARY} \
           -DRAFT_USE_FAISS_STATIC=${BUILD_STATIC_FAISS}
 
-    if (( ${NUMARGS} == 0 )) || hasArg libraft; then
-        # Run all c++ targets at once
-        if ! hasArg --nogtest; then
-            CMAKE_TARGET="${CMAKE_TARGET};test_raft;"
-        fi
-
+    if [[ ${CMAKE_TARGET} != "" ]] || [[ ${INSTALL_TARGET} != "" ]]; then
         echo "-- Compiling targets: ${CMAKE_TARGET}, verbose=${VERBOSE_FLAG}"
         cmake --build  "${LIBRAFT_BUILD_DIR}" ${VERBOSE_FLAG} -j${PARALLEL_LEVEL} --target ${CMAKE_TARGET} ${INSTALL_TARGET}
     fi
diff --git a/ci/checks/style.sh b/ci/checks/style.sh
index 2ce8b446b8..fb5a64fdac 100644
--- a/ci/checks/style.sh
+++ b/ci/checks/style.sh
@@ -43,6 +43,7 @@ fi
 
 # Check for a consistent #include syntax
 HASH_INCLUDE=`python cpp/scripts/include_checker.py \
+                     cpp/bench \
                      cpp/include \
                      cpp/test \
                      2>&1`
diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh
index afc6056b42..1affaef0b1 100644
--- a/ci/gpu/build.sh
+++ b/ci/gpu/build.sh
@@ -96,9 +96,9 @@ export LD_LIBRARY_PATH=$CONDA_PREFIX/lib:$LD_LIBRARY_PATH
 
 gpuci_logger "Build C++ and Python targets"
 # These should link against the existing shared libs
 if hasArg --skip-tests; then
-    "$WORKSPACE/build.sh" pyraft libraft -v --nogtest
-else
     "$WORKSPACE/build.sh" pyraft libraft -v
+else
+    "$WORKSPACE/build.sh" pyraft libraft tests bench -v
 fi
 
 gpuci_logger "sccache stats"
diff --git a/conda/recipes/libraft_distance/build.sh b/conda/recipes/libraft_distance/build.sh
index 7523263f01..062a5219db 100644
--- a/conda/recipes/libraft_distance/build.sh
+++ b/conda/recipes/libraft_distance/build.sh
@@ -1,3 +1,4 @@
 #!/usr/bin/env bash
+# Copyright (c) 2022, NVIDIA CORPORATION.
 
-./build.sh libraft -v --allgpuarch --compile-dist --nogtest
+./build.sh libraft -v --allgpuarch --compile-dist
diff --git a/conda/recipes/libraft_headers/build.sh b/conda/recipes/libraft_headers/build.sh
index ca6d9b4960..876f46cdfe 100644
--- a/conda/recipes/libraft_headers/build.sh
+++ b/conda/recipes/libraft_headers/build.sh
@@ -1,3 +1,4 @@
 #!/usr/bin/env bash
+# Copyright (c) 2022, NVIDIA CORPORATION.
 
-./build.sh libraft -v --allgpuarch --nogtest
+./build.sh libraft -v --allgpuarch
diff --git a/conda/recipes/libraft_nn/build.sh b/conda/recipes/libraft_nn/build.sh
index 5c60cd2fa1..4f6ffbca25 100644
--- a/conda/recipes/libraft_nn/build.sh
+++ b/conda/recipes/libraft_nn/build.sh
@@ -1,3 +1,4 @@
 #!/usr/bin/env bash
+# Copyright (c) 2022, NVIDIA CORPORATION.
 
-./build.sh libraft -v --allgpuarch --compile-nn --nogtest
+./build.sh libraft -v --allgpuarch --compile-nn
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index c13ee03a33..c68be5e619 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -41,6 +41,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 # - User Options  ------------------------------------------------------------
 
 option(BUILD_TESTS "Build raft unit-tests" ON)
+option(BUILD_BENCH "Build raft C++ benchmark tests" ON)
 option(CUDA_ENABLE_KERNELINFO "Enable kernel resource usage info" OFF)
 option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF)
 option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF)
@@ -58,6 +59,7 @@ include(CMakeDependentOption)
 cmake_dependent_option(RAFT_USE_FAISS_STATIC "Build and statically link the FAISS library for nearest neighbors search on GPU" ON RAFT_COMPILE_LIBRARIES OFF)
 
 message(VERBOSE "RAFT: Build RAFT unit-tests: ${BUILD_TESTS}")
+message(VERBOSE "RAFT: Building raft C++ benchmarks: ${BUILD_BENCH}")
 message(VERBOSE "RAFT: Enable detection of conda environment for dependencies: ${DETECT_CONDA_ENV}")
 message(VERBOSE "RAFT: Disable depreaction warnings " ${DISABLE_DEPRECATION_WARNINGS})
 message(VERBOSE "RAFT: Disable OpenMP: ${DISABLE_OPENMP}")
@@ -122,6 +124,10 @@ if(BUILD_TESTS)
   include(cmake/thirdparty/get_ucx.cmake)
 endif()
 
+if(BUILD_BENCH)
+  include(cmake/thirdparty/get_gbench.cmake)
+endif()
+
 ##############################################################################
 # - raft ---------------------------------------------------------------------
 
@@ -411,6 +417,13 @@ if(BUILD_TESTS)
   include(test/CMakeLists.txt)
 endif()
 
+##############################################################################
+# - build benchmark executable -----------------------------------------------
+
+if(BUILD_BENCH)
+  include(bench/CMakeLists.txt)
+endif()
+
 ##############################################################################
 # - doxygen targets ----------------------------------------------------------
diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt
new file mode 100644
index 0000000000..9f0a6096d9
--- /dev/null
+++ b/cpp/bench/CMakeLists.txt
@@ -0,0 +1,60 @@
+#=============================================================================
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+set(RAFT_CPP_BENCH_TARGET "bench_raft")
+
+# (please keep the filenames in alphabetical order)
+add_executable(${RAFT_CPP_BENCH_TARGET}
+  bench/linalg/reduce.cu
+  bench/main.cpp
+)
+
+set_target_properties(${RAFT_CPP_BENCH_TARGET}
+  PROPERTIES BUILD_RPATH                         "\$ORIGIN"
+             # set target compile options
+             CXX_STANDARD                        17
+             CXX_STANDARD_REQUIRED               ON
+             CUDA_STANDARD                       17
+             CUDA_STANDARD_REQUIRED              ON
+             POSITION_INDEPENDENT_CODE           ON
+             INTERFACE_POSITION_INDEPENDENT_CODE ON
+             INSTALL_RPATH                       "\$ORIGIN/../../../lib"
+)
+
+target_compile_options(${RAFT_CPP_BENCH_TARGET}
+  PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${RAFT_CXX_FLAGS}>"
+          "$<$<COMPILE_LANGUAGE:CUDA>:${RAFT_CUDA_FLAGS}>"
+)
+
+target_include_directories(${RAFT_CPP_BENCH_TARGET}
+  PUBLIC "$<BUILD_INTERFACE:${RAFT_SOURCE_DIR}/bench>"
+)
+
+target_link_libraries(${RAFT_CPP_BENCH_TARGET}
+  PRIVATE
+    raft::raft
+    faiss::faiss
+    benchmark::benchmark
+    $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
+    $<TARGET_NAME_IF_EXISTS:conda_env>
+)
+
+install(
+  TARGETS ${RAFT_CPP_BENCH_TARGET}
+  COMPONENT testing
+  DESTINATION bin/libraft/gbench
+  EXCLUDE_FROM_ALL
+)
diff --git a/cpp/bench/common/benchmark.hpp b/cpp/bench/common/benchmark.hpp
new file mode 100644
index 0000000000..93814ead44
--- /dev/null
+++ b/cpp/bench/common/benchmark.hpp
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <benchmark/benchmark.h>
+
+#include <raft/cudart_utils.h>
+#include <raft/interruptible.hpp>
+
+#include <rmm/cuda_stream.hpp>
+
+#include <rmm/device_buffer.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <memory>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <vector>
+
+namespace raft::bench {
+
+/**
+ * RAII way to temporarily set the pooling memory allocator in rmm.
+ * This may be useful for benchmarking functions that do some memory allocations.
+ */
+struct using_pool_memory_res {
+ private:
+  rmm::mr::device_memory_resource* orig_res_;
+  rmm::mr::cuda_memory_resource cuda_res_;
+  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_res_;
+
+ public:
+  using_pool_memory_res(size_t initial_size, size_t max_size)
+    : orig_res_(rmm::mr::get_current_device_resource()),
+      pool_res_(&cuda_res_, initial_size, max_size)
+  {
+    rmm::mr::set_current_device_resource(&pool_res_);
+  }
+
+  using_pool_memory_res() : using_pool_memory_res(size_t(1) << size_t(30), size_t(16) << size_t(30))
+  {
+  }
+
+  ~using_pool_memory_res() { rmm::mr::set_current_device_resource(orig_res_); }
+};
+
+/**
+ * RAII way of timing cuda calls. This has been shamelessly copied from the
+ * cudf codebase via the cuml codebase. So, credit for this class goes to cudf developers.
+ */
+struct cuda_event_timer {
+ private:
+  ::benchmark::State* state_;
+  rmm::cuda_stream_view stream_;
+  cudaEvent_t start_;
+  cudaEvent_t stop_;
+
+ public:
+  /**
+   * @param state the benchmark::State whose timer we are going to update.
+   * @param stream CUDA stream we are measuring time on.
+   */
+  cuda_event_timer(::benchmark::State& state, rmm::cuda_stream_view stream)
+    : state_(&state), stream_(stream)
+  {
+    RAFT_CUDA_TRY(cudaEventCreate(&start_));
+    RAFT_CUDA_TRY(cudaEventCreate(&stop_));
+    raft::interruptible::synchronize(stream_);
+    RAFT_CUDA_TRY(cudaEventRecord(start_, stream_));
+  }
+  cuda_event_timer() = delete;
+
+  /**
+   * @brief The dtor stops the timer and performs a synchronization. The time of
+   * the benchmark::State object provided to the ctor will be set to the
+   * value given by `cudaEventElapsedTime()`.
+   */
+  ~cuda_event_timer()
+  {
+    RAFT_CUDA_TRY_NO_THROW(cudaEventRecord(stop_, stream_));
+    raft::interruptible::synchronize(stop_);
+    float milliseconds = 0.0f;
+    RAFT_CUDA_TRY_NO_THROW(cudaEventElapsedTime(&milliseconds, start_, stop_));
+    state_->SetIterationTime(milliseconds / 1000.f);
+    RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(start_));
+    RAFT_CUDA_TRY_NO_THROW(cudaEventDestroy(stop_));
+  }
+};
+
+/** Main fixture to be inherited and used by all other c++ benchmarks */
+class fixture {
+ private:
+  rmm::cuda_stream stream_owner_{};
+  rmm::device_buffer scratch_buf_;
+
+ public:
+  rmm::cuda_stream_view stream;
+
+  fixture() : stream{stream_owner_.view()}
+  {
+    int l2_cache_size = 0;
+    int device_id     = 0;
+    RAFT_CUDA_TRY(cudaGetDevice(&device_id));
+    RAFT_CUDA_TRY(cudaDeviceGetAttribute(&l2_cache_size, cudaDevAttrL2CacheSize, device_id));
+    scratch_buf_ = rmm::device_buffer(l2_cache_size, stream);
+  }
+
+  // every benchmark should override this
+  virtual void run_benchmark(::benchmark::State& state) = 0;
+  virtual void generate_metrics(::benchmark::State& state) {}
+
+  /**
+   * The helper to be used inside `run_benchmark`, to loop over the state and record time using the
+   * cuda_event_timer.
+   */
+  template <typename Lambda>
+  void loop_on_state(::benchmark::State& state, Lambda benchmark_func, bool flush_L2 = true)
+  {
+    for (auto _ : state) {
+      if (flush_L2) {
+        RAFT_CUDA_TRY(cudaMemsetAsync(scratch_buf_.data(), 0, scratch_buf_.size(), stream));
+      }
+      cuda_event_timer timer(state, stream);
+      benchmark_func();
+    }
+  }
+};
+
+namespace internal {
+
+template <typename Class, typename... Params>
+class Fixture : public ::benchmark::Fixture {
+  using State = ::benchmark::State;
+
+ public:
+  explicit Fixture(const std::string name, const Params&... params)
+    : ::benchmark::Fixture(), params_(params...)
+  {
+    SetName(name.c_str());
+  }
+  Fixture() = delete;
+
+  void SetUp(const State& state) override
+  {
+    fixture_ =
+      std::apply([](const Params&... ps) { return std::make_unique<Class>(ps...); }, params_);
+  }
+  void TearDown(const State& state) override { fixture_.reset(); }
+  void SetUp(State& st) override { SetUp(const_cast<const State&>(st)); }
+  void TearDown(State& st) override { TearDown(const_cast<const State&>(st)); }
+
+ private:
+  std::unique_ptr<Class> fixture_;
+  std::tuple<Params...> params_;
+
+ protected:
+  void BenchmarkCase(State& state) override
+  {
+    fixture_->run_benchmark(state);
+    fixture_->generate_metrics(state);
+  }
+};  // class Fixture
+
+/**
+ * A helper struct to create a fixture for every combination of input vectors.
+ * Use with care, this can blow up quickly!
+ */
+template <typename Class, typename... Params>
+struct cartesian_registrar {
+  template <typename... Fixed>
+  static void run(const std::string case_name,
+                  const std::vector<Params>&... params,
+                  const Fixed&... fixed);
+};
+
+template <typename Class>
+struct cartesian_registrar<Class> {
+  template <typename... Fixed>
+  static void run(const std::string case_name, const Fixed&... fixed)
+  {
+    auto* b = ::benchmark::internal::RegisterBenchmarkInternal(
+      new Fixture<Class, Fixed...>(case_name, fixed...));
+    b->UseManualTime();
+    b->Unit(benchmark::kMillisecond);
+  }
+};
+
+template <typename Class, typename Param, typename... Params>
+struct cartesian_registrar<Class, Param, Params...> {
+  template <typename... Fixed>
+  static void run(const std::string case_name,
+                  const std::vector<Param>& param,
+                  const std::vector<Params>&... params,
+                  const Fixed&... fixed)
+  {
+    int param_len = param.size();
+    for (int i = 0; i < param_len; i++) {
+      cartesian_registrar<Class, Params...>::run(
+        case_name + "/" + std::to_string(i), params..., fixed..., param[i]);
+    }
+  }
+};
+
+template <typename Class>
+struct registrar {
+  /**
+   * Register a fixture `Class` named `test_class` for every combination of input `params`.
+   *
+   * @param test_class
+   *   A string representation of the `Class` name.
+   * @param test_name
+   *   Optional test name. Leave empty, if you don't need it.
+   * @param params
+   *   Zero or more vectors of parameters.
+   *   The generated test cases are a cartesian product of these vectors.
+   *   Use with care, this can blow up quickly!
+   */
+  template <typename... Params>
+  registrar(const std::string& test_class,
+            const std::string& test_name,
+            const std::vector<Params>&... params)
+  {
+    std::stringstream name_stream;
+    name_stream << test_class;
+    if (!test_name.empty()) { name_stream << "/" << test_name; }
+    cartesian_registrar<Class, Params...>::run(name_stream.str(), params...);
+  }
+};
+
+};  // namespace internal
+
+/**
+ * This is the entry point macro for all benchmarks. This needs to be called
+ * for the set of benchmarks to be registered so that the main harness inside
+ * google bench can find these benchmarks and run them.
+ *
+ * @param TestClass child class of `raft::bench::fixture` which contains
+ *                  the logic to generate the dataset and run training on it
+ *                  for a given algo. Ideally, one such struct is needed for
+ *                  every algo to be benchmarked
+ * @param test_name a unique string to identify these tests at the end of the run.
+ *                  This is optional and if you choose not to use this, pass an
+ *                  empty string
+ * @param params... zero or more lists of params upon which to benchmark.
+ */
+#define RAFT_BENCH_REGISTER(TestClass, ...)                                              \
+  static raft::bench::internal::registrar<TestClass> BENCHMARK_PRIVATE_NAME(registrar)( \
+    #TestClass, __VA_ARGS__)
+
+}  // namespace raft::bench
diff --git a/cpp/bench/linalg/reduce.cu b/cpp/bench/linalg/reduce.cu
new file mode 100644
index 0000000000..018086a689
--- /dev/null
+++ b/cpp/bench/linalg/reduce.cu
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <common/benchmark.hpp>
+#include <raft/linalg/reduce.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+namespace raft::bench::linalg {
+
+struct input_size {
+  int rows, cols;
+  bool along_rows;
+};
+
+template <typename T>
+struct reduce : public fixture {
+  reduce(bool along_rows, const input_size& p)
+    : input_size(p), along_rows(along_rows), in(p.rows * p.cols, stream), out(p.rows, stream)
+  {
+  }
+
+  void run_benchmark(::benchmark::State& state) override
+  {
+    loop_on_state(state, [this]() {
+      raft::linalg::reduce(
+        out.data(), in.data(), input_size.cols, input_size.rows, T(0.f), true, along_rows, stream);
+    });
+  }
+
+ private:
+  bool along_rows;
+  input_size input_size;
+  rmm::device_uvector<T> in, out;
+};  // struct reduce
+
+const std::vector<input_size> kInputSizes{{8 * 1024, 1024},
+                                          {1024, 8 * 1024},
+                                          {8 * 1024, 8 * 1024},
+                                          {32 * 1024, 1024},
+                                          {1024, 32 * 1024},
+                                          {32 * 1024, 32 * 1024}};
+
+const std::vector<bool> kAlongRows{false, true};
+
+RAFT_BENCH_REGISTER(reduce<float>, "", kAlongRows, kInputSizes);
+RAFT_BENCH_REGISTER(reduce<double>, "", kAlongRows, kInputSizes);
+
+}  // namespace raft::bench::linalg
diff --git a/cpp/bench/main.cpp b/cpp/bench/main.cpp
new file mode 100644
index 0000000000..3162422e8e
--- /dev/null
+++ b/cpp/bench/main.cpp
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmark/benchmark.h>  // NOLINT
+
+BENCHMARK_MAIN();
diff --git a/cpp/cmake/thirdparty/get_gbench.cmake b/cpp/cmake/thirdparty/get_gbench.cmake
new file mode 100644
index 0000000000..a3d5678f74
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_gbench.cmake
@@ -0,0 +1,43 @@
+#=============================================================================
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+function(find_and_configure_gbench)
+
+  set(oneValueArgs VERSION PINNED_TAG)
+  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+
+  rapids_cpm_find(benchmark ${PKG_VERSION}
+    GLOBAL_TARGETS benchmark::benchmark
+    CPM_ARGS
+      GIT_REPOSITORY https://github.com/google/benchmark.git
+      GIT_TAG ${PKG_PINNED_TAG}
+      OPTIONS
+        "BENCHMARK_ENABLE_GTEST_TESTS OFF"
+        "BENCHMARK_ENABLE_TESTING OFF"
+        "BENCHMARK_ENABLE_INSTALL OFF"
+        "CMAKE_BUILD_TYPE Release"
+        "CMAKE_INSTALL_LIBDIR lib"
+  )
+
+  if(NOT TARGET benchmark::benchmark)
+    add_library(benchmark::benchmark ALIAS benchmark)
+  endif()
+
+endfunction()
+
+find_and_configure_gbench(VERSION    1.5.3
+                          PINNED_TAG c05843a9f622db08ad59804c190f98879b76beba)
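For reference, adding a further benchmark on top of this change amounts to creating a translation unit under `cpp/bench/`, deriving from `raft::bench::fixture`, and registering the class with `RAFT_BENCH_REGISTER`, exactly as `bench/linalg/reduce.cu` does above. The sketch below only illustrates that pattern; the file name, the `device_copy` struct, and its parameter values are hypothetical, and only the fixture API and the macro come from `common/benchmark.hpp` in this change:

```cpp
// cpp/bench/common/copy.cu -- hypothetical example, not part of this change
#include <common/benchmark.hpp>
#include <raft/cudart_utils.h>

#include <rmm/device_uvector.hpp>

namespace raft::bench {

struct copy_params {
  int len;  // number of elements to copy
};

template <typename T>
struct device_copy : public fixture {
  explicit device_copy(const copy_params& p) : params(p), src(p.len, stream), dst(p.len, stream) {}

  void run_benchmark(::benchmark::State& state) override
  {
    // Each invocation of the lambda is timed by cuda_event_timer inside loop_on_state.
    loop_on_state(state, [this]() {
      RAFT_CUDA_TRY(cudaMemcpyAsync(
        dst.data(), src.data(), params.len * sizeof(T), cudaMemcpyDeviceToDevice, stream));
    });
  }

 private:
  copy_params params;
  rmm::device_uvector<T> src, dst;
};

const std::vector<copy_params> kCopySizes{{1 << 20}, {1 << 24}};

RAFT_BENCH_REGISTER(device_copy<float>, "", kCopySizes);

}  // namespace raft::bench
```

Any such new file would also have to be added to the `add_executable()` list in `cpp/bench/CMakeLists.txt` so that it is compiled into `bench_raft`.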