diff --git a/CHANGELOG.md b/CHANGELOG.md
index b811eccf35e..c136bc59225 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@
 - PR #3032 Use `asarray` to coerce indices to a NumPy array
 - PR #2996 IO Readers: Replace `cuio::device_buffer` with `rmm::device_buffer`
 - PR #3029 Update gdf_ numeric types with stdint and move to cudf namespace
+- PR #2955 Add cmake option to only build for present GPU architecture
 - PR #3070 Move functions.h and related source to legacy
 
 ## Bug Fixes
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 4ed7280ce40..65f53cf5966 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -61,9 +61,44 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     endif(CMAKE_CXX11_ABI)
 endif(CMAKE_COMPILER_IS_GNUCXX)
 
-#set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61")
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60")
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70")
+# GPU_ARCHS selects which GPU compute architectures device code is built for:
+#   "ALL"       - every architecture enabled below for the detected CUDA major version
+#   ""          - auto-detect the GPUs present on the configuring machine
+#   "60;70;..." - an explicit semicolon-separated list
+set(GPU_ARCHS "ALL" CACHE STRING
+  "List of GPU architectures (semicolon-separated) to be compiled for. Pass 'ALL' if you want to compile for all supported GPU architectures. Empty string means to auto-detect the GPUs on the current system")
+
+if("${GPU_ARCHS}" STREQUAL "")
+  include(cmake/EvalGpuArchs.cmake)
+  evaluate_gpu_archs(GPU_ARCHS)
+  # If the probe could not be built or run, the result is empty; fall back to
+  # "ALL" so that list(GET ...) below is never handed an empty list.
+  if("${GPU_ARCHS}" STREQUAL "")
+    message(WARNING "GPU architecture auto-detection returned nothing; building for ALL supported architectures")
+    set(GPU_ARCHS "ALL")
+  endif()
+endif()
+
+# "ALL" is expanded after auto-detection because the detector may itself report "ALL".
+if("${GPU_ARCHS}" STREQUAL "ALL")
+  set(GPU_ARCHS "60")
+  if(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
+    list(APPEND GPU_ARCHS "70")
+  endif()
+  if(CUDA_VERSION_MAJOR GREATER_EQUAL 10)
+    list(APPEND GPU_ARCHS "75")
+  endif()
+endif()
+message(STATUS "GPU_ARCHS = ${GPU_ARCHS}")
+
+# Emit device code (code=sm_XX) for every requested architecture.
+foreach(arch ${GPU_ARCHS})
+  string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${arch},code=sm_${arch}")
+endforeach()
+
+# Additionally embed PTX (code=compute_XX) for the last architecture in the list.
+list(GET GPU_ARCHS -1 ptx)
+string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${ptx},code=compute_${ptx}")
 
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed-constexpr")
 
diff --git a/cpp/benchmarks/quantiles/group_quantiles_benchmark.cu b/cpp/benchmarks/quantiles/group_quantiles_benchmark.cu
index cc1954829ee..5556568354b 100644
--- a/cpp/benchmarks/quantiles/group_quantiles_benchmark.cu
+++ b/cpp/benchmarks/quantiles/group_quantiles_benchmark.cu
@@ -19,6 +19,7 @@
 #include <cudf/quantiles.hpp>
 
 #include <benchmark/benchmark.h>
+#include <random>
 
 #include "../fixture/benchmark_fixture.hpp"
 #include "../synchronization/synchronization.hpp"
diff --git a/cpp/cmake/EvalGpuArchs.cmake b/cpp/cmake/EvalGpuArchs.cmake
new file mode 100644
index 00000000000..740987e4785
--- /dev/null
+++ b/cpp/cmake/EvalGpuArchs.cmake
@@ -0,0 +1,79 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# evaluate_gpu_archs(<out_var>)
+#
+# Builds and runs a small CUDA program with ${CMAKE_CUDA_COMPILER} that queries
+# the compute capability of every GPU visible on this machine, then stores the
+# de-duplicated, semicolon-separated result (e.g. "60;70") in <out_var> in the
+# caller's scope.
+#
+# <out_var> is set to "ALL" when no GPU is found, and also when the probe cannot
+# be compiled or run. Anything the compiler or probe writes to stderr is saved
+# in ${PROJECT_BINARY_DIR}/eval_gpu_archs.stderr.log.
+function(evaluate_gpu_archs gpu_archs)
+  set(eval_file "${PROJECT_BINARY_DIR}/eval_gpu_archs.cu")
+  set(eval_exe "${PROJECT_BINARY_DIR}/eval_gpu_archs")
+  set(error_file "${PROJECT_BINARY_DIR}/eval_gpu_archs.stderr.log")
+  file(WRITE "${eval_file}"
+    "
+#include <cstdio>
+#include <set>
+#include <string>
+using namespace std;
+int main(int argc, char** argv) {
+  set<string> archs;
+  int nDevices;
+  if((cudaGetDeviceCount(&nDevices) == cudaSuccess) && (nDevices > 0)) {
+    for(int dev=0;dev<nDevices;++dev) {
+      char buff[32];
+      cudaDeviceProp prop;
+      if(cudaGetDeviceProperties(&prop, dev) != cudaSuccess) continue;
+      sprintf(buff, \"%d%d\", prop.major, prop.minor);
+      archs.insert(buff);
+    }
+  }
+  if(archs.empty()) {
+    printf(\"ALL\");
+  } else {
+    bool first = true;
+    for(const auto& arch : archs) {
+      printf(first? \"%s\" : \";%s\", arch.c_str());
+      first = false;
+    }
+  }
+  printf(\"\\n\");
+  return 0;
+}
+")
+  execute_process(
+    COMMAND "${CMAKE_CUDA_COMPILER}"
+      -std=c++11
+      -o "${eval_exe}"
+      --run
+      "${eval_file}"
+    RESULT_VARIABLE __gpu_archs_result
+    OUTPUT_VARIABLE __gpu_archs
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    ERROR_FILE "${error_file}")
+  # A failed compile/run leaves the output empty or unreliable; report "ALL"
+  # instead of handing an empty architecture list back to the caller.
+  if(NOT __gpu_archs_result EQUAL 0 OR "${__gpu_archs}" STREQUAL "")
+    message(WARNING "Auto detection of gpu-archs failed (see ${error_file}); falling back to ALL")
+    set(__gpu_archs "ALL")
+  endif()
+  message(STATUS "Auto detection of gpu-archs: ${__gpu_archs}")
+  set(${gpu_archs} "${__gpu_archs}" PARENT_SCOPE)
+endfunction()
diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu
index eb3bc269ea7..5e7cdda4d7e 100644
--- a/cpp/src/io/orc/dict_enc.cu
+++ b/cpp/src/io/orc/dict_enc.cu
@@ -189,7 +189,7 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, int t)
  *
  **/
 // blockDim {512,1,1}
-extern "C" __global__ void __launch_bounds__(512, 3)
+extern "C" __global__ void __launch_bounds__(512, 2)
 gpuInitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns)
 {
     __shared__ __align__(16) dictinit_state_s state_g;