From 8e0759a17ecd8c8d959c5f820ec9828bf7d8944f Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Wed, 2 Oct 2019 11:50:10 -0700
Subject: [PATCH 1/7] detect gpu arch in cmake

---
 cpp/CMakeLists.txt           | 29 +++++++++++++++--
 cpp/cmake/EvalGpuArchs.cmake | 62 ++++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+), 3 deletions(-)
 create mode 100644 cpp/cmake/EvalGpuArchs.cmake
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 1543d391cf2..f3be5c19bd5 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -61,9 +61,32 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     endif(CMAKE_CXX11_ABI)
 endif(CMAKE_COMPILER_IS_GNUCXX)
 
-#set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61")
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60")
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70")
+# Auto-detect available GPU compute architectures
+set(GPU_ARCHS "" CACHE STRING
+  "List of GPU architectures (semicolon-separated) to be compiled for. Pass 'ALL' if you want to compile for all supported GPU architectures. Empty string means to auto-detect the GPUs on the current system")
+
+if("${GPU_ARCHS}" STREQUAL "")
+  include(cmake/EvalGpuArchs.cmake)
+  evaluate_gpu_archs(GPU_ARCHS)
+endif()
+
+# "ALL", or an empty detection result (which would break list(GET) below), selects every supported arch
+if(("${GPU_ARCHS}" STREQUAL "ALL") OR ("${GPU_ARCHS}" STREQUAL ""))
+  set(GPU_ARCHS "60")
+  if((CUDA_VERSION_MAJOR EQUAL 9) OR (CUDA_VERSION_MAJOR GREATER 9))
+    list(APPEND GPU_ARCHS "70")
+  endif()
+  if((CUDA_VERSION_MAJOR EQUAL 10) OR (CUDA_VERSION_MAJOR GREATER 10))
+    list(APPEND GPU_ARCHS "75")
+  endif()
+endif()
+message("GPU_ARCHS = ${GPU_ARCHS}")
+foreach(arch ${GPU_ARCHS})
+  string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${arch},code=sm_${arch}")
+endforeach()
+# Also embed PTX for the last listed architecture (forward compatibility via JIT on newer GPUs)
+list(GET GPU_ARCHS -1 ptx)
+string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${ptx},code=compute_${ptx}")
 
 set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed-constexpr")
 
diff --git a/cpp/cmake/EvalGpuArchs.cmake b/cpp/cmake/EvalGpuArchs.cmake
new file mode 100644
index 00000000000..740987e4785
--- /dev/null
+++ b/cpp/cmake/EvalGpuArchs.cmake
@@ -0,0 +1,62 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Compile and run a tiny CUDA probe with CMAKE_CUDA_COMPILER to find the GPUs present.
+# Sets `gpu_archs` in the caller's scope to an ascending ';'-list (e.g. "60;70") or "ALL".
+function(evaluate_gpu_archs gpu_archs)
+  set(eval_file ${PROJECT_BINARY_DIR}/eval_gpu_archs.cu)
+  set(eval_exe ${PROJECT_BINARY_DIR}/eval_gpu_archs)
+  set(error_file ${PROJECT_BINARY_DIR}/eval_gpu_archs.stderr.log)
+  file(WRITE ${eval_file}
+    "
+#include <cstdio>
+#include <set>
+int main(int argc, char** argv) {
+  std::set<int> archs;  // numeric order, so e.g. 100 sorts after 75
+  int nDevices = 0;
+  if((cudaGetDeviceCount(&nDevices) == cudaSuccess) && (nDevices > 0)) {
+    for(int dev=0;dev<nDevices;++dev) {
+      cudaDeviceProp prop;
+      if(cudaGetDeviceProperties(&prop, dev) != cudaSuccess) continue;
+      archs.insert(prop.major * 10 + prop.minor);
+    }
+  }
+  if(archs.empty()) {
+    printf(\"ALL\");
+  } else {
+    bool first = true;
+    for(const auto& arch : archs) {
+      printf(first? \"%d\" : \";%d\", arch);
+      first = false;
+    }
+  }
+  printf(\"\\n\");
+  return 0;
+}
+")
+  execute_process(
+    COMMAND ${CMAKE_CUDA_COMPILER} -std=c++11 -o ${eval_exe} --run ${eval_file}
+    RESULT_VARIABLE __gpu_result
+    OUTPUT_VARIABLE __gpu_archs
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    ERROR_FILE ${error_file})
+  # A failed probe (e.g. nvcc could not build or run it) must not yield an empty list.
+  if((NOT __gpu_result EQUAL 0) OR ("${__gpu_archs}" STREQUAL ""))
+    message(WARNING "GPU arch detection failed (see ${error_file}); using ALL")
+    set(__gpu_archs "ALL")
+  endif()
+  message("Auto detection of gpu-archs: ${__gpu_archs}")
+  set(${gpu_archs} ${__gpu_archs} PARENT_SCOPE)
+endfunction()

From 61ee753395905436458e196555569a8a48705b2a Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Mon, 14 Oct 2019 17:45:36 -0700
Subject: [PATCH 2/7] default GPU_ARCHS to ALL

---
 cpp/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f3be5c19bd5..1a0356f12f2 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -62,7 +62,7 @@ if(CMAKE_COMPILER_IS_GNUCXX)
 endif(CMAKE_COMPILER_IS_GNUCXX)
 
 # Auto-detect available GPU compute architectures
-set(GPU_ARCHS "" CACHE STRING
+set(GPU_ARCHS "ALL" CACHE STRING
   "List of GPU architectures (semicolon-separated) to be compiled for. Pass 'ALL' if you want to compile for all supported GPU architectures. Empty string means to auto-detect the GPUs on the current system")
 
 if("${GPU_ARCHS}" STREQUAL "")

From 3cc2315f607cbdddb95506130f108d29ffabe284 Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Mon, 14 Oct 2019 17:47:00 -0700
Subject: [PATCH 3/7] changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a5f71f1dcad..17fb35ad1b9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,6 +17,7 @@
 - PR #3027 Move copying.hpp and related source to legacy folder
 - PR #3014 Snappy decompression optimizations
 - PR #2996 IO Readers: Replace `cuio::device_buffer` with `rmm::device_buffer`
+- PR #2955 Add cmake option to build only for the GPU architectures present on the build machine
 
 ## Bug Fixes
 

From abd11b9f078b627134a826be48a44644f173bb7c Mon Sep 17 00:00:00 2001
From: Olivier Lapicque <olapicque@nvidia.com>
Date: Wed, 16 Oct 2019 12:31:32 -0700
Subject: [PATCH 4/7] Fix warning with 64K smem due to gpuInitDictionaryIndices
 launch bounds

---
 cpp/src/io/orc/dict_enc.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu
index eb3bc269ea7..5e7cdda4d7e 100644
--- a/cpp/src/io/orc/dict_enc.cu
+++ b/cpp/src/io/orc/dict_enc.cu
@@ -189,7 +189,7 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, int t)
  *
  **/
 // blockDim {512,1,1}
-extern "C" __global__ void __launch_bounds__(512, 3)
+extern "C" __global__ void __launch_bounds__(512, 2)
 gpuInitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns)
 {
     __shared__ __align__(16) dictinit_state_s state_g;

From 9300692a82b961a3c901af7167c9821982724466 Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Wed, 16 Oct 2019 18:29:51 -0700
Subject: [PATCH 5/7] include <random> header in quantiles benchmark

---
 cpp/benchmarks/quantiles/group_quantiles_benchmark.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/benchmarks/quantiles/group_quantiles_benchmark.cu b/cpp/benchmarks/quantiles/group_quantiles_benchmark.cu
index cc1954829ee..5556568354b 100644
--- a/cpp/benchmarks/quantiles/group_quantiles_benchmark.cu
+++ b/cpp/benchmarks/quantiles/group_quantiles_benchmark.cu
@@ -19,6 +19,7 @@
 #include <cudf/quantiles.hpp>
 
 #include <benchmark/benchmark.h>
+#include <random>
 
 #include "../fixture/benchmark_fixture.hpp"
 #include "../synchronization/synchronization.hpp"

From 7ca592c9253b26334abdf09aa452bd87d7ff3a30 Mon Sep 17 00:00:00 2001
From: ptaylor <paul.e.taylor@me.com>
Date: Thu, 17 Oct 2019 11:03:33 -0700
Subject: [PATCH 6/7] remove second launch parameter from scatter_kernel

---
 cpp/src/stream_compaction/copy_if.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/stream_compaction/copy_if.cuh b/cpp/src/stream_compaction/copy_if.cuh
index 628353700df..c46213eeb6f 100644
--- a/cpp/src/stream_compaction/copy_if.cuh
+++ b/cpp/src/stream_compaction/copy_if.cuh
@@ -91,7 +91,7 @@ __device__ cudf::size_type block_scan_mask(bool mask_true,
 // Note: `filter` is not run on indices larger than the input column size
 template <typename T, typename Filter, 
           int block_size, bool has_validity>
-__launch_bounds__(block_size, 2048/block_size)
+__launch_bounds__(block_size)
 __global__ void scatter_kernel(T* __restrict__ output_data,
                                bit_mask_t * __restrict__ output_valid,
                                cudf::size_type * output_null_count,

From 8513cfccae02e446dc13fb349950c0872bd182a1 Mon Sep 17 00:00:00 2001
From: Paul Taylor <paul.e.taylor@me.com>
Date: Fri, 18 Oct 2019 11:28:28 -0700
Subject: [PATCH 7/7] revert change to copy_if.cuh

---
 cpp/src/stream_compaction/copy_if.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/stream_compaction/copy_if.cuh b/cpp/src/stream_compaction/copy_if.cuh
index c46213eeb6f..628353700df 100644
--- a/cpp/src/stream_compaction/copy_if.cuh
+++ b/cpp/src/stream_compaction/copy_if.cuh
@@ -91,7 +91,7 @@ __device__ cudf::size_type block_scan_mask(bool mask_true,
 // Note: `filter` is not run on indices larger than the input column size
 template <typename T, typename Filter, 
           int block_size, bool has_validity>
-__launch_bounds__(block_size)
+__launch_bounds__(block_size, 2048/block_size)
 __global__ void scatter_kernel(T* __restrict__ output_data,
                                bit_mask_t * __restrict__ output_valid,
                                cudf::size_type * output_null_count,