From 8e0759a17ecd8c8d959c5f820ec9828bf7d8944f Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 2 Oct 2019 11:50:10 -0700 Subject: [PATCH 1/7] detect gpu arch in cmake --- cpp/CMakeLists.txt | 29 +++++++++++++++-- cpp/cmake/EvalGpuArchs.cmake | 62 ++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 3 deletions(-) create mode 100644 cpp/cmake/EvalGpuArchs.cmake diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1543d391cf2..f3be5c19bd5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -61,9 +61,32 @@ if(CMAKE_COMPILER_IS_GNUCXX) endif(CMAKE_CXX11_ABI) endif(CMAKE_COMPILER_IS_GNUCXX) -#set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_61,code=sm_61") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_60,code=sm_60") -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_70,code=compute_70") +# Auto-detect available GPU compute architectures +set(GPU_ARCHS "" CACHE STRING + "List of GPU architectures (semicolon-separated) to be compiled for. Pass 'ALL' if you want to compile for all supported GPU architectures. Empty string means to auto-detect the GPUs on the current system") + +if("${GPU_ARCHS}" STREQUAL "") + include(cmake/EvalGpuArchs.cmake) + evaluate_gpu_archs(GPU_ARCHS) +endif() + +if("${GPU_ARCHS}" STREQUAL "ALL") + set(GPU_ARCHS "60") + if((CUDA_VERSION_MAJOR EQUAL 9) OR (CUDA_VERSION_MAJOR GREATER 9)) + set(GPU_ARCHS "${GPU_ARCHS};70") + endif() + if((CUDA_VERSION_MAJOR EQUAL 10) OR (CUDA_VERSION_MAJOR GREATER 10)) + set(GPU_ARCHS "${GPU_ARCHS};75") + endif() +endif() +message("GPU_ARCHS = ${GPU_ARCHS}") + +foreach(arch ${GPU_ARCHS}) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${arch},code=sm_${arch}") +endforeach() + +list(GET GPU_ARCHS -1 ptx) +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode arch=compute_${ptx},code=compute_${ptx}") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed-constexpr") diff --git a/cpp/cmake/EvalGpuArchs.cmake b/cpp/cmake/EvalGpuArchs.cmake new file mode 100644 index 00000000000..740987e4785 --- /dev/null +++ b/cpp/cmake/EvalGpuArchs.cmake @@ -0,0 +1,62 @@ +# Copyright (c) 2019, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +function(evaluate_gpu_archs gpu_archs) + set(eval_file ${PROJECT_BINARY_DIR}/eval_gpu_archs.cu) + set(eval_exe ${PROJECT_BINARY_DIR}/eval_gpu_archs) + set(error_file ${PROJECT_BINARY_DIR}/eval_gpu_archs.stderr.log) + file(WRITE ${eval_file} + " +#include +#include +#include +using namespace std; +int main(int argc, char** argv) { + set archs; + int nDevices; + if((cudaGetDeviceCount(&nDevices) == cudaSuccess) && (nDevices > 0)) { + for(int dev=0;dev Date: Mon, 14 Oct 2019 17:45:36 -0700 Subject: [PATCH 2/7] default GPU_ARCHS to ALL --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f3be5c19bd5..1a0356f12f2 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -62,7 +62,7 @@ if(CMAKE_COMPILER_IS_GNUCXX) endif(CMAKE_COMPILER_IS_GNUCXX) # Auto-detect available GPU compute architectures -set(GPU_ARCHS "" CACHE STRING +set(GPU_ARCHS "ALL" CACHE STRING "List of GPU architectures (semicolon-separated) to be compiled for. Pass 'ALL' if you want to compile for all supported GPU architectures. Empty string means to auto-detect the GPUs on the current system") if("${GPU_ARCHS}" STREQUAL "") From 3cc2315f607cbdddb95506130f108d29ffabe284 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Mon, 14 Oct 2019 17:47:00 -0700 Subject: [PATCH 3/7] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a5f71f1dcad..17fb35ad1b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ - PR #3027 Move copying.hpp and related source to legacy folder - PR #3014 Snappy decompression optimizations - PR #2996 IO Readers: Replace `cuio::device_buffer` with `rmm::device_buffer` +- PR #2955 Add cmake option to only build for present GPU architecture ## Bug Fixes From abd11b9f078b627134a826be48a44644f173bb7c Mon Sep 17 00:00:00 2001 From: Olivier Lapicque Date: Wed, 16 Oct 2019 12:31:32 -0700 Subject: [PATCH 4/7] Fix warning with 64K smem due to gpuInitDictionaryIndices launch bounds --- cpp/src/io/orc/dict_enc.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index eb3bc269ea7..5e7cdda4d7e 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -189,7 +189,7 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, int t) * **/ // blockDim {512,1,1} -extern "C" __global__ void __launch_bounds__(512, 3) +extern "C" __global__ void __launch_bounds__(512, 2) gpuInitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns) { __shared__ __align__(16) dictinit_state_s state_g; From 9300692a82b961a3c901af7167c9821982724466 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Wed, 16 Oct 2019 18:29:51 -0700 Subject: [PATCH 5/7] include header in quantiles benchmark --- cpp/benchmarks/quantiles/group_quantiles_benchmark.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/benchmarks/quantiles/group_quantiles_benchmark.cu b/cpp/benchmarks/quantiles/group_quantiles_benchmark.cu index cc1954829ee..5556568354b 100644 --- a/cpp/benchmarks/quantiles/group_quantiles_benchmark.cu +++ b/cpp/benchmarks/quantiles/group_quantiles_benchmark.cu @@ -19,6 +19,7 @@ #include #include +#include #include "../fixture/benchmark_fixture.hpp" #include "../synchronization/synchronization.hpp" From 7ca592c9253b26334abdf09aa452bd87d7ff3a30 Mon Sep 17 00:00:00 2001 From: ptaylor Date: Thu, 17 Oct 2019 11:03:33 -0700 Subject: [PATCH 6/7] remove second launch parameter from scatter_kernel --- cpp/src/stream_compaction/copy_if.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/stream_compaction/copy_if.cuh b/cpp/src/stream_compaction/copy_if.cuh index 628353700df..c46213eeb6f 100644 --- a/cpp/src/stream_compaction/copy_if.cuh +++ b/cpp/src/stream_compaction/copy_if.cuh @@ -91,7 +91,7 @@ __device__ cudf::size_type block_scan_mask(bool mask_true, // Note: `filter` is not run on indices larger than the input column size template -__launch_bounds__(block_size, 2048/block_size) +__launch_bounds__(block_size) __global__ void scatter_kernel(T* __restrict__ output_data, bit_mask_t * __restrict__ output_valid, cudf::size_type * output_null_count, From 8513cfccae02e446dc13fb349950c0872bd182a1 Mon Sep 17 00:00:00 2001 From: Paul Taylor Date: Fri, 18 Oct 2019 11:28:28 -0700 Subject: [PATCH 7/7] revert change to copy_if.cuh --- cpp/src/stream_compaction/copy_if.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/stream_compaction/copy_if.cuh b/cpp/src/stream_compaction/copy_if.cuh index c46213eeb6f..628353700df 100644 --- a/cpp/src/stream_compaction/copy_if.cuh +++ b/cpp/src/stream_compaction/copy_if.cuh @@ -91,7 +91,7 @@ __device__ cudf::size_type block_scan_mask(bool mask_true, // Note: `filter` is not run on indices larger than the input column size template -__launch_bounds__(block_size) +__launch_bounds__(block_size, 2048/block_size) __global__ void scatter_kernel(T* __restrict__ output_data, bit_mask_t * __restrict__ output_valid, cudf::size_type * output_null_count,