Skip to content

Commit

Permalink
Merge pull request #2955 from trxcllnt/cmake-detect-gpu-arch/2902
Browse files Browse the repository at this point in the history
[REVIEW] Update cmake to only build for present GPU
  • Loading branch information
harrism authored Oct 22, 2019
2 parents c57b315 + 8513cfc commit 6c47bc6
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 4 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
- PR #3091 Move join files to legacy
- PR #3092 Implicitly init RMM if Java allocates before init
- PR #3029 Update gdf_ numeric types with stdint and move to cudf namespace
- PR #2955 Add cmake option to only build for present GPU architecture
- PR #3070 Move functions.h and related source to legacy
- PR #2951 Allow set_index to handle a list of column names
- PR #3093 Move groupby files to legacy
Expand Down
29 changes: 26 additions & 3 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,32 @@ if(CMAKE_COMPILER_IS_GNUCXX)
endif(CMAKE_CXX11_ABI)
endif(CMAKE_COMPILER_IS_GNUCXX)

# Select the GPU compute architectures to generate device code for.
#
# GPU_ARCHS is a user-settable cache variable:
#   "ALL" (default) - build for every architecture this CUDA toolkit supports
#   ""              - auto-detect the architectures of the GPUs on this machine
#   "60;70;..."     - an explicit semicolon-separated list
set(GPU_ARCHS "ALL" CACHE STRING
  "List of GPU architectures (semicolon-separated) to be compiled for. Pass 'ALL' if you want to compile for all supported GPU architectures. Empty string means to auto-detect the GPUs on the current system")

# Empty value: probe the local machine for its GPU architectures.
if("${GPU_ARCHS}" STREQUAL "")
  include(cmake/EvalGpuArchs.cmake)
  evaluate_gpu_archs(GPU_ARCHS)
endif()

# 'ALL': enable every architecture the detected CUDA toolkit can target.
if("${GPU_ARCHS}" STREQUAL "ALL")
  set(GPU_ARCHS "60")
  if(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
    list(APPEND GPU_ARCHS "70")
  endif()
  if(CUDA_VERSION_MAJOR GREATER_EQUAL 10)
    list(APPEND GPU_ARCHS "75")
  endif()
endif()
message(STATUS "GPU_ARCHS = ${GPU_ARCHS}")

# Emit SASS (native cubin) for each requested architecture...
foreach(arch IN LISTS GPU_ARCHS)
  string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${arch},code=sm_${arch}")
endforeach()

# ...and PTX for the newest one, so the JIT can forward-compile for
# GPUs released after this toolkit.
list(GET GPU_ARCHS -1 ptx)
string(APPEND CMAKE_CUDA_FLAGS " -gencode arch=compute_${ptx},code=compute_${ptx}")

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda --expt-relaxed-constexpr")

Expand Down
1 change: 1 addition & 0 deletions cpp/benchmarks/quantiles/group_quantiles_benchmark.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <cudf/quantiles.hpp>

#include <benchmark/benchmark.h>
#include <random>

#include "../fixture/benchmark_fixture.hpp"
#include "../synchronization/synchronization.hpp"
Expand Down
62 changes: 62 additions & 0 deletions cpp/cmake/EvalGpuArchs.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Copyright (c) 2019, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# evaluate_gpu_archs(<out_var>)
#
# Detects the compute architectures of the GPUs present on the build machine
# by compiling and running a small CUDA probe with ${CMAKE_CUDA_COMPILER}.
# Sets <out_var> in the caller's scope to a semicolon-separated list of
# "<major><minor>" architecture strings (e.g. "60;70"), or to the sentinel
# "ALL" when no device can be queried, so the caller can fall back to
# building for all supported architectures.
#
# Side effects: writes eval_gpu_archs.cu / eval_gpu_archs /
# eval_gpu_archs.stderr.log into ${PROJECT_BINARY_DIR}.
function(evaluate_gpu_archs gpu_archs)
  set(eval_file ${PROJECT_BINARY_DIR}/eval_gpu_archs.cu)
  set(eval_exe ${PROJECT_BINARY_DIR}/eval_gpu_archs)
  set(error_file ${PROJECT_BINARY_DIR}/eval_gpu_archs.stderr.log)
  # The probe prints the unique "<major><minor>" values of all visible
  # devices, ';'-separated (a set<> keeps them sorted and de-duplicated),
  # or "ALL" when no device is found.
  file(WRITE ${eval_file}
    "
#include <cstdio>
#include <set>
#include <string>
using namespace std;
int main(int argc, char** argv) {
  set<string> archs;
  int nDevices;
  if((cudaGetDeviceCount(&nDevices) == cudaSuccess) && (nDevices > 0)) {
    for(int dev=0;dev<nDevices;++dev) {
      char buff[32];
      cudaDeviceProp prop;
      if(cudaGetDeviceProperties(&prop, dev) != cudaSuccess) continue;
      sprintf(buff, \"%d%d\", prop.major, prop.minor);
      archs.insert(buff);
    }
  }
  if(archs.empty()) {
    printf(\"ALL\");
  } else {
    bool first = true;
    for(const auto& arch : archs) {
      printf(first? \"%s\" : \";%s\", arch.c_str());
      first = false;
    }
  }
  printf(\"\\n\");
  return 0;
}
")
  # nvcc's --run compiles the program and immediately executes it; the
  # probe's stdout is the architecture list we want. Paths are quoted so
  # build directories containing spaces don't split into extra arguments.
  execute_process(
    COMMAND ${CMAKE_CUDA_COMPILER}
      -std=c++11
      -o "${eval_exe}"
      --run
      "${eval_file}"
    OUTPUT_VARIABLE __gpu_archs
    OUTPUT_STRIP_TRAILING_WHITESPACE
    ERROR_FILE "${error_file}")
  message(STATUS "Auto detection of gpu-archs: ${__gpu_archs}")
  set(${gpu_archs} ${__gpu_archs} PARENT_SCOPE)
endfunction()
2 changes: 1 addition & 1 deletion cpp/src/io/orc/dict_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ static __device__ void LoadNonNullIndices(volatile dictinit_state_s *s, int t)
*
**/
// blockDim {512,1,1}
extern "C" __global__ void __launch_bounds__(512, 3)
extern "C" __global__ void __launch_bounds__(512, 2)
gpuInitDictionaryIndices(DictionaryChunk *chunks, uint32_t num_columns)
{
__shared__ __align__(16) dictinit_state_s state_g;
Expand Down

0 comments on commit 6c47bc6

Please sign in to comment.