diff --git a/build.sh b/build.sh index 3758dc26c4..7e1a3e7e36 100755 --- a/build.sh +++ b/build.sh @@ -18,7 +18,7 @@ ARGS=$* # scripts, and that this script resides in the repo dir! REPODIR=$(cd $(dirname $0); pwd) -VALIDARGS="clean libraft pylibraft raft-dask docs tests template bench-prims bench-ann clean --uninstall -v -g -n --compile-lib --allgpuarch --no-nvtx --show_depr_warn -h" +VALIDARGS="clean libraft pylibraft raft-dask docs tests template bench-prims bench-ann clean --uninstall -v -g -n --compile-lib --allgpuarch --no-nvtx --show_depr_warn --time -h" HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool=] [--limit-tests=] [--limit-bench-prims=] [--limit-bench-ann=] where is: clean - remove all existing build artifacts and configuration (start over) @@ -48,6 +48,8 @@ HELP="$0 [ ...] [ ...] [--cmake-args=\"\"] [--cache-tool=\\\" - pass arbitrary list of CMake configuration options (escape all quotes in argument) --cache-tool= - pass the build cache tool (eg: ccache, sccache, distcc) that will be used to speedup the build process. + --time - Enable nvcc compilation time logging into cpp/build/nvcc_compile_log.csv. 
+ Results can be interpreted with cpp/scripts/analyze_nvcc_log.py -h - print this text default action (no args) is to build libraft, tests, pylibraft and raft-dask targets @@ -75,6 +77,7 @@ BENCH_TARGETS="CLUSTER_BENCH;NEIGHBORS_BENCH;DISTANCE_BENCH;LINALG_BENCH;MATRIX_ CACHE_ARGS="" NVTX=ON +LOG_COMPILE_TIME=OFF CLEAN=0 UNINSTALL=0 DISABLE_DEPRECATION_WARNINGS=ON @@ -322,6 +325,10 @@ fi if hasArg --no-nvtx; then NVTX=OFF fi +if hasArg --time; then + echo "-- Logging compile times to cpp/build/nvcc_compile_log.csv" + LOG_COMPILE_TIME=ON +fi if hasArg --show_depr_warn; then DISABLE_DEPRECATION_WARNINGS=OFF fi @@ -379,6 +386,7 @@ if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || has -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ -DRAFT_COMPILE_LIBRARY=${COMPILE_LIBRARY} \ -DRAFT_NVTX=${NVTX} \ + -DCUDA_LOG_COMPILE_TIME=${LOG_COMPILE_TIME} \ -DDISABLE_DEPRECATION_WARNINGS=${DISABLE_DEPRECATION_WARNINGS} \ -DBUILD_TESTS=${BUILD_TESTS} \ -DBUILD_PRIMS_BENCH=${BUILD_PRIMS_BENCH} \ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7bb458c44a..2e9c726b8e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -53,6 +53,7 @@ option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler)" OFF ) option(CUDA_STATIC_RUNTIME "Statically link the CUDA toolkit runtime and libraries" OFF) +option(CUDA_LOG_COMPILE_TIME "Write a log of compilation times to nvcc_compile_log.csv" OFF) option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" ON) option(DISABLE_DEPRECATION_WARNINGS "Disable deprecaction warnings " ON) option(DISABLE_OPENMP "Disable OpenMP" OFF) diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake index 5e68ca5bc4..c733d46985 100644 --- a/cpp/cmake/modules/ConfigureCUDA.cmake +++ b/cpp/cmake/modules/ConfigureCUDA.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2022, 
NVIDIA CORPORATION. +# Copyright (c) 2018-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -21,6 +21,10 @@ if(CMAKE_COMPILER_IS_GNUCXX) list(APPEND RAFT_CXX_FLAGS -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations) endif() +if(CUDA_LOG_COMPILE_TIME) + list(APPEND RAFT_CUDA_FLAGS "--time=nvcc_compile_log.csv") +endif() + list(APPEND RAFT_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) list(APPEND RAFT_CXX_FLAGS "-DCUDA_API_PER_THREAD_DEFAULT_STREAM") list(APPEND RAFT_CUDA_FLAGS "-DCUDA_API_PER_THREAD_DEFAULT_STREAM") diff --git a/cpp/scripts/analyze_nvcc_log.py b/cpp/scripts/analyze_nvcc_log.py new file mode 100755 index 0000000000..d06e05d265 --- /dev/null +++ b/cpp/scripts/analyze_nvcc_log.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python3 +# Copyright (c) 2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import colors

# Phase names as they appear in the "phase name" column of the nvcc log,
# listed host phases first, then device phases, then fatbinary.
_PHASES = [
    "gcc (preprocessing 4)",
    "cudafe++",
    "gcc (compiling)",
    "gcc (preprocessing 1)",
    "cicc",
    "ptxas",
    "fatbinary",
]

# The stacked histograms are drawn with the phases in reverse order. A real
# list is used (not a one-shot `reversed` iterator) so that it can safely be
# shared by both plots.
_HUE_ORDER = list(reversed(_PHASES))

# host: light red to dark red (preprocessing, cudafe, gcc (compiling))
# device: light green to dark green (preprocessing, cicc, ptxas)
_PALETTE = {
    "gcc (preprocessing 4)": colors.hsv_to_rgb((0, 1, 1)),
    "cudafe++": colors.hsv_to_rgb((0, 1, 0.75)),
    "gcc (compiling)": colors.hsv_to_rgb((0, 1, 0.4)),
    "gcc (preprocessing 1)": colors.hsv_to_rgb((0.33, 1, 1)),
    "cicc": colors.hsv_to_rgb((0.33, 1, 0.75)),
    "ptxas": colors.hsv_to_rgb((0.33, 1, 0.4)),
    "fatbinary": "grey",
}

_USAGE = """NVCC log analyzer

Analyzes nvcc logs and outputs a figure with highest ranking translation
units.

Usage:
    python analyze_nvcc_log.py <nvcc_log_file.csv>
    cpp/scripts/analyze_nvcc_log.py <nvcc_log_file.csv>

Generate the nvcc log file by building with

    ./build.sh <targets> --time

or by configuring CMake with -DCUDA_LOG_COMPILE_TIME=ON. build.sh reports
the log location as cpp/build/nvcc_compile_log.csv.
"""


def _phase_times_by_file(df):
    """Return a table of compile seconds per source file (rows) and phase
    (columns).

    `df` is the raw nvcc log as loaded by `pd.read_csv`. The input frame is
    not modified.
    """
    # Strip spaces from column names
    df = df.rename(columns=str.strip)
    df = df.assign(
        # NOTE(review): presumably "metric" is in milliseconds -- confirm
        # against the "unit" column of the nvcc log.
        seconds=df["metric"] / 1000,
        file=df["source file name"],
        phase=df["phase name"].str.strip(),
    )
    return (
        df
        # Remove nvcc driver entries. They don't contain a source file name
        .query("phase != 'nvcc (driver)'")
        # Make a pivot table containing files as row, phase (preprocessing,
        # cicc, etc.) as column and the total times as table entries. NOTE:
        # if compiled for multiple archs, the archs will be summed.
        .pivot_table(
            index="file", values="seconds", columns="phase", aggfunc="sum"
        )
    )


def _longest_translation_units(dfp, count=10):
    """Return the `count` rows of `dfp` with the largest "total time".

    File names are cut from the left to fit pandas' display column width so
    that the distinguishing tail of each path stays visible when printed.
    """
    table = dfp.reset_index()
    colwidth = pd.get_option("display.max_colwidth")
    # max_colwidth may be None (unlimited); then no truncation is needed.
    if colwidth is not None:
        table["file"] = table["file"].str[-(colwidth - 1):]
    return (
        table.sort_values("total time", ascending=False)
        # head() is positional, so exactly `count` rows are returned.
        .head(count)
        .reset_index(drop=True)
    )


def _plot_stacked_phases(data, weights, height, out_path):
    """Write a per-file histogram, stacked by phase, to `out_path`.

    `weights` names the column of `data` that supplies the bar lengths and
    is also used as the x-axis label.
    """
    sns.displot(
        data.sort_values("total time").reset_index(),
        y="file",
        hue="phase",
        hue_order=_HUE_ORDER,
        palette=_PALETTE,
        weights=weights,
        multiple="stack",
        kind="hist",
        height=height,
    )
    plt.xlabel(weights)
    plt.savefig(out_path)


def main(input_path):
    """Analyze an nvcc compile-time log and plot the time spent per file.

    Prints the ten translation units with the largest total compile time and
    writes two figures next to the input file:
    `<input_path>.absolute.compile_times.png` (seconds per phase) and
    `<input_path>.relative.compile_times.png` (fraction of each file's total
    per phase).

    Parameters
    ----------
    input_path : str or pathlib.Path
        Path of the CSV log to read.
    """
    input_path = Path(input_path)
    print("-- loading data")
    df = pd.read_csv(input_path)

    print("-- analyzing data")
    dfp = _phase_times_by_file(df)
    dfp_sum = dfp.sum(axis="columns")

    # The fractions must be computed before "total time" is added to dfp,
    # otherwise the total would be divided as if it were a phase.
    df_fraction = dfp.divide(dfp_sum, axis="index")
    df_fraction["total time"] = dfp_sum
    df_fraction = df_fraction.melt(
        ignore_index=False,
        id_vars="total time",
        var_name="phase",
        value_name="fraction",
    )

    dfp["total time"] = dfp_sum
    df_absolute = dfp.melt(
        ignore_index=False,
        id_vars="total time",
        var_name="phase",
        value_name="seconds",
    )

    print("-- Ten longest translation units:")
    print(_longest_translation_units(dfp, count=10))

    print("-- Plotting absolute compile times")
    abs_out_path = f"{input_path}.absolute.compile_times.png"
    _plot_stacked_phases(df_absolute, "seconds", 20, abs_out_path)
    print(f"-- Wrote absolute compile time plot to {abs_out_path}")

    print("-- Plotting relative compile times")
    rel_out_path = f"{input_path}.relative.compile_times.png"
    _plot_stacked_phases(df_fraction, "fraction", 15, rel_out_path)
    print(f"-- Wrote relative compile time plot to {rel_out_path}")


if __name__ == "__main__":
    args = sys.argv[1:]
    if args in (["-h"], ["--help"]):
        print(_USAGE)
        sys.exit(0)
    if len(args) != 1:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit(_USAGE)

    input_path = Path(args[0])
    if not input_path.exists():
        sys.exit(f"Path {input_path} does not exist.")
    main(input_path)