Skip to content

Commit

Permalink
Merge branch 'cuda_p2p_gzcpy' of github.com:NVIDIA/Fuser into cuda_p2…
Browse files Browse the repository at this point in the history
…p_gzcpy
  • Loading branch information
samnordmann committed Mar 6, 2025
2 parents f7dd83a + 5212fbf commit 34f8b8b
Show file tree
Hide file tree
Showing 206 changed files with 8,257 additions and 4,633 deletions.
26 changes: 18 additions & 8 deletions .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,22 @@ jobs:
steps:
- name: Set CI hello status
run: |
curl \
-X POST \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
https://api.github.com/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha }} \
-d "{\"state\":\"success\",\"target_url\":\"https://github.com/NVIDIA/Fuser/wiki/Bot-Commands\",\"description\":\"Authorized users: comment !build or !test to trigger CI pipelines. See wiki.\",\"context\":\"CI notes\"}"
send_status() {
curl -L -X POST \
-H "Accept: application/vnd.github+json" \
-H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-H "X-GitHub-Api-Version: 2022-11-28" \
https://api.github.com/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha }} \
-d "{ \
\"state\":\"pending\", \
\"target_url\":\"https://github.com/NVIDIA/Fuser/wiki/Bot-Commands\", \
\"description\":\"Use !build or !test to start CI. See wiki.\", \
\"context\":\"$1\" \
}"
}
send_status "nvfuser-ci/build_image_17"
send_status "nvfuser-ci/build_image_20"
pr-agent-tools:
name: PR Agent tools
Expand All @@ -48,10 +58,10 @@ jobs:
OPENAI__KEY: ${{ secrets.LLM_OPENAI__KEY }}
OPENAI__API_BASE: ${{ secrets.LLM_OPENAI__API_BASE }}
CONFIG__MODEL: ${{ secrets.LLM_CONFIG__MODEL }}
CONFIG__CUSTOM_MODEL_MAX_TOKENS: 32768
CONFIG__CUSTOM_MODEL_MAX_TOKENS: 30000
CONFIG__FALLBACK_MODELS: '[]'

CONFIG__MAX_MODEL_TOKENS: 32768
CONFIG__MAX_MODEL_TOKENS: 30000
CONFIG__PUBLISH_OUTPUT_PROGRESS: false

PR_REVIEWER__REQUIRE_SCORE_REVIEW: false
Expand Down
49 changes: 49 additions & 0 deletions .github/workflows/upload-ci-logs-v2.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause

# see https://docs.github.com/en/rest/commits/statuses?apiVersion=2022-11-28#create-a-commit-status
name: Nvfuser-CI Logs v2
on:
  workflow_dispatch:
    inputs:
      state:
        description: 'job status'
        required: true
      descr:
        description: 'description of the job'
        required: true
      commit_sha:
        description: 'SHA of the commit that was tested.'
        required: true
      target_url:
        description: 'target url'
        required: true
      context:
        description: 'context'
        required: true
      pr_number:
        description: 'pr number'
        required: false

run-name: PR status ${{ inputs.pr_number || github.run_id }} - ${{ inputs.commit_sha }} - ${{ inputs.context }}
jobs:
  # Posts a single commit status (state / description / target_url / context)
  # on the commit given by `inputs.commit_sha` in this repository.
  status_update:
    name: Update commit status
    runs-on: ubuntu-latest
    permissions:
      statuses: write
    steps:
      - name: Set status
        # Dispatch inputs are handed to the script through environment
        # variables instead of being expanded with ${{ }} inside `run:`.
        # Inline expansion pastes the raw text into the shell script, so an
        # input containing quotes, backslashes, `$(...)` or backticks would
        # either corrupt the JSON payload or be executed by the shell.
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          COMMIT_SHA: ${{ inputs.commit_sha }}
          STATE: ${{ inputs.state }}
          TARGET_URL: ${{ inputs.target_url }}
          DESCR: ${{ inputs.descr }}
          CONTEXT: ${{ inputs.context }}
        run: |
          # Let jq do the JSON string escaping rather than hand-escaping quotes.
          # NOTE(review): assumes jq is present on the runner image (it is on
          # GitHub-hosted ubuntu images) - confirm if the runner ever changes.
          payload="$(jq -n \
            --arg state "$STATE" \
            --arg target_url "$TARGET_URL" \
            --arg description "$DESCR" \
            --arg context "$CONTEXT" \
            '{state: $state, target_url: $target_url, description: $description, context: $context}')"

          # GITHUB_REPOSITORY is the runner-provided "owner/repo" of this workflow.
          curl -L -X POST \
            -H "Accept: application/vnd.github+json" \
            -H "Authorization: Bearer ${GH_TOKEN}" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            "https://api.github.com/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
            -d "${payload}"
60 changes: 39 additions & 21 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,20 @@ set(NVFUSER_THIRD_PARTY_DIR "${NVFUSER_ROOT}/third_party")

option(NVFUSER_STANDALONE_BUILD_WITH_UCC "" OFF)
option(NVFUSER_EXPLICIT_ERROR_CHECK "" OFF)
if (NVFUSER_EXPLICIT_ERROR_CHECK)

if(NVFUSER_EXPLICIT_ERROR_CHECK)
add_compile_definitions(NVFUSER_EXPLICIT_ERROR_CHECK)
endif()

option(NVFUSER_BUILD_WITH_ASAN "Build nvFuser with asan" OFF)

include(CMakeDependentOption)
cmake_dependent_option(NVFUSER_DISTRIBUTED "" ON "USE_DISTRIBUTED" OFF)
if (NVFUSER_DISTRIBUTED)

if(NVFUSER_DISTRIBUTED)
add_compile_definitions(NVFUSER_DISTRIBUTED)
endif()

message(STATUS "Setting NVFUSER_DISTRIBUTED=${NVFUSER_DISTRIBUTED}")

# We try to update which C++ standard we use together in lockstep across all
Expand Down Expand Up @@ -153,6 +157,7 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/id_model/validation_utils.cpp
${NVFUSER_SRCS_DIR}/index_compute.cpp
${NVFUSER_SRCS_DIR}/instrumentation.cpp
${NVFUSER_SRCS_DIR}/interval_analysis.cpp
${NVFUSER_SRCS_DIR}/ir/base_nodes.cpp
${NVFUSER_SRCS_DIR}/ir/builder.cpp
${NVFUSER_SRCS_DIR}/ir/cloner.cpp
Expand Down Expand Up @@ -228,6 +233,7 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/scheduler/matmul_utils.cpp
${NVFUSER_SRCS_DIR}/scheduler/mma_utils.cpp
${NVFUSER_SRCS_DIR}/scheduler/no_op.cpp
${NVFUSER_SRCS_DIR}/scheduler/communication.cpp
${NVFUSER_SRCS_DIR}/scheduler/normalization_inner.cpp
${NVFUSER_SRCS_DIR}/scheduler/normalization_inner_outer.cpp
${NVFUSER_SRCS_DIR}/scheduler/normalization_outer.cpp
Expand Down Expand Up @@ -306,16 +312,17 @@ add_library(codegen_internal OBJECT ${NVFUSER_SRCS})

if(NOT MSVC)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
target_compile_options(codegen_internal PRIVATE
-Wall -Wno-unused-function -Werror
# These warnings are not treated as errors because of gcc 12.2 used in
# manylinux image. consider enable this when we upgrade.
# linking comment:
# https://github.com/NVIDIA/Fuser/pull/3001#discussion_r1772551266
-Wno-error=restrict -Wno-error=stringop-overflow -Wno-error=maybe-uninitialized)
target_compile_options(codegen_internal PRIVATE
-Wall -Wno-unused-function -Werror

# These warnings are not treated as errors because of gcc 12.2 used in
# manylinux image. consider enable this when we upgrade.
# linking comment:
# https://github.com/NVIDIA/Fuser/pull/3001#discussion_r1772551266
-Wno-error=restrict -Wno-error=stringop-overflow -Wno-error=maybe-uninitialized)
else()
target_compile_options(codegen_internal PRIVATE
-Wall -Wno-unused-function -Werror)
target_compile_options(codegen_internal PRIVATE
-Wall -Wno-unused-function -Werror)
endif()
endif()

Expand All @@ -336,6 +343,7 @@ set_target_properties(codegen_internal PROPERTIES
CXX_STANDARD ${NVFUSER_CPP_STANDARD}
CXX_STANDARD_REQUIRED ON
CXX_VISIBILITY_PRESET hidden

# this is to find pip installed nvrtc/nvtx .so
INSTALL_RPATH
"$ORIGIN/../../nvidia/cuda_runtime/lib:$ORIGIN/../../nvidia/cuda_nvrtc/lib:$ORIGIN/../../nvidia/nvtx/lib:$ORIGIN/../../nvidia/cuda_cupti/lib:$ORIGIN/../../torch/lib"
Expand All @@ -354,8 +362,10 @@ target_link_libraries(codegen_internal PUBLIC
)

add_library(nvfuser_codegen SHARED $<TARGET_OBJECTS:codegen_internal>)

if(NVFUSER_BUILD_WITH_ASAN)
target_compile_options(codegen_internal PRIVATE -fsanitize=address)

if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
# https://github.com/google/sanitizers/issues/796#issuecomment-292844823
# recommends to link asan statically. This is the default with Clang. GCC
Expand Down Expand Up @@ -494,13 +504,15 @@ if(BUILD_PYTHON)
"-DC10_BUILD_MAIN_LIB=1"
EXTENSION_NAME=_C
)

if(NOT MSVC)
target_compile_options(nvf_py_internal PRIVATE -Wall -Wno-unused-function)
target_compile_options(nvf_py_internal PRIVATE -Werror)
set(NVF_LIB_SUFFIX ".so")
else()
set(NVF_LIB_SUFFIX ".pyd")
endif()

set_target_properties(nvfuser PROPERTIES
C_STANDARD ${NVFUSER_C_STANDARD}
CUDA_STANDARD ${NVFUSER_CUDA_STANDARD}
Expand Down Expand Up @@ -572,6 +584,7 @@ list(APPEND JIT_TEST_SRCS
${NVFUSER_ROOT}/tests/cpp/test_indexing.cpp
${NVFUSER_ROOT}/tests/cpp/test_indexing_advanced.cpp
${NVFUSER_ROOT}/tests/cpp/test_inlining.cpp
${NVFUSER_ROOT}/tests/cpp/test_interval_analysis.cpp
${NVFUSER_ROOT}/tests/cpp/test_iter_visitor.cpp
${NVFUSER_ROOT}/tests/cpp/test_linked_hash_map.cpp
${NVFUSER_ROOT}/tests/cpp/test_loop_domain_scheduling.cpp
Expand Down Expand Up @@ -656,8 +669,8 @@ endfunction()

if(BUILD_TEST)
set(TEST_BINARIES)
add_test(nvfuser_tests "${JIT_TEST_SRCS}" "")
list(APPEND TEST_BINARIES nvfuser_tests)
add_test(test_nvfuser "${JIT_TEST_SRCS}" "")
list(APPEND TEST_BINARIES test_nvfuser)

add_test(test_rng ${NVFUSER_ROOT}/tests/cpp/test_rng.cpp ${RNG_TEST_KERNELS})
list(APPEND TEST_BINARIES test_rng)
Expand Down Expand Up @@ -705,8 +718,12 @@ if(BUILD_TEST)
add_test(test_external_src "${NVFUSER_ROOT}/tests/cpp/test_external_src.cpp" "")
list(APPEND TEST_BINARIES test_external_src)

add_test(tutorial "${NVFUSER_ROOT}/tests/cpp/test_tutorial.cpp" "")
list(APPEND TEST_BINARIES tutorial)
set(TUTORIAL_SRCS)
list(APPEND TUTORIAL_SRCS
${NVFUSER_ROOT}/tests/cpp/test_tutorial.cpp
${NVFUSER_ROOT}/tests/cpp/tutorial_tmem.cpp)
add_test(test_tutorial "${TUTORIAL_SRCS}" "")
list(APPEND TEST_BINARIES test_tutorial)

set(HOSTIR_TEST_SRCS)
list(APPEND HOSTIR_TEST_SRCS
Expand Down Expand Up @@ -805,7 +822,7 @@ if(BUILD_NVFUSER_BENCHMARK)
endif()

# multidevice transformer benchmark
if (NVFUSER_DISTRIBUTED)
if(NVFUSER_DISTRIBUTED)
set(MULTIDEVICE_BENCHMARK_SRCS)
list(APPEND MULTIDEVICE_BENCHMARK_SRCS
${NVFUSER_ROOT}/benchmarks/cpp/transformer.cpp
Expand Down Expand Up @@ -833,11 +850,12 @@ if(BUILD_NVFUSER_BENCHMARK)
codegen_internal
)
add_dependencies(nvfuser_multidevice_bench flatc build_flatbuffer_config)

if(NOT MSVC)
target_compile_options(nvfuser_bench PRIVATE
-Wall -Wno-unused-function
-Werror -Wno-deprecated-copy
)
target_compile_options(nvfuser_bench PRIVATE
-Wall -Wno-unused-function
-Werror -Wno-deprecated-copy
)
endif()
endif()
endif()
Expand Down Expand Up @@ -899,7 +917,7 @@ foreach(src ${NVFUSER_RUNTIME_FILES})

# Do not overwrite resource header if it already exists. This avoids unnecessary rebuilds.
# If ${dst} doesn't exist, this `if` is also true, so header will be generated.
if (${src} IS_NEWER_THAN ${dst})
if(${src} IS_NEWER_THAN ${dst})
# also generate the resource headers during the configuration step
# (so tools like clang-tidy can run w/o requiring a real build)
execute_process(COMMAND ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst})
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/cpp/batch_norm_channels_first.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,10 @@ static void NvFuserScheduler_BatchNorm(
at::Tensor at_bias = at::zeros({input_shape[1]}, options);
at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options);
std::vector<c10::IValue> aten_inputs(
{at_x, at_weight, at_bias, at_run_mean, at_run_var});
KernelArgumentHolder args = {
at_x, at_weight, at_bias, at_run_mean, at_run_var};

runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, args);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/cpp/batch_norm_channels_first_backward.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,10 @@ static void NvFuserScheduler_BatchNorm_BWD(
at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options);
at::Tensor save_var = at::ones({input_shape[1]}, fp32_options);

std::vector<c10::IValue> aten_inputs(
{input, grad_out, weight, run_mean, run_var, save_mean, save_var});
KernelArgumentHolder args = {
input, grad_out, weight, run_mean, run_var, save_mean, save_var};

runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, args);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/cpp/batch_norm_channels_last.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,10 @@ static void NvFuserScheduler_BatchNorm_nhwc(
at::Tensor at_bias = at::zeros({input_shape[3]}, options);
at::Tensor at_run_mean = at::zeros({input_shape[3]}, fp32_options);
at::Tensor at_run_var = at::ones({input_shape[3]}, fp32_options);
std::vector<c10::IValue> aten_inputs(
{at_x, at_weight, at_bias, at_run_mean, at_run_var});
KernelArgumentHolder args = {
at_x, at_weight, at_bias, at_run_mean, at_run_var};

runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, args);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
6 changes: 3 additions & 3 deletions benchmarks/cpp/batch_norm_channels_last_backward.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,10 +113,10 @@ static void NvFuserScheduler_BatchNorm_nhwc_BWD(
at::Tensor save_mean = at::zeros({input_shape[3]}, fp32_options);
at::Tensor save_var = at::ones({input_shape[3]}, fp32_options);

std::vector<c10::IValue> aten_inputs(
{input, grad_out, weight, run_mean, run_var, save_mean, save_var});
KernelArgumentHolder args = {
input, grad_out, weight, run_mean, run_var, save_mean, save_var};

runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);
runBenchmarkIterations(benchmark_state, executor_cache, args);

benchmark_state.SetBytesProcessed(
int64_t(benchmark_state.iterations()) *
Expand Down
30 changes: 12 additions & 18 deletions benchmarks/cpp/bert.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,9 @@ static void NvFuserScheduler_DivMaxSoftDropFwd(
at::Tensor t0 = at::randn({w, 1, 1, z}, options);
at::Tensor t1 = at::randn({w, x, y, z}, options);

std::vector<c10::IValue> at_inputs = {t0, t1};
KernelArgumentHolder args = {t0, t1};

auto bytes =
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
auto bytes = runBenchmarkIterations(benchmark_state, executor_cache, args);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
Expand All @@ -159,10 +158,9 @@ static void NvFuserScheduler_DivMaxSoftDropBwd(
at::Tensor t2 = at::randn({w, x, y, z}, options);
at::Tensor t3 = at::randn({w, x, y, z}, options).round().to(at::kBool);

std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};
KernelArgumentHolder args = {t0, t1, t2, t3};

auto bytes =
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
auto bytes = runBenchmarkIterations(benchmark_state, executor_cache, args);

// Some reason t1 isn't used, ignore it.
bytes -=
Expand Down Expand Up @@ -244,10 +242,9 @@ static void NvFuserScheduler_BiasDropoutAddLayernormFwd(
at::Tensor t3 = at::randn({x, y, z}, options);
at::Tensor t4 = at::randn({z}, options);

std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3, t4};
KernelArgumentHolder args = {t0, t1, t2, t3, t4};

auto bytes =
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
auto bytes = runBenchmarkIterations(benchmark_state, executor_cache, args);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
Expand Down Expand Up @@ -319,10 +316,9 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd1(
at::Tensor t2 = at::randn({x, y, 1}, options);
at::Tensor t3 = at::randn({x, y, 1}, options);

std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};
KernelArgumentHolder args = {t0, t1, t2, t3};

auto bytes =
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
auto bytes = runBenchmarkIterations(benchmark_state, executor_cache, args);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
Expand Down Expand Up @@ -395,10 +391,9 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd2(
at::Tensor t1 = at::randn({x, y, z}, options);
at::Tensor t8 = at::randn({x, y, z}, options);

std::vector<c10::IValue> at_inputs = {t4, t5, t1, t8};
KernelArgumentHolder args = {t4, t5, t1, t8};

auto bytes =
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
auto bytes = runBenchmarkIterations(benchmark_state, executor_cache, args);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
Expand Down Expand Up @@ -451,10 +446,9 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd3(
at::Tensor t0 = at::randn({x, y, z}, options);
at::Tensor t21 = at::randn({x, y, z}, options);

std::vector<c10::IValue> at_inputs = {t0, t21};
KernelArgumentHolder args = {t0, t21};

auto bytes =
runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
auto bytes = runBenchmarkIterations(benchmark_state, executor_cache, args);

benchmark_state.SetBytesProcessed(
bytes * int64_t(benchmark_state.iterations()));
Expand Down
Loading

0 comments on commit 34f8b8b

Please sign in to comment.