Merge branch 'cuda_p2p_gzcpy' of github.com:NVIDIA/Fuser into cuda_p2p_gzcpy
NVIDIA · Mar 6, 2025 · 34f8b8b · 34f8b8b
2 parents f7dd83a + 5212fbf
commit 34f8b8b
Show file tree

Hide file tree

Showing 206 changed files with 8,257 additions and 4,633 deletions.
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
@@ -21,12 +21,22 @@ jobs:
     steps:
       - name: Set CI hello status
         run: |
-          curl \
-          -X POST \
-          -H "Accept: application/vnd.github+json" \
-          -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
-          https://api.github.com/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha }} \
-          -d "{\"state\":\"success\",\"target_url\":\"https://github.com/NVIDIA/Fuser/wiki/Bot-Commands\",\"description\":\"Authorized users: comment !build or !test to trigger CI pipelines. See wiki.\",\"context\":\"CI notes\"}"
+          send_status() {
+            curl -L -X POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            https://api.github.com/repos/${{ github.repository }}/statuses/${{ github.event.pull_request.head.sha }} \
+            -d "{ \
+              \"state\":\"pending\", \
+              \"target_url\":\"https://github.com/NVIDIA/Fuser/wiki/Bot-Commands\", \
+              \"description\":\"Use !build or !test to start CI. See wiki.\", \
+              \"context\":\"$1\" \
+            }"
+          }
+
+          send_status "nvfuser-ci/build_image_17"
+          send_status "nvfuser-ci/build_image_20"
 
   pr-agent-tools:
     name: PR Agent tools
@@ -48,10 +58,10 @@ jobs:
         OPENAI__KEY: ${{ secrets.LLM_OPENAI__KEY }}
         OPENAI__API_BASE: ${{ secrets.LLM_OPENAI__API_BASE }}
         CONFIG__MODEL: ${{ secrets.LLM_CONFIG__MODEL }}
-        CONFIG__CUSTOM_MODEL_MAX_TOKENS: 32768
+        CONFIG__CUSTOM_MODEL_MAX_TOKENS: 30000
         CONFIG__FALLBACK_MODELS: '[]'
 
-        CONFIG__MAX_MODEL_TOKENS: 32768
+        CONFIG__MAX_MODEL_TOKENS: 30000
         CONFIG__PUBLISH_OUTPUT_PROGRESS: false 
 
         PR_REVIEWER__REQUIRE_SCORE_REVIEW: false

diff --git a/.github/workflows/upload-ci-logs-v2.yml b/.github/workflows/upload-ci-logs-v2.yml
@@ -0,0 +1,61 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-present NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+
+# see https://docs.github.com/en/rest/commits/statuses?apiVersion=2022-11-28#create-a-commit-status
+name: Nvfuser-CI Logs v2
+on:
+  workflow_dispatch:
+    inputs:
+      state:
+        description: 'job status'
+        required: true
+      descr:
+        description: 'description of the job'
+        required: true
+      commit_sha:
+        description: 'SHA of the commit that was tested.'
+        required: true
+      target_url:
+        description: 'target url'
+        required: true
+      context:
+        description: 'context'
+        required: true
+      pr_number:
+        description: 'pr number'
+        required: false
+
+run-name: PR status ${{ inputs.pr_number || github.run_id }} - ${{ inputs.commit_sha }} - ${{ inputs.context }}
+jobs:
+  status_update:
+    name: Update commit status
+    runs-on: ubuntu-latest
+    permissions:
+      statuses: write
+    steps:
+      - name: Set status
+        # Inputs are passed through the environment rather than interpolated
+        # into the script, so quotes or shell metacharacters in them cannot
+        # break the command or the JSON payload.
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          STATUS_URL: https://api.github.com/repos/${{ github.repository }}/statuses/${{ inputs.commit_sha }}
+          STATE: ${{ inputs.state }}
+          TARGET_URL: ${{ inputs.target_url }}
+          DESCRIPTION: ${{ inputs.descr }}
+          CONTEXT: ${{ inputs.context }}
+        run: |
+          # jq handles JSON string escaping for every field.
+          payload=$(jq -n \
+            --arg state "$STATE" \
+            --arg target_url "$TARGET_URL" \
+            --arg description "$DESCRIPTION" \
+            --arg context "$CONTEXT" \
+            '{state: $state, target_url: $target_url, description: $description, context: $context}')
+          curl -L -X POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "Authorization: Bearer ${GH_TOKEN}" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            "${STATUS_URL}" \
+            -d "${payload}"
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -14,16 +14,20 @@ set(NVFUSER_THIRD_PARTY_DIR "${NVFUSER_ROOT}/third_party")
 
 option(NVFUSER_STANDALONE_BUILD_WITH_UCC "" OFF)
 option(NVFUSER_EXPLICIT_ERROR_CHECK "" OFF)
-if (NVFUSER_EXPLICIT_ERROR_CHECK)
+
+if(NVFUSER_EXPLICIT_ERROR_CHECK)
   add_compile_definitions(NVFUSER_EXPLICIT_ERROR_CHECK)
 endif()
+
 option(NVFUSER_BUILD_WITH_ASAN "Build nvFuser with asan" OFF)
 
 include(CMakeDependentOption)
 cmake_dependent_option(NVFUSER_DISTRIBUTED "" ON "USE_DISTRIBUTED" OFF)
-if (NVFUSER_DISTRIBUTED)
+
+if(NVFUSER_DISTRIBUTED)
   add_compile_definitions(NVFUSER_DISTRIBUTED)
 endif()
+
 message(STATUS "Setting NVFUSER_DISTRIBUTED=${NVFUSER_DISTRIBUTED}")
 
 # We try to update which C++ standard we use together in lockstep across all
@@ -153,6 +157,7 @@ list(APPEND NVFUSER_SRCS
   ${NVFUSER_SRCS_DIR}/id_model/validation_utils.cpp
   ${NVFUSER_SRCS_DIR}/index_compute.cpp
   ${NVFUSER_SRCS_DIR}/instrumentation.cpp
+  ${NVFUSER_SRCS_DIR}/interval_analysis.cpp
   ${NVFUSER_SRCS_DIR}/ir/base_nodes.cpp
   ${NVFUSER_SRCS_DIR}/ir/builder.cpp
   ${NVFUSER_SRCS_DIR}/ir/cloner.cpp
@@ -228,6 +233,7 @@ list(APPEND NVFUSER_SRCS
   ${NVFUSER_SRCS_DIR}/scheduler/matmul_utils.cpp
   ${NVFUSER_SRCS_DIR}/scheduler/mma_utils.cpp
   ${NVFUSER_SRCS_DIR}/scheduler/no_op.cpp
+  ${NVFUSER_SRCS_DIR}/scheduler/communication.cpp
   ${NVFUSER_SRCS_DIR}/scheduler/normalization_inner.cpp
   ${NVFUSER_SRCS_DIR}/scheduler/normalization_inner_outer.cpp
   ${NVFUSER_SRCS_DIR}/scheduler/normalization_outer.cpp
@@ -306,16 +312,17 @@ add_library(codegen_internal OBJECT ${NVFUSER_SRCS})
 
 if(NOT MSVC)
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
-      target_compile_options(codegen_internal PRIVATE
-          -Wall -Wno-unused-function -Werror
-          # These warnings are not treated as errors because of gcc 12.2 used in
-          # manylinux image. consider enable this when we upgrade.
-          # linking comment:
-          # https://github.com/NVIDIA/Fuser/pull/3001#discussion_r1772551266
-          -Wno-error=restrict -Wno-error=stringop-overflow -Wno-error=maybe-uninitialized)
+    target_compile_options(codegen_internal PRIVATE
+      -Wall -Wno-unused-function -Werror
+
+      # These warnings are not treated as errors because of the gcc 12.2 used in
+      # the manylinux image. Consider enabling them as errors when we upgrade.
+      # Related discussion:
+      # https://github.com/NVIDIA/Fuser/pull/3001#discussion_r1772551266
+      -Wno-error=restrict -Wno-error=stringop-overflow -Wno-error=maybe-uninitialized)
   else()
-      target_compile_options(codegen_internal PRIVATE
-          -Wall -Wno-unused-function -Werror)
+    target_compile_options(codegen_internal PRIVATE
+      -Wall -Wno-unused-function -Werror)
   endif()
 endif()
 
@@ -336,6 +343,7 @@ set_target_properties(codegen_internal PROPERTIES
   CXX_STANDARD ${NVFUSER_CPP_STANDARD}
   CXX_STANDARD_REQUIRED ON
   CXX_VISIBILITY_PRESET hidden
+
   # this is to find pip installed nvrtc/nvtx .so
   INSTALL_RPATH
   "$ORIGIN/../../nvidia/cuda_runtime/lib:$ORIGIN/../../nvidia/cuda_nvrtc/lib:$ORIGIN/../../nvidia/nvtx/lib:$ORIGIN/../../nvidia/cuda_cupti/lib:$ORIGIN/../../torch/lib"
@@ -354,8 +362,10 @@ target_link_libraries(codegen_internal PUBLIC
 )
 
 add_library(nvfuser_codegen SHARED $<TARGET_OBJECTS:codegen_internal>)
+
 if(NVFUSER_BUILD_WITH_ASAN)
   target_compile_options(codegen_internal PRIVATE -fsanitize=address)
+
   if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
     # https://github.com/google/sanitizers/issues/796#issuecomment-292844823
     # recommends to link asan statically. This is the default with Clang. GCC
@@ -494,13 +504,15 @@ if(BUILD_PYTHON)
     "-DC10_BUILD_MAIN_LIB=1"
     EXTENSION_NAME=_C
   )
+
   if(NOT MSVC)
     target_compile_options(nvf_py_internal PRIVATE -Wall -Wno-unused-function)
     target_compile_options(nvf_py_internal PRIVATE -Werror)
     set(NVF_LIB_SUFFIX ".so")
   else()
     set(NVF_LIB_SUFFIX ".pyd")
   endif()
+
   set_target_properties(nvfuser PROPERTIES
     C_STANDARD ${NVFUSER_C_STANDARD}
     CUDA_STANDARD ${NVFUSER_CUDA_STANDARD}
@@ -572,6 +584,7 @@ list(APPEND JIT_TEST_SRCS
   ${NVFUSER_ROOT}/tests/cpp/test_indexing.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_indexing_advanced.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_inlining.cpp
+  ${NVFUSER_ROOT}/tests/cpp/test_interval_analysis.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_iter_visitor.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_linked_hash_map.cpp
   ${NVFUSER_ROOT}/tests/cpp/test_loop_domain_scheduling.cpp
@@ -656,8 +669,8 @@ endfunction()
 
 if(BUILD_TEST)
   set(TEST_BINARIES)
-  add_test(nvfuser_tests "${JIT_TEST_SRCS}" "")
-  list(APPEND TEST_BINARIES nvfuser_tests)
+  add_test(test_nvfuser "${JIT_TEST_SRCS}" "")
+  list(APPEND TEST_BINARIES test_nvfuser)
 
   add_test(test_rng ${NVFUSER_ROOT}/tests/cpp/test_rng.cpp ${RNG_TEST_KERNELS})
   list(APPEND TEST_BINARIES test_rng)
@@ -705,8 +718,12 @@ if(BUILD_TEST)
   add_test(test_external_src "${NVFUSER_ROOT}/tests/cpp/test_external_src.cpp" "")
   list(APPEND TEST_BINARIES test_external_src)
 
-  add_test(tutorial "${NVFUSER_ROOT}/tests/cpp/test_tutorial.cpp" "")
-  list(APPEND TEST_BINARIES tutorial)
+  set(TUTORIAL_SRCS)
+  list(APPEND TUTORIAL_SRCS
+    ${NVFUSER_ROOT}/tests/cpp/test_tutorial.cpp
+    ${NVFUSER_ROOT}/tests/cpp/tutorial_tmem.cpp)
+  add_test(test_tutorial "${TUTORIAL_SRCS}" "")
+  list(APPEND TEST_BINARIES test_tutorial)
 
   set(HOSTIR_TEST_SRCS)
   list(APPEND HOSTIR_TEST_SRCS
@@ -805,7 +822,7 @@ if(BUILD_NVFUSER_BENCHMARK)
   endif()
 
   # multidevice transformer benchmark
-  if (NVFUSER_DISTRIBUTED)
+  if(NVFUSER_DISTRIBUTED)
     set(MULTIDEVICE_BENCHMARK_SRCS)
     list(APPEND MULTIDEVICE_BENCHMARK_SRCS
       ${NVFUSER_ROOT}/benchmarks/cpp/transformer.cpp
@@ -833,11 +850,12 @@ if(BUILD_NVFUSER_BENCHMARK)
       codegen_internal
     )
     add_dependencies(nvfuser_multidevice_bench flatc build_flatbuffer_config)
+
     if(NOT MSVC)
-    target_compile_options(nvfuser_bench PRIVATE
-      -Wall -Wno-unused-function
-      -Werror -Wno-deprecated-copy
-    )
+      target_compile_options(nvfuser_bench PRIVATE
+        -Wall -Wno-unused-function
+        -Werror -Wno-deprecated-copy
+      )
     endif()
   endif()
 endif()
@@ -899,7 +917,7 @@ foreach(src ${NVFUSER_RUNTIME_FILES})
 
   # Do not overwrite resource header if it already exists. This avoids unnecessary rebuilds.
   # If ${dst} doesn't exist, this `if` is also true, so header will be generated.
-  if (${src} IS_NEWER_THAN ${dst})
+  if(${src} IS_NEWER_THAN ${dst})
     # also generate the resource headers during the configuration step
     # (so tools like clang-tidy can run w/o requiring a real build)
     execute_process(COMMAND ${PYTHON_EXECUTABLE} ${NVFUSER_STRINGIFY_TOOL} -i ${src} -o ${dst})

diff --git a/benchmarks/cpp/batch_norm_channels_first.cpp b/benchmarks/cpp/batch_norm_channels_first.cpp
@@ -99,10 +99,10 @@ static void NvFuserScheduler_BatchNorm(
   at::Tensor at_bias = at::zeros({input_shape[1]}, options);
   at::Tensor at_run_mean = at::zeros({input_shape[1]}, fp32_options);
   at::Tensor at_run_var = at::ones({input_shape[1]}, fp32_options);
-  std::vector<c10::IValue> aten_inputs(
-      {at_x, at_weight, at_bias, at_run_mean, at_run_var});
+  KernelArgumentHolder args = {
+      at_x, at_weight, at_bias, at_run_mean, at_run_var};
 
-  runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);
+  runBenchmarkIterations(benchmark_state, executor_cache, args);
 
   benchmark_state.SetBytesProcessed(
       int64_t(benchmark_state.iterations()) *

diff --git a/benchmarks/cpp/batch_norm_channels_first_backward.cpp b/benchmarks/cpp/batch_norm_channels_first_backward.cpp
@@ -112,10 +112,10 @@ static void NvFuserScheduler_BatchNorm_BWD(
   at::Tensor save_mean = at::zeros({input_shape[1]}, fp32_options);
   at::Tensor save_var = at::ones({input_shape[1]}, fp32_options);
 
-  std::vector<c10::IValue> aten_inputs(
-      {input, grad_out, weight, run_mean, run_var, save_mean, save_var});
+  KernelArgumentHolder args = {
+      input, grad_out, weight, run_mean, run_var, save_mean, save_var};
 
-  runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);
+  runBenchmarkIterations(benchmark_state, executor_cache, args);
 
   benchmark_state.SetBytesProcessed(
       int64_t(benchmark_state.iterations()) *

diff --git a/benchmarks/cpp/batch_norm_channels_last.cpp b/benchmarks/cpp/batch_norm_channels_last.cpp
@@ -100,10 +100,10 @@ static void NvFuserScheduler_BatchNorm_nhwc(
   at::Tensor at_bias = at::zeros({input_shape[3]}, options);
   at::Tensor at_run_mean = at::zeros({input_shape[3]}, fp32_options);
   at::Tensor at_run_var = at::ones({input_shape[3]}, fp32_options);
-  std::vector<c10::IValue> aten_inputs(
-      {at_x, at_weight, at_bias, at_run_mean, at_run_var});
+  KernelArgumentHolder args = {
+      at_x, at_weight, at_bias, at_run_mean, at_run_var};
 
-  runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);
+  runBenchmarkIterations(benchmark_state, executor_cache, args);
 
   benchmark_state.SetBytesProcessed(
       int64_t(benchmark_state.iterations()) *

diff --git a/benchmarks/cpp/batch_norm_channels_last_backward.cpp b/benchmarks/cpp/batch_norm_channels_last_backward.cpp
@@ -113,10 +113,10 @@ static void NvFuserScheduler_BatchNorm_nhwc_BWD(
   at::Tensor save_mean = at::zeros({input_shape[3]}, fp32_options);
   at::Tensor save_var = at::ones({input_shape[3]}, fp32_options);
 
-  std::vector<c10::IValue> aten_inputs(
-      {input, grad_out, weight, run_mean, run_var, save_mean, save_var});
+  KernelArgumentHolder args = {
+      input, grad_out, weight, run_mean, run_var, save_mean, save_var};
 
-  runBenchmarkIterations(benchmark_state, executor_cache, aten_inputs);
+  runBenchmarkIterations(benchmark_state, executor_cache, args);
 
   benchmark_state.SetBytesProcessed(
       int64_t(benchmark_state.iterations()) *

diff --git a/benchmarks/cpp/bert.cpp b/benchmarks/cpp/bert.cpp
@@ -132,10 +132,9 @@ static void NvFuserScheduler_DivMaxSoftDropFwd(
   at::Tensor t0 = at::randn({w, 1, 1, z}, options);
   at::Tensor t1 = at::randn({w, x, y, z}, options);
 
-  std::vector<c10::IValue> at_inputs = {t0, t1};
+  KernelArgumentHolder args = {t0, t1};
 
-  auto bytes =
-      runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
+  auto bytes = runBenchmarkIterations(benchmark_state, executor_cache, args);
 
   benchmark_state.SetBytesProcessed(
       bytes * int64_t(benchmark_state.iterations()));
@@ -159,10 +158,9 @@ static void NvFuserScheduler_DivMaxSoftDropBwd(
   at::Tensor t2 = at::randn({w, x, y, z}, options);
   at::Tensor t3 = at::randn({w, x, y, z}, options).round().to(at::kBool);
 
-  std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};
+  KernelArgumentHolder args = {t0, t1, t2, t3};
 
-  auto bytes =
-      runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
+  auto bytes = runBenchmarkIterations(benchmark_state, executor_cache, args);
 
   // Some reason t1 isn't used, ignore it.
   bytes -=
@@ -244,10 +242,9 @@ static void NvFuserScheduler_BiasDropoutAddLayernormFwd(
   at::Tensor t3 = at::randn({x, y, z}, options);
   at::Tensor t4 = at::randn({z}, options);
 
-  std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3, t4};
+  KernelArgumentHolder args = {t0, t1, t2, t3, t4};
 
-  auto bytes =
-      runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
+  auto bytes = runBenchmarkIterations(benchmark_state, executor_cache, args);
 
   benchmark_state.SetBytesProcessed(
       bytes * int64_t(benchmark_state.iterations()));
@@ -319,10 +316,9 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd1(
   at::Tensor t2 = at::randn({x, y, 1}, options);
   at::Tensor t3 = at::randn({x, y, 1}, options);
 
-  std::vector<c10::IValue> at_inputs = {t0, t1, t2, t3};
+  KernelArgumentHolder args = {t0, t1, t2, t3};
 
-  auto bytes =
-      runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
+  auto bytes = runBenchmarkIterations(benchmark_state, executor_cache, args);
 
   benchmark_state.SetBytesProcessed(
       bytes * int64_t(benchmark_state.iterations()));
@@ -395,10 +391,9 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd2(
   at::Tensor t1 = at::randn({x, y, z}, options);
   at::Tensor t8 = at::randn({x, y, z}, options);
 
-  std::vector<c10::IValue> at_inputs = {t4, t5, t1, t8};
+  KernelArgumentHolder args = {t4, t5, t1, t8};
 
-  auto bytes =
-      runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
+  auto bytes = runBenchmarkIterations(benchmark_state, executor_cache, args);
 
   benchmark_state.SetBytesProcessed(
       bytes * int64_t(benchmark_state.iterations()));
@@ -451,10 +446,9 @@ static void NvFuserScheduler_BiasDropoutAddLayernormBwd3(
   at::Tensor t0 = at::randn({x, y, z}, options);
   at::Tensor t21 = at::randn({x, y, z}, options);
 
-  std::vector<c10::IValue> at_inputs = {t0, t21};
+  KernelArgumentHolder args = {t0, t21};
 
-  auto bytes =
-      runBenchmarkIterations(benchmark_state, executor_cache, at_inputs);
+  auto bytes = runBenchmarkIterations(benchmark_state, executor_cache, args);
 
   benchmark_state.SetBytesProcessed(
       bytes * int64_t(benchmark_state.iterations()));