This repository has been archived by the owner on Nov 25, 2024. It is now read-only.

Merge branch 'branch-24.04' into test-cuda-12.2
jameslamb committed Jan 22, 2024
2 parents: 4d22082 + 5482281 · commit: 5da4b25
Showing 17 changed files with 56 additions and 27 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -87,3 +87,4 @@ cpp/.idea/
cpp/cmake-build-debug/
pylibwholegraph/.idea/
pylibwholegraph/cmake-build-debug/
compile_commands.json
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
24.02.00
24.04.00
2 changes: 1 addition & 1 deletion ci/build_docs.sh
@@ -22,7 +22,7 @@ rapids-print-env
rapids-logger "Downloading artifacts from previous jobs"

CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
export RAPIDS_VERSION_NUMBER="24.02"
export RAPIDS_VERSION_NUMBER="24.04"
export RAPIDS_DOCS_DIR="$(mktemp -d)"

rapids-mamba-retry install \
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -24,8 +24,8 @@ dependencies:
- graphviz
- ipykernel
- ipython
- libraft-headers==24.2.*
- librmm==24.2.*
- libraft-headers==24.4.*
- librmm==24.4.*
- nanobind>=0.2.0
- nbsphinx
- nccl
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -25,8 +25,8 @@ dependencies:
- graphviz
- ipykernel
- ipython
- libraft-headers==24.2.*
- librmm==24.2.*
- libraft-headers==24.4.*
- librmm==24.4.*
- nanobind>=0.2.0
- nbsphinx
- nccl
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
@@ -14,7 +14,7 @@
# limitations under the License.
#=============================================================================

set(RAPIDS_VERSION "24.02")
set(RAPIDS_VERSION "24.04")
set(WHOLEGRAPH_VERSION "${RAPIDS_VERSION}.00")

cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
2 changes: 1 addition & 1 deletion cpp/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME = "WholeGraph C API"
# could be handy for archiving the generated documentation or if some version
# control system is used.

PROJECT_NUMBER = 24.02
PROJECT_NUMBER = 24.04

# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
12 changes: 8 additions & 4 deletions cpp/src/wholememory_ops/functions/embedding_optimizer_func.cu
@@ -214,7 +214,8 @@ __global__ void sgd_optimizer_step_kernel(const IndiceT* indices_ptr,
int local_dim_idx = threadIdx.x;
float grad_value = 0.0f;
int embedding_idx = local_dim_idx + loop_start_idx;
if (embedding_idx < embedding_dim) { grad_value = grads_ptr[embedding_idx]; }
if (embedding_idx >= embedding_dim) { break; }
grad_value = grads_ptr[embedding_idx];
float embedding_value = embedding_ptr[embedding_idx];
grad_value += weight_decay * embedding_value;
embedding_value -= lr * grad_value;
@@ -392,7 +393,8 @@ __global__ void lazy_adam_optimizer_step_kernel(const IndiceT* indices_ptr,
int local_dim_idx = threadIdx.x;
float grad_value = 0.0f;
int embedding_idx = local_dim_idx + loop_start_idx;
if (embedding_idx < embedding_dim) { grad_value = grads_ptr[local_dim_idx + loop_start_idx]; }
if (embedding_idx >= embedding_dim) { break; }
grad_value = grads_ptr[local_dim_idx + loop_start_idx];
float embedding_value = embedding_ptr[embedding_idx];
if (AdamW) {
embedding_value -= lr * weight_decay * embedding_value;
@@ -644,7 +646,8 @@ __global__ void ada_grad_optimizer_step_kernel(const IndiceT* indices_ptr,
int local_dim_idx = threadIdx.x;
float grad_value = 0.0f;
int embedding_idx = local_dim_idx + loop_start_idx;
if (embedding_idx < embedding_dim) { grad_value = grads_ptr[embedding_idx]; }
if (embedding_idx >= embedding_dim) { break; }
grad_value = grads_ptr[embedding_idx];
float embedding_value = embedding_ptr[embedding_idx];
grad_value = grad_value + weight_decay * embedding_value;
float state_sum = state_sum_ptr[embedding_idx];
@@ -841,7 +844,8 @@ __global__ void rms_prop_optimizer_step_kernel(const IndiceT* indices_ptr,
int local_dim_idx = threadIdx.x;
float grad_value = 0.0f;
int embedding_idx = local_dim_idx + loop_start_idx;
if (embedding_idx < embedding_dim) { grad_value = grads_ptr[local_dim_idx + loop_start_idx]; }
if (embedding_idx >= embedding_dim) { break; }
grad_value = grads_ptr[local_dim_idx + loop_start_idx];
float embedding_value = embedding_ptr[embedding_idx];
grad_value = grad_value + weight_decay * embedding_value;
float v = v_ptr[embedding_idx];
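
Note on the four optimizer-kernel hunks above: the old guard `if (embedding_idx < embedding_dim) { ... }` only skipped the gradient load, while the following `embedding_ptr[embedding_idx]` read and the optimizer-state accesses still ran for tail threads whose index is past `embedding_dim`; the new `if (embedding_idx >= embedding_dim) { break; }` makes those threads leave the per-row loop instead. A minimal CUDA sketch of the same pattern, with hypothetical names (`apply_sgd_row` is illustrative, not the WholeGraph kernel):

```cuda
// Hypothetical, self-contained sketch of the tail-handling pattern; not the
// WholeGraph kernels. One block strides across a single embedding row.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void apply_sgd_row(const float* grads, float* embedding,
                              int embedding_dim, float lr, float weight_decay)
{
  for (int loop_start = 0; loop_start < embedding_dim; loop_start += blockDim.x) {
    int idx = loop_start + threadIdx.x;
    // The old form guarded only the gradient load, so the embedding read/write
    // below still went out of bounds for tail threads; leaving the loop keeps
    // every access in range.
    if (idx >= embedding_dim) { break; }
    float g = grads[idx] + weight_decay * embedding[idx];
    embedding[idx] -= lr * g;
  }
}

int main()
{
  const int dim = 3;  // deliberately smaller than the block width
  float h_emb[dim]  = {1.f, 2.f, 3.f};
  float h_grad[dim] = {0.1f, 0.1f, 0.1f};
  float *d_emb, *d_grad;
  cudaMalloc(&d_emb, dim * sizeof(float));
  cudaMalloc(&d_grad, dim * sizeof(float));
  cudaMemcpy(d_emb, h_emb, dim * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_grad, h_grad, dim * sizeof(float), cudaMemcpyHostToDevice);
  apply_sgd_row<<<1, 32>>>(d_grad, d_emb, dim, 0.1f, 0.f);
  cudaMemcpy(h_emb, d_emb, dim * sizeof(float), cudaMemcpyDeviceToHost);
  printf("%f %f %f\n", h_emb[0], h_emb[1], h_emb[2]);
  cudaFree(d_emb);
  cudaFree(d_grad);
  return 0;
}
```

The `set_embedding_dim(3)` and `set_embedding_dim(4)` test parameters added further down presumably exercise exactly this tail case, where the embedding width is smaller than the block width.
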
@@ -29,7 +29,7 @@ class nvshmem_device_reference {
: pointer_(static_cast<DataTypeT*>(nvshmem_ref.pointer)),
typed_stride_(nvshmem_ref.stride / sizeof(DataTypeT))
{
assert(gref.stride % sizeof(DataTypeT) == 0);
assert(nvshmem_ref.stride % sizeof(DataTypeT) == 0);
}

__device__ nvshmem_device_reference() = delete;
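
Note on the `nvshmem_device_reference` hunk above: the assert now checks `nvshmem_ref`, the constructor argument the typed stride is derived from, rather than the unrelated name `gref`. A stripped-down, hypothetical analogue of that constructor contract (`raw_ref` and `typed_device_reference` are illustrative names, not the WholeGraph types):

```cuda
// Hypothetical analogue only: the divisibility check must be made on the same
// object the typed stride was just derived from.
#include <cassert>
#include <cstddef>

struct raw_ref {
  void* pointer;
  size_t stride;  // stride in bytes
};

template <typename DataTypeT>
struct typed_device_reference {
  __host__ __device__ explicit typed_device_reference(raw_ref ref)
    : pointer_(static_cast<DataTypeT*>(ref.pointer)),
      typed_stride_(ref.stride / sizeof(DataTypeT))
  {
    // Check the constructor's own argument, not some other reference.
    assert(ref.stride % sizeof(DataTypeT) == 0);
  }

  __host__ __device__ DataTypeT& operator[](size_t i) { return pointer_[i * typed_stride_]; }

  DataTypeT* pointer_;
  size_t typed_stride_;
};

int main()
{
  float buf[8] = {};
  typed_device_reference<float> ref(raw_ref{buf, 4 * sizeof(float)});
  ref[1] = 2.5f;  // element stride of 4 floats, so this writes buf[4]
  return buf[4] == 2.5f ? 0 : 1;
}
```
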
1 change: 1 addition & 0 deletions cpp/src/wholememory_ops/gather_op_impl_nvshmem.cu
@@ -185,6 +185,7 @@ wholememory_error_code_t wholememory_gather_nvshmem(
p_env_fns,
stream);
// ungistre
WM_CUDA_CHECK(cudaStreamSynchronize(stream));
if (nvshmemx_buffer_unregister(temp_output_ptr) != 0) {
WHOLEMEMORY_ERROR("nvshmemx_buffer_unregister error in wholememory_gather_nvshmem");
}
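
Note on the hunk above: the added `WM_CUDA_CHECK(cudaStreamSynchronize(stream))` drains the stream before `nvshmemx_buffer_unregister`, presumably so the asynchronous gather into `temp_output_ptr` has finished before the temporary buffer is unregistered. The same ordering rule in plain CUDA (a hypothetical `fill` kernel and a pinned host buffer stand in for the NVSHMEM-registered temporary):

```cuda
// Illustrative only: finish asynchronous work on a stream before the buffer it
// targets is released or handed to another API.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void fill(int* out, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { out[i] = i; }
}

int main()
{
  const int n = 1024;
  int* d_tmp = nullptr;
  int* h_out = nullptr;
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaMalloc(&d_tmp, n * sizeof(int));
  cudaMallocHost(&h_out, n * sizeof(int));  // pinned, so the copy below is truly asynchronous

  fill<<<(n + 255) / 256, 256, 0, stream>>>(d_tmp, n);
  cudaMemcpyAsync(h_out, d_tmp, n * sizeof(int), cudaMemcpyDeviceToHost, stream);

  // Without this synchronize, the kernel and copy may still be in flight when
  // d_tmp is freed and h_out is read below.
  cudaStreamSynchronize(stream);

  printf("h_out[42] = %d\n", h_out[42]);
  cudaFree(d_tmp);
  cudaFreeHost(h_out);
  cudaStreamDestroy(stream);
  return 0;
}
```
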
@@ -149,7 +149,7 @@ struct EmbeddingBackwardTestParams {
wholememory_optimizer_type_t optimizer_type = WHOLEMEMORY_OPT_SGD;
float cache_ratio = 0.2;
bool use_cache = false;
int run_count = 1;
int run_count = 3;

float lr_ = 0.1;

@@ -428,7 +428,7 @@ void prepare_data_and_reference(
int64_t end_entry = (thread_rank + 1) * total_entry_count / thread_world_size;
CPUOptimizer cpu_optimizer(&params, start_entry, end_entry);
int embedding_dim = params.grad_description.sizes[1];
for (int step = 0; step <= params.run_count; step++) {
for (int step = 0; step < params.run_count; step++) {
int step_id = std::min(step, params.run_count - 1);
std::vector<int64_t> indices;
std::vector<std::vector<float>> grads;
@@ -625,7 +625,7 @@ TEST_P(WholeMemoryEmbeddingBackwardParameterTests, EmbeddingGatherGradientApplyT
EXPECT_EQ(cudaStreamSynchronize(nullptr), cudaSuccess);
EXPECT_EQ(wholememory_communicator_barrier(wm_comm), WHOLEMEMORY_SUCCESS);

for (int run = 0; run <= params.run_count; run++) {
for (int run = 0; run < params.run_count; run++) {
int step_id = std::min(run, params.run_count - 1);
auto& rank_indices_vec = step_rank_indices[step_id][world_rank];
auto& rank_grads_vec = step_rank_grads[step_id][world_rank];
@@ -737,6 +737,8 @@ INSTANTIATE_TEST_SUITE_P(
EmbeddingBackwardTestParams().set_use_cache().set_indice_count(10000127).set_optimizer_type(WHOLEMEMORY_OPT_ADAGRAD),
EmbeddingBackwardTestParams().set_use_cache().set_indice_count(10000127).set_optimizer_type(WHOLEMEMORY_OPT_LAZY_ADAM),
#endif
EmbeddingBackwardTestParams().set_entry_count(500).set_indice_count(400).set_embedding_dim(4),
EmbeddingBackwardTestParams().set_embedding_dim(3),
EmbeddingBackwardTestParams().set_use_cache().set_grad_stride(131),
EmbeddingBackwardTestParams().set_use_cache().set_grad_stride(131).set_optimizer_type(
WHOLEMEMORY_OPT_RMSPROP),
20 changes: 13 additions & 7 deletions dependencies.yaml
@@ -74,8 +74,8 @@ dependencies:
- cxx-compiler
- cython>=3.0.0
- &doxygen doxygen==1.9.1
- libraft-headers==24.2.*
- librmm==24.2.*
- libraft-headers==24.4.*
- librmm==24.4.*
- nanobind>=0.2.0
- nccl
- scikit-build
@@ -196,21 +196,23 @@ dependencies:
packages: []
test_cpp:
common:
- output_types: [conda, requirements]
- output_types: [conda]
packages:
- nccl
test_python:
common:
- output_types: [conda, requirements]
- output_types: [conda]
packages:
- c-compiler
- cxx-compiler
- nccl
- output_types: [conda, requirements]
packages:
- ninja
- numpy>=1.17
- pytest
- pytest-forked
- pytest-xdist
- nccl
specific:
- output_types: [conda, requirements]
matrices:
@@ -277,10 +279,12 @@ dependencies:
packages:
docs:
common:
- output_types: [conda]
packages:
- *doxygen
- output_types: [conda, requirements]
packages:
- breathe
- *doxygen
- graphviz
- ipython
- ipykernel
@@ -301,10 +305,12 @@
clang_tools:
common:
- output_types: [conda, requirements]
packages:
- gitpython
- output_types: conda
packages:
- clangxx==16.0.6
- clang-tools==16.0.6
- gitpython
python_build_wheel:
common:
- output_types: [pyproject]
2 changes: 1 addition & 1 deletion fetch_rapids.cmake
@@ -12,7 +12,7 @@
# the License.
# =============================================================================
if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUGRAPH_RAPIDS.cmake)
file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.02/RAPIDS.cmake
file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.04/RAPIDS.cmake
${CMAKE_CURRENT_BINARY_DIR}/CUGRAPH_RAPIDS.cmake
)
endif()
2 changes: 1 addition & 1 deletion python/pylibwholegraph/CMakeLists.txt
@@ -16,7 +16,7 @@

cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)

set(RAPIDS_VERSION "24.02")
set(RAPIDS_VERSION "24.04")
set(WHOLEGRAPH_VERSION "${RAPIDS_VERSION}.00")

include(FetchContent)
13 changes: 13 additions & 0 deletions python/pylibwholegraph/pylibwholegraph/torch/comm.py
@@ -32,6 +32,19 @@
all_comm_local_size = 1


def reset_communicators():
global all_comm_world_rank, all_comm_world_size, all_comm_local_rank, all_comm_local_size
global global_communicators, local_node_communicator, local_device_communicator
global_communicators = {}
local_node_communicator = None
local_device_communicator = None

all_comm_world_rank = 0
all_comm_world_size = 1
all_comm_local_rank = 0
all_comm_local_size = 1


def set_world_info(world_rank: int, world_size: int, local_rank: int, local_size: int):
"""
Set the global world's information. This is used for create common used communicators, like local node communicator,
@@ -132,7 +132,7 @@ def add_common_sampler_options(argparser: ArgumentParser):
argparser.add_argument(
"-s",
"--inferencesample",
type=int,
type=str,
dest="inferencesample",
default="30",
help="inference sample count, -1 is all",
4 changes: 3 additions & 1 deletion python/pylibwholegraph/pylibwholegraph/torch/initialize.py
@@ -15,7 +15,7 @@
import torch
import torch.utils.dlpack
import pylibwholegraph.binding.wholememory_binding as wmb
from .comm import set_world_info, get_global_communicator, get_local_node_communicator
from .comm import set_world_info, get_global_communicator, get_local_node_communicator, reset_communicators


def init(world_rank: int, world_size: int, local_rank: int, local_size: int):
@@ -73,3 +73,5 @@ def finalize():
:return: None
"""
wmb.finalize()
reset_communicators()
torch.distributed.destroy_process_group() if torch.distributed.is_initialized() else None
