This repository has been archived by the owner on Nov 25, 2024. It is now read-only.

Merge branch 'branch-24.04' into test-cuda-12.2
jameslamb committed Jan 22, 2024
2 parents: 4d22082 + 5482281 · commit: 5da4b25
Showing 17 changed files with 56 additions and 27 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -87,3 +87,4 @@ cpp/.idea/
cpp/cmake-build-debug/
pylibwholegraph/.idea/
pylibwholegraph/cmake-build-debug/
compile_commands.json
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
24.02.00
24.04.00
2 changes: 1 addition & 1 deletion ci/build_docs.sh
@@ -22,7 +22,7 @@ rapids-print-env
rapids-logger "Downloading artifacts from previous jobs"

CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
export RAPIDS_VERSION_NUMBER="24.02"
export RAPIDS_VERSION_NUMBER="24.04"
export RAPIDS_DOCS_DIR="$(mktemp -d)"

rapids-mamba-retry install \
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -24,8 +24,8 @@ dependencies:
- graphviz
- ipykernel
- ipython
- libraft-headers==24.2.*
- librmm==24.2.*
- libraft-headers==24.4.*
- librmm==24.4.*
- nanobind>=0.2.0
- nbsphinx
- nccl
4 changes: 2 additions & 2 deletions conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -25,8 +25,8 @@ dependencies:
- graphviz
- ipykernel
- ipython
- libraft-headers==24.2.*
- librmm==24.2.*
- libraft-headers==24.4.*
- librmm==24.4.*
- nanobind>=0.2.0
- nbsphinx
- nccl
2 changes: 1 addition & 1 deletion cpp/CMakeLists.txt
@@ -14,7 +14,7 @@
# limitations under the License.
#=============================================================================

set(RAPIDS_VERSION "24.02")
set(RAPIDS_VERSION "24.04")
set(WHOLEGRAPH_VERSION "${RAPIDS_VERSION}.00")

cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR)
2 changes: 1 addition & 1 deletion cpp/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME = "WholeGraph C API"
# could be handy for archiving the generated documentation or if some version
# control system is used.

PROJECT_NUMBER = 24.02
PROJECT_NUMBER = 24.04

# Using the PROJECT_BRIEF tag one can provide an optional one line description
# for a project that appears at the top of each page and should give viewer a
12 changes: 8 additions & 4 deletions cpp/src/wholememory_ops/functions/embedding_optimizer_func.cu
@@ -214,7 +214,8 @@ __global__ void sgd_optimizer_step_kernel(const IndiceT* indices_ptr,
int local_dim_idx = threadIdx.x;
float grad_value = 0.0f;
int embedding_idx = local_dim_idx + loop_start_idx;
if (embedding_idx < embedding_dim) { grad_value = grads_ptr[embedding_idx]; }
if (embedding_idx >= embedding_dim) { break; }
grad_value = grads_ptr[embedding_idx];
float embedding_value = embedding_ptr[embedding_idx];
grad_value += weight_decay * embedding_value;
embedding_value -= lr * grad_value;
@@ -392,7 +393,8 @@ __global__ void lazy_adam_optimizer_step_kernel(const IndiceT* indices_ptr,
int local_dim_idx = threadIdx.x;
float grad_value = 0.0f;
int embedding_idx = local_dim_idx + loop_start_idx;
if (embedding_idx < embedding_dim) { grad_value = grads_ptr[local_dim_idx + loop_start_idx]; }
if (embedding_idx >= embedding_dim) { break; }
grad_value = grads_ptr[local_dim_idx + loop_start_idx];
float embedding_value = embedding_ptr[embedding_idx];
if (AdamW) {
embedding_value -= lr * weight_decay * embedding_value;
@@ -644,7 +646,8 @@ __global__ void ada_grad_optimizer_step_kernel(const IndiceT* indices_ptr,
int local_dim_idx = threadIdx.x;
float grad_value = 0.0f;
int embedding_idx = local_dim_idx + loop_start_idx;
if (embedding_idx < embedding_dim) { grad_value = grads_ptr[embedding_idx]; }
if (embedding_idx >= embedding_dim) { break; }
grad_value = grads_ptr[embedding_idx];
float embedding_value = embedding_ptr[embedding_idx];
grad_value = grad_value + weight_decay * embedding_value;
float state_sum = state_sum_ptr[embedding_idx];
@@ -841,7 +844,8 @@ __global__ void rms_prop_optimizer_step_kernel(const IndiceT* indices_ptr,
int local_dim_idx = threadIdx.x;
float grad_value = 0.0f;
int embedding_idx = local_dim_idx + loop_start_idx;
if (embedding_idx < embedding_dim) { grad_value = grads_ptr[local_dim_idx + loop_start_idx]; }
if (embedding_idx >= embedding_dim) { break; }
grad_value = grads_ptr[local_dim_idx + loop_start_idx];
float embedding_value = embedding_ptr[embedding_idx];
grad_value = grad_value + weight_decay * embedding_value;
float v = v_ptr[embedding_idx];
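
Note on the four optimizer-kernel hunks above: the old guard `if (embedding_idx < embedding_dim) { ... }` only skipped the gradient load, while the following `embedding_ptr[embedding_idx]` read and the optimizer-state accesses still ran for tail threads whose index is past `embedding_dim`; the new `if (embedding_idx >= embedding_dim) { break; }` makes those threads leave the per-row loop instead. A minimal CUDA sketch of the same pattern, with hypothetical names (`apply_sgd_row` is illustrative, not the WholeGraph kernel):

```cuda
// Hypothetical, self-contained sketch of the tail-handling pattern; not the
// WholeGraph kernels. One block strides across a single embedding row.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void apply_sgd_row(const float* grads, float* embedding,
                              int embedding_dim, float lr, float weight_decay)
{
  for (int loop_start = 0; loop_start < embedding_dim; loop_start += blockDim.x) {
    int idx = loop_start + threadIdx.x;
    // The old form guarded only the gradient load, so the embedding read/write
    // below still went out of bounds for tail threads; leaving the loop keeps
    // every access in range.
    if (idx >= embedding_dim) { break; }
    float g = grads[idx] + weight_decay * embedding[idx];
    embedding[idx] -= lr * g;
  }
}

int main()
{
  const int dim = 3;  // deliberately smaller than the block width
  float h_emb[dim]  = {1.f, 2.f, 3.f};
  float h_grad[dim] = {0.1f, 0.1f, 0.1f};
  float *d_emb, *d_grad;
  cudaMalloc(&d_emb, dim * sizeof(float));
  cudaMalloc(&d_grad, dim * sizeof(float));
  cudaMemcpy(d_emb, h_emb, dim * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_grad, h_grad, dim * sizeof(float), cudaMemcpyHostToDevice);
  apply_sgd_row<<<1, 32>>>(d_grad, d_emb, dim, 0.1f, 0.f);
  cudaMemcpy(h_emb, d_emb, dim * sizeof(float), cudaMemcpyDeviceToHost);
  printf("%f %f %f\n", h_emb[0], h_emb[1], h_emb[2]);
  cudaFree(d_emb);
  cudaFree(d_grad);
  return 0;
}
```

The `set_embedding_dim(3)` and `set_embedding_dim(4)` test parameters added further down presumably exercise exactly this tail case, where the embedding width is smaller than the block width.
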
@@ -29,7 +29,7 @@ class nvshmem_device_reference {
: pointer_(static_cast<DataTypeT*>(nvshmem_ref.pointer)),
typed_stride_(nvshmem_ref.stride / sizeof(DataTypeT))
{
assert(gref.stride % sizeof(DataTypeT) == 0);
assert(nvshmem_ref.stride % sizeof(DataTypeT) == 0);
}

__device__ nvshmem_device_reference() = delete;
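
Note on the `nvshmem_device_reference` hunk above: the assert now checks `nvshmem_ref`, the constructor argument the typed stride is derived from, rather than the unrelated name `gref`. A stripped-down, hypothetical analogue of that constructor contract (`raw_ref` and `typed_device_reference` are illustrative names, not the WholeGraph types):

```cuda
// Hypothetical analogue only: the divisibility check must be made on the same
// object the typed stride was just derived from.
#include <cassert>
#include <cstddef>

struct raw_ref {
  void* pointer;
  size_t stride;  // stride in bytes
};

template <typename DataTypeT>
struct typed_device_reference {
  __host__ __device__ explicit typed_device_reference(raw_ref ref)
    : pointer_(static_cast<DataTypeT*>(ref.pointer)),
      typed_stride_(ref.stride / sizeof(DataTypeT))
  {
    // Check the constructor's own argument, not some other reference.
    assert(ref.stride % sizeof(DataTypeT) == 0);
  }

  __host__ __device__ DataTypeT& operator[](size_t i) { return pointer_[i * typed_stride_]; }

  DataTypeT* pointer_;
  size_t typed_stride_;
};

int main()
{
  float buf[8] = {};
  typed_device_reference<float> ref(raw_ref{buf, 4 * sizeof(float)});
  ref[1] = 2.5f;  // element stride of 4 floats, so this writes buf[4]
  return buf[4] == 2.5f ? 0 : 1;
}
```
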
1 change: 1 addition & 0 deletions cpp/src/wholememory_ops/gather_op_impl_nvshmem.cu
@@ -185,6 +185,7 @@ wholememory_error_code_t wholememory_gather_nvshmem(
p_env_fns,
stream);
// ungistre
WM_CUDA_CHECK(cudaStreamSynchronize(stream));
if (nvshmemx_buffer_unregister(temp_output_ptr) != 0) {
WHOLEMEMORY_ERROR("nvshmemx_buffer_unregister error in wholememory_gather_nvshmem");
}
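
Note on the hunk above: the added `WM_CUDA_CHECK(cudaStreamSynchronize(stream))` drains the stream before `nvshmemx_buffer_unregister`, presumably so the asynchronous gather into `temp_output_ptr` has finished before the temporary buffer is unregistered. The same ordering rule in plain CUDA (a hypothetical `fill` kernel and a pinned host buffer stand in for the NVSHMEM-registered temporary):

```cuda
// Illustrative only: finish asynchronous work on a stream before the buffer it
// targets is released or handed to another API.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void fill(int* out, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) { out[i] = i; }
}

int main()
{
  const int n = 1024;
  int* d_tmp = nullptr;
  int* h_out = nullptr;
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaMalloc(&d_tmp, n * sizeof(int));
  cudaMallocHost(&h_out, n * sizeof(int));  // pinned, so the copy below is truly asynchronous

  fill<<<(n + 255) / 256, 256, 0, stream>>>(d_tmp, n);
  cudaMemcpyAsync(h_out, d_tmp, n * sizeof(int), cudaMemcpyDeviceToHost, stream);

  // Without this synchronize, the kernel and copy may still be in flight when
  // d_tmp is freed and h_out is read below.
  cudaStreamSynchronize(stream);

  printf("h_out[42] = %d\n", h_out[42]);
  cudaFree(d_tmp);
  cudaFreeHost(h_out);
  cudaStreamDestroy(stream);
  return 0;
}
```
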
@@ -149,7 +149,7 @@ struct EmbeddingBackwardTestParams {
wholememory_optimizer_type_t optimizer_type = WHOLEMEMORY_OPT_SGD;
float cache_ratio = 0.2;
bool use_cache = false;
int run_count = 1;
int run_count = 3;

float lr_ = 0.1;

@@ -428,7 +428,7 @@ void prepare_data_and_reference(
int64_t end_entry = (thread_rank + 1) * total_entry_count / thread_world_size;
CPUOptimizer cpu_optimizer(&params, start_entry, end_entry);
int embedding_dim = params.grad_description.sizes[1];
for (int step = 0; step <= params.run_count; step++) {
for (int step = 0; step < params.run_count; step++) {
int step_id = std::min(step, params.run_count - 1);
std::vector<int64_t> indices;
std::vector<std::vector<float>> grads;
@@ -625,7 +625,7 @@ TEST_P(WholeMemoryEmbeddingBackwardParameterTests, EmbeddingGatherGradientApplyT
EXPECT_EQ(cudaStreamSynchronize(nullptr), cudaSuccess);
EXPECT_EQ(wholememory_communicator_barrier(wm_comm), WHOLEMEMORY_SUCCESS);

for (int run = 0; run <= params.run_count; run++) {
for (int run = 0; run < params.run_count; run++) {
int step_id = std::min(run, params.run_count - 1);
auto& rank_indices_vec = step_rank_indices[step_id][world_rank];
auto& rank_grads_vec = step_rank_grads[step_id][world_rank];
@@ -737,6 +737,8 @@ INSTANTIATE_TEST_SUITE_P(
EmbeddingBackwardTestParams().set_use_cache().set_indice_count(10000127).set_optimizer_type(WHOLEMEMORY_OPT_ADAGRAD),
EmbeddingBackwardTestParams().set_use_cache().set_indice_count(10000127).set_optimizer_type(WHOLEMEMORY_OPT_LAZY_ADAM),
#endif
EmbeddingBackwardTestParams().set_entry_count(500).set_indice_count(400).set_embedding_dim(4),
EmbeddingBackwardTestParams().set_embedding_dim(3),
EmbeddingBackwardTestParams().set_use_cache().set_grad_stride(131),
EmbeddingBackwardTestParams().set_use_cache().set_grad_stride(131).set_optimizer_type(
WHOLEMEMORY_OPT_RMSPROP),
20 changes: 13 additions & 7 deletions dependencies.yaml
@@ -74,8 +74,8 @@ dependencies:
- cxx-compiler
- cython>=3.0.0
- &doxygen doxygen==1.9.1
- libraft-headers==24.2.*
- librmm==24.2.*
- libraft-headers==24.4.*
- librmm==24.4.*
- nanobind>=0.2.0
- nccl
- scikit-build
@@ -196,21 +196,23 @@ dependencies:
packages: []
test_cpp:
common:
- output_types: [conda, requirements]
- output_types: [conda]
packages:
- nccl
test_python:
common:
- output_types: [conda, requirements]
- output_types: [conda]
packages:
- c-compiler
- cxx-compiler
- nccl
- output_types: [conda, requirements]
packages:
- ninja
- numpy>=1.17
- pytest
- pytest-forked
- pytest-xdist
- nccl
specific:
- output_types: [conda, requirements]
matrices:
@@ -277,10 +279,12 @@ dependencies:
packages:
docs:
common:
- output_types: [conda]
packages:
- *doxygen
- output_types: [conda, requirements]
packages:
- breathe
- *doxygen
- graphviz
- ipython
- ipykernel
@@ -301,10 +305,12 @@
clang_tools:
common:
- output_types: [conda, requirements]
packages:
- gitpython
- output_types: conda
packages:
- clangxx==16.0.6
- clang-tools==16.0.6
- gitpython
python_build_wheel:
common:
- output_types: [pyproject]
2 changes: 1 addition & 1 deletion fetch_rapids.cmake
@@ -12,7 +12,7 @@
# the License.
# =============================================================================
if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/CUGRAPH_RAPIDS.cmake)
file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.02/RAPIDS.cmake
file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-24.04/RAPIDS.cmake
${CMAKE_CURRENT_BINARY_DIR}/CUGRAPH_RAPIDS.cmake
)
endif()
2 changes: 1 addition & 1 deletion python/pylibwholegraph/CMakeLists.txt
@@ -16,7 +16,7 @@

cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)

set(RAPIDS_VERSION "24.02")
set(RAPIDS_VERSION "24.04")
set(WHOLEGRAPH_VERSION "${RAPIDS_VERSION}.00")

include(FetchContent)
13 changes: 13 additions & 0 deletions python/pylibwholegraph/pylibwholegraph/torch/comm.py
@@ -32,6 +32,19 @@
all_comm_local_size = 1


def reset_communicators():
global all_comm_world_rank, all_comm_world_size, all_comm_local_rank, all_comm_local_size
global global_communicators, local_node_communicator, local_device_communicator
global_communicators = {}
local_node_communicator = None
local_device_communicator = None

all_comm_world_rank = 0
all_comm_world_size = 1
all_comm_local_rank = 0
all_comm_local_size = 1


def set_world_info(world_rank: int, world_size: int, local_rank: int, local_size: int):
"""
Set the global world's information. This is used for create common used communicators, like local node communicator,
@@ -132,7 +132,7 @@ def add_common_sampler_options(argparser: ArgumentParser):
argparser.add_argument(
"-s",
"--inferencesample",
type=int,
type=str,
dest="inferencesample",
default="30",
help="inference sample count, -1 is all",
4 changes: 3 additions & 1 deletion python/pylibwholegraph/pylibwholegraph/torch/initialize.py
@@ -15,7 +15,7 @@
import torch
import torch.utils.dlpack
import pylibwholegraph.binding.wholememory_binding as wmb
from .comm import set_world_info, get_global_communicator, get_local_node_communicator
from .comm import set_world_info, get_global_communicator, get_local_node_communicator, reset_communicators


def init(world_rank: int, world_size: int, local_rank: int, local_size: int):
@@ -73,3 +73,5 @@ def finalize():
:return: None
"""
wmb.finalize()
reset_communicators()
torch.distributed.destroy_process_group() if torch.distributed.is_initialized() else None
