Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYNC] 2.22.3-1 #1426

Draft
wants to merge 19 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 23 additions & 11 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -415,12 +415,13 @@ set(SRC_FILES
src/graph/tuning.cc
src/graph/xml.cc
src/graph/xml.h
src/include/align.h
src/include/alloc.h
src/include/alt_rsmi.h
src/include/archinfo.h
src/include/api_trace.h
src/include/argcheck.h
src/include/BfdBacktrace.hpp
src/include/bitops.h
src/include/bootstrap.h
src/include/channel.h
src/include/checks.h
Expand Down Expand Up @@ -479,30 +480,40 @@ set(SRC_FILES
src/include/npkit/npkit.h
src/include/npkit/npkit_event.h
src/include/npkit/npkit_struct.h
src/include/nvtx3/nvToolsExt.h
src/include/nvtx3/nvToolsExtCounters.h
src/include/nvtx3/nvToolsExtCuda.h
src/include/nvtx3/nvToolsExtCudaRt.h
src/include/nvtx3/nvToolsExt.h
src/include/nvtx3/nvToolsExtMem.h
src/include/nvtx3/nvToolsExtMemCudaRt.h
src/include/nvtx3/nvToolsExtOpenCL.h
src/include/nvtx3/nvToolsExtPayload.h
src/include/nvtx3/nvToolsExtPayloadHelper.h
src/include/nvtx3/nvToolsExtSemanticsCounters.h
src/include/nvtx3/nvToolsExtSemanticsScope.h
src/include/nvtx3/nvToolsExtSync.h
src/include/nvtx3/nvtx3.hpp
src/include/nvtx3/nvtxDetail/nvtxExtHelperMacros.h
src/include/nvtx3/nvtxDetail/nvtxExtImpl.h
src/include/nvtx3/nvtxDetail/nvtxExtImplCounters_v1.h
src/include/nvtx3/nvtxDetail/nvtxExtImplMem_v1.h
src/include/nvtx3/nvtxDetail/nvtxExtImplMemCudaRt_v1.h
src/include/nvtx3/nvtxDetail/nvtxExtImplPayload_v1.h
src/include/nvtx3/nvtxDetail/nvtxExtInit.h
src/include/nvtx3/nvtxDetail/nvtxExtPayloadHelperInternal.h
src/include/nvtx3/nvtxDetail/nvtxExtPayloadTypeInfo.h
src/include/nvtx3/nvtxDetail/nvtxExtTypes.h
src/include/nvtx3/nvtxDetail/nvtxImpl.h
src/include/nvtx3/nvtxDetail/nvtxImplCore.h
src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h
src/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h
src/include/nvtx3/nvtxDetail/nvtxImpl.h
src/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h
src/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h
src/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h
src/include/nvtx3/nvtxDetail/nvtxInit.h
src/include/nvtx3/nvtxDetail/nvtxInitDecls.h
src/include/nvtx3/nvtxDetail/nvtxInitDefs.h
src/include/nvtx3/nvtxDetail/nvtxInit.h
src/include/nvtx3/nvtxDetail/nvtxLinkOnce.h
src/include/nvtx3/nvtxDetail/nvtxTypes.h
src/include/nvtx3/nvtxExtDetail/nvtxExtImpl.h
src/include/nvtx3/nvtxExtDetail/nvtxExtImplPayload_v1.h
src/include/nvtx3/nvtxExtDetail/nvtxExtInit.h
src/include/nvtx3/nvtxExtDetail/nvtxExtPayloadTypeInfo.h
src/include/nvtx3/nvtxExtDetail/nvtxExtTypes.h
src/include/alt_rsmi.h
src/misc/alt_rsmi.cc
src/misc/archinfo.cc
src/misc/argcheck.cc
Expand Down Expand Up @@ -532,6 +543,7 @@ set(SRC_FILES
src/misc/msccl/msccl_setup.cc
src/misc/msccl/msccl_status.cc
src/transport/coll_net.cc
src/transport/generic.cc
src/transport/net.cc
src/transport/net_ib.cc
src/transport/net_socket.cc
Expand Down
4 changes: 2 additions & 2 deletions NOTICES.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
Notices and Licenses file
_______________________________________________________________

Dependencies on nvidia-nccl v2.21.5-1 (BSD3)
Dependencies on nvidia-nccl v2.22.3-1 (BSD3)

Copyright (c) 2015-2020, NVIDIA CORPORATION. All rights reserved.
Modifications Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
Modifications Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Modifications Copyright (c) Microsoft Corporation. Licensed under the MIT License.

Redistribution and use in source and binary forms, with or without
Expand Down
4 changes: 2 additions & 2 deletions cmake/scripts/add_unroll.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ if [[ "$HIP_FILE" =~ .*/src/device/.*\.h ]]; then
sed -i "s/\\(runRing<T[^>]*\\)>*/\\1, COLL_UNROLL>/" "$HIP_FILE"
sed -i "s/runTreeUpDown<T, RedOp, ProtoSimple<1, 1, COLL_UNROLL>>/runTreeUpDown<T, RedOp, ProtoSimple<1, 1, COLL_UNROLL>, COLL_UNROLL>/" "$HIP_FILE"
sed -i "s/\\(runTreeSplit<T[^>]*\\)>*/\\1, COLL_UNROLL>/" "$HIP_FILE"
sed -i "s/\\(struct RunWorkElement<ncclFunc[^>]*\\)>*/\\1, COLL_UNROLL>/" "$HIP_FILE"
sed -i "s/\\(struct RunWork<ncclFunc[^>]*\\)>*/\\1, COLL_UNROLL>/" "$HIP_FILE"
sed -i "s/\\(struct RunWorkColl<ncclFunc[^>]*\\)>*/\\1, COLL_UNROLL>/" "$HIP_FILE"
sed -i "s/\\(struct RunWorkBatch<ncclFunc[^>]*\\)>*/\\1, COLL_UNROLL>/" "$HIP_FILE"

echo "Added COLL_UNROLL template argument to $HIP_FILE"
fi
10 changes: 10 additions & 0 deletions ext-net/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,16 @@ set to `NCCL_PTR_HOST|NCCL_PTR_CUDA`, otherwise it should be set to `NCCL_PTR_HO
supports `dmabuf`, it should set `ptrSupport` to `NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF` and
provide a `regMrDmaBuf` function.

The `regIsGlobal` field allows NCCL to register buffers in advance (e.g. using a loopback
connection) and to expect that any later registration of a buffer contained within a previous
registration will be nearly immediate, as the buffer is already known by the network adapter. A
typical implementation would maintain a registration cache: the call to `ncclCommRegister` creates
the initial entry in the cache using `regMr()` on a loopback connection. Any later call to NCCL
operations will call `regMr()` again on the real connection, with the real buffer (which could be
at a different offset within the original buffer, with a smaller size, etc.), then `deregMr()`
right after. The call to `ncclCommDeregister` should call the final `deregMr()` and effectively
remove the mapping on the network adapter.

The `speed` field indicates the speed of the network port in Mbps (10^6 bits per second). This is
important to ensure proper optimization of flows within the node.

Expand Down
15 changes: 15 additions & 0 deletions ext-net/example/nccl/common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

#ifndef COMMON_H_
#define COMMON_H_

/* Log levels accepted by the debug logger callback declared below. */
typedef enum {
  NCCL_LOG_NONE    = 0,
  NCCL_LOG_VERSION = 1,
  NCCL_LOG_WARN    = 2,
  NCCL_LOG_INFO    = 3,
  NCCL_LOG_ABORT   = 4,
  NCCL_LOG_TRACE   = 5
} ncclDebugLogLevel;

/* Subsystem tags. Every tag occupies its own bit; NCCL_ALL has all bits set. */
typedef enum {
  NCCL_INIT      = 1 << 0,
  NCCL_COLL      = 1 << 1,
  NCCL_P2P       = 1 << 2,
  NCCL_SHM       = 1 << 3,
  NCCL_NET       = 1 << 4,
  NCCL_GRAPH     = 1 << 5,
  NCCL_TUNING    = 1 << 6,
  NCCL_ENV       = 1 << 7,
  NCCL_ALLOC     = 1 << 8,
  NCCL_CALL      = 1 << 9,
  NCCL_PROXY     = 1 << 10,
  NCCL_NVLS      = 1 << 11,
  NCCL_BOOTSTRAP = 1 << 12,
  NCCL_REG       = 1 << 13,
  NCCL_ALL       = ~0
} ncclDebugLogSubSys;

/*
 * Logging callback type.
 *   level     - severity of the message
 *   flags     - presumably a mask of ncclDebugLogSubSys bits (TODO confirm against the caller)
 *   file/line - source location of the message
 *   fmt, ...  - format string and its variadic arguments
 */
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level,
                                  unsigned long flags,
                                  const char *file,
                                  int line,
                                  const char *fmt, ...);

#endif
1 change: 1 addition & 0 deletions ext-net/example/nccl/err.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ typedef enum { ncclSuccess = 0,
ncclSystemError = 2,
ncclInternalError = 3,
ncclInvalidArgument = 4,
ncclInvalidUsage = 5,
ncclRemoteError = 6 } ncclResult_t;

#endif
6 changes: 1 addition & 5 deletions ext-net/example/nccl/net.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <stdint.h>
#include <stdlib.h>

#include "common.h"
#include "err.h"

#define NCCL_NET_HANDLE_MAXSIZE 128
Expand All @@ -19,11 +20,6 @@
// Maximum number of requests per comm object
#define NCCL_NET_MAX_REQUESTS 32

typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys;

typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);

#include "net_v8.h"
#include "net_v7.h"
#include "net_v6.h"
Expand Down
4 changes: 2 additions & 2 deletions ext-net/example/nccl/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/

#ifndef NCCL_ERR_H_
#define NCCL_ERR_H_
#ifndef NCCL_TYPES_H_
#define NCCL_TYPES_H_

/* Data types */
typedef enum { ncclInt8 = 0, ncclChar = 0,
Expand Down
15 changes: 15 additions & 0 deletions ext-tuner/example/nccl/common.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/*************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* See LICENSE.txt for license information
************************************************************************/

#ifndef COMMON_H_
#define COMMON_H_

/* Log levels accepted by the debug logger callback declared below. */
typedef enum {
  NCCL_LOG_NONE    = 0,
  NCCL_LOG_VERSION = 1,
  NCCL_LOG_WARN    = 2,
  NCCL_LOG_INFO    = 3,
  NCCL_LOG_ABORT   = 4,
  NCCL_LOG_TRACE   = 5
} ncclDebugLogLevel;

/* Subsystem tags. Every tag occupies its own bit; NCCL_ALL has all bits set. */
typedef enum {
  NCCL_INIT      = 1 << 0,
  NCCL_COLL      = 1 << 1,
  NCCL_P2P       = 1 << 2,
  NCCL_SHM       = 1 << 3,
  NCCL_NET       = 1 << 4,
  NCCL_GRAPH     = 1 << 5,
  NCCL_TUNING    = 1 << 6,
  NCCL_ENV       = 1 << 7,
  NCCL_ALLOC     = 1 << 8,
  NCCL_CALL      = 1 << 9,
  NCCL_PROXY     = 1 << 10,
  NCCL_NVLS      = 1 << 11,
  NCCL_BOOTSTRAP = 1 << 12,
  NCCL_REG       = 1 << 13,
  NCCL_ALL       = ~0
} ncclDebugLogSubSys;

/*
 * Logging callback type.
 *   level     - severity of the message
 *   flags     - presumably a mask of ncclDebugLogSubSys bits (TODO confirm against the caller)
 *   file/line - source location of the message
 *   fmt, ...  - format string and its variadic arguments
 */
typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level,
                                  unsigned long flags,
                                  const char *file,
                                  int line,
                                  const char *fmt, ...);

#endif
17 changes: 17 additions & 0 deletions ext-tuner/example/nccl/err.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/*
* Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
*/

#ifndef NCCL_ERR_H_
#define NCCL_ERR_H_

/*
 * Result codes for plugins.
 * ncclSuccess is 0; every other enumerator is a distinct non-zero error code.
 */
typedef enum {
  ncclSuccess            = 0,
  ncclUnhandledCudaError = 1,
  ncclSystemError        = 2,
  ncclInternalError      = 3,
  ncclInvalidArgument    = 4,
  ncclInvalidUsage       = 5,
  ncclRemoteError        = 6
} ncclResult_t;

#endif
43 changes: 28 additions & 15 deletions ext-tuner/example/nccl/tuner.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,24 @@
#ifndef NCCL_TUNER_H_
#define NCCL_TUNER_H_

#include "nccl.h"
#include <stdint.h>
#include <stdlib.h>

typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel;
typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_PROXY=1024, NCCL_NVLS=2048, NCCL_ALL=~0} ncclDebugLogSubSys;

typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...);
#include "common.h"
#include "err.h"

#define NCCL_NUM_FUNCTIONS 5 // Send/Recv not included for now
typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncReduceScatter, ncclFuncAllReduce, ncclFuncSendRecv, ncclFuncSend, ncclFuncRecv, ncclNumFuncs} ncclFunc_t;
/*
 * Identifiers for NCCL operations. Values are pinned explicitly so they do
 * not shift if entries are reordered; ncclNumFuncs equals the number of
 * operation entries listed before it.
 */
typedef enum {
  ncclFuncBroadcast     = 0,
  ncclFuncReduce        = 1,
  ncclFuncAllGather     = 2,
  ncclFuncReduceScatter = 3,
  ncclFuncAllReduce     = 4,
  ncclFuncSendRecv      = 5,
  ncclFuncSend          = 6,
  ncclFuncRecv          = 7,
  ncclNumFuncs          = 8
} ncclFunc_t;

#define NCCL_NUM_ALGORITHMS 6 // Tree/Ring/CollNet*
#define NCCL_ALGO_UNDEF -1
Expand All @@ -33,6 +42,8 @@ typedef enum { ncclFuncBroadcast, ncclFuncReduce, ncclFuncAllGather, ncclFuncRed
#define NCCL_PROTO_LL128 1
#define NCCL_PROTO_SIMPLE 2

#define NCCL_ALGO_PROTO_IGNORE -1.0

// API to be implemented by external tuner
typedef struct {
// Name of the tuner
Expand All @@ -52,31 +63,33 @@ typedef struct {
// - context: tuner context object
// - collType: collective type, e.g., allreduce, allgather…
// - nBytes: collective size in bytes
// - collNetSupport: whether collnet supports this type
// - nvlsSupport: whether nvlink sharp supports this type
// - numPipeOps: number of operations in the group
// - numAlgo: number of algorithms in collCostTable
// - numProto: number of protocols in collCostTable
//
// Outputs:
// - algorithm: selected algorithm to be used for the given collective
// - protocol: selected protocol to be used for the given collective
// - nChannels: number of channels (hence SMs) to be used.
//
// InOut:
// - collCostTable: collective cost table, generated by NCCL core, containing algo|proto|time entries for collType.
// NCCL core sets ignored algo/proto cost table entries to -1.0 (NCCL_ALGO_PROTO_IGNORE).
//
// If getCollInfo() does not return ncclSuccess, NCCL will fall back to the
// default tuning for the given collective.
// Also, the plugin is allowed to not set any output, or set only the
// algorithm and protocol, but not only the algorithm or only the protocol.
// Unset fields will be set automatically by NCCL.
ncclResult_t (*getCollInfo)(void* context, ncclFunc_t collType, size_t nBytes,
int collNetSupport, int nvlsSupport, int numPipeOps,
int *algorithm, int *protocol, int* nChannels);
int numPipeOps, float** collCostTable, int numAlgo, int numProto,
int* nChannels);

// Terminates the plugin and cleans up any resources that the plugin allocated.
// context: tuner context object
ncclResult_t (*destroy)(void* context);
} ncclTuner_v2_t;
} ncclTuner_v3_t;

typedef ncclTuner_v2_t ncclTuner_t;
typedef ncclTuner_v3_t ncclTuner_t;

#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v2"
#define NCCL_TUNER_PLUGIN_SYMBOL "ncclTunerPlugin_v3"

#endif
4 changes: 2 additions & 2 deletions ext-tuner/example/plugin.c
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ ncclResult_t ncclTopoGetAlgoTime_Tuner(ncclFunc_t collType, int algorithm, int p
return ncclSuccess;
}

__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction, void **context) {
__hidden ncclResult_t pluginInit(size_t nRanks, size_t nNodes, ncclDebugLogger_t logFunction) {
if (nRanks <= 1) return ncclSuccess;
int compCapIndex = HOPPER_COMPCAP_IDX;
int index2 = nNodes <= 2 ? nNodes-1 : 2;
Expand Down Expand Up @@ -226,7 +226,7 @@ __hidden ncclResult_t pluginDestroy(void* context) { return ncclSuccess; }

#define PLUGIN_NAME "Example"

const ncclTuner_v2_t ncclTunerPlugin_v2 = {
const ncclTuner_v3_t ncclTunerPlugin_v3 = {
.name = PLUGIN_NAME,
.init = pluginInit,
.getCollInfo = pluginGetCollInfo,
Expand Down
4 changes: 2 additions & 2 deletions makefiles/version.mk
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
##### version
NCCL_MAJOR := 2
NCCL_MINOR := 21
NCCL_PATCH := 5
NCCL_MINOR := 22
NCCL_PATCH := 3
NCCL_SUFFIX :=
PKG_REVISION := 1
5 changes: 3 additions & 2 deletions src/bootstrap.cc
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,6 @@ ncclResult_t bootstrapCreateRoot(struct ncclBootstrapHandle* handle, bool idFrom

ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
memset(handle, 0, sizeof(ncclBootstrapHandle));
NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));

const char* env = ncclGetEnv("NCCL_COMM_ID");
if (env) {
Expand All @@ -211,7 +210,9 @@ ncclResult_t bootstrapGetUniqueId(struct ncclBootstrapHandle* handle) {
WARN("Invalid NCCL_COMM_ID, please use format: <ipv4>:<port> or [<ipv6>]:<port> or <hostname>:<port>");
return ncclInvalidArgument;
}
handle->magic = NCCL_MAGIC;
} else {
NCCLCHECK(getRandomData(&handle->magic, sizeof(handle->magic)));
memcpy(&handle->addr, &bootstrapNetIfAddr, sizeof(union ncclSocketAddress));
NCCLCHECK(bootstrapCreateRoot(handle, false));
}
Expand Down Expand Up @@ -631,7 +632,7 @@ ncclResult_t bootstrapClose(void* commState) {
struct bootstrapState* state = (struct bootstrapState*)commState;
if (state->unexpectedConnections != NULL) {
unexpectedFree(state);
if (__atomic_load_n(state->abortFlag, __ATOMIC_RELAXED) == 0) {
if (__atomic_load_n(state->abortFlag, __ATOMIC_ACQUIRE) == 0) {
WARN("Unexpected connections are not empty");
return ncclInternalError;
}
Expand Down
8 changes: 5 additions & 3 deletions src/channel.cc
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,17 @@
#include "channel.h"
#include "param.h"
#include "gdrwrap.h"
#include "transport.h"

ncclResult_t initChannel(struct ncclComm* comm, int channelId) {
struct ncclChannel* channel = &comm->channels[channelId];
if (channel->id != -1) return ncclSuccess;

int nRanks = comm->nRanks;
int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks;
int nvlsRanks = comm->localRanks;
int nPeers = nRanks + 1 /* Collnet */ + nvlsRanks /* NVLS */;
channel->id = channelId;
channel->workFifoSent = 0;
channel->workFifoProduced = 0;

struct ncclSharedResources* sharedRes = comm->sharedRes;

Expand Down Expand Up @@ -74,7 +75,8 @@ ncclResult_t initNvlsChannel(struct ncclComm* comm, int channelId, struct ncclCo

NCCLCHECK(ncclStrongStreamAcquireUncaptured(&sharedRes->deviceStream));

int nvlsRanks = comm->MNNVL ? comm->clique.size : comm->localRanks;
int nvlsRanks = comm->localRanks;

if (share) {
channel->nvlsPeers = parent->channels[channelId].nvlsPeers;
channel->nvlsDevPeers = parent->channels[channelId].nvlsDevPeers;
Expand Down
Loading