Move topology detection out of merge pooled embedding (#1440)
Summary:
Pull Request resolved: #1440

We want to expose the get_nvlink_matrix helper function from fbgemm so that it can be used to detect the GPU topology. This is just a refactoring that moves it out of the file-private anonymous namespace in merge_pooled_embeddings_gpu.cpp and into the public fbgemm_gpu namespace, declared in the new topology_utils.h header.

Reviewed By: jspark1105

Differential Revision: D40464755

fbshipit-source-id: ee50feb07d32daffacad6d3f34f93efe884932ee
xw285cornell authored and facebook-github-bot committed Nov 2, 2022
1 parent f030ebc commit 43ca0c7
Showing 3 changed files with 205 additions and 181 deletions.
19 changes: 19 additions & 0 deletions fbgemm_gpu/include/fbgemm_gpu/topology_utils.h
@@ -0,0 +1,19 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <functional>

using Node = int64_t;
using Links = int64_t;
template <typename T>
using AdjacencyMatrix = std::function<T(Node, Node)>;

namespace fbgemm_gpu {
AdjacencyMatrix<Links> get_nvlink_matrix();
} // namespace fbgemm_gpu
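
For context, a minimal sketch of how a caller might use the newly public helper once this header is included. This is illustrative only and not part of the commit; print_link_matrix and num_gpus are hypothetical names, and in FBGEMM the device count would typically come from at::cuda::getNumGPUs().

#include <cstdint>
#include <iostream>

#include "fbgemm_gpu/topology_utils.h"

// Print the number of detected links connecting each pair of visible GPUs.
// A zero entry means no direct link was detected between that pair.
void print_link_matrix(int64_t num_gpus) {
  AdjacencyMatrix<Links> links = fbgemm_gpu::get_nvlink_matrix();
  for (Node i = 0; i < num_gpus; ++i) {
    for (Node j = 0; j < num_gpus; ++j) {
      std::cout << links(i, j) << (j + 1 == num_gpus ? '\n' : ' ');
    }
  }
}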
184 changes: 3 additions & 181 deletions fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp
@@ -14,193 +14,14 @@
#include <c10/cuda/CUDAGuard.h>
#include <c10/util/irange.h>
#include <torch/library.h>

#ifdef __HIP_PLATFORM_HCC__
#include "hip/hip_runtime.h"
#include "rocm_smi/rocm_smi.h"

#include <inttypes.h>
#include <algorithm>

#include "fbgemm_gpu/merge_pooled_embeddings.h"
#include "fbgemm_gpu/sparse_ops_utils.h"
#include "fbgemm_gpu/topology_utils.h"

using Tensor = at::Tensor;

#define RSMI_CHECK(fn) \
do { \
rsmi_status_t ret = (fn); \
TORCH_CHECK((ret) == RSMI_STATUS_SUCCESS); \
} while (0)

#define RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16

using Node = int64_t;
using Links = int64_t;
template <typename T>
using AdjacencyMatrix = std::function<T(Node, Node)>;

namespace {

AdjacencyMatrix<Links> get_nvlink_matrix() {
auto world_size = at::cuda::getNumGPUs();
RSMI_CHECK(rsmi_init(0));

// Note that ROCm SMI numbers devices differently than the ROCm runtime,
// so we need to learn the mapping by using the bus ID.
uint32_t device_count;
RSMI_CHECK(rsmi_num_monitor_devices(&device_count));

std::unordered_map<Node, uint32_t> rocm_device_to_rsmi_device;

for (const auto i : c10::irange(device_count)) {
uint64_t pci_info;
RSMI_CHECK(rsmi_dev_pci_id_get(i, &pci_info));
uint64_t domain, bus, device, function;
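// Bit layout of the BDF value returned by rsmi_dev_pci_id_get:
// bits 63:32 = domain, 15:8 = bus, 7:3 = device, 2:0 = function.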
domain = (pci_info >> 32) & 0xffffffff;
bus = (pci_info >> 8) & 0xff;
device = (pci_info >> 3) & 0x1f;
function = pci_info & 0x7;
// Different from CUDA, we do not get the PCI bus ID as a char*, so we
// need to reconstruct it.
char pci_bus_id_str[RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
sprintf(
pci_bus_id_str,
"%04" PRIu64 ":%02" PRIu64 ":%02" PRIu64 ".%0" PRIu64,
domain,
bus,
device,
function);

std::array<char, RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id;
std::copy(
&pci_bus_id_str[0],
&pci_bus_id_str[RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
pci_bus_id.data());
int32_t node = 0;
auto err = hipDeviceGetByPCIBusId(&node, pci_bus_id.data());
if (err == hipSuccess) {
rocm_device_to_rsmi_device.insert({node, i});
} else {
// flush the last error - this can occur when e.g. we set
// HIP_VISIBLE_DEVICES to a subset of the available GPUs in the system.
hipGetLastError();
}
}

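// A pair (i, j) counts as linked when ROCm SMI reports the two devices
// as P2P-accessible.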
std::vector<Links> links(world_size * world_size);
for (const auto i : c10::irange(world_size)) {
auto src_rsmi_device = rocm_device_to_rsmi_device.find(i);
if (src_rsmi_device != rocm_device_to_rsmi_device.end()) {
for (const auto j : c10::irange(world_size)) {
auto dst_rsmi_device = rocm_device_to_rsmi_device.find(j);
if (dst_rsmi_device != rocm_device_to_rsmi_device.end()) {
bool is_active;
RSMI_CHECK(rsmi_is_P2P_accessible(
src_rsmi_device->second, dst_rsmi_device->second, &is_active));
if (is_active) {
links[i * world_size + j] += 1;
}
}
}
}
}
RSMI_CHECK(rsmi_shut_down());
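// The lambda captures links and world_size by value, so the returned
// closure stays valid after the locals here go out of scope.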
return [=](Node i, Node j) { return links[i * world_size + j]; };
}
} // namespace

#else // CUDA
#include <nvml.h>

#include <algorithm>

#include "fbgemm_gpu/merge_pooled_embeddings.h"
#include "fbgemm_gpu/sparse_ops_utils.h"

using Tensor = at::Tensor;

#define NVML_CHECK(fn) \
do { \
nvmlReturn_t ret = (fn); \
TORCH_CHECK((ret) == NVML_SUCCESS); \
} while (0)

using Node = int64_t;
using Links = int64_t;
template <typename T>
using AdjacencyMatrix = std::function<T(Node, Node)>;
namespace {

AdjacencyMatrix<Links> get_nvlink_matrix() {
auto world_size = at::cuda::getNumGPUs();
NVML_CHECK(nvmlInit());

// Note that NVML numbers devices differently than the CUDA runtime,
// so we need to learn the mapping by using the bus ID.
uint32_t device_count;
NVML_CHECK(nvmlDeviceGetCount(&device_count));

std::map<std::array<char, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE>, Node>
pci_bus_ids;
std::unordered_map<Node, uint32_t> cuda_device_to_nvml_device;

for (const auto i : c10::irange(device_count)) {
nvmlDevice_t handle;
NVML_CHECK(nvmlDeviceGetHandleByIndex(i, &handle));
nvmlPciInfo_t pci_info;
NVML_CHECK(nvmlDeviceGetPciInfo(handle, &pci_info));
std::array<char, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id;
std::copy(
&pci_info.busId[0],
&pci_info.busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
pci_bus_id.data());
int32_t node = 0;
auto err = cudaDeviceGetByPCIBusId(&node, pci_bus_id.data());
if (err == cudaSuccess) {
pci_bus_ids.insert({pci_bus_id, node});
cuda_device_to_nvml_device.insert({node, i});
} else {
// flush the last error - this can occur when e.g. we set
// CUDA_VISIBLE_DEVICES to a subset of the available GPUs in the system.
cudaGetLastError();
}
}

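// For each device i, walk its NVLink ports and credit one link toward
// every remote device j found on the other end of an active port.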
std::vector<Links> links(world_size * world_size);
for (const auto i : c10::irange(world_size)) {
nvmlDevice_t handle;
NVML_CHECK(
nvmlDeviceGetHandleByIndex(cuda_device_to_nvml_device[i], &handle));
for (const auto link : c10::irange(NVML_NVLINK_MAX_LINKS)) {
nvmlEnableState_t is_active;
auto nvmlRet = nvmlDeviceGetNvLinkState(handle, link, &is_active);
if (nvmlRet == NVML_ERROR_INVALID_ARGUMENT ||
nvmlRet == NVML_ERROR_NOT_SUPPORTED) {
continue;
}
if (is_active != NVML_FEATURE_ENABLED) {
continue;
}
nvmlPciInfo_t pci_info;
NVML_CHECK(nvmlDeviceGetNvLinkRemotePciInfo(handle, link, &pci_info));
std::array<char, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id;
std::copy(
&pci_info.busId[0],
&pci_info.busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
pci_bus_id.data());
auto dst = pci_bus_ids.find(pci_bus_id);
if (dst != pci_bus_ids.end()) {
auto j = dst->second;
links[i * world_size + j] += 1;
}
}
}

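// As in the ROCm path, capture by value so the closure outlives the call.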
return [=](Node i, Node j) { return links[i * world_size + j]; };
}
} // namespace
#endif
namespace {
// Hilariously unoptimized, but algorithmic correctness matters more here, and
// we only do it once.
@@ -298,7 +119,8 @@ void all_to_one(
std::vector<at::cuda::CUDAEvent> copy_begin_events(num_gpus);
std::vector<at::cuda::CUDAEvent> copy_completion_events(num_gpus);

static auto intermediate_nodes = get_intermediate_node(get_nvlink_matrix());
static auto intermediate_nodes =
get_intermediate_node(fbgemm_gpu::get_nvlink_matrix());
for (auto& ten : input_tensors) {
Node src_device_id = ten.get_device();
auto intermediate_node =