diff --git a/fbgemm_gpu/include/fbgemm_gpu/topology_utils.h b/fbgemm_gpu/include/fbgemm_gpu/topology_utils.h
new file mode 100644
index 0000000000..b70635ea83
--- /dev/null
+++ b/fbgemm_gpu/include/fbgemm_gpu/topology_utils.h
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <functional>
+
+using Node = int64_t;
+using Links = int64_t;
+template <typename T>
+using AdjacencyMatrix = std::function<T(Node, Node)>;
+
+namespace fbgemm_gpu {
+AdjacencyMatrix<Links> get_nvlink_matrix();
+} // namespace fbgemm_gpu
diff --git a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp
index 1acb91f54e..5fc9dd7444 100644
--- a/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp
+++ b/fbgemm_gpu/src/merge_pooled_embeddings_gpu.cpp
@@ -14,193 +14,14 @@
 #include
 #include
 #include
-
-#ifdef __HIP_PLATFORM_HCC__
-#include "hip/hip_runtime.h"
-#include "rocm_smi/rocm_smi.h"
-
-#include
 #include
 
 #include "fbgemm_gpu/merge_pooled_embeddings.h"
 #include "fbgemm_gpu/sparse_ops_utils.h"
+#include "fbgemm_gpu/topology_utils.h"
 
 using Tensor = at::Tensor;
 
-#define RSMI_CHECK(fn)                                \
-  do {                                                \
-    rsmi_status_t ret = (fn);                         \
-    TORCH_CHECK((ret) == RSMI_STATUS_SUCCESS);        \
-  } while (0)
-
-#define RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16
-
-using Node = int64_t;
-using Links = int64_t;
-template <typename T>
-using AdjacencyMatrix = std::function<T(Node, Node)>;
-
-namespace {
-
-AdjacencyMatrix<Links> get_nvlink_matrix() {
-  auto world_size = at::cuda::getNumGPUs();
-  RSMI_CHECK(rsmi_init(0));
-
-  // Note that ROCm_SMI uses a different numbering method to ROCm runtime,
-  // so we need to learn the mapping by using the bus ID.
-  uint32_t device_count;
-  RSMI_CHECK(rsmi_num_monitor_devices(&device_count));
-
-  std::unordered_map<Node, uint32_t> rocm_device_to_rsmi_device;
-
-  for (const auto i : c10::irange(device_count)) {
-    uint64_t pci_info;
-    RSMI_CHECK(rsmi_dev_pci_id_get(i, &pci_info));
-    uint64_t domain, bus, device, function;
-    domain = (pci_info >> 32) & 0xffffffff;
-    bus = (pci_info >> 8) & 0xff;
-    device = (pci_info >> 3) & 0x1f;
-    function = pci_info & 0x7;
-    // Different form CUDA, we do not get the PCI BUS ID as a char* and we need
-    // to reconstruct it.
-    char pci_bus_id_str[RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
-    sprintf(
-        pci_bus_id_str,
-        "%04" PRIu64 ":%02" PRIu64 ":%02" PRIu64 ".%0" PRIu64,
-        domain,
-        bus,
-        device,
-        function);
-
-    std::array<char, RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id;
-    std::copy(
-        &pci_bus_id_str[0],
-        &pci_bus_id_str[RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
-        pci_bus_id.data());
-    int32_t node = 0;
-    auto err = hipDeviceGetByPCIBusId(&node, pci_bus_id.data());
-    if (err == hipSuccess) {
-      rocm_device_to_rsmi_device.insert({node, i});
-    } else {
-      // flush the last error - this can occur when e.g. we set
-      // HIP_VISIBLE_DEVICES to a subset of the available GPUs in the system.
-      hipGetLastError();
-    }
-  }
-
-  std::vector<Links> links(world_size * world_size);
-  for (const auto i : c10::irange(world_size)) {
-    auto src_rsmi_device = rocm_device_to_rsmi_device.find(i);
-    if (src_rsmi_device != rocm_device_to_rsmi_device.end()) {
-      for (const auto j : c10::irange(world_size)) {
-        auto dst_rsmi_device = rocm_device_to_rsmi_device.find(j);
-        if (dst_rsmi_device != rocm_device_to_rsmi_device.end()) {
-          bool is_active;
-          RSMI_CHECK(rsmi_is_P2P_accessible(
-              src_rsmi_device->second, dst_rsmi_device->second, &is_active));
-          if (is_active) {
-            links[i * world_size + j] += 1;
-          }
-        }
-      }
-    }
-  }
-  RSMI_CHECK(rsmi_shut_down());
-  return [=](Node i, Node j) { return links[i * world_size + j]; };
-}
-} // namespace
-
-#else // CUDA
-#include <nvml.h>
-
-#include
-
-#include "fbgemm_gpu/merge_pooled_embeddings.h"
-#include "fbgemm_gpu/sparse_ops_utils.h"
-
-using Tensor = at::Tensor;
-
-#define NVML_CHECK(fn)                                \
-  do {                                                \
-    nvmlReturn_t ret = (fn);                          \
-    TORCH_CHECK((ret) == NVML_SUCCESS);               \
-  } while (0)
-
-using Node = int64_t;
-using Links = int64_t;
-template <typename T>
-using AdjacencyMatrix = std::function<T(Node, Node)>;
-namespace {
-
-AdjacencyMatrix<Links> get_nvlink_matrix() {
-  auto world_size = at::cuda::getNumGPUs();
-  NVML_CHECK(nvmlInit());
-
-  // Note that NVML uses a different numbering method to CUDA runtime,
-  // so we need to learn the mapping by using the bus ID.
-  uint32_t device_count;
-  NVML_CHECK(nvmlDeviceGetCount(&device_count));
-
-  std::map<std::array<char, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE>, Node>
-      pci_bus_ids;
-  std::unordered_map<Node, uint32_t> cuda_device_to_nvml_device;
-
-  for (const auto i : c10::irange(device_count)) {
-    nvmlDevice_t handle;
-    NVML_CHECK(nvmlDeviceGetHandleByIndex(i, &handle));
-    nvmlPciInfo_t pci_info;
-    NVML_CHECK(nvmlDeviceGetPciInfo(handle, &pci_info));
-    std::array<char, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id;
-    std::copy(
-        &pci_info.busId[0],
-        &pci_info.busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
-        pci_bus_id.data());
-    int32_t node = 0;
-    auto err = cudaDeviceGetByPCIBusId(&node, pci_bus_id.data());
-    if (err == cudaSuccess) {
-      pci_bus_ids.insert({pci_bus_id, node});
-      cuda_device_to_nvml_device.insert({node, i});
-    } else {
-      // flush the last error - this can occur when e.g. we set
-      // CUDA_VISIBLE_DEVICES to a subset of the available GPUs in the system.
-      cudaGetLastError();
-    }
-  }
-
-  std::vector<Links> links(world_size * world_size);
-  for (const auto i : c10::irange(world_size)) {
-    nvmlDevice_t handle;
-    NVML_CHECK(
-        nvmlDeviceGetHandleByIndex(cuda_device_to_nvml_device[i], &handle));
-    for (const auto link : c10::irange(NVML_NVLINK_MAX_LINKS)) {
-      nvmlEnableState_t is_active;
-      auto nvmlRet = nvmlDeviceGetNvLinkState(handle, link, &is_active);
-      if (nvmlRet == NVML_ERROR_INVALID_ARGUMENT ||
-          nvmlRet == NVML_ERROR_NOT_SUPPORTED) {
-        continue;
-      }
-      if (is_active != NVML_FEATURE_ENABLED) {
-        continue;
-      }
-      nvmlPciInfo_t pci_info;
-      NVML_CHECK(nvmlDeviceGetNvLinkRemotePciInfo(handle, link, &pci_info));
-      std::array<char, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id;
-      std::copy(
-          &pci_info.busId[0],
-          &pci_info.busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
-          pci_bus_id.data());
-      auto dst = pci_bus_ids.find(pci_bus_id);
-      if (dst != pci_bus_ids.end()) {
-        auto j = dst->second;
-        links[i * world_size + j] += 1;
-      }
-    }
-  }
-
-  return [=](Node i, Node j) { return links[i * world_size + j]; };
-}
-} // namespace
-#endif
 
 namespace {
 // Hilariously unoptimized, but algorithmic correctness matters more here, and
 // we only do it once.
@@ -298,7 +119,8 @@ void all_to_one(
   std::vector<at::cuda::CUDAEvent> copy_begin_events(num_gpus);
   std::vector<at::cuda::CUDAEvent> copy_completion_events(num_gpus);
 
-  static auto intermediate_nodes = get_intermediate_node(get_nvlink_matrix());
+  static auto intermediate_nodes =
+      get_intermediate_node(fbgemm_gpu::get_nvlink_matrix());
   for (auto& ten : input_tensors) {
     Node src_device_id = ten.get_device();
     auto intermediate_node =
diff --git a/fbgemm_gpu/src/topology_utils.cpp b/fbgemm_gpu/src/topology_utils.cpp
new file mode 100644
index 0000000000..086f930832
--- /dev/null
+++ b/fbgemm_gpu/src/topology_utils.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/util/irange.h>
+#include <algorithm>
+
+#include "fbgemm_gpu/topology_utils.h"
+
+#ifdef __HIP_PLATFORM_HCC__
+#include "hip/hip_runtime.h"
+#include "rocm_smi/rocm_smi.h"
+
+#define RSMI_CHECK(fn)                                \
+  do {                                                \
+    rsmi_status_t ret = (fn);                         \
+    TORCH_CHECK((ret) == RSMI_STATUS_SUCCESS);        \
+  } while (0)
+
+#define RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE 16
+
+namespace fbgemm_gpu {
+AdjacencyMatrix<Links> get_nvlink_matrix() {
+  auto world_size = at::cuda::getNumGPUs();
+  RSMI_CHECK(rsmi_init(0));
+
+  // Note that ROCm_SMI uses a different numbering method to ROCm runtime,
+  // so we need to learn the mapping by using the bus ID.
+  uint32_t device_count;
+  RSMI_CHECK(rsmi_num_monitor_devices(&device_count));
+
+  std::unordered_map<Node, uint32_t> rocm_device_to_rsmi_device;
+
+  for (const auto i : c10::irange(device_count)) {
+    uint64_t pci_info;
+    RSMI_CHECK(rsmi_dev_pci_id_get(i, &pci_info));
+    uint64_t domain, bus, device, function;
+    domain = (pci_info >> 32) & 0xffffffff;
+    bus = (pci_info >> 8) & 0xff;
+    device = (pci_info >> 3) & 0x1f;
+    function = pci_info & 0x7;
+    // Different from CUDA, we do not get the PCI BUS ID as a char* and we need
+    // to reconstruct it.
+    char pci_bus_id_str[RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+    sprintf(
+        pci_bus_id_str,
+        "%04" PRIu64 ":%02" PRIu64 ":%02" PRIu64 ".%0" PRIu64,
+        domain,
+        bus,
+        device,
+        function);
+
+    std::array<char, RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id;
+    std::copy(
+        &pci_bus_id_str[0],
+        &pci_bus_id_str[RSMI_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
+        pci_bus_id.data());
+    int32_t node = 0;
+    auto err = hipDeviceGetByPCIBusId(&node, pci_bus_id.data());
+    if (err == hipSuccess) {
+      rocm_device_to_rsmi_device.insert({node, i});
+    } else {
+      // flush the last error - this can occur when e.g. we set
+      // HIP_VISIBLE_DEVICES to a subset of the available GPUs in the system.
+      hipGetLastError();
+    }
+  }
+
+  std::vector<Links> links(world_size * world_size);
+  for (const auto i : c10::irange(world_size)) {
+    auto src_rsmi_device = rocm_device_to_rsmi_device.find(i);
+    if (src_rsmi_device != rocm_device_to_rsmi_device.end()) {
+      for (const auto j : c10::irange(world_size)) {
+        auto dst_rsmi_device = rocm_device_to_rsmi_device.find(j);
+        if (dst_rsmi_device != rocm_device_to_rsmi_device.end()) {
+          bool is_active;
+          RSMI_CHECK(rsmi_is_P2P_accessible(
+              src_rsmi_device->second, dst_rsmi_device->second, &is_active));
+          if (is_active) {
+            links[i * world_size + j] += 1;
+          }
+        }
+      }
+    }
+  }
+  RSMI_CHECK(rsmi_shut_down());
+  return [=](Node i, Node j) {
+    TORCH_CHECK(i < world_size);
+    TORCH_CHECK(j < world_size);
+    return links[i * world_size + j];
+  };
+}
+} // namespace fbgemm_gpu
+
+#else // CUDA
+
+#include <nvml.h>
+
+#define NVML_CHECK(fn)                                \
+  do {                                                \
+    nvmlReturn_t ret = (fn);                          \
+    TORCH_CHECK((ret) == NVML_SUCCESS);               \
+  } while (0)
+
+namespace fbgemm_gpu {
+AdjacencyMatrix<Links> get_nvlink_matrix() {
+  auto world_size = at::cuda::getNumGPUs();
+  NVML_CHECK(nvmlInit());
+
+  // Note that NVML uses a different numbering method to CUDA runtime,
+  // so we need to learn the mapping by using the bus ID.
+  uint32_t device_count;
+  NVML_CHECK(nvmlDeviceGetCount(&device_count));
+
+  std::map<std::array<char, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE>, Node>
+      pci_bus_ids;
+  std::unordered_map<Node, uint32_t> cuda_device_to_nvml_device;
+
+  for (const auto i : c10::irange(device_count)) {
+    nvmlDevice_t handle;
+    NVML_CHECK(nvmlDeviceGetHandleByIndex(i, &handle));
+    nvmlPciInfo_t pci_info;
+    NVML_CHECK(nvmlDeviceGetPciInfo(handle, &pci_info));
+    std::array<char, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id;
+    std::copy(
+        &pci_info.busId[0],
+        &pci_info.busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
+        pci_bus_id.data());
+    int32_t node = 0;
+    auto err = cudaDeviceGetByPCIBusId(&node, pci_bus_id.data());
+    if (err == cudaSuccess) {
+      pci_bus_ids.insert({pci_bus_id, node});
+      cuda_device_to_nvml_device.insert({node, i});
+    } else {
+      // flush the last error - this can occur when e.g. we set
+      // CUDA_VISIBLE_DEVICES to a subset of the available GPUs in the system.
+      cudaGetLastError();
+    }
+  }
+
+  std::vector<Links> links(world_size * world_size);
+  for (const auto i : c10::irange(world_size)) {
+    nvmlDevice_t handle;
+    NVML_CHECK(
+        nvmlDeviceGetHandleByIndex(cuda_device_to_nvml_device[i], &handle));
+    for (const auto link : c10::irange(NVML_NVLINK_MAX_LINKS)) {
+      nvmlEnableState_t is_active;
+      auto nvmlRet = nvmlDeviceGetNvLinkState(handle, link, &is_active);
+      if (nvmlRet == NVML_ERROR_INVALID_ARGUMENT ||
+          nvmlRet == NVML_ERROR_NOT_SUPPORTED) {
+        continue;
+      }
+      if (is_active != NVML_FEATURE_ENABLED) {
+        continue;
+      }
+      nvmlPciInfo_t pci_info;
+      NVML_CHECK(nvmlDeviceGetNvLinkRemotePciInfo(handle, link, &pci_info));
+      std::array<char, NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE> pci_bus_id;
+      std::copy(
+          &pci_info.busId[0],
+          &pci_info.busId[NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE],
+          pci_bus_id.data());
+      auto dst = pci_bus_ids.find(pci_bus_id);
+      if (dst != pci_bus_ids.end()) {
+        auto j = dst->second;
+        links[i * world_size + j] += 1;
+      }
+    }
+  }
+
+  return [=](Node i, Node j) {
+    TORCH_CHECK(i < world_size);
+    TORCH_CHECK(j < world_size);
+    return links[i * world_size + j];
+  };
+}
+} // namespace fbgemm_gpu
+
+#endif // __HIP_PLATFORM_HCC__
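
Illustrative usage (not part of the patch): the sketch below is a minimal, hypothetical consumer of the AdjacencyMatrix<Links> closure returned by fbgemm_gpu::get_nvlink_matrix(), dumping the per-pair link counts for the visible GPUs. The standalone main(), the iostream include, and the assumption that the fbgemm_gpu headers plus NVML (or ROCm-SMI) are available at build/link time are additions for the example only and are not introduced by this diff.

// Hypothetical example only -- not part of this patch.
#include <iostream>

#include <ATen/cuda/CUDAContext.h>

#include "fbgemm_gpu/topology_utils.h"

int main() {
  // Query the topology once; the returned closure captures the
  // link-count matrix by value, so it stays valid after this call.
  auto links = fbgemm_gpu::get_nvlink_matrix();
  const auto world_size = at::cuda::getNumGPUs();

  // Print links(src, dst) for every device pair. A zero off-diagonal
  // entry means no direct NVLink / ROCm P2P connection was detected;
  // this is the matrix that all_to_one() feeds into
  // get_intermediate_node() when it decides how to route copies.
  for (Node src = 0; src < world_size; ++src) {
    for (Node dst = 0; dst < world_size; ++dst) {
      std::cout << links(src, dst) << (dst + 1 == world_size ? '\n' : ' ');
    }
  }
  return 0;
}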