Distribute fixed width data (#3240)
* Send fixed-width topology, not AdjacencyList

* Count owned size

* Improve docs: make shape explicit

* Fix typo
chrisrichardson authored Jun 2, 2024
1 parent 73238da commit bd5ba2c
Showing 3 changed files with 209 additions and 18 deletions.
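
The new overload added in cpp/dolfinx/graph/partition.h takes a flattened row-major array together with its shape instead of an AdjacencyList. A minimal calling sketch (the row data, shape, destination choice and include paths below are illustrative assumptions, not part of the commit):

#include <array>
#include <cstdint>
#include <vector>
#include <dolfinx/common/MPI.h>
#include <dolfinx/graph/AdjacencyList.h>
#include <dolfinx/graph/partition.h>

// Distribute two rows of width 3; each row is owned by (and stays on) the
// calling rank, so no ghost rows are produced.
void example_distribute(MPI_Comm comm)
{
  const int rank = dolfinx::MPI::rank(comm);

  // Flattened 2 x 3 row-major array (arbitrary values)
  std::vector<std::int64_t> list = {0, 1, 2, 3, 4, 5};
  std::array<std::size_t, 2> shape = {2, 3};

  // One destination per row; the first (here, the only) rank is the owner
  auto dest = dolfinx::graph::regular_adjacency_list(
      std::vector<std::int32_t>{rank, rank}, 1);

  // data: received rows (owned first, then ghosts), flattened row-major
  // global_indices: original global index of each received row
  // ghost_owners: owning rank of each ghost row (empty in this sketch)
  auto [data, global_indices, ghost_owners]
      = dolfinx::graph::build::distribute(comm, list, shape, dest);
}
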
167 changes: 167 additions & 0 deletions cpp/dolfinx/graph/partition.cpp
@@ -224,6 +224,173 @@ graph::build::distribute(MPI_Comm comm,
global_indices0, ghost_index_owner};
}
//-----------------------------------------------------------------------------
std::tuple<std::vector<std::int64_t>, std::vector<std::int64_t>,
std::vector<int>>
graph::build::distribute(MPI_Comm comm, std::span<const std::int64_t> list,
std::array<std::size_t, 2> shape,
const graph::AdjacencyList<std::int32_t>& destinations)
{
common::Timer timer("Distribute fixed size nodes to destination ranks");

assert(list.size() == shape[0] * shape[1]);
assert(destinations.num_nodes() == (std::int32_t)shape[0]);
int rank = dolfinx::MPI::rank(comm);
std::int64_t num_owned = destinations.num_nodes();

// Get global offset for converting local index to global index for
// nodes in 'list'
std::int64_t offset_global = 0;
MPI_Exscan(&num_owned, &offset_global, 1, MPI_INT64_T, MPI_SUM, comm);

// Buffer size per node: row width (shape[1]), plus 2 entries for the
// owning rank and the node global index
const std::size_t buffer_shape1 = shape[1] + 2;

// Build (dest, index, owning rank) list and sort
std::vector<std::array<int, 3>> dest_to_index;
dest_to_index.reserve(destinations.array().size());
for (std::int32_t i = 0; i < destinations.num_nodes(); ++i)
{
auto di = destinations.links(i);
for (auto d : di)
dest_to_index.push_back({d, i, di[0]});
}
std::sort(dest_to_index.begin(), dest_to_index.end());

// Build list of unique dest ranks and count number of rows to send to
// each dest (by neighbourhood rank)
std::vector<int> dest;
std::vector<std::int32_t> num_items_per_dest;
{
auto it = dest_to_index.begin();
while (it != dest_to_index.end())
{
// Store global rank and find iterator to next global rank
dest.push_back((*it)[0]);
auto it1
= std::find_if(it, dest_to_index.end(),
[r = dest.back()](auto& idx) { return idx[0] != r; });

// Store number of items for current rank
num_items_per_dest.push_back(std::distance(it, it1));

// Advance iterator
it = it1;
}
}

// Determine source ranks. Sort ranks to make distribution
// deterministic.
std::vector<int> src = dolfinx::MPI::compute_graph_edges_nbx(comm, dest);
std::sort(src.begin(), src.end());

// Create neighbourhood communicator
MPI_Comm neigh_comm;
MPI_Dist_graph_create_adjacent(comm, src.size(), src.data(), MPI_UNWEIGHTED,
dest.size(), dest.data(), MPI_UNWEIGHTED,
MPI_INFO_NULL, false, &neigh_comm);

// Send number of nodes to receivers
std::vector<int> num_items_recv(src.size());
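// reserve(1) is presumably to ensure that data() is not a null pointer
// when this rank has no sources or destinations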
num_items_per_dest.reserve(1);
num_items_recv.reserve(1);
MPI_Request request_size;
MPI_Ineighbor_alltoall(num_items_per_dest.data(), 1, MPI_INT,
num_items_recv.data(), 1, MPI_INT, neigh_comm,
&request_size);

// Compute send displacements
std::vector<std::int32_t> send_disp(num_items_per_dest.size() + 1, 0);
std::partial_sum(num_items_per_dest.begin(), num_items_per_dest.end(),
std::next(send_disp.begin()));

// Pack send buffer
std::vector<std::int64_t> send_buffer(buffer_shape1 * send_disp.back(), -1);
{
assert(send_disp.back() == (std::int32_t)dest_to_index.size());
for (std::size_t i = 0; i < dest_to_index.size(); ++i)
{
const std::array<int, 3>& dest_data = dest_to_index[i];
const std::size_t pos = dest_data[1];

std::span b(send_buffer.data() + i * buffer_shape1, buffer_shape1);
std::span row(list.data() + pos * shape[1], shape[1]);
std::copy(row.begin(), row.end(), b.begin());

auto info = b.last(2);
info[0] = dest_data[2]; // Owning rank
info[1] = pos + offset_global; // Original global index
}
}

// Prepare receive displacement
MPI_Wait(&request_size, MPI_STATUS_IGNORE);
std::vector<std::int32_t> recv_disp(num_items_recv.size() + 1, 0);
std::partial_sum(num_items_recv.begin(), num_items_recv.end(),
std::next(recv_disp.begin()));

// Send/receive data
MPI_Datatype compound_type;
MPI_Type_contiguous(buffer_shape1, MPI_INT64_T, &compound_type);
MPI_Type_commit(&compound_type);
std::vector<std::int64_t> recv_buffer(buffer_shape1 * recv_disp.back());
MPI_Neighbor_alltoallv(send_buffer.data(), num_items_per_dest.data(),
send_disp.data(), compound_type, recv_buffer.data(),
num_items_recv.data(), recv_disp.data(), compound_type,
neigh_comm);
MPI_Type_free(&compound_type);
MPI_Comm_free(&neigh_comm);

spdlog::debug("Received {} data on {} [{}]", recv_disp.back(), rank,
shape[1]);

// Count number of owned entries
std::int32_t num_owned_r = 0;
for (std::int32_t i = 0; i < recv_disp.back(); ++i)
{
std::span row(recv_buffer.data() + i * buffer_shape1, buffer_shape1);
auto info = row.last(2);
int owner = info[0];
if (owner == rank)
num_owned_r++;
}

// Unpack receive buffer
std::vector<std::int64_t> data(shape[1] * recv_disp.back());
std::vector<std::int64_t> global_indices(recv_disp.back());
std::vector<int> ghost_index_owner(recv_disp.back() - num_owned_r);

std::int32_t i_owned = 0;
std::int32_t i_ghost = 0;
for (std::int32_t i = 0; i < recv_disp.back(); ++i)
{
std::span row(recv_buffer.data() + i * buffer_shape1, buffer_shape1);
auto info = row.last(2);
int owner = info[0];
std::int64_t orig_global_index = info[1];
auto edges = row.first(shape[1]);
if (owner == rank)
{
std::copy(edges.begin(), edges.end(),
std::next(data.begin(), i_owned * shape[1]));
global_indices[i_owned] = orig_global_index;
++i_owned;
}
else
{
std::copy(edges.begin(), edges.end(),
std::next(data.begin(), (i_ghost + num_owned_r) * shape[1]));
global_indices[i_ghost + num_owned_r] = orig_global_index;
ghost_index_owner[i_ghost] = owner;
++i_ghost;
}
}
assert(i_owned == num_owned_r);

spdlog::debug("data.size = {}", data.size());
return {data, global_indices, ghost_index_owner};
}
//-----------------------------------------------------------------------------
std::vector<std::int64_t>
graph::build::compute_ghost_indices(MPI_Comm comm,
std::span<const std::int64_t> owned_indices,
20 changes: 20 additions & 0 deletions cpp/dolfinx/graph/partition.h
@@ -69,6 +69,26 @@ std::tuple<graph::AdjacencyList<std::int64_t>, std::vector<int>,
distribute(MPI_Comm comm, const graph::AdjacencyList<std::int64_t>& list,
const graph::AdjacencyList<std::int32_t>& destinations);

/// @brief Distribute fixed-size nodes to destination ranks.
///
/// The global index of each node is assumed to be the local index plus
/// the offset for this rank.
///
/// @param[in] comm MPI communicator.
/// @param[in] list A flattened 2D row-major array.
/// @param[in] shape The shape of the array, i.e. (number of nodes, row
/// width).
/// @param[in] destinations Destination ranks for the ith row of the
/// array. The first rank is the 'owner' of the node.
/// @return
/// 1. Received rows for this process (owned rows first, followed by
///    ghost rows), flattened row-major.
/// 2. Original global index of each received node.
/// 3. Owner rank of each ghost node.
std::tuple<std::vector<std::int64_t>, std::vector<std::int64_t>,
std::vector<int>>
distribute(MPI_Comm comm, std::span<const std::int64_t> list,
std::array<std::size_t, 2> shape,
const graph::AdjacencyList<std::int32_t>& destinations);

/// @brief Take a set of distributed input global indices, including
/// ghosts, and determine the new global indices after remapping.
///
40 changes: 22 additions & 18 deletions cpp/dolfinx/mesh/utils.h
@@ -763,7 +763,7 @@ Mesh<typename std::remove_reference_t<typename U::value_type>> create_mesh(
const fem::ElementDofLayout doflayout = element.create_dof_layout();

const int num_cell_vertices = mesh::num_cell_vertices(element.cell_shape());
const int num_cell_nodes = doflayout.num_dofs();
std::size_t num_cell_nodes = doflayout.num_dofs();

// Note: `extract_topology` extracts topology data, i.e. just the
// vertices. For P1 geometry this should just be the identity
@@ -773,12 +773,12 @@ Mesh<typename std::remove_reference_t<typename U::value_type>> create_mesh(
// `extract_topology` could be skipped for 'P1 geometry' elements

// -- Partition topology across ranks of comm
graph::AdjacencyList<std::int64_t> cells1(0);
// std::vector<std::int64_t> cells1;
std::vector<std::int64_t> cells1;
std::vector<std::int64_t> original_idx1;
std::vector<int> ghost_owners;
if (partitioner)
{
spdlog::info("Using partitioner with {} cell data", cells.size());
graph::AdjacencyList<std::int32_t> dest(0);
if (commt != MPI_COMM_NULL)
{
@@ -789,26 +789,29 @@ Mesh<typename std::remove_reference_t<typename U::value_type>> create_mesh(

// Distribute cells (topology, includes higher-order 'nodes') to
// destination rank
std::vector<int> src;
auto _cells = graph::regular_adjacency_list(
std::vector(cells.begin(), cells.end()), num_cell_nodes);
std::tie(cells1, src, original_idx1, ghost_owners)
= graph::build::distribute(comm, _cells, dest);
assert(cells.size() % num_cell_nodes == 0);
std::size_t num_cells = cells.size() / num_cell_nodes;
std::tie(cells1, original_idx1, ghost_owners) = graph::build::distribute(
comm, cells, {num_cells, num_cell_nodes}, dest);
spdlog::debug("Got {} cells from distribution", cells1.size());
}
else
{
cells1 = graph::regular_adjacency_list(
std::vector(cells.begin(), cells.end()), num_cell_nodes);
std::int64_t offset(0), num_owned(cells1.num_nodes());
cells1 = std::vector<std::int64_t>(cells.begin(), cells.end());
assert(cells1.size() % num_cell_nodes == 0);
std::int64_t offset = 0;
std::int64_t num_owned = cells1.size() / num_cell_nodes;
MPI_Exscan(&num_owned, &offset, 1, MPI_INT64_T, MPI_SUM, comm);
original_idx1.resize(cells1.num_nodes());
original_idx1.resize(num_owned);
std::iota(original_idx1.begin(), original_idx1.end(), offset);
}

// Extract cell 'topology', i.e. extract the vertices for each cell
// and discard any 'higher-order' nodes
std::vector<std::int64_t> cells1_v
= extract_topology(celltype, doflayout, cells1.array());
= extract_topology(celltype, doflayout, cells1);
spdlog::info("Extract basic topology: {}->{}", cells1.size(),
cells1_v.size());

// Build local dual graph for owned cells to (i) get list of vertices
// on the process boundary and (ii) apply re-ordering to cells for
@@ -820,6 +823,7 @@ Mesh<typename std::remove_reference_t<typename U::value_type>> create_mesh(
std::vector<std::int32_t> cell_offsets(num_owned_cells + 1, 0);
for (std::size_t i = 1; i < cell_offsets.size(); ++i)
cell_offsets[i] = cell_offsets[i - 1] + num_cell_vertices;
spdlog::info("Build local dual graph");
auto [graph, unmatched_facets, max_v, facet_attached_cells]
= build_local_dual_graph(
std::vector{celltype},
@@ -835,8 +839,8 @@ Mesh<typename std::remove_reference_t<typename U::value_type>> create_mesh(
std::next(_original_idx.begin(), num_owned_cells));
impl::reorder_list(
std::span(cells1_v.data(), remap.size() * num_cell_vertices), remap);
impl::reorder_list(
std::span(cells1.array().data(), remap.size() * num_cell_nodes), remap);
impl::reorder_list(std::span(cells1.data(), remap.size() * num_cell_nodes),
remap);
original_idx1 = _original_idx;

// Boundary vertices are marked as 'unknown'
@@ -865,15 +869,15 @@ Mesh<typename std::remove_reference_t<typename U::value_type>> create_mesh(

// Build list of unique (global) node indices from cells1 and
// distribute coordinate data
std::vector<std::int64_t> nodes1 = cells1.array();
std::vector<std::int64_t> nodes1 = cells1;
dolfinx::radix_sort(std::span(nodes1));
nodes1.erase(std::unique(nodes1.begin(), nodes1.end()), nodes1.end());
std::vector coords
= dolfinx::MPI::distribute_data(comm, nodes1, commg, x, xshape[1]);

// Create geometry object
Geometry geometry = create_geometry(topology, element, nodes1, cells1.array(),
coords, xshape[1]);
Geometry geometry
= create_geometry(topology, element, nodes1, cells1, coords, xshape[1]);

return Mesh(comm, std::make_shared<Topology>(std::move(topology)),
std::move(geometry));
