From 94ccab3b6fb589601a9835d3e8a38bc174d13ca8 Mon Sep 17 00:00:00 2001
From: abkein <rickbatra0z@gmail.com>
Date: Tue, 3 Dec 2024 15:08:17 +0300
Subject: [PATCH] Agrhhhhh

---
 src/compute_cluster_size_ext.cpp |  62 +++--
 src/compute_cluster_size_ext.h   |  53 ++--
 src/compute_cluster_volume.cpp   | 458 ++++++++++++++++++++++---------
 src/compute_cluster_volume.h     |  45 ++-
 4 files changed, 449 insertions(+), 169 deletions(-)
diff --git a/src/compute_cluster_size_ext.cpp b/src/compute_cluster_size_ext.cpp
index 7c5feebb569..7a3a8ebd560 100644
--- a/src/compute_cluster_size_ext.cpp
+++ b/src/compute_cluster_size_ext.cpp
@@ -61,13 +61,13 @@ ComputeClusterSizeExt::ComputeClusterSizeExt(LAMMPS *lmp, int narg, char **arg)
     error->all(FLERR, "size_cutoff for compute cluster/size must be greater than 0");
   }
 
-  // alloc = new CustomAllocator<std::pair<const int, cluster_ptr>>(1024, memory, keeper);
-  // cluster_map =
-  //   new std::u
-  // keeper = new MemoryKeeper(memory);
-  // alloc = new CustomAllocator<std::pair<const int, cluster_ptr>>(1024, memory, keeper);
-  // cluster_map =
-  //   new std::unordered_map<int, cluster_ptr, std::hash<int>, std::equal_to<int>, CustomAllocator<std::pair<const int, cluster_ptr>>>(*alloc);
+  keeper = new MemoryKeeper(memory);
+
+  alloc = new CustomAllocator<std::pair<const int, int>>(memory, keeper);
+  cluster_map = new Cluster_map_t(*alloc);
+
+  alloc_vector = new Allocator_map_vector(memory, keeper);
+  cIDs_by_size = new Sizes_map_t(*alloc_vector);
 
   size_vector = size_cutoff + 1;
 }
@@ -83,10 +83,14 @@ ComputeClusterSizeExt::~ComputeClusterSizeExt() noexcept(true)
   if (clusters != nullptr) { memory->destroy(clusters); }
   if (ns != nullptr) { memory->destroy(ns); }
   if (gathered != nullptr) { memory->destroy(gathered); }
+  if (monomers != nullptr) { memory->destroy(monomers); }
 
-  // delete cluster_map;
-  // delete alloc;
-  // delete keeper;
+  delete cIDs_by_size;
+  delete alloc_vector;
+
+  delete cluster_map;
+  delete alloc;
+  delete keeper;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -105,13 +109,17 @@ void ComputeClusterSizeExt::init()
   vector = dist;
 
   nloc = static_cast<int>(atom->nlocal * LMP_NUCC_ALLOC_COEFF);
-  cluster_map.reserve(nloc);
-  // cluster_map->reserve(nloc);
-  // alloc->pool_size_ = nloc;
+  // cluster_map.reserve(nloc);
+  cluster_map->reserve(nloc);
+  cIDs_by_size->reserve(nloc);
+  keeper->pool_size(nloc);
+
   memory->create(clusters, nloc, "clusters");
   memory->create(ns, 2 * nloc, "ns");
+  memory->create(monomers, nloc, "monomers");
 
-  natom_loc = static_cast<int>(2 * atom->natoms * LMP_NUCC_ALLOC_COEFF);
+  natom_loc =
+      static_cast<bigint>(static_cast<long double>(2 * atom->natoms) * LMP_NUCC_ALLOC_COEFF);
   memory->create(gathered, natom_loc, "gathered");
 
   initialized_flag = 1;
@@ -148,7 +156,7 @@ void ComputeClusterSizeExt::compute_vector()
 {
   invoked_vector = update->ntimestep;
 
-  auto &cmap = cluster_map;
+  auto &cmap = *cluster_map;
 
   if (compute_cluster_atom->invoked_peratom != update->ntimestep) {
     compute_cluster_atom->compute_peratom();
@@ -163,10 +171,13 @@ void ComputeClusterSizeExt::compute_vector()
 
   if (atom->nlocal > nloc) {
     nloc = static_cast<int>(atom->nlocal * LMP_NUCC_ALLOC_COEFF);
+    keeper->pool_size(nloc);
     cmap.reserve(nloc);
-    // alloc->pool_size_ = nloc;
+    cIDs_by_size->reserve(nloc);
+
     memory->grow(clusters, nloc, "clusters");
     memory->grow(ns, 2 * nloc, "ns");
+    memory->grow(monomers, nloc, "monomers");
   }
 
   // Sort atom IDs by cluster IDs
@@ -178,7 +189,7 @@ void ComputeClusterSizeExt::compute_vector()
         cmap[clid] = clidx;
         ns[clidx] = clid;
         ns[clidx + 1] = 0;
-        clusters[clidx] = {0, 0, -1, 0, 0, 0};
+        clusters[clidx] = cluster_data(clid);
       }
       // possible segfault if actual cluster size exceeds LMP_NUCC_CLUSTER_MAX_SIZE + LMP_NUCC_CLUSTER_MAX_GHOST
       const int clidx = cmap[clid];
@@ -189,7 +200,7 @@ void ComputeClusterSizeExt::compute_vector()
   // add ghost atoms
   for (int i = atom->nlocal; i < atom->nmax; ++i) {
     if ((atom->mask[i] & groupbit) != 0) {
-      int const clid = static_cast<int>(cluster_ids[i]);
+      const auto clid = static_cast<bigint>(cluster_ids[i]);
       if (cmap.count(clid) > 0) {
         cluster_data &clstr = clusters[cmap[clid]];
         // also possible segfault if number of ghost exceeds LMP_NUCC_CLUSTER_MAX_GHOST
@@ -222,7 +233,7 @@ void ComputeClusterSizeExt::compute_vector()
       int const k = displs[i] + j;
       if (cmap.count(gathered[k]) > 0) {
         cluster_data &clstr = clusters[cmap[gathered[k]]];
-        clstr.owners[clstr.nowners++] = i;
+        if (i != comm->me) { clstr.owners[clstr.nowners++] = i; }
         clstr.g_size += gathered[k + 1];
         if (gathered[k + 1] > clstr.nhost) {
           clstr.host = i;
@@ -233,11 +244,22 @@ void ComputeClusterSizeExt::compute_vector()
   }
 
   // adjust local data and fill local size distribution
+  nonexclusive = 0;
+  nmono = 0;
   for (auto &[clid, clidx] : cmap) {
     cluster_data &clstr = clusters[clidx];
     clstr.l_size = ns[static_cast<ptrdiff_t>(2) * clidx];
     ::memcpy(clstr.atoms + clstr.l_size, clstr.ghost, clstr.nghost * sizeof(int));
-    if ((clstr.host < 0) && (clstr.g_size < size_cutoff)) { dist_local[clstr.g_size] += 1; }
+    if (clstr.host == comm->me) {
+      if (clstr.g_size < size_cutoff) { dist_local[clstr.g_size] += 1; }
+      if (clstr.g_size == 1) {
+        monomers[nmono++];
+      } else {
+        (*cIDs_by_size)[clstr.g_size].push_back(clidx);
+      }
+    } else {
+      ++nonexclusive;
+    }
   }
 
   ::MPI_Allreduce(dist_local, dist, size_vector, MPI_DOUBLE, MPI_SUM, world);
diff --git a/src/compute_cluster_size_ext.h b/src/compute_cluster_size_ext.h
index 11e21ae1e27..7aa20d2bb64 100644
--- a/src/compute_cluster_size_ext.h
+++ b/src/compute_cluster_size_ext.h
@@ -24,24 +24,38 @@ ComputeStyle(cluster/size/ext,ComputeClusterSizeExt);
 #define LMP_NUCC_CLUSTER_MAX_OWNERS 128
 #define LMP_NUCC_CLUSTER_MAX_SIZE 300
 #define LMP_NUCC_CLUSTER_MAX_GHOST 30
-#include "compute.h"
 
+#include "compute.h"
+#include "nucc_allocator.hpp"
+#include <scoped_allocator>
 #include <unordered_map>
+#include <vector>
 
 namespace LAMMPS_NS {
 
 class ComputeClusterVolume;
+using Cluster_map_t =
+    std::unordered_map<int, int, std::hash<int>, std::equal_to<int>,
+                       std::scoped_allocator_adaptor<CustomAllocator<std::pair<const int, int>>>>;
+using Allocator_map_vector =
+    CustomAllocator<std::pair<const int, std::vector<int, CustomAllocator<int>>>>;
+using Sizes_map_t =
+    std::unordered_map<int, std::vector<int, CustomAllocator<int>>, std::hash<int>,
+                       std::equal_to<int>, std::scoped_allocator_adaptor<Allocator_map_vector>>;
 
 struct cluster_data {
-  int l_size = 0;     // local size
-  int g_size = 0;     // global size
-  int host = -1;      // host proc (me if <0)
-  int nhost = 0;      // local cluster size of host proc
-  int nowners = 0;    // number of owners
-  int nghost = 0;     // number of ghost atoms in cluster
-  int owners[LMP_NUCC_CLUSTER_MAX_OWNERS];
-  int atoms[LMP_NUCC_CLUSTER_MAX_SIZE];     // local ids of atoms
-  int ghost[LMP_NUCC_CLUSTER_MAX_GHOST];    // local ids of ghost atoms
+  cluster_data(const bigint clid = 0) : clid(clid) {}
+
+  bigint clid = 0;                            // cluster ID
+  int l_size = 0;                             // local size
+  int g_size = 0;                             // global size
+  int host = -1;                              // host proc (me if <0)
+  int nhost = 0;                              // local cluster size of host proc
+  int nowners = 0;                            // number of owners
+  int nghost = 0;                             // number of ghost atoms in cluster
+  int owners[LMP_NUCC_CLUSTER_MAX_OWNERS];    // procs owning some cluster's atoms
+  int atoms[LMP_NUCC_CLUSTER_MAX_SIZE];       // local ids of atoms
+  int ghost[LMP_NUCC_CLUSTER_MAX_GHOST];      // local ids of ghost atoms
 };
 
 class ComputeClusterSizeExt : public Compute {
@@ -53,15 +67,18 @@ class ComputeClusterSizeExt : public Compute {
   void compute_vector() override;
   double memory_usage() override;
 
-  inline int get_size_cutoff() const noexcept(true) { return size_cutoff; }
+  int get_size_cutoff() const noexcept(true) { return size_cutoff; }
 
  private:
   int size_cutoff;    // number of elements reserved in dist
 
-  // MemoryKeeper* keeper;
-  // CustomAllocator<std::pair<const int, cluster_ptr>>* alloc;
-  // std::unordered_map<int, cluster_ptr, std::hash<int>, std::equal_to<int>, CustomAllocator<std::pair<const int, cluster_ptr>>>* cluster_map;
-  std::unordered_map<int, int> cluster_map;
+  MemoryKeeper *keeper;
+  CustomAllocator<std::pair<const int, int>> *alloc;
+  Cluster_map_t *cluster_map;
+  // std::unordered_map<bigint, int> cluster_map; // clid -> idx
+  Allocator_map_vector *alloc_vector;
+  Sizes_map_t *cIDs_by_size;
+  // std::unordered_map<int, std::vector<int>> cIDs_by_size; // size -> vector(idx)
 
   int nloc;                // number of reserved elements in atoms_by_cID and cIDs_by_size
   double *dist;            // cluster size distribution (vector == dist)
@@ -72,7 +89,11 @@ class ComputeClusterSizeExt : public Compute {
   cluster_data *clusters{};
   int *ns{};
   int *gathered{};
-  int natom_loc;
+  bigint natom_loc;
+  int nonexclusive;
+
+  int *monomers;
+  int nmono;
 
   Compute *compute_cluster_atom = nullptr;
   // void test_allocator() const;
diff --git a/src/compute_cluster_volume.cpp b/src/compute_cluster_volume.cpp
index cf7e82ea837..6ff3395eaea 100644
--- a/src/compute_cluster_volume.cpp
+++ b/src/compute_cluster_volume.cpp
@@ -14,6 +14,7 @@
 #include "compute_cluster_volume.h"
 #include <cmath>
 
+#include "atom.h"
 #include "comm.h"
 #include "domain.h"
 #include "error.h"
@@ -29,7 +30,7 @@ using namespace LAMMPS_NS;
 /* ---------------------------------------------------------------------- */
 
 ComputeClusterVolume::ComputeClusterVolume(LAMMPS *lmp, int narg, char **arg) :
-    Compute(lmp, narg, arg)
+    Compute(lmp, narg, arg), nloc(0), n_cells(0), nloc_grid(0), noff(0), nloc_recv(0)
 {
   vector_flag = 1;
   size_vector = 0;
@@ -72,21 +73,25 @@ ComputeClusterVolume::ComputeClusterVolume(LAMMPS *lmp, int narg, char **arg) :
     }
   }
 
-  size_local_rows = size_cutoff + 1;
-  memory->create(local_kes, size_local_rows + 1, "compute:cluster/ke:local_kes");
-  vector_local = local_kes;
+  precompute = false;
+  overlap_sq = overlap * overlap;
+  n_cells = static_cast<int>(overlap / voxel_size) + 1;    // should I place +1 here?
 
+  size_local_rows = size_cutoff + 1;
   size_vector = size_cutoff + 1;
-  memory->create(kes, size_vector + 1, "compute:cluster/ke:kes");
-  vector = kes;
 }
 
 /* ---------------------------------------------------------------------- */
 
 ComputeClusterVolume::~ComputeClusterVolume() noexcept(true)
 {
-  if (local_kes != nullptr) { memory->destroy(local_kes); }
-  if (kes != nullptr) { memory->destroy(kes); }
+  if (volumes != nullptr) { memory->destroy(volumes); }
+  if (dist != nullptr) { memory->destroy(dist); }
+  if (dist_local != nullptr) { memory->destroy(dist_local); }
+  if (occupancy_grid != nullptr) { memory->destroy(occupancy_grid); }
+  if (offsets != nullptr) { memory->destroy(offsets); }
+  if (bboxes != nullptr) { memory->destroy(bboxes); }
+  if (recv_buf != nullptr) { memory->destroy(recv_buf); }
 }
 
 /* ---------------------------------------------------------------------- */
@@ -96,6 +101,47 @@ void ComputeClusterVolume::init()
   if ((modify->get_compute_by_style(style).size() > 1) && (comm->me == 0)) {
     error->warning(FLERR, "More than one compute {}", style);
   }
+
+  subbonds[0 + 0] = domain->sublo[0];
+  subbonds[0 + 1] = domain->subhi[0];
+  subbonds[2 + 0] = domain->sublo[1];
+  subbonds[2 + 1] = domain->subhi[1];
+  subbonds[4 + 0] = domain->sublo[2];
+  subbonds[4 + 1] = domain->subhi[2];
+
+  memory->create(dist_local, size_local_rows + 1, "compute:cluster/ke:local_kes");
+  vector_local = dist_local;
+  memory->create(dist, size_vector + 1, "compute:cluster/ke:kes");
+  vector = dist;
+
+  nloc = static_cast<int>(atom->nlocal * LMP_NUCC_ALLOC_COEFF);
+  memory->create(volumes, nloc, "ComputeClusterVolume:volumes");
+  if (mode == VOLUMEMODE::RECTANGLE) {
+    memory->grow(bboxes, 6 * nloc, "ComputeClusterVolume:bboxes");
+  }
+
+  if (precompute) {
+    int radius_voxels_int = static_cast<int>(std::ceil(overlap / voxel_size));
+    noff =
+        3 * 2 * (radius_voxels_int + 1) * 2 * (radius_voxels_int + 1) * 2 * (radius_voxels_int + 1);
+    memory->create(offsets, noff, "ComputeClusterVolume:offsets");
+
+    double voxel_size_sq = voxel_size * voxel_size;
+    noffsets = 0;
+    for (int dx = -radius_voxels_int; dx <= radius_voxels_int; ++dx) {
+      for (int dy = -radius_voxels_int; dy <= radius_voxels_int; ++dy) {
+        for (int dz = -radius_voxels_int; dz <= radius_voxels_int; ++dz) {
+          double dist_sq = dx * dx + dy * dy + dz * dz;
+          if (dist_sq * voxel_size_sq <= overlap_sq) {
+            offsets[noffsets + 0] = dx;
+            offsets[noffsets + 1] = dy;
+            offsets[noffsets + 2] = dz;
+            noffsets += 3;
+          }
+        }
+      }
+    }
+  }
 }
 
 /* ---------------------------------------------------------------------- */
@@ -105,14 +151,21 @@ void ComputeClusterVolume::compute_vector()
   invoked_vector = update->ntimestep;
 
   compute_local();
+  ::memset(dist, 0.0, size_vector * sizeof(double));
+  ::MPI_Allreduce(dist_local, dist, size_vector, MPI_DOUBLE, MPI_SUM, world);
+  for (int i = 0; i < size_vector; ++i) { dist[i] /= comm->nprocs; }
+}
 
-  ::memset(kes, 0.0, size_vector * sizeof(double));
-  ::MPI_Allreduce(local_kes, kes, size_vector, MPI_DOUBLE, MPI_SUM, world);
+/* ---------------------------------------------------------------------- */
 
-  const double *dist = compute_cluster_size->vector;
-  for (int i = 0; i < size_vector; ++i) {
-    if (dist[i] > 0) { kes[i] /= dist[i]; }
-  }
+inline void maximize_bbox(const double *const src, double *const dest) noexcept
+{
+  if (src[0] < dest[0]) { dest[0] = src[0]; }
+  if (src[1] < dest[1]) { dest[1] = src[1]; }
+  if (src[2] < dest[2]) { dest[2] = src[2]; }
+  if (src[3] > dest[3]) { dest[3] = src[3]; }
+  if (src[4] > dest[4]) { dest[4] = src[4]; }
+  if (src[5] > dest[5]) { dest[5] = src[5]; }
 }
 
 /* ---------------------------------------------------------------------- */
@@ -125,75 +178,223 @@ void ComputeClusterVolume::compute_local()
     compute_cluster_size->compute_vector();
   }
 
+  if (nloc < atom->nlocal) {
+    nloc = static_cast<int>(atom->nlocal * LMP_NUCC_ALLOC_COEFF);
+    memory->grow(volumes, nloc, "ComputeClusterVolume:volumes");
+
+    if (mode == VOLUMEMODE::RECTANGLE) {
+      memory->grow(bboxes, 6 * nloc, "ComputeClusterVolume:bboxes");
+    }
+  }
+
+  if (nloc_recv < compute_cluster_size->nonexclusive) {
+    nloc_recv = static_cast<bigint>(compute_cluster_size->nonexclusive * LMP_NUCC_ALLOC_COEFF);
+    memory->grow(in_reqs, comm->nprocs * nloc_recv, "ComputeClusterVolume:in_reqs");
+    memory->grow(out_reqs, comm->nprocs * nloc_recv, "ComputeClusterVolume:out_reqs");
+
+    if (mode == VOLUMEMODE::RECTANGLE) {
+      memory->grow(recv_buf, comm->nprocs * nloc_recv, "ComputeClusterVolume:recv_buf");
+    } else if (mode == VOLUMEMODE::RECTANGLE) {
+      memory->grow(recv_buf, 6 * comm->nprocs * nloc_recv, "ComputeClusterVolume:recv_buf");
+    }
+  }
+
+  int to_send = 0;
+  int to_receive = 0;
+  Sizes_map_t &cmap = *(compute_cluster_size->cIDs_by_size);
+  if (mode == VOLUMEMODE::CALC) {
+    for (const auto [size, clidxs] : cmap) {
+      for (auto clidx : clidxs) {
+        const cluster_data &clstr = compute_cluster_size->clusters[clidx];
+        bool nonexclusive = clstr.nowners > 0;
+        volumes[clidx] =
+            occupied_volume_precomputed(clstr.atoms, clstr.l_size, clstr.nghost, nonexclusive);
+        if (nonexclusive) {
+          if (clstr.host != comm->me) {
+            ::MPI_Send_init(volumes + clidx, 1, MPI_DOUBLE, clstr.host, clstr.clid, world,
+                            out_reqs + to_send);
+            ++to_send;
+          } else {
+            for (int i = 0; i < clstr.nowners; ++i) {
+              ::MPI_Recv_init(recv_buf + to_receive, 1, MPI_DOUBLE, clstr.owners[i], clstr.clid,
+                              world, in_reqs + to_receive);
+              ++to_receive;
+            }
+          }
+        }
+      }
+    }
+  } else if (mode == VOLUMEMODE::RECTANGLE) {
+    for (const auto [size, clidxs] : cmap) {
+      for (auto clidx : clidxs) {
+        const cluster_data &clstr = compute_cluster_size->clusters[clidx];
+        bigint bbox_start = 6 * clidx;
+        cluster_bbox(clstr.atoms, clstr.l_size /*+ clstr.nghost*/, bboxes + bbox_start);
+        if (clstr.nowners > 0) {
+          if (clstr.host != comm->me) {
+            ::MPI_Send_init(bboxes + bbox_start, 6, MPI_DOUBLE, clstr.host, clstr.clid, world,
+                            out_reqs + to_send);
+            ++to_send;
+          } else {
+            for (int i = 0; i < clstr.nowners; ++i) {
+              ::MPI_Recv_init(recv_buf + 6 * to_receive, 6, MPI_DOUBLE, clstr.owners[i], clstr.clid,
+                              world, in_reqs + to_receive);
+              ++to_receive;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  ::MPI_Startall(to_send, out_reqs);
+  ::MPI_Startall(to_receive, in_reqs);
+  ::MPI_Waitall(to_send, out_reqs, MPI_STATUSES_IGNORE /*out_stats*/);
+  ::MPI_Waitall(to_receive, in_reqs, MPI_STATUSES_IGNORE /*in_stats*/);
+
+  int received = 0;
   if (mode == VOLUMEMODE::CALC) {
-    subbonds[0 + 0] = domain->sublo[0];
-    subbonds[0 + 1] = domain->subhi[0];
-    subbonds[2 + 0] = domain->sublo[1];
-    subbonds[2 + 1] = domain->subhi[1];
-    subbonds[4 + 0] = domain->sublo[2];
-    subbonds[4 + 1] = domain->subhi[2];
+    for (const auto [size, clidxs] : cmap) {
+      for (auto clidx : clidxs) {
+        const cluster_data &clstr = compute_cluster_size->clusters[clidx];
+        if ((clstr.nowners > 0) && (clstr.host == comm->me)) {
+          for (int i = 0; i < clstr.nowners; ++i) { volumes[clidx] += recv_buf[received++]; }
+        }
+      }
+    }
+  } else if (mode == VOLUMEMODE::RECTANGLE) {
+    for (const auto [size, clidxs] : cmap) {
+      for (auto clidx : clidxs) {
+        const cluster_data &clstr = compute_cluster_size->clusters[clidx];
+        bigint bbox_start = 6 * clidx;
+        if (clstr.host == comm->me) {
+          if ((clstr.nowners > 0)) {
+            for (int i = 0; i < clstr.nowners; ++i) {
+              maximize_bbox(recv_buf + 6 * received, bboxes + bbox_start);
+              ++received;
+            }
+          }
+          volumes[clidx] = (bboxes[bbox_start + 3] - bboxes[bbox_start + 0]) *
+              (bboxes[bbox_start + 4] - bboxes[bbox_start + 1]) *
+              (bboxes[bbox_start + 5] - bboxes[bbox_start + 2]);
+        }
+      }
+    }
   }
+
+  ::memset(dist_local, 0.0, size_local_rows * sizeof(double));
+  for (const auto [size, clidxs] : cmap) {
+    for (auto clidx : clidxs) {
+      const cluster_data &clstr = compute_cluster_size->clusters[clidx];
+      if (clstr.host == comm->me) { dist_local[size] += volumes[clidx]; }
+    }
+    dist_local[size] /= clidxs.size();
+  }
+
+  for (int i = 0; i < to_send; ++i) { ::MPI_Request_free(out_reqs + i); }
+  for (int i = 0; i < to_receive; ++i) { ::MPI_Request_free(in_reqs + i); }
 }
 
 /* ---------------------------------------------------------------------- */
 
-double ComputeClusterVolume::occupied_volume(const double **const centers, const int *const list,
-                                             const int n, const double r,
-                                             const double voxel_size = 0.1) const
+void ComputeClusterVolume::cluster_bbox(const int *const list, const int n,
+                                        double *bbox) const noexcept
 {
-  double min_c[3]{};
-  double max_c[3]{};
-
-  // find min and max center coordinates, i.e. bounding box for voxel grid
-  min_c[0] = centers[list[0]][0];
-  min_c[1] = centers[list[0]][1];
-  min_c[2] = centers[list[0]][2];
-  max_c[0] = centers[list[0]][0];
-  max_c[1] = centers[list[0]][1];
-  max_c[2] = centers[list[0]][2];
-
-  for (int i = 0; i < n; ++i) {
-    if (centers[list[i]][0] < min_c[0]) { min_c[0] = centers[list[i]][0]; }
-    if (centers[list[i]][1] < min_c[1]) { min_c[1] = centers[list[i]][1]; }
-    if (centers[list[i]][2] < min_c[2]) { min_c[2] = centers[list[i]][2]; }
-    if (centers[list[i]][0] > max_c[0]) { max_c[0] = centers[list[i]][0]; }
-    if (centers[list[i]][1] > max_c[1]) { max_c[1] = centers[list[i]][1]; }
-    if (centers[list[i]][2] > max_c[2]) { max_c[2] = centers[list[i]][2]; }
+  double **const centers = atom->x;
+  // double bbox[6] =
+  //   {
+  //     centers[list[0]][0],
+  //     centers[list[0]][1],
+  //     centers[list[0]][2],
+  //     centers[list[0]][0],
+  //     centers[list[0]][1],
+  //     centers[list[0]][2]
+  //   };
+
+  bbox[0] = centers[list[0]][0];
+  bbox[1] = centers[list[0]][1];
+  bbox[2] = centers[list[0]][2];
+  bbox[3] = centers[list[0]][0];
+  bbox[4] = centers[list[0]][1];
+  bbox[5] = centers[list[0]][2];
+
+  // find min and max center coordinates, i.e. bounding box
+  for (int i = 1; i < n; ++i) {
+    if (centers[list[i]][0] < bbox[0]) { bbox[0] = centers[list[i]][0]; }
+    if (centers[list[i]][1] < bbox[1]) { bbox[1] = centers[list[i]][1]; }
+    if (centers[list[i]][2] < bbox[2]) { bbox[2] = centers[list[i]][2]; }
+    if (centers[list[i]][0] > bbox[3]) { bbox[3] = centers[list[i]][0]; }
+    if (centers[list[i]][1] > bbox[4]) { bbox[4] = centers[list[i]][1]; }
+    if (centers[list[i]][2] > bbox[5]) { bbox[5] = centers[list[i]][2]; }
   }
 
-  min_c[0] -= r;
-  min_c[1] -= r;
-  min_c[2] -= r;
-  max_c[0] += r;
-  max_c[1] += r;
-  max_c[2] += r;
+  bbox[0] -= overlap;
+  bbox[1] -= overlap;
+  bbox[2] -= overlap;
+  bbox[3] += overlap;
+  bbox[4] += overlap;
+  bbox[5] += overlap;
+}
 
-  min_c[0] = MAX(min_c[0], subbonds[0]);
-  min_c[1] = MAX(min_c[1], subbonds[2]);
-  min_c[2] = MAX(min_c[2], subbonds[4]);
-  max_c[0] = MIN(max_c[0], subbonds[1]);
-  max_c[1] = MIN(max_c[1], subbonds[3]);
-  max_c[2] = MIN(max_c[2], subbonds[5]);
+/* ---------------------------------------------------------------------- */
+
+inline bigint idx(bigint i, bigint j, bigint k, bigint ny, bigint nz) noexcept
+{
+  return i * (nz * ny) + j * nz + k;
+}
 
-  const int nx = static_cast<int>(::ceil((max_c[0] - min_c[0]) / voxel_size));
-  const int ny = static_cast<int>(::ceil((max_c[1] - min_c[1]) / voxel_size));
-  const int nz = static_cast<int>(::ceil((max_c[2] - min_c[2]) / voxel_size));
+/* ---------------------------------------------------------------------- */
+
+double ComputeClusterVolume::occupied_volume_precomputed(const int *const list, const int n,
+                                                         const int nghost,
+                                                         bool nonexclusive) noexcept
+{
+  double **const centers = atom->x;
+  double bbox[6]{};
+  cluster_bbox(list, n, bbox);
+
+  if (nonexclusive) {
+    bbox[0] = MAX(bbox[0], subbonds[0]);    // cut should be on only needed sides
+    bbox[1] = MAX(bbox[1], subbonds[2]);
+    bbox[2] = MAX(bbox[2], subbonds[4]);
+    bbox[3] = MIN(bbox[3], subbonds[1]);
+    bbox[4] = MIN(bbox[4], subbonds[3]);
+    bbox[5] = MIN(bbox[5], subbonds[5]);
+  }
+
+  const bigint steps_x = static_cast<bigint>((bbox[3] - bbox[0]) / voxel_size) + 1;
+  const bigint steps_y = static_cast<bigint>((bbox[4] - bbox[1]) / voxel_size) + 1;
+  const bigint steps_z = static_cast<bigint>((bbox[5] - bbox[2]) / voxel_size) + 1;
+
+  if (nloc_grid < steps_x * steps_y * steps_z) {
+    nloc_grid = steps_x * steps_y * steps_z;
+    memory->grow(occupancy_grid, nloc_grid, "compute_cluster_volume:grid");
+  }
+  ::memset(occupancy_grid, false, nloc * sizeof(bool));
+
+  for (int l = 0; l < n + nghost; ++l) {
+    double const *center = centers[list[l]];
+    bigint ni = static_cast<bigint>((center[0] - bbox[0]) / voxel_size);
+    bigint nj = static_cast<bigint>((center[1] - bbox[1]) / voxel_size);
+    bigint nk = static_cast<bigint>((center[2] - bbox[2]) / voxel_size);
+
+    for (int noffset = 0; noffset < noffsets; noffset += 3) {
+      bigint i = ni + offsets[noffset + 0];
+      bigint j = nj + offsets[noffset + 1];
+      bigint k = nk + offsets[noffset + 2];
+
+      if ((i >= 0) && (i < steps_x) && (j >= 0) && (j < steps_y) && (k >= 0) && (k < steps_z)) {
+        occupancy_grid[idx(i, j, k, steps_y, steps_z)] = true;
+      }
+    }
+  }
 
   // count occupied cells
-  int occupied = 0;
-  double const rsq = r * r;
-  for (int i = 0; i < nx; ++i) {
-    for (int j = 0; j < ny; ++j) {
-      for (int k = 0; k < nz; ++k) {
-        for (int l = 0; l < n; ++l) {
-          double const dx = min_c[0] + i * voxel_size - centers[list[l]][0];
-          double const dy = min_c[1] + j * voxel_size - centers[list[l]][1];
-          double const dz = min_c[2] + k * voxel_size - centers[list[l]][2];
-          if (dx * dx + dy * dy + dz * dz < rsq) {
-            ++occupied;
-            break;
-          };
-        }
+  bigint occupied = 0;
+  for (bigint i = 0; i < steps_x; ++i) {
+    for (bigint j = 0; j < steps_y; ++j) {
+      for (bigint k = 0; k < steps_z; ++k) {
+        if (occupancy_grid[idx(i, j, k, steps_y, steps_z)]) { ++occupied; }
       }
     }
   }
@@ -203,69 +404,71 @@ double ComputeClusterVolume::occupied_volume(const double **const centers, const
 
 /* ---------------------------------------------------------------------- */
 
-double ComputeClusterVolume::occupied_volume2(const double **const centers, const int *const list,
-                                              const int n, const double r,
-                                              const double voxel_size = 0.1) const
+double ComputeClusterVolume::occupied_volume_grid(const int *const list, const int n,
+                                                  const int nghost, bool nonexclusive) noexcept
 {
-  double min_c[3]{};
-  double max_c[3]{};
-
-  // find min and max center coordinates, i.e. bounding box for voxel grid
-  min_c[0] = centers[list[0]][0];
-  min_c[1] = centers[list[0]][1];
-  min_c[2] = centers[list[0]][2];
-  max_c[0] = centers[list[0]][0];
-  max_c[1] = centers[list[0]][1];
-  max_c[2] = centers[list[0]][2];
-
-  for (int i = 0; i < n; ++i) {
-    if (centers[list[i]][0] < min_c[0]) { min_c[0] = centers[list[i]][0]; }
-    if (centers[list[i]][1] < min_c[1]) { min_c[1] = centers[list[i]][1]; }
-    if (centers[list[i]][2] < min_c[2]) { min_c[2] = centers[list[i]][2]; }
-    if (centers[list[i]][0] > max_c[0]) { max_c[0] = centers[list[i]][0]; }
-    if (centers[list[i]][1] > max_c[1]) { max_c[1] = centers[list[i]][1]; }
-    if (centers[list[i]][2] > max_c[2]) { max_c[2] = centers[list[i]][2]; }
+  double **const centers = atom->x;
+  double bbox[6]{};
+  cluster_bbox(list, n, bbox);
+
+  if (nonexclusive) {
+    bbox[0] = MAX(bbox[0], subbonds[0]);    // cut should be on only needed sides
+    bbox[1] = MAX(bbox[1], subbonds[2]);
+    bbox[2] = MAX(bbox[2], subbonds[4]);
+    bbox[3] = MIN(bbox[3], subbonds[1]);
+    bbox[4] = MIN(bbox[4], subbonds[3]);
+    bbox[5] = MIN(bbox[5], subbonds[5]);
   }
 
-  min_c[0] -= r;
-  min_c[1] -= r;
-  min_c[2] -= r;
-  max_c[0] += r;
-  max_c[1] += r;
-  max_c[2] += r;
+  const bigint steps_x = static_cast<bigint>((bbox[3] - bbox[0]) / voxel_size) + 1;
+  const bigint steps_y = static_cast<bigint>((bbox[4] - bbox[1]) / voxel_size) + 1;
+  const bigint steps_z = static_cast<bigint>((bbox[5] - bbox[2]) / voxel_size) + 1;
 
-  min_c[0] = MAX(min_c[0], subbonds[0]);
-  min_c[1] = MAX(min_c[1], subbonds[2]);
-  min_c[2] = MAX(min_c[2], subbonds[4]);
-  max_c[0] = MIN(max_c[0], subbonds[1]);
-  max_c[1] = MIN(max_c[1], subbonds[3]);
-  max_c[2] = MIN(max_c[2], subbonds[5]);
+  if (nloc_grid < steps_x * steps_y * steps_z) {
+    nloc_grid = steps_x * steps_y * steps_z;
+    memory->grow(occupancy_grid, nloc_grid, "compute_cluster_volume:grid");
+  }
+  ::memset(occupancy_grid, false, nloc * sizeof(bool));
+
+  for (int l = 0; l < n + nghost; ++l) {
+    double const *center = centers[list[l]];
+    bigint ni = static_cast<bigint>((center[0] - bbox[0]) / voxel_size);
+    bigint nj = static_cast<bigint>((center[1] - bbox[1]) / voxel_size);
+    bigint nk = static_cast<bigint>((center[2] - bbox[2]) / voxel_size);
+    bigint min_i = MAX(ni - n_cells, 0);
+    bigint max_i = MIN(ni + n_cells, steps_x - 1);
+    bigint min_j = MAX(nj - n_cells, 0);
+    bigint max_j = MIN(nj + n_cells, steps_y - 1);
+    bigint min_k = MAX(nk - n_cells, 0);
+    bigint max_k = MIN(nk + n_cells, steps_z - 1);
+
+    for (bigint i = min_i; i <= max_i; ++i) {
+      for (bigint j = min_j; j <= max_j; ++j) {
+        for (bigint k = min_k; k <= max_k; ++k) {
+          // Compute the grid cell center coordinates
+          // double x = bbox[0] + i * voxel_size + voxel_size / 2;
+          // double y = bbox[1] + j * voxel_size + voxel_size / 2;
+          // double z = bbox[2] + k * voxel_size + voxel_size / 2;
+          // Check if the voxel center is within the sphere of radius r
+          double dx = bbox[0] + i * voxel_size /*x*/ - center[0];
+          double dy = bbox[1] + j * voxel_size /*y*/ - center[1];
+          double dz = bbox[2] + k * voxel_size /*z*/ - center[2];
+          if (dx * dx + dy * dy + dz * dz <= overlap_sq) {
+            occupancy_grid[idx(i, j, k, steps_y, steps_z)] = true;
+          }
+        }
+      }
+    }
+  }
 
   // count occupied cells
-  int occupied = 0;
-  double const rsq = r * r;
-  double cc[3]{};
-
-  cc[0] = min_c[0];
-  while (cc[0] < max_c[0]) {
-    cc[1] = min_c[1];
-    while (cc[1] < max_c[1]) {
-      cc[2] = min_c[2];
-      while (cc[2] < max_c[2]) {
-        for (int l = 0; l < n; ++l) {
-          double const dx = cc[0] - centers[list[l]][0];
-          double const dy = cc[1] - centers[list[l]][1];
-          double const dz = cc[2] - centers[list[l]][2];
-          if (dx * dx + dy * dy + dz * dz < rsq) {
-            ++occupied;
-            break;
-          };
-        }
-        cc[2] += voxel_size;
+  bigint occupied = 0;
+  for (bigint i = 0; i < steps_x; ++i) {
+    for (bigint j = 0; j < steps_y; ++j) {
+      for (bigint k = 0; k < steps_z; ++k) {
+        if (occupancy_grid[idx(i, j, k, steps_y, steps_z)]) { ++occupied; }
       }
-      cc[1] += voxel_size;
     }
-    cc[0] += voxel_size;
   }
 
   return occupied * voxel_size * voxel_size * voxel_size;
@@ -277,5 +480,8 @@ double ComputeClusterVolume::occupied_volume2(const double **const centers, cons
 
 double ComputeClusterVolume::memory_usage()
 {
-  return static_cast<double>((size_local_rows + size_vector) * sizeof(double));
+  double num = static_cast<double>((size_local_rows + size_vector) * sizeof(double));
+  num += nloc_grid * sizeof(bool);
+  if (precompute) { num += noff * sizeof(int); }
+  return num;
 }
diff --git a/src/compute_cluster_volume.h b/src/compute_cluster_volume.h
index 56520180a06..af68b747813 100644
--- a/src/compute_cluster_volume.h
+++ b/src/compute_cluster_volume.h
@@ -23,6 +23,8 @@ ComputeStyle(cluster/volume,ComputeClusterVolume);
 #include "compute.h"
 #include "compute_cluster_size_ext.h"
 
+#include <array>
+
 namespace LAMMPS_NS {
 
 enum class VOLUMEMODE { RECTANGLE = 0, SPHERE = 1, CALC = 2 };
@@ -41,13 +43,42 @@ class ComputeClusterVolume : public Compute {
 
   VOLUMEMODE mode;
   double subbonds[6]{};
-  double *kes = nullptr;          // array of kes of global clusters
-  double *local_kes = nullptr;    // array of kes of local clusters
-  int size_cutoff;                // size of max cluster
-  double occupied_volume(const double **const centers, const int *const list, const int n,
-                         const double r, const double voxel_size) const;
-  double occupied_volume2(const double **const centers, const int *const list, const int n,
-                          const double r, const double voxel_size) const;
+  int nloc{};
+  double *volumes{};
+  double *dist_local{};
+  double *dist{};
+  int size_cutoff;    // size of max cluster
+  double voxel_size{};
+  double overlap{};
+  double overlap_sq{};
+
+  // VOLUMEMODE::CALC
+  bool *occupancy_grid;
+  bigint nloc_grid;
+  int n_cells;
+  bool precompute;
+  int *offsets;
+  int noff;
+  int noffsets;
+
+  // VOLUMEMODE::RECTANGLE
+  double *bboxes;
+
+  ::MPI_Request *in_reqs;
+  ::MPI_Request *out_reqs;
+
+  //   ::MPI_Status *in_stats;
+  //   ::MPI_Status *out_stats;
+
+  double *recv_buf;
+  bigint nloc_recv;
+
+  // with AVX512 occupied_volume_grid works faster, than precomputed version
+  double occupied_volume_precomputed(const int *const list, const int n, const int nghost,
+                                     bool nonexclusive) noexcept;
+  double occupied_volume_grid(const int *const list, const int n, const int nghost,
+                              bool nonexclusive) noexcept;
+  void cluster_bbox(const int *const list, const int n, double *bbox) const noexcept;
 };
 
 }    // namespace LAMMPS_NS