From 6b0e67b4ab81898c43a5ee86ce0ca66e470a768a Mon Sep 17 00:00:00 2001 From: John Omotani Date: Sun, 9 Jun 2024 12:27:22 +0100 Subject: [PATCH 1/8] Fix handling of processes not included in any 'anyv' sub-block The algorithm for deciding the size of shared-memory ranges may sometimes exclude a few processes, if this allows the number of grid points to be factorised in a more favourable way. When this happens to an 'anyv' sub-block, some extra special handling (introduced in this commit) is required to make sure that the excluded processes actually don't do anything. --- moment_kinetics/src/array_allocation.jl | 36 ++++++++++++++++++++++--- moment_kinetics/src/communication.jl | 19 ++++++++++++- moment_kinetics/src/looping.jl | 9 +++++-- 3 files changed, 57 insertions(+), 7 deletions(-) diff --git a/moment_kinetics/src/array_allocation.jl b/moment_kinetics/src/array_allocation.jl index 7dc3bc5d3..b265b6879 100644 --- a/moment_kinetics/src/array_allocation.jl +++ b/moment_kinetics/src/array_allocation.jl @@ -9,6 +9,8 @@ using ..communication using ..debugging @debug_initialize_NaN using ..communication: block_rank, _block_synchronize +using MPI + """ allocate array with dimensions given by dims and entries of type Bool """ @@ -55,14 +57,27 @@ function allocate_shared_float(dims...; comm=nothing) array = allocate_shared(mk_float, dims; comm=comm) @debug_initialize_NaN begin # Initialize as NaN to try and catch use of uninitialized values - if block_rank[] == 0 + if comm === nothing + comm_rank = block_rank[] + this_comm = comm_block[] + elseif comm == MPI.COMM_NULL + comm_rank = -1 + this_comm = nothing + else + # Get MPI.Comm_rank when comm is not nothing + comm_rank = MPI.Comm_rank(comm) + this_comm = comm + end + if comm_rank == 0 array .= NaN @debug_track_initialized begin # Track initialization as if the array was not initialized to NaN array.is_initialized .= false end end - _block_synchronize() + if this_comm !== nothing + MPI.Barrier(this_comm) + end end return array end @@ -85,14 +100,27 @@ function allocate_shared_complex(dims...; comm=nothing) array = allocate_shared(Complex{mk_float}, dims; comm=comm) @debug_initialize_NaN begin # Initialize as NaN to try and catch use of uninitialized values - if block_rank[] == 0 + if comm === nothing + comm_rank = block_rank[] + this_comm = comm_block[] + elseif comm == MPI.COMM_NULL + comm_rank = -1 + this_comm = nothing + else + # Get MPI.Comm_rank when comm is not nothing + comm_rank = MPI.Comm_rank(comm) + this_comm = comm + end + if comm_rank == 0 array .= NaN @debug_track_initialized begin # Track initialization as if the array was not initialized to NaN array.is_initialized .= false end end - _block_synchronize() + if this_comm !== nothing + MPI.Barrier(this_comm) + end end return array end diff --git a/moment_kinetics/src/communication.jl b/moment_kinetics/src/communication.jl index ee1b84f3a..d958a5683 100644 --- a/moment_kinetics/src/communication.jl +++ b/moment_kinetics/src/communication.jl @@ -100,7 +100,7 @@ const anyv_subblock_size = Ref{mk_int}() """ """ -const anyv_isubblock_index = Ref{mk_int}() +const anyv_isubblock_index = Ref{Union{mk_int,Nothing}}() """ """ @@ -567,6 +567,19 @@ Array{mk_float} function allocate_shared(T, dims; comm=nothing, maybe_debug=true) if comm === nothing comm = comm_block[] + elseif comm == MPI.COMM_NULL + # If `comm` is a null communicator (on this process), then this array is just a + # dummy that will not be used. + array = Array{T}(undef, (0 for _ ∈ dims)...) 
+ + @debug_shared_array begin + # If @debug_shared_array is active, create DebugMPISharedArray instead of Array + if maybe_debug + array = DebugMPISharedArray(array, comm) + end + end + + return array end br = MPI.Comm_rank(comm) bs = MPI.Comm_size(comm) @@ -909,6 +922,10 @@ sub-blocks, depending on how the species and spatial dimensions are split up. `_anyv_subblock_synchronize()`. """ function _anyv_subblock_synchronize() + if comm_anyv_subblock[] == MPI.COMM_NULL + # No synchronization to do for a null communicator + return nothing + end MPI.Barrier(comm_anyv_subblock[]) diff --git a/moment_kinetics/src/looping.jl b/moment_kinetics/src/looping.jl index 8dd4b4c7a..a6c1ca880 100644 --- a/moment_kinetics/src/looping.jl +++ b/moment_kinetics/src/looping.jl @@ -471,14 +471,19 @@ eval(quote anyv_subblock_size[] = anyv_split[end] number_of_anyv_blocks = prod(anyv_split[1:end-1]) anyv_subblock_index = block_rank[] ÷ anyv_subblock_size[] - anyv_rank_within_subblock = block_rank[] % anyv_subblock_size[] + if anyv_subblock_index ≥ number_of_anyv_blocks + anyv_subblock_index = nothing + anyv_rank_within_subblock = -1 + else + anyv_rank_within_subblock = block_rank[] % anyv_subblock_size[] + end # Create communicator for the anyv subblock. OK to do this here as # communication.setup_distributed_memory_MPI() must have already been called # to set block_size[] and block_rank[] comm_anyv_subblock[] = MPI.Comm_split(comm_block[], anyv_subblock_index, anyv_rank_within_subblock) - anyv_subblock_rank[] = MPI.Comm_rank(comm_anyv_subblock[]) + anyv_subblock_rank[] = anyv_rank_within_subblock anyv_isubblock_index[] = anyv_subblock_index anyv_nsubblocks_per_block[] = number_of_anyv_blocks anyv_rank0 = (anyv_subblock_rank[] == 0) From 795adba8a9a53d36dd1d0d97aa8c3cc78062c265 Mon Sep 17 00:00:00 2001 From: John Omotani Date: Sun, 9 Jun 2024 12:55:57 +0100 Subject: [PATCH 2/8] Fix debug features for anyv-subblock shared memory arrays --- moment_kinetics/src/communication.jl | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/moment_kinetics/src/communication.jl b/moment_kinetics/src/communication.jl index d958a5683..2f88c8cd9 100644 --- a/moment_kinetics/src/communication.jl +++ b/moment_kinetics/src/communication.jl @@ -531,6 +531,11 @@ end # instances, so their is_read and is_written members can be checked and # reset by _block_synchronize() const global_debugmpisharedarray_store = Vector{DebugMPISharedArray}(undef, 0) + # 'anyv' regions require a separate array store, because within an anyv region, + # processes in the same shared memory block may still not be synchronized if they are + # in different anyv sub-blocks, so debug checks within an anyv region should only + # consider the anyv-specific arrays. + const global_anyv_debugmpisharedarray_store = Vector{DebugMPISharedArray}(undef, 0) end """ @@ -646,7 +651,11 @@ function allocate_shared(T, dims; comm=nothing, maybe_debug=true) # If @debug_shared_array is active, create DebugMPISharedArray instead of Array if maybe_debug debug_array = DebugMPISharedArray(array, comm) - push!(global_debugmpisharedarray_store, debug_array) + if comm == comm_anyv_subblock[] + push!(global_anyv_debugmpisharedarray_store, debug_array) + else + push!(global_debugmpisharedarray_store, debug_array) + end return debug_array end end @@ -775,11 +784,17 @@ end Raises an error if any array has been accessed incorrectly since the previous call to _block_synchronize() - Can be added when debugging to help in down where an error occurs. 
+ Can be added when debugging to help pin down where an error occurs. """ - function debug_check_shared_memory(; kwargs...) - for (arraynum, array) ∈ enumerate(global_debugmpisharedarray_store) - debug_check_shared_array(array; kwargs...) + function debug_check_shared_memory(; comm=comm_block[], kwargs...) + if comm == comm_anyv_subblock[] + for (arraynum, array) ∈ enumerate(global_anyv_debugmpisharedarray_store) + debug_check_shared_array(array; comm=comm, kwargs...) + end + else + for (arraynum, array) ∈ enumerate(global_debugmpisharedarray_store) + debug_check_shared_array(array; comm=comm, kwargs...) + end end return nothing end @@ -960,7 +975,7 @@ function _anyv_subblock_synchronize() # * If an element is written to, only the rank that writes to it should read it. # @debug_detect_redundant_block_synchronize previous_was_unnecessary = true - for (arraynum, array) ∈ enumerate(global_debugmpisharedarray_store) + for (arraynum, array) ∈ enumerate(global_anyv_debugmpisharedarray_store) debug_check_shared_array(array; comm=comm_anyv_subblock[]) @@ -1074,6 +1089,7 @@ end """ function free_shared_arrays() @debug_shared_array resize!(global_debugmpisharedarray_store, 0) + @debug_shared_array resize!(global_anyv_debugmpisharedarray_store, 0) for w ∈ global_Win_store MPI.free(w) From 9c3c446b1269d5b13fa191a41556496c950eac51 Mon Sep 17 00:00:00 2001 From: John Omotani Date: Sat, 8 Jun 2024 18:03:31 +0100 Subject: [PATCH 3/8] Fix initialisation and parallelisation in 2D_FEM_assembly_test.jl --- test_scripts/2D_FEM_assembly_test.jl | 41 +++++++++++++++------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/test_scripts/2D_FEM_assembly_test.jl b/test_scripts/2D_FEM_assembly_test.jl index 12e280648..a983e1c13 100644 --- a/test_scripts/2D_FEM_assembly_test.jl +++ b/test_scripts/2D_FEM_assembly_test.jl @@ -112,14 +112,14 @@ end println("made inputs") println("vpa: ngrid: ",ngrid," nelement: ",nelement_local_vpa, " Lvpa: ",Lvpa) println("vperp: ngrid: ",ngrid," nelement: ",nelement_local_vperp, " Lvperp: ",Lvperp) - vpa, vpa_spectral = define_coordinate(vpa_input) - vperp, vperp_spectral = define_coordinate(vperp_input) # Set up MPI if standalone initialize_comms!() end setup_distributed_memory_MPI(1,1,1,1) + vpa, vpa_spectral = define_coordinate(vpa_input) + vperp, vperp_spectral = define_coordinate(vperp_input) looping.setup_loop_ranges!(block_rank[], block_size[]; s=1, sn=1, r=1, z=1, vperp=vperp.n, vpa=vpa.n, @@ -229,23 +229,25 @@ end msp = 1.0 nussp = 1.0 begin_serial_region() - for ivperp in 1:vperp.n - for ivpa in 1:vpa.n - Fs_M[ivpa,ivperp] = F_Maxwellian(denss,upars,vths,vpa,vperp,ivpa,ivperp) - F_M[ivpa,ivperp] = F_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - H_M_exact[ivpa,ivperp] = H_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - G_M_exact[ivpa,ivperp] = G_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - d2Gdvpa2_M_exact[ivpa,ivperp] = d2Gdvpa2_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - d2Gdvperp2_M_exact[ivpa,ivperp] = d2Gdvperp2_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - dGdvperp_M_exact[ivpa,ivperp] = dGdvperp_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - d2Gdvperpdvpa_M_exact[ivpa,ivperp] = d2Gdvperpdvpa_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - dHdvpa_M_exact[ivpa,ivperp] = dHdvpa_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - dHdvperp_M_exact[ivpa,ivperp] = dHdvperp_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - C_M_exact[ivpa,ivperp] = Cssp_Maxwellian_inputs(denss,upars,vths,ms, - dens,upar,vth,msp, - 
nussp,vpa,vperp,ivpa,ivperp) + @serial_region begin + for ivperp in 1:vperp.n + for ivpa in 1:vpa.n + Fs_M[ivpa,ivperp] = F_Maxwellian(denss,upars,vths,vpa,vperp,ivpa,ivperp) + F_M[ivpa,ivperp] = F_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + H_M_exact[ivpa,ivperp] = H_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + G_M_exact[ivpa,ivperp] = G_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + d2Gdvpa2_M_exact[ivpa,ivperp] = d2Gdvpa2_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + d2Gdvperp2_M_exact[ivpa,ivperp] = d2Gdvperp2_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + dGdvperp_M_exact[ivpa,ivperp] = dGdvperp_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + d2Gdvperpdvpa_M_exact[ivpa,ivperp] = d2Gdvperpdvpa_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + dHdvpa_M_exact[ivpa,ivperp] = dHdvpa_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + dHdvperp_M_exact[ivpa,ivperp] = dHdvperp_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + C_M_exact[ivpa,ivperp] = Cssp_Maxwellian_inputs(denss,upars,vths,ms, + dens,upar,vth,msp, + nussp,vpa,vperp,ivpa,ivperp) + end end - end + end rpbd_exact = allocate_rosenbluth_potential_boundary_data(vpa,vperp) begin_s_r_z_anyv_region() @@ -280,7 +282,8 @@ end algebraic_solve_for_d2Gdvperp2=false,calculate_GG=true,calculate_dGdvperp=true) # extract C[Fs,Fs'] result # and Rosenbluth potentials for testing - begin_vperp_vpa_region() + begin_s_r_z_anyv_region() + begin_anyv_vperp_vpa_region() @loop_vperp_vpa ivperp ivpa begin C_M_num[ivpa,ivperp] = fkpl_arrays.CC[ivpa,ivperp] G_M_num[ivpa,ivperp] = fkpl_arrays.GG[ivpa,ivperp] From 6b98483765b1e83254ff592dab35123e9f824ec2 Mon Sep 17 00:00:00 2001 From: John Omotani Date: Sun, 9 Jun 2024 14:20:10 +0100 Subject: [PATCH 4/8] Fix MPI path in macOS CI --- .github/workflows/parallel_test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/parallel_test.yml b/.github/workflows/parallel_test.yml index 88fd99e63..4118bc28f 100644 --- a/.github/workflows/parallel_test.yml +++ b/.github/workflows/parallel_test.yml @@ -54,9 +54,9 @@ jobs: version: '1.10' - uses: julia-actions/cache@v1 - run: | - MPILIBPATH=$(find /opt/homebrew/Cellar/open-mpi/ -name libmpi.dylib) + export MPILIBPATH=$(find /opt/homebrew/Cellar/open-mpi/ -name libmpi.dylib) touch Project.toml - julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.add(["MPI", "MPIPreferences"]); using MPIPreferences; MPIPreferences.use_system_binary(library_names="/opt/homebrew/Cellar/open-mpi/5.0.3/lib/libmpi.dylib")' + julia --project -O3 --check-bounds=no -e "import Pkg; Pkg.add([\"MPI\", \"MPIPreferences\"]); using MPIPreferences; MPIPreferences.use_system_binary(library_names=\"$MPILIBPATH\")" julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.add(["NCDatasets", "Random", "SpecialFunctions", "Test"]); Pkg.develop(path="moment_kinetics/")' julia --project -O3 --check-bounds=no -e 'import Pkg; Pkg.precompile()' # Need to use openmpi so that the following arguments work: From 2f9f5a64375b0f302cd895e0fa746652a6beb33a Mon Sep 17 00:00:00 2001 From: Michael Hardman Date: Sun, 9 Jun 2024 22:53:00 +0100 Subject: [PATCH 5/8] Move the position of define_coordinate() to remove MPI initialisation error mentioned in #221. 
--- moment_kinetics/test/fokker_planck_tests.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/moment_kinetics/test/fokker_planck_tests.jl b/moment_kinetics/test/fokker_planck_tests.jl index edfd0d4d4..63fdd01e6 100644 --- a/moment_kinetics/test/fokker_planck_tests.jl +++ b/moment_kinetics/test/fokker_planck_tests.jl @@ -53,12 +53,12 @@ function create_grids(ngrid,nelement_vpa,nelement_vperp; #println("made inputs") #println("vpa: ngrid: ",ngrid," nelement: ",nelement_local_vpa, " Lvpa: ",Lvpa) #println("vperp: ngrid: ",ngrid," nelement: ",nelement_local_vperp, " Lvperp: ",Lvperp) - vpa, vpa_spectral = define_coordinate(vpa_input) - vperp, vperp_spectral = define_coordinate(vperp_input) # Set up MPI initialize_comms!() setup_distributed_memory_MPI(1,1,1,1) + vpa, vpa_spectral = define_coordinate(vpa_input) + vperp, vperp_spectral = define_coordinate(vperp_input) looping.setup_loop_ranges!(block_rank[], block_size[]; s=1, sn=1, r=1, z=1, vperp=vperp.n, vpa=vpa.n, From 62e5c92574e353eb242687fc8887f66c0ee6f0cc Mon Sep 17 00:00:00 2001 From: John Omotani Date: Mon, 10 Jun 2024 22:27:59 +0100 Subject: [PATCH 6/8] Revert incorrect part of 9c3c446b 9c3c446b was "Fix initialisation and parallelisation in 2D_FEM_assembly_test.jl" - should not have introduced a `@serial_region` where non-shared-memory arrays were being filled. --- test_scripts/2D_FEM_assembly_test.jl | 35 +++++++++++++--------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/test_scripts/2D_FEM_assembly_test.jl b/test_scripts/2D_FEM_assembly_test.jl index a983e1c13..9f443d493 100644 --- a/test_scripts/2D_FEM_assembly_test.jl +++ b/test_scripts/2D_FEM_assembly_test.jl @@ -228,26 +228,23 @@ end ms = 1.0 msp = 1.0 nussp = 1.0 - begin_serial_region() - @serial_region begin - for ivperp in 1:vperp.n - for ivpa in 1:vpa.n - Fs_M[ivpa,ivperp] = F_Maxwellian(denss,upars,vths,vpa,vperp,ivpa,ivperp) - F_M[ivpa,ivperp] = F_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - H_M_exact[ivpa,ivperp] = H_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - G_M_exact[ivpa,ivperp] = G_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - d2Gdvpa2_M_exact[ivpa,ivperp] = d2Gdvpa2_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - d2Gdvperp2_M_exact[ivpa,ivperp] = d2Gdvperp2_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - dGdvperp_M_exact[ivpa,ivperp] = dGdvperp_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - d2Gdvperpdvpa_M_exact[ivpa,ivperp] = d2Gdvperpdvpa_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - dHdvpa_M_exact[ivpa,ivperp] = dHdvpa_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - dHdvperp_M_exact[ivpa,ivperp] = dHdvperp_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) - C_M_exact[ivpa,ivperp] = Cssp_Maxwellian_inputs(denss,upars,vths,ms, - dens,upar,vth,msp, - nussp,vpa,vperp,ivpa,ivperp) - end + for ivperp in 1:vperp.n + for ivpa in 1:vpa.n + Fs_M[ivpa,ivperp] = F_Maxwellian(denss,upars,vths,vpa,vperp,ivpa,ivperp) + F_M[ivpa,ivperp] = F_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + H_M_exact[ivpa,ivperp] = H_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + G_M_exact[ivpa,ivperp] = G_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + d2Gdvpa2_M_exact[ivpa,ivperp] = d2Gdvpa2_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + d2Gdvperp2_M_exact[ivpa,ivperp] = d2Gdvperp2_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + dGdvperp_M_exact[ivpa,ivperp] = dGdvperp_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + d2Gdvperpdvpa_M_exact[ivpa,ivperp] = 
d2Gdvperpdvpa_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + dHdvpa_M_exact[ivpa,ivperp] = dHdvpa_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + dHdvperp_M_exact[ivpa,ivperp] = dHdvperp_Maxwellian(dens,upar,vth,vpa,vperp,ivpa,ivperp) + C_M_exact[ivpa,ivperp] = Cssp_Maxwellian_inputs(denss,upars,vths,ms, + dens,upar,vth,msp, + nussp,vpa,vperp,ivpa,ivperp) end - end + end rpbd_exact = allocate_rosenbluth_potential_boundary_data(vpa,vperp) begin_s_r_z_anyv_region() From 23372f52453bf50e4b5f163cf8edc7ba0db618e8 Mon Sep 17 00:00:00 2001 From: John Omotani Date: Mon, 10 Jun 2024 22:42:19 +0100 Subject: [PATCH 7/8] Fix anyv parallelisation in fokker_planck_tests.jl --- moment_kinetics/test/fokker_planck_tests.jl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/moment_kinetics/test/fokker_planck_tests.jl b/moment_kinetics/test/fokker_planck_tests.jl index 63fdd01e6..c5b192bac 100644 --- a/moment_kinetics/test/fokker_planck_tests.jl +++ b/moment_kinetics/test/fokker_planck_tests.jl @@ -279,7 +279,8 @@ function runtests() calculate_GG=true, calculate_dGdvperp=true) # extract C[Fs,Fs'] result # and Rosenbluth potentials for testing - begin_vperp_vpa_region() + begin_s_r_z_anyv_region() + begin_anyv_vperp_vpa_region() @loop_vperp_vpa ivperp ivpa begin G_M_num[ivpa,ivperp] = fkpl_arrays.GG[ivpa,ivperp] H_M_num[ivpa,ivperp] = fkpl_arrays.HH[ivpa,ivperp] @@ -419,7 +420,8 @@ function runtests() conserving_corrections!(fkpl_arrays.CC,Fs_M,vpa,vperp,dummy_array) end # extract C[Fs,Fs'] result - begin_vperp_vpa_region() + begin_s_r_z_anyv_region() + begin_anyv_vperp_vpa_region() @loop_vperp_vpa ivperp ivpa begin C_M_num[ivpa,ivperp] = fkpl_arrays.CC[ivpa,ivperp] end @@ -579,7 +581,8 @@ function runtests() density_conserving_correction!(fkpl_arrays.CC,Fs_M,vpa,vperp,dummy_array) end # extract C[Fs,Fs'] result - begin_vperp_vpa_region() + begin_s_r_z_anyv_region() + begin_anyv_vperp_vpa_region() @loop_vperp_vpa ivperp ivpa begin C_M_num[ivpa,ivperp] = fkpl_arrays.CC[ivpa,ivperp] end From f57f3a7fdb14f86f019b9a308dcff6bd9bc2734d Mon Sep 17 00:00:00 2001 From: John Omotani Date: Mon, 10 Jun 2024 22:54:00 +0100 Subject: [PATCH 8/8] Fix anyv MPI.Bcast() calls in fokker_plack Need to skip the `Bcast()` if `comm_anyv_subblock[]` is a null communicator (indicating that the process is not part of an anyv subblock). --- moment_kinetics/src/fokker_planck.jl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/moment_kinetics/src/fokker_planck.jl b/moment_kinetics/src/fokker_planck.jl index 0352f9a1f..1f520e9f6 100644 --- a/moment_kinetics/src/fokker_planck.jl +++ b/moment_kinetics/src/fokker_planck.jl @@ -664,7 +664,9 @@ function conserving_corrections!(CC,pdf_in,vpa,vperp,dummy_vpavperp) # Broadcast x0, x1, x2 to all processes in the 'anyv' subblock param_vec = [x0, x1, x2, upar] - MPI.Bcast!(param_vec, 0, comm_anyv_subblock[]) + if comm_anyv_subblock[] != MPI.COMM_NULL + MPI.Bcast!(param_vec, 0, comm_anyv_subblock[]) + end (x0, x1, x2, upar) = param_vec # correct CC @@ -695,7 +697,9 @@ function density_conserving_correction!(CC,pdf_in,vpa,vperp,dummy_vpavperp) # Broadcast x0 to all processes in the 'anyv' subblock param_vec = [x0] - MPI.Bcast!(param_vec, 0, comm_anyv_subblock[]) + if comm_anyv_subblock[] != MPI.COMM_NULL + MPI.Bcast!(param_vec, 0, comm_anyv_subblock[]) + end x0 = param_vec[1] # correct CC
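
The common thread in patches 1 and 8 is a guard pattern: any process whose 'anyv' sub-block
communicator is `MPI.COMM_NULL` must skip collective calls on that communicator rather than
participate in them. A minimal standalone sketch of that pattern follows, assuming only MPI.jl;
the names `subblock_size`, `subblock_comm` and `param_vec` are illustrative and are not
moment_kinetics identifiers. Run it with something like `mpiexec -n 5 julia sketch.jl` so that
one rank is left over and excluded from every sub-block.

# Standalone sketch of the null-communicator guard pattern (not part of the patches above).
using MPI

MPI.Init()

comm = MPI.COMM_WORLD
rank = MPI.Comm_rank(comm)
nproc = MPI.Comm_size(comm)

# Fixed sub-block size; any leftover ranks are excluded, analogous to processes that do not
# fit into an 'anyv' sub-block when the grid points are factorised.
subblock_size = 2
n_subblocks = nproc ÷ subblock_size
subblock_index = rank ÷ subblock_size

if subblock_index ≥ n_subblocks
    # Excluded rank: passing `nothing` as the colour makes Comm_split() return
    # MPI.COMM_NULL on this process.
    subblock_comm = MPI.Comm_split(comm, nothing, 0)
else
    subblock_comm = MPI.Comm_split(comm, subblock_index, rank % subblock_size)
end

# Guard every collective on the sub-block communicator, as the patches do for
# MPI.Barrier() and MPI.Bcast!(): excluded ranks skip the call entirely.
param_vec = [0.0]
if subblock_comm != MPI.COMM_NULL
    if MPI.Comm_rank(subblock_comm) == 0
        param_vec[1] = 1.0
    end
    MPI.Bcast!(param_vec, 0, subblock_comm)
    MPI.Barrier(subblock_comm)
end

println("rank $rank: in sub-block = $(subblock_comm != MPI.COMM_NULL), value = $(param_vec[1])")

MPI.Finalize()

The same guard is what `_anyv_subblock_synchronize()` and the `MPI.Bcast!()` calls in
fokker_planck.jl apply after this series.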