Solved the following with regard to the parallel assembly process in MPIPETScCommunicator:

1. The current algorithm does not compute, before the actual numerical assembly, the (full) sparsity pattern of the rows that are locally owned by each processor. Instead, we preallocate some storage for the matrix (with a rough upper estimate of the number of nonzero entries per row), and entries are introduced into the matrix dynamically; see [here](https://github.com/gridap/GridapDistributed.jl/blob/f595e3ebb45c7777a25dded4a205b9a80195c7a9/src/MPIPETScDistributedAssemblersInterfaces.jl#L25) and [here](https://github.com/gridap/GridapDistributed.jl/blob/f595e3ebb45c7777a25dded4a205b9a80195c7a9/src/MPIPETScDistributedAssemblersInterfaces.jl#L35). Our experience with FEMPAR shows that this has a number of practical performance and memory implications that we want to avoid in the final version of the algorithm (see also issue #9). A minimal sketch of this dynamic-insertion approach is included right after this list.

2. As a complementary side note to 1., let me recall (so as not to forget, and to reuse later) that: (1) we discussed [here](#3 (comment)) a possible way to accommodate the final/performant algorithm so that it fits into the current structure of `assembly_matrix_vector`; (2) I already wrote an implementation of this algorithm, but it does not fit the current structure of `assembly_matrix_vector`; see [here](#3 (comment)) for more details; (3) in the most general case (non-conforming meshes, hanging-DoF constraints), the final/performant algorithm requires an ad-hoc communication among nearest neighbours in order to set up the sparsity pattern of the locally owned rows; see [here](https://github.com/gridap/GridapDistributed.jl/blob/distributed_integration_restricted_to_owned_cells/distributed_integration_restricted_to_owned_cells.jl#L275) for more details. At first glance, I do not see how this can be implemented given the limitations of `MPIPETScDistributedVector` mentioned above.
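
For context, point 1 refers to the dynamic-insertion path that this commit starts to replace. Below is a minimal sketch of that approach, assembled from the code removed in this commit (the `Mat` constructor, `insertmode`, indexed insertion, and `AssemblyBegin`/`AssemblyEnd` calls are those of the PETSc.jl wrapper already used in the diff; the wrapping function and its argument list are illustrative only):

```julia
using PETSc

# Sketch only: preallocate with a rough upper estimate of nonzeros per row
# (nz/onz) and insert the COO entries one by one. I and J are assumed to
# already hold (1-based) global PETSc row/column ids.
function assemble_with_dynamic_insertion(I, J, V, ngrows, ngcols,
                                         nlrows, nlcols, comm)
  A = Mat(Float64, ngrows, ngcols;
          mlocal=nlrows, nlocal=nlcols, nz=100, onz=100,
          comm=comm, mtype=PETSc.C.MATMPIAIJ)
  A.insertmode = PETSc.C.ADD_VALUES
  # One call into PETSc per entry; rows that exceed the nz/onz estimate
  # trigger costly dynamic reallocation inside PETSc.
  for (i, j, v) in zip(I, J, V)
    A[i, j] = v
  end
  PETSc.AssemblyBegin(A, PETSc.C.MAT_FINAL_ASSEMBLY)
  PETSc.AssemblyEnd(A, PETSc.C.MAT_FINAL_ASSEMBLY)
  A
end
```

The new `OwnedCellsStrategy` path introduced in this commit avoids this by building fully assembled local CSR portions and handing them to `MatCreateMPIAIJWithArrays`.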
amartinhuertas committed Jun 24, 2020
1 parent c57f212 commit 1aae8ae
Showing 5 changed files with 260 additions and 88 deletions.
6 changes: 1 addition & 5 deletions src/DistributedVectors.jl
@@ -25,10 +25,6 @@ end

# Build a Distributed vector from an index set
# the resulting object is assumed to be locally indexable when restricted to a part
function DistributedVector{T}(initializer::Function,indices::DistributedIndexSet,args...) where T
@abstractmethod
end

function DistributedVector(initializer::Function,indices::DistributedIndexSet,args...)
function DistributedVector{T}(indices::DistributedIndexSet) where T
@abstractmethod
end
262 changes: 240 additions & 22 deletions src/MPIPETScDistributedAssemblersInterfaces.jl
@@ -52,6 +52,14 @@ function Gridap.Algebra.finalize_coo!(
::Type{PETSc.Mat{Float64}},IJV::MPIPETScDistributedData,m::MPIPETScDistributedIndexSet,n::MPIPETScDistributedIndexSet)
end

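# Adapts an integer buffer to what MatCreateMPIAIJWithArrays expects: a NULL
# PetscInt pointer for an empty buffer, the buffer itself if it already stores
# PetscInt, or a freshly converted PetscInt vector otherwise.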
function _convert_buf_to_petscint(buf)
if isempty(buf)
Ptr{PETSc.C.PetscInt}(0)
else
isa(buf,Vector{PETSc.C.PetscInt}) ? buf : PETSc.C.PetscInt[ i for i in buf ]
end
end

function assemble_global_matrix(
::Type{RowsComputedLocally{false}},
::Type{PETSc.Mat{Float64}},
@@ -65,7 +73,7 @@ function assemble_global_matrix(
nlcols = nlrows

I, J, V = IJV.part
ncols_Alocal = maximum(J)
ncols_Alocal = length(m.parts.part.lid_to_owner)
for i = 1:length(I)
I[i] = m.app_to_petsc_locidx[I[i]]
end
@@ -82,54 +90,264 @@ function assemble_global_matrix(
end

p = Ref{PETSc.C.Mat{Float64}}()
f(buf)= if isempty(buf)
Ptr{PETSc.C.PetscInt}(0)
else
isa(buf,Vector{PETSc.C.PetscInt}) ? buf : PETSc.C.PetscInt[ i for i in buf ]
end

PETSc.C.chk(PETSc.C.MatCreateMPIAIJWithArrays(
get_comm(m).comm,
nlrows,
nlcols,
ngrows,
ngcols,
f(Alocal.rowptr),
f(Alocal.colval),
_convert_buf_to_petscint(Alocal.rowptr),
_convert_buf_to_petscint(Alocal.colval),
Alocal.nzval,
p,
))
A=PETSc.Mat(p[])
end

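# Builds (i) a DistributedIndexSet over the edges of the subdomain graph, i.e.,
# one global id per ordered pair of parts that must exchange matrix entries,
# and (ii) the per-part list of neighbour parts to send entries to. The
# receiving end of each edge is discovered via PetscCommBuildTwoSided below.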
function compute_subdomain_graph_dIS_and_lst_snd(gids, dI)
# List parts I have to send data to
function compute_lst_snd(part, gids, I)
lst_snd = Set{Int}()
for i = 1:length(I)
owner = gids.lid_to_owner[I[i]]
if (owner != part)
if (!(owner in lst_snd))
push!(lst_snd, owner)
end
end
end
collect(lst_snd)
end

part_to_lst_snd = DistributedData(compute_lst_snd, gids, dI)
part_to_num_snd = DistributedData(part_to_lst_snd) do part, lst_snd
length(lst_snd)
end

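# Assign a consecutive global id to each owned (outgoing) edge of the graph:
# gather the number of outgoing edges of every part, turn the counts into
# offsets, and scatter the offsets back so that each part can number its edges.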
offsets = gather(part_to_num_snd)
num_edges = sum(offsets)
GridapDistributed._fill_offsets!(offsets)
part_to_offsets = scatter(get_comm(part_to_num_snd), offsets)

part_to_owned_subdomain_graph_edge_gids =
DistributedData(part_to_num_snd, part_to_offsets) do part, num_snd, offset
owned_edge_gids = zeros(Int, num_snd)
o = offset
for i = 1:num_snd
o += 1
owned_edge_gids[i] = o
end
owned_edge_gids
end

num_snd = PETSc.C.PetscMPIInt(part_to_num_snd.part)
lst_snd = convert(Vector{PETSc.C.PetscMPIInt}, part_to_lst_snd.part) .- PETSc.C.PetscMPIInt(1)
snd_buf = part_to_owned_subdomain_graph_edge_gids.part

num_rcv = Ref{PETSc.C.PetscMPIInt}()
lst_rcv = Ref{Ptr{PETSc.C.PetscMPIInt}}()
rcv_buf = Ref{Ptr{Int64}}()

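# Two-sided handshake: each part sends the global id of its outgoing edge to
# every neighbour in lst_snd and receives, in lst_rcv/rcv_buf, the ranks and
# edge gids of the parts that send to it.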
#PETSc.C.PetscCommBuildTwoSidedSetType(Float64, get_comm(gids).comm, PETSc.C.PETSC_BUILDTWOSIDED_ALLREDUCE)
PETSc.C.chk(PETSc.C.PetscCommBuildTwoSided(Float64,
get_comm(gids).comm,
PETSc.C.PetscMPIInt(1),
MPI.Datatype(Int64).val,
num_snd,
pointer(lst_snd),
Ptr{Cvoid}(pointer(snd_buf)),
num_rcv,
lst_rcv,
Base.unsafe_convert(Ptr{Cvoid},rcv_buf)))


#TO-DO: Call to PetscFree for lst_rcv and rcv_buf
#All attempts so far resulted in a SEGfault, so I gave up
#see https://www.mcs.anl.gov/petsc/petsc-current/docs/manualpages/Sys/PetscCommBuildTwoSided.html
#for more details

num_rcv=num_rcv[]
lst_rcv_vec=unsafe_wrap(Vector{PETSc.C.PetscMPIInt},lst_rcv[],num_rcv)
rcv_buf_vec=unsafe_wrap(Vector{Int64},rcv_buf[],num_rcv)

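# Local numbering of the edge index set: the owned (outgoing) edges come first,
# followed by the received (incoming) edges together with their owner parts.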
function compute_subdomain_graph_index_set(part,num_edges,owned_edge_entries)
lid_to_gid=Vector{Int}(undef,num_snd+num_rcv)
lid_to_owner=Vector{Int}(undef,num_snd+num_rcv)
for i=1:num_snd
lid_to_gid[i]=owned_edge_entries[i]
lid_to_owner[i]=part
end
for i=num_snd+1:num_snd+num_rcv
lid_to_gid[i]=rcv_buf_vec[i-num_snd]
lid_to_owner[i]=lst_rcv_vec[i-num_snd]+1
end
IndexSet(num_edges, lid_to_gid, lid_to_owner)
end

(DistributedIndexSet(compute_subdomain_graph_index_set,
get_comm(gids),
num_edges,
num_edges,
part_to_owned_subdomain_graph_edge_gids),
part_to_lst_snd)
end



function assemble_global_matrix(
::Type{OwnedCellsStrategy{false}},
::Type{PETSc.Mat{Float64}},
IJV::MPIPETScDistributedData,
m::MPIPETScDistributedIndexSet,
n::MPIPETScDistributedIndexSet,
)

ngrows = num_gids(m)
ngcols = num_gids(n)
nlrows = num_owned_entries(m)
nlrows = length(m.parts.part.lid_to_gid)
nlcols = nlrows

# TO-DO: Properly manage PREALLOCATION by building the sparsity pattern of the matrix
A=Mat(Float64, ngrows, ngcols;
mlocal=nlrows, nlocal=nlcols, nz=100, onz=100,
comm=get_comm(m).comm, mtype=PETSc.C.MATMPIAIJ)
Ilocal = IJV.part[1]
Jlocal = IJV.part[2]
Vlocal = IJV.part[3]

A.insertmode = PETSc.C.ADD_VALUES
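# Extract, for each part, the local row indices of its locally computed entries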
dI = DistributedData(get_comm(m),IJV) do part, IJV
I,_,_ = IJV
I
end

# TO-DO: Not efficient! One call to a PETSc function per injected entry
I,J,V = IJV.part
for (i,j,v) in zip(I,J,V)
A[m.lid_to_gid_petsc[i],n.lid_to_gid_petsc[j]]=v
# 2. Determine communication pattern
dIS, part_to_lst_snd =
compute_subdomain_graph_dIS_and_lst_snd(m, dI)

# 3. Communicate entries
length_entries = DistributedVector{Int}(dIS)
do_on_parts(length_entries, m, dI, part_to_lst_snd) do part, length_entries, gid, I, lst_snd
fill!(length_entries, zero(eltype(length_entries)))
for i = 1:length(I)
owner = gid.lid_to_owner[I[i]]
if (owner != part)
edge_lid = findfirst((i) -> (i == owner), lst_snd)
length_entries[edge_lid]+=3
end
end
end
exchange!(length_entries)
dd_length_entries = DistributedData(length_entries) do part, length_entries
collect(length_entries)
end
exchange_entries_vector = DistributedVector{Vector{Float64}}(dIS, dd_length_entries)
# Pack data to be sent
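# Each off-processor entry is packed as a (global row id, global col id, value)
# triplet, matching the 3 slots per entry reserved in length_entries above.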
do_on_parts(dIS,exchange_entries_vector,
m, IJV, part_to_lst_snd) do part, IS, exchange_entries_vector,
test_gids, IJV, lst_snd
current_pack_position = fill(one(Int32), length(IS.lid_to_owner))
I,J,V = IJV
for i = 1:length(I)
owner = test_gids.lid_to_owner[I[i]]
if (owner != part)
edge_lid = findfirst((i) -> (i == owner), lst_snd)
row=m.lid_to_gid_petsc[I[i]]
col=n.lid_to_gid_petsc[J[i]]
current_pos=current_pack_position[edge_lid]
exchange_entries_vector[edge_lid][current_pos ]=row
exchange_entries_vector[edge_lid][current_pos+1]=col
exchange_entries_vector[edge_lid][current_pos+2]=V[i]
current_pack_position[edge_lid] += 3
end
end
end
exchange!(exchange_entries_vector)

#3. Combine local + remote entries
part = MPI.Comm_rank(get_comm(m).comm)+1
test_lid_to_owner = m.parts.part.lid_to_owner
test_lid_to_gid = m.lid_to_gid_petsc
test_gid_to_lid = Dict{Int,Int32}()
for (lid,gid) in enumerate(test_lid_to_gid)
test_gid_to_lid[gid] = lid
end

trial_lid_to_gid = n.lid_to_gid_petsc
trial_gid_to_lid = Dict{Int,Int32}()
for (lid,gid) in enumerate(trial_lid_to_gid)
trial_gid_to_lid[gid] = lid
end

#TODO: check with fverdugo if there is an already coded way of
# doing this vector pattern operation
lid_to_owned_lid = fill(-1, length(test_lid_to_owner))
current = 1
for i = 1:length(test_lid_to_owner)
if (test_lid_to_owner[i] == part)
lid_to_owned_lid[i] = current
current += 1
end
end

trial_lid_to_gid_extended = copy(trial_lid_to_gid)
trial_gid_to_lid_extended = copy(trial_gid_to_lid)

I,J,V = IJV.part
IS = dIS.parts.part
remote_entries = exchange_entries_vector.part

GI = Int64[]
GJ = Int64[]
GV = Float64[]
# Add local entries
for i = 1:length(I)
owner = test_lid_to_owner[I[i]]
if (owner == part)
push!(GI, lid_to_owned_lid[I[i]])
push!(GJ, J[i])
push!(GV, V[i])
end
end
# Add remote entries
for edge_lid = 1:length(IS.lid_to_gid)
if (IS.lid_to_owner[edge_lid] != part)
for i = 1:3:length(remote_entries[edge_lid])
row=Int64(remote_entries[edge_lid][i])
push!(GI, lid_to_owned_lid[test_gid_to_lid[row]])
col=Int64(remote_entries[edge_lid][i+1])
if (!(haskey(trial_gid_to_lid_extended,col)))
trial_gid_to_lid_extended[col]=length(trial_lid_to_gid_extended)+1
push!(trial_lid_to_gid_extended, col)
end
push!(GJ, trial_gid_to_lid_extended[col])
push!(GV, remote_entries[edge_lid][i+2])
end
end
end

# 4. Build fully assembled local portions
n_owned_dofs = num_owned_entries(m)
Alocal = sparse_from_coo(
Gridap.Algebra.SparseMatrixCSR{0,Float64,Int64},
GI,
GJ,
GV,
n_owned_dofs,
length(trial_lid_to_gid_extended))

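# Translate the 0-based CSR column indices from the extended local numbering
# back to the 0-based global column ids expected by MatCreateMPIAIJWithArrays.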
for i = 1:length(Alocal.colval)
Alocal.colval[i] = trial_lid_to_gid_extended[Alocal.colval[i]+1] - 1
end

PETSc.AssemblyBegin(A, PETSc.C.MAT_FINAL_ASSEMBLY)
PETSc.AssemblyEnd(A, PETSc.C.MAT_FINAL_ASSEMBLY)
A
# 5. Build global matrix from fully assembled local portions
p = Ref{PETSc.C.Mat{Float64}}()
PETSc.C.chk(PETSc.C.MatCreateMPIAIJWithArrays(
get_comm(m).comm,
n_owned_dofs,
n_owned_dofs,
ngrows,
ngcols,
_convert_buf_to_petscint(Alocal.rowptr),
_convert_buf_to_petscint(Alocal.colval),
Alocal.nzval,
p,))
A=PETSc.Mat(p[])
end

function assemble_global_vector(
