Solved the following with regard to the parallel assembly process in MPIPETScCommunicator:

1. The current algorithm does not compute, before the actual numerical assembly, the (full) sparsity pattern of the rows that are locally owned by each processor. Instead, we preallocate some storage for the matrix (with a rough upper estimate of the number of nonzero entries per row), and entries are introduced into the matrix dynamically; see [here](https://github.com/gridap/GridapDistributed.jl/blob/f595e3ebb45c7777a25dded4a205b9a80195c7a9/src/MPIPETScDistributedAssemblersInterfaces.jl#L25) and [here](https://github.com/gridap/GridapDistributed.jl/blob/f595e3ebb45c7777a25dded4a205b9a80195c7a9/src/MPIPETScDistributedAssemblersInterfaces.jl#L35). Our experience with FEMPAR shows that this has a number of practical performance and memory implications that we want to avoid in the final version of the algorithm (see also issue #9). A minimal sketch of this dynamic-insertion approach is included right after this list.

2. As a complementary side note to 1., let me recall (so as not to forget, and to reuse later) that: (1) we discussed [here](#3 (comment)) a possible way to accommodate the final/performant algorithm so that it fits into the current structure of `assembly_matrix_vector`; (2) I already wrote an implementation of this algorithm, but it does not fit the current structure of `assembly_matrix_vector`; see [here](#3 (comment)) for more details; (3) in the most general case (non-conforming meshes, hanging-DoF constraints), the final/performant algorithm requires an ad-hoc communication among nearest neighbours in order to set up the sparsity pattern of the locally owned rows; see [here](https://github.com/gridap/GridapDistributed.jl/blob/distributed_integration_restricted_to_owned_cells/distributed_integration_restricted_to_owned_cells.jl#L275) for more details. At first glance, I do not see how this can be implemented given the limitations of `MPIPETScDistributedVector` mentioned above.
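
For context, point 1 refers to the dynamic-insertion path that this commit starts to replace. Below is a minimal sketch of that approach, assembled from the code removed in this commit (the `Mat` constructor, `insertmode`, indexed insertion, and `AssemblyBegin`/`AssemblyEnd` calls are those of the PETSc.jl wrapper already used in the diff; the wrapping function and its argument list are illustrative only):

```julia
using PETSc

# Sketch only: preallocate with a rough upper estimate of nonzeros per row
# (nz/onz) and insert the COO entries one by one. I and J are assumed to
# already hold (1-based) global PETSc row/column ids.
function assemble_with_dynamic_insertion(I, J, V, ngrows, ngcols,
                                         nlrows, nlcols, comm)
  A = Mat(Float64, ngrows, ngcols;
          mlocal=nlrows, nlocal=nlcols, nz=100, onz=100,
          comm=comm, mtype=PETSc.C.MATMPIAIJ)
  A.insertmode = PETSc.C.ADD_VALUES
  # One call into PETSc per entry; rows that exceed the nz/onz estimate
  # trigger costly dynamic reallocation inside PETSc.
  for (i, j, v) in zip(I, J, V)
    A[i, j] = v
  end
  PETSc.AssemblyBegin(A, PETSc.C.MAT_FINAL_ASSEMBLY)
  PETSc.AssemblyEnd(A, PETSc.C.MAT_FINAL_ASSEMBLY)
  A
end
```

The new `OwnedCellsStrategy` path introduced in this commit avoids this by building fully assembled local CSR portions and handing them to `MatCreateMPIAIJWithArrays`.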
amartinhuertas committed Jun 24, 2020
1 parent c57f212 commit 1aae8ae
Showing 5 changed files with 260 additions and 88 deletions.
6 changes: 1 addition & 5 deletions src/DistributedVectors.jl
@@ -25,10 +25,6 @@ end

# Build a Distributed vector from an index set
# the resulting object is assumed to be locally indexable when restricted to a part
function DistributedVector{T}(initializer::Function,indices::DistributedIndexSet,args...) where T
@abstractmethod
end

function DistributedVector(initializer::Function,indices::DistributedIndexSet,args...)
function DistributedVector{T}(indices::DistributedIndexSet) where T
@abstractmethod
end
262 changes: 240 additions & 22 deletions src/MPIPETScDistributedAssemblersInterfaces.jl
@@ -52,6 +52,14 @@ function Gridap.Algebra.finalize_coo!(
::Type{PETSc.Mat{Float64}},IJV::MPIPETScDistributedData,m::MPIPETScDistributedIndexSet,n::MPIPETScDistributedIndexSet)
end

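# Adapts an integer buffer to what MatCreateMPIAIJWithArrays expects: a NULL
# PetscInt pointer for an empty buffer, the buffer itself if it already stores
# PetscInt, or a freshly converted PetscInt vector otherwise.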
function _convert_buf_to_petscint(buf)
if isempty(buf)
Ptr{PETSc.C.PetscInt}(0)
else
isa(buf,Vector{PETSc.C.PetscInt}) ? buf : PETSc.C.PetscInt[ i for i in buf ]
end
end

function assemble_global_matrix(
::Type{RowsComputedLocally{false}},
::Type{PETSc.Mat{Float64}},
@@ -65,7 +73,7 @@ function assemble_global_matrix(
nlcols = nlrows

I, J, V = IJV.part
ncols_Alocal = maximum(J)
ncols_Alocal = length(m.parts.part.lid_to_owner)
for i = 1:length(I)
I[i] = m.app_to_petsc_locidx[I[i]]
end
@@ -82,54 +90,264 @@ function assemble_global_matrix(
end

p = Ref{PETSc.C.Mat{Float64}}()
f(buf)= if isempty(buf)
Ptr{PETSc.C.PetscInt}(0)
else
isa(buf,Vector{PETSc.C.PetscInt}) ? buf : PETSc.C.PetscInt[ i for i in buf ]
end

PETSc.C.chk(PETSc.C.MatCreateMPIAIJWithArrays(
get_comm(m).comm,
nlrows,
nlcols,
ngrows,
ngcols,
f(Alocal.rowptr),
f(Alocal.colval),
_convert_buf_to_petscint(Alocal.rowptr),
_convert_buf_to_petscint(Alocal.colval),
Alocal.nzval,
p,
))
A=PETSc.Mat(p[])
end

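# Builds (i) a DistributedIndexSet over the edges of the subdomain graph, i.e.,
# one global id per ordered pair of parts that must exchange matrix entries,
# and (ii) the per-part list of neighbour parts to send entries to. The
# receiving end of each edge is discovered via PetscCommBuildTwoSided below.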
function compute_subdomain_graph_dIS_and_lst_snd(gids, dI)
# List parts I have to send data to
function compute_lst_snd(part, gids, I)
lst_snd = Set{Int}()
for i = 1:length(I)
owner = gids.lid_to_owner[I[i]]
if (owner != part)
if (!(owner in lst_snd))
push!(lst_snd, owner)
end
end
end
collect(lst_snd)
end

part_to_lst_snd = DistributedData(compute_lst_snd, gids, dI)
part_to_num_snd = DistributedData(part_to_lst_snd) do part, lst_snd
length(lst_snd)
end

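# Assign a consecutive global id to each owned (outgoing) edge of the graph:
# gather the number of outgoing edges of every part, turn the counts into
# offsets, and scatter the offsets back so that each part can number its edges.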
offsets = gather(part_to_num_snd)
num_edges = sum(offsets)
GridapDistributed._fill_offsets!(offsets)
part_to_offsets = scatter(get_comm(part_to_num_snd), offsets)

part_to_owned_subdomain_graph_edge_gids =
DistributedData(part_to_num_snd, part_to_offsets) do part, num_snd, offset
owned_edge_gids = zeros(Int, num_snd)
o = offset
for i = 1:num_snd
o += 1
owned_edge_gids[i] = o
end
owned_edge_gids
end

num_snd = PETSc.C.PetscMPIInt(part_to_num_snd.part)
lst_snd = convert(Vector{PETSc.C.PetscMPIInt}, part_to_lst_snd.part) .- PETSc.C.PetscMPIInt(1)
snd_buf = part_to_owned_subdomain_graph_edge_gids.part

num_rcv = Ref{PETSc.C.PetscMPIInt}()
lst_rcv = Ref{Ptr{PETSc.C.PetscMPIInt}}()
rcv_buf = Ref{Ptr{Int64}}()

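# Two-sided handshake: each part sends the global id of its outgoing edge to
# every neighbour in lst_snd and receives, in lst_rcv/rcv_buf, the ranks and
# edge gids of the parts that send to it.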
#PETSc.C.PetscCommBuildTwoSidedSetType(Float64, get_comm(gids).comm, PETSc.C.PETSC_BUILDTWOSIDED_ALLREDUCE)
PETSc.C.chk(PETSc.C.PetscCommBuildTwoSided(Float64,
get_comm(gids).comm,
PETSc.C.PetscMPIInt(1),
MPI.Datatype(Int64).val,
num_snd,
pointer(lst_snd),
Ptr{Cvoid}(pointer(snd_buf)),
num_rcv,
lst_rcv,
Base.unsafe_convert(Ptr{Cvoid},rcv_buf)))


#TO-DO: Call to PetscFree for lst_rcv and rcv_buf
#All attempts so far resulted in a SEGfault, so I gave up
#see https://www.mcs.anl.gov/petsc/petsc-current/docs/manualpages/Sys/PetscCommBuildTwoSided.html
#for more details

num_rcv=num_rcv[]
lst_rcv_vec=unsafe_wrap(Vector{PETSc.C.PetscMPIInt},lst_rcv[],num_rcv)
rcv_buf_vec=unsafe_wrap(Vector{Int64},rcv_buf[],num_rcv)

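# Local numbering of the edge index set: the owned (outgoing) edges come first,
# followed by the received (incoming) edges together with their owner parts.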
function compute_subdomain_graph_index_set(part,num_edges,owned_edge_entries)
lid_to_gid=Vector{Int}(undef,num_snd+num_rcv)
lid_to_owner=Vector{Int}(undef,num_snd+num_rcv)
for i=1:num_snd
lid_to_gid[i]=owned_edge_entries[i]
lid_to_owner[i]=part
end
for i=num_snd+1:num_snd+num_rcv
lid_to_gid[i]=rcv_buf_vec[i-num_snd]
lid_to_owner[i]=lst_rcv_vec[i-num_snd]+1
end
IndexSet(num_edges, lid_to_gid, lid_to_owner)
end

(DistributedIndexSet(compute_subdomain_graph_index_set,
get_comm(gids),
num_edges,
num_edges,
part_to_owned_subdomain_graph_edge_gids),
part_to_lst_snd)
end



function assemble_global_matrix(
::Type{OwnedCellsStrategy{false}},
::Type{PETSc.Mat{Float64}},
IJV::MPIPETScDistributedData,
m::MPIPETScDistributedIndexSet,
n::MPIPETScDistributedIndexSet,
)

ngrows = num_gids(m)
ngcols = num_gids(n)
nlrows = num_owned_entries(m)
nlrows = length(m.parts.part.lid_to_gid)
nlcols = nlrows

# TO-DO: Properly manage PREALLOCATION by building the sparsity pattern of the matrix
A=Mat(Float64, ngrows, ngcols;
mlocal=nlrows, nlocal=nlcols, nz=100, onz=100,
comm=get_comm(m).comm, mtype=PETSc.C.MATMPIAIJ)
Ilocal = IJV.part[1]
Jlocal = IJV.part[2]
Vlocal = IJV.part[3]

A.insertmode = PETSc.C.ADD_VALUES
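# Extract, for each part, the local row indices of its locally computed entries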
dI = DistributedData(get_comm(m),IJV) do part, IJV
I,_,_ = IJV
I
end

# TO-DO: Not efficient! One call to a PETSc function per injected entry
I,J,V = IJV.part
for (i,j,v) in zip(I,J,V)
A[m.lid_to_gid_petsc[i],n.lid_to_gid_petsc[j]]=v
# 2. Determine communication pattern
dIS, part_to_lst_snd =
compute_subdomain_graph_dIS_and_lst_snd(m, dI)

# 3. Communicate entries
length_entries = DistributedVector{Int}(dIS)
do_on_parts(length_entries, m, dI, part_to_lst_snd) do part, length_entries, gid, I, lst_snd
fill!(length_entries, zero(eltype(length_entries)))
for i = 1:length(I)
owner = gid.lid_to_owner[I[i]]
if (owner != part)
edge_lid = findfirst((i) -> (i == owner), lst_snd)
length_entries[edge_lid]+=3
end
end
end
exchange!(length_entries)
dd_length_entries = DistributedData(length_entries) do part, length_entries
collect(length_entries)
end
exchange_entries_vector = DistributedVector{Vector{Float64}}(dIS, dd_length_entries)
# Pack data to be sent
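# Each off-processor entry is packed as a (global row id, global col id, value)
# triplet, matching the 3 slots per entry reserved in length_entries above.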
do_on_parts(dIS,exchange_entries_vector,
m, IJV, part_to_lst_snd) do part, IS, exchange_entries_vector,
test_gids, IJV, lst_snd
current_pack_position = fill(one(Int32), length(IS.lid_to_owner))
I,J,V = IJV
for i = 1:length(I)
owner = test_gids.lid_to_owner[I[i]]
if (owner != part)
edge_lid = findfirst((i) -> (i == owner), lst_snd)
row=m.lid_to_gid_petsc[I[i]]
col=n.lid_to_gid_petsc[J[i]]
current_pos=current_pack_position[edge_lid]
exchange_entries_vector[edge_lid][current_pos ]=row
exchange_entries_vector[edge_lid][current_pos+1]=col
exchange_entries_vector[edge_lid][current_pos+2]=V[i]
current_pack_position[edge_lid] += 3
end
end
end
exchange!(exchange_entries_vector)

#3. Combine local + remote entries
part = MPI.Comm_rank(get_comm(m).comm)+1
test_lid_to_owner = m.parts.part.lid_to_owner
test_lid_to_gid = m.lid_to_gid_petsc
test_gid_to_lid = Dict{Int,Int32}()
for (lid,gid) in enumerate(test_lid_to_gid)
test_gid_to_lid[gid] = lid
end

trial_lid_to_gid = n.lid_to_gid_petsc
trial_gid_to_lid = Dict{Int,Int32}()
for (lid,gid) in enumerate(trial_lid_to_gid)
trial_gid_to_lid[gid] = lid
end

#TODO: check with fverdugo if there is an already coded way of
# doing this vector pattern operation
lid_to_owned_lid = fill(-1, length(test_lid_to_owner))
current = 1
for i = 1:length(test_lid_to_owner)
if (test_lid_to_owner[i] == part)
lid_to_owned_lid[i] = current
current += 1
end
end

trial_lid_to_gid_extended = copy(trial_lid_to_gid)
trial_gid_to_lid_extended = copy(trial_gid_to_lid)

I,J,V = IJV.part
IS = dIS.parts.part
remote_entries = exchange_entries_vector.part

GI = Int64[]
GJ = Int64[]
GV = Float64[]
# Add local entries
for i = 1:length(I)
owner = test_lid_to_owner[I[i]]
if (owner == part)
push!(GI, lid_to_owned_lid[I[i]])
push!(GJ, J[i])
push!(GV, V[i])
end
end
# Add remote entries
for edge_lid = 1:length(IS.lid_to_gid)
if (IS.lid_to_owner[edge_lid] != part)
for i = 1:3:length(remote_entries[edge_lid])
row=Int64(remote_entries[edge_lid][i])
push!(GI, lid_to_owned_lid[test_gid_to_lid[row]])
col=Int64(remote_entries[edge_lid][i+1])
if (!(haskey(trial_gid_to_lid_extended,col)))
trial_gid_to_lid_extended[col]=length(trial_lid_to_gid_extended)+1
push!(trial_lid_to_gid_extended, col)
end
push!(GJ, trial_gid_to_lid_extended[col])
push!(GV, remote_entries[edge_lid][i+2])
end
end
end

# 4. Build fully assembled local portions
n_owned_dofs = num_owned_entries(m)
Alocal = sparse_from_coo(
Gridap.Algebra.SparseMatrixCSR{0,Float64,Int64},
GI,
GJ,
GV,
n_owned_dofs,
length(trial_lid_to_gid_extended))

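# Translate the 0-based CSR column indices from the extended local numbering
# back to the 0-based global column ids expected by MatCreateMPIAIJWithArrays.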
for i = 1:length(Alocal.colval)
Alocal.colval[i] = trial_lid_to_gid_extended[Alocal.colval[i]+1] - 1
end

PETSc.AssemblyBegin(A, PETSc.C.MAT_FINAL_ASSEMBLY)
PETSc.AssemblyEnd(A, PETSc.C.MAT_FINAL_ASSEMBLY)
A
# 5. Build global matrix from fully assembled local portions
p = Ref{PETSc.C.Mat{Float64}}()
PETSc.C.chk(PETSc.C.MatCreateMPIAIJWithArrays(
get_comm(m).comm,
n_owned_dofs,
n_owned_dofs,
ngrows,
ngcols,
_convert_buf_to_petscint(Alocal.rowptr),
_convert_buf_to_petscint(Alocal.colval),
Alocal.nzval,
p,))
A=PETSc.Mat(p[])
end

function assemble_global_vector(
