ufs-release/public-v2: bit-for-bit reproducibility for regional runs when changing MPI decomposition #68

Merged
50 changes: 32 additions & 18 deletions model/dyn_core.F90
@@ -555,19 +555,23 @@ subroutine dyn_core(npx, npy, npz, ng, sphum, nq, bdt, n_map, n_split, zvir, cp,
endif

if (flagstruct%regional) then
call timing_on('COMM_TOTAL')
reg_bc_update_time=current_time_in_seconds+bdt*(n_map-1)+(0.5+(it-1))*dt
call regional_boundary_update(delpc, 'delp', &
isd, ied, jsd, jed, npz, &
is, ie, js, je, &
isd, ied, jsd, jed, &
reg_bc_update_time )
call mpp_update_domains(delpc, domain, complete=.true.)
#ifndef SW_DYNAMICS
call regional_boundary_update(ptc, 'pt', &
isd, ied, jsd, jed, npz, &
is, ie, js, je, &
isd, ied, jsd, jed, &
reg_bc_update_time )
call mpp_update_domains(ptc, domain, complete=.true.)
#endif
call timing_off('COMM_TOTAL')
endif

if ( hydrostatic ) then
@@ -670,7 +674,7 @@ subroutine dyn_core(npx, npy, npz, ng, sphum, nq, bdt, n_map, n_split, zvir, cp,

endif

#endif SW_DYNAMICS
#endif

endif ! end hydro check

@@ -719,9 +723,7 @@ subroutine dyn_core(npx, npy, npz, ng, sphum, nq, bdt, n_map, n_split, zvir, cp,

if (flagstruct%regional) then

!call exch_uv(domain, bd, npz, vc, uc)
call mpp_update_domains(uc, vc, domain, gridtype=CGRID_NE)

reg_bc_update_time=current_time_in_seconds+bdt*(n_map-1)+(0.5+(it-1))*dt
call regional_boundary_update(vc, 'vc', &
isd, ied, jsd, jed+1, npz, &
@@ -734,13 +736,18 @@ subroutine dyn_core(npx, npy, npz, ng, sphum, nq, bdt, n_map, n_split, zvir, cp,
isd, ied, jsd, jed, &
reg_bc_update_time )
call mpp_update_domains(uc, vc, domain, gridtype=CGRID_NE)

!!! Currently divgd is always 0.0 in the regional domain boundary area.
reg_bc_update_time=current_time_in_seconds+bdt*(n_map-1)+(it-1)*dt
call regional_boundary_update(divgd, 'divgd', &
isd, ied+1, jsd, jed+1, npz, &
is, ie, js, je, &
isd, ied, jsd, jed, &
reg_bc_update_time )
! DH* 2021-02-03: no need to execute this code, because divgd is always
! 0.0 in the regional domain boundary area. This code breaks b4b reproducibility
! when changing the MPI decomposition for regional runs.
! reg_bc_update_time=current_time_in_seconds+bdt*(n_map-1)+(it-1)*dt
! call regional_boundary_update(divgd, 'divgd', &
! isd, ied+1, jsd, jed+1, npz, &
! is, ie, js, je, &
! isd, ied, jsd, jed, &
! reg_bc_update_time )
! *DH
endif

if ( flagstruct%inline_q ) then
@@ -986,36 +993,39 @@ subroutine dyn_core(npx, npy, npz, ng, sphum, nq, bdt, n_map, n_split, zvir, cp,
#ifdef USE_COND
call nested_grid_BC_apply_intT(q_con, &
0, 0, npx, npy, npz, bd, split_timestep_BC+1, real(n_split*flagstruct%k_split), &
neststruct%q_con_BC, bctype=neststruct%nestbctype )
neststruct%q_con_BC, bctype=neststruct%nestbctype )
#endif

#endif

end if

if (flagstruct%regional) then
call timing_on('COMM_TOTAL')
reg_bc_update_time=current_time_in_seconds+bdt*(n_map-1)+(it-1)*dt
call regional_boundary_update(delp, 'delp', &
isd, ied, jsd, jed, npz, &
is, ie, js, je, &
isd, ied, jsd, jed, &
reg_bc_update_time )
call mpp_update_domains(delp, domain, complete=.true.)
#ifndef SW_DYNAMICS
call regional_boundary_update(pt, 'pt', &
isd, ied, jsd, jed, npz, &
is, ie, js, je, &
isd, ied, jsd, jed, &
reg_bc_update_time )

call mpp_update_domains(pt, domain, complete=.true.)
#ifdef USE_COND
call regional_boundary_update(q_con, 'q_con', &
isd, ied, jsd, jed, npz, &
is, ie, js, je, &
isd, ied, jsd, jed, &
reg_bc_update_time )
call mpp_update_domains(q_con, domain, complete=.true.)
#endif

#endif
call timing_off('COMM_TOTAL')
endif

if ( hydrostatic ) then
@@ -1122,7 +1132,7 @@ subroutine dyn_core(npx, npy, npz, ng, sphum, nq, bdt, n_map, n_split, zvir, cp,
call complete_group_halo_update(i_pack(5), domain)
call timing_off('COMM_TOTAL')
endif
#endif SW_DYNAMICS
#endif
endif ! end hydro check

#ifdef SW_DYNAMICS
@@ -1238,7 +1248,7 @@ subroutine dyn_core(npx, npy, npz, ng, sphum, nq, bdt, n_map, n_split, zvir, cp,
endif

#ifndef ROT3
if ( it/=n_split) &
if (.not. flagstruct%regional .and. it/=n_split) &
call start_group_halo_update(i_pack(8), u, v, domain, gridtype=DGRID_NE)
#endif
call timing_off('COMM_TOTAL')
@@ -1308,7 +1318,7 @@ subroutine dyn_core(npx, npy, npz, ng, sphum, nq, bdt, n_map, n_split, zvir, cp,
0, 0, npx, npy, npz, bd, split_timestep_BC+1, real(n_split*flagstruct%k_split), &
neststruct%w_BC, bctype=neststruct%nestbctype )
end if
#endif SW_DYNAMICS
#endif
call nested_grid_BC_apply_intT(u, &
0, 1, npx, npy, npz, bd, split_timestep_BC+1, real(n_split*flagstruct%k_split), &
neststruct%u_BC, bctype=neststruct%nestbctype )
@@ -1319,7 +1329,7 @@ subroutine dyn_core(npx, npy, npz, ng, sphum, nq, bdt, n_map, n_split, zvir, cp,
end if

if (flagstruct%regional) then

call timing_on('COMM_TOTAL')
#ifndef SW_DYNAMICS
if (.not. hydrostatic) then
reg_bc_update_time=current_time_in_seconds+bdt*(n_map-1)+it*dt
@@ -1329,7 +1339,7 @@ subroutine dyn_core(npx, npy, npz, ng, sphum, nq, bdt, n_map, n_split, zvir, cp,
isd, ied, jsd, jed, &
reg_bc_update_time )
endif
#endif SW_DYNAMICS
#endif

call regional_boundary_update(u, 'u', &
isd, ied, jsd, jed+1, npz, &
@@ -1341,8 +1351,12 @@ subroutine dyn_core(npx, npy, npz, ng, sphum, nq, bdt, n_map, n_split, zvir, cp,
is, ie, js, je, &
isd, ied, jsd, jed, &
reg_bc_update_time )

call mpp_update_domains(u, v, domain, gridtype=DGRID_NE)
#ifndef ROT3
if (it/=n_split) &
call start_group_halo_update(i_pack(8), u, v, domain, gridtype=DGRID_NE)
#endif
call timing_off('COMM_TOTAL')
end if

!-----------------------------------------------------
3 changes: 3 additions & 0 deletions model/fv_tracer2d.F90
@@ -749,6 +749,9 @@ subroutine tracer_2d_nested(q, dp1, mfx, mfy, cx, cy, gridstruct, bd, domain, np
reg_bc_update_time, &
iq )
enddo
call timing_on('COMM_TOTAL')
Contributor:
This is OK but could likely be avoided by having the regional boundary update fill boundary rows that lie in the haloes too: i.e. if this is a west edge, then have the BCs fill in (-2:0) x (jsd:jed) instead of just (-2:0) x (jsc:jec).
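
A minimal Fortran sketch of that suggestion, for a west-edge rank only: the regional BC fill covers the full halo extent in j (jsd:jed) rather than just the compute rows, so the halo rows along the boundary would already hold boundary values and the extra mpp_update_domains call could be dropped. The subroutine name, the bc_west array, and the is_west_edge flag are hypothetical stand-ins rather than the actual regional BC data structures; only fv_grid_bounds_type and its bounds members come from FV3.

! Sketch only (not part of the PR diff): fill west-edge halo rows directly from the BCs.
subroutine fill_west_bc_with_halo(q, bc_west, bd, npz, is_west_edge)
  use fv_arrays_mod, only: fv_grid_bounds_type
  implicit none
  type(fv_grid_bounds_type), intent(in) :: bd
  integer, intent(in)    :: npz
  logical, intent(in)    :: is_west_edge   ! true only on ranks that own the west edge
  real,    intent(in)    :: bc_west(bd%isd:bd%is-1, bd%jsd:bd%jed, npz)
  real,    intent(inout) :: q(bd%isd:bd%ied, bd%jsd:bd%jed, npz)
  integer :: i, j, k

  if (.not. is_west_edge) return
  do k = 1, npz
     do j = bd%jsd, bd%jed            ! full halo extent in j, not just js:je
        do i = bd%isd, bd%is-1        ! e.g. (-2:0) for a 3-row halo with is=1
           q(i,j,k) = bc_west(i,j,k)  ! boundary values land in the halo rows directly
        enddo
     enddo
  enddo
end subroutine fill_west_bc_with_halo

With all four edges handled this way, the mpp_update_domains(q, domain, complete=.true.) added in this hunk would become redundant for the boundary region, which is what the rest of this thread discusses.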

Contributor:
I think the nesting code already does this.

Author:
@lharris4 Can you please explain what you mean? I tried to skip this one update call, and the results were different. I did that with every single change in this PR, actually: I took them out one by one, left the others in, and reran the tests. Every time they failed, so I think all of them are required (unless changes are made to the regional BC update code as you mentioned).

Author:
Hello. That is exactly what I mean: I think that if the regional BC update code is changed, this mpp_update_domains call is unnecessary. Lucas

Got it, thank you. Do you think it is ok to get the changes into the public release code as they are now, or should we try to follow your suggestion? The reason I am asking is that the SRW App is on a tight release schedule. The code freeze was last week and we were hoping to get the b4b issues fixed and merged as an "emergency bugfix".

We should definitely follow your suggestions for the authoritative/development branches and not try to bring over the public release code changes. Doing so would probably allow us to remove several of the already existing calls to mpp_update_domains, if I understand this correctly, but it will require a bit of time for the development to happen.

Collaborator:
@lharris4 @bensonr I would like to follow up on this issue: do we have the boundary fill updates in the master and dev/emc branches? We still have a decomposition reproducibility issue with the regional test and would like to see if we can try the fix you mentioned.

call mpp_update_domains(q, domain, complete=.true.)
call timing_off('COMM_TOTAL')
endif


6 changes: 3 additions & 3 deletions model/sw_core.F90
@@ -118,12 +118,12 @@ subroutine c_sw(delpc, delp, ptc, pt, u,v, w, uc,vc, ua,va, wc, &
type(fv_flags_type), intent(IN), target :: flagstruct

! Local:
logical:: sw_corner, se_corner, ne_corner, nw_corner
logical:: sw_corner, se_corner, ne_corner, nw_corner
real, dimension(bd%is-1:bd%ie+1,bd%js-1:bd%je+1):: vort, ke
real, dimension(bd%is-1:bd%ie+2,bd%js-1:bd%je+1):: fx, fx1, fx2
real, dimension(bd%is-1:bd%ie+1,bd%js-1:bd%je+2):: fy, fy1, fy2
real :: dt4
integer :: i,j, is2, ie1
integer :: i,j
integer iep1, jep1

integer :: is, ie, js, je
@@ -2878,7 +2878,7 @@ subroutine d2a2c_vect(u, v, ua, va, uc, vc, ut, vt, dord4, gridstruct, &
utmp(:,:) = big_number
vtmp(:,:) = big_number

if ( bounded_domain ) then
if ( bounded_domain ) then

do j=jsd+1,jed-1
do i=isd,ied