Skip to content

Commit

Permalink
Fixed performance bug caused by redundant packing. (#680)
Browse files Browse the repository at this point in the history
Details:
- Fixed a performance bug whereby multiple threads were redundantly
  packing the same (rather than separate) micropanels. This bug was
  caused by different parts of the code using the num_threads/thread_id
  field of the thrinfo_t vs. the n_way/work_id fields. The fix was to 
  standardize on the latter and provide a "fake" thrinfo_t sub-prenode 
  in the thrinfo tree which consists of single-member thread teams. The 
  single team with multiple threads node is still required since it and 
  only it can be used to perform barriers and broadcasts (e.g. of the 
  packed buffer pointer).
  • Loading branch information
devinamatthews authored Oct 31, 2022
1 parent aeb5f0c commit 29f79f0
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 25 deletions.
38 changes: 22 additions & 16 deletions frame/1m/packm/bli_packm_blk_var1.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,11 @@ static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md);

void bli_packm_blk_var1
(
const obj_t* c,
obj_t* p,
const cntx_t* cntx,
const cntl_t* cntl,
thrinfo_t* thread
const obj_t* c,
obj_t* p,
const cntx_t* cntx,
const cntl_t* cntl,
thrinfo_t* thread_par
)
{
// Extract various fields from the control tree.
Expand All @@ -67,12 +67,18 @@ void bli_packm_blk_var1
bool revifup = bli_cntl_packm_params_rev_iter_if_upper( cntl );
bool reviflo = bli_cntl_packm_params_rev_iter_if_lower( cntl );

// Every thread initializes p and determines the size of memory
// block needed (which gets embedded into the otherwise "blank" mem_t
// entry in the control tree node). Return early if no packing is required.
if ( !bli_packm_init( c, p, cntx, cntl, thread ) )
// Every thread initializes p and determines the size of memory block
// needed (which gets embedded into the otherwise "blank" mem_t entry
// in the control tree node). Return early if no packing is required.
if ( !bli_packm_init( c, p, cntx, cntl, bli_thrinfo_sub_node( thread_par ) ) )
return;

// Use the sub-prenode. In bli_l3_thrinfo_grow(), this node was created to
// represent the team of threads as a group of single-member thread teams.
// This is necessary since the all of the work distribution function depend
// on the work_id and n_way fields.
thrinfo_t* thread = bli_thrinfo_sub_prenode( thread_par );

// Check parameters.
if ( bli_error_checking_is_enabled() )
bli_packm_int_check( c, p, cntx );
Expand Down Expand Up @@ -134,11 +140,11 @@ void bli_packm_blk_var1
packm_ker_cast = params->ukr_fn[ dt_c ][ dt_p ];
}

/* Compute the total number of iterations we'll need. */
// Compute the total number of iterations we'll need.
dim_t n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 );

/* Set the initial values and increments for indices related to C and P
based on whether reverse iteration was requested. */
// Set the initial values and increments for indices related to C and P
// based on whether reverse iteration was requested.
dim_t ic0, ip0;
doff_t ic_inc, ip_inc;

Expand All @@ -158,10 +164,10 @@ void bli_packm_blk_var1
ip_inc = 1;
}

// Query the number of threads and thread ids from the current thread's
// packm thrinfo_t node.
const dim_t nt = bli_thrinfo_num_threads( thread );
const dim_t tid = bli_thrinfo_thread_id( thread );
// Query the number of threads (single-member thread teams) and the thread
// team ids from the current thread's packm thrinfo_t node.
const dim_t nt = bli_thrinfo_n_way( thread );
const dim_t tid = bli_thrinfo_work_id( thread );

// Determine the thread range and increment using the current thread's
// packm thrinfo_t node. NOTE: The definition of bli_thread_range_jrir()
Expand Down
21 changes: 15 additions & 6 deletions frame/1m/packm/bli_packm_int.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@

void bli_packm_int
(
const obj_t* a,
obj_t* p,
const cntx_t* cntx,
const cntl_t* cntl,
const obj_t* a,
obj_t* p,
const cntx_t* cntx,
const cntl_t* cntl,
thrinfo_t* thread_par
)
{
Expand All @@ -53,14 +53,23 @@ void bli_packm_int
thrinfo_t* thread = bli_thrinfo_sub_node( thread_par );
bli_thrinfo_barrier( thread );

// Invoke the variant with kappa_use.
// Invoke the packm variant.
// NOTE: The packing kernel uses two communicators: one which represents a
// single workgroup of many threads, and one which represents a group of
// many single-member workgroups. The former communicator is used for
// barriers and thread communication (i.e. broadcasting the pack buffer
// pointer), while the latter communicator is used for partitioning work.
// This is because all of the thread range functions rely on the work_id
// and number of workgroups (n_way). Thus, we pass along the parent
// thrinfo_t node which has these two communicators as the sub-node and
// sub-prenode, respectively.
f
(
a,
p,
cntx,
cntl,
thread
thread_par
);

// Barrier so that packing is done before computation.
Expand Down
22 changes: 19 additions & 3 deletions frame/3/bli_l3_thrinfo.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,19 @@ void bli_l3_thrinfo_grow
thrinfo_t* thread_cur = bli_thrinfo_split( n_way, thread_par );
bli_thrinfo_set_sub_node( thread_cur, thread_par );

if ( sub_prenode != NULL )
if ( bszid == BLIS_NO_PART )
{
// A hack: the packing code needs a thread communicator which represents
// a group of single-member thread teams working cooperatively However,
// the "normal" packm thrinfo_t node has a single team of multiple
// threads. Our solution (for now) is to create a sub-prenode on the
// thrinfo_t tree which splits this single team into multiple
// single-member thread teams.
const dim_t n_threads = bli_thrinfo_num_threads( thread_par );
thrinfo_t* thread_pre = bli_thrinfo_split( n_threads, thread_par );
bli_thrinfo_set_sub_prenode( thread_pre, thread_par );
}
else if ( sub_prenode != NULL )
{
// A pre-node is only used in the IC loop of trsm. In this case,
// we cannot actually thread in the m dimension due to data dependencies
Expand All @@ -88,8 +100,12 @@ void bli_l3_thrinfo_grow
bli_rntm_set_ic_ways_only( 1, &rntm_l );
bli_rntm_set_jr_ways_only( ic_nway*jr_nway, &rntm_l );

// Use thread_pre instead of thread_cur since we *don't* want to
// do any parallelism at this level.
// Use thread_pre instead of thread_cur since we *don't* want to do any
// parallelism at this level. So the thread_pre node gets attached to
// thread_par and not thread_cur! This results in a split "one level
// higher" than in the corresponding cntl_t tree. This is intentional
// since two different thrinfo_t nodes will be used at the cntl_t node
// for trsm blocked variant 1 (one for trsm, one for gemm).
thrinfo_t* thread_pre = bli_thrinfo_split( 1, thread_par );
bli_thrinfo_set_sub_prenode( thread_pre, thread_par );
bli_l3_thrinfo_grow( thread_pre, &rntm_l, sub_prenode );
Expand Down

0 comments on commit 29f79f0

Please sign in to comment.