Merge pull request #12 from sfiligoi/igor_cpu_vect2210uw
Add zero checking to Unweighted for CPU code
sfiligoi authored Oct 10, 2022
2 parents 10d3b8a + b0f5967 commit d95bc96
Showing 2 changed files with 309 additions and 53 deletions.
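In outline, the change precomputes a per-sample "all zero" flag over the packed 64-bit embedded proportions and uses it in the CPU path of the unweighted task to skip pairs where both samples are empty, or to take a cheaper one-sided path when only one is. A minimal standalone sketch of the precompute step, with hypothetical names rather than the library's own types, might look like this:

#include <cstdint>
#include <vector>

// Sketch: flag samples whose packed 64-bit bitmaps are zero in every embedding block.
// 'packed' is assumed laid out as blocks of n_samples_r 64-bit words, one block per embedding element.
static void flag_all_zero_samples(const uint64_t *packed,
                                  uint64_t filled_embs_els,
                                  uint64_t n_samples,
                                  uint64_t n_samples_r,
                                  std::vector<bool> &zcheck) {
  zcheck.assign(n_samples, true);
  for (uint64_t k = 0; k < n_samples; k++) {
    bool all_zeros = true;
    for (uint64_t emb = 0; emb < filled_embs_els; emb++) {
      all_zeros = all_zeros && (packed[emb * n_samples_r + k] == 0);
    }
    zcheck[k] = all_zeros;
  }
}

Pairs where both flags are set contribute nothing to the stripes and are skipped outright; pairs with exactly one flagged sample reduce to a single load per block, as the new Unweighted1 helper in the diff below shows.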
344 changes: 293 additions & 51 deletions src/unifrac_task.cpp
@@ -52,6 +52,34 @@ static inline void WeightedZerosAndSums(
}
}

// check for all-zero samples
template<class T>
static inline void WeightedZerosAsync(
bool * const __restrict__ zcheck,
const T * const __restrict__ embedded_proportions,
const unsigned int filled_embs,
const uint64_t n_samples,
const uint64_t n_samples_r) {
#ifdef _OPENACC
#pragma acc parallel loop gang vector present(embedded_proportions,zcheck) async
#else
#pragma omp parallel for default(shared)
#endif
for(uint64_t k=0; k<n_samples; k++) {
bool all_zeros=true;

#pragma acc loop seq
for (uint64_t emb=0; emb<filled_embs; emb++) {
const uint64_t offset = n_samples_r * emb;

T u1 = embedded_proportions[offset + k];
all_zeros = all_zeros && (u1==0);
}

zcheck[k] = all_zeros;
}
}
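// Illustrative sketch (hypothetical names, not part of this diff): how per-sample flags
// like zcheck split the pairwise work into the three cases used by Unweighted1 below.
enum class PairKind { Skip, OneSided, Full };
static inline PairKind classify_pair(const bool * const __restrict__ zcheck,
                                     const uint64_t k, const uint64_t l1) {
  if (zcheck[k] && zcheck[l1]) return PairKind::Skip;      // both all-zero: nothing to add
  if (zcheck[k] || zcheck[l1]) return PairKind::OneSided;  // | and ^ reduce to the non-zero sample
  return PairKind::Full;                                   // need both samples
}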


// Single step in computing Weighted part of Unifrac
template<class TFloat>
@@ -1165,6 +1193,254 @@ void SUCMP_NM::UnifracVawGeneralizedTask<TFloat>::_run(unsigned int filled_embs,
#endif
}

// Single step in computing Unweighted Unifrac
#ifdef _OPENACC
template<class TFloat>
static inline void Unweighted1(
TFloat * const __restrict__ dm_stripes_buf,
TFloat * const __restrict__ dm_stripes_total_buf,
const TFloat * const __restrict__ sums,
const uint64_t * const __restrict__ embedded_proportions,
const unsigned int filled_embs_els_round,
const uint64_t idx,
const uint64_t n_samples_r,
const uint64_t k,
const uint64_t l1) {
TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
//TFloat *dm_stripe = dm_stripes[stripe];
//TFloat *dm_stripe_total = dm_stripes_total[stripe];

bool did_update = false;
TFloat my_stripe = 0.0;
TFloat my_stripe_total = 0.0;

for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
const uint64_t offset = n_samples_r * emb_el;
const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);

uint64_t u1 = embedded_proportions[offset + k];
uint64_t v1 = embedded_proportions[offset + l1];
uint64_t o1 = u1 | v1;

if (o1!=0) { // zeros are prevalent
did_update=true;
uint64_t x1 = u1 ^ v1;

// Use the pre-computed sums
// Each set of 8 lengths has already been pre-computed and stored in psum
// Since the embedded_proportions packed format is 64-bit for performance reasons,
// we need to add the 8 sums, using the eight 8-bit chunks to address into psum

my_stripe_total += psum[ (uint8_t)(o1) ] +
psum[0x100+((uint8_t)(o1 >> 8))] +
psum[0x200+((uint8_t)(o1 >> 16))] +
psum[0x300+((uint8_t)(o1 >> 24))] +
psum[0x400+((uint8_t)(o1 >> 32))] +
psum[0x500+((uint8_t)(o1 >> 40))] +
psum[0x600+((uint8_t)(o1 >> 48))] +
psum[0x700+((uint8_t)(o1 >> 56))];
my_stripe += psum[ (uint8_t)(x1) ] +
psum[0x100+((uint8_t)(x1 >> 8))] +
psum[0x200+((uint8_t)(x1 >> 16))] +
psum[0x300+((uint8_t)(x1 >> 24))] +
psum[0x400+((uint8_t)(x1 >> 32))] +
psum[0x500+((uint8_t)(x1 >> 40))] +
psum[0x600+((uint8_t)(x1 >> 48))] +
psum[0x700+((uint8_t)(x1 >> 56))];
}
}

if (did_update) {
dm_stripe[k] += my_stripe;
dm_stripe_total[k] += my_stripe_total;
}

}
#else
template<class TFloat>
static inline void Unweighted1(
TFloat * const __restrict__ dm_stripes_buf,
TFloat * const __restrict__ dm_stripes_total_buf,
const bool * const __restrict__ zcheck,
const TFloat * const __restrict__ sums,
const uint64_t * const __restrict__ embedded_proportions,
const unsigned int filled_embs_els_round,
const uint64_t idx,
const uint64_t n_samples_r,
const uint64_t k,
const uint64_t l1) {
const bool allzero_k = zcheck[k];
const bool allzero_l1 = zcheck[l1];

if (allzero_k && allzero_l1) {
// nothing to do, would have to add 0
} else {
TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
//TFloat *dm_stripe = dm_stripes[stripe];
//TFloat *dm_stripe_total = dm_stripes_total[stripe];

bool did_update = false;
TFloat my_stripe = 0.0;
TFloat my_stripe_total = 0.0;

if (allzero_k || allzero_l1) {
// with one side all-zero, | and ^ both reduce to the other side
const uint64_t kl = (allzero_k) ? l1 : k; // only use the non-zero one
for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
const uint64_t offset = n_samples_r * emb_el;
const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);

//uint64_t u1 = embedded_proportions[offset + k];
//uint64_t v1 = embedded_proportions[offset + l1];
//uint64_t o1 = u1 | v1;
// With one of the two being 0, OR is just the non-zero one
uint64_t o1 = embedded_proportions[offset + kl];

if (o1==0) { // zeros are prevalent
// nothing to do
} else if (((uint32_t)o1)==0) {
// only high part relevant
did_update=true;
//uint64_t x1 = u1 ^ v1;
// With one of the two being 0, XOR is just the non-zero one

// Use the pre-computed sums
// Each set of 8 lengths has already been pre-computed and stored in psum
// Since the embedded_proportions packed format is 64-bit for performance reasons,
// only the high four 8-bit chunks can be non-zero here, so we add their four sums from psum

TFloat esum = psum[0x400+((uint8_t)(o1 >> 32))] +
psum[0x500+((uint8_t)(o1 >> 40))] +
psum[0x600+((uint8_t)(o1 >> 48))] +
psum[0x700+((uint8_t)(o1 >> 56))];
my_stripe_total += esum;
my_stripe += esum;
} else if ((o1>>32)==0) {
// only low part relevant
did_update=true;
//uint64_t x1 = u1 ^ v1;
// With one of the two being 0, XOR is just the non-zero one

// Use the pre-computed sums
// Each set of 8 lengths has already been pre-computed and stored in psum
// Since the embedded_proportions packed format is 64-bit for performance reasons,
// only the low four 8-bit chunks can be non-zero here, so we add their four sums from psum

TFloat esum = psum[ (uint8_t)(o1) ] +
psum[0x100+((uint8_t)(o1 >> 8))] +
psum[0x200+((uint8_t)(o1 >> 16))] +
psum[0x300+((uint8_t)(o1 >> 24))];
my_stripe_total += esum;
my_stripe += esum;
} else {
did_update=true;
//uint64_t x1 = u1 ^ v1;
// With one of the two being 0, XOR is just the non-zero one

// Use the pre-computed sums
// Each set of 8 lengths has already been pre-computed and stored in psum
// Since the embedded_proportions packed format is 64-bit for performance reasons,
// we need to add the 8 sums, using the eight 8-bit chunks to address into psum

TFloat esum = psum[ (uint8_t)(o1) ] +
psum[0x100+((uint8_t)(o1 >> 8))] +
psum[0x200+((uint8_t)(o1 >> 16))] +
psum[0x300+((uint8_t)(o1 >> 24))] +
psum[0x400+((uint8_t)(o1 >> 32))] +
psum[0x500+((uint8_t)(o1 >> 40))] +
psum[0x600+((uint8_t)(o1 >> 48))] +
psum[0x700+((uint8_t)(o1 >> 56))];
my_stripe_total += esum;
my_stripe += esum;
}
}
} else {
// we need both sides
for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
const uint64_t offset = n_samples_r * emb_el;
const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);

uint64_t u1 = embedded_proportions[offset + k];
uint64_t v1 = embedded_proportions[offset + l1];
uint64_t o1 = u1 | v1;
uint64_t x1 = u1 ^ v1;

if (o1==0) { // zeros are prevalent
// nothing to do
} else if (((uint32_t)o1)==0) {
// only high part relevant
did_update=true;

// Use the pre-computed sums
// Each set of 8 lengths has already been pre-computed and stored in psum
// Since the embedded_proportions packed format is 64-bit for performance reasons,
// only the high four 8-bit chunks can be non-zero here, so we add their four sums from psum

my_stripe_total += psum[0x400+((uint8_t)(o1 >> 32))] +
psum[0x500+((uint8_t)(o1 >> 40))] +
psum[0x600+((uint8_t)(o1 >> 48))] +
psum[0x700+((uint8_t)(o1 >> 56))];
my_stripe += psum[0x400+((uint8_t)(x1 >> 32))] +
psum[0x500+((uint8_t)(x1 >> 40))] +
psum[0x600+((uint8_t)(x1 >> 48))] +
psum[0x700+((uint8_t)(x1 >> 56))];
} else if ((o1>>32)==0) {
// only low part relevant
did_update=true;

// Use the pre-computed sums
// Each set of 8 lengths has already been pre-computed and stored in psum
// Since the embedded_proportions packed format is 64-bit for performance reasons,
// only the low four 8-bit chunks can be non-zero here, so we add their four sums from psum

my_stripe_total += psum[ (uint8_t)(o1) ] +
psum[0x100+((uint8_t)(o1 >> 8))] +
psum[0x200+((uint8_t)(o1 >> 16))] +
psum[0x300+((uint8_t)(o1 >> 24))];
my_stripe += psum[ (uint8_t)(x1) ] +
psum[0x100+((uint8_t)(x1 >> 8))] +
psum[0x200+((uint8_t)(x1 >> 16))] +
psum[0x300+((uint8_t)(x1 >> 24))];

} else {
did_update=true;

// Use the pre-computed sums
// Each set of 8 lengths has already been pre-computed and stored in psum
// Since the embedded_proportions packed format is 64-bit for performance reasons,
// we need to add the 8 sums, using the eight 8-bit chunks to address into psum

my_stripe_total += psum[ (uint8_t)(o1) ] +
psum[0x100+((uint8_t)(o1 >> 8))] +
psum[0x200+((uint8_t)(o1 >> 16))] +
psum[0x300+((uint8_t)(o1 >> 24))] +
psum[0x400+((uint8_t)(o1 >> 32))] +
psum[0x500+((uint8_t)(o1 >> 40))] +
psum[0x600+((uint8_t)(o1 >> 48))] +
psum[0x700+((uint8_t)(o1 >> 56))];
my_stripe += psum[ (uint8_t)(x1) ] +
psum[0x100+((uint8_t)(x1 >> 8))] +
psum[0x200+((uint8_t)(x1 >> 16))] +
psum[0x300+((uint8_t)(x1 >> 24))] +
psum[0x400+((uint8_t)(x1 >> 32))] +
psum[0x500+((uint8_t)(x1 >> 40))] +
psum[0x600+((uint8_t)(x1 >> 48))] +
psum[0x700+((uint8_t)(x1 >> 56))];
}
}
} // (allzero_k || allzero_l1)

if (did_update) {
dm_stripe[k] += my_stripe;
dm_stripe_total[k] += my_stripe_total;
}
} // (allzero_k && allzero_l1)

}
#endif
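// Illustrative sketch (hypothetical names, not part of this diff): the psum lookup used above.
// For each 64-bit embedding block, sums holds eight 256-entry sub-tables (0x800 values);
// entry b of sub-table j is the sum of the branch lengths selected by the bits of byte b
// at byte position j, so a full 64-bit mask is summed with eight table lookups.
static inline void psum_demo() {
  double lengths[64];
  for (int i = 0; i < 64; i++) lengths[i] = 0.5 + 0.01 * i;   // arbitrary branch lengths

  // build one 8 x 256 block of pre-computed sums
  static double psum[8 * 0x100];
  for (int byte = 0; byte < 8; byte++)
    for (int b = 0; b < 0x100; b++) {
      double s = 0.0;
      for (int bit = 0; bit < 8; bit++)
        if (b & (1 << bit)) s += lengths[byte * 8 + bit];
      psum[byte * 0x100 + b] = s;
    }

  // table-driven sum over the set bits of a 64-bit mask, as in Unweighted1
  const uint64_t mask = 0x8000000100000081ULL;
  double table_sum = 0.0;
  for (int byte = 0; byte < 8; byte++)
    table_sum += psum[byte * 0x100 + (uint8_t)(mask >> (8 * byte))];

  // direct reference sum over individual bits; should match table_sum
  double direct_sum = 0.0;
  for (int i = 0; i < 64; i++)
    if (mask & (1ULL << i)) direct_sum += lengths[i];
  (void)table_sum; (void)direct_sum;
}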

template<class TFloat>
void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, const TFloat * __restrict__ lengths) {
const uint64_t start_idx = this->task_p->start;
@@ -1187,6 +1463,14 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con

const uint64_t filled_embs_els_round = (filled_embs+63)/64;

#ifndef _OPENACC
// not effective for GPUs, but helps a lot on the CPUs
bool * const __restrict__ zcheck = this->zcheck;
// check for all-zero samples
WeightedZerosAsync(zcheck,
embedded_proportions,
filled_embs_els_round, n_samples, n_samples_r);
#endif

// pre-compute sums of length elements, since they are likely to be accessed many times
// We will use a 8-bit map, to keep it small enough to keep in L1 cache
@@ -1247,7 +1531,7 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
#ifdef _OPENACC
#pragma acc wait
const unsigned int acc_vector_size = SUCMP_NM::UnifracUnweightedTask<TFloat>::acc_vector_size;
#pragma acc parallel loop collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,sums) async
#pragma acc parallel loop collapse(3) gang vector vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,sums) async
#else
// use dynamic scheduling due to non-homogeneity in the loop
#pragma omp parallel for schedule(dynamic,1) default(shared)
@@ -1257,61 +1541,19 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
for(uint64_t ik = 0; ik < step_size ; ik++) {
const uint64_t k = sk*step_size + ik;
const uint64_t idx = (stripe-start_idx) * n_samples_r;
TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
//TFloat *dm_stripe = dm_stripes[stripe];
//TFloat *dm_stripe_total = dm_stripes_total[stripe];

if (k>=n_samples) continue; // past the limit

const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound

bool did_update = false;
TFloat my_stripe = 0.0;
TFloat my_stripe_total = 0.0;

#pragma acc loop seq
for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
const uint64_t offset = n_samples_r * emb_el;
const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);

uint64_t u1 = embedded_proportions[offset + k];
uint64_t v1 = embedded_proportions[offset + l1];
uint64_t o1 = u1 | v1;

if (o1!=0) { // zeros are prevalent
did_update=true;
uint64_t x1 = u1 ^ v1;

// Use the pre-computed sums
// Each set of 8 lengths has already been pre-computed and stored in psum
// Since the embedded_proportions packed format is 64-bit for performance reasons,
// we need to add the 8 sums, using the eight 8-bit chunks to address into psum

my_stripe += psum[ (x1 & 0xff)] +
psum[0x100+((x1 >> 8) & 0xff)] +
psum[0x200+((x1 >> 16) & 0xff)] +
psum[0x300+((x1 >> 24) & 0xff)] +
psum[0x400+((x1 >> 32) & 0xff)] +
psum[0x500+((x1 >> 40) & 0xff)] +
psum[0x600+((x1 >> 48) & 0xff)] +
psum[0x700+((x1 >> 56) )];
my_stripe_total += psum[ (o1 & 0xff)] +
psum[0x100+((o1 >> 8) & 0xff)] +
psum[0x200+((o1 >> 16) & 0xff)] +
psum[0x300+((o1 >> 24) & 0xff)] +
psum[0x400+((o1 >> 32) & 0xff)] +
psum[0x500+((o1 >> 40) & 0xff)] +
psum[0x600+((o1 >> 48) & 0xff)] +
psum[0x700+((o1 >> 56) )];
}
}

if (did_update) {
dm_stripe[k] += my_stripe;
dm_stripe_total[k] += my_stripe_total;
}

Unweighted1<TFloat>(
dm_stripes_buf,dm_stripes_total_buf,
#ifndef _OPENACC
zcheck,
#endif
sums, embedded_proportions,
filled_embs_els_round,idx, n_samples_r,
k, l1);
}

}