diff --git a/src/unifrac_task.cpp b/src/unifrac_task.cpp
index 627d3b5..e7c21b3 100644
--- a/src/unifrac_task.cpp
+++ b/src/unifrac_task.cpp
@@ -52,6 +52,34 @@ static inline void WeightedZerosAndSums(
     }
 }
 
+// check for zero values
+template<class T>
+static inline void WeightedZerosAsync(
+                      bool * const __restrict__ zcheck,
+                      const T * const __restrict__ embedded_proportions,
+                      const unsigned int filled_embs,
+                      const uint64_t n_samples,
+                      const uint64_t n_samples_r) {
+#ifdef _OPENACC
+#pragma acc parallel loop gang vector present(embedded_proportions,zcheck) async
+#else
+#pragma omp parallel for default(shared)
+#endif
+    for(uint64_t k=0; k<n_samples; k++) {
+            bool all_zeros=true;
+
+#pragma acc loop seq
+            for (uint64_t emb=0; emb<filled_embs; emb++) {
+                const uint64_t offset = n_samples_r * emb;
+
+                T u1 = embedded_proportions[offset + k];
+                all_zeros = all_zeros && (u1==T(0));
+            }
+
+            zcheck[k] = all_zeros;
+    }
+}
+
@@ -1165,6 +1193,254 @@ void SUCMP_NM::UnifracVawGeneralizedTask<TFloat>::_run(unsigned int filled_embs,
 #endif
 }
 
+// Single step in computing Unweighted Unifrac
+#ifdef _OPENACC
+template<class TFloat>
+static inline void Unweighted1(
+                      TFloat * const __restrict__ dm_stripes_buf,
+                      TFloat * const __restrict__ dm_stripes_total_buf,
+                      const TFloat * const __restrict__ sums,
+                      const uint64_t * const __restrict__ embedded_proportions,
+                      const unsigned int filled_embs_els_round,
+                      const uint64_t idx,
+                      const uint64_t n_samples_r,
+                      const uint64_t k,
+                      const uint64_t l1) {
+       TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+       TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+       //TFloat *dm_stripe = dm_stripes[stripe];
+       //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+
+       bool did_update = false;
+       TFloat my_stripe = 0.0;
+       TFloat my_stripe_total = 0.0;
+
+       for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
+           const uint64_t offset = n_samples_r * emb_el;
+           const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);
+
+           uint64_t u1 = embedded_proportions[offset + k];
+           uint64_t v1 = embedded_proportions[offset + l1];
+           uint64_t o1 = u1 | v1;
+           uint64_t x1 = u1 ^ v1;
+
+           // Use the pre-computed sums
+           // Each range of 8 lengths has already been pre-computed and stored in psum
+           // Since embedded_proportions packed format is in 64-bit format for performance reasons
+           //    we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+           if (o1!=0) {  // zeros are prevalent
+               did_update=true;
+               my_stripe_total += psum[      (uint8_t)(o1)       ] +
+                                  psum[0x100+((uint8_t)(o1 >> 8))] +
+                                  psum[0x200+((uint8_t)(o1 >> 16))] +
+                                  psum[0x300+((uint8_t)(o1 >> 24))] +
+                                  psum[0x400+((uint8_t)(o1 >> 32))] +
+                                  psum[0x500+((uint8_t)(o1 >> 40))] +
+                                  psum[0x600+((uint8_t)(o1 >> 48))] +
+                                  psum[0x700+((uint8_t)(o1 >> 56))];
+               my_stripe       += psum[      (uint8_t)(x1)       ] +
+                                  psum[0x100+((uint8_t)(x1 >> 8))] +
+                                  psum[0x200+((uint8_t)(x1 >> 16))] +
+                                  psum[0x300+((uint8_t)(x1 >> 24))] +
+                                  psum[0x400+((uint8_t)(x1 >> 32))] +
+                                  psum[0x500+((uint8_t)(x1 >> 40))] +
+                                  psum[0x600+((uint8_t)(x1 >> 48))] +
+                                  psum[0x700+((uint8_t)(x1 >> 56))];
+           }
+       }
+
+       if (did_update) {
+           dm_stripe[k] += my_stripe;
+           dm_stripe_total[k] += my_stripe_total;
+       }
+
+}
+#else
+template<class TFloat>
+static inline void Unweighted1(
+                      TFloat * const __restrict__ dm_stripes_buf,
+                      TFloat * const __restrict__ dm_stripes_total_buf,
+                      const bool * const __restrict__ zcheck,
+                      const TFloat * const __restrict__ sums,
+                      const uint64_t * const __restrict__ embedded_proportions,
+                      const unsigned int filled_embs_els_round,
+                      const uint64_t idx,
+                      const uint64_t n_samples_r,
+                      const uint64_t k,
+                      const uint64_t l1) {
+       const bool allzero_k = zcheck[k];
+       const bool allzero_l1 = zcheck[l1];
+
+       if (allzero_k && allzero_l1) {
+         // nothing to do, would have to add 0
+       } else {
+         TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+         TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+         //TFloat *dm_stripe = dm_stripes[stripe];
+         //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+
+         bool did_update = false;
+         TFloat my_stripe = 0.0;
+         TFloat my_stripe_total = 0.0;
+
+         if (allzero_k || allzero_l1) {
+           // with one side zero, | and ^ are no-ops
+           const uint64_t kl = (allzero_k) ? l1 : k; // only use the non-zero one
+           for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
+              const uint64_t offset = n_samples_r * emb_el;
+              const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);
+
+              uint64_t o1 = embedded_proportions[offset + kl];
+
+              if (o1==0) {
+                // nothing to do, would have to add 0
+              } else if ((o1&0xffffffff)==0) {
+                // only high part relevant
+                did_update=true;
+                //uint64_t x1 = u1 ^ v1;
+                // With one of the two being 0, xor is just the non-zero one
+
+                // Use the pre-computed sums
+                // Each range of 8 lengths has already been pre-computed and stored in psum
+                // Since embedded_proportions packed format is in 64-bit format for performance reasons
+                //    we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+                TFloat esum = psum[0x400+((uint8_t)(o1 >> 32))] +
+                              psum[0x500+((uint8_t)(o1 >> 40))] +
+                              psum[0x600+((uint8_t)(o1 >> 48))] +
+                              psum[0x700+((uint8_t)(o1 >> 56))];
+                my_stripe_total += esum;
+                my_stripe += esum;
+              } else if ((o1>>32)==0) {
+                // only low part relevant
+                did_update=true;
+                //uint64_t x1 = u1 ^ v1;
+                // With one of the two being 0, xor is just the non-zero one
+
+                // Use the pre-computed sums
+                // Each range of 8 lengths has already been pre-computed and stored in psum
+                // Since embedded_proportions packed format is in 64-bit format for performance reasons
+                //    we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+                TFloat esum = psum[      (uint8_t)(o1)       ] +
+                              psum[0x100+((uint8_t)(o1 >> 8))] +
+                              psum[0x200+((uint8_t)(o1 >> 16))] +
+                              psum[0x300+((uint8_t)(o1 >> 24))];
+                my_stripe_total += esum;
+                my_stripe += esum;
+              } else {
+                did_update=true;
+                //uint64_t x1 = u1 ^ v1;
+                // With one of the two being 0, xor is just the non-zero one
+
+                // Use the pre-computed sums
+                // Each range of 8 lengths has already been pre-computed and stored in psum
+                // Since embedded_proportions packed format is in 64-bit format for performance reasons
+                //    we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+                TFloat esum = psum[      (uint8_t)(o1)       ] +
+                              psum[0x100+((uint8_t)(o1 >> 8))] +
+                              psum[0x200+((uint8_t)(o1 >> 16))] +
+                              psum[0x300+((uint8_t)(o1 >> 24))] +
+                              psum[0x400+((uint8_t)(o1 >> 32))] +
+                              psum[0x500+((uint8_t)(o1 >> 40))] +
+                              psum[0x600+((uint8_t)(o1 >> 48))] +
+                              psum[0x700+((uint8_t)(o1 >> 56))];
+                my_stripe_total += esum;
+                my_stripe += esum;
+              }
+           }
+         } else {
+           // we need both sides
+           for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
+              const uint64_t offset = n_samples_r * emb_el;
+              const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);
+
+              uint64_t u1 = embedded_proportions[offset + k];
+              uint64_t v1 = embedded_proportions[offset + l1];
+              uint64_t o1 = u1 | v1;
+              uint64_t x1 = u1 ^ v1;
+
+              if (o1==0) {
+                // nothing to do, would have to add 0
+              } else if ((o1&0xffffffff)==0) {
+                // only high part relevant
+                did_update=true;
+
+                // Use the pre-computed sums
+                // Each range of 8 lengths has already been pre-computed and stored in psum
+                // Since embedded_proportions packed format is in 64-bit format for performance reasons
+                //    we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+                my_stripe_total += psum[0x400+((uint8_t)(o1 >> 32))] +
+                                   psum[0x500+((uint8_t)(o1 >> 40))] +
+                                   psum[0x600+((uint8_t)(o1 >> 48))] +
+                                   psum[0x700+((uint8_t)(o1 >> 56))];
+                my_stripe       += psum[0x400+((uint8_t)(x1 >> 32))] +
+                                   psum[0x500+((uint8_t)(x1 >> 40))] +
+                                   psum[0x600+((uint8_t)(x1 >> 48))] +
+                                   psum[0x700+((uint8_t)(x1 >> 56))];
+              } else if ((o1>>32)==0) {
+                // only low part relevant
+                did_update=true;
+
+                // Use the pre-computed sums
+                // Each range of 8 lengths has already been pre-computed and stored in psum
+                // Since embedded_proportions packed format is in 64-bit format for performance reasons
+                //    we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+                my_stripe_total += psum[      (uint8_t)(o1)       ] +
+                                   psum[0x100+((uint8_t)(o1 >> 8))] +
+                                   psum[0x200+((uint8_t)(o1 >> 16))] +
+                                   psum[0x300+((uint8_t)(o1 >> 24))];
+                my_stripe       += psum[      (uint8_t)(x1)       ] +
+                                   psum[0x100+((uint8_t)(x1 >> 8))] +
+                                   psum[0x200+((uint8_t)(x1 >> 16))] +
+                                   psum[0x300+((uint8_t)(x1 >> 24))];
+
+              } else {
+                did_update=true;
+
+                // Use the pre-computed sums
+                // Each range of 8 lengths has already been pre-computed and stored in psum
+                // Since embedded_proportions packed format is in 64-bit format for performance reasons
+                //    we need to add the 8 sums using the four 8-bits for addressing inside psum
+
+                my_stripe_total += psum[      (uint8_t)(o1)       ] +
+                                   psum[0x100+((uint8_t)(o1 >> 8))] +
+                                   psum[0x200+((uint8_t)(o1 >> 16))] +
+                                   psum[0x300+((uint8_t)(o1 >> 24))] +
+                                   psum[0x400+((uint8_t)(o1 >> 32))] +
+                                   psum[0x500+((uint8_t)(o1 >> 40))] +
+                                   psum[0x600+((uint8_t)(o1 >> 48))] +
+                                   psum[0x700+((uint8_t)(o1 >> 56))];
+                my_stripe       += psum[      (uint8_t)(x1)       ] +
+                                   psum[0x100+((uint8_t)(x1 >> 8))] +
+                                   psum[0x200+((uint8_t)(x1 >> 16))] +
+                                   psum[0x300+((uint8_t)(x1 >> 24))] +
+                                   psum[0x400+((uint8_t)(x1 >> 32))] +
+                                   psum[0x500+((uint8_t)(x1 >> 40))] +
+                                   psum[0x600+((uint8_t)(x1 >> 48))] +
+                                   psum[0x700+((uint8_t)(x1 >> 56))];
+              }
+           }
+         } // (allzero_k || allzero_l1)
+
+         if (did_update) {
+           dm_stripe[k] += my_stripe;
+           dm_stripe_total[k] += my_stripe_total;
+         }
+       } // (allzero_k && allzero_l1)
+
+}
+#endif
+
 template<class TFloat>
 void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, const TFloat * __restrict__ lengths) {
     const uint64_t start_idx = this->task_p->start;
@@ -1187,6 +1463,14 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
 
     const uint64_t filled_embs_els_round = (filled_embs+63)/64;
 
+#ifndef _OPENACC
+    // not effective for GPUs, but helps a lot on the CPUs
+    bool * const __restrict__ zcheck = this->zcheck;
+    // check for zero values
+    WeightedZerosAsync(zcheck,
+                       embedded_proportions,
+                       filled_embs_els_round, n_samples, n_samples_r);
+#endif
 
     // pre-compute sums of length elements, since they are likely to be accessed many times
     // We will use a 8-bit map, to keep it small enough to keep in L1 cache
@@ -1247,7 +1531,7 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
 #ifdef _OPENACC
 #pragma acc wait
     const unsigned int acc_vector_size = SUCMP_NM::UnifracUnweightedTask<TFloat>::acc_vector_size;
-#pragma acc parallel loop collapse(3) vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,sums) async
+#pragma acc parallel loop collapse(3) gang vector vector_length(acc_vector_size) present(embedded_proportions,dm_stripes_buf,dm_stripes_total_buf,sums) async
 #else
     // use dynamic scheduling due to non-homogeneity in the loop
 #pragma omp parallel for schedule(dynamic,1) default(shared)
@@ -1257,61 +1541,19 @@ void SUCMP_NM::UnifracUnweightedTask<TFloat>::_run(unsigned int filled_embs, con
       for(uint64_t ik = 0; ik < step_size ; ik++) {
        const uint64_t k = sk*step_size + ik;
        const uint64_t idx = (stripe-start_idx) * n_samples_r;
-       TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
-       TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
-       //TFloat *dm_stripe = dm_stripes[stripe];
-       //TFloat *dm_stripe_total = dm_stripes_total[stripe];
 
        if (k>=n_samples) continue; // past the limit
 
        const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
 
-       bool did_update = false;
-       TFloat my_stripe = 0.0;
-       TFloat my_stripe_total = 0.0;
-
-#pragma acc loop seq
-       for (uint64_t emb_el=0; emb_el<filled_embs_els_round; emb_el++) {
-           const uint64_t offset = n_samples_r * emb_el;
-           const TFloat * __restrict__ psum = &(sums[emb_el*0x800]);
-
-           uint64_t u1 = embedded_proportions[offset + k];
-           uint64_t v1 = embedded_proportions[offset + l1];
-           uint64_t x1 = u1 ^ v1;
-           uint64_t o1 = u1 | v1;
-
-           // Use the pre-computed sums
-           // Each range of 8 lengths has already been pre-computed and stored in psum
-           // Since embedded_proportions packed format is in 64-bit format for performance reasons
-           //    we need to add the 8 sums using the four 8-bits for addressing inside psum
-
-           if (o1!=0) {  // zeros are prevalent
-               did_update=true;
-               my_stripe       += psum[      (x1 & 0xff)] +
-                                  psum[0x100+((x1 >> 8) & 0xff)] +
-                                  psum[0x200+((x1 >> 16) & 0xff)] +
-                                  psum[0x300+((x1 >> 24) & 0xff)] +
-                                  psum[0x400+((x1 >> 32) & 0xff)] +
-                                  psum[0x500+((x1 >> 40) & 0xff)] +
-                                  psum[0x600+((x1 >> 48) & 0xff)] +
-                                  psum[0x700+((x1 >> 56)       )];
-               my_stripe_total += psum[      (o1 & 0xff)] +
-                                  psum[0x100+((o1 >> 8) & 0xff)] +
-                                  psum[0x200+((o1 >> 16) & 0xff)] +
-                                  psum[0x300+((o1 >> 24) & 0xff)] +
-                                  psum[0x400+((o1 >> 32) & 0xff)] +
-                                  psum[0x500+((o1 >> 40) & 0xff)] +
-                                  psum[0x600+((o1 >> 48) & 0xff)] +
-                                  psum[0x700+((o1 >> 56)       )];
-           }
-       }
-
-       if (did_update) {
-           dm_stripe[k] += my_stripe;
-           dm_stripe_total[k] += my_stripe_total;
-       }
-
+       Unweighted1(
+                       dm_stripes_buf,dm_stripes_total_buf,
+#ifndef _OPENACC
+                       zcheck,
+#endif
+                       sums, embedded_proportions,
+                       filled_embs_els_round,idx, n_samples_r,
+                       k, l1);
       }
     }
diff --git a/src/unifrac_task.hpp b/src/unifrac_task.hpp
index e068c55..e9b6b4f 100644
--- a/src/unifrac_task.hpp
+++ b/src/unifrac_task.hpp
@@ -411,6 +411,12 @@ namespace SUCMP_NM {
         UnifracUnweightedTask(std::vector<double*> &_dm_stripes, std::vector<double*> &_dm_stripes_total,
                     unsigned int _max_embs, const su::task_parameters* _task_p)
         : UnifracTask<TFloat,uint64_t>(_dm_stripes,_dm_stripes_total,_max_embs,_task_p) {
+#ifndef _OPENACC
+          // zcheck is not beneficial for GPUs, but helps a lot on the CPUs
+          const unsigned int n_samples = this->task_p->n_samples;
+          zcheck = NULL;
+          posix_memalign((void **)&zcheck, 4096, sizeof(bool) * n_samples);
+#endif
           const unsigned int bsize = _max_embs*(0x400/32);
           sums = NULL;
           posix_memalign((void **)&sums, 4096, sizeof(TFloat) * bsize);
@@ -420,17 +426,25 @@ namespace SUCMP_NM {
        virtual ~UnifracUnweightedTask() {
 #ifdef _OPENACC
-        const unsigned int bsize = this->max_embs*(0x400/32);
+          const unsigned int bsize = this->max_embs*(0x400/32);
 #pragma acc exit data delete(sums[:bsize])
 #endif
          free(sums);
+#ifndef _OPENACC
+         free(zcheck);
+#endif
        }
 
        virtual void run(unsigned int filled_embs, const TFloat * __restrict__ length) {_run(filled_embs, length);}
        void _run(unsigned int filled_embs, const TFloat * __restrict__ length);
 
       private:
-        TFloat *sums; // temp buffer
+        // temp buffers
+        TFloat *sums;
+#ifndef _OPENACC
+        // zcheck is not beneficial for GPUs, but helps a lot on the CPUs
+        bool *zcheck;
+#endif
    };
 
    template<class TFloat>
   class UnifracGeneralizedTask : public UnifracTask<TFloat,TFloat> {
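
The CPU path of this patch hinges on the zcheck pre-pass: flag each sample whose packed presence words are all zero once, then specialize the per-pair work on those flags. The sketch below is a minimal, standalone illustration of that idea and is not code from the repository; the names (zeros_check, pair_union_bits) and the popcount stand-in for the psum lookups are assumptions made for the example.

// Illustrative sketch only -- not part of unifrac_task.cpp.
// zeros_check() mirrors what WeightedZerosAsync computes; pair_union_bits()
// mirrors the skip / one-sided / two-sided split in the CPU Unweighted1.
#include <bitset>
#include <cstdint>
#include <cstdio>
#include <vector>

// One 64-bit presence word per (embedding block, sample),
// laid out like embedded_proportions[emb_el*n_samples + k].
static void zeros_check(std::vector<bool> &zcheck,
                        const std::vector<uint64_t> &embedded,
                        uint64_t n_embs, uint64_t n_samples) {
    for (uint64_t k = 0; k < n_samples; k++) {
        bool all_zeros = true;
        for (uint64_t e = 0; e < n_embs; e++)
            all_zeros = all_zeros && (embedded[e * n_samples + k] == 0);
        zcheck[k] = all_zeros;   // pay this cost once per sample
    }
}

static double pair_union_bits(const std::vector<uint64_t> &embedded,
                              const std::vector<bool> &zcheck,
                              uint64_t n_embs, uint64_t n_samples,
                              uint64_t k, uint64_t l1) {
    if (zcheck[k] && zcheck[l1]) return 0.0;   // both empty: skip entirely
    const bool one_sided = zcheck[k] || zcheck[l1];
    const uint64_t kl = zcheck[k] ? l1 : k;    // the non-zero side
    double bits = 0.0;
    for (uint64_t e = 0; e < n_embs; e++) {
        // with one side all-zero, | (and ^) reduce to reading the other side
        const uint64_t o = one_sided
            ? embedded[e * n_samples + kl]
            : (embedded[e * n_samples + k] | embedded[e * n_samples + l1]);
        bits += std::bitset<64>(o).count();    // stand-in for the psum lookups
    }
    return bits;
}

int main() {
    const uint64_t n_embs = 2, n_samples = 3;
    std::vector<uint64_t> embedded = {0x0F, 0, 0x11,   // block 0: samples 0,1,2
                                      0x01, 0, 0x10};  // block 1: samples 0,1,2
    std::vector<bool> zcheck(n_samples);
    zeros_check(zcheck, embedded, n_embs, n_samples);
    printf("pair(0,1)=%g pair(0,2)=%g pair(1,1)=%g\n",
           pair_union_bits(embedded, zcheck, n_embs, n_samples, 0, 1),
           pair_union_bits(embedded, zcheck, n_embs, n_samples, 0, 2),
           pair_union_bits(embedded, zcheck, n_embs, n_samples, 1, 1));
    return 0;
}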
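The other ingredient is the sums/psum buffer: for every 64-bit block of embedded rows, the branch lengths are pre-reduced into eight 256-entry tables, so the length total selected by any 64-bit mask costs eight table lookups instead of a loop over bits. This is again a standalone sketch under assumed names (build_psum, masked_length_sum); only the 0x800 block size and the 0x100..0x700 offsets come from the addressing used in Unweighted1 above.

// Illustrative sketch only -- not part of unifrac_task.cpp.
#include <cstdint>
#include <cstdio>
#include <vector>

// Build one 0x800-entry block: for byte position pos (0..7) of a 64-bit mask,
// entry [pos*0x100 + b] is the sum of lengths[pos*8 + i] over the bits i set in b.
static std::vector<double> build_psum(const double lengths[64]) {
    std::vector<double> psum(0x800, 0.0);
    for (unsigned pos = 0; pos < 8; pos++)
        for (unsigned b = 0; b < 0x100; b++) {
            double s = 0.0;
            for (unsigned i = 0; i < 8; i++)
                if (b & (1u << i)) s += lengths[pos * 8 + i];
            psum[pos * 0x100 + b] = s;
        }
    return psum;
}

// Sum the lengths selected by a 64-bit mask with 8 lookups, mirroring the
// psum[0x000..0x700] additions in Unweighted1.
static double masked_length_sum(const std::vector<double> &psum, uint64_t mask) {
    double s = 0.0;
    for (unsigned pos = 0; pos < 8; pos++)
        s += psum[pos * 0x100 + (uint8_t)(mask >> (8 * pos))];
    return s;
}

int main() {
    double lengths[64];
    for (int i = 0; i < 64; i++) lengths[i] = 0.01 * (i + 1);
    std::vector<double> psum = build_psum(lengths);

    uint64_t u = 0x00000000000000F1ULL;   // presence bits, sample k
    uint64_t v = 0x0000000100000011ULL;   // presence bits, sample l1
    // union feeds dm_stripe_total, xor feeds dm_stripe in the unweighted kernel
    printf("total=%g distinct=%g\n",
           masked_length_sum(psum, u | v),
           masked_length_sum(psum, u ^ v));
    return 0;
}

The one-sided branches in the patch additionally split the 64-bit word into its low and high 32-bit halves, so a mask confined to one half only pays four lookups.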