diff --git a/src/Makefile b/src/Makefile
index 7384045..e923607 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -72,9 +72,9 @@ ifneq (,$(findstring pgi,$(COMPILER)))
   endif
 else
   ifeq ($(PERFORMING_CONDA_BUILD),True)
-    CPPFLAGS += -mtune=generic
+    CPPFLAGS += -mtune=generic -mavx
   else
-    CPPFLAGS += -mfma -march=native
+    CPPFLAGS += -mavx -mfma -march=native
   endif
 endif
 
@@ -86,7 +86,7 @@ endif
 
 BLASLIB=-llapacke -lcblas
 
-CPPFLAGS += -Wall -std=c++11 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
+CPPFLAGS += -Wall -std=c++14 -pedantic -I. $(OPT) -fPIC -L$(PREFIX)/lib
 
 all: api main install
diff --git a/src/unifrac_task.cpp b/src/unifrac_task.cpp
index 9f7f0cd..627d3b5 100644
--- a/src/unifrac_task.cpp
+++ b/src/unifrac_task.cpp
@@ -2,6 +2,22 @@
 #include "unifrac_task.hpp"
 #include <cstdlib>
 
+#ifndef _OPENACC
+// popcnt returns the number of bits set to 1 when fast HW support is available;
+// otherwise it returns 0 when v is 0 and the max bit count when it is not.
+// Note: any user of popcnt in this file must be able to cope with these modified semantics
+
+#if __AVX__
+#include <immintrin.h>
+static inline int32_t popcnt_u32(uint32_t v) {return _mm_popcnt_u32(v);}
+static inline int64_t popcnt_u64(uint64_t v) {return _mm_popcnt_u64(v);}
+#else
+static inline int32_t popcnt_u32(uint32_t v) {return (v==0) ? 0 : 32;}
+static inline int64_t popcnt_u64(uint64_t v) {return (v==0) ? 0 : 64;}
+// For future non-x86 ports, see https://barakmich.dev/posts/popcnt-arm64-go-asm/
+// and https://stackoverflow.com/questions/38113284/whats-the-difference-between-builtin-popcountll-and-mm-popcnt-u64
+#endif
+#endif
 
 // check for zero values and pre-compute single column sums
 template<class TFloat>
@@ -249,12 +265,16 @@ static inline void UnnormalizedWeighted4(
                       const uint64_t ls) {
    const uint32_t z_k = ((const uint32_t *)(zcheck+ks))[0];
    const uint32_t z_l = ((const uint32_t *)(zcheck+ls))[0];
-   const bool allzero_k = z_k==0x01010101;
-   const bool allzero_l1 = z_l==0x01010101;
+   const bool allzero_k = z_k==0x01010101;
+   const bool allzero_l = z_l==0x01010101;
 
-   if (allzero_k && allzero_l1) {
+   // popcnt is cheap but has large latency, so compute speculatively/early
+   const int32_t cnt_k = popcnt_u32(z_k);
+   const int32_t cnt_l = popcnt_u32(z_l);
+
+   if (allzero_k && allzero_l) {
      // nothing to do, would have to add 0
-   } else if (allzero_k || allzero_l1) {
+   } else if (allzero_k || allzero_l) {
      TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
      //TFloat *dm_stripe = dm_stripes[stripe];
 
@@ -281,8 +301,8 @@ static inline void UnnormalizedWeighted4(
        dm_stripe[ks+2] += sum_k2;
        dm_stripe[ks+3] += sum_k3;
      }
-   } else if ((z_k==0) && (z_l==0)) {
-     // they are all nonzero, so use the vectorized expensive path
+   } else if ((cnt_k<3) && (cnt_l<3)) {
+     // enough of the elements are nonzero, may as well use the vectorized version
      TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
      //TFloat *dm_stripe = dm_stripes[stripe];
 
@@ -291,7 +311,7 @@ static inline void UnnormalizedWeighted4(
                          filled_embs, n_samples_r,
                          ks, ls);
    } else {
-     // both sides partially non zero, try smaller vect size
+     // only a few elements have both sides nonzero, use the fine-grained compute
      for (uint64_t i=0; i<4; i++) {
        UnnormalizedWeighted1<TFloat>(
                          dm_stripes_buf,
@@ -299,7 +319,7 @@ static inline void UnnormalizedWeighted4(
                          filled_embs,idx, n_samples_r,
                          ks+i, ls+i);
      }
-   } // (allzero_k && allzero_l1)
+   } // (allzero_k && allzero_l)
 }
 
 template<class TFloat>
@@ -316,12 +336,16 @@ static inline void UnnormalizedWeighted8(
                       const uint64_t ls) {
    const uint64_t z_k = ((const uint64_t *)(zcheck+ks))[0];
    const uint64_t z_l = ((const uint64_t *)(zcheck+ls))[0];
-   const bool allzero_k = z_k==0x0101010101010101;
-   const bool allzero_l1 = z_l==0x0101010101010101;
+   const bool allzero_k = z_k==0x0101010101010101;
+   const bool allzero_l = z_l==0x0101010101010101;
 
-   if (allzero_k && allzero_l1) {
+   // popcnt is cheap but has large latency, so compute speculatively/early
+   const int64_t cnt_k = popcnt_u64(z_k);
+   const int64_t cnt_l = popcnt_u64(z_l);
+
+   if (allzero_k && allzero_l) {
      // nothing to do, would have to add 0
-   } else if (allzero_k || allzero_l1) {
+   } else if (allzero_k || allzero_l) {
      TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
      //TFloat *dm_stripe = dm_stripes[stripe];
 
@@ -364,8 +388,8 @@ static inline void UnnormalizedWeighted8(
        dm_stripe[ks+6] += sum_k6;
        dm_stripe[ks+7] += sum_k7;
      }
-   } else if ((z_k==0) && (z_l==0)) {
-     // they are all nonzero, so use the vectorized expensive path
+   } else if ((cnt_k<6) && (cnt_l<6)) {
+     // enough of the elements are nonzero, may as well use the vectorized version
      TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
      //TFloat *dm_stripe = dm_stripes[stripe];
 
@@ -374,18 +398,15 @@ static inline void UnnormalizedWeighted8(
                          filled_embs, n_samples_r,
                          ks, ls);
    } else {
-     // both sides partially non zero, try smaller vect size
-     UnnormalizedWeighted4<TFloat>(
-                        dm_stripes_buf,
-                        zcheck, sums, embedded_proportions, lengths,
-                        filled_embs,idx, n_samples_r,
-                        ks, ls);
-     UnnormalizedWeighted4<TFloat>(
+     // only a few elements have both sides nonzero, use the fine-grained compute
+     for (uint64_t i=0; i<8; i++) {
+       UnnormalizedWeighted1<TFloat>(
                          dm_stripes_buf,
                          zcheck, sums, embedded_proportions, lengths,
                          filled_embs,idx, n_samples_r,
-                        ks+4, ls+4);
-   } // (allzero_k && allzero_l1)
+                         ks+i, ls+i);
+     }
+   } // (allzero_k && allzero_l)
 }
 
 #endif
@@ -440,62 +461,39 @@ void SUCMP_NM::UnifracUnnormalizedWeightedTask<TFloat>::_run(unsigned int filled
 #else
     // SIMD-based CPUs need help with vectorization
     const uint64_t idx = (stripe-start_idx) * n_samples_r;
-    uint64_t ik = 0;
-
-    while( ik < (step_size-7) ) {
-       const uint64_t ks = sk*step_size + ik;
-       const uint64_t ke = ks+7;
-
-       if (ke>=n_samples) break; // past the limit
-
-       const uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
-       const uint64_t le = (ke + stripe + 1)%n_samples; // wraparound
-
-       if ((le-ls)!=7) break; //nor contiguous, use serial version
-       ik+=8;
+    uint64_t ks = sk*step_size;
+    const uint64_t kmax = std::min(ks+step_size,n_samples);
+    uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
+
+    while( ((ks+8) <= kmax) && ((n_samples-ls)>=8) ) {
       UnnormalizedWeighted8<TFloat>(
                          dm_stripes_buf,
                          zcheck, sums, embedded_proportions, lengths,
                          filled_embs,idx, n_samples_r,
                          ks, ls);
-    } // for ik
-
-    while( ik < (step_size-3) ) {
-       const uint64_t ks = sk*step_size + ik;
-       const uint64_t ke = ks+3;
-
-       if (ke>=n_samples) break; // past the limit
-
-       const uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
-       const uint64_t le = (ke + stripe + 1)%n_samples; // wraparound
-
-       if ((le-ls)!=3) break; //nor contiguous, use serial version
-       ik+=4;
+      ks+=8;
+      ls = (ls + 8)%n_samples; // wraparound
+    } // for ks+=8
+
+    while( ((ks+4) <= kmax) && ((n_samples-ls)>=4) ) {
       UnnormalizedWeighted4<TFloat>(
                          dm_stripes_buf,
                          zcheck, sums, embedded_proportions, lengths,
                          filled_embs,idx, n_samples_r,
                          ks, ls);
-    } // for ik
+      ks+=4;
+      ls = (ls + 4)%n_samples; // wraparound
+    } // for ks+=4
 
     // deal with any leftovers in serial way
-    while( ik < step_size ) {
-       const uint64_t k = sk*step_size + ik;
-
-       if (k>=n_samples) break; // past the limit
-       ik++;
-
-       const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
-
+    for( ; ks < kmax; ks++ ) {
       UnnormalizedWeighted1<TFloat>(
                          dm_stripes_buf,
                          zcheck, sums, embedded_proportions, lengths,
                          filled_embs,idx, n_samples_r,
-                         k, l1);
-
-    } // for ik
+                         ks, ls);
+      ls = (ls + 1)%n_samples; // wraparound
+    } // for ks
 #endif
   } // for stripe
 } // for sk
@@ -642,10 +640,10 @@ static inline void NormalizedWeighted4(
                       const uint64_t ls) {
    const uint32_t z_k = ((const uint32_t *)(zcheck+ks))[0];
    const uint32_t z_l = ((const uint32_t *)(zcheck+ls))[0];
-   const bool allzero_k = z_k==0x01010101;
-   const bool allzero_l1 = z_l==0x01010101;
+   const bool allzero_k = z_k==0x01010101;
+   const bool allzero_l = z_l==0x01010101;
 
-   if (allzero_k && allzero_l1) {
+   if (allzero_k && allzero_l) {
      // nothing to do, would have to add 0
    } else {
      const TFloat sum_k0 = sums[ks];
@@ -657,19 +655,13 @@ static inline void NormalizedWeighted4(
      const TFloat sum_l2 = sums[ls+2];
      const TFloat sum_l3 = sums[ls+3];
 
-     // the totals can always use the distributed property
-     {
-       TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
-       //TFloat *dm_stripe_total = dm_stripes_total[stripe];
-       const TFloat sum_kl0 = sum_k0 + sum_l0;
-       const TFloat sum_kl1 = sum_k1 + sum_l1;
-       const TFloat sum_kl2 = sum_k2 + sum_l2;
-       const TFloat sum_kl3 = sum_k3 + sum_l3;
-       dm_stripe_total[ks] += sum_kl0;
-       dm_stripe_total[ks+1] += sum_kl1;
-       dm_stripe_total[ks+2] += sum_kl2;
-       dm_stripe_total[ks+3] += sum_kl3;
-     }
+     const TFloat sum_kl0 = sum_k0 + sum_l0;
+     const TFloat sum_kl1 = sum_k1 + sum_l1;
+     const TFloat sum_kl2 = sum_k2 + sum_l2;
+     const TFloat sum_kl3 = sum_k3 + sum_l3;
+
+     int32_t cnt_k = 0;
+     int32_t cnt_l = 0;
 
      TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
      //TFloat *dm_stripe = dm_stripes[stripe];
@@ -681,31 +673,52 @@ static inline void NormalizedWeighted4(
        dm_stripe[ks+1] += sum_l1;
        dm_stripe[ks+2] += sum_l2;
        dm_stripe[ks+3] += sum_l3;
-     } else if (allzero_l1) {
+     } else if (allzero_l) {
        // one side has all zeros, use distributed property
        // if (nonzero_k) ridx=k // fabs(k-l1), with l1==0
        dm_stripe[ks] += sum_k0;
        dm_stripe[ks+1] += sum_k1;
        dm_stripe[ks+2] += sum_k2;
        dm_stripe[ks+3] += sum_k3;
-     } else if ((z_k==0) && (z_l==0)) {
-       // they are all nonzero, so use the vectorized expensive path
+     } else {
+       // popcnt is cheap but has large latency, so compute early
+       cnt_k = popcnt_u32(z_k);
+       cnt_l = popcnt_u32(z_l);
+     }
+
+     // the totals can always use the distributed property
+     {
+       TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+       //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+       dm_stripe_total[ks] += sum_kl0;
+       dm_stripe_total[ks+1] += sum_kl1;
+       dm_stripe_total[ks+2] += sum_kl2;
+       dm_stripe_total[ks+3] += sum_kl3;
+     }
+
+     if (allzero_k||allzero_l) {
+       // already done
+     } else if ((cnt_k<3) && (cnt_l<3)) {
+       // enough of the elements are nonzero, may as well use the vectorized version
+       TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+       //TFloat *dm_stripe = dm_stripes[stripe];
+
+       WeightedVal4<TFloat>(dm_stripe, embedded_proportions, lengths, filled_embs, n_samples_r, ks, ls);
      } else {
-       // both sides partially non zero, try smaller vect size
-       // Use UnnormalizedWeighted since we already computed dm_stripe_total
-       for (uint64_t i=0; i<4; i++) {
-         UnnormalizedWeighted1<TFloat>(
+       // only a few elements have both sides nonzero, use the fine-grained compute
+       // Use UnnormalizedWeighted since we already computed dm_stripe_total
+       for (uint64_t i=0; i<4; i++) {
+         UnnormalizedWeighted1<TFloat>(
                          dm_stripes_buf,
                          zcheck, sums, embedded_proportions, lengths,
                          filled_embs,idx, n_samples_r,
                          ks+i, ls+i);
-       }
+       }
      }
-   } // (allzero_k && allzero_l1)
+   } // (allzero_k && allzero_l)
 }
 
 template<class TFloat>
@@ -723,10 +736,10 @@ static inline void NormalizedWeighted8(
                       const uint64_t ls) {
    const uint64_t z_k = ((const uint64_t *)(zcheck+ks))[0];
    const uint64_t z_l = ((const uint64_t *)(zcheck+ls))[0];
-   const bool allzero_k = z_k==0x0101010101010101;
-   const bool allzero_l1 = z_l==0x0101010101010101;
+   const bool allzero_k = z_k==0x0101010101010101;
+   const bool allzero_l = z_l==0x0101010101010101;
 
-   if (allzero_k && allzero_l1) {
+   if (allzero_k && allzero_l) {
      // nothing to do, would have to add 0
    } else {
      const TFloat sum_k0 = sums[ks];
@@ -746,27 +759,17 @@ static inline void NormalizedWeighted8(
      const TFloat sum_l6 = sums[ls+6];
      const TFloat sum_l7 = sums[ls+7];
 
-     // the totals can always use the distributed property
-     {
-       TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
-       //TFloat *dm_stripe_total = dm_stripes_total[stripe];
-       const TFloat sum_kl0 = sum_k0 + sum_l0;
-       const TFloat sum_kl1 = sum_k1 + sum_l1;
-       const TFloat sum_kl2 = sum_k2 + sum_l2;
-       const TFloat sum_kl3 = sum_k3 + sum_l3;
-       const TFloat sum_kl4 = sum_k4 + sum_l4;
-       const TFloat sum_kl5 = sum_k5 + sum_l5;
-       const TFloat sum_kl6 = sum_k6 + sum_l6;
-       const TFloat sum_kl7 = sum_k7 + sum_l7;
-       dm_stripe_total[ks] += sum_kl0;
-       dm_stripe_total[ks+1] += sum_kl1;
-       dm_stripe_total[ks+2] += sum_kl2;
-       dm_stripe_total[ks+3] += sum_kl3;
-       dm_stripe_total[ks+4] += sum_kl4;
-       dm_stripe_total[ks+5] += sum_kl5;
-       dm_stripe_total[ks+6] += sum_kl6;
-       dm_stripe_total[ks+7] += sum_kl7;
-     }
+     const TFloat sum_kl0 = sum_k0 + sum_l0;
+     const TFloat sum_kl1 = sum_k1 + sum_l1;
+     const TFloat sum_kl2 = sum_k2 + sum_l2;
+     const TFloat sum_kl3 = sum_k3 + sum_l3;
+     const TFloat sum_kl4 = sum_k4 + sum_l4;
+     const TFloat sum_kl5 = sum_k5 + sum_l5;
+     const TFloat sum_kl6 = sum_k6 + sum_l6;
+     const TFloat sum_kl7 = sum_k7 + sum_l7;
+
+     int32_t cnt_k = 0;
+     int32_t cnt_l = 0;
 
      TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
      //TFloat *dm_stripe = dm_stripes[stripe];
@@ -782,7 +785,7 @@ static inline void NormalizedWeighted8(
        dm_stripe[ks+5] += sum_l5;
        dm_stripe[ks+6] += sum_l6;
        dm_stripe[ks+7] += sum_l7;
-     } else if (allzero_l1) {
+     } else if (allzero_l) {
        // one side has all zeros, use distributed property
        // if (nonzero_k) ridx=k // fabs(k-l1), with l1==0
        dm_stripe[ks] += sum_k0;
@@ -793,27 +796,49 @@ static inline void NormalizedWeighted8(
        dm_stripe[ks+5] += sum_k5;
        dm_stripe[ks+6] += sum_k6;
        dm_stripe[ks+7] += sum_k7;
-     } else if ((z_k==0) && (z_l==0)) {
-       // they are all nonzero, so use the vectorized expensive path
+     } else {
+       // popcnt is cheap but has large latency, so compute early
+       cnt_k = popcnt_u64(z_k);
+       cnt_l = popcnt_u64(z_l);
+     }
+
+     // the totals can always use the distributed property
+     {
+       TFloat * const __restrict__ dm_stripe_total = dm_stripes_total_buf+idx;
+       //TFloat *dm_stripe_total = dm_stripes_total[stripe];
+       dm_stripe_total[ks] += sum_kl0;
+       dm_stripe_total[ks+1] += sum_kl1;
+       dm_stripe_total[ks+2] += sum_kl2;
+       dm_stripe_total[ks+3] += sum_kl3;
+       dm_stripe_total[ks+4] += sum_kl4;
+       dm_stripe_total[ks+5] += sum_kl5;
+       dm_stripe_total[ks+6] += sum_kl6;
+       dm_stripe_total[ks+7] += sum_kl7;
+     }
+
+     if (allzero_k||allzero_l) {
+       // already done
+     } else if ((cnt_k<6) && (cnt_l<6)) {
+       // enough of the elements are nonzero, may as well use the vectorized version
+       TFloat * const __restrict__ dm_stripe = dm_stripes_buf+idx;
+       //TFloat *dm_stripe = dm_stripes[stripe];
+
+       WeightedVal8<TFloat>(dm_stripe, embedded_proportions, lengths, filled_embs, n_samples_r, ks, ls);
      } else {
-       // both sides partially non zero, try smaller vect size
-       // Use UnnormalizedWeighted since we already computed dm_stripe_total
-       UnnormalizedWeighted4<TFloat>(
-                          dm_stripes_buf,
-                          zcheck, sums, embedded_proportions, lengths,
-                          filled_embs,idx, n_samples_r,
-                          ks, ls);
-       UnnormalizedWeighted4<TFloat>(
+       // only a few elements have both sides nonzero, use the fine-grained compute
+       // Use UnnormalizedWeighted since we already computed dm_stripe_total
+       for (uint64_t i=0; i<8; i++) {
+         UnnormalizedWeighted1<TFloat>(
                          dm_stripes_buf,
                          zcheck, sums, embedded_proportions, lengths,
                          filled_embs,idx, n_samples_r,
-                          ks+4, ls+4);
+                         ks+i, ls+i);
+       }
      }
-   } // (allzero_k && allzero_l1)
+   } // (allzero_k && allzero_l)
 }
 
 #endif
@@ -869,60 +894,39 @@ void SUCMP_NM::UnifracNormalizedWeightedTask<TFloat>::_run(unsigned int filled_e
 #else
     // SIMD-based CPUs need help with vectorization
    const uint64_t idx = (stripe-start_idx) * n_samples_r;
-    uint64_t ik = 0;
-
-    while( ik < (step_size-7) ) {
-       const uint64_t ks = sk*step_size + ik;
-       const uint64_t ke = ks+7;
-
-       if (ke>=n_samples) break; // past the limit
-
-       const uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
-       const uint64_t le = (ke + stripe + 1)%n_samples; // wraparound
-
-       if ((le-ls)!=7) break; //nor contiguous, use serial version
-       ik+=8;
+    uint64_t ks = sk*step_size;
+    const uint64_t kmax = std::min(ks+step_size,n_samples);
+    uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
+
+    while( ((ks+8) <= kmax) && ((n_samples-ls)>=8) ) {
       NormalizedWeighted8<TFloat>(
                          dm_stripes_buf,dm_stripes_total_buf,
                          zcheck, sums, embedded_proportions, lengths,
                          filled_embs,idx, n_samples_r,
                          ks, ls);
-    } // for ik
-
-    while( ik < (step_size-3) ) {
-       const uint64_t ks = sk*step_size + ik;
-       const uint64_t ke = ks+3;
-
-       if (ke>=n_samples) break; // past the limit
-
-       const uint64_t ls = (ks + stripe + 1)%n_samples; // wraparound
-       const uint64_t le = (ke + stripe + 1)%n_samples; // wraparound
-
-       if ((le-ls)!=3) break; //nor contiguous, use serial version
-       ik+=4;
+      ks+=8;
+      ls = (ls + 8)%n_samples; // wraparound
+    } // for ks+=8
+
+    while( ((ks+4) <= kmax) && ((n_samples-ls)>=4) ) {
       NormalizedWeighted4<TFloat>(
                          dm_stripes_buf,dm_stripes_total_buf,
                          zcheck, sums, embedded_proportions, lengths,
                          filled_embs,idx, n_samples_r,
                          ks, ls);
-    } // for ik
-    // deal with any leftovers in serial way
-    while( ik < step_size ) {
-       const uint64_t k = sk*step_size + ik;
-
-       if (k>=n_samples) break; // past the limit
-       ik++;
-
-       const uint64_t l1 = (k + stripe + 1)%n_samples; // wraparound
+      ks+=4;
+      ls = (ls + 4)%n_samples; // wraparound
+    } // for ks+=4
 
+    // deal with any leftovers in serial way
+    for( ; ks < kmax; ks++ ) {
       NormalizedWeighted1<TFloat>(
                          dm_stripes_buf,dm_stripes_total_buf,
                          zcheck, sums, embedded_proportions, lengths,
                          filled_embs,idx, n_samples_r,
-                         k, l1);
-    } // for ik
+                         ks, ls);
+      ls = (ls + 1)%n_samples; // wraparound
+    } // for ks
 #endif
   } // for stripe
 } // for sk
diff --git a/src/unifrac_task.hpp b/src/unifrac_task.hpp
index 74a6d9d..e068c55 100644
--- a/src/unifrac_task.hpp
+++ b/src/unifrac_task.hpp
@@ -297,9 +297,8 @@ namespace SUCMP_NM {
     template<class TFloat, class TEmb>
     class UnifracTask : public UnifracTaskBase<TFloat, TEmb> {
       protected:
-       // Use one cache line on CPU
-       // On GPU, shaing a cache line is actually a good thing
-       static const unsigned int step_size = 16*4/sizeof(TFloat);
+       // Use a moderate sized step, a few cache lines
+       static const unsigned int step_size = 64*4/sizeof(TFloat);
 
 #ifdef _OPENACC
       // Use as big vector size as we can, to maximize cache line reuse
@@ -403,6 +402,9 @@ namespace SUCMP_NM {
     template<class TFloat, class TEmb>
     class UnifracUnweightedTask : public UnifracTask<TFloat, TEmb> {
       public:
+       // use the smaller size for historical reasons
+       static const unsigned int step_size = 16*4/sizeof(TFloat);
+
        static const unsigned int RECOMMENDED_MAX_EMBS = UnifracTask<TFloat, TEmb>::RECOMMENDED_MAX_EMBS_BOOL;
 
        // Note: _max_emb MUST be multiple of 64
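
Reviewer sketch (not part of the patch): the zcheck dispatch in UnnormalizedWeighted8, read in
isolation. zcheck holds one flag byte per sample (1 means that sample's column is all zeros),
eight flags are loaded as a single uint64_t, and a population count picks the execution path.
The names choose_path and Path are invented for illustration, and the GCC/Clang builtin stands
in for the _mm_popcnt_u64 intrinsic the patch uses under __AVX__.

#include <cstdint>
#include <cstring>

// Portable popcount; the patch substitutes a 0-or-64 approximation
// when no fast HW popcount is available.
static inline int64_t popcnt64(uint64_t v) { return __builtin_popcountll(v); }

enum class Path { Skip, Distributed, Vectorized, FineGrained };

static Path choose_path(const uint8_t *zcheck, uint64_t ks, uint64_t ls) {
    uint64_t z_k, z_l;
    std::memcpy(&z_k, zcheck + ks, 8);  // the patch reads these via a uint64_t cast
    std::memcpy(&z_l, zcheck + ls, 8);
    const bool allzero_k = (z_k == 0x0101010101010101ULL);
    const bool allzero_l = (z_l == 0x0101010101010101ULL);
    if (allzero_k && allzero_l) return Path::Skip;        // would only add 0
    if (allzero_k || allzero_l) return Path::Distributed; // one side folds into precomputed sums
    // Fewer than 6 of 8 samples all-zero on both sides: the wide SIMD loop
    // wastes little work, so take it; otherwise per-sample code skips more.
    if ((popcnt64(z_k) < 6) && (popcnt64(z_l) < 6)) return Path::Vectorized;
    return Path::FineGrained;
}

The fallback popcnt semantics (0 when the flag word is 0, the max bit count otherwise) still
route correctly here: 0 selects the vectorized path and 64 the fine-grained one, which is why
the comment in the patch warns that callers must cope with the modified semantics.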
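
A similar standalone sketch of the restructured stripe walk in _run: ks advances through
[ks, kmax) in blocks of 8, then 4, then 1, with the wrapped partner index ls kept in lockstep
instead of being recomputed and re-validated per block as the old ik loops did. A wide block is
taken only while the l-side window stays contiguous, that is, does not cross the n_samples
wraparound boundary. walk_stripe and the body callbacks are hypothetical stand-ins for the
NormalizedWeighted8/4/1 calls.

#include <cstdint>

template<class Body8, class Body4, class Body1>
void walk_stripe(uint64_t ks, uint64_t kmax, uint64_t ls, uint64_t n_samples,
                 Body8 body8, Body4 body4, Body1 body1) {
    // 8-wide blocks while both the k range and the wrapped l range allow it
    while (((ks + 8) <= kmax) && ((n_samples - ls) >= 8)) {
        body8(ks, ls);
        ks += 8; ls = (ls + 8) % n_samples;
    }
    // then 4-wide blocks
    while (((ks + 4) <= kmax) && ((n_samples - ls) >= 4)) {
        body4(ks, ls);
        ks += 4; ls = (ls + 4) % n_samples;
    }
    // leftovers, one sample pair at a time; this is also the only loop
    // that can carry ls across the wraparound boundary
    for (; ks < kmax; ks++) {
        body1(ks, ls);
        ls = (ls + 1) % n_samples;
    }
}

As in the patch, once the scalar loop has carried ls across the boundary the walk stays scalar
for the rest of that step; this is a bounded cost, since the crossing happens at most once per
stripe.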