diff --git a/lib/clover_deriv_quda.cu b/lib/clover_deriv_quda.cu index 34ef0b993b..24615f84ce 100644 --- a/lib/clover_deriv_quda.cu +++ b/lib/clover_deriv_quda.cu @@ -12,6 +12,7 @@ namespace quda { double coeff; int parity; unsigned int minThreads() const { return gauge.LocalVolumeCB(); } + unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array public: DerivativeClover(GaugeField &force, GaugeField &gauge, GaugeField &oprod, double coeff, int parity) : diff --git a/lib/coarse_op.cuh b/lib/coarse_op.cuh index 564d1ed55c..78d5cd716e 100644 --- a/lib/coarse_op.cuh +++ b/lib/coarse_op.cuh @@ -245,7 +245,7 @@ namespace quda { unsigned int sharedBytesPerBlock(const TuneParam &param) const override { - if (type == COMPUTE_VUV || type == COMPUTE_VLV) + if (arg.shared_atomic && (type == COMPUTE_VUV || type == COMPUTE_VLV)) return 4*sizeof(storeType)*arg.max_color_height_per_block*arg.max_color_width_per_block*4*coarseSpin*coarseSpin; return TunableKernel3D::sharedBytesPerBlock(param); } @@ -577,9 +577,7 @@ namespace quda { if (type == COMPUTE_VUV || type == COMPUTE_VLV || type == COMPUTE_CONVERT || type == COMPUTE_RESCALE) arg.dim_index = 4*(dir==QUDA_BACKWARDS ?
0 : 1) + dim; arg.kd_dagger = kd_dagger; - if (type == COMPUTE_VUV || type == COMPUTE_VLV) tp.shared_bytes -= sharedBytesPerBlock(tp); // shared memory is static so don't include it in launch Launch(arg, tp, type, stream); - if (type == COMPUTE_VUV || type == COMPUTE_VLV) tp.shared_bytes += sharedBytesPerBlock(tp); // restore shared memory }; /** diff --git a/lib/dslash5_domain_wall.cu b/lib/dslash5_domain_wall.cu index 4dea657a25..f16b6cd7ab 100644 --- a/lib/dslash5_domain_wall.cu +++ b/lib/dslash5_domain_wall.cu @@ -68,7 +68,9 @@ namespace quda int blockMin() const { return 4; } unsigned int sharedBytesPerThread() const { - if (mobius_m5::shared()) { + if (mobius_m5::shared() + && (type == Dslash5Type::M5_INV_DWF || type == Dslash5Type::M5_INV_MOBIUS + || type == Dslash5Type::M5_INV_ZMOBIUS)) { // spin components in shared depend on inversion algorithm bool isInv = type == Dslash5Type::M5_INV_DWF || type == Dslash5Type::M5_INV_MOBIUS || type == Dslash5Type::M5_INV_ZMOBIUS; int nSpin = (!isInv || mobius_m5::var_inverse()) ? mobius_m5::use_half_vector() ? in.Nspin() / 2 : in.Nspin() : in.Nspin(); @@ -81,7 +83,9 @@ namespace quda // overloaded to return max dynamic shared memory if doing shared-memory inverse unsigned int maxSharedBytesPerBlock() const { - if (mobius_m5::shared()) { + if (mobius_m5::shared() + && (type == Dslash5Type::M5_INV_DWF || type == Dslash5Type::M5_INV_MOBIUS + || type == Dslash5Type::M5_INV_ZMOBIUS)) { return maxDynamicSharedBytesPerBlock(); } else { return TunableKernel3D::maxSharedBytesPerBlock(); @@ -104,7 +108,9 @@ namespace quda xpay(a == 0.0 ? 
false : true), type(type) { - if (mobius_m5::shared()) { + if (mobius_m5::shared() + && (type == Dslash5Type::M5_INV_DWF || type == Dslash5Type::M5_INV_MOBIUS + || type == Dslash5Type::M5_INV_ZMOBIUS)) { TunableKernel2D_base::resizeStep(in.X(4)); // Ls must be contained in the block } diff --git a/lib/gauge_ape.cu b/lib/gauge_ape.cu index 248b7d1d6c..a2c8a92dd8 100644 --- a/lib/gauge_ape.cu +++ b/lib/gauge_ape.cu @@ -13,6 +13,7 @@ namespace quda { const GaugeField &in; const Float alpha; unsigned int minThreads() const { return in.LocalVolumeCB(); } + unsigned int sharedBytesPerThread() const { return 4 * sizeof(int); } // for thread_array public: // (2,3): 2 for parity in the y thread dim, 3 corresponds to mapping direction to the z thread dim diff --git a/lib/gauge_stout.cu b/lib/gauge_stout.cu index c7f256f2ee..48d7e638e8 100644 --- a/lib/gauge_stout.cu +++ b/lib/gauge_stout.cu @@ -18,8 +18,9 @@ namespace quda { unsigned int maxSharedBytesPerBlock() const { return maxDynamicSharedBytesPerBlock(); } unsigned int sharedBytesPerThread() const { - // use SharedMemoryCache if using over improvement for two link fields - return improved ? 2 * in.Ncolor() * in.Ncolor() * 2 * sizeof(typename mapper::type) : 0; + // use ThreadLocalCache if using over improvement for two link fields + return (improved ? 2 * in.Ncolor() * in.Ncolor() * 2 * sizeof(typename mapper::type) : 0) + + 4 * sizeof(int); // for thread_array } public: diff --git a/lib/gauge_wilson_flow.cu b/lib/gauge_wilson_flow.cu index 0c947e4e95..78456b665f 100644 --- a/lib/gauge_wilson_flow.cu +++ b/lib/gauge_wilson_flow.cu @@ -24,8 +24,9 @@ namespace quda { unsigned int sharedBytesPerThread() const { - // use SharedMemoryCache if using Symanzik improvement for two Link fields - return 4*sizeof(int) + (wflow_type == QUDA_GAUGE_SMEAR_SYMANZIK_FLOW ?
2 * in.Ncolor() * in.Ncolor() * 2 * sizeof(typename mapper::type) : 0); + // use ThreadLocalCache if using Symanzik improvement for two Link fields + return (wflow_type == QUDA_GAUGE_SMEAR_SYMANZIK_FLOW ? 2 * in.Ncolor() * in.Ncolor() * 2 * sizeof(typename mapper::type) : 0) + + 4 * sizeof(int); // for thread_array } public: