From ccb1c73d7b4aac3bb522016c354e296d67ebf0de Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Tue, 18 Jul 2023 14:36:17 -0700 Subject: [PATCH 01/38] Fix compiler warning with dbldble --- include/dbldbl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/dbldbl.h b/include/dbldbl.h index f4b198e373..e05aee104f 100644 --- a/include/dbldbl.h +++ b/include/dbldbl.h @@ -294,6 +294,7 @@ struct doubledouble { __device__ __host__ doubledouble& operator=(const double &head) { this->a.y = head; this->a.x = 0.0; + return *this; } __device__ doubledouble& operator+=(const doubledouble &a) { From 2049be6a70eee9f7d66807877ee45b124e3a4c76 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 19 Jul 2023 14:07:13 -0700 Subject: [PATCH 02/38] Add array copy assignment from one type of array to another --- include/array.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/array.h b/include/array.h index acb4ee4e0a..52b8f5aada 100644 --- a/include/array.h +++ b/include/array.h @@ -24,6 +24,12 @@ namespace quda array &operator=(const array &) = default; array &operator=(array &&) = default; + + template constexpr array &operator=(const array &other) + { + for (int i = 0; i < n; i++) data[i] = other[i]; + return *this; + } }; template std::ostream &operator<<(std::ostream &output, const array &a) From 81566c87347f9a7196d11d0c865ab1b18e3f1bc6 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 19 Jul 2023 14:13:47 -0700 Subject: [PATCH 03/38] Remove use of zero function and fix caxpyxmazMR functor for when the reduction type is changed --- include/kernels/blas_core.cuh | 8 ++++---- include/kernels/multi_blas_core.cuh | 4 ++-- lib/reduce_quda.cu | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/kernels/blas_core.cuh b/include/kernels/blas_core.cuh index 96f0c2b5f1..48aa166af7 100644 --- a/include/kernels/blas_core.cuh +++ b/include/kernels/blas_core.cuh @@ -369,19 +369,19 @@ namespace quda static constexpr memory_access<1, 1, 1> 
read{ }; static constexpr memory_access<1, 1> write{ }; complex a; - double3 *Ar3; + array *Ar3; bool init_; caxpyxmazMR_(const real &a, const real &, const real &) : a(a), - Ar3(static_cast(reducer::get_device_buffer())), + Ar3(static_cast*>(reducer::get_device_buffer())), init_(false) { ; } __device__ __host__ void init() { if (!init_) { - double3 result = *Ar3; - a = a.real() * complex((real)result.x, (real)result.y) * ((real)1.0 / (real)result.z); + auto result = *Ar3; + a = a.real() * complex((real)result[0], (real)result[1]) / (real)result[2]; init_ = true; } } diff --git a/include/kernels/multi_blas_core.cuh b/include/kernels/multi_blas_core.cuh index 449e1de12b..94558fb7f6 100644 --- a/include/kernels/multi_blas_core.cuh +++ b/include/kernels/multi_blas_core.cuh @@ -89,8 +89,8 @@ namespace quda if (arg.f.read.Y) arg.Y[k].load(y, idx, parity); if (arg.f.read.W) arg.W[k].load(w, idx, parity); } else { - y = ::quda::zero, Arg::n/2>(); - w = ::quda::zero, Arg::n/2>(); + y = {}; + w = {}; } #pragma unroll diff --git a/lib/reduce_quda.cu b/lib/reduce_quda.cu index 7b61c78b6e..f2316f20c9 100644 --- a/lib/reduce_quda.cu +++ b/lib/reduce_quda.cu @@ -146,7 +146,7 @@ namespace quda { auto instantiateReduce(Args &&... 
args) { using host_reduce_t = typename Functor::reduce_t; - host_reduce_t value = ::quda::zero(); + host_reduce_t value = {}; instantiate(args..., value); return value; } From ce5d3969a70b865f56298b73936ba225d9bb3086 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 19 Jul 2023 14:43:17 -0700 Subject: [PATCH 04/38] Make math_helper.cuh safe to include in non CUDA-aware compiler --- include/targets/cuda/math_helper.cuh | 53 +++++++++++++++++++++------- 1 file changed, 41 insertions(+), 12 deletions(-) diff --git a/include/targets/cuda/math_helper.cuh b/include/targets/cuda/math_helper.cuh index bdc333297a..af4ad2bbb5 100644 --- a/include/targets/cuda/math_helper.cuh +++ b/include/targets/cuda/math_helper.cuh @@ -3,7 +3,11 @@ #include #include -#if (CUDA_VERSION >= 11070) && !defined(_NVHPC_CUDA) +#if defined(__CUDACC__) || defined(_NVHPC_CUDA) || (defined(__clang__) && defined(__CUDA__)) +#define QUDA_CUDA_CC +#endif + +#if (CUDA_VERSION >= 11070) && defined(QUDA_CUDA_CC) && !defined(_NVHPC_CUDA) #define BUILTIN_ASSUME(x) \ bool p = x; \ __builtin_assume(p); @@ -34,6 +38,7 @@ namespace quda { template inline void operator()(const T& a, T *s, T *c) { ::sincos(a, s, c); } }; +#ifdef QUDA_CUDA_CC template <> struct sincos_impl { template __device__ inline void operator()(const T& a, T *s, T *c) { @@ -41,6 +46,7 @@ namespace quda { sincos(a, s, c); } }; +#endif /** * @brief Combined sin and cos calculation in QUDA NAMESPACE @@ -55,9 +61,11 @@ namespace quda { inline void operator()(const float& a, float *s, float *c) { ::sincosf(a, s, c); } }; +#ifdef QUDA_CUDA_CC template <> struct sincosf_impl { __device__ inline void operator()(const float& a, float *s, float *c) { __sincosf(a, s, c); } }; +#endif /** * @brief Combined sin and cos calculation in QUDA NAMESPACE @@ -75,10 +83,11 @@ namespace quda { template inline void operator()(const T& a, T *s, T *c) { ::sincos(a * static_cast(M_PI), s, c); } }; +#ifdef QUDA_CUDA_CC template <> struct sincospi_impl { template 
__device__ inline void operator()(const T& a, T *s, T *c) { sincospi(a, s, c); } }; - +#endif /** * @brief Combined sinpi and cospi calculation in QUDA NAMESPACE @@ -100,20 +109,25 @@ namespace quda { template<> inline __host__ __device__ void sincospi(const float& a, float *s, float *c) { quda::sincos(a * static_cast(M_PI), s, c); } + template struct sinpi_impl { + template inline T operator()(T a) { return ::sin(a * static_cast(M_PI)); } + }; +#ifdef QUDA_CUDA_CC + template <> struct sinpi_impl { template __device__ inline T operator()(T a) { return ::sinpi(a); } }; +#endif /** * @brief Sine pi calculation in QUDA NAMESPACE * @param a the angle * @return result of the sin(a * pi) */ - template inline __host__ __device__ T sinpi(T a) { return ::sinpi(a); } + template inline __host__ __device__ T sinpi(T a) { return target::dispatch(a); } + -#ifndef _NVHPC_CUDA - template struct sinpif_impl { inline float operator()(float a) { return ::sinpif(a); } }; -#else template struct sinpif_impl { inline float operator()(float a) { return ::sinf(a * static_cast(M_PI)); } }; -#endif +#ifdef QUDA_CUDA_CC template <> struct sinpif_impl { __device__ inline float operator()(float a) { return __sinf(a * static_cast(M_PI)); } }; +#endif /** * @brief Sine pi calculation in QUDA NAMESPACE. 
@@ -124,20 +138,27 @@ namespace quda { */ template<> inline __host__ __device__ float sinpi(float a) { return target::dispatch(a); } + template struct cospi_impl { + template inline T operator()(T a) { return ::cos(a * static_cast(M_PI)); } + }; +#ifdef QUDA_CUDA_CC + template <> struct cospi_impl { + template __device__ inline T operator()(T a) { return ::cospi(a); } + }; +#endif /** * @brief Cosine pi calculation in QUDA NAMESPACE * @param a the angle * @return result of the cos(a * pi) */ - template inline __host__ __device__ T cospi(T a) { return ::cospi(a); } + template inline __host__ __device__ T cospi(T a) { return target::dispatch(a); } + -#ifndef _NVHPC_CUDA - template struct cospif_impl { inline float operator()(float a) { return ::cospif(a); } }; -#else template struct cospif_impl { inline float operator()(float a) { return ::cosf(a * static_cast(M_PI)); } }; -#endif +#ifdef QUDA_CUDA_CC template <> struct cospif_impl { __device__ inline float operator()(float a) { return __cosf(a * static_cast(M_PI)); } }; +#endif /** * @brief Cosine pi calculation in QUDA NAMESPACE. @@ -153,9 +174,11 @@ namespace quda { template inline T operator()(T a) { return static_cast(1.0) / sqrt(a); } }; +#ifdef QUDA_CUDA_CC template <> struct rsqrt_impl { template __device__ inline T operator()(T a) { return ::rsqrt(a); } }; +#endif /** * @brief Reciprocal square root function (rsqrt) @@ -169,6 +192,7 @@ namespace quda { template struct fpow_impl { template inline real operator()(real a, int b) { return std::pow(a, b); } }; +#ifdef QUDA_CUDA_CC template <> struct fpow_impl { __device__ inline double operator()(double a, int b) { return ::pow(a, b); } @@ -179,6 +203,7 @@ namespace quda { return b & 1 ? 
sign * power : power; } }; +#endif /* @brief Fast power function that works for negative "a" argument @@ -189,7 +214,9 @@ namespace quda { template __device__ __host__ inline real fpow(real a, int b) { return target::dispatch(a, b); } template struct fdividef_impl { inline float operator()(float a, float b) { return a / b; } }; +#ifdef QUDA_CUDA_CC template <> struct fdividef_impl { __device__ inline float operator()(float a, float b) { return __fdividef(a, b); } }; +#endif /** @brief Optimized division routine on the device @@ -197,3 +224,5 @@ namespace quda { __device__ __host__ inline float fdividef(float a, float b) { return target::dispatch(a, b); } } + +#undef QUDA_CUDA_CC From 7a4e04fdb8356b106df2af37603c23e3a916c3d6 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 19 Jul 2023 16:31:26 -0700 Subject: [PATCH 05/38] Add doubledouble support for host, add complex-number support, remove legacy zero helper function --- include/complex_quda.h | 108 ++++++++---- include/dbldbl.h | 247 ++++++++++++++------------- include/float_vector.h | 82 ++------- include/reducer.h | 2 +- include/targets/cuda/math_helper.cuh | 45 +++++ 5 files changed, 266 insertions(+), 218 deletions(-) diff --git a/include/complex_quda.h b/include/complex_quda.h index 66b5eee609..c88694ea4e 100644 --- a/include/complex_quda.h +++ b/include/complex_quda.h @@ -26,6 +26,7 @@ #include #include #include // for double2 / float2 +#include "dbldbl.h" namespace quda { namespace gauge { @@ -427,10 +428,10 @@ struct complex return *this; } - __host__ __device__ inline ValueType real() const; - __host__ __device__ inline ValueType imag() const; - __host__ __device__ inline void real(ValueType); - __host__ __device__ inline void imag(ValueType); + constexpr ValueType real() const; + constexpr ValueType imag() const; + __host__ __device__ inline void real(ValueType); + __host__ __device__ inline void imag(ValueType); }; template <> struct complex : public float2 { @@ -617,25 +618,25 @@ struct complex 
: public short2 real(real() + z.real()); imag(imag() + z.imag()); return *this; - } - - __host__ __device__ inline complex &operator-=(const complex &z) - { - real(real()-z.real()); - imag(imag()-z.imag()); - return *this; - } + } - constexpr short real() const { return x; } - constexpr short imag() const { return y; } - __host__ __device__ inline void real(short re) { x = re; } - __host__ __device__ inline void imag(short im) { y = im; } + __host__ __device__ inline complex &operator-=(const complex &z) + { + real(real() - z.real()); + imag(imag() - z.imag()); + return *this; + } - // cast operators - template inline __host__ __device__ operator complex() const - { - return complex(static_cast(real()), static_cast(imag())); } + constexpr short real() const { return x; } + constexpr short imag() const { return y; } + __host__ __device__ inline void real(short re) { x = re; } + __host__ __device__ inline void imag(short im) { y = im; } + // cast operators + template inline __host__ __device__ operator complex() const + { + return complex(static_cast(real()), static_cast(imag())); + } }; template<> @@ -653,25 +654,62 @@ struct complex : public int2 real(real() + z.real()); imag(imag() + z.imag()); return *this; - } + } - __host__ __device__ inline complex &operator-=(const complex &z) - { - real(real()-z.real()); - imag(imag()-z.imag()); - return *this; - } + __host__ __device__ inline complex &operator-=(const complex &z) + { + real(real() - z.real()); + imag(imag() - z.imag()); + return *this; + } - constexpr int real() const { return x; } - constexpr int imag() const { return y; } - __host__ __device__ inline void real(int re) { x = re; } - __host__ __device__ inline void imag(int im) { y = im; } + constexpr int real() const { return x; } + constexpr int imag() const { return y; } + __host__ __device__ inline void real(int re) { x = re; } + __host__ __device__ inline void imag(int im) { y = im; } - // cast operators - template inline __host__ __device__ operator 
complex() const - { - return complex(static_cast(real()), static_cast(imag())); } + // cast operators + template inline __host__ __device__ operator complex() const + { + return complex(static_cast(real()), static_cast(imag())); + } +}; + +template <> struct complex : public doubledouble2 { +public: + typedef doubledouble value_type; + + complex() = default; + + constexpr complex(const doubledouble &re, const doubledouble &im = doubledouble()) : + doubledouble2 {re, im} + { + } + + __host__ __device__ inline complex &operator+=(const complex &z) + { + real(real() + z.real()); + imag(imag() + z.imag()); + return *this; + } + __host__ __device__ inline complex &operator-=(const complex &z) + { + real(real() - z.real()); + imag(imag() - z.imag()); + return *this; + } + + constexpr doubledouble real() const { return x; } + constexpr doubledouble imag() const { return y; } + __host__ __device__ inline void real(doubledouble re) { x = re; } + __host__ __device__ inline void imag(doubledouble im) { y = im; } + + // cast operators + template inline __host__ __device__ operator complex() const + { + return complex(static_cast(real()), static_cast(imag())); + } }; // Binary arithmetic operations diff --git a/include/dbldbl.h b/include/dbldbl.h index e05aee104f..d907051686 100644 --- a/include/dbldbl.h +++ b/include/dbldbl.h @@ -66,7 +66,7 @@ typedef double2 dbldbl; requirement. To create a double-double from two arbitrary double-precision numbers, use add_double_to_dbldbl(). 
*/ -__device__ __forceinline__ dbldbl make_dbldbl (double head, double tail) +__device__ __host__ __forceinline__ dbldbl make_dbldbl (double head, double tail) { dbldbl z; z.x = tail; @@ -75,42 +75,42 @@ __device__ __forceinline__ dbldbl make_dbldbl (double head, double tail) } /* Return the head of a double-double number */ -__device__ __forceinline__ double get_dbldbl_head (dbldbl a) +__device__ __host__ __forceinline__ double get_dbldbl_head (dbldbl a) { return a.y; } /* Return the tail of a double-double number */ -__device__ __forceinline__ double get_dbldbl_tail (dbldbl a) +__device__ __host__ __forceinline__ double get_dbldbl_tail (dbldbl a) { return a.x; } /* Compute error-free sum of two unordered doubles. See Knuth, TAOCP vol. 2 */ -__device__ __forceinline__ dbldbl add_double_to_dbldbl (double a, double b) +__device__ __host__ __forceinline__ dbldbl add_double_to_dbldbl (double a, double b) { double t1, t2; dbldbl z; - z.y = __dadd_rn (a, b); - t1 = __dadd_rn (z.y, -a); - t2 = __dadd_rn (z.y, -t1); - t1 = __dadd_rn (b, -t1); - t2 = __dadd_rn (a, -t2); - z.x = __dadd_rn (t1, t2); + z.y = quda::dadd_rn (a, b); + t1 = quda::dadd_rn (z.y, -a); + t2 = quda::dadd_rn (z.y, -t1); + t1 = quda::dadd_rn (b, -t1); + t2 = quda::dadd_rn (a, -t2); + z.x = quda::dadd_rn (t1, t2); return z; } /* Compute error-free product of two doubles. 
Take full advantage of FMA */ -__device__ __forceinline__ dbldbl mul_double_to_dbldbl (double a, double b) +__device__ __host__ __forceinline__ dbldbl mul_double_to_dbldbl (double a, double b) { dbldbl z; - z.y = __dmul_rn (a, b); - z.x = __fma_rn (a, b, -z.y); + z.y = quda::dmul_rn (a, b); + z.x = quda::fma_rn (a, b, -z.y); return z; } /* Negate a double-double number, by separately negating head and tail */ -__device__ __forceinline__ dbldbl neg_dbldbl (dbldbl a) +__device__ __host__ __forceinline__ dbldbl neg_dbldbl (dbldbl a) { dbldbl z; z.y = -a.y; @@ -125,22 +125,22 @@ __device__ __forceinline__ dbldbl neg_dbldbl (dbldbl a) Floating-Point Numbers for GPU Computation. Retrieved on 7/12/2011 from http://andrewthall.org/papers/df64_qf128.pdf. */ -__device__ __forceinline__ dbldbl add_dbldbl (dbldbl a, dbldbl b) +__device__ __host__ __forceinline__ dbldbl add_dbldbl (dbldbl a, dbldbl b) { dbldbl z; double t1, t2, t3, t4, t5, e; - t1 = __dadd_rn (a.y, b.y); - t2 = __dadd_rn (t1, -a.y); - t3 = __dadd_rn (__dadd_rn (a.y, t2 - t1), __dadd_rn (b.y, -t2)); - t4 = __dadd_rn (a.x, b.x); - t2 = __dadd_rn (t4, -a.x); - t5 = __dadd_rn (__dadd_rn (a.x, t2 - t4), __dadd_rn (b.x, -t2)); - t3 = __dadd_rn (t3, t4); - t4 = __dadd_rn (t1, t3); - t3 = __dadd_rn (t1 - t4, t3); - t3 = __dadd_rn (t3, t5); - z.y = e = __dadd_rn (t4, t3); - z.x = __dadd_rn (t4 - e, t3); + t1 = quda::dadd_rn (a.y, b.y); + t2 = quda::dadd_rn (t1, -a.y); + t3 = quda::dadd_rn (quda::dadd_rn (a.y, t2 - t1), quda::dadd_rn (b.y, -t2)); + t4 = quda::dadd_rn (a.x, b.x); + t2 = quda::dadd_rn (t4, -a.x); + t5 = quda::dadd_rn (quda::dadd_rn (a.x, t2 - t4), quda::dadd_rn (b.x, -t2)); + t3 = quda::dadd_rn (t3, t4); + t4 = quda::dadd_rn (t1, t3); + t3 = quda::dadd_rn (t1 - t4, t3); + t3 = quda::dadd_rn (t3, t5); + z.y = e = quda::dadd_rn (t4, t3); + z.x = quda::dadd_rn (t4 - e, t3); return z; } @@ -151,22 +151,22 @@ __device__ __forceinline__ dbldbl add_dbldbl (dbldbl a, dbldbl b) Floating-Point Numbers for GPU 
Computation. Retrieved on 7/12/2011 from http://andrewthall.org/papers/df64_qf128.pdf. */ -__device__ __forceinline__ dbldbl sub_dbldbl (dbldbl a, dbldbl b) +__device__ __host__ __forceinline__ dbldbl sub_dbldbl (dbldbl a, dbldbl b) { dbldbl z; double t1, t2, t3, t4, t5, e; - t1 = __dadd_rn (a.y, -b.y); - t2 = __dadd_rn (t1, -a.y); - t3 = __dadd_rn (__dadd_rn (a.y, t2 - t1), - __dadd_rn (b.y, t2)); - t4 = __dadd_rn (a.x, -b.x); - t2 = __dadd_rn (t4, -a.x); - t5 = __dadd_rn (__dadd_rn (a.x, t2 - t4), - __dadd_rn (b.x, t2)); - t3 = __dadd_rn (t3, t4); - t4 = __dadd_rn (t1, t3); - t3 = __dadd_rn (t1 - t4, t3); - t3 = __dadd_rn (t3, t5); - z.y = e = __dadd_rn (t4, t3); - z.x = __dadd_rn (t4 - e, t3); + t1 = quda::dadd_rn (a.y, -b.y); + t2 = quda::dadd_rn (t1, -a.y); + t3 = quda::dadd_rn (quda::dadd_rn (a.y, t2 - t1), - quda::dadd_rn (b.y, t2)); + t4 = quda::dadd_rn (a.x, -b.x); + t2 = quda::dadd_rn (t4, -a.x); + t5 = quda::dadd_rn (quda::dadd_rn (a.x, t2 - t4), - quda::dadd_rn (b.x, t2)); + t3 = quda::dadd_rn (t3, t4); + t4 = quda::dadd_rn (t1, t3); + t3 = quda::dadd_rn (t1 - t4, t3); + t3 = quda::dadd_rn (t3, t5); + z.y = e = quda::dadd_rn (t4, t3); + z.x = quda::dadd_rn (t4 - e, t3); return z; } @@ -175,17 +175,17 @@ __device__ __forceinline__ dbldbl sub_dbldbl (dbldbl a, dbldbl b) relative error observed with 10 billion test cases was 5.238480533564479e-32 (~= 2**-103.9125). 
*/ -__device__ __forceinline__ dbldbl mul_dbldbl (dbldbl a, dbldbl b) +__device__ __host__ __forceinline__ dbldbl mul_dbldbl (dbldbl a, dbldbl b) { dbldbl t, z; double e; - t.y = __dmul_rn (a.y, b.y); - t.x = __fma_rn (a.y, b.y, -t.y); - t.x = __fma_rn (a.x, b.x, t.x); - t.x = __fma_rn (a.y, b.x, t.x); - t.x = __fma_rn (a.x, b.y, t.x); - z.y = e = __dadd_rn (t.y, t.x); - z.x = __dadd_rn (t.y - e, t.x); + t.y = quda::dmul_rn (a.y, b.y); + t.x = quda::fma_rn (a.y, b.y, -t.y); + t.x = quda::fma_rn (a.x, b.x, t.x); + t.x = quda::fma_rn (a.y, b.x, t.x); + t.x = quda::fma_rn (a.x, b.y, t.x); + z.y = e = quda::dadd_rn (t.y, t.x); + z.x = quda::dadd_rn (t.y - e, t.x); return z; } @@ -197,22 +197,22 @@ __device__ __forceinline__ dbldbl mul_dbldbl (dbldbl a, dbldbl b) maximum relative error observed with 10 billion test cases was 1.0161322480099059e-31 (~= 2**-102.9566). */ -__device__ __forceinline__ dbldbl div_dbldbl (dbldbl a, dbldbl b) +__device__ __host__ __forceinline__ dbldbl div_dbldbl (dbldbl a, dbldbl b) { dbldbl t, z; double e, r; r = 1.0 / b.y; - t.y = __dmul_rn (a.y, r); - e = __fma_rn (b.y, -t.y, a.y); - t.y = __fma_rn (r, e, t.y); - t.x = __fma_rn (b.y, -t.y, a.y); - t.x = __dadd_rn (a.x, t.x); - t.x = __fma_rn (b.x, -t.y, t.x); - e = __dmul_rn (r, t.x); - t.x = __fma_rn (b.y, -e, t.x); - t.x = __fma_rn (r, t.x, e); - z.y = e = __dadd_rn (t.y, t.x); - z.x = __dadd_rn (t.y - e, t.x); + t.y = quda::dmul_rn (a.y, r); + e = quda::fma_rn (b.y, -t.y, a.y); + t.y = quda::fma_rn (r, e, t.y); + t.x = quda::fma_rn (b.y, -t.y, a.y); + t.x = quda::dadd_rn (a.x, t.x); + t.x = quda::fma_rn (b.x, -t.y, t.x); + e = quda::dmul_rn (r, t.x); + t.x = quda::fma_rn (b.y, -e, t.x); + t.x = quda::fma_rn (r, t.x, e); + z.y = e = quda::dadd_rn (t.y, t.x); + z.x = quda::dadd_rn (t.y - e, t.x); return z; } @@ -223,25 +223,25 @@ __device__ __forceinline__ dbldbl div_dbldbl (dbldbl a, dbldbl b) relative error observed with 10 billion test cases was 3.7564109505601846e-32 (~= 2**-104.3923). 
*/ -__device__ __forceinline__ dbldbl sqrt_dbldbl (dbldbl a) +__device__ __host__ __forceinline__ dbldbl sqrt_dbldbl (dbldbl a) { dbldbl t, z; double e, y, s, r; r = quda::rsqrt(a.y); if (a.y == 0.0) r = 0.0; - y = __dmul_rn (a.y, r); - s = __fma_rn (y, -y, a.y); - r = __dmul_rn (0.5, r); - z.y = e = __dadd_rn (s, a.x); - z.x = __dadd_rn (s - e, a.x); - t.y = __dmul_rn (r, z.y); - t.x = __fma_rn (r, z.y, -t.y); - t.x = __fma_rn (r, z.x, t.x); - r = __dadd_rn (y, t.y); - s = __dadd_rn (y - r, t.y); - s = __dadd_rn (s, t.x); - z.y = e = __dadd_rn (r, s); - z.x = __dadd_rn (r - e, s); + y = quda::dmul_rn (a.y, r); + s = quda::fma_rn (y, -y, a.y); + r = quda::dmul_rn (0.5, r); + z.y = e = quda::dadd_rn (s, a.x); + z.x = quda::dadd_rn (s - e, a.x); + t.y = quda::dmul_rn (r, z.y); + t.x = quda::fma_rn (r, z.y, -t.y); + t.x = quda::fma_rn (r, z.x, t.x); + r = quda::dadd_rn (y, t.y); + s = quda::dadd_rn (y - r, t.y); + s = quda::dadd_rn (s, t.x); + z.y = e = quda::dadd_rn (r, s); + z.x = quda::dadd_rn (r - e, s); return z; } @@ -250,26 +250,26 @@ __device__ __forceinline__ dbldbl sqrt_dbldbl (dbldbl a) the maximum relative error observed with 10 billion test cases was 6.4937771666026349e-32 (~= 2**-103.6026) */ -__device__ __forceinline__ dbldbl rsqrt_dbldbl (dbldbl a) +__device__ __host__ __forceinline__ dbldbl rsqrt_dbldbl (dbldbl a) { dbldbl z; double r, s, e; r = quda::rsqrt(a.y); - e = __dmul_rn (a.y, r); - s = __fma_rn (e, -r, 1.0); - e = __fma_rn (a.y, r, -e); - s = __fma_rn (e, -r, s); - e = __dmul_rn (a.x, r); - s = __fma_rn (e, -r, s); + e = quda::dmul_rn (a.y, r); + s = quda::fma_rn (e, -r, 1.0); + e = quda::fma_rn (a.y, r, -e); + s = quda::fma_rn (e, -r, s); + e = quda::dmul_rn (a.x, r); + s = quda::fma_rn (e, -r, s); e = 0.5 * r; - z.y = __dmul_rn (e, s); - z.x = __fma_rn (e, s, -z.y); - s = __dadd_rn (r, z.y); - r = __dadd_rn (r, -s); - r = __dadd_rn (r, z.y); - r = __dadd_rn (r, z.x); - z.y = e = __dadd_rn (s, r); - z.x = __dadd_rn (s - e, r); + z.y = 
quda::dmul_rn (e, s); + z.x = quda::fma_rn (e, s, -z.y); + s = quda::dadd_rn (r, z.y); + r = quda::dadd_rn (r, -s); + r = quda::dadd_rn (r, z.y); + r = quda::dadd_rn (r, z.x); + z.y = e = quda::dadd_rn (s, r); + z.x = quda::dadd_rn (s - e, r); return z; } @@ -285,11 +285,11 @@ struct doubledouble { dbldbl a; - __device__ __host__ doubledouble() { a.x = 0.0; a.y = 0.0; } - __device__ __host__ doubledouble(const doubledouble &a) : a(a.a) { } - __device__ __host__ doubledouble(const dbldbl &a) : a(a) { } - __device__ __host__ doubledouble(const double &head, const double &tail) { a.y = head; a.x = tail; } - __device__ __host__ doubledouble(const double &head) { a.y = head; a.x = 0.0; } + doubledouble() = default; + constexpr doubledouble(const doubledouble &a) = default; + constexpr doubledouble(const dbldbl &a) : a(a) { } + constexpr doubledouble(const double &head, const double &tail) : a{tail, head} { } + constexpr doubledouble(const double &head) : a{0.0, head} { } __device__ __host__ doubledouble& operator=(const double &head) { this->a.y = head; @@ -297,43 +297,58 @@ struct doubledouble { return *this; } - __device__ doubledouble& operator+=(const doubledouble &a) { + __device__ __host__ doubledouble& operator+=(const doubledouble &a) { this->a = add_dbldbl(this->a, a.a); return *this; } - __device__ __host__ double head() const { return a.y; } - __device__ __host__ double tail() const { return a.x; } + __device__ __host__ doubledouble& operator-=(const doubledouble &a) { + this->a = sub_dbldbl(this->a, a.a); + return *this; + } + constexpr double head() const { return a.y; } + constexpr double tail() const { return a.x; } __device__ __host__ void print() const { printf("scalar: %16.14e + %16.14e\n", head(), tail()); } + constexpr operator double() const { return head(); } }; -__device__ inline bool operator>(const doubledouble &a, const double &b) { +__device__ __host__ inline bool operator>(const doubledouble &a, const doubledouble &b) { + if (a.head() > 
b.head()) { + return true; + } else if (a.head() == b.head() && a.tail() > b.tail()) { + return true; + } else { + return false; + } +} + +__device__ __host__ inline bool operator>(const doubledouble &a, const double &b) { return a.head() > b; } -__device__ inline doubledouble operator+(const doubledouble &a, const doubledouble &b) { +__device__ __host__ inline doubledouble operator+(const doubledouble &a, const doubledouble &b) { return doubledouble(add_dbldbl(a.a,b.a)); } -__device__ inline doubledouble operator-(const doubledouble &a, const doubledouble &b) { +__device__ __host__ inline doubledouble operator-(const doubledouble &a, const doubledouble &b) { return doubledouble(sub_dbldbl(a.a,b.a)); } -__device__ inline doubledouble operator*(const doubledouble &a, const doubledouble &b) { +__device__ __host__ inline doubledouble operator*(const doubledouble &a, const doubledouble &b) { return doubledouble(mul_dbldbl(a.a,b.a)); } -__device__ inline doubledouble operator/(const doubledouble &a, const doubledouble &b) { +__device__ __host__ inline doubledouble operator/(const doubledouble &a, const doubledouble &b) { return doubledouble(div_dbldbl(a.a,b.a)); } -__device__ inline doubledouble add_double_to_doubledouble(const double &a, const double &b) { +__device__ __host__ inline doubledouble add_double_to_doubledouble(const double &a, const double &b) { return doubledouble(add_double_to_dbldbl(a,b)); } -__device__ inline doubledouble mul_double_to_doubledouble(const double &a, const double &b) { +__device__ __host__ inline doubledouble mul_double_to_doubledouble(const double &a, const double &b) { return doubledouble(mul_double_to_dbldbl(a,b)); } @@ -341,12 +356,12 @@ struct doubledouble2 { doubledouble x; doubledouble y; - __device__ __host__ doubledouble2() : x(), y() { } - __device__ __host__ doubledouble2(const doubledouble2 &a) : x(a.x), y(a.y) { } - __device__ __host__ doubledouble2(const double2 &a) : x(a.x), y(a.y) { } - __device__ __host__ 
doubledouble2(const doubledouble &x, const doubledouble &y) : x(x), y(y) { } + doubledouble2() = default; + constexpr doubledouble2(const doubledouble2 &a) = default; + constexpr doubledouble2(const double2 &a) : x(a.x), y(a.y) { } + constexpr doubledouble2(const doubledouble &x, const doubledouble &y) : x(x), y(y) { } - __device__ doubledouble2& operator+=(const doubledouble2 &a) { + __device__ __host__ doubledouble2& operator+=(const doubledouble2 &a) { x += a.x; y += a.y; return *this; @@ -360,12 +375,12 @@ struct doubledouble3 { doubledouble y; doubledouble z; - __device__ __host__ doubledouble3() : x(), y() { } - __device__ __host__ doubledouble3(const doubledouble3 &a) : x(a.x), y(a.y), z(a.z) { } - __device__ __host__ doubledouble3(const double3 &a) : x(a.x), y(a.y), z(a.z) { } - __device__ __host__ doubledouble3(const doubledouble &x, const doubledouble &y, const doubledouble &z) : x(x), y(y), z(z) { } + doubledouble3() = default; + constexpr doubledouble3(const doubledouble3 &a) = default; + constexpr doubledouble3(const double3 &a) : x(a.x), y(a.y), z(a.z) { } + constexpr doubledouble3(const doubledouble &x, const doubledouble &y, const doubledouble &z) : x(x), y(y), z(z) { } - __device__ doubledouble3& operator+=(const doubledouble3 &a) { + __device__ __host__ doubledouble3& operator+=(const doubledouble3 &a) { x += a.x; y += a.y; z += a.z; @@ -375,8 +390,8 @@ struct doubledouble3 { __device__ __host__ void print() const { printf("vec3: (%16.14e + %16.14e) (%16.14e + %16.14e) (%16.14e + %16.14e)\n", x.head(), x.tail(), y.head(), y.tail(), z.head(), z.tail()); } }; -__device__ doubledouble2 operator+(const doubledouble2 &a, const doubledouble2 &b) +__device__ __host__ inline doubledouble2 operator+(const doubledouble2 &a, const doubledouble2 &b) { return doubledouble2(a.x + b.x, a.y + b.y); } -__device__ doubledouble3 operator+(const doubledouble3 &a, const doubledouble3 &b) +__device__ __host__ inline doubledouble3 operator+(const doubledouble3 &a, const 
doubledouble3 &b) { return doubledouble3(a.x + b.x, a.y + b.y, a.z + b.z); } diff --git a/include/float_vector.h b/include/float_vector.h index 201d803c95..ff5dc23e9f 100644 --- a/include/float_vector.h +++ b/include/float_vector.h @@ -12,6 +12,7 @@ #include #include #include +#include "dbldbl.h" namespace quda { @@ -44,69 +45,6 @@ namespace quda { return c; } - template constexpr std::enable_if_t, T> zero() { return static_cast(0); } - template constexpr std::enable_if_t>, T> zero() - { - return static_cast(0); - } - - template using specialize = std::enable_if_t, U>; - - template constexpr specialize zero() { return double2 {0.0, 0.0}; } - template constexpr specialize zero() { return double3 {0.0, 0.0, 0.0}; } - template constexpr specialize zero() { return double4 {0.0, 0.0, 0.0, 0.0}; } - - template constexpr specialize zero() { return float2 {0.0f, 0.0f}; } - template constexpr specialize zero() { return float3 {0.0f, 0.0f, 0.0f}; } - template constexpr specialize zero() { return float4 {0.0f, 0.0f, 0.0f, 0.0f}; } - -#ifdef QUAD_SUM - template __device__ __host__ inline specialize zero() { return doubledouble(); } - template __device__ __host__ inline specialize zero() { return doubledouble2(); } - template __device__ __host__ inline specialize zero() { return doubledouble3(); } -#endif - - template __device__ __host__ inline array zero() - { - array v; -#pragma unroll - for (int i = 0; i < n; i++) v[i] = zero(); - return v; - } - - // array of arithmetic types specialization - template - __device__ __host__ inline std::enable_if_t< - std::is_same_v> && std::is_arithmetic_v, T> - zero() - { - return zero(); - } - - // array of array specialization - template - __device__ __host__ inline std::enable_if_t< - std::is_same_v, T::N>>, T> - zero() - { - T v; -#pragma unroll - for (int i = 0; i < v.size(); i++) v[i] = zero(); - return v; - } - - // array of complex specialization - template - __device__ - __host__ inline std::enable_if_t, T::N>>, T> - zero() - { - T 
v; -#pragma unroll - for (int i = 0; i < v.size(); i++) v[i] = zero(); - return v; - } - /** Container used when we want to track the reference value when computing an infinity norm @@ -114,10 +52,14 @@ namespace quda { template struct deviation_t { T diff; T ref; - }; - template constexpr specialize> zero() { return {0.0, 0.0}; } - template constexpr specialize> zero() { return {0.0f, 0.0f}; } + template constexpr deviation_t &operator=(const deviation_t &other) + { + diff = other.diff; + ref = other.ref; + return *this; + } + }; template __host__ __device__ inline bool operator>(const deviation_t &a, const deviation_t &b) { @@ -128,6 +70,10 @@ namespace quda { static constexpr std::enable_if_t, T> value() { return std::numeric_limits::lowest(); } }; + template <> struct low { + static constexpr doubledouble value() { return std::numeric_limits::lowest(); } + }; + template struct low> { static inline __host__ __device__ array value() { @@ -146,6 +92,10 @@ namespace quda { static constexpr std::enable_if_t, T> value() { return std::numeric_limits::max(); } }; + template <> struct high { + static constexpr doubledouble value() { return std::numeric_limits::max(); } + }; + template struct RealType { }; template <> struct RealType { diff --git a/include/reducer.h b/include/reducer.h index 3c2f37f364..0af3b44852 100644 --- a/include/reducer.h +++ b/include/reducer.h @@ -84,7 +84,7 @@ namespace quda using reduce_t = T; using reducer_t = plus; template static inline void comm_reduce(std::vector &a) { comm_allreduce_sum(a); } - __device__ __host__ static inline T init() { return zero(); } + __device__ __host__ static inline T init() { return T{}; } __device__ __host__ static inline T apply(T a, T b) { return a + b; } __device__ __host__ inline T operator()(T a, T b) const { return apply(a, b); } }; diff --git a/include/targets/cuda/math_helper.cuh b/include/targets/cuda/math_helper.cuh index af4ad2bbb5..cee198e2d0 100644 --- a/include/targets/cuda/math_helper.cuh +++ 
b/include/targets/cuda/math_helper.cuh @@ -223,6 +223,51 @@ namespace quda { */ __device__ __host__ inline float fdividef(float a, float b) { return target::dispatch(a, b); } + + template struct dmul_rn_impl { + inline double operator()(double a, double b) { return a * b; } + }; +#ifdef QUDA_CUDA_CC + template <> struct dmul_rn_impl { + __device__ inline double operator()(double a, double b) { return ::__dmul_rn(a, b); } + }; +#endif + + /** + @brief IEEE double precision multiplication + */ + __device__ __host__ inline double dmul_rn(double a, double b) { return target::dispatch(a, b); } + + + template struct dadd_rn_impl { + inline double operator()(double a, double b) { return a + b; } + }; +#ifdef QUDA_CUDA_CC + template <> struct dadd_rn_impl { + __device__ inline double operator()(double a, double b) { return ::__dadd_rn(a, b); } + }; +#endif + + /** + @brief IEEE double precision addition + */ + __device__ __host__ inline double dadd_rn(double a, double b) { return target::dispatch(a, b); } + + + template struct fma_rn_impl { + inline double operator()(double a, double b, double c) { return std::fma(a, b, c); } + }; +#ifdef QUDA_CUDA_CC + template <> struct fma_rn_impl { + __device__ inline double operator()(double a, double b, double c) { return ::__fma_rn(a, b, c); } + }; +#endif + + /** + @brief IEEE double precision fused multiply add + */ + __device__ __host__ inline double fma_rn(double a, double b, double c) { return target::dispatch(a, b, c); } + } #undef QUDA_CUDA_CC From 2d67d97f8be65094a8a2b28957eb0b1f98d0821d Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 19 Jul 2023 17:19:19 -0700 Subject: [PATCH 06/38] Modify reduction kernels to use device_reduce_t and not double for internal computation --- include/kernels/clover_invert.cuh | 2 +- include/kernels/gauge_det_trace.cuh | 2 +- include/kernels/gauge_fix_fft.cuh | 4 ++-- include/kernels/gauge_fix_ovr.cuh | 4 ++-- include/kernels/gauge_loop_trace.cuh | 3 +-- include/kernels/gauge_plaq.cuh 
| 2 +- include/kernels/gauge_polyakov_loop.cuh | 2 +- include/kernels/gauge_qcharge.cuh | 4 ++-- include/kernels/momentum.cuh | 6 +++--- lib/gauge_qcharge.cu | 2 +- lib/momentum.cu | 2 +- 11 files changed, 16 insertions(+), 17 deletions(-) diff --git a/include/kernels/clover_invert.cuh b/include/kernels/clover_invert.cuh index 42292b7a83..c0dfb00946 100644 --- a/include/kernels/clover_invert.cuh +++ b/include/kernels/clover_invert.cuh @@ -7,7 +7,7 @@ namespace quda { - template struct CloverInvertArg : public ReduceArg> { + template struct CloverInvertArg : public ReduceArg> { using store_t = store_t_; using real = typename mapper::type; static constexpr bool twist = twist_; diff --git a/include/kernels/gauge_det_trace.cuh b/include/kernels/gauge_det_trace.cuh index bc4fc5d5fa..d6c00be412 100644 --- a/include/kernels/gauge_det_trace.cuh +++ b/include/kernels/gauge_det_trace.cuh @@ -9,7 +9,7 @@ namespace quda { enum struct compute_type { determinant, trace }; template - struct KernelArg : public ReduceArg> { + struct KernelArg : public ReduceArg> { static constexpr int nColor = nColor_; static constexpr QudaReconstructType recon = recon_; static constexpr compute_type type = type_; diff --git a/include/kernels/gauge_fix_fft.cuh b/include/kernels/gauge_fix_fft.cuh index 06b0f89b0f..de10b159e5 100644 --- a/include/kernels/gauge_fix_fft.cuh +++ b/include/kernels/gauge_fix_fft.cuh @@ -150,7 +150,7 @@ namespace quda { * @brief container to pass parameters for the gauge fixing quality kernel */ template - struct GaugeFixQualityFFTArg : public ReduceArg> { + struct GaugeFixQualityFFTArg : public ReduceArg> { using real = typename mapper::type; static constexpr QudaReconstructType recon = recon_; using Gauge = typename gauge_mapper::type; @@ -159,7 +159,7 @@ namespace quda { int_fastdiv X[4]; // grid dimensions Gauge data; complex *delta; - reduce_t result; + array result; int volume; GaugeFixQualityFFTArg(const GaugeField &data, complex *delta) : diff --git 
a/include/kernels/gauge_fix_ovr.cuh b/include/kernels/gauge_fix_ovr.cuh index 644d9c2463..3427a51169 100644 --- a/include/kernels/gauge_fix_ovr.cuh +++ b/include/kernels/gauge_fix_ovr.cuh @@ -14,7 +14,7 @@ namespace quda { * @brief container to pass parameters for the gauge fixing quality kernel */ template - struct GaugeFixQualityOVRArg : public ReduceArg> { + struct GaugeFixQualityOVRArg : public ReduceArg> { using real = typename mapper::type; static constexpr QudaReconstructType recon = recon_; using Gauge = typename gauge_mapper::type; @@ -23,7 +23,7 @@ namespace quda { int X[4]; // grid dimensions int border[4]; Gauge data; - reduce_t result; + array result; GaugeFixQualityOVRArg(const GaugeField &data) : ReduceArg(dim3(data.LocalVolumeCB(), 2, 1), 1, true), // reset = true diff --git a/include/kernels/gauge_loop_trace.cuh b/include/kernels/gauge_loop_trace.cuh index 9c6b6cc3f8..3e9b2a09c9 100644 --- a/include/kernels/gauge_loop_trace.cuh +++ b/include/kernels/gauge_loop_trace.cuh @@ -18,9 +18,8 @@ namespace quda { constexpr unsigned int max_n_batch_block_loop_trace() { return 8; } template - struct GaugeLoopTraceArg : public ReduceArg> { + struct GaugeLoopTraceArg : public ReduceArg> { using real = typename mapper::type; - using reduce_t = array; static constexpr unsigned int max_n_batch_block = max_n_batch_block_loop_trace(); static constexpr int nColor = nColor_; static constexpr QudaReconstructType recon = recon_; diff --git a/include/kernels/gauge_plaq.cuh b/include/kernels/gauge_plaq.cuh index 563d674000..1e4a0d9028 100644 --- a/include/kernels/gauge_plaq.cuh +++ b/include/kernels/gauge_plaq.cuh @@ -9,7 +9,7 @@ namespace quda { template - struct GaugePlaqArg : public ReduceArg> { + struct GaugePlaqArg : public ReduceArg> { using Float = Float_; static constexpr int nColor = nColor_; static_assert(nColor == 3, "Only nColor=3 enabled at this time"); diff --git a/include/kernels/gauge_polyakov_loop.cuh b/include/kernels/gauge_polyakov_loop.cuh index 
5ea4bc5219..2b7f86afed 100644 --- a/include/kernels/gauge_polyakov_loop.cuh +++ b/include/kernels/gauge_polyakov_loop.cuh @@ -166,7 +166,7 @@ namespace quda { }; template - struct GaugePolyakovLoopTraceArg : public ReduceArg> { + struct GaugePolyakovLoopTraceArg : public ReduceArg> { using real = typename mapper::type; static constexpr int nColor = nColor_; static_assert(nColor == 3, "Only nColor=3 enabled at this time"); diff --git a/include/kernels/gauge_qcharge.cuh b/include/kernels/gauge_qcharge.cuh index fadfd45321..79ac4156b2 100644 --- a/include/kernels/gauge_qcharge.cuh +++ b/include/kernels/gauge_qcharge.cuh @@ -7,7 +7,7 @@ namespace quda { template struct QChargeArg : - public ReduceArg> + public ReduceArg> { using Float = Float_; static constexpr int nColor = nColor_; @@ -43,7 +43,7 @@ namespace quda constexpr real n_inv = static_cast(1.0 / Arg::nColor); reduce_t E_local{0, 0, 0}; - double &Q = E_local[2]; + device_reduce_t &Q = E_local[2]; // Load the field-strength tensor from global memory //F0 = F[Y,X], F1 = F[Z,X], F2 = F[Z,Y], diff --git a/include/kernels/momentum.cuh b/include/kernels/momentum.cuh index 41f8bc929c..60d5550be0 100644 --- a/include/kernels/momentum.cuh +++ b/include/kernels/momentum.cuh @@ -7,14 +7,14 @@ namespace quda { template - struct MomActionArg : ReduceArg { + struct MomActionArg : ReduceArg { using Float = Float_; static constexpr int nColor = nColor_; static constexpr QudaReconstructType recon = recon_; const typename gauge_mapper::type mom; MomActionArg(const GaugeField &mom) : - ReduceArg(dim3(mom.VolumeCB(), 2, 1)), + ReduceArg(dim3(mom.VolumeCB(), 2, 1)), mom(mom) { } }; @@ -56,7 +56,7 @@ namespace quda { }; template - struct UpdateMomArg : ReduceArg> + struct UpdateMomArg : ReduceArg> { using Float = Float_; static constexpr int nColor = nColor_; diff --git a/lib/gauge_qcharge.cu b/lib/gauge_qcharge.cu index 3b4e584b02..fdbbbe573c 100644 --- a/lib/gauge_qcharge.cu +++ b/lib/gauge_qcharge.cu @@ -33,7 +33,7 @@ namespace 
quda { TuneParam tp = tuneLaunch(*this, getTuning(), getVerbosity()); - typename Arg<>::reduce_t result{}; + array result{}; if (!density) { Arg arg(Fmunu, static_cast(qdensity)); launch(result, tp, stream, arg); diff --git a/lib/momentum.cu b/lib/momentum.cu index 7a574687ca..e273036443 100644 --- a/lib/momentum.cu +++ b/lib/momentum.cu @@ -107,7 +107,7 @@ namespace quda { const GaugeField &force; GaugeField &mom; double coeff; - typename Arg::reduce_t force_max; + array force_max; public: UpdateMom(const GaugeField &force, GaugeField &mom, double coeff, const char *fname) : From feccf8948db152c7b22ed861952824b7d95fb97c Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 19 Jul 2023 17:23:59 -0700 Subject: [PATCH 07/38] Use same underlying reduction type on host as device --- include/register_traits.h | 12 ++++++++++++ lib/reduce_quda.cu | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/include/register_traits.h b/include/register_traits.h index ea5c567133..5b780a403c 100644 --- a/include/register_traits.h +++ b/include/register_traits.h @@ -308,6 +308,18 @@ namespace quda { typedef char8 type; }; + // demote vector type to underlying scalar type + template struct get_scalar; + template <> struct get_scalar { using type = float; }; + template <> struct get_scalar { using type = double; }; + template <> struct get_scalar { using type = double; }; + template <> struct get_scalar { using type = doubledouble; }; + template <> struct get_scalar { using type = doubledouble; }; + template struct get_scalar> { using type = typename get_scalar::type; }; + template struct get_scalar> { using type = typename get_scalar::type; }; + + template using get_scalar_t = typename get_scalar::type; + template struct AllocType { }; template<> struct AllocType { typedef size_t type; }; template<> struct AllocType { typedef int type; }; diff --git a/lib/reduce_quda.cu b/lib/reduce_quda.cu index f2316f20c9..f6ab452f7a 100644 --- a/lib/reduce_quda.cu +++ 
b/lib/reduce_quda.cu @@ -101,7 +101,7 @@ namespace quda { using host_store_t = typename host_type_mapper::type; using host_y_store_t = typename host_type_mapper::type; using host_real_t = typename mapper::type; - Reducer r_(a, b); + Reducer r_(a, b); // redefine site_unroll with host_store types to ensure we have correct N/Ny/M values constexpr bool site_unroll = !std::is_same::value || isFixed::value || decltype(r)::site_unroll; From d70303a38c4122950b8b59159d6640ed1386bada Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 19 Jul 2023 17:43:13 -0700 Subject: [PATCH 08/38] Move get_scalar overload to float_Vector.h --- include/float_vector.h | 2 ++ include/register_traits.h | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/include/float_vector.h b/include/float_vector.h index ff5dc23e9f..bcf485f1ba 100644 --- a/include/float_vector.h +++ b/include/float_vector.h @@ -61,6 +61,8 @@ namespace quda { } }; + template struct get_scalar> { using type = typename get_scalar::type; }; + template __host__ __device__ inline bool operator>(const deviation_t &a, const deviation_t &b) { return a.diff > b.diff; diff --git a/include/register_traits.h b/include/register_traits.h index 5b780a403c..f2cd7bf67e 100644 --- a/include/register_traits.h +++ b/include/register_traits.h @@ -316,7 +316,6 @@ namespace quda { template <> struct get_scalar { using type = doubledouble; }; template <> struct get_scalar { using type = doubledouble; }; template struct get_scalar> { using type = typename get_scalar::type; }; - template struct get_scalar> { using type = typename get_scalar::type; }; template using get_scalar_t = typename get_scalar::type; From 4a7061a881c2553d6b3670f8e01b27cb8bb500cd Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 19 Jul 2023 17:43:44 -0700 Subject: [PATCH 09/38] Add *= and /= overloads for doubledouble --- include/dbldbl.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/include/dbldbl.h b/include/dbldbl.h index 
d907051686..3d5e4ec4b3 100644 --- a/include/dbldbl.h +++ b/include/dbldbl.h @@ -306,6 +306,17 @@ struct doubledouble { this->a = sub_dbldbl(this->a, a.a); return *this; } + + __device__ __host__ void operator*=(const doubledouble &a) + { + this->a = mul_dbldbl(this->a, a.a); + } + + __device__ __host__ void operator/=(const doubledouble &a) + { + this->a = div_dbldbl(this->a, a.a); + } + constexpr double head() const { return a.y; } constexpr double tail() const { return a.x; } From 7e40280d3d262a0f7958f8df37a784b1af1424fc Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Wed, 19 Jul 2023 17:45:24 -0700 Subject: [PATCH 10/38] Fix heavy quark residual norm for non-double reduction type --- include/kernels/reduce_core.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/kernels/reduce_core.cuh b/include/kernels/reduce_core.cuh index 4597f0b8cd..d0517e706d 100644 --- a/include/kernels/reduce_core.cuh +++ b/include/kernels/reduce_core.cuh @@ -493,7 +493,7 @@ namespace quda { sum[0] += aux[0]; sum[1] += aux[1]; - sum[2] += (aux[0] > 0.0) ? (aux[1] / aux[0]) : static_cast(1.0); + sum[2] += (aux[0] > 0.0) ? (aux[1] / aux[0]) : static_cast(1.0); } constexpr int flops() const { return 4; } //! undercounts since it excludes the per-site division @@ -540,7 +540,7 @@ namespace quda { sum[0] += aux[0]; sum[1] += aux[1]; - sum[2] += (aux[0] > 0.0) ? (aux[1] / aux[0]) : static_cast(1.0); + sum[2] += (aux[0] > 0.0) ? 
(aux[1] / aux[0]) : static_cast(1.0); } constexpr int flops() const { return 5; } From 2a80b2f75007c5c5770a3811f8ed06ecfb1818fc Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 20 Jul 2023 16:14:40 -0700 Subject: [PATCH 11/38] Add various functions to doubledouble needed for generic deployment --- include/dbldbl.h | 58 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/include/dbldbl.h b/include/dbldbl.h index 3d5e4ec4b3..fbbcfa750e 100644 --- a/include/dbldbl.h +++ b/include/dbldbl.h @@ -322,10 +322,23 @@ struct doubledouble { __device__ __host__ void print() const { printf("scalar: %16.14e + %16.14e\n", head(), tail()); } - constexpr operator double() const { return head(); } + explicit constexpr operator double() const { return head(); } + explicit constexpr operator float() const { return static_cast(head()); } + explicit constexpr operator int() const { return static_cast(head()); } }; -__device__ __host__ inline bool operator>(const doubledouble &a, const doubledouble &b) { +__device__ __host__ inline doubledouble sqrt(const doubledouble &a) { return doubledouble(sqrt_dbldbl(a.a)); } + +__device__ __host__ inline doubledouble operator-(const doubledouble &a) { return doubledouble(neg_dbldbl(a.a)); } + +__device__ __host__ inline doubledouble abs(const doubledouble &a) { return (a.head() < 0 ? 
-a : a); } + +__device__ __host__ inline bool isinf(const doubledouble &a) { return isinf(a.head()); } + +__device__ __host__ inline bool isnan(const doubledouble &a) { return isnan(a.head()); } + +__device__ __host__ inline bool operator>(const doubledouble &a, const doubledouble &b) +{ if (a.head() > b.head()) { return true; } else if (a.head() == b.head() && a.tail() > b.tail()) { @@ -335,6 +348,47 @@ __device__ __host__ inline bool operator>(const doubledouble &a, const doubledo } } +__device__ __host__ inline bool operator>=(const doubledouble &a, const doubledouble &b) +{ + if (a.head() >= b.head()) { + return true; + } else if (a.head() == b.head() && a.tail() >= b.tail()) { + return true; + } else { + return false; + } +} + +__device__ __host__ inline bool operator<(const doubledouble &a, const doubledouble &b) +{ + if (a.head() < b.head()) { + return true; + } else if (a.head() == b.head() && a.tail() < b.tail()) { + return true; + } else { + return false; + } +} + +__device__ __host__ inline bool operator<=(const doubledouble &a, const doubledouble &b) +{ + if (a.head() <= b.head()) { + return true; + } else if (a.head() == b.head() && a.tail() <= b.tail()) { + return true; + } else { + return false; + } +} + +__device__ __host__ inline bool operator==(const doubledouble &a, const doubledouble &b) { + return (a.head() == b.head() && a.tail() == b.tail()); +} + +__device__ __host__ inline bool operator!=(const doubledouble &a, const doubledouble &b) { + return !(a == b); +} + __device__ __host__ inline bool operator>(const doubledouble &a, const double &b) { return a.head() > b; } From e9089e10b6ee4521af5874d96c489a3d4905b2d1 Mon Sep 17 00:00:00 2001 From: maddyscientist Date: Thu, 20 Jul 2023 16:44:13 -0700 Subject: [PATCH 12/38] Commence the slog that generizes the host-side scalar precision: introduce new CMake set types: real_t (QUDA_SCALAR_TYPE) - the host side scalar precision, complex_t the complex version of this (replaces Complex), device_reduce_t 
(QUDA_REDUCTION_TYPE). Eventually we will be able to set these to non-double types, but we're there yet.... --- include/blas_quda.h | 194 +++++++++--------- include/deflation.h | 4 +- include/dirac_quda.h | 10 +- include/dslash_quda.h | 20 +- include/eigensolve_quda.h | 75 ++++--- include/gauge_path_quda.h | 12 +- include/invert_quda.h | 34 +-- include/kernels/dslash_domain_wall_4d.cuh | 10 +- .../dslash_domain_wall_4d_fused_m5.cuh | 10 +- include/kernels/dslash_domain_wall_m5.cuh | 28 +-- include/kernels/dslash_mdw_fused.cuh | 2 +- include/kernels/dslash_mobius_eofa.cuh | 2 +- include/kernels/dslash_wilson.cuh | 4 +- include/quda_define.h.in | 3 + include/quda_internal.h | 16 +- include/reducer.h | 6 - include/register_traits.h | 2 +- lib/CMakeLists.txt | 10 + lib/blas_quda.cu | 38 ++-- lib/deflation.cpp | 28 +-- lib/dirac_mobius.cpp | 4 +- lib/dslash5_domain_wall.cu | 10 +- lib/dslash5_mobius_eofa.cu | 10 +- lib/dslash_domain_wall_4d.cu | 6 +- lib/dslash_domain_wall_4d_fused_m5.hpp | 2 +- lib/dslash_domain_wall_4d_m5inv.cu | 4 +- lib/dslash_domain_wall_4d_m5inv_m5inv.cu | 4 +- lib/dslash_domain_wall_4d_m5inv_m5pre.cu | 4 +- lib/dslash_domain_wall_4d_m5mob.cu | 4 +- lib/dslash_domain_wall_4d_m5pre.cu | 4 +- lib/dslash_domain_wall_4d_m5pre_m5inv.cu | 4 +- lib/dslash_domain_wall_4d_m5pre_m5mob.cu | 4 +- lib/dslash_mdw_fused.in.cu | 6 +- lib/dslash_mdw_fused.in.hpp | 8 +- lib/dslash_mdw_fused_impl.hpp | 6 +- lib/eig_block_trlm.cpp | 61 +++--- lib/eig_iram.cpp | 117 ++++++----- lib/eig_trlm.cpp | 33 +-- lib/eigensolve_quda.cpp | 132 ++++++------ lib/gauge_loop_trace.cu | 4 +- lib/gauge_observable.cpp | 4 +- lib/interface_quda.cpp | 20 +- lib/inv_bicgstab_quda.cpp | 38 ++-- lib/inv_bicgstabl_quda.cpp | 60 +++--- lib/inv_ca_cg.cpp | 18 +- lib/inv_ca_gcr.cpp | 18 +- lib/inv_cg3_quda.cpp | 26 +-- lib/inv_cg_quda.cpp | 64 +++--- lib/inv_eigcg_quda.cpp | 14 +- lib/inv_gcr_quda.cpp | 30 +-- lib/inv_gmresdr_quda.cpp | 44 ++-- lib/inv_mr_quda.cpp | 4 +- lib/inv_mre.cpp | 12 +- 
lib/inv_multi_cg_quda.cpp | 8 +- lib/inv_pcg_quda.cpp | 12 +- lib/inv_sd_quda.cpp | 9 +- lib/madwf_ml.cpp | 2 +- lib/multi_blas_quda.cu | 132 ++++++------ lib/multi_reduce_quda.cu | 52 ++--- lib/multigrid.cpp | 10 +- lib/quda_arpack_interface.cpp | 24 +-- lib/reduce_quda.cu | 74 +++---- tests/blas_test.cpp | 137 +++++++------ tests/gauge_path_test.cpp | 4 +- .../domain_wall_dslash_reference.cpp | 6 +- tests/host_reference/dslash_reference.cpp | 2 +- .../host_reference/gauge_force_reference.cpp | 4 +- tests/host_reference/gauge_force_reference.h | 2 +- 68 files changed, 899 insertions(+), 866 deletions(-) diff --git a/include/blas_quda.h b/include/blas_quda.h index 8df40df452..8bb711af38 100644 --- a/include/blas_quda.h +++ b/include/blas_quda.h @@ -49,14 +49,14 @@ namespace quda { @param[in] x input vector @param[out] y output vector */ - void axy(double a, const ColorSpinorField &x, ColorSpinorField &y); + void axy(real_t a, const ColorSpinorField &x, ColorSpinorField &y); /** @brief Apply the rescale operation x = a * x @param[in] a scalar multiplier @param[in] x input vector */ - inline void ax(double a, ColorSpinorField &x) { axy(a, x, x); } + inline void ax(real_t a, ColorSpinorField &x) { axy(a, x, x); } /** @brief Apply the operation z = a * x + b * y @@ -66,7 +66,7 @@ namespace quda { @param[in] y input vector @param[out] z output vector */ - void axpbyz(double a, const ColorSpinorField &x, double b, const ColorSpinorField &y, ColorSpinorField &z); + void axpbyz(real_t a, const ColorSpinorField &x, real_t b, const ColorSpinorField &y, ColorSpinorField &z); /** @brief Apply the operation y += x @@ -88,7 +88,7 @@ namespace quda { @param[in] x input vector @param[in,out] y update vector */ - inline void axpy(double a, const ColorSpinorField &x, ColorSpinorField &y) { axpbyz(a, x, 1.0, y, y); } + inline void axpy(real_t a, const ColorSpinorField &x, ColorSpinorField &y) { axpbyz(a, x, 1.0, y, y); } /** @brief Apply the operation y = a * x + b * y @@ -97,7 +97,7 
@@ namespace quda { @param[in] b scalar multiplier @param[in,out] y update vector */ - inline void axpby(double a, const ColorSpinorField &x, double b, ColorSpinorField &y) { axpbyz(a, x, b, y, y); } + inline void axpby(real_t a, const ColorSpinorField &x, real_t b, ColorSpinorField &y) { axpbyz(a, x, b, y, y); } /** @brief Apply the operation y = x + a * y @@ -105,7 +105,7 @@ namespace quda { @param[in] a scalar multiplier @param[in,out] y update vector */ - inline void xpay(const ColorSpinorField &x, double a, ColorSpinorField &y) { axpbyz(1.0, x, a, y, y); } + inline void xpay(const ColorSpinorField &x, real_t a, ColorSpinorField &y) { axpbyz(1.0, x, a, y, y); } /** @brief Apply the operation z = x + a * y @@ -114,7 +114,7 @@ namespace quda { @param[in] y update vector @param[out] z output vector */ - inline void xpayz(const ColorSpinorField &x, double a, const ColorSpinorField &y, ColorSpinorField &z) { axpbyz(1.0, x, a, y, z); } + inline void xpayz(const ColorSpinorField &x, real_t a, const ColorSpinorField &y, ColorSpinorField &z) { axpbyz(1.0, x, a, y, z); } /** @brief Apply the operation y = a * x + y, x = z + b * x @@ -124,7 +124,7 @@ namespace quda { @param[in] z input vector @param[in] b scalar multiplier */ - void axpyZpbx(double a, ColorSpinorField &x, ColorSpinorField &y, const ColorSpinorField &z, double b); + void axpyZpbx(real_t a, ColorSpinorField &x, ColorSpinorField &y, const ColorSpinorField &z, real_t b); /** @brief Apply the operation y = a * x + y, x = b * z + c * x @@ -135,7 +135,7 @@ namespace quda { @param[in] z input vector @param[in] c scalar multiplier */ - void axpyBzpcx(double a, ColorSpinorField& x, ColorSpinorField& y, double b, const ColorSpinorField& z, double c); + void axpyBzpcx(real_t a, ColorSpinorField& x, ColorSpinorField& y, real_t b, const ColorSpinorField& z, real_t c); /** @brief Apply the operation w = a * x + b * y + c * z @@ -147,8 +147,8 @@ namespace quda { @param[in] z input vector @param[out] w output vector */ - 
void axpbypczw(double a, const ColorSpinorField &x, double b, const ColorSpinorField &y, - double c, const ColorSpinorField &z, ColorSpinorField &w); + void axpbypczw(real_t a, const ColorSpinorField &x, real_t b, const ColorSpinorField &y, + real_t c, const ColorSpinorField &z, ColorSpinorField &w); /** @brief Apply the operation y = a * x + b * y @@ -158,7 +158,7 @@ namespace quda { @param[in] y input vector @param[out] z output vector */ - void caxpby(const Complex &a, const ColorSpinorField &x, const Complex &b, ColorSpinorField &y); + void caxpby(const complex_t &a, const ColorSpinorField &x, const complex_t &b, ColorSpinorField &y); /** @brief Apply the operation y += a * x @@ -166,7 +166,7 @@ namespace quda { @param[in] x input vector @param[in] y update vector */ - void caxpy(const Complex &a, const ColorSpinorField &x, ColorSpinorField &y); + void caxpy(const complex_t &a, const ColorSpinorField &x, ColorSpinorField &y); /** @brief Apply the operation z = x + a * y + b * z @@ -176,8 +176,8 @@ namespace quda { @param[in] b scalar multiplier @param[in,out] z update vector */ - void cxpaypbz(const ColorSpinorField &x, const Complex &a, const ColorSpinorField &y, - const Complex &b, ColorSpinorField &z); + void cxpaypbz(const ColorSpinorField &x, const complex_t &a, const ColorSpinorField &y, + const complex_t &b, ColorSpinorField &z); /** @brief Apply the operation z += a * x + b * y, y-= b * w @@ -188,7 +188,7 @@ namespace quda { @param[in,out] z update vector @param[in] w input vector */ - void caxpbypzYmbw(const Complex &a, const ColorSpinorField &x, const Complex &b, ColorSpinorField &y, + void caxpbypzYmbw(const complex_t &a, const ColorSpinorField &x, const complex_t &b, ColorSpinorField &y, ColorSpinorField &z, const ColorSpinorField &w); /** @@ -199,8 +199,8 @@ namespace quda { @param[in] b scalar multiplier @param[in] z input vector */ - void caxpyBzpx(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, - const Complex &b, const 
ColorSpinorField &z); + void caxpyBzpx(const complex_t &a, ColorSpinorField &x, ColorSpinorField &y, + const complex_t &b, const ColorSpinorField &z); /** @brief Apply the operation y += a * x, z += b * x @@ -210,8 +210,8 @@ namespace quda { @param[in] b scalar multiplier @param[in,out] z update vector */ - void caxpyBxpz(const Complex &a, const ColorSpinorField &x, ColorSpinorField &y, - const Complex &b, ColorSpinorField &z); + void caxpyBxpz(const complex_t &a, const ColorSpinorField &x, ColorSpinorField &y, + const complex_t &b, ColorSpinorField &z); /** @brief Apply the operation y += a * b * x, x = a * x @@ -220,7 +220,7 @@ namespace quda { @param[in,out] x update vector @param[in,out] y update vector */ - void cabxpyAx(double a, const Complex &b, ColorSpinorField &x, ColorSpinorField &y); + void cabxpyAx(real_t a, const complex_t &b, ColorSpinorField &x, ColorSpinorField &y); /** @brief Apply the operation y += a * x, x -= a * z @@ -229,7 +229,7 @@ namespace quda { @param[in,out] y update vector @param[in] z input vector */ - void caxpyXmaz(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, const ColorSpinorField &z); + void caxpyXmaz(const complex_t &a, ColorSpinorField &x, ColorSpinorField &y, const ColorSpinorField &z); /** @brief Apply the operation y += a * x, x = x - a * z. 
Special @@ -240,7 +240,7 @@ namespace quda { @param[in,out] y update vector @param[in] z input vector */ - void caxpyXmazMR(const double &a, ColorSpinorField &x, ColorSpinorField &y, const ColorSpinorField &z); + void caxpyXmazMR(const real_t &a, ColorSpinorField &x, ColorSpinorField &y, const ColorSpinorField &z); /** @brief Apply the operation y += a * w, z -= a * x, w = z + b * w @@ -251,7 +251,7 @@ namespace quda { @param[in,out] z update vector @param[in,out] w update vector */ - void tripleCGUpdate(double a, double b, const ColorSpinorField &x, + void tripleCGUpdate(real_t a, real_t b, const ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w); // reduction kernels - defined in reduce_quda.cu @@ -260,7 +260,7 @@ namespace quda { @brief Compute the maximum absolute real element of a field @param[in] a The field we are reducing */ - double max(const ColorSpinorField &x); + real_t max(const ColorSpinorField &x); /** @brief Compute the maximum real-valued deviation between two @@ -268,19 +268,19 @@ namespace quda { @param[in] x The field we want to compare @param[in] y The reference field to which we are comparing against */ - array max_deviation(const ColorSpinorField &x, const ColorSpinorField &y); + array max_deviation(const ColorSpinorField &x, const ColorSpinorField &y); /** @brief Compute the L1 norm of a field @param[in] x The field we are reducing */ - double norm1(const ColorSpinorField &x); + real_t norm1(const ColorSpinorField &x); /** @brief Compute the L2 norm (||x||^2) of a field @param[in] x The field we are reducing */ - double norm2(const ColorSpinorField &x); + real_t norm2(const ColorSpinorField &x); /** @brief Compute y += a * x and then (x, y) @@ -288,14 +288,14 @@ namespace quda { @param[in] x input vector @param[in,out] y update vector */ - double axpyReDot(double a, const ColorSpinorField &x, ColorSpinorField &y); + real_t axpyReDot(real_t a, const ColorSpinorField &x, ColorSpinorField &y); /** @brief Compute 
the real-valued inner product (x, y) @param[in] x input vector @param[in] y input vector */ - double reDotProduct(const ColorSpinorField &x, const ColorSpinorField &y); + real_t reDotProduct(const ColorSpinorField &x, const ColorSpinorField &y); /** @brief Compute z = a * x + b * y and then ||z||^2 @@ -305,7 +305,7 @@ namespace quda { @param[in] y input vector @param[in,out] z update vector */ - double axpbyzNorm(double a, const ColorSpinorField &x, double b, const ColorSpinorField &y, ColorSpinorField &z); + real_t axpbyzNorm(real_t a, const ColorSpinorField &x, real_t b, const ColorSpinorField &y, ColorSpinorField &z); /** @brief Compute y += a * x and then ||y||^2 @@ -313,38 +313,38 @@ namespace quda { @param[in] x input vector @param[in,out] y update vector */ - inline double axpyNorm(double a, const ColorSpinorField &x, ColorSpinorField &y) { return axpbyzNorm(a, x, 1.0, y, y); } + inline real_t axpyNorm(real_t a, const ColorSpinorField &x, ColorSpinorField &y) { return axpbyzNorm(a, x, 1.0, y, y); } /** @brief Compute y -= x and then ||y||^2 @param[in] x input vector @param[in,out] y update vector */ - inline double xmyNorm(const ColorSpinorField &x, ColorSpinorField &y) { return axpbyzNorm(1.0, x, -1.0, y, y); } + inline real_t xmyNorm(const ColorSpinorField &x, ColorSpinorField &y) { return axpbyzNorm(1.0, x, -1.0, y, y); } /** @brief Compute the complex-valued inner product (x, y) @param[in] x input vector @param[in] y input vector */ - Complex cDotProduct(const ColorSpinorField &x, const ColorSpinorField &y); + complex_t cDotProduct(const ColorSpinorField &x, const ColorSpinorField &y); /** @brief Return complex-valued inner product (x,y), ||x||^2 and ||y||^2 @param[in] x input vector @param[in] y input vector */ - double4 cDotProductNormAB(const ColorSpinorField &x, const ColorSpinorField &y); + array cDotProductNormAB(const ColorSpinorField &x, const ColorSpinorField &y); /** @brief Return complex-valued inner product (x,y) and ||x||^2 @param[in] x 
input vector @param[in] y input vector */ - inline double3 cDotProductNormA(const ColorSpinorField &x, const ColorSpinorField &y) + inline array cDotProductNormA(const ColorSpinorField &x, const ColorSpinorField &y) { auto a4 = cDotProductNormAB(x, y); - return make_double3(a4.x, a4.y, a4.z); + return {a4[0], a4[1], a4[2]}; } /** @@ -352,10 +352,10 @@ namespace quda { @param[in] x input vector @param[in] y input vector */ - inline double3 cDotProductNormB(const ColorSpinorField &x, const ColorSpinorField &y) + inline array cDotProductNormB(const ColorSpinorField &x, const ColorSpinorField &y) { auto a4 = cDotProductNormAB(x, y); - return make_double3(a4.x, a4.y, a4.w); + return {a4[0], a4[1], a4[2]}; } /** @@ -369,7 +369,7 @@ namespace quda { @param[in] w input vector @param[in] v input vector */ - double3 caxpbypzYmbwcDotProductUYNormY(const Complex &a, const ColorSpinorField &x, const Complex &b, + array caxpbypzYmbwcDotProductUYNormY(const complex_t &a, const ColorSpinorField &x, const complex_t &b, ColorSpinorField &y, ColorSpinorField &z, const ColorSpinorField &w, const ColorSpinorField &u); @@ -379,7 +379,7 @@ namespace quda { @param[in] x input vector @param[in,out] y update vector */ - double caxpyNorm(const Complex &a, const ColorSpinorField &x, ColorSpinorField &y); + real_t caxpyNorm(const complex_t &a, const ColorSpinorField &x, ColorSpinorField &y); /** @brief Compute z = a * b * x + y, x = a * x, and then ||x||^2 @@ -389,7 +389,7 @@ namespace quda { @param[in] y input vector @param[in,out] z update vector */ - double cabxpyzAxNorm(double a, const Complex &b, ColorSpinorField &x, const ColorSpinorField &y, + real_t cabxpyzAxNorm(real_t a, const complex_t &b, ColorSpinorField &x, const ColorSpinorField &y, ColorSpinorField &z); /** @@ -399,7 +399,7 @@ namespace quda { @param[in,out] y update vector @param[in] z input vector */ - Complex caxpyDotzy(const Complex &a, const ColorSpinorField &x, ColorSpinorField &y, + complex_t caxpyDotzy(const complex_t 
&a, const ColorSpinorField &x, ColorSpinorField &y, const ColorSpinorField &z); /** @@ -409,7 +409,7 @@ namespace quda { @param[in] x input vector @param[in,out] y update vector */ - double2 axpyCGNorm(double a, const ColorSpinorField &x, ColorSpinorField &y); + array axpyCGNorm(real_t a, const ColorSpinorField &x, ColorSpinorField &y); /** @brief Computes ||x||^2, ||r||^2 and the MILC/FNAL heavy quark @@ -417,7 +417,7 @@ namespace quda { @param[in] x input vector @param[in] r input vector (residual vector) */ - double3 HeavyQuarkResidualNorm(const ColorSpinorField &x, const ColorSpinorField &r); + array HeavyQuarkResidualNorm(const ColorSpinorField &x, const ColorSpinorField &r); /** @brief Computes y += x, ||y||^2, ||r||^2 and the MILC/FNAL heavy quark @@ -426,7 +426,7 @@ namespace quda { @param[in,out] y update vector @param[in] r input vector (residual vector) */ - double3 xpyHeavyQuarkResidualNorm(const ColorSpinorField &x, ColorSpinorField &y, const ColorSpinorField &r); + array xpyHeavyQuarkResidualNorm(const ColorSpinorField &x, ColorSpinorField &y, const ColorSpinorField &r); /** @brief Computes ||x||^2, ||y||^2, and real-valued inner product (y, z) @@ -434,7 +434,7 @@ namespace quda { @param[in] y input vector @param[in] z input vector */ - double3 tripleCGReduction(const ColorSpinorField &x, const ColorSpinorField &y, const ColorSpinorField &z); + array tripleCGReduction(const ColorSpinorField &x, const ColorSpinorField &y, const ColorSpinorField &z); /** @brief Computes ||x||^2, ||y||^2, the real-valued inner product (y, z), and ||z||^2 @@ -442,7 +442,7 @@ namespace quda { @param[in] y input vector @param[in] z input vector */ - double4 quadrupleCGReduction(const ColorSpinorField &x, const ColorSpinorField &y, const ColorSpinorField &z); + array quadrupleCGReduction(const ColorSpinorField &x, const ColorSpinorField &y, const ColorSpinorField &z); /** @brief Computes z = x, w = y, x += a * y, y -= a * v and ||y||^2 @@ -453,7 +453,7 @@ namespace quda { 
@param[in,out] w update vector @param[in] v input vector */ - double quadrupleCG3InitNorm(double a, ColorSpinorField &x, ColorSpinorField &y, + real_t quadrupleCG3InitNorm(real_t a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, const ColorSpinorField &v); /** @@ -468,7 +468,7 @@ namespace quda { @param[in,out] w update vector @param[in] v input vector */ - double quadrupleCG3UpdateNorm(double a, double b, ColorSpinorField &x, ColorSpinorField &y, + real_t quadrupleCG3UpdateNorm(real_t a, real_t b, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z, ColorSpinorField &w, const ColorSpinorField &v); // multi-blas kernels - defined in multi_blas.cu @@ -477,7 +477,7 @@ namespace quda { @brief Compute the block "axpy" with over the set of ColorSpinorFields. E.g., it computes y = x * a + y The dimensions of a can be rectangular, e.g., the width of x and y need not be same. - @tparam T The type of a coefficients (double or Complex) + @tparam T The type of a coefficients (real_t or complex_t) @param a[in] Matrix of real coefficients @param x[in] vector of input ColorSpinorFields @param y[in,out] vector of input/output ColorSpinorFields @@ -493,7 +493,7 @@ namespace quda { Where 'a' must be a square, upper triangular matrix. - @tparam T The type of a coefficients (double or Complex) + @tparam T The type of a coefficients (real_t or complex_t) @param a[in] Matrix of coefficients @param x[in] vector of input ColorSpinorFields @param y[in,out] vector of input/output ColorSpinorFields @@ -509,7 +509,7 @@ namespace quda { Where 'a' must be a square, lower triangular matrix. 
- @tparam T The type of a coefficients (double or Complex) + @tparam T The type of a coefficients (real_t or complex_t) @param a[in] Matrix of coefficients @param x[in] vector of input ColorSpinorFields @param y[in,out] vector of input/output ColorSpinorFields @@ -530,7 +530,7 @@ namespace quda { @param x[in] vector of input ColorSpinorFields @param y[in,out] vector of input/output ColorSpinorFields */ - void caxpy(const std::vector &a, cvector_ref &x, cvector_ref &y); + void caxpy(const std::vector &a, cvector_ref &x, cvector_ref &y); /** @brief Compute the block "caxpy_U" with over the set of @@ -544,7 +544,7 @@ namespace quda { @param x[in] vector of input ColorSpinorFields @param y[in,out] vector of input/output ColorSpinorFields */ - void caxpy_U(const std::vector &a, cvector_ref &x, cvector_ref &y); + void caxpy_U(const std::vector &a, cvector_ref &x, cvector_ref &y); /** @brief Compute the block "caxpy_L" with over the set of @@ -558,7 +558,7 @@ namespace quda { @param x[in] vector of input ColorSpinorFields @param y[in,out] vector of input/output ColorSpinorFields */ - void caxpy_L(const std::vector &a, cvector_ref &x, cvector_ref &y); + void caxpy_L(const std::vector &a, cvector_ref &x, cvector_ref &y); /** @brief Compute the block "axpyz" with over the set of @@ -575,7 +575,7 @@ namespace quda { @param y[in] vector of input ColorSpinorFields @param z[out] vector of output ColorSpinorFields */ - void axpyz(const std::vector &a, cvector_ref &x, + void axpyz(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z); /** @@ -591,7 +591,7 @@ namespace quda { @param y[in] vector of input ColorSpinorFields @param z[out] vector of output ColorSpinorFields */ - void axpyz_U(const std::vector &a, cvector_ref &x, + void axpyz_U(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z); /** @@ -607,7 +607,7 @@ namespace quda { @param y[in] vector of input ColorSpinorFields @param z[out] vector of output ColorSpinorFields */ - void 
axpyz_L(const std::vector &a, cvector_ref &x, + void axpyz_L(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z); /** @@ -625,7 +625,7 @@ namespace quda { @param y[in] vector of input ColorSpinorFields @param z[out] vector of output ColorSpinorFields */ - void caxpyz(const std::vector &a, cvector_ref &x, + void caxpyz(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z); /** @@ -641,7 +641,7 @@ namespace quda { @param y[in] vector of input ColorSpinorFields @param z[out] vector of output ColorSpinorFields */ - void caxpyz_U(const std::vector &a, cvector_ref &x, + void caxpyz_U(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z); /** @@ -657,7 +657,7 @@ namespace quda { @param y[in] vector of input ColorSpinorFields @param z[out] vector of output ColorSpinorFields */ - void caxpyz_L(const std::vector &a, cvector_ref &x, + void caxpyz_L(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z); /** @@ -678,9 +678,9 @@ namespace quda { @param z[in] input ColorSpinorField @param c[in] Array of coefficients */ - void axpyBzpcx(const std::vector &a, cvector_ref &x, - cvector_ref &y, const std::vector &b, ColorSpinorField &z, - const std::vector &c); + void axpyBzpcx(const std::vector &a, cvector_ref &x, + cvector_ref &y, const std::vector &b, ColorSpinorField &z, + const std::vector &c); /** @brief Compute the vectorized "caxpyBxpz" over the set of @@ -699,8 +699,8 @@ namespace quda { @param b[in] Array of coefficients @param z[in,out] input ColorSpinorField */ - void caxpyBxpz(const std::vector &a, cvector_ref &x, ColorSpinorField &y, - const std::vector &b, ColorSpinorField &z); + void caxpyBxpz(const std::vector &a, cvector_ref &x, ColorSpinorField &y, + const std::vector &b, ColorSpinorField &z); // multi-reduce kernels - defined in multi_reduce.cu @@ -711,7 +711,7 @@ namespace quda { @param a[in] set of input ColorSpinorFields @param b[in] set of input ColorSpinorFields */ - void 
reDotProduct(std::vector &result, cvector_ref &a, + void reDotProduct(std::vector &result, cvector_ref &a, cvector_ref &b); /** @@ -721,7 +721,7 @@ namespace quda { @param a[in] set of input ColorSpinorFields @param b[in] set of input ColorSpinorFields */ - void cDotProduct(std::vector &result, cvector_ref &a, + void cDotProduct(std::vector &result, cvector_ref &a, cvector_ref &b); /** @@ -734,7 +734,7 @@ namespace quda { @param a[in] set of input ColorSpinorFields @param b[in] set of input ColorSpinorFields */ - void hDotProduct(std::vector &result, cvector_ref &a, + void hDotProduct(std::vector &result, cvector_ref &a, cvector_ref &b); /** @@ -749,47 +749,47 @@ namespace quda { @param a[in] set of input ColorSpinorFields @param b[in] set of input ColorSpinorFields */ - void hDotProduct_Anorm(std::vector &result, cvector_ref &a, + void hDotProduct_Anorm(std::vector &result, cvector_ref &a, cvector_ref &b); // compatibility wrappers until we switch to // std::vector and // std::vector> more broadly - void axpy(const double *a, std::vector &x, std::vector &y); - void axpy(const double *a, ColorSpinorField &x, ColorSpinorField &y); - void axpy_U(const double *a, std::vector &x, std::vector &y); - void axpy_U(const double *a, ColorSpinorField &x, ColorSpinorField &y); - void axpy_L(const double *a, std::vector &x, std::vector &y); - void axpy_L(const double *a, ColorSpinorField &x, ColorSpinorField &y); - void caxpy(const Complex *a, std::vector &x, std::vector &y); - void caxpy(const Complex *a, ColorSpinorField &x, ColorSpinorField &y); - void caxpy_U(const Complex *a, std::vector &x, std::vector &y); - void caxpy_U(const Complex *a, ColorSpinorField &x, ColorSpinorField &y); - void caxpy_L(const Complex *a, std::vector &x, std::vector &y); - void caxpy_L(const Complex *a, ColorSpinorField &x, ColorSpinorField &y); - void axpyz(const double *a, std::vector &x, std::vector &y, + void axpy(const real_t *a, std::vector &x, std::vector &y); + void axpy(const real_t *a, 
ColorSpinorField &x, ColorSpinorField &y); + void axpy_U(const real_t *a, std::vector &x, std::vector &y); + void axpy_U(const real_t *a, ColorSpinorField &x, ColorSpinorField &y); + void axpy_L(const real_t *a, std::vector &x, std::vector &y); + void axpy_L(const real_t *a, ColorSpinorField &x, ColorSpinorField &y); + void caxpy(const complex_t *a, std::vector &x, std::vector &y); + void caxpy(const complex_t *a, ColorSpinorField &x, ColorSpinorField &y); + void caxpy_U(const complex_t *a, std::vector &x, std::vector &y); + void caxpy_U(const complex_t *a, ColorSpinorField &x, ColorSpinorField &y); + void caxpy_L(const complex_t *a, std::vector &x, std::vector &y); + void caxpy_L(const complex_t *a, ColorSpinorField &x, ColorSpinorField &y); + void axpyz(const real_t *a, std::vector &x, std::vector &y, std::vector &z); - void axpyz(const double *a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z); - void axpyz_U(const double *a, std::vector &x, std::vector &y, + void axpyz(const real_t *a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z); + void axpyz_U(const real_t *a, std::vector &x, std::vector &y, std::vector &z); - void axpyz_L(const double *a, std::vector &x, std::vector &y, + void axpyz_L(const real_t *a, std::vector &x, std::vector &y, std::vector &z); - void caxpyz(const Complex *a, std::vector &x, std::vector &y, + void caxpyz(const complex_t *a, std::vector &x, std::vector &y, std::vector &z); - void caxpyz(const Complex *a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z); - void caxpyz_U(const Complex *a, std::vector &x, std::vector &y, + void caxpyz(const complex_t *a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z); + void caxpyz_U(const complex_t *a, std::vector &x, std::vector &y, std::vector &z); - void caxpyz_L(const Complex *a, std::vector &x, std::vector &y, + void caxpyz_L(const complex_t *a, std::vector &x, std::vector &y, std::vector &z); - void axpyBzpcx(const double *a, std::vector 
&x, std::vector &y, - const double *b, ColorSpinorField &z, const double *c); - void caxpyBxpz(const Complex *a_, std::vector &x_, ColorSpinorField &y_, const Complex *b_, + void axpyBzpcx(const real_t *a, std::vector &x, std::vector &y, + const real_t *b, ColorSpinorField &z, const real_t *c); + void caxpyBxpz(const complex_t *a_, std::vector &x_, ColorSpinorField &y_, const complex_t *b_, ColorSpinorField &z_); - void reDotProduct(double *result, std::vector &a, std::vector &b); - void cDotProduct(Complex *result, std::vector &a, std::vector &b); - void hDotProduct(Complex *result, std::vector &a, std::vector &b); + void reDotProduct(real_t *result, std::vector &a, std::vector &b); + void cDotProduct(complex_t *result, std::vector &a, std::vector &b); + void hDotProduct(complex_t *result, std::vector &a, std::vector &b); } // namespace blas diff --git a/include/deflation.h b/include/deflation.h index b9d9132577..2ec1c86b88 100644 --- a/include/deflation.h +++ b/include/deflation.h @@ -28,7 +28,7 @@ namespace quda { DiracMatrix &matDeflation; /** Host projection matrix (e.g. 
eigCG VH A V) */ - Complex *matProj; + complex_t *matProj; /** projection matrix leading dimension */ int ld; @@ -56,7 +56,7 @@ namespace quda { tot_dim = param.np; ld = ((tot_dim+15) / 16) * tot_dim; //allocate deflation resources: - matProj = static_cast(pool_pinned_malloc(ld * tot_dim * sizeof(Complex))); + matProj = static_cast(pool_pinned_malloc(ld * tot_dim * sizeof(complex_t))); invRitzVals = new double[tot_dim]; //Check that RV is a composite field: diff --git a/include/dirac_quda.h b/include/dirac_quda.h index e10437651f..9f330c1171 100644 --- a/include/dirac_quda.h +++ b/include/dirac_quda.h @@ -39,8 +39,8 @@ namespace quda { double mass; double m5; // used by domain wall only int Ls; // used by domain wall and twisted mass - Complex b_5[QUDA_MAX_DWF_LS]; // used by mobius domain wall only - Complex c_5[QUDA_MAX_DWF_LS]; // used by mobius domain wall only + complex_t b_5[QUDA_MAX_DWF_LS]; // used by mobius domain wall only + complex_t c_5[QUDA_MAX_DWF_LS]; // used by mobius domain wall only // The EOFA parameters. See the description in InvertParam double eofa_shift; @@ -125,7 +125,7 @@ namespace quda { for (int i=0; i(b_5[i].real()), static_cast(b_5[i].imag()), i, static_cast(c_5[i].real()), static_cast(c_5[i].imag())); printfQuda("use_mma = %d\n", use_mma); printfQuda("allow_truncation = %d\n", allow_truncation); printfQuda("use_mobius_fused_kernel = %s\n", use_mobius_fused_kernel ? 
"true" : "false"); @@ -924,8 +924,8 @@ namespace quda { protected: //Mobius coefficients - Complex b_5[QUDA_MAX_DWF_LS]; - Complex c_5[QUDA_MAX_DWF_LS]; + complex_t b_5[QUDA_MAX_DWF_LS]; + complex_t c_5[QUDA_MAX_DWF_LS]; /** Whether we are using classical Mobius with constant real-valued diff --git a/include/dslash_quda.h b/include/dslash_quda.h index ea12741be0..7b24e442ca 100644 --- a/include/dslash_quda.h +++ b/include/dslash_quda.h @@ -577,41 +577,41 @@ namespace quda */ void ApplyDomainWall4D(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, double m_5, - const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, int parity, bool dagger, + const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override, TimeProfile &profile); void ApplyDomainWall4DM5inv(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, - double m_5, const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, + double m_5, const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, ColorSpinorField &y, int parity, bool dagger, const int *comm_override, double m_f, TimeProfile &profile); void ApplyDomainWall4DM5pre(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, - double m_5, const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, + double m_5, const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, ColorSpinorField &y, int parity, bool dagger, const int *comm_override, double m_f, TimeProfile &profile); void ApplyDomainWall4DM5invM5pre(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, - double m_5, const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, + double m_5, const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, ColorSpinorField &y, int parity, bool dagger, const int *comm_override, double m_f, TimeProfile &profile); void 
ApplyDomainWall4DM5preM5inv(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, - double m_5, const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, + double m_5, const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, ColorSpinorField &y, int parity, bool dagger, const int *comm_override, double m_f, TimeProfile &profile); void ApplyDomainWall4DM5invM5inv(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, - double m_5, const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, + double m_5, const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, ColorSpinorField &y, int parity, bool dagger, const int *comm_override, double m_f, TimeProfile &profile); void ApplyDomainWall4DM5mob(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, - double m_5, const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, + double m_5, const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, ColorSpinorField &y, int parity, bool dagger, const int *comm_override, double m_f, TimeProfile &profile); void ApplyDomainWall4DM5preM5mob(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, - double m_5, const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, + double m_5, const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, ColorSpinorField &y, int parity, bool dagger, const int *comm_override, double m_f, TimeProfile &profile); /** @@ -630,13 +630,13 @@ namespace quda @param[in] type Type of dslash we are applying */ void ApplyDslash5(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, double m_f, - double m_5, const Complex *b_5, const Complex *c_5, double a, bool dagger, Dslash5Type type); + double m_5, const complex_t *b_5, const complex_t *c_5, double a, bool dagger, Dslash5Type type); // The EOFA stuff namespace mobius_eofa { void 
apply_dslash5(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, double m_f, - double m_5, const Complex *b_5, const Complex *c_5, double a, int eofa_pm, double inv, + double m_5, const complex_t *b_5, const complex_t *c_5, double a, int eofa_pm, double inv, double kappa, const double *eofa_u, const double *eofa_x, const double *eofa_y, double sherman_morrison, bool dagger, Dslash5Type type); } diff --git a/include/eigensolve_quda.h b/include/eigensolve_quda.h index 06472f069b..f615aa8b11 100644 --- a/include/eigensolve_quda.h +++ b/include/eigensolve_quda.h @@ -28,7 +28,7 @@ namespace quda int n_kr; /** Size of Krylov space after extension */ int n_conv; /** Number of converged eigenvalues requested */ int n_ev_deflate; /** Number of converged eigenvalues to use in deflation */ - double tol; /** Tolerance on eigenvalues */ + real_t tol; /** Tolerance on eigenvalues */ bool reverse; /** True if using polynomial acceleration */ char spectrum[3]; /** Part of the spectrum to be computed */ bool compute_svd; /** Compute the SVD if requested **/ @@ -51,7 +51,7 @@ namespace quda int num_locked; int num_keep; - std::vector residua; + std::vector residua; // Device side vector workspace std::vector r; @@ -82,7 +82,7 @@ namespace quda @param kSpace The converged eigenvectors @param evals The converged eigenvalues */ - virtual void operator()(std::vector &kSpace, std::vector &evals) = 0; + virtual void operator()(std::vector &kSpace, std::vector &evals) = 0; /** @brief Creates the eigensolver using the parameters given and the matrix. 
@@ -110,7 +110,7 @@ namespace quda @param[in] kSpace The Krylov space vectors @param[in] evals The eigenvalue array */ - void prepareKrylovSpace(std::vector &kSpace, std::vector &evals); + void prepareKrylovSpace(std::vector &kSpace, std::vector &evals); /** @brief Set the epsilon parameter @@ -135,7 +135,7 @@ namespace quda @param[in] kSpace The Krylov space vectors @param[in] evals The eigenvalue array */ - void cleanUpEigensolver(std::vector &kSpace, std::vector &evals); + void cleanUpEigensolver(std::vector &kSpace, std::vector &evals); /** @brief Promoted the specified matVec operation: @@ -151,7 +151,7 @@ namespace quda @param[in] out Output spinor @param[in] in Input spinor */ - double estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in); + real_t estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in); /** @brief Orthogonalise input vectors r against @@ -240,7 +240,7 @@ namespace quda @param[in] accumulate Whether to preserve the sol vector content prior to accumulating */ void deflate(cvector_ref &sol, cvector_ref &src, - cvector_ref &evecs, const std::vector &evals, + cvector_ref &evecs, const std::vector &evals, bool accumulate = false) const; /** @@ -253,7 +253,7 @@ namespace quda @param[in] accumulate Whether to preserve the sol vector content prior to accumulating */ void deflateSVD(cvector_ref &sol, cvector_ref &vec, - cvector_ref &evecs, const std::vector &evals, + cvector_ref &evecs, const std::vector &evals, bool accumulate = false) const; /** @@ -261,7 +261,7 @@ namespace quda @param[in] evecs Computed eigenvectors of NormOp @param[in] evals Computed eigenvalues of NormOp */ - void computeSVD(std::vector &evecs, std::vector &evals); + void computeSVD(std::vector &evecs, std::vector &evals); /** @brief Compute eigenvalues and their residiua @@ -270,8 +270,7 @@ namespace quda @param[in] evals The eigenvalues @param[in] size The number of eigenvalues to compute */ - void computeEvals(std::vector &evecs, std::vector &evals, - int 
size); + void computeEvals(std::vector &evecs, std::vector &evals, int size); /** @brief Compute eigenvalues and their residiua. This variant compute the number of converged eigenvalues. @@ -279,7 +278,7 @@ namespace quda @param[in] evecs The eigenvectors @param[in] evals The eigenvalues */ - void computeEvals(std::vector &evecs, std::vector &evals) + void computeEvals(std::vector &evecs, std::vector &evals) { computeEvals(evecs, evals, n_conv); } @@ -290,7 +289,7 @@ namespace quda @param[in] eig_vecs The eigenvectors to save @param[in] file The filename to save */ - void loadFromFile(std::vector &eig_vecs, std::vector &evals); + void loadFromFile(std::vector &eig_vecs, std::vector &evals); /** @brief Sort array the first n elements of x according to spec_type, y comes along for the ride @@ -299,7 +298,7 @@ namespace quda @param[in] x The array to sort @param[in] y An array whose elements will be permuted in tandem with x */ - void sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y); + void sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y); /** @brief Sort array the first n elements of x according to spec_type, y comes along for the ride @@ -309,7 +308,7 @@ namespace quda @param[in] x The array to sort @param[in] y An array whose elements will be permuted in tandem with x */ - void sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y); + void sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y); /** @brief Sort array the first n elements of x according to spec_type, y comes along for the ride @@ -319,7 +318,7 @@ namespace quda @param[in] x The array to sort @param[in] y An array whose elements will be permuted in tandem with x */ - void sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y); + void sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y); /** @brief Sort array the first n elements of x 
according to spec_type, y comes along for the ride @@ -330,7 +329,7 @@ namespace quda @param[in] x The array to sort @param[in] y An array whose elements will be permuted in tandem with x */ - void sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y); + void sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y); /** @brief Sort array the first n elements of x according to spec_type, y comes along for the ride @@ -341,7 +340,7 @@ namespace quda @param[in] x The array to sort @param[in] y An array whose elements will be permuted in tandem with x */ - void sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y); + void sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y); }; /** @@ -365,18 +364,18 @@ namespace quda virtual bool hermitian() { return true; } /** TRLM is only for Hermitian systems */ // Variable size matrix - std::vector ritz_mat; + std::vector ritz_mat; // Tridiagonal/Arrow matrix, fixed size. - std::vector alpha; - std::vector beta; + std::vector alpha; + std::vector beta; /** @brief Compute eigenpairs @param[in] kSpace Krylov vector space @param[in] evals Computed eigenvalues */ - void operator()(std::vector &kSpace, std::vector &evals); + void operator()(std::vector &kSpace, std::vector &evals); /** @brief Lanczos step: extends the Krylov space. @@ -421,14 +420,14 @@ namespace quda virtual bool hermitian() { return true; } /** (BLOCK)TRLM is only for Hermitian systems */ // Variable size matrix - std::vector block_ritz_mat; + std::vector block_ritz_mat; /** Block Tridiagonal/Arrow matrix, fixed size. */ - std::vector block_alpha; - std::vector block_beta; + std::vector block_alpha; + std::vector block_beta; /** Temp storage used in blockLanczosStep, fixed size. 
*/ - std::vector jth_block; + std::vector jth_block; /** Size of blocks of data in alpha/beta */ int block_data_length; @@ -438,7 +437,7 @@ namespace quda @param[in] kSpace Krylov vector space @param[in] evals Computed eigenvalues */ - void operator()(std::vector &kSpace, std::vector &evals); + void operator()(std::vector &kSpace, std::vector &evals); /** @brief block lanczos step: extends the Krylov space in block step @@ -474,9 +473,9 @@ namespace quda { public: - std::vector> upperHess; - std::vector> Qmat; - std::vector> Rmat; + std::vector> upperHess; + std::vector> Qmat; + std::vector> Rmat; /** @brief Constructor for Thick Restarted Eigensolver class @@ -496,7 +495,7 @@ namespace quda @param[in] kSpace Krylov vector space @param[in] evals Computed eigenvalues */ - void operator()(std::vector &kSpace, std::vector &evals); + void operator()(std::vector &kSpace, std::vector &evals); /** @brief Arnoldi step: extends the Krylov space by one vector @@ -505,14 +504,14 @@ namespace quda @param[in] beta Norm of residual vector @param[in] j Index of vector being computed */ - void arnoldiStep(std::vector &v, std::vector &r, double &beta, int j); + void arnoldiStep(std::vector &v, std::vector &r, real_t &beta, int j); /** @brief Get the eigendecomposition from the upper Hessenberg matrix via QR - @param[in] evals Complex eigenvalues + @param[in] evals complex_t eigenvalues @param[in] beta Norm of residual (used to compute errors on eigenvalues) */ - void eigensolveFromUpperHess(std::vector &evals, const double beta); + void eigensolveFromUpperHess(std::vector &evals, const real_t beta); /** @brief Rotate the Krylov space @@ -526,14 +525,14 @@ namespace quda @param[in] evals The shifts to apply @param[in] num_shifts The number of shifts to apply */ - void qrShifts(const std::vector evals, const int num_shifts); + void qrShifts(const std::vector evals, const int num_shifts); /** @brief Apply One step of the the QR algorithm @param[in] Q The Q matrix @param[in] R The R 
matrix */ - void qrIteration(std::vector> &Q, std::vector> &R); + void qrIteration(std::vector> &Q, std::vector> &R); /** @brief Reorder the Krylov space and eigenvalues @@ -542,7 +541,7 @@ namespace quda @param[in] spec_type The spectrum type (Largest/Smallest)(Modulus/Imaginary/Real) that determines the sorting condition */ - void reorder(std::vector &kSpace, std::vector &evals, + void reorder(std::vector &kSpace, std::vector &evals, const QudaEigSpectrumType spec_type); }; @@ -561,7 +560,7 @@ namespace quda @param[in] eig_param Parameter structure for all QUDA eigensolvers @param[in,out] profile TimeProfile instance used for profiling */ - void arpack_solve(std::vector &h_evecs, std::vector &h_evals, const DiracMatrix &mat, + void arpack_solve(std::vector &h_evecs, std::vector &h_evals, const DiracMatrix &mat, QudaEigParam *eig_param, TimeProfile &profile); } // namespace quda diff --git a/include/gauge_path_quda.h b/include/gauge_path_quda.h index db398c11a5..7b53af9acd 100644 --- a/include/gauge_path_quda.h +++ b/include/gauge_path_quda.h @@ -14,8 +14,8 @@ namespace quda @param[in] num_paths Numer of paths @param[in] max_length Maximum length of each path */ - void gaugeForce(GaugeField &mom, const GaugeField &u, double coeff, std::vector &input_path, - std::vector &length, std::vector &path_coeff, int num_paths, int max_length); + void gaugeForce(GaugeField &mom, const GaugeField &u, real_t coeff, std::vector &input_path, + std::vector &length, std::vector &path_coeff, int num_paths, int max_length); /** @brief Compute the product of gauge-links along the given path @@ -28,8 +28,8 @@ namespace quda @param[in] num_paths Numer of paths @param[in] max_length Maximum length of each path */ - void gaugePath(GaugeField &out, const GaugeField &u, double coeff, std::vector &input_path, - std::vector &length, std::vector &path_coeff, int num_paths, int max_length); + void gaugePath(GaugeField &out, const GaugeField &u, real_t coeff, std::vector &input_path, + 
std::vector &length, std::vector &path_coeff, int num_paths, int max_length); /** @brief Compute the trace of an arbitrary set of gauge loops @@ -42,8 +42,8 @@ namespace quda @param[in] num_paths Numer of paths @param[in] path_max_length Maximum length of each path */ - void gaugeLoopTrace(const GaugeField &u, std::vector &loop_traces, double factor, - std::vector &input_path, std::vector &length, std::vector &path_coeff_h, + void gaugeLoopTrace(const GaugeField &u, std::vector &loop_traces, real_t factor, + std::vector &input_path, std::vector &length, std::vector &path_coeff_h, int num_paths, int path_max_length); } // namespace quda diff --git a/include/invert_quda.h b/include/invert_quda.h index 8043fb7f0d..c455f35ea1 100644 --- a/include/invert_quda.h +++ b/include/invert_quda.h @@ -515,7 +515,7 @@ namespace quda { bool deflate_compute; /** If true, instruct the solver to create a deflation space. */ bool recompute_evals; /** If true, instruct the solver to recompute evals from an existing deflation space. */ std::vector evecs; /** Holds the eigenvectors. */ - std::vector evals; /** Holds the eigenvalues. */ + std::vector evals; /** Holds the eigenvalues. */ bool mixed() { return param.precision != param.precision_sloppy; } @@ -1019,10 +1019,10 @@ namespace quda { int pipeline; // pipelining factor for legacyGramSchmidt // Various coefficients and params needed on each iteration. - Complex rho0, rho1, alpha, omega, beta; // Various coefficients for the BiCG part of BiCGstab-L. - std::vector gamma, gamma_prime, gamma_prime_prime; // Parameters for MR part of BiCGstab-L. (L+1) length. - std::vector tau; // Parameters for MR part of BiCGstab-L. Tech. modified Gram-Schmidt coeffs. (L+1)x(L+1) length. - std::vector sigma; // Parameters for MR part of BiCGstab-L. Tech. the normalization part of Gram-Scmidt. (L+1) length. + complex_t rho0, rho1, alpha, omega, beta; // Various coefficients for the BiCG part of BiCGstab-L. 
+ std::vector gamma, gamma_prime, gamma_prime_prime; // Parameters for MR part of BiCGstab-L. (L+1) length. + std::vector tau; // Parameters for MR part of BiCGstab-L. Tech. modified Gram-Schmidt coeffs. (L+1)x(L+1) length. + std::vector sigma; // Parameters for MR part of BiCGstab-L. Tech. the normalization part of Gram-Scmidt. (L+1) length. ColorSpinorField r_full; //! Full precision residual. ColorSpinorField y; //! Full precision temporary. @@ -1121,8 +1121,8 @@ namespace quda { */ int n_krylov; - std::vector alpha; - std::vector beta; + std::vector alpha; + std::vector beta; std::vector gamma; /** @@ -1136,12 +1136,12 @@ namespace quda { std::vector p; // GCR direction vectors std::vector Ap; // mat * direction vectors - void computeBeta(std::vector &beta, std::vector &Ap, int i, int N, int k); - void updateAp(std::vector &beta, std::vector &Ap, int begin, int size, int k); - void orthoDir(std::vector &beta, std::vector &Ap, int k, int pipeline); - void backSubs(const std::vector &alpha, const std::vector &beta, const std::vector &gamma, - std::vector &delta, int n); - void updateSolution(ColorSpinorField &x, const std::vector &alpha, const std::vector &beta, + void computeBeta(std::vector &beta, std::vector &Ap, int i, int N, int k); + void updateAp(std::vector &beta, std::vector &Ap, int begin, int size, int k); + void orthoDir(std::vector &beta, std::vector &Ap, int k, int pipeline); + void backSubs(const std::vector &alpha, const std::vector &beta, const std::vector &gamma, + std::vector &delta, int n); + void updateSolution(ColorSpinorField &x, const std::vector &alpha, const std::vector &beta, std::vector &gamma, int k, std::vector &p); /** @@ -1346,7 +1346,7 @@ namespace quda { bool lambda_init; // whether or not lambda_max has been initialized QudaCABasis basis; // CA basis - std::vector alpha; // Solution coefficient vectors + std::vector alpha; // Solution coefficient vectors ColorSpinorField r; @@ -1367,7 +1367,7 @@ namespace quda { @param[in] q 
Search direction vectors with the operator applied @param[in] b Source vector against which we are solving */ - void solve(std::vector &psi, std::vector &q, ColorSpinorField &b); + void solve(std::vector &psi, std::vector &q, ColorSpinorField &b); public: CAGCR(const DiracMatrix &mat, const DiracMatrix &matSloppy, const DiracMatrix &matPrecon, const DiracMatrix &matEig, @@ -1550,7 +1550,7 @@ namespace quda { @param[in] q Search direction vectors with the operator applied @param[in] hermitian Whether the linear system is Hermitian or not */ - void solve(std::vector &psi_, std::vector &p, std::vector &q, + void solve(std::vector &psi_, std::vector &p, std::vector &q, const ColorSpinorField &b, bool hermitian); public: @@ -1678,7 +1678,7 @@ namespace quda { struct deflation_space : public Object { bool svd; /** Whether this space is for an SVD deflaton */ std::vector evecs; /** Container for the eigenvectors */ - std::vector evals; /** The eigenvalues */ + std::vector evals; /** The eigenvalues */ }; /** diff --git a/include/kernels/dslash_domain_wall_4d.cuh b/include/kernels/dslash_domain_wall_4d.cuh index 09a09a9f3e..595a4f8026 100644 --- a/include/kernels/dslash_domain_wall_4d.cuh +++ b/include/kernels/dslash_domain_wall_4d.cuh @@ -11,16 +11,16 @@ namespace quda int Ls; /** fifth dimension length */ complex a_5[QUDA_MAX_DWF_LS]; /** xpay scale factor for each 4-d subvolume */ - DomainWall4DArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, double m_5, - const Complex *b_5, const Complex *c_5, bool xpay, const ColorSpinorField &x, int parity, + DomainWall4DArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, real_t a, real_t m_5, + const complex_t *b_5, const complex_t *c_5, bool xpay, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override) : - WilsonArg(out, in, U, xpay ? a : 0.0, x, parity, dagger, comm_override), + WilsonArg(out, in, U, xpay ? 
a : real_t(0.0), x, parity, dagger, comm_override), Ls(in.X(4)) { if (b_5 == nullptr || c_5 == nullptr) - for (int s = 0; s < Ls; s++) a_5[s] = a; // 4-d Shamir + for (int s = 0; s < Ls; s++) a_5[s] = static_cast(a); // 4-d Shamir else - for (int s = 0; s < Ls; s++) a_5[s] = 0.5 * a / (b_5[s] * (m_5 + 4.0) + 1.0); // 4-d Mobius + for (int s = 0; s < Ls; s++) a_5[s] = Float(real_t(0.5) * a / (b_5[s] * (m_5 + real_t(4.0)) + real_t(1.0))); // 4-d Mobius } }; diff --git a/include/kernels/dslash_domain_wall_4d_fused_m5.cuh b/include/kernels/dslash_domain_wall_4d_fused_m5.cuh index dfcd519214..7e3614976c 100644 --- a/include/kernels/dslash_domain_wall_4d_fused_m5.cuh +++ b/include/kernels/dslash_domain_wall_4d_fused_m5.cuh @@ -40,16 +40,16 @@ namespace quda bool fuse_m5inv_m5pre; - DomainWall4DFusedM5Arg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, double m_5, - const Complex *b_5, const Complex *c_5, bool xpay, const ColorSpinorField &x, - ColorSpinorField &y, int parity, bool dagger, const int *comm_override, double m_f) : + DomainWall4DFusedM5Arg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, real_t a, real_t m_5, + const complex_t *b_5, const complex_t *c_5, bool xpay, const ColorSpinorField &x, + ColorSpinorField &y, int parity, bool dagger, const int *comm_override, real_t m_f) : DomainWall4DArg(out, in, U, a, m_5, b_5, c_5, xpay, x, parity, dagger, comm_override), Dslash5Arg(out, in, x, m_f, m_5, b_5, c_5, a), y(y) { for (int s = 0; s < Ls; s++) { - auto kappa_b_s = 0.5 / (b_5[s] * (m_5 + 4.0) + 1.0); - a_5[s] = a * kappa_b_s * kappa_b_s; + auto kappa_b_s = real_t(0.5) / (b_5[s] * (real_t(m_5) + real_t(4.0)) + real_t(1.0)); + a_5[s] = static_cast>(a * kappa_b_s * kappa_b_s); }; // 4-d Mobius } }; diff --git a/include/kernels/dslash_domain_wall_m5.cuh b/include/kernels/dslash_domain_wall_m5.cuh index bab21d4c11..8c7b03a86c 100644 --- a/include/kernels/dslash_domain_wall_m5.cuh +++ 
b/include/kernels/dslash_domain_wall_m5.cuh @@ -110,25 +110,25 @@ namespace quda coeff_5 coeff; // constant buffer used for Mobius coefficients for CPU kernel - void compute_coeff_mobius_pre(const Complex *b_5, const Complex *c_5) + void compute_coeff_mobius_pre(const complex_t *b_5, const complex_t *c_5) { // out = (b + c * D5) * in for (int s = 0; s < Ls; s++) { coeff.beta[s] = b_5[s]; - coeff.alpha[s] = 0.5 * c_5[s]; // 0.5 from gamma matrices + coeff.alpha[s] = 0.5 * complex(c_5[s]); // 0.5 from gamma matrices // xpay - coeff.a[s] = 0.5 / (b_5[s] * (m_5 + 4.0) + 1.0); + coeff.a[s] = 0.5 / (complex(b_5[s]) * (m_5 + 4.0) + 1.0); coeff.a[s] *= coeff.a[s] * static_cast(a); // kappa_b * kappa_b * a } } - void compute_coeff_mobius(const Complex *b_5, const Complex *c_5) + void compute_coeff_mobius(const complex_t *b_5, const complex_t *c_5) { // out = (1 + kappa * D5) * in for (int s = 0; s < Ls; s++) { - coeff.kappa[s] = 0.5 * (c_5[s] * (m_5 + 4.0) - 1.0) / (b_5[s] * (m_5 + 4.0) + 1.0); // 0.5 from gamma matrices + coeff.kappa[s] = 0.5 * (complex(c_5[s]) * (m_5 + 4.0) - 1.0) / (complex(b_5[s]) * (m_5 + 4.0) + 1.0); // 0.5 from gamma matrices // axpy - coeff.a[s] = 0.5 / (b_5[s] * (m_5 + 4.0) + 1.0); + coeff.a[s] = 0.5 / (complex(b_5[s]) * (m_5 + 4.0) + 1.0); coeff.a[s] *= coeff.a[s] * static_cast(a); // kappa_b * kappa_b * a } } @@ -139,33 +139,33 @@ namespace quda inv = 0.5 / (1.0 + std::pow(kappa, (int)Ls) * m_f); } - void compute_coeff_m5inv_mobius(const Complex *b_5, const Complex *c_5) + void compute_coeff_m5inv_mobius(const complex_t *b_5, const complex_t *c_5) { // out = (1 + kappa * D5)^-1 * in = M5inv * in - kappa = -(c_5[0].real() * (4.0 + m_5) - 1.0) / (b_5[0].real() * (4.0 + m_5) + 1.0); // kappa = kappa_b / kappa_c + kappa = -(double(c_5[0].real()) * (4.0 + m_5) - 1.0) / (double(b_5[0].real()) * (4.0 + m_5) + 1.0); // kappa = kappa_b / kappa_c inv = 0.5 / (1.0 + std::pow(kappa, (int)Ls) * m_f); // 0.5 from gamma matrices - a *= pow(0.5 / (b_5[0].real() 
* (m_5 + 4.0) + 1.0), 2); // kappa_b * kappa_b * a + a *= pow(0.5 / (double(b_5[0].real()) * (m_5 + 4.0) + 1.0), 2); // kappa_b * kappa_b * a } - void compute_coeff_m5inv_zmobius(const Complex *b_5, const Complex *c_5) + void compute_coeff_m5inv_zmobius(const complex_t *b_5, const complex_t *c_5) { // out = (1 + kappa * D5)^-1 * in = M5inv * in // Similar to mobius convention, but variadic across 5th dim complex k = 1.0; for (int s = 0; s < Ls; s++) { - coeff.kappa[s] = -(c_5[s] * (4.0 + m_5) - 1.0) / (b_5[s] * (4.0 + m_5) + 1.0); + coeff.kappa[s] = -(complex(c_5[s]) * (4.0 + m_5) - 1.0) / (complex(b_5[s]) * (4.0 + m_5) + 1.0); k *= coeff.kappa[s]; } coeff.inv = static_cast(0.5) / (static_cast(1.0) + k * m_f); for (int s = 0; s < Ls; s++) { // axpy coefficients - coeff.a[s] = 0.5 / (b_5[s] * (m_5 + 4.0) + 1.0); + coeff.a[s] = 0.5 / (complex(b_5[s]) * (m_5 + 4.0) + 1.0); coeff.a[s] *= coeff.a[s] * static_cast(a); } } - Dslash5Arg(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, double m_f, double m_5, - const Complex *b_5, const Complex *c_5, double a_) : + Dslash5Arg(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, real_t m_f, real_t m_5, + const complex_t *b_5, const complex_t *c_5, real_t a_) : kernel_param(dim3(in.VolumeCB() / in.X(4), in.X(4), in.SiteSubset())), out(out), in(in), diff --git a/include/kernels/dslash_mdw_fused.cuh b/include/kernels/dslash_mdw_fused.cuh index 973f100445..c079b07479 100644 --- a/include/kernels/dslash_mdw_fused.cuh +++ b/include/kernels/dslash_mdw_fused.cuh @@ -91,7 +91,7 @@ namespace quda { const bool comm[4]; FusedDslashArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, ColorSpinorField &y, - const ColorSpinorField &x, double m_f_, double m_5_, const Complex *b_5, const Complex *c_5, + const ColorSpinorField &x, double m_f_, double m_5_, const complex_t *b_5, const complex_t *c_5, int parity, int shift_[4], int halo_shift_[4]) : out(out), in(in), 
diff --git a/include/kernels/dslash_mobius_eofa.cuh b/include/kernels/dslash_mobius_eofa.cuh index f5e0a5c8ac..421ac2789e 100644 --- a/include/kernels/dslash_mobius_eofa.cuh +++ b/include/kernels/dslash_mobius_eofa.cuh @@ -53,7 +53,7 @@ namespace quda eofa_coeff coeff; Dslash5Arg(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, const double m_f_, - const double m_5_, const Complex */*b_5_*/, const Complex */*c_5_*/, double a_, double inv_, double kappa_, + const double m_5_, const complex_t */*b_5_*/, const complex_t */*c_5_*/, double a_, double inv_, double kappa_, const double *eofa_u, const double *eofa_x, const double *eofa_y, double sherman_morrison_) : kernel_param(dim3(in.VolumeCB() / in.X(4), in.X(4), in.SiteSubset())), out(out), diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh index f87e8f9865..7720a21bda 100644 --- a/include/kernels/dslash_wilson.cuh +++ b/include/kernels/dslash_wilson.cuh @@ -36,9 +36,9 @@ namespace quda const G U; /** the gauge field */ const real a; /** xpay scale factor - can be -kappa or -kappa^2 */ - WilsonArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, + WilsonArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, real_t a, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override) : - DslashArg(out, in, U, x, parity, dagger, a != 0.0 ? true : false, 1, spin_project, comm_override), + DslashArg(out, in, U, x, parity, dagger, a != real_t(0.0) ? 
true : false, 1, spin_project, comm_override), out(out), in(in), in_pack(in), diff --git a/include/quda_define.h.in b/include/quda_define.h.in index 41aba4aacb..dc94b74b97 100644 --- a/include/quda_define.h.in +++ b/include/quda_define.h.in @@ -260,3 +260,6 @@ static_assert(QUDA_ORDER_FP_MG == 2 || QUDA_ORDER_FP_MG == 4 || QUDA_ORDER_FP_MG #if !defined(QUDA_TARGET_CUDA) && !defined(QUDA_TARGET_HIP) && !defined(QUDA_TARGET_SYCL) #error "No QUDA_TARGET selected" #endif + +#cmakedefine QUDA_SCALAR_TYPE @QUDA_SCALAR_TYPE@ +#cmakedefine QUDA_REDUCTION_TYPE @QUDA_REDUCTION_TYPE@ diff --git a/include/quda_internal.h b/include/quda_internal.h index dd8a6c8177..c0478aa3ac 100644 --- a/include/quda_internal.h +++ b/include/quda_internal.h @@ -50,10 +50,24 @@ #include #include #include "timer.h" +#include "dbldbl.h" namespace quda { - using Complex = std::complex; + /** + Precision of scalar real variables on the host + */ + using real_t = QUDA_SCALAR_TYPE; + + /** + Precision of scalar complex variables on the host + */ + using complex_t = std::complex; + + /** + Sum reduction type + */ + using device_reduce_t = QUDA_REDUCTION_TYPE; /** Array object type used to storing lattice dimensions diff --git a/include/reducer.h b/include/reducer.h index 0af3b44852..24594a3850 100644 --- a/include/reducer.h +++ b/include/reducer.h @@ -23,12 +23,6 @@ namespace quda { -#ifdef QUAD_SUM - using device_reduce_t = doubledouble; -#else - using device_reduce_t = double; -#endif - namespace reducer { /** diff --git a/include/register_traits.h b/include/register_traits.h index f2cd7bf67e..c5f91cb380 100644 --- a/include/register_traits.h +++ b/include/register_traits.h @@ -180,7 +180,7 @@ namespace quda { static const int value = 1; }; - template <> struct vec_length { + template <> struct vec_length { static const int value = 2; }; template <> struct vec_length> { diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 5050133341..5d53ee2fd6 100644 --- a/lib/CMakeLists.txt +++ 
b/lib/CMakeLists.txt @@ -490,6 +490,16 @@ if(QUDA_BACKWARDS) target_link_libraries(quda PUBLIC ${BACKWARD_LIBRARIES}) endif() +set(QUDA_SCALAR_TYPE + double + CACHE STRING "set the host scalar coefficient type (double, doubledouble)") +set_property(CACHE QUDA_SCALAR_TYPE PROPERTY STRINGS double doubledouble) + +set(QUDA_REDUCTION_TYPE + double + CACHE STRING "set the reduction computation type (double, doubledouble)") +set_property(CACHE QUDA_REDUCTION_TYPE PROPERTY STRINGS double doubledouble) + configure_file(../include/quda_define.h.in ../include/quda_define.h @ONLY) install(FILES "${CMAKE_BINARY_DIR}/include/quda_define.h" DESTINATION include/) diff --git a/lib/blas_quda.cu b/lib/blas_quda.cu index 4c8719f309..4e38876787 100644 --- a/lib/blas_quda.cu +++ b/lib/blas_quda.cu @@ -150,14 +150,14 @@ namespace quda { instantiate(a, 0.0, 0.0, x, y, y, y, y); } - void caxpy(const Complex &a, const ColorSpinorField &x, ColorSpinorField &y) + void caxpy(const complex_t &a, const ColorSpinorField &x, ColorSpinorField &y) { - instantiate(a, Complex(0.0), Complex(0.0), x, y, x, x, y); + instantiate(a, complex_t(0.0), complex_t(0.0), x, y, x, x, y); } - void caxpby(const Complex &a, const ColorSpinorField &x, const Complex &b, ColorSpinorField &y) + void caxpby(const complex_t &a, const ColorSpinorField &x, const complex_t &b, ColorSpinorField &y) { - instantiate(a, b, Complex(0.0), x, y, x, x, y); + instantiate(a, b, complex_t(0.0), x, y, x, x, y); } void axpbypczw(double a, const ColorSpinorField &x, double b, const ColorSpinorField &y, @@ -166,10 +166,10 @@ namespace quda { instantiate(a, b, c, x, y, z, w, y); } - void cxpaypbz(const ColorSpinorField &x, const Complex &a, const ColorSpinorField &y, - const Complex &b, ColorSpinorField &z) + void cxpaypbz(const ColorSpinorField &x, const complex_t &a, const ColorSpinorField &y, + const complex_t &b, ColorSpinorField &z) { - instantiate(a, b, Complex(0.0), x, y, z, x, y); + instantiate(a, b, complex_t(0.0), x, y, z, x, y); 
} void axpyBzpcx(double a, ColorSpinorField& x, ColorSpinorField& y, double b, const ColorSpinorField& z, double c) @@ -182,32 +182,32 @@ namespace quda { instantiate(a, b, 0.0, x, y, z, x, y); } - void caxpyBzpx(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, - const Complex &b, const ColorSpinorField &z) + void caxpyBzpx(const complex_t &a, ColorSpinorField &x, ColorSpinorField &y, + const complex_t &b, const ColorSpinorField &z) { - instantiate(a, b, Complex(0.0), x, y, z, x, y); + instantiate(a, b, complex_t(0.0), x, y, z, x, y); } - void caxpyBxpz(const Complex &a, const ColorSpinorField &x, ColorSpinorField &y, - const Complex &b, ColorSpinorField &z) + void caxpyBxpz(const complex_t &a, const ColorSpinorField &x, ColorSpinorField &y, + const complex_t &b, ColorSpinorField &z) { - instantiate(a, b, Complex(0.0), x, y, z, x, y); + instantiate(a, b, complex_t(0.0), x, y, z, x, y); } - void caxpbypzYmbw(const Complex &a, const ColorSpinorField &x, const Complex &b, + void caxpbypzYmbw(const complex_t &a, const ColorSpinorField &x, const complex_t &b, ColorSpinorField &y, ColorSpinorField &z, const ColorSpinorField &w) { - instantiate(a, b, Complex(0.0), x, y, z, w, y); + instantiate(a, b, complex_t(0.0), x, y, z, w, y); } - void cabxpyAx(double a, const Complex &b, ColorSpinorField &x, ColorSpinorField &y) + void cabxpyAx(double a, const complex_t &b, ColorSpinorField &x, ColorSpinorField &y) { - instantiate(Complex(a), b, Complex(0.0), x, y, x, x, y); + instantiate(complex_t(a), b, complex_t(0.0), x, y, x, x, y); } - void caxpyXmaz(const Complex &a, ColorSpinorField &x, ColorSpinorField &y, const ColorSpinorField &z) + void caxpyXmaz(const complex_t &a, ColorSpinorField &x, ColorSpinorField &y, const ColorSpinorField &z) { - instantiate(a, Complex(0.0), Complex(0.0), x, y, z, x, y); + instantiate(a, complex_t(0.0), complex_t(0.0), x, y, z, x, y); } void caxpyXmazMR(const double &a, ColorSpinorField &x, ColorSpinorField &y, const ColorSpinorField &z) 
diff --git a/lib/deflation.cpp b/lib/deflation.cpp index 9b6270db32..638072829f 100644 --- a/lib/deflation.cpp +++ b/lib/deflation.cpp @@ -10,8 +10,8 @@ namespace quda using namespace blas; using DynamicStride = Stride; - static auto pinned_allocator = [] (size_t bytes ) { return static_cast(pool_pinned_malloc(bytes)); }; - static auto pinned_deleter = [] (Complex *hptr) { pool_pinned_free(hptr); }; + static auto pinned_allocator = [] (size_t bytes ) { return static_cast(pool_pinned_malloc(bytes)); }; + static auto pinned_deleter = [] (complex_t *hptr) { pool_pinned_free(hptr); }; Deflation::Deflation(DeflationParam ¶m, TimeProfile &profile) : param(param), @@ -83,7 +83,7 @@ namespace quda const int n_evs_to_print = param.cur_dim; if (n_evs_to_print == 0) errorQuda("Incorrect size of current deflation space"); - std::unique_ptr projm( pinned_allocator(param.ld*param.cur_dim * sizeof(Complex)), pinned_deleter); + std::unique_ptr projm( pinned_allocator(param.ld*param.cur_dim * sizeof(complex_t)), pinned_deleter); if( param.eig_global.extlib_type == QUDA_EIGEN_EXTLIB ) { Map projm_(param.matProj, param.cur_dim, param.cur_dim, DynamicStride(param.ld, 1)); @@ -104,10 +104,10 @@ if( param.eig_global.extlib_type == QUDA_EIGEN_EXTLIB ) { blas::caxpy(&projm.get()[i * param.ld], rv, res); // multiblas *r_sloppy = *r; param.matDeflation(*Av_sloppy, *r_sloppy); - double3 dotnorm = cDotProductNormA(*r_sloppy, *Av_sloppy); - double eval = dotnorm.x / dotnorm.z; + auto dotnorm = cDotProductNormA(*r_sloppy, *Av_sloppy); + double eval = dotnorm[0] / dotnorm[2]; blas::xpay(*Av_sloppy, -eval, *r_sloppy); - double relerr = sqrt(norm2(*r_sloppy) / dotnorm.z); + double relerr = sqrt(norm2(*r_sloppy) / dotnorm[2]); printfQuda("Eigenvalue %d: %1.12e Residual: %1.12e\n", i + 1, eval, relerr); } } @@ -120,7 +120,7 @@ if( param.eig_global.extlib_type == QUDA_EIGEN_EXTLIB ) { if(param.cur_dim == 0) return;//nothing to do - std::unique_ptr vec(new Complex[param.ld]); + std::unique_ptr vec(new 
complex_t[param.ld]); double check_nrm2 = norm2(b); @@ -184,7 +184,7 @@ if( param.eig_global.extlib_type == QUDA_EIGEN_EXTLIB ) { const int cdot_pipeline_length = 4; for (int i = first_idx; i < (first_idx + n_ev); i++) { - std::unique_ptr alpha(new Complex[i]); + std::unique_ptr alpha(new complex_t[i]); ColorSpinorField *accum = param.eig_global.cuda_prec_ritz != QUDA_DOUBLE_PRECISION ? r : ¶m.RV->Component(i); *accum = param.RV->Component(i); @@ -246,10 +246,10 @@ if( param.eig_global.extlib_type == QUDA_EIGEN_EXTLIB ) { } std::unique_ptr evals(new double[param.cur_dim]); - std::unique_ptr projm( - pinned_allocator(param.ld * param.cur_dim * sizeof(Complex)), pinned_deleter); + std::unique_ptr projm( + pinned_allocator(param.ld * param.cur_dim * sizeof(complex_t)), pinned_deleter); - memcpy(projm.get(), param.matProj, param.ld * param.cur_dim * sizeof(Complex)); + memcpy(projm.get(), param.matProj, param.ld * param.cur_dim * sizeof(complex_t)); if (param.eig_global.extlib_type == QUDA_EIGEN_EXTLIB) { Map projm_(projm.get(), param.cur_dim, param.cur_dim, @@ -298,10 +298,10 @@ if( param.eig_global.extlib_type == QUDA_EIGEN_EXTLIB ) { if (do_residual_check) { // if tol=0.0 then disable relative residual norm check *r_sloppy = *r; param.matDeflation(*Av_sloppy, *r_sloppy); - double3 dotnorm = cDotProductNormA(*r_sloppy, *Av_sloppy); - double eval = dotnorm.x / dotnorm.z; + auto dotnorm = cDotProductNormA(*r_sloppy, *Av_sloppy); + auto eval = dotnorm[0] / dotnorm[2]; blas::xpay(*Av_sloppy, -eval, *r_sloppy); - relerr = sqrt(norm2(*r_sloppy) / dotnorm.z); + relerr = sqrt(norm2(*r_sloppy) / dotnorm[2]); if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Eigenvalue: %1.12e Residual: %1.12e\n", eval, relerr); } diff --git a/lib/dirac_mobius.cpp b/lib/dirac_mobius.cpp index d60d8060fc..6a0023f53b 100644 --- a/lib/dirac_mobius.cpp +++ b/lib/dirac_mobius.cpp @@ -9,8 +9,8 @@ namespace quda { DiracMobius::DiracMobius(const DiracParam ¶m) : DiracDomainWall(param), zMobius(false) { 
- memcpy(b_5, param.b_5, sizeof(Complex) * param.Ls); - memcpy(c_5, param.c_5, sizeof(Complex) * param.Ls); + memcpy(b_5, param.b_5, sizeof(complex_t) * param.Ls); + memcpy(c_5, param.c_5, sizeof(complex_t) * param.Ls); double b = b_5[0].real(); double c = c_5[0].real(); diff --git a/lib/dslash5_domain_wall.cu b/lib/dslash5_domain_wall.cu index 10e6ae8a48..2973b57f8b 100644 --- a/lib/dslash5_domain_wall.cu +++ b/lib/dslash5_domain_wall.cu @@ -14,8 +14,8 @@ namespace quda const ColorSpinorField &x; double m_f; double m_5; - const Complex *b_5; - const Complex *c_5; + const complex_t *b_5; + const complex_t *c_5; double a; bool dagger; bool xpay; @@ -93,7 +93,7 @@ namespace quda public: Dslash5(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, double m_f, - double m_5, const Complex *b_5, const Complex *c_5, double a, bool dagger, Dslash5Type type) : + double m_5, const complex_t *b_5, const complex_t *c_5, double a, bool dagger, Dslash5Type type) : TunableKernel3D(in, in.X(4), in.SiteSubset()), out(out), in(in), @@ -169,7 +169,7 @@ namespace quda // out = Dslash5*in #ifdef GPU_DOMAIN_WALL_DIRAC void ApplyDslash5(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, double m_f, - double m_5, const Complex *b_5, const Complex *c_5, double a, bool dagger, Dslash5Type type) + double m_5, const complex_t *b_5, const complex_t *c_5, double a, bool dagger, Dslash5Type type) { if (in.PCType() != QUDA_4D_PC) errorQuda("Only 4-d preconditioned fields are supported"); checkLocation(out, in, x); // check all locations match @@ -177,7 +177,7 @@ namespace quda } #else void ApplyDslash5(ColorSpinorField &, const ColorSpinorField &, const ColorSpinorField &, double, - double, const Complex *, const Complex *, double, bool, Dslash5Type) + double, const complex_t *, const complex_t *, double, bool, Dslash5Type) { errorQuda("Domain wall dslash has not been built"); } diff --git a/lib/dslash5_mobius_eofa.cu 
b/lib/dslash5_mobius_eofa.cu index f97ff5f80f..1c97a91313 100644 --- a/lib/dslash5_mobius_eofa.cu +++ b/lib/dslash5_mobius_eofa.cu @@ -17,8 +17,8 @@ namespace quda const ColorSpinorField &x; double m_f; double m_5; - const Complex *b_5; - const Complex *c_5; + const complex_t *b_5; + const complex_t *c_5; double a; bool eofa_pm; double inv; @@ -84,7 +84,7 @@ namespace quda public: Dslash5(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, const double m_f, - const double m_5, const Complex *b_5, const Complex *c_5, double a, bool eofa_pm, double inv, + const double m_5, const complex_t *b_5, const complex_t *c_5, double a, bool eofa_pm, double inv, double kappa, const double *eofa_u, const double *eofa_x, const double *eofa_y, double sherman_morrison, bool dagger, Dslash5Type type) : TunableKernel3D(in, in.X(4), in.SiteSubset()), @@ -171,7 +171,7 @@ namespace quda // out = Dslash5*in #ifdef GPU_DOMAIN_WALL_DIRAC void apply_dslash5(ColorSpinorField &out, const ColorSpinorField &in, const ColorSpinorField &x, double m_f, - double m_5, const Complex *b_5, const Complex *c_5, double a, int eofa_pm, double inv, + double m_5, const complex_t *b_5, const complex_t *c_5, double a, int eofa_pm, double inv, double kappa, const double *eofa_u, const double *eofa_x, const double *eofa_y, double sherman_morrison, bool dagger, Dslash5Type type) { @@ -181,7 +181,7 @@ namespace quda } #else void apply_dslash5(ColorSpinorField &, const ColorSpinorField &, const ColorSpinorField &, double, - double, const Complex *, const Complex *, double, int, double, + double, const complex_t *, const complex_t *, double, int, double, double, const double *, const double *, const double *, double, bool, Dslash5Type) { errorQuda("Mobius EOFA dslash has not been built"); diff --git a/lib/dslash_domain_wall_4d.cu b/lib/dslash_domain_wall_4d.cu index a41285dc14..fc2a8af063 100644 --- a/lib/dslash_domain_wall_4d.cu +++ b/lib/dslash_domain_wall_4d.cu @@ -39,7 +39,7 @@ 
namespace quda template struct DomainWall4DApply { inline DomainWall4DApply(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, - double m_5, const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, int parity, + double m_5, const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override, TimeProfile &profile) { constexpr int nDim = 4; @@ -55,14 +55,14 @@ namespace quda // out(x) = M*in = in(x) + a*\sum_mu U_{-\mu}(x)in(x+mu) + U^\dagger_mu(x-mu)in(x-mu) #if defined(GPU_DOMAIN_WALL_DIRAC) || defined(GPU_NDEG_TWISTED_CLOVER_DIRAC) void ApplyDomainWall4D(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, double m_5, - const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, int parity, bool dagger, + const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override, TimeProfile &profile) { instantiate(out, in, U, a, m_5, b_5, c_5, x, parity, dagger, comm_override, profile); } #else void ApplyDomainWall4D(ColorSpinorField &, const ColorSpinorField &, const GaugeField &, double, double, - const Complex *, const Complex *, const ColorSpinorField &, int, bool, const int *, TimeProfile &) + const complex_t *, const complex_t *, const ColorSpinorField &, int, bool, const int *, TimeProfile &) { errorQuda("Domain-wall dslash has not been built"); } diff --git a/lib/dslash_domain_wall_4d_fused_m5.hpp b/lib/dslash_domain_wall_4d_fused_m5.hpp index 3ba9e77c39..192893ef34 100644 --- a/lib/dslash_domain_wall_4d_fused_m5.hpp +++ b/lib/dslash_domain_wall_4d_fused_m5.hpp @@ -131,7 +131,7 @@ namespace quda template inline DomainWall4DApplyFusedM5(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, - double m_5, const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, + double m_5, const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, 
ColorSpinorField &y, int parity, bool dagger, const int *comm_override, double m_f, Dslash5TypeList, TimeProfile &profile) { diff --git a/lib/dslash_domain_wall_4d_m5inv.cu b/lib/dslash_domain_wall_4d_m5inv.cu index a206a0ff75..a9f78bf785 100644 --- a/lib/dslash_domain_wall_4d_m5inv.cu +++ b/lib/dslash_domain_wall_4d_m5inv.cu @@ -12,7 +12,7 @@ namespace quda // ... and then m5inv #ifdef GPU_DOMAIN_WALL_DIRAC void ApplyDomainWall4DM5inv(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, - double m_5, const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, + double m_5, const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, ColorSpinorField &y, int parity, bool dagger, const int *comm_override, double m_f, TimeProfile &profile) { @@ -22,7 +22,7 @@ namespace quda } #else void ApplyDomainWall4DM5inv(ColorSpinorField &, const ColorSpinorField &, const GaugeField &, double, - double, const Complex *, const Complex *, const ColorSpinorField &, + double, const complex_t *, const complex_t *, const ColorSpinorField &, ColorSpinorField &, int, bool, const int *, double, TimeProfile &) { errorQuda("Domain-wall dslash has not been built"); diff --git a/lib/dslash_domain_wall_4d_m5inv_m5inv.cu b/lib/dslash_domain_wall_4d_m5inv_m5inv.cu index c9e580acf0..4706c40253 100644 --- a/lib/dslash_domain_wall_4d_m5inv_m5inv.cu +++ b/lib/dslash_domain_wall_4d_m5inv_m5inv.cu @@ -12,7 +12,7 @@ namespace quda // ... 
and then m5inv + m5inv-dagger #ifdef GPU_DOMAIN_WALL_DIRAC void ApplyDomainWall4DM5invM5inv(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, - double m_5, const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, + double m_5, const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, ColorSpinorField &y, int parity, bool dagger, const int *comm_override, double m_f, TimeProfile &profile) { @@ -22,7 +22,7 @@ namespace quda } #else void ApplyDomainWall4DM5invM5inv(ColorSpinorField &, const ColorSpinorField &, const GaugeField &, double, - double, const Complex *, const Complex *, const ColorSpinorField &, + double, const complex_t *, const complex_t *, const ColorSpinorField &, ColorSpinorField &, int, bool, const int *, double, TimeProfile &) { errorQuda("Domain-wall dslash has not been built"); diff --git a/lib/dslash_domain_wall_4d_m5inv_m5pre.cu b/lib/dslash_domain_wall_4d_m5inv_m5pre.cu index 2c71b958a2..fb8e08ce17 100644 --- a/lib/dslash_domain_wall_4d_m5inv_m5pre.cu +++ b/lib/dslash_domain_wall_4d_m5inv_m5pre.cu @@ -12,7 +12,7 @@ namespace quda // ... 
and then m5inv + m5pre #ifdef GPU_DOMAIN_WALL_DIRAC void ApplyDomainWall4DM5invM5pre(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, - double m_5, const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, + double m_5, const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, ColorSpinorField &y, int parity, bool dagger, const int *comm_override, double m_f, TimeProfile &profile) { @@ -22,7 +22,7 @@ namespace quda } #else void ApplyDomainWall4DM5invM5pre(ColorSpinorField &, const ColorSpinorField &, const GaugeField &, double, - double, const Complex *, const Complex *, const ColorSpinorField &, + double, const complex_t *, const complex_t *, const ColorSpinorField &, ColorSpinorField &, int, bool, const int *, double, TimeProfile &) { errorQuda("Domain-wall dslash has not been built"); diff --git a/lib/dslash_domain_wall_4d_m5mob.cu b/lib/dslash_domain_wall_4d_m5mob.cu index c05edcb4fa..9699ceb964 100644 --- a/lib/dslash_domain_wall_4d_m5mob.cu +++ b/lib/dslash_domain_wall_4d_m5mob.cu @@ -12,7 +12,7 @@ namespace quda // ... 
and then m5 #ifdef GPU_DOMAIN_WALL_DIRAC void ApplyDomainWall4DM5mob(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, - double m_5, const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, + double m_5, const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, ColorSpinorField &y, int parity, bool dagger, const int *comm_override, double m_f, TimeProfile &profile) { @@ -22,7 +22,7 @@ namespace quda } #else void ApplyDomainWall4DM5mob(ColorSpinorField &, const ColorSpinorField &, const GaugeField &, double, - double, const Complex *, const Complex *, const ColorSpinorField &, + double, const complex_t *, const complex_t *, const ColorSpinorField &, ColorSpinorField &, int, bool, const int *, double, TimeProfile &) { errorQuda("Domain-wall dslash has not been built"); diff --git a/lib/dslash_domain_wall_4d_m5pre.cu b/lib/dslash_domain_wall_4d_m5pre.cu index 12d43adb8f..f0c7fc49b0 100644 --- a/lib/dslash_domain_wall_4d_m5pre.cu +++ b/lib/dslash_domain_wall_4d_m5pre.cu @@ -12,7 +12,7 @@ namespace quda // ... 
and then m5pre #ifdef GPU_DOMAIN_WALL_DIRAC void ApplyDomainWall4DM5pre(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, - double m_5, const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, + double m_5, const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, ColorSpinorField &y, int parity, bool dagger, const int *comm_override, double m_f, TimeProfile &profile) { @@ -22,7 +22,7 @@ namespace quda } #else void ApplyDomainWall4DM5pre(ColorSpinorField &, const ColorSpinorField &, const GaugeField &, double, - double, const Complex *, const Complex *, const ColorSpinorField &, + double, const complex_t *, const complex_t *, const ColorSpinorField &, ColorSpinorField &, int, bool, const int *, double, TimeProfile &) { errorQuda("Domain-wall dslash has not been built"); diff --git a/lib/dslash_domain_wall_4d_m5pre_m5inv.cu b/lib/dslash_domain_wall_4d_m5pre_m5inv.cu index 4b9fbac5e6..c71abea6db 100644 --- a/lib/dslash_domain_wall_4d_m5pre_m5inv.cu +++ b/lib/dslash_domain_wall_4d_m5pre_m5inv.cu @@ -12,7 +12,7 @@ namespace quda // ... 
and then m5pre + m5inv #ifdef GPU_DOMAIN_WALL_DIRAC void ApplyDomainWall4DM5preM5inv(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, - double m_5, const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, + double m_5, const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, ColorSpinorField &y, int parity, bool dagger, const int *comm_override, double m_f, TimeProfile &profile) { @@ -22,7 +22,7 @@ namespace quda } #else void ApplyDomainWall4DM5preM5inv(ColorSpinorField &, const ColorSpinorField &, const GaugeField &, double, - double, const Complex *, const Complex *, const ColorSpinorField &, + double, const complex_t *, const complex_t *, const ColorSpinorField &, ColorSpinorField &, int, bool, const int *, double, TimeProfile &) { errorQuda("Domain-wall dslash has not been built"); diff --git a/lib/dslash_domain_wall_4d_m5pre_m5mob.cu b/lib/dslash_domain_wall_4d_m5pre_m5mob.cu index 04d39856eb..34bff0c3f6 100644 --- a/lib/dslash_domain_wall_4d_m5pre_m5mob.cu +++ b/lib/dslash_domain_wall_4d_m5pre_m5mob.cu @@ -12,7 +12,7 @@ namespace quda // ... 
and then m5pre + m5 #ifdef GPU_DOMAIN_WALL_DIRAC void ApplyDomainWall4DM5preM5mob(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, - double m_5, const Complex *b_5, const Complex *c_5, const ColorSpinorField &x, + double m_5, const complex_t *b_5, const complex_t *c_5, const ColorSpinorField &x, ColorSpinorField &y, int parity, bool dagger, const int *comm_override, double m_f, TimeProfile &profile) { @@ -22,7 +22,7 @@ namespace quda } #else void ApplyDomainWall4DM5preM5mob(ColorSpinorField &, const ColorSpinorField &, const GaugeField &, double, - double, const Complex *, const Complex *, const ColorSpinorField &, + double, const complex_t *, const complex_t *, const ColorSpinorField &, ColorSpinorField &, int, bool, const int *, double, TimeProfile &) { errorQuda("Domain-wall dslash has not been built"); diff --git a/lib/dslash_mdw_fused.in.cu b/lib/dslash_mdw_fused.in.cu index bf0639cab3..313870acf4 100644 --- a/lib/dslash_mdw_fused.in.cu +++ b/lib/dslash_mdw_fused.in.cu @@ -12,7 +12,7 @@ namespace quda template <> void apply_fused_dslash_impl(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, ColorSpinorField &y, const ColorSpinorField &x, double m_f, double m_5, - const Complex *b_5, const Complex *c_5, bool dagger, int parity, int shift[4], + const complex_t *b_5, const complex_t *c_5, bool dagger, int parity, int shift[4], int halo_shift[4], MdwfFusedDslashType type) { checkLocation(out, in); // check all locations match @@ -22,8 +22,8 @@ namespace quda #else template <> void apply_fused_dslash_impl(ColorSpinorField &, const ColorSpinorField &, const GaugeField &, - ColorSpinorField &, const ColorSpinorField &, double, double, const Complex *, - const Complex *, bool, int, int[4], int[4], MdwfFusedDslashType) + ColorSpinorField &, const ColorSpinorField &, double, double, const complex_t *, + const complex_t *, bool, int, int[4], int[4], MdwfFusedDslashType) { errorQuda("Domain wall dslash with tensor 
cores has not been built"); } diff --git a/lib/dslash_mdw_fused.in.hpp b/lib/dslash_mdw_fused.in.hpp index c97922a776..f17a267680 100644 --- a/lib/dslash_mdw_fused.in.hpp +++ b/lib/dslash_mdw_fused.in.hpp @@ -18,7 +18,7 @@ namespace quda template void apply_fused_dslash_impl(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, ColorSpinorField &y, const ColorSpinorField &x, double m_f, double m_5, - const Complex *b_5, const Complex *c_5, bool dagger, int parity, int shift[4], + const complex_t *b_5, const complex_t *c_5, bool dagger, int parity, int shift[4], int halo_shift[4], MdwfFusedDslashType type); template struct IntList { @@ -27,7 +27,7 @@ namespace quda template void apply_fused_dslash_list(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, ColorSpinorField &y, const ColorSpinorField &x, double m_f, double m_5, - const Complex *b_5, const Complex *c_5, bool dagger, int parity, int shift[4], + const complex_t *b_5, const complex_t *c_5, bool dagger, int parity, int shift[4], int halo_shift[4], MdwfFusedDslashType type, IntList) { if (in.X(4) == Ls) { @@ -44,7 +44,7 @@ namespace quda #if defined(GPU_DOMAIN_WALL_DIRAC) && defined(QUDA_MMA_AVAILABLE) void inline apply_fused_dslash(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, ColorSpinorField &y, const ColorSpinorField &x, double m_f, double m_5, - const Complex *b_5, const Complex *c_5, bool dagger, int parity, int shift[4], + const complex_t *b_5, const complex_t *c_5, bool dagger, int parity, int shift[4], int halo_shift[4], MdwfFusedDslashType type) { // clang-format off @@ -55,7 +55,7 @@ namespace quda #else void inline apply_fused_dslash(ColorSpinorField &, const ColorSpinorField &, const GaugeField &, ColorSpinorField &, const ColorSpinorField &, double, double, - const Complex *, const Complex *, bool, int, int[4], + const complex_t *, const complex_t *, bool, int, int[4], int[4], MdwfFusedDslashType) { errorQuda("Domain wall dslash 
with tensor cores has not been built"); diff --git a/lib/dslash_mdw_fused_impl.hpp b/lib/dslash_mdw_fused_impl.hpp index 4a09d35123..12f166c248 100644 --- a/lib/dslash_mdw_fused_impl.hpp +++ b/lib/dslash_mdw_fused_impl.hpp @@ -26,8 +26,8 @@ namespace quda const ColorSpinorField &x; double m_f; double m_5; - const Complex *b_5; - const Complex *c_5; + const complex_t *b_5; + const complex_t *c_5; int parity; int dim[4]; int *shift; @@ -132,7 +132,7 @@ namespace quda public: FusedDslash(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, ColorSpinorField &y, - const ColorSpinorField &x, double m_f, double m_5, const Complex *b_5, const Complex *c_5, + const ColorSpinorField &x, double m_f, double m_5, const complex_t *b_5, const complex_t *c_5, bool dagger, int parity, int shift[4], int halo_shift[4], MdwfFusedDslashType type) : TunableGridStrideKernel2D(in, x.X(4)), out(out), diff --git a/lib/eig_block_trlm.cpp b/lib/eig_block_trlm.cpp index 160af9dff4..7ff0474816 100644 --- a/lib/eig_block_trlm.cpp +++ b/lib/eig_block_trlm.cpp @@ -16,6 +16,9 @@ namespace quda { + + using MatrixXc = Matrix; + // Thick Restarted Block Lanczos Method constructor BLKTRLM::BLKTRLM(const DiracMatrix &mat, QudaEigParam *eig_param, TimeProfile &profile) : TRLM(mat, eig_param, profile) @@ -46,8 +49,8 @@ namespace quda block_data_length = block_size * block_size; auto arrow_mat_array_size = block_data_length * n_blocks; // Tridiagonal/Arrow matrix - block_alpha.resize(arrow_mat_array_size, 0.0); - block_beta.resize(arrow_mat_array_size, 0.0); + block_alpha.resize(arrow_mat_array_size, real_t(0.0)); + block_beta.resize(arrow_mat_array_size, real_t(0.0)); // Temp storage used in blockLanczosStep jth_block.resize(block_data_length); @@ -55,7 +58,7 @@ namespace quda if (!profile_running) profile.TPSTOP(QUDA_PROFILE_INIT); } - void BLKTRLM::operator()(std::vector &kSpace, std::vector &evals) + void BLKTRLM::operator()(std::vector &kSpace, std::vector &evals) { // Pre-launch 
checks and preparation //--------------------------------------------------------------------------- @@ -83,8 +86,8 @@ namespace quda checkChebyOpMax(kSpace); // Convergence and locking criteria - double mat_norm = 0.0; - double epsilon = setEpsilon(kSpace[0].Precision()); + real_t mat_norm = 0.0; + real_t epsilon = setEpsilon(kSpace[0].Precision()); // Print Eigensolver params printEigensolverSetup(); @@ -107,14 +110,14 @@ namespace quda // mat_norm is updated. for (int i = num_locked; i < n_kr; i++) - if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]); + if (abs(alpha[i]) > mat_norm) mat_norm = abs(alpha[i]); // Locking check iter_locked = 0; for (int i = 1; i < (n_kr - num_locked); i++) { if (residua[i + num_locked] < epsilon * mat_norm) { - logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked], - epsilon * mat_norm); + logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d resid=%+.6e condition=%.6e ****\n", i, double(residua[i + num_locked]), + double(epsilon * mat_norm)); iter_locked = i; } else { // Unlikely to find new locked pairs @@ -126,8 +129,8 @@ namespace quda iter_converged = iter_locked; for (int i = iter_locked + 1; i < n_kr - num_locked; i++) { if (residua[i + num_locked] < tol * mat_norm) { - logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked], - tol * mat_norm); + logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d resid=%+.6e condition=%.6e ****\n", i, double(residua[i + num_locked]), + double(tol * mat_norm)); iter_converged = i; } else { // Unlikely to find new converged pairs @@ -162,7 +165,7 @@ namespace quda logQuda(QUDA_DEBUG_VERBOSE, "num_keep = %d\n", num_keep); logQuda(QUDA_DEBUG_VERBOSE, "num_locked = %d\n", num_locked); for (int i = 0; i < n_kr; i++) { - logQuda(QUDA_DEBUG_VERBOSE, "Ritz[%d] = %.16e residual[%d] = %.16e\n", i, alpha[i], i, residua[i]); + logQuda(QUDA_DEBUG_VERBOSE, "Ritz[%d] = %.16e residual[%d] = %.16e\n", i, 
double(alpha[i]), i, double(residua[i])); } // Check for convergence @@ -192,7 +195,7 @@ namespace quda // Dump all Ritz values and residua for (int i = 0; i < n_conv; i++) { - logQuda(QUDA_SUMMARIZE, "RitzValue[%04d]: (%+.16e, %+.16e) residual %.16e\n", i, alpha[i], 0.0, residua[i]); + logQuda(QUDA_SUMMARIZE, "RitzValue[%04d]: (%+.16e, %+.16e) residual %.16e\n", i, double(alpha[i]), 0.0, double(residua[i])); } // Compute eigenvalues @@ -223,7 +226,7 @@ namespace quda if (j - start > 0) { int blocks = (j - start) / block_size; - std::vector beta_(blocks * block_data_length); + std::vector beta_(blocks * block_data_length); // Switch beta block order from COLUMN to ROW major // This switches the block from upper to lower triangular @@ -245,7 +248,7 @@ namespace quda // a_j = v_j^dag * r // Block dot products stored in alpha_block. - std::vector block_alpha_(block_size); + std::vector block_alpha_(block_size); blas::cDotProduct(block_alpha_, {v.begin() + j, v.begin() + j + block_size}, {r.begin(), r.end()}); for (auto i = 0u; i < block_alpha_.size(); i++) block_alpha[arrow_offset + i] = block_alpha_[i]; @@ -254,7 +257,7 @@ namespace quda for (int b = 0; b < block_size; b++) { for (int c = 0; c < block_size; c++) { idx = b * block_size + c; - jth_block[idx] = -1.0 * block_alpha[arrow_offset + idx]; + jth_block[idx] = -block_alpha[arrow_offset + idx]; } } @@ -283,12 +286,12 @@ namespace quda // Compute R_{k} logQuda(QUDA_DEBUG_VERBOSE, "Orthonormalisation attempt k = %d\n", k); for (int b = 0; b < block_size; b++) { - double norm = sqrt(blas::norm2(r[b])); + auto norm = sqrt(blas::norm2(r[b])); blas::ax(1.0 / norm, r[b]); jth_block[b * (block_size + 1)] = norm; for (int c = b + 1; c < block_size; c++) { - Complex cnorm = blas::cDotProduct(r[b], r[c]); + complex_t cnorm = blas::cDotProduct(r[b], r[c]); blas::caxpy(-cnorm, r[b], r[c]); idx = c * block_size + b; @@ -325,9 +328,9 @@ namespace quda } else { // Compute BetaNew_ac = (R_k)_ab * Beta_bc // Use Eigen, it's 
neater - MatrixXcd betaN = MatrixXcd::Zero(block_size, block_size); - MatrixXcd beta = MatrixXcd::Zero(block_size, block_size); - MatrixXcd Rk = MatrixXcd::Zero(block_size, block_size); + MatrixXc betaN = MatrixXc::Zero(block_size, block_size); + MatrixXc beta = MatrixXc::Zero(block_size, block_size); + MatrixXc Rk = MatrixXc::Zero(block_size, block_size); int idx = 0; // Populate matrices @@ -365,7 +368,7 @@ namespace quda int num_locked_offset = (num_locked / block_size) * block_data_length; // Eigen objects - MatrixXcd T = MatrixXcd::Zero(dim, dim); + MatrixXc T = MatrixXc::Zero(dim, dim); block_ritz_mat.resize(dim * dim); int idx = 0; @@ -420,7 +423,7 @@ namespace quda } // Eigensolve the arrow matrix - SelfAdjointEigenSolver eigensolver; + SelfAdjointEigenSolver eigensolver; eigensolver.compute(T); // Populate the alpha array with eigenvalues @@ -432,13 +435,13 @@ namespace quda // Use Sum of all beta values in the final block for // the convergence condition - double beta_sum = 0; - for (int i = 0; i < block_data_length; i++) beta_sum += fabs(block_beta[n_kr * block_size - block_data_length + i]); + real_t beta_sum = 0; + for (int i = 0; i < block_data_length; i++) beta_sum += abs(block_beta[n_kr * block_size - block_data_length + i]); for (int i = 0; i < blocks; i++) { for (int b = 0; b < block_size; b++) { idx = b * (block_size + 1); - residua[i * block_size + b + num_locked] = fabs(beta_sum * block_ritz_mat[dim * (i * block_size + b + 1) - 1]); + residua[i * block_size + b + num_locked] = abs(beta_sum * block_ritz_mat[dim * (i * block_size + b + 1) - 1]); } } @@ -451,7 +454,7 @@ namespace quda int dim = n_kr - num_locked; // Multi-BLAS friendly array to store part of Ritz matrix we want - std::vector ritz_mat_keep(dim * iter_keep); + std::vector ritz_mat_keep(dim * iter_keep); for (int j = 0; j < dim; j++) { for (int i = 0; i < iter_keep; i++) { ritz_mat_keep[j * iter_keep + i] = block_ritz_mat[i * dim + j]; } } @@ -463,9 +466,9 @@ namespace quda // 
Compute new r blocks // Use Eigen, it's neater - MatrixXcd beta = MatrixXcd::Zero(block_size, block_size); - MatrixXcd ri = MatrixXcd::Zero(block_size, block_size); - MatrixXcd ritzi = MatrixXcd::Zero(block_size, block_size); + MatrixXc beta = MatrixXc::Zero(block_size, block_size); + MatrixXc ri = MatrixXc::Zero(block_size, block_size); + MatrixXc ritzi = MatrixXc::Zero(block_size, block_size); int blocks = iter_keep / block_size; int idx = 0; int beta_offset = n_kr * block_size - block_data_length; diff --git a/lib/eig_iram.cpp b/lib/eig_iram.cpp index 4817cf0fc7..e62a843065 100644 --- a/lib/eig_iram.cpp +++ b/lib/eig_iram.cpp @@ -16,6 +16,8 @@ namespace quda { + using MatrixXc = Matrix; + // Implicitly Restarted Arnoldi Method constructor IRAM::IRAM(const DiracMatrix &mat, QudaEigParam *eig_param, TimeProfile &profile) : EigenSolver(mat, eig_param, profile) @@ -28,9 +30,9 @@ namespace quda Qmat.resize(n_kr); Rmat.resize(n_kr); for (int i = 0; i < n_kr; i++) { - upperHess[i].resize(n_kr, 0.0); - Qmat[i].resize(n_kr, 0.0); - Rmat[i].resize(n_kr, 0.0); + upperHess[i].resize(n_kr, real_t(0.0)); + Qmat[i].resize(n_kr, real_t(0.0)); + Rmat[i].resize(n_kr, real_t(0.0)); } if (eig_param->qr_tol == 0) { eig_param->qr_tol = eig_param->tol * 1e-2; } @@ -40,7 +42,7 @@ namespace quda // Arnoldi Member functions //--------------------------------------------------------------------------- - void IRAM::arnoldiStep(std::vector &v, std::vector &r, double &beta, int j) + void IRAM::arnoldiStep(std::vector &v, std::vector &r, real_t &beta, int j) { beta = sqrt(blas::norm2(r[0])); if (j > 0) upperHess[j][j - 1] = beta; @@ -52,7 +54,7 @@ namespace quda // r_{j} = M * v_{j}; mat(r[0], v[j]); - double beta_pre = sqrt(blas::norm2(r[0])); + real_t beta_pre = sqrt(blas::norm2(r[0])); // Compute the j-th residual corresponding // to the j step factorization. 
@@ -61,14 +63,14 @@ namespace quda // r_{j} <- M * v_{j} - V_{j} * w_{j} // H_{j,i}_j = v_i^dag * r - std::vector tmp(j + 1); + std::vector tmp(j + 1); blas::cDotProduct(tmp, {v.begin(), v.begin() + j + 1}, r); // Orthogonalise r_{j} against V_{j}. // r = r - H_{j,i} * v_j for (int i = 0; i < j + 1; i++) tmp[i] *= -1.0; blas::caxpy(tmp, {v.begin(), v.begin() + j + 1}, r); - for (int i = 0; i < j + 1; i++) upperHess[i][j] = -1.0 * tmp[i]; + for (int i = 0; i < j + 1; i++) upperHess[i][j] = -tmp[i]; // Re-orthogonalization / Iterative refinement phase // Maximum 100 tries. @@ -92,8 +94,8 @@ namespace quda beta = sqrt(blas::norm2(r[0])); while (beta < 0.717 * beta_pre && orth_iter < orth_iter_max) { - logQuda(QUDA_DEBUG_VERBOSE, "beta = %e > 0.717*beta_pre = %e: Reorthogonalise at step %d, iter %d\n", beta, - 0.717 * beta_pre, j, orth_iter); + logQuda(QUDA_DEBUG_VERBOSE, "beta = %e > 0.717*beta_pre = %e: Reorthogonalise at step %d, iter %d\n", + double(beta), 0.717 * double(beta_pre), j, orth_iter); beta_pre = beta; @@ -116,60 +118,60 @@ namespace quda void IRAM::rotateBasis(std::vector &kSpace, int keep) { // Multi-BLAS friendly array to store the part of the rotation matrix - std::vector Qmat_keep(n_kr * keep, 0.0); + std::vector Qmat_keep(n_kr * keep, real_t(0.0)); for (int j = 0; j < n_kr; j++) for (int i = 0; i < keep; i++) { Qmat_keep[j * keep + i] = Qmat[j][i]; } rotateVecs(kSpace, Qmat_keep, n_kr, n_kr, keep, 0, profile); } - void IRAM::reorder(std::vector &kSpace, std::vector &evals, + void IRAM::reorder(std::vector &kSpace, std::vector &evals, const QudaEigSpectrumType spec_type) { int n = n_kr; - std::vector> array(n); + std::vector> array(n); for (int i = 0; i < n; i++) array[i] = std::make_tuple(evals[i], residua[i], std::move(kSpace[i])); switch (spec_type) { case QUDA_SPECTRUM_LM_EIG: std::sort(array.begin(), array.begin() + n, - [](const std::tuple &a, - const std::tuple &b) { + [](const std::tuple &a, + const std::tuple &b) { return 
(abs(std::get<0>(a)) > abs(std::get<0>(b))); }); break; case QUDA_SPECTRUM_SM_EIG: std::sort(array.begin(), array.begin() + n, - [](const std::tuple &a, - const std::tuple &b) { + [](const std::tuple &a, + const std::tuple &b) { return (abs(std::get<0>(a)) < abs(std::get<0>(b))); }); break; case QUDA_SPECTRUM_LR_EIG: std::sort(array.begin(), array.begin() + n, - [](const std::tuple &a, - const std::tuple &b) { + [](const std::tuple &a, + const std::tuple &b) { return (std::get<0>(a).real() > std::get<0>(b).real()); }); break; case QUDA_SPECTRUM_SR_EIG: std::sort(array.begin(), array.begin() + n, - [](const std::tuple &a, - const std::tuple &b) { + [](const std::tuple &a, + const std::tuple &b) { return (std::get<0>(a).real() < std::get<0>(b).real()); }); break; case QUDA_SPECTRUM_LI_EIG: std::sort(array.begin(), array.begin() + n, - [](const std::tuple &a, - const std::tuple &b) { + [](const std::tuple &a, + const std::tuple &b) { return (std::get<0>(a).imag() > std::get<0>(b).imag()); }); break; case QUDA_SPECTRUM_SI_EIG: std::sort(array.begin(), array.begin() + n, - [](const std::tuple &a, - const std::tuple &b) { + [](const std::tuple &a, + const std::tuple &b) { return (std::get<0>(a).imag() < std::get<0>(b).imag()); }); break; @@ -184,13 +186,13 @@ namespace quda } } - void IRAM::qrShifts(const std::vector evals, const int num_shifts) + void IRAM::qrShifts(const std::vector evals, const int num_shifts) { // This isn't really Eigen, but it's morally equivalent profile.TPSTART(QUDA_PROFILE_HOST_COMPUTE); // Reset Q to the identity, copy upper Hessenberg - MatrixXcd UHcopy = MatrixXcd::Zero(n_kr, n_kr); + MatrixXc UHcopy = MatrixXc::Zero(n_kr, n_kr); for (int i = 0; i < n_kr; i++) { for (int j = 0; j < n_kr; j++) { if (i == j) @@ -214,18 +216,18 @@ namespace quda profile.TPSTOP(QUDA_PROFILE_HOST_COMPUTE); } - void IRAM::qrIteration(std::vector> &Q, std::vector> &R) + void IRAM::qrIteration(std::vector> &Q, std::vector> &R) { - Complex T11, T12, T21, T22, U1, U2; 
- double dV; + complex_t T11, T12, T21, T22, U1, U2; + real_t dV; - double tol = eig_param->qr_tol; + real_t tol = eig_param->qr_tol; // Allocate the rotation matrices. - std::vector R11(n_kr - 1, 0.0); - std::vector R12(n_kr - 1, 0.0); - std::vector R21(n_kr - 1, 0.0); - std::vector R22(n_kr - 1, 0.0); + std::vector R11(n_kr - 1, real_t {0.0}); + std::vector R12(n_kr - 1, real_t {0.0}); + std::vector R21(n_kr - 1, real_t {0.0}); + std::vector R22(n_kr - 1, real_t {0.0}); for (int i = 0; i < n_kr - 1; i++) { @@ -263,7 +265,7 @@ namespace quda #pragma omp parallel for schedule(static, 32) #endif for (int j = i + 1; j < n_kr; j++) { - Complex temp = R[i][j]; + complex_t temp = R[i][j]; R[i][j] -= (T11 * temp + T12 * R[i + 1][j]); R[i + 1][j] -= (T21 * temp + T22 * R[i + 1][j]); } @@ -280,7 +282,7 @@ namespace quda #pragma omp for schedule(static, 32) nowait #endif for (int i = 0; i < j + 2; i++) { - Complex temp = R[i][j]; + complex_t temp = R[i][j]; R[i][j] -= (R11[j] * temp + R12[j] * R[i][j + 1]); R[i][j + 1] -= (R21[j] * temp + R22[j] * R[i][j + 1]); } @@ -288,7 +290,7 @@ namespace quda #pragma omp for schedule(static, 32) nowait #endif for (int i = 0; i < n_kr; i++) { - Complex temp = Q[i][j]; + complex_t temp = Q[i][j]; Q[i][j] -= (R11[j] * temp + R12[j] * Q[i][j + 1]); Q[i][j + 1] -= (R21[j] * temp + R22[j] * Q[i][j + 1]); } @@ -299,29 +301,29 @@ namespace quda } } - void IRAM::eigensolveFromUpperHess(std::vector &evals, const double beta) + void IRAM::eigensolveFromUpperHess(std::vector &evals, const real_t beta) { if (eig_param->use_eigen_qr) { profile.TPSTART(QUDA_PROFILE_EIGENQR); // Construct the upper Hessenberg matrix - MatrixXcd Q = MatrixXcd::Identity(n_kr, n_kr); - MatrixXcd R = MatrixXcd::Zero(n_kr, n_kr); + MatrixXc Q = MatrixXc::Identity(n_kr, n_kr); + MatrixXc R = MatrixXc::Zero(n_kr, n_kr); for (int i = 0; i < n_kr; i++) { for (int j = 0; j < n_kr; j++) { R(i, j) = upperHess[i][j]; } } // QR the upper Hessenberg matrix - Eigen::ComplexSchur 
schurUH; + Eigen::ComplexSchur schurUH; schurUH.computeFromHessenberg(R, Q); profile.TPSTOP(QUDA_PROFILE_EIGENQR); profile.TPSTART(QUDA_PROFILE_EIGENEV); // Extract the upper triangular matrix, eigensolve, then // get the eigenvectors of the upper Hessenberg - MatrixXcd matUpper = MatrixXcd::Zero(n_kr, n_kr); + MatrixXc matUpper = MatrixXc::Zero(n_kr, n_kr); matUpper = schurUH.matrixT().triangularView(); matUpper.conservativeResize(n_kr, n_kr); - Eigen::ComplexEigenSolver eigenSolver(matUpper); + Eigen::ComplexEigenSolver eigenSolver(matUpper); Q = schurUH.matrixU() * eigenSolver.eigenvectors(); // Update eigenvalues, residuia, and the Q matrix @@ -346,11 +348,11 @@ namespace quda // This is about as high as one cat get in double without causing // the Arnoldi to compute more restarts. - double tol = eig_param->qr_tol; + real_t tol = eig_param->qr_tol; int max_iter = 100000; int iter = 0; - Complex temp, discriminant, sol1, sol2, eval; + complex_t temp, discriminant, sol1, sol2, eval; for (int i = n_kr - 2; i >= 0; i--) { while (iter < max_iter) { if (abs(Rmat[i + 1][i]) < tol) { @@ -361,11 +363,11 @@ namespace quda // Compute the 2 eigenvalues via the quadratic formula //---------------------------------------------------- // The discriminant - temp = (Rmat[i][i] - Rmat[i + 1][i + 1]) * (Rmat[i][i] - Rmat[i + 1][i + 1]) / 4.0; + temp = (Rmat[i][i] - Rmat[i + 1][i + 1]) * (Rmat[i][i] - Rmat[i + 1][i + 1]) / real_t(4.0); discriminant = sqrt(Rmat[i + 1][i] * Rmat[i][i + 1] + temp); // Reuse temp - temp = (Rmat[i][i] + Rmat[i + 1][i + 1]) / 2.0; + temp = (Rmat[i][i] + Rmat[i + 1][i + 1]) / real_t(2.0); sol1 = temp - Rmat[i + 1][i + 1] + discriminant; sol2 = temp - Rmat[i + 1][i + 1] - discriminant; @@ -392,8 +394,8 @@ namespace quda // Compute the eigevectors of the origial upper Hessenberg // This is now very cheap because the input matrix to Eigen // is upper triangular. 
- MatrixXcd Q = MatrixXcd::Zero(n_kr, n_kr); - MatrixXcd R = MatrixXcd::Zero(n_kr, n_kr); + MatrixXc Q = MatrixXc::Zero(n_kr, n_kr); + MatrixXc R = MatrixXc::Zero(n_kr, n_kr); for (int i = 0; i < n_kr; i++) { for (int j = 0; j < n_kr; j++) { Q(i, j) = Qmat[i][j]; @@ -401,10 +403,10 @@ namespace quda } } - MatrixXcd matUpper = MatrixXcd::Zero(n_kr, n_kr); + MatrixXc matUpper = MatrixXc::Zero(n_kr, n_kr); matUpper = R.triangularView(); matUpper.conservativeResize(n_kr, n_kr); - Eigen::ComplexEigenSolver eigenSolver(matUpper); + Eigen::ComplexEigenSolver eigenSolver(matUpper); Q *= eigenSolver.eigenvectors(); // Update eigenvalues, residuia, and the Q matrix @@ -419,7 +421,7 @@ namespace quda } } - void IRAM::operator()(std::vector &kSpace, std::vector &evals) + void IRAM::operator()(std::vector &kSpace, std::vector &evals) { // Override any user input for block size. block_size = 1; @@ -447,9 +449,9 @@ namespace quda mat(r[0], kSpace[0]); // Convergence criteria - double epsilon = setEpsilon(kSpace[0].Precision()); - double epsilon23 = pow(epsilon, 2.0 / 3.0); - double beta = 0.0; + auto epsilon = setEpsilon(kSpace[0].Precision()); + real_t epsilon23 = pow(epsilon, 2.0 / 3.0); + real_t beta = 0.0; // Print Eigensolver params printEigensolverSetup(); @@ -484,10 +486,11 @@ namespace quda iter_converged = 0; for (int i = 0; i < n_ev; i++) { int idx = n_kr - 1 - i; - double rtemp = std::max(epsilon23, abs(evals[idx])); + auto rtemp = std::max(epsilon23, abs(evals[idx])); if (residua[idx] < tol * rtemp) { iter_converged++; - logQuda(QUDA_DEBUG_VERBOSE, "residuum[%d] = %e, condition = %e\n", i, residua[idx], tol * abs(evals[idx])); + logQuda(QUDA_DEBUG_VERBOSE, "residuum[%d] = %e, condition = %e\n", i, double(residua[idx]), + double(tol * abs(evals[idx]))); } else { // Unlikely to find new converged eigenvalues break; diff --git a/lib/eig_trlm.cpp b/lib/eig_trlm.cpp index 00d3941527..dda2d5c1ca 100644 --- a/lib/eig_trlm.cpp +++ b/lib/eig_trlm.cpp @@ -16,6 +16,9 @@ 
namespace quda { + + using MatrixX = Matrix; + // Thick Restarted Lanczos Method constructor TRLM::TRLM(const DiracMatrix &mat, QudaEigParam *eig_param, TimeProfile &profile) : EigenSolver(mat, eig_param, profile) @@ -37,7 +40,7 @@ namespace quda if (!profile_running) profile.TPSTOP(QUDA_PROFILE_INIT); } - void TRLM::operator()(std::vector &kSpace, std::vector &evals) + void TRLM::operator()(std::vector &kSpace, std::vector &evals) { // Override any user input for block size. block_size = 1; @@ -64,8 +67,8 @@ namespace quda checkChebyOpMax(kSpace); // Convergence and locking criteria - double mat_norm = 0.0; - double epsilon = setEpsilon(kSpace[0].Precision()); + real_t mat_norm = 0.0; + real_t epsilon = setEpsilon(kSpace[0].Precision()); // Print Eigensolver params printEigensolverSetup(); @@ -88,14 +91,14 @@ namespace quda // mat_norm is updated. for (int i = num_locked; i < n_kr; i++) - if (fabs(alpha[i]) > mat_norm) mat_norm = fabs(alpha[i]); + if (abs(alpha[i]) > mat_norm) mat_norm = abs(alpha[i]); // Locking check iter_locked = 0; for (int i = 1; i < (n_kr - num_locked); i++) { if (residua[i + num_locked] < epsilon * mat_norm) { - logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked], - epsilon * mat_norm); + logQuda(QUDA_DEBUG_VERBOSE, "**** Locking %d resid=%+.6e condition=%.6e ****\n", i, double(residua[i + num_locked]), + double(epsilon * mat_norm)); iter_locked = i; } else { // Unlikely to find new locked pairs @@ -107,8 +110,8 @@ namespace quda iter_converged = iter_locked; for (int i = iter_locked + 1; i < n_kr - num_locked; i++) { if (residua[i + num_locked] < tol * mat_norm) { - logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d resid=%+.6e condition=%.6e ****\n", i, residua[i + num_locked], - tol * mat_norm); + logQuda(QUDA_DEBUG_VERBOSE, "**** Converged %d resid=%+.6e condition=%.6e ****\n", i, double(residua[i + num_locked]), + double(tol * mat_norm)); iter_converged = i; } else { // Unlikely to 
find new converged pairs @@ -135,7 +138,7 @@ namespace quda logQuda(QUDA_DEBUG_VERBOSE, "num_keep = %d\n", num_keep); logQuda(QUDA_DEBUG_VERBOSE, "num_locked = %d\n", num_locked); for (int i = 0; i < n_kr; i++) { - logQuda(QUDA_DEBUG_VERBOSE, "Ritz[%d] = %.16e residual[%d] = %.16e\n", i, alpha[i], i, residua[i]); + logQuda(QUDA_DEBUG_VERBOSE, "Ritz[%d] = %.16e residual[%d] = %.16e\n", i, double(alpha[i]), i, double(residua[i])); } // Check for convergence @@ -167,7 +170,7 @@ namespace quda // Dump all Ritz values and residua if using Chebyshev for (int i = 0; i < n_conv && eig_param->use_poly_acc; i++) { - logQuda(QUDA_SUMMARIZE, "RitzValue[%04d]: (%+.16e, %+.16e) residual %.16e\n", i, alpha[i], 0.0, residua[i]); + logQuda(QUDA_SUMMARIZE, "RitzValue[%04d]: (%+.16e, %+.16e) residual %.16e\n", i, double(alpha[i]), 0.0, double(residua[i])); } // Compute eigenvalues/singular values @@ -198,7 +201,7 @@ namespace quda int start = (j > num_keep) ? j - 1 : 0; if (j - start > 0) { - std::vector beta_ = {beta.begin() + start, beta.begin() + j}; + std::vector beta_ = {beta.begin() + start, beta.begin() + j}; for (auto & bi : beta_) bi = -bi; // r = r - b_{j-1} * v_{j-1} @@ -250,7 +253,7 @@ namespace quda int arrow_pos = num_keep - num_locked; // Eigen objects - MatrixXd A = MatrixXd::Zero(dim, dim); + MatrixX A = MatrixX::Zero(dim, dim); ritz_mat.resize(dim * dim, 0.0); // Invert the spectrum due to chebyshev @@ -284,7 +287,7 @@ namespace quda } // Eigensolve the arrow matrix - SelfAdjointEigenSolver eigensolver; + SelfAdjointEigenSolver eigensolver; eigensolver.compute(A); // repopulate ritz matrix @@ -292,7 +295,7 @@ namespace quda for (int j = 0; j < dim; j++) ritz_mat[dim * i + j] = eigensolver.eigenvectors().col(i)[j]; for (int i = 0; i < dim; i++) { - residua[i + num_locked] = fabs(beta[n_kr - 1] * eigensolver.eigenvectors().col(i)[dim - 1]); + residua[i + num_locked] = abs(beta[n_kr - 1] * eigensolver.eigenvectors().col(i)[dim - 1]); // Update the alpha array alpha[i + 
num_locked] = eigensolver.eigenvalues()[i]; } @@ -311,7 +314,7 @@ namespace quda int dim = n_kr - num_locked; // Multi-BLAS friendly array to store part of Ritz matrix we want - std::vector ritz_mat_keep(dim * iter_keep); + std::vector ritz_mat_keep(dim * iter_keep); for (int j = 0; j < dim; j++) { for (int i = 0; i < iter_keep; i++) { ritz_mat_keep[j * iter_keep + i] = ritz_mat[i * dim + j]; } } diff --git a/lib/eigensolve_quda.cpp b/lib/eigensolve_quda.cpp index afa869745b..ff63a886c8 100644 --- a/lib/eigensolve_quda.cpp +++ b/lib/eigensolve_quda.cpp @@ -20,6 +20,9 @@ namespace quda { + using MatrixX = Matrix; + using MatrixXc = Matrix; + // Eigensolver class //----------------------------------------------------------------------------- EigenSolver::EigenSolver(const DiracMatrix &mat, QudaEigParam *eig_param, TimeProfile &profile) : @@ -179,7 +182,7 @@ namespace quda if (eig_param->use_poly_acc) { if (eig_param->a_max <= 0.0) { // Use part of the kSpace as temps - eig_param->a_max = estimateChebyOpMax(kSpace[block_size + 2], kSpace[block_size + 1]); + eig_param->a_max = double(estimateChebyOpMax(kSpace[block_size + 2], kSpace[block_size + 1])); logQuda(QUDA_SUMMARIZE, "Chebyshev maximum estimate: %e.\n", eig_param->a_max); } if (eig_param->a_min >= eig_param->a_max) @@ -187,11 +190,11 @@ namespace quda } } - void EigenSolver::prepareKrylovSpace(std::vector &kSpace, std::vector &evals) + void EigenSolver::prepareKrylovSpace(std::vector &kSpace, std::vector &evals) { resize(kSpace, n_kr + block_size, QUDA_ZERO_FIELD_CREATE); // increase Krylov space to n_kr + block_size resize(r, block_size, QUDA_ZERO_FIELD_CREATE, kSpace[0]); // create residual - evals.resize(n_kr, 0.0); // increase evals space to n_ev + evals.resize(n_kr, real_t(0.0)); // increase evals space to n_ev } void EigenSolver::printEigensolverSetup() @@ -201,7 +204,7 @@ namespace quda logQuda(QUDA_SUMMARIZE, "********************************\n"); logQuda(QUDA_VERBOSE, "spectrum %s\n", spectrum); - 
logQuda(QUDA_VERBOSE, "tol %.4e\n", tol); + logQuda(QUDA_VERBOSE, "tol %.4e\n", double(tol)); logQuda(QUDA_VERBOSE, "n_conv %d\n", n_conv); logQuda(QUDA_VERBOSE, "n_ev %d\n", n_ev); logQuda(QUDA_VERBOSE, "n_kr %d\n", n_kr); @@ -238,7 +241,7 @@ namespace quda } } - void EigenSolver::cleanUpEigensolver(std::vector &kSpace, std::vector &evals) + void EigenSolver::cleanUpEigensolver(std::vector &kSpace, std::vector &evals) { r.clear(); @@ -278,15 +281,15 @@ namespace quda if (eig_param->poly_deg == 0) errorQuda("Polynomial acceleration requested with zero polynomial degree"); // Compute the polynomial accelerated operator. - double a = eig_param->a_min; - double b = eig_param->a_max; - double delta = (b - a) / 2.0; - double theta = (b + a) / 2.0; - double sigma1 = -delta / theta; - double sigma; - double d1 = sigma1 / delta; - double d2 = 1.0; - double d3; + real_t a = eig_param->a_min; + real_t b = eig_param->a_max; + real_t delta = (b - a) / 2.0; + real_t theta = (b + a) / 2.0; + real_t sigma1 = -delta / theta; + real_t sigma; + real_t d1 = sigma1 / delta; + real_t d2 = 1.0; + real_t d3; // out = d2 * in + d1 * out // C_1(x) = x @@ -306,7 +309,7 @@ namespace quda // Using Chebyshev polynomial recursion relation, // C_{m+1}(x) = 2*x*C_{m} - C_{m-1} - double sigma_old = sigma1; + real_t sigma_old = sigma1; // construct C_{m+1}(x) for (int i = 2; i < eig_param->poly_deg; i++) { @@ -330,13 +333,13 @@ namespace quda for (auto i = 0u; i < in.size(); i++) std::swap(out[i], tmp2[i]); } - double EigenSolver::estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in) + real_t EigenSolver::estimateChebyOpMax(ColorSpinorField &out, ColorSpinorField &in) { RNG rng(in, 1234); spinorNoise(in, rng, QUDA_NOISE_UNIFORM); // Power iteration - double norm = 0.0; + real_t norm = 0.0; for (int i = 0; i < 100; i++) { if ((i + 1) % 10 == 0) { norm = sqrt(blas::norm2(in)); @@ -347,7 +350,7 @@ namespace quda } // Compute spectral radius estimate - double result = blas::reDotProduct(out, 
in); + auto result = blas::reDotProduct(out, in); // Increase final result by 10% for safety return result * 1.10; @@ -356,25 +359,26 @@ namespace quda bool EigenSolver::orthoCheck(std::vector &vecs, int size) { bool orthed = true; - const Complex Unit(1.0, 0.0); + const complex_t Unit(1.0, 0.0); - std::vector H(size * size); + std::vector H(size * size); blas::hDotProduct(H, {vecs.begin(), vecs.begin() + size}, {vecs.begin(), vecs.begin() + size}); - double epsilon = setEpsilon(vecs[0].Precision()); + real_t epsilon = setEpsilon(vecs[0].Precision()); for (int i = 0; i < size; i++) { for (int j = 0; j < size; j++) { auto cnorm = H[i * size + j]; if (j != i) { if (abs(cnorm) > 5.0 * epsilon) { - logQuda(QUDA_SUMMARIZE, "Norm <%d|%d>^2 = ||(%e,%e)|| = %e\n", i, j, cnorm.real(), cnorm.imag(), abs(cnorm)); + logQuda(QUDA_SUMMARIZE, "Norm <%d|%d>^2 = ||(%e,%e)|| = %e\n", i, j, + double(cnorm.real()), double(cnorm.imag()), double(abs(cnorm))); orthed = false; } } else { if (abs(Unit - cnorm) > 5.0 * epsilon) { - logQuda(QUDA_SUMMARIZE, "1 - Norm <%d|%d>^2 = 1 - ||(%e,%e)|| = %e\n", i, j, cnorm.real(), cnorm.imag(), - abs(Unit - cnorm)); + logQuda(QUDA_SUMMARIZE, "1 - Norm <%d|%d>^2 = 1 - ||(%e,%e)|| = %e\n", i, j, double(cnorm.real()), double(cnorm.imag()), + double(abs(Unit - cnorm))); orthed = false; } } @@ -393,12 +397,12 @@ namespace quda if (i - j < h_block_size) array_size = i - j; logQuda(QUDA_DEBUG_VERBOSE, "Current block size = %d\n", array_size); - std::vector s(array_size); + std::vector s(array_size); blas::cDotProduct(s, {vecs.begin() + j, vecs.begin() + j + array_size}, vecs[i]); // with i normalised. 
for (auto k = 0; k < array_size; k++) s[k] *= -1.0; blas::caxpy(s, {vecs.begin() + j, vecs.begin() + j + array_size}, vecs[i]); // i = i - proj_{j}(i) = i - * j } - double norm = sqrt(blas::norm2(vecs[i])); + real_t norm = sqrt(blas::norm2(vecs[i])); blas::ax(1.0 / norm, vecs[i]); // i/ } } @@ -413,7 +417,7 @@ namespace quda auto array_size = block_array_size * rvecs.size(); logQuda(QUDA_DEBUG_VERBOSE, "Current block array size = %d\n", block_array_size); - std::vector s(array_size); + std::vector s(array_size); // Block dot products stored in s. blas::cDotProduct(s, {vecs.begin() + j, vecs.begin() + j + block_array_size}, {rvecs.begin(), rvecs.end()}); @@ -504,7 +508,7 @@ namespace quda } } - void EigenSolver::computeSVD(std::vector &evecs, std::vector &evals) + void EigenSolver::computeSVD(std::vector &evecs, std::vector &evals) { logQuda(QUDA_SUMMARIZE, "Computing SVD of M\n"); @@ -512,7 +516,7 @@ namespace quda if (evecs.size() < (unsigned int)(2 * n_conv)) errorQuda("Incorrect deflation space sized %d passed to computeSVD, expected %d", (int)(evecs.size()), 2 * n_conv); - std::vector sigma_tmp(n_conv); + std::vector sigma_tmp(n_conv); for (int i = 0; i < n_conv; i++) { @@ -527,7 +531,7 @@ namespace quda //-------------------------------------------------------------------------- // Lambda already contains the square root of the eigenvalue of the norm op. 
- Complex lambda = evals[i]; + complex_t lambda = evals[i]; // M*Rev_i = M*Rsv_i = sigma_i Lsv_i mat.Expose()->M(evecs[n_conv + i], evecs[i]); @@ -538,8 +542,8 @@ namespace quda // Normalise the Lsv: sigma_i Lsv_i -> Lsv_i blas::ax(1.0 / sigma_tmp[i], evecs[n_conv + i]); - logQuda(QUDA_SUMMARIZE, "Sval[%04d] = %+.16e sigma - sqrt(|lambda|) = %+.16e\n", i, sigma_tmp[i], - sigma_tmp[i] - sqrt(abs(lambda.real()))); + logQuda(QUDA_SUMMARIZE, "Sval[%04d] = %+.16e sigma - sqrt(|lambda|) = %+.16e\n", i, double(sigma_tmp[i]), + double(sigma_tmp[i] - sqrt(abs(lambda.real())))); evals[i] = sigma_tmp[i]; //-------------------------------------------------------------------------- @@ -548,7 +552,7 @@ namespace quda // Deflate vec, place result in vec_defl void EigenSolver::deflateSVD(cvector_ref &sol, cvector_ref &src, - cvector_ref &evecs, const std::vector &evals, + cvector_ref &evecs, const std::vector &evals, bool accumulate) const { // number of evecs @@ -568,7 +572,7 @@ namespace quda // for all i computed eigenvectors and values. // 1. 
Take block inner product: L_i^dag * vec = A_i - std::vector s(n_defl * src.size()); + std::vector s(n_defl * src.size()); blas::cDotProduct(s, {evecs.begin() + eig_param->n_conv, evecs.begin() + eig_param->n_conv + n_defl}, {src.begin(), src.end()}); @@ -582,7 +586,7 @@ namespace quda } void EigenSolver::computeEvals(std::vector &evecs, - std::vector &evals, int size) + std::vector &evals, int size) { if (size > (int)evecs.size()) errorQuda("Requesting %d eigenvectors with only storage allocated for %lu", size, evecs.size()); @@ -600,21 +604,21 @@ namespace quda // lambda_i = v_i^dag A v_i / (v_i^dag * v_i) evals[i] = blas::cDotProduct(evecs[i], temp) / sqrt(blas::norm2(evecs[i])); // Measure ||lambda_i*v_i - A*v_i|| - Complex n_unit(-1.0, 0.0); + complex_t n_unit(-1.0, 0.0); blas::caxpby(evals[i], evecs[i], n_unit, temp); residua[i] = sqrt(blas::norm2(temp)); // eig_param->invert_param->true_res_offset[i] = residua[i]; // If size = n_conv, this routine is called post sort if (size == n_conv) - logQuda(QUDA_SUMMARIZE, "Eval[%04d] = (%+.16e,%+.16e) ||%+.16e|| Residual = %+.16e\n", i, evals[i].real(), - evals[i].imag(), abs(evals[i]), residua[i]); + logQuda(QUDA_SUMMARIZE, "Eval[%04d] = (%+.16e,%+.16e) ||%+.16e|| Residual = %+.16e\n", i, double(evals[i].real()), + double(evals[i].imag()), double(abs(evals[i])), double(residua[i])); } } // Deflate vec, place result in vec_defl void EigenSolver::deflate(cvector_ref &sol, cvector_ref &src, - cvector_ref &evecs, const std::vector &evals, + cvector_ref &evecs, const std::vector &evals, bool accumulate) const { // number of evecs @@ -630,7 +634,7 @@ namespace quda // for all i computed eigenvectors and values. // 1. Take block inner product: (V_i)^dag * vec = A_i - std::vector s(n_defl * src.size()); + std::vector s(n_defl * src.size()); blas::cDotProduct(s, {evecs.begin(), evecs.begin() + n_defl}, {src.begin(), src.end()}); // 2. 
Perform block caxpy: V_i * (L_i)^{-1} * A_i @@ -643,7 +647,7 @@ namespace quda } void EigenSolver::loadFromFile(std::vector &kSpace, - std::vector &evals) + std::vector &evals) { // Set suggested parity of fields const QudaParity mat_parity = impliedParityFromMatPC(mat.getMatPCType()); @@ -663,7 +667,7 @@ namespace quda computeEvals(kSpace, evals); } - void EigenSolver::sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y) + void EigenSolver::sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y) { if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { switch (spec_type) { @@ -677,43 +681,43 @@ namespace quda } } - std::vector> array(n); + std::vector> array(n); for (int i = 0; i < n; i++) array[i] = std::make_pair(x[i], y[i]); switch (spec_type) { case QUDA_SPECTRUM_LM_EIG: std::sort(array.begin(), array.begin() + n, - [](const std::pair &a, const std::pair &b) { + [](const std::pair &a, const std::pair &b) { return (abs(a.first) < abs(b.first)); }); break; case QUDA_SPECTRUM_SM_EIG: std::sort(array.begin(), array.begin() + n, - [](const std::pair &a, const std::pair &b) { + [](const std::pair &a, const std::pair &b) { return (abs(a.first) > abs(b.first)); }); break; case QUDA_SPECTRUM_LR_EIG: std::sort(array.begin(), array.begin() + n, - [](const std::pair &a, const std::pair &b) { + [](const std::pair &a, const std::pair &b) { return (a.first).real() < (b.first).real(); }); break; case QUDA_SPECTRUM_SR_EIG: std::sort(array.begin(), array.begin() + n, - [](const std::pair &a, const std::pair &b) { + [](const std::pair &a, const std::pair &b) { return (a.first).real() > (b.first).real(); }); break; case QUDA_SPECTRUM_LI_EIG: std::sort(array.begin(), array.begin() + n, - [](const std::pair &a, const std::pair &b) { + [](const std::pair &a, const std::pair &b) { return (a.first).imag() < (b.first).imag(); }); break; case QUDA_SPECTRUM_SI_EIG: std::sort(array.begin(), array.begin() + n, - [](const std::pair &a, const 
std::pair &b) { + [](const std::pair &a, const std::pair &b) { return (a.first).imag() > (b.first).imag(); }); break; @@ -728,28 +732,28 @@ namespace quda } // Overloaded version of sortArrays to deal with real y array. - void EigenSolver::sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y) + void EigenSolver::sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y) { - std::vector y_tmp(n, 0.0); + std::vector y_tmp(n, real_t(0.0)); for (int i = 0; i < n; i++) y_tmp[i].real(y[i]); sortArrays(spec_type, n, x, y_tmp); for (int i = 0; i < n; i++) y[i] = y_tmp[i].real(); } // Overloaded version of sortArrays to deal with real x array. - void EigenSolver::sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y) + void EigenSolver::sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y) { - std::vector x_tmp(n, 0.0); + std::vector x_tmp(n, real_t(0.0)); for (int i = 0; i < n; i++) x_tmp[i].real(x[i]); sortArrays(spec_type, n, x_tmp, y); for (int i = 0; i < n; i++) x[i] = x_tmp[i].real(); } // Overloaded version of sortArrays to deal with real x and y array. - void EigenSolver::sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y) + void EigenSolver::sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y) { - std::vector x_tmp(n, 0.0); - std::vector y_tmp(n, 0.0); + std::vector x_tmp(n, real_t(0.0)); + std::vector y_tmp(n, real_t(0.0)); for (int i = 0; i < n; i++) { x_tmp[i].real(x[i]); y_tmp[i].real(y[i]); @@ -762,12 +766,12 @@ namespace quda } // Overloaded version of sortArrays to deal with complex x and integer y array. 
- void EigenSolver::sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y) + void EigenSolver::sortArrays(QudaEigSpectrumType spec_type, int n, std::vector &x, std::vector &y) { - std::vector y_tmp(n, 0.0); + std::vector y_tmp(n, real_t(0.0)); for (int i = 0; i < n; i++) y_tmp[i].real(y[i]); sortArrays(spec_type, n, x, y_tmp); - for (int i = 0; i < n; i++) y[i] = (int)(y_tmp[i].real()); + for (int i = 0; i < n; i++) y[i] = int(y_tmp[i].real()); } /** @@ -775,8 +779,8 @@ namespace quda arithmetic (real or complex) */ template struct eigen_matrix_map; - template <> struct eigen_matrix_map { using type = MatrixXd; }; - template <> struct eigen_matrix_map { using type = MatrixXcd; }; + template <> struct eigen_matrix_map { using type = MatrixX; }; + template <> struct eigen_matrix_map { using type = MatrixXc; }; template using eigen_matrix_t = typename eigen_matrix_map::type; template @@ -905,10 +909,10 @@ namespace quda } } - template void EigenSolver::rotateVecs(std::vector &kSpace, const std::vector &rot_array, + template void EigenSolver::rotateVecs(std::vector &kSpace, const std::vector &rot_array, int offset, int dim, int keep, int locked, TimeProfile &profile); - template void EigenSolver::rotateVecs(std::vector &kSpace, const std::vector &rot_array, - int offset, int dim, int keep, int locked, TimeProfile &profile); + template void EigenSolver::rotateVecs(std::vector &kSpace, const std::vector &rot_array, + int offset, int dim, int keep, int locked, TimeProfile &profile); } // namespace quda diff --git a/lib/gauge_loop_trace.cu b/lib/gauge_loop_trace.cu index 0b1af50ba4..db3acb561a 100644 --- a/lib/gauge_loop_trace.cu +++ b/lib/gauge_loop_trace.cu @@ -52,7 +52,7 @@ namespace quda { } }; - void gaugeLoopTrace(const GaugeField& u, std::vector& loop_traces, double factor, std::vector& input_path, + void gaugeLoopTrace(const GaugeField& u, std::vector& loop_traces, double factor, std::vector& input_path, std::vector& length, std::vector& 
path_coeff, int num_paths, int path_max_length) { getProfile().TPSTART(QUDA_PROFILE_COMPUTE); @@ -63,7 +63,7 @@ namespace quda { // gauge field must be passed as first argument so we peel off its reconstruct type instantiate(u, tr_array, factor, p); - for (auto i = 0u; i < tr_array.size(); i++) { loop_traces[i] = Complex(tr_array[i][0], tr_array[i][1]); } + for (auto i = 0u; i < tr_array.size(); i++) { loop_traces[i] = complex_t(tr_array[i][0], tr_array[i][1]); } p.free(); getProfile().TPSTOP(QUDA_PROFILE_COMPUTE); diff --git a/lib/gauge_observable.cpp b/lib/gauge_observable.cpp index 041dc6164d..2f21828bd9 100644 --- a/lib/gauge_observable.cpp +++ b/lib/gauge_observable.cpp @@ -41,13 +41,13 @@ namespace quda for (int d = 0; d < 1; d++) { input_path_v[d] = param.input_path_buff; } // prepare trace storage - std::vector loop_traces(param.num_paths); + std::vector loop_traces(param.num_paths); // actually do the computation gaugeLoopTrace(u, loop_traces, param.factor, input_path_v, path_length_v, loop_coeff_v, param.num_paths, param.max_length); - for (int i = 0; i < param.num_paths; i++) { memcpy(param.traces + i, &loop_traces[i], sizeof(Complex)); } + for (int i = 0; i < param.num_paths; i++) { memcpy(param.traces + i, &loop_traces[i], sizeof(complex_t)); } } // no point constructing Fmunu unless we are going to use it diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 17b6bd4391..ff3c6f4a24 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -1477,11 +1477,11 @@ namespace quda { } diracParam.type = pc ? 
QUDA_MOBIUS_DOMAIN_WALLPC_EOFA_DIRAC : QUDA_MOBIUS_DOMAIN_WALL_EOFA_DIRAC; diracParam.Ls = inv_param->Ls; - if (sizeof(Complex) != sizeof(double _Complex)) { + if (sizeof(complex_t) != sizeof(double _Complex)) { errorQuda("Irreconcilable difference between interface and internal complex number conventions"); } - memcpy(diracParam.b_5, inv_param->b_5, sizeof(Complex) * inv_param->Ls); - memcpy(diracParam.c_5, inv_param->c_5, sizeof(Complex) * inv_param->Ls); + memcpy(diracParam.b_5, inv_param->b_5, sizeof(complex_t) * inv_param->Ls); + memcpy(diracParam.c_5, inv_param->c_5, sizeof(complex_t) * inv_param->Ls); diracParam.eofa_shift = inv_param->eofa_shift; diracParam.eofa_pm = inv_param->eofa_pm; diracParam.mq1 = inv_param->mq1; @@ -1493,11 +1493,11 @@ namespace quda { errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS); diracParam.type = pc ? QUDA_MOBIUS_DOMAIN_WALLPC_DIRAC : QUDA_MOBIUS_DOMAIN_WALL_DIRAC; diracParam.Ls = inv_param->Ls; - if (sizeof(Complex) != sizeof(double _Complex)) { + if (sizeof(complex_t) != sizeof(double _Complex)) { errorQuda("Irreconcilable difference between interface and internal complex number conventions"); } - memcpy(diracParam.b_5, inv_param->b_5, sizeof(Complex) * inv_param->Ls); - memcpy(diracParam.c_5, inv_param->c_5, sizeof(Complex) * inv_param->Ls); + memcpy(diracParam.b_5, inv_param->b_5, sizeof(complex_t) * inv_param->Ls); + memcpy(diracParam.c_5, inv_param->c_5, sizeof(complex_t) * inv_param->Ls); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Printing b_5 and c_5 values\n"); for (int i = 0; i < diracParam.Ls; i++) { @@ -2276,7 +2276,7 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam } // Simple vector for eigenvalues. - std::vector evals(eig_param->n_conv, 0.0); + std::vector evals(eig_param->n_conv, 0.0); //------------------------------------------------------ // Sanity checks for operator/eigensolver compatibility. 
@@ -2336,7 +2336,7 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam // Transfer Eigenpairs back to host if using GPU eigensolver. The copy // will automatically rotate from device UKQCD gamma basis to the // host side gamma basis. - for (int i = 0; i < eig_param->n_conv; i++) { memcpy(host_evals + i, &evals[i], sizeof(Complex)); } + for (int i = 0; i < eig_param->n_conv; i++) { memcpy(host_evals + i, &evals[i], sizeof(complex_t)); } if (!(eig_param->arpack_check)) { for (int i = 0; i < n_eig; i++) host_evecs_[i] = kSpace[i]; } @@ -3009,7 +3009,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE); if (param->compute_action) { - Complex action = blas::cDotProduct(b, x); + auto action = blas::cDotProduct(b, x); param->action[0] = action.real(); param->action[1] = action.imag(); } @@ -3751,7 +3751,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) for (int i = 0; i < param->num_offset; i++) param->offset[i] = unscaled_shifts[i]; if (param->compute_action) { - Complex action(0); + complex_t action(0); for (int i = 0; i < param->num_offset; i++) action += param->residue[i] * blas::cDotProduct(b, x[i]); param->action[0] = action.real(); param->action[1] = action.imag(); diff --git a/lib/inv_bicgstab_quda.cpp b/lib/inv_bicgstab_quda.cpp index e906c2d463..df1f089977 100644 --- a/lib/inv_bicgstab_quda.cpp +++ b/lib/inv_bicgstab_quda.cpp @@ -196,7 +196,7 @@ namespace quda { const bool use_heavy_quark_res = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false; - double heavy_quark_res = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x,r).z) : 0.0; + real_t heavy_quark_res = use_heavy_quark_res ? 
sqrt(blas::HeavyQuarkResidualNorm(x,r)[2]) : real_t(0.0); const int heavy_quark_check = param.heavy_quark_check; // how often to check the heavy quark residual double delta = param.delta; @@ -204,14 +204,14 @@ namespace quda { int k = 0; int rUpdate = 0; - Complex rho(1.0, 0.0); - Complex rho0 = rho; - Complex alpha(1.0, 0.0); - Complex omega(1.0, 0.0); - Complex beta; + complex_t rho(1.0, 0.0); + complex_t rho0 = rho; + complex_t alpha(1.0, 0.0); + complex_t omega(1.0, 0.0); + complex_t beta; - double3 rho_r2; - double3 omega_t2; + array rho_r2; + array omega_t2; double rNorm = sqrt(r2); //double r0Norm = rNorm; @@ -240,7 +240,7 @@ namespace quda { matSloppy(v, p); - Complex r0v; + complex_t r0v; if (param.pipeline) { r0v = blas::cDotProduct(r0, v); if (k>0) rho = blas::cDotProduct(r0, r); @@ -259,11 +259,11 @@ namespace quda { if (param.pipeline) { // omega = (t, r) / (t, t) omega_t2 = blas::cDotProductNormA(t, rSloppy); - Complex tr = Complex(omega_t2.x, omega_t2.y); - double t2 = omega_t2.z; + complex_t tr = complex_t(omega_t2[0], omega_t2[1]); + real_t t2 = omega_t2[2]; omega = tr / t2; - double s2 = blas::norm2(rSloppy); - Complex r0t = blas::cDotProduct(r0, t); + real_t s2 = blas::norm2(rSloppy); + complex_t r0t = blas::cDotProduct(r0, t); beta = -r0t / r0v; r2 = s2 - real(omega * conj(tr)) ; @@ -272,7 +272,7 @@ namespace quda { } else { // omega = (t, r) / (t, t) omega_t2 = blas::cDotProductNormA(t, rSloppy); - omega = Complex(omega_t2.x / omega_t2.z, omega_t2.y / omega_t2.z); + omega = complex_t(omega_t2[0] / omega_t2[2], omega_t2[1] / omega_t2[2]); } if (param.pipeline && !updateR) { @@ -284,17 +284,17 @@ namespace quda { //x += alpha*p + omega*r, r -= omega*t, r2 = (r,r), rho = (r0, r) rho_r2 = blas::caxpbypzYmbwcDotProductUYNormY(alpha, p, omega, rSloppy, xSloppy, t, r0); rho0 = rho; - rho = Complex(rho_r2.x, rho_r2.y); - r2 = rho_r2.z; + rho = complex_t(rho_r2[0], rho_r2[1]); + r2 = rho_r2[2]; } if (use_heavy_quark_res && k%heavy_quark_check==0) { if 
(&x != &xSloppy) { blas::copy(tmp,y); - heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(xSloppy, tmp, rSloppy).z); + heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(xSloppy, tmp, rSloppy)[2]); } else { blas::copy(r, rSloppy); - heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(x, y, r).z); + heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(x, y, r)[2]); } } @@ -364,7 +364,7 @@ namespace quda { // Calculate the true residual mat(r, x); param.true_res = sqrt(blas::xmyNorm(b, r) / b2); - param.true_res_hq = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x,r).z) : 0.0; + param.true_res_hq = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x,r)[2]) : 0.0; PrintSummary("BiCGstab", k, r2, b2, stop, param.tol_hq); } diff --git a/lib/inv_bicgstabl_quda.cpp b/lib/inv_bicgstabl_quda.cpp index 0393fe308c..219e2f8897 100644 --- a/lib/inv_bicgstabl_quda.cpp +++ b/lib/inv_bicgstabl_quda.cpp @@ -26,8 +26,8 @@ namespace quda { // Compute the MR portion of BiCGstab-L void BiCGstabL::computeMR(ColorSpinorField &x_sloppy, bool fixed_iteration) { - using matrix = Matrix, Dynamic, Dynamic, RowMajor>; - using vector = Matrix, Dynamic, 1>; + using matrix = Matrix, Dynamic, Dynamic, RowMajor>; + using vector = Matrix, Dynamic, 1>; // Compute gamma: minimize ||r - R \gamma||, where R is an L x R matrix // of r_1, r_2, ... 
@@ -37,7 +37,7 @@ namespace quda { auto r_dagger_vec = {r.begin() + 1, r.begin() + n_krylov + 1}; auto r_vec = {r.begin(), r.begin() + n_krylov + 1}; - std::vector r_dagger_dot_r((n_krylov + 1) * n_krylov); + std::vector r_dagger_dot_r((n_krylov + 1) * n_krylov); blas::cDotProduct(r_dagger_dot_r, r_dagger_vec, r_vec) ; matrix R_dag_R(n_krylov, n_krylov); @@ -68,7 +68,7 @@ namespace quda { // Update omega for the next BiCG iteration omega = gamma(n_krylov - 1); - std::vector gamma_(n_krylov); + std::vector gamma_(n_krylov); if (!fixed_iteration) { @@ -85,11 +85,11 @@ namespace quda { // r = r[0] - \sum_{j=1}^L \gamma_L r_L // we see that if we're a bit clever with zero padding we can // reuse r between the two updates. - std::vector gamma_for_x(n_krylov + 1); + std::vector gamma_for_x(n_krylov + 1); for (int i = 0; i < n_krylov; i++) gamma_for_x[i] = gamma(i); gamma_for_x[n_krylov] = 0; // do not want to update with last r - std::vector gamma_for_r(n_krylov + 1); + std::vector gamma_for_r(n_krylov + 1); gamma_for_r[0] = 0; // do not want to update with first r for (int i = 0; i < n_krylov; i++) gamma_for_r[i + 1] = -gamma(i); @@ -125,7 +125,7 @@ namespace quda { // Big change is we need to go from 1 to n_krylov, not 0 to n_krylov-1. 
void BiCGstabL::computeTau(int begin, int size, int j) { - std::vector Tau(size); + std::vector Tau(size); blas::cDotProduct(Tau, {r.begin() + begin, r.begin() + begin + size}, r[j]); // vectorized dot product for (int k = 0; k < size; k++) { tau[(begin + k) * (n_krylov + 1) + j] = Tau[k] / sigma[begin + k]; } @@ -133,7 +133,7 @@ namespace quda { void BiCGstabL::updateR(int begin, int size, int j) { - std::vector tau_(size); + std::vector tau_(size); for (int i = 0; i < size; i++) { tau_[i] = -tau[(i + begin) * (n_krylov + 1) + j]; } auto r_ = {r.begin() + begin, r.begin() + begin + size}; @@ -201,9 +201,9 @@ namespace quda { // sigma_j = r_j^2, gamma'_j = /sigma_j // rjr.x = Re(), rjr.z = - double3 rjr = blas::cDotProductNormA(r[j], r[0]); - sigma[j] = rjr.z; - gamma_prime[j] = Complex(rjr.x, rjr.y) / sigma[j]; + auto rjr = blas::cDotProductNormA(r[j], r[0]); + sigma[j] = rjr[2]; + gamma_prime[j] = complex_t(rjr[0], rjr[1]) / sigma[j]; } // gamma[n_krylov] = gamma'[n_krylov], omega = gamma[n_krylov] @@ -230,7 +230,7 @@ namespace quda { // Update U { - std::vector gamma_(n_krylov); + std::vector gamma_(n_krylov); for (int i = 0; i < n_krylov; i++) { gamma_[i] = -gamma[i + 1]; } blas::caxpy(gamma_, {u.begin() + 1, u.end()}, u[0]); } @@ -241,8 +241,8 @@ namespace quda { // the alternative way would be un-fusing some calls, which would require // loading and saving x twice. In a solve where the sloppy precision is lower than // the full precision, this can be a killer. - std::vector gamma_prime_prime_(n_krylov + 1); - std::vector gamma_prime_(n_krylov + 1); + std::vector gamma_prime_prime_(n_krylov + 1); + std::vector gamma_prime_(n_krylov + 1); gamma_prime_prime_[0] = gamma[1]; gamma_prime_prime_[n_krylov] = 0.0; // x never gets updated with r[n_krylov] gamma_prime_[0] = 0.0; // r[0] never gets updated with r[0]... obvs. 
@@ -279,8 +279,8 @@ namespace quda { std::vector &r; std::vector &u; - Complex &alpha; - Complex &beta; + complex_t &alpha; + complex_t &beta; BiCGstabLUpdateType update_type; @@ -299,7 +299,7 @@ namespace quda { public: BiCGstabLUpdate(ColorSpinorField &x, std::vector &r, std::vector &u, - Complex &alpha, Complex &beta, BiCGstabLUpdateType update_type, int j_max, int n_update) : + complex_t &alpha, complex_t &beta, BiCGstabLUpdateType update_type, int j_max, int n_update) : x(x), r(r), u(u), alpha(alpha), beta(beta), update_type(update_type), j_max(j_max), n_update(n_update) { @@ -319,7 +319,7 @@ namespace quda { if (update_type == BICGSTABL_UPDATE_U) { for (int i = (count * j_max) / n_update; i < ((count + 1) * j_max) / n_update && i < j_max; i++) { - blas::caxpby(1.0, r[i], -beta, u[i]); + blas::caxpby(real_t(1.0), r[i], -beta, u[i]); } } else // (update_type == BICGSTABL_UPDATE_R) @@ -453,8 +453,8 @@ namespace quda { // compute b2, but only if we need to bool fixed_iteration = param.sloppy_converge && n_krylov == param.maxiter && !param.compute_true_res; - double b2 = !fixed_iteration ? blas::norm2(b) : 1.0; // norm sq of source. - double r2; // norm sq of residual + real_t b2 = !fixed_iteration ? blas::norm2(b) : real_t(1.0); // norm sq of source. + real_t r2; // norm sq of residual if (param.deflate) { // Construct the eigensolver and deflation space if requested. @@ -559,7 +559,7 @@ namespace quda { const bool use_heavy_quark_res = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false; - double heavy_quark_res = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x, r_full).z) : 0.0; + real_t heavy_quark_res = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x, r_full)[2]) : real_t(0.0); const int heavy_quark_check = param.heavy_quark_check; // how often to check the heavy quark residual blas::flops = 0; @@ -580,10 +580,10 @@ namespace quda { // Various variables related to reliable updates. 
- double delta = param.delta; // delta for reliable updates. - double rNorm = sqrt(r2); // The current residual norm. - double maxrr = rNorm; // The maximum residual norm since the last reliable update. - double maxrx = rNorm; // The same. Would be different if we did 'x' reliable updates. + real_t delta = param.delta; // delta for reliable updates. + real_t rNorm = sqrt(r2); // The current residual norm. + real_t maxrr = rNorm; // The maximum residual norm since the last reliable update. + real_t maxrx = rNorm; // The same. Would be different if we did 'x' reliable updates. PrintStats(solver_name.c_str(), total_iter, r2, b2, heavy_quark_res); while (!convergence(r2, 0.0, stop, 0.0) && total_iter < param.maxiter) { @@ -651,10 +651,10 @@ namespace quda { if (use_heavy_quark_res && total_iter % heavy_quark_check == 0) { if (&x != &x_sloppy) { blas::copy(temp, y); - heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(x_sloppy, temp, r[0]).z); + heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(x_sloppy, temp, r[0])[2]); } else { blas::copy(r_full, r[0]); - heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(x, y, r_full).z); + heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(x, y, r_full)[2]); } } } @@ -721,9 +721,9 @@ namespace quda { // !param.is_preconditioner comes from bicgstab, param.compute_true_res came from gcr. if (!param.is_preconditioner && param.compute_true_res) { // do not do the below if this is an inner solver. mat(r_full, x); - double true_res = blas::xmyNorm(b, r_full); - param.true_res = sqrt(true_res / b2); - param.true_res_hq = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x, r[0]).z) : 0.0; + real_t true_res = blas::xmyNorm(b, r_full); + param.true_res = real_t(sqrt(true_res / b2)); + param.true_res_hq = use_heavy_quark_res ? sqrt(blas::HeavyQuarkResidualNorm(x, r[0])[2]) : 0.0; } // Reset flops counters. 
diff --git a/lib/inv_ca_cg.cpp b/lib/inv_ca_cg.cpp index ec95bf3ffe..98d1171c51 100644 --- a/lib/inv_ca_cg.cpp +++ b/lib/inv_ca_cg.cpp @@ -98,9 +98,9 @@ namespace quda double r2; if (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) { - double3 h3 = blas::HeavyQuarkResidualNorm(x, xp); - r2 = h3.y; - param.true_res_hq = sqrt(h3.z); + auto h3 = blas::HeavyQuarkResidualNorm(x, xp); + r2 = h3[1]; + param.true_res_hq = sqrt(h3[2]); } else { r2 = blas::norm2(xp); } @@ -168,9 +168,9 @@ namespace quda if (param.compute_true_res) { double r2; if (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) { - double3 h3 = blas::HeavyQuarkResidualNorm(x, br); - r2 = h3.y; - param.true_res_hq = sqrt(h3.z); + auto h3 = blas::HeavyQuarkResidualNorm(x, br); + r2 = h3[1]; + param.true_res_hq = sqrt(h3[2]); } else { r2 = blas::norm2(br); } @@ -516,7 +516,7 @@ namespace quda const int maxResIncreaseTotal = param.max_res_increase_total; double heavy_quark_res = 0.0; // heavy quark residual - if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r).z); + if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r)[2]); int resIncrease = 0; int resIncreaseTotal = 0; @@ -638,7 +638,7 @@ namespace quda blas::copy(S[0], r); - if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r).z); + if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r)[2]); // break-out check if we have reached the limit of the precision if (r2 > r2_old) { @@ -671,7 +671,7 @@ namespace quda double true_res = blas::xmyNorm(b, r); param.true_res = sqrt(true_res / b2); param.true_res_hq - = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? sqrt(blas::HeavyQuarkResidualNorm(x, r).z) : 0.0; + = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? 
sqrt(blas::HeavyQuarkResidualNorm(x, r)[2]) : 0.0; } if (!param.is_preconditioner) { diff --git a/lib/inv_ca_gcr.cpp b/lib/inv_ca_gcr.cpp index f9e605ea86..6debd0de8f 100644 --- a/lib/inv_ca_gcr.cpp +++ b/lib/inv_ca_gcr.cpp @@ -66,10 +66,10 @@ namespace quda } // init } - void CAGCR::solve(std::vector &psi_, std::vector &q, ColorSpinorField &b) + void CAGCR::solve(std::vector &psi_, std::vector &q, ColorSpinorField &b) { - typedef Matrix matrix; - typedef Matrix vector; + typedef Matrix matrix; + typedef Matrix vector; const int N = q.size(); vector phi(N), psi(N); @@ -80,7 +80,7 @@ namespace quda // compute rhs vector phi = Q* b = (q_i, b) // Construct the matrix Q* Q = (A P)* (A P) = (q_i, q_j) = (A p_i, A p_j) - std::vector A_(N * (N + 1)); + std::vector A_(N * (N + 1)); blas::cDotProduct(A_, q, {q, b}); for (int i = 0; i < N; i++) { @@ -90,12 +90,12 @@ namespace quda #else // two reductions but uses the Hermitian block dot product // compute rhs vector phi = Q* b = (q_i, b) - std::vector phi_(N); + std::vector phi_(N); blas::cDotProduct(phi_, q, b); for (int i = 0; i < N; i++) phi(i) = phi_[i]; // Construct the matrix Q* Q = (A P)* (A P) = (q_i, q_j) = (A p_i, A p_j) - std::vector A_(N * N); + std::vector A_(N * N); blas::hDotProduct(A_, q, q); for (int i = 0; i < N; i++) for (int j = 0; j < N; j++) A(i, j) = A_[i * N + j]; @@ -262,7 +262,7 @@ namespace quda const int maxResIncreaseTotal = param.max_res_increase_total; double heavy_quark_res = 0.0; // heavy quark residual - if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r).z); + if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r)[2]); int resIncrease = 0; int resIncreaseTotal = 0; @@ -326,7 +326,7 @@ namespace quda maxr_deflate = sqrt(r2); } - if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r).z); + if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r)[2]); // break-out check if we have 
reached the limit of the precision if (r2 > r2_old) { @@ -371,7 +371,7 @@ namespace quda double true_res = blas::xmyNorm(b, r); param.true_res = sqrt(true_res / b2); param.true_res_hq - = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? sqrt(blas::HeavyQuarkResidualNorm(x, r).z) : 0.0; + = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? sqrt(blas::HeavyQuarkResidualNorm(x, r)[2]) : 0.0; } if (!param.is_preconditioner) { diff --git a/lib/inv_cg3_quda.cpp b/lib/inv_cg3_quda.cpp index 42ab22fcab..22319fd3ec 100644 --- a/lib/inv_cg3_quda.cpp +++ b/lib/inv_cg3_quda.cpp @@ -106,9 +106,9 @@ namespace quda { double r2; if (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) { - double3 h3 = blas::HeavyQuarkResidualNorm(x, xp); - r2 = h3.y; - param.true_res_hq = sqrt(h3.z); + auto h3 = blas::HeavyQuarkResidualNorm(x, xp); + r2 = h3[1]; + param.true_res_hq = sqrt(h3[2]); } else { r2 = blas::norm2(xp); } @@ -175,9 +175,9 @@ namespace quda { if (param.compute_true_res) { double r2; if (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) { - double3 h3 = blas::HeavyQuarkResidualNorm(x, br); - r2 = h3.y; - param.true_res_hq = sqrt(h3.z); + auto h3 = blas::HeavyQuarkResidualNorm(x, br); + r2 = h3[1]; + param.true_res_hq = sqrt(h3[2]); } else { r2 = blas::norm2(br); } @@ -292,7 +292,7 @@ namespace quda { blas::copy(rS, r); if (use_heavy_quark_res) { - heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r).z); + heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r)[2]); heavy_quark_res_old = heavy_quark_res; } @@ -356,9 +356,9 @@ namespace quda { heavy_quark_res_old = heavy_quark_res; if (mixed_precision) { blas::copy(tmpS,y); - heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(xS, tmpS, rS).z); + heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(xS, tmpS, rS)[2]); } else { - heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(xS, rS).z); + heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(xS, rS)[2]); } } @@ -386,7 +386,7 @@ namespace quda { r2 = 
blas::xmyNorm(b, r); param.true_res = sqrt(r2 / b2); if (use_heavy_quark_res) { - heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(y, r).z); + heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(y, r)[2]); param.true_res_hq = heavy_quark_res; } rNorm = sqrt(r2); @@ -398,7 +398,7 @@ namespace quda { blas::copy(rS, r); blas::axpy(-1., xS, xS_old); // we preserve the orthogonality between the previous residual and the new - Complex rr_old = blas::cDotProduct(rS, rS_old); + complex_t rr_old = blas::cDotProduct(rS, rS_old); r2_old = blas::caxpyNorm(-rr_old/r2, rS, rS_old); blas::zero(xS); } @@ -448,7 +448,7 @@ namespace quda { // we update sloppy and old fields if (!convergence(r2, heavy_quark_res, stop, param.tol_hq)) { // we preserve the orthogonality between the previous residual and the new - Complex rr_old = blas::cDotProduct(rS, rS_old); + complex_t rr_old = blas::cDotProduct(rS, rS_old); r2_old = blas::caxpyNorm(-rr_old/r2, rS, rS_old); } } @@ -486,7 +486,7 @@ namespace quda { if (!mixed_precision && param.compute_true_res) { mat(r, x); param.true_res = sqrt(blas::xmyNorm(b, r) / b2); - if (use_heavy_quark_res) param.true_res_hq = sqrt(blas::HeavyQuarkResidualNorm(x, r).z); + if (use_heavy_quark_res) param.true_res_hq = sqrt(blas::HeavyQuarkResidualNorm(x, r)[2]); } PrintSummary("CG3", k, r2, b2, stop, param.tol_hq); diff --git a/lib/inv_cg_quda.cpp b/lib/inv_cg_quda.cpp index bfce1b422b..3ee176fa63 100644 --- a/lib/inv_cg_quda.cpp +++ b/lib/inv_cg_quda.cpp @@ -125,9 +125,9 @@ namespace quda { double r2; if (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) { - double3 h3 = blas::HeavyQuarkResidualNorm(x, xp); - r2 = h3.y; - param.true_res_hq = sqrt(h3.z); + auto h3 = blas::HeavyQuarkResidualNorm(x, xp); + r2 = h3[1]; + param.true_res_hq = sqrt(h3[2]); } else { r2 = blas::norm2(xp); } @@ -195,9 +195,9 @@ namespace quda { if (param.compute_true_res) { double r2; if (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) { - double3 h3 = blas::HeavyQuarkResidualNorm(x, 
br); - r2 = h3.y; - param.true_res_hq = sqrt(h3.z); + auto h3 = blas::HeavyQuarkResidualNorm(x, br); + r2 = h3[1]; + param.true_res_hq = sqrt(h3[2]); } else { r2 = blas::norm2(br); } @@ -350,7 +350,7 @@ namespace quda { double r2_old = 0.0; if (r2_old_init != 0.0 and p_init) { r2_old = r2_old_init; - Complex rp = blas::cDotProduct(rSloppy, x_update_batch.get_current_field()) / (r2); + auto rp = blas::cDotProduct(rSloppy, x_update_batch.get_current_field()) / (r2); blas::caxpy(-rp, rSloppy, x_update_batch.get_current_field()); beta = r2 / r2_old; blas::xpayz(rSloppy, beta, x_update_batch.get_current_field(), x_update_batch.get_current_field()); @@ -371,7 +371,7 @@ namespace quda { double heavy_quark_res_old = 0.0; // heavy quark residual if (use_heavy_quark_res) { - heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r).z); + heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r)[2]); heavy_quark_res_old = heavy_quark_res; // heavy quark residual } const int heavy_quark_check = param.heavy_quark_check; // how often to check the heavy quark residual @@ -420,14 +420,14 @@ namespace quda { if (advanced_feature && param.pipeline) { double Ap2; if(alternative_reliable){ - double4 quadruple = blas::quadrupleCGReduction(rSloppy, Ap, x_update_batch.get_current_field()); - r2 = quadruple.x; - Ap2 = quadruple.y; - pAp = quadruple.z; - ru.update_ppnorm(quadruple.w); + auto quadruple = blas::quadrupleCGReduction(rSloppy, Ap, x_update_batch.get_current_field()); + r2 = quadruple[0]; + Ap2 = quadruple[1]; + pAp = quadruple[2]; + ru.update_ppnorm(quadruple[3]); } else { - double3 triplet = blas::tripleCGReduction(rSloppy, Ap, x_update_batch.get_current_field()); - r2 = triplet.x; Ap2 = triplet.y; pAp = triplet.z; + auto triplet = blas::tripleCGReduction(rSloppy, Ap, x_update_batch.get_current_field()); + r2 = triplet[0]; Ap2 = triplet[1]; pAp = triplet[2]; } r2_old = r2; x_update_batch.get_current_alpha() = r2 / pAp; @@ -444,9 +444,9 @@ namespace quda { // alternative 
reliable updates, if (advanced_feature && alternative_reliable) { - double3 pAppp = blas::cDotProductNormA(x_update_batch.get_current_field(), Ap); - pAp = pAppp.x; - ru.update_ppnorm(pAppp.z); + auto pAppp = blas::cDotProductNormA(x_update_batch.get_current_field(), Ap); + pAp = pAppp[0]; + ru.update_ppnorm(pAppp[2]); } else { pAp = blas::reDotProduct(x_update_batch.get_current_field(), Ap); } @@ -455,8 +455,8 @@ namespace quda { // here we are deploying the alternative beta computation auto cg_norm = blas::axpyCGNorm(-x_update_batch.get_current_alpha(), Ap, rSloppy); - r2 = cg_norm.x; // (r_new, r_new) - sigma = cg_norm.y >= 0.0 ? cg_norm.y : r2; // use r2 if (r_k+1, r_k+1-r_k) breaks + r2 = cg_norm[0]; // (r_new, r_new) + sigma = cg_norm[1] >= 0.0 ? cg_norm[1] : r2; // use r2 if (r_k+1, r_k+1-r_k) breaks } // reliable update conditions @@ -502,10 +502,10 @@ namespace quda { if (use_heavy_quark_res && k % heavy_quark_check == 0) { if (&x != &xSloppy) { blas::copy(tmp, y); - heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(xSloppy, tmp, rSloppy).z); + heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(xSloppy, tmp, rSloppy)[2]); } else { blas::copy(r, rSloppy); - heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(x, y, r).z); + heavy_quark_res = sqrt(blas::xpyHeavyQuarkResidualNorm(x, y, r)[2]); } } @@ -539,7 +539,7 @@ namespace quda { if (advanced_feature) { ru.update_norm(r2, y); } // calculate new reliable HQ resididual - if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(y, r).z); + if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(y, r)[2]); if (advanced_feature) { if (ru.reliable_break(r2, stop, L2breakdown, L2breakdown_eps)) { break; } @@ -557,7 +557,7 @@ namespace quda { heavy_quark_restart = false; } else { // explicitly restore the orthogonality of the gradient vector - Complex rp = blas::cDotProduct(rSloppy, x_update_batch.get_current_field()) / (r2); + auto rp =
blas::cDotProduct(rSloppy, x_update_batch.get_current_field()) / (r2); blas::caxpy(-rp, rSloppy, x_update_batch.get_current_field()); beta = r2 / r2_old; @@ -619,7 +619,7 @@ namespace quda { // compute the true residuals mat(r, x); param.true_res = sqrt(blas::xmyNorm(b, r) / b2); - param.true_res_hq = sqrt(blas::HeavyQuarkResidualNorm(x, r).z); + param.true_res_hq = sqrt(blas::HeavyQuarkResidualNorm(x, r)[2]); } PrintSummary("CG", k, r2, b2, stop, param.tol_hq); @@ -759,7 +759,7 @@ namespace quda { MatrixXcd C = MatrixXcd::Zero(param.num_src, param.num_src); MatrixXcd S = MatrixXcd::Identity(param.num_src, param.num_src); MatrixXcd pAp = MatrixXcd::Identity(param.num_src, param.num_src); - quda::Complex *AC = new quda::Complex[param.num_src * param.num_src]; + complex_t *AC = new complex_t[param.num_src * param.num_src]; #ifdef MWVERBOSE MatrixXcd pTp = MatrixXcd::Identity(param.num_src, param.num_src); @@ -947,7 +947,7 @@ namespace quda { for (int i = 0; i < param.num_src; i++) { mat(r.Component(i), x.Component(i)); param.true_res = sqrt(blas::xmyNorm(b.Component(i), r.Component(i)) / b2[i]); - param.true_res_hq = sqrt(blas::HeavyQuarkResidualNorm(x.Component(i), r.Component(i)).z); + param.true_res_hq = sqrt(blas::HeavyQuarkResidualNorm(x.Component(i), r.Component(i))[2]); param.true_res_offset[i] = param.true_res; param.true_res_hq_offset[i] = param.true_res_hq; @@ -1107,7 +1107,7 @@ void CG::solve(ColorSpinorField& x, ColorSpinorField& b) { for(int i = 0; i < param.num_src; i++){ stop[i] = stopping(param.tol, b2[i], param.residual_type); // stopping condition of solver if (use_heavy_quark_res) { - heavy_quark_res[i] = sqrt(blas::HeavyQuarkResidualNorm(x.Component(i), r.Component(i)).z); + heavy_quark_res[i] = sqrt(blas::HeavyQuarkResidualNorm(x.Component(i), r.Component(i))[2]); heavy_quark_res_old[i] = heavy_quark_res[i]; // heavy quark residual } } @@ -1361,12 +1361,12 @@ void CG::solve(ColorSpinorField& x, ColorSpinorField& b) { if (&x != &xSloppy) {
blas::copy(tmp, y); // FIXME: check whether copy works here for(int i=0; i; + using RowMajorDenseMatrix = Matrix; static int max_eigcg_cycles = 4;//how many eigcg cycles do we allow? @@ -84,7 +84,7 @@ namespace quda { } // method for constructing Lanczos matrix : - inline void SetLanczos(Complex diag_val, Complex offdiag_val) + inline void SetLanczos(complex_t diag_val, complex_t offdiag_val) { if (run_residual_correction) return; @@ -121,7 +121,7 @@ namespace quda { { Tm.setZero(); - auto s = std::make_unique(2 * k); + auto s = std::make_unique(2 * k); for (int i = 0; i < 2 * k; i++) Tm(i, i) = Tmvals(i); //?? @@ -298,7 +298,7 @@ namespace quda { std::vector v2k(args.V2k->Components()); RowMajorDenseMatrix Alpha(args.ritzVecs.topLeftCorner(args.m, 2*args.k)); - blas::caxpy( static_cast(Alpha.data()), vm , v2k); + blas::caxpy( static_cast(Alpha.data()), vm , v2k); for(int i = 0; i < 2*args.k; i++) blas::copy(Vm->Component(i), args.V2k->Component(i)); @@ -451,7 +451,7 @@ namespace quda { double heavy_quark_res = 0.0; // heavy quark res idual - if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r).z); + if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r)[2]); double pAp; double alpha=1.0, alpha_inv=1.0, beta=0.0, alpha_old_inv = 1.0; @@ -528,7 +528,7 @@ namespace quda { // compute the true residuals matSloppy(r, x); param.true_res = sqrt(blas::xmyNorm(b, r) / b2); - param.true_res_hq = sqrt(blas::HeavyQuarkResidualNorm(x, r).z); + param.true_res_hq = sqrt(blas::HeavyQuarkResidualNorm(x, r)[2]); PrintSummary("eigCG", k, r2, b2, args.global_stop, param.tol_hq); @@ -701,7 +701,7 @@ namespace quda { r2 = blas::xmyNorm(in, r); param.true_res = sqrt(r2 / b2); - param.true_res_hq = sqrt(HeavyQuarkResidualNorm(out,r).z); + param.true_res_hq = sqrt(HeavyQuarkResidualNorm(out,r)[2]); PrintSummary( !dcg_cycle ? 
"EigCG:" : "DCG (correction cycle):", iters, r2, b2, stop, param.tol_hq); if( getVerbosity() >= QUDA_VERBOSE ) { diff --git a/lib/inv_gcr_quda.cpp b/lib/inv_gcr_quda.cpp index 8a938c17db..508e6738b7 100644 --- a/lib/inv_gcr_quda.cpp +++ b/lib/inv_gcr_quda.cpp @@ -57,9 +57,9 @@ namespace quda { inner.sloppy_converge = true; } - void GCR::computeBeta(std::vector &beta, std::vector &Ap, int i, int N, int k) + void GCR::computeBeta(std::vector &beta, std::vector &Ap, int i, int N, int k) { - std::vector Beta(N, 0.0); + std::vector Beta(N, 0.0); blas::cDotProduct(Beta, {Ap.begin() + i, Ap.begin() + i + N}, Ap[k]); // vectorized dot product #if 0 @@ -72,14 +72,14 @@ namespace quda { for (int j = 0; j < N; j++) beta[(i + j) * n_krylov + k] = Beta[j]; } - void GCR::updateAp(std::vector &beta, std::vector &Ap, int begin, int size, int k) + void GCR::updateAp(std::vector &beta, std::vector &Ap, int begin, int size, int k) { - std::vector beta_(size); + std::vector beta_(size); for (int i = 0; i < size; i++) beta_[i] = -beta[(i + begin) * n_krylov + k]; blas::caxpy(beta_, {Ap.begin() + begin, Ap.begin() + begin + size}, Ap[k]); } - void GCR::orthoDir(std::vector &beta, std::vector &Ap, int k, int pipeline) + void GCR::orthoDir(std::vector &beta, std::vector &Ap, int k, int pipeline) { switch (pipeline) { case 0: // no kernel fusion @@ -118,8 +118,8 @@ namespace quda { } } - void GCR::backSubs(const std::vector &alpha, const std::vector &beta, - const std::vector &gamma, std::vector &delta, int n) + void GCR::backSubs(const std::vector &alpha, const std::vector &beta, + const std::vector &gamma, std::vector &delta, int n) { for (int k=n-1; k>=0;k--) { delta[k] = alpha[k]; @@ -128,10 +128,10 @@ namespace quda { } } - void GCR::updateSolution(ColorSpinorField &x, const std::vector &alpha, const std::vector &beta, + void GCR::updateSolution(ColorSpinorField &x, const std::vector &alpha, const std::vector &beta, std::vector &gamma, int k, std::vector &p) { - std::vector delta(k); 
+ std::vector delta(k); // Update the solution vector backSubs(alpha, beta, gamma, delta, k); @@ -311,7 +311,7 @@ namespace quda { const int maxResIncreaseTotal = param.max_res_increase_total; double heavy_quark_res = 0.0; // heavy quark residual - if(use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x,r).z); + if(use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x,r)[2]); int resIncrease = 0; int resIncreaseTotal = 0; @@ -355,11 +355,11 @@ namespace quda { orthoDir(beta, Ap, k, pipeline); - double3 Apr = blas::cDotProductNormA(Ap[k], K ? r_sloppy : p[k]); + auto Apr = blas::cDotProductNormA(Ap[k], K ? r_sloppy : p[k]); - gamma[k] = sqrt(Apr.z); // gamma[k] = Ap[k] + gamma[k] = sqrt(Apr[2]); // gamma[k] = Ap[k] if (gamma[k] == 0.0) errorQuda("GCR breakdown"); - alpha[k] = Complex(Apr.x, Apr.y) / gamma[k]; // alpha = (1/|Ap|) * (Ap, r) + alpha[k] = complex_t(Apr[0], Apr[1]) / gamma[k]; // alpha = (1/|Ap|) * (Ap, r) // r -= (1/|Ap|^2) * (Ap, r) r, Ap *= 1/|Ap| r2 = blas::cabxpyzAxNorm(1.0 / gamma[k], -alpha[k], Ap[k], K ? r_sloppy : p[k], K ? 
r_sloppy : p[k + 1]); @@ -391,7 +391,7 @@ namespace quda { maxr_deflate = sqrt(r2); } - if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r).z); + if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r)[2]); // break-out check if we have reached the limit of the precision if (r2 > r2_old) { @@ -444,7 +444,7 @@ namespace quda { double true_res = blas::xmyNorm(b, r); param.true_res = sqrt(true_res / b2); if (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) - param.true_res_hq = sqrt(blas::HeavyQuarkResidualNorm(x,r).z); + param.true_res_hq = sqrt(blas::HeavyQuarkResidualNorm(x,r)[2]); else param.true_res_hq = 0.0; //if (param.preserve_source == QUDA_PRESERVE_SOURCE_NO) blas::copy(b, r); diff --git a/lib/inv_gmresdr_quda.cpp b/lib/inv_gmresdr_quda.cpp index 389206853e..6a7ba31e70 100644 --- a/lib/inv_gmresdr_quda.cpp +++ b/lib/inv_gmresdr_quda.cpp @@ -35,7 +35,7 @@ namespace quda { using Vector = VectorXcd; // special types needed for compatibility with QUDA blas: - using RowMajorDenseMatrix = Matrix; + using RowMajorDenseMatrix = Matrix; struct SortedEvals { @@ -60,7 +60,7 @@ namespace quda { int k; int restarts; - Complex *c; + complex_t *c; ColorSpinorFieldSet *Vkp1; // high-precision accumulation array @@ -73,7 +73,7 @@ namespace quda { restarts(0), Vkp1(nullptr) { - c = static_cast(ritzVecs.col(k).data()); + c = static_cast(ritzVecs.col(k).data()); } inline void ResetArgs() @@ -116,7 +116,7 @@ namespace quda { std::stable_sort(sorted_evals.begin(), sorted_evals.end(), SortedEvals::SelectSmall); for (int e = 0; e < args.k; e++) - memcpy(args.ritzVecs.col(e).data(), harVecs.col(sorted_evals[e]._idx).data(), (args.m) * sizeof(Complex)); + memcpy(args.ritzVecs.col(e).data(), harVecs.col(sorted_evals[e]._idx).data(), (args.m) * sizeof(complex_t)); return; } @@ -237,13 +237,13 @@ namespace quda { std::vector x_, r_; x_.push_back(x), r_.push_back(r); - blas::caxpy(static_cast(args.eta.data()), Z_, x_); + 
blas::caxpy(static_cast(args.eta.data()), Z_, x_); VectorXcd minusHeta = -(args.H * args.eta); Map c_(args.c, args.m + 1); c_ += minusHeta; - blas::caxpy(static_cast(minusHeta.data()), V_, r_); + blas::caxpy(static_cast(minusHeta.data()), V_, r_); return; } @@ -272,7 +272,7 @@ namespace quda { std::vector vm(Vm->Components()); RowMajorDenseMatrix Alpha(Qkp1); // convert Qkp1 to Row-major format first - blas::caxpy(static_cast(Alpha.data()), vm, vkp1); + blas::caxpy(static_cast(Alpha.data()), vm, vkp1); for (int i = 0; i < (args.m + 1); i++) { if (i < (args.k + 1)) { @@ -287,7 +287,7 @@ namespace quda { std::vector vk(args.Vkp1->Components().begin(), args.Vkp1->Components().begin() + args.k); RowMajorDenseMatrix Beta(Qkp1.topLeftCorner(args.m, args.k)); - blas::caxpy(static_cast(Beta.data()), z, vk); + blas::caxpy(static_cast(Beta.data()), z, vk); for (int i = 0; i < (args.m); i++) { if (i < (args.k)) @@ -298,7 +298,7 @@ namespace quda { } for (int j = 0; j < args.k; j++) { - Complex alpha = cDotProduct(Vm->Component(j), Vm->Component(args.k)); + complex_t alpha = cDotProduct(Vm->Component(j), Vm->Component(args.k)); caxpy(-alpha, Vm->Component(j), Vm->Component(args.k)); } @@ -313,11 +313,11 @@ namespace quda { int j = start_idx; GMResDRArgs &args = *gmresdr_args; - std::unique_ptr givensH((do_givens) ? new Complex[(args.m + 1) * args.m] : nullptr); - std::unique_ptr cn((do_givens) ? new Complex[args.m] : nullptr); + std::unique_ptr givensH((do_givens) ? new complex_t[(args.m + 1) * args.m] : nullptr); + std::unique_ptr cn((do_givens) ? new complex_t[args.m] : nullptr); std::unique_ptr sn((do_givens) ? new double[args.m] : nullptr); - Complex c0 = args.c[0]; + complex_t c0 = args.c[0]; while (j < args.m) { if (K) { @@ -337,7 +337,7 @@ namespace quda { args.H(0, j) = cDotProduct(Vm->Component(0), Vm->Component(j + 1)); caxpy(-args.H(0, j), Vm->Component(0), Vm->Component(j + 1)); - Complex h0 = do_givens ? args.H(0, j) : 0.0; + complex_t h0 = do_givens ? 
args.H(0, j) : 0.0; for (int i = 1; i <= j; i++) { args.H(i, j) = cDotProduct(Vm->Component(i), Vm->Component(j + 1)); @@ -349,7 +349,7 @@ namespace quda { } } - args.H(j + 1, j) = Complex(sqrt(norm2(Vm->Component(j + 1))), 0.0); + args.H(j + 1, j) = complex_t(sqrt(norm2(Vm->Component(j + 1))), 0.0); blas::ax(1.0 / args.H(j + 1, j).real(), Vm->Component(j + 1)); if (do_givens) { double inv_denom = 1.0 / sqrt(norm(h0) + norm(args.H(j + 1, j))); @@ -366,14 +366,14 @@ namespace quda { if (do_givens) { Map givensH_(givensH.get(), args.m, args.m, DynamicStride(args.m + 1, 1)); - memcpy(args.eta.data(), args.c, args.m * sizeof(Complex)); - memset((void *)args.c, 0, (args.m + 1) * sizeof(Complex)); + memcpy(args.eta.data(), args.c, args.m * sizeof(complex_t)); + memset((void *)args.c, 0, (args.m + 1) * sizeof(complex_t)); args.c[0] = c0; givensH_.triangularView().solveInPlace(args.eta); } else { - memset((void *)args.c, 0, (args.m + 1) * sizeof(Complex)); + memset((void *)args.c, 0, (args.m + 1) * sizeof(complex_t)); std::vector v_(Vm->Components().begin(), Vm->Components().begin() + args.k + 1); std::vector r_; @@ -453,7 +453,7 @@ namespace quda { double r2 = xmyNorm(b, r); double b2 = r2; - args.c[0] = Complex(sqrt(r2), 0.0); + args.c[0] = complex_t(sqrt(r2), 0.0); printfQuda("\nInitial residual squared: %1.16e, source %1.16e, tolerance %1.16e\n", r2, sqrt(normb), param.tol); @@ -474,7 +474,7 @@ namespace quda { const bool use_heavy_quark_res = (param.residual_type & QUDA_HEAVY_QUARK_RESIDUAL) ? true : false; double heavy_quark_res = 0.0; - if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r).z); + if (use_heavy_quark_res) heavy_quark_res = sqrt(blas::HeavyQuarkResidualNorm(x, r)[2]); int restart_idx = 0, j = 0, check_interval = 4; @@ -497,7 +497,7 @@ namespace quda { // can this be done as a single 2-d reduction? 
for (int l = 0; l < args.k + 1; l++) { - Complex *col = Gm.col(l).data(); + complex_t *col = Gm.col(l).data(); std::vector v1_(Vm->Components().begin(), Vm->Components().begin() + args.k + 1); std::vector v2_; @@ -507,7 +507,7 @@ namespace quda { } // end l-loop - Complex detGm = Gm.determinant(); + complex_t detGm = Gm.determinant(); PrintStats("FGMResDR:", tot_iters, r2, b2, heavy_quark_res); printfQuda("\nCheck cycle %d, true residual squared %1.15e, Gramm det : (%le, %le)\n", restart_idx, ext_r2, @@ -533,7 +533,7 @@ namespace quda { r = y; zero(e); - args.c[0] = Complex(sqrt(ext_r2), 0.0); + args.c[0] = complex_t(sqrt(ext_r2), 0.0); blas::zero(Vm->Component(0)); blas::axpy(1.0 / args.c[0].real(), rSloppy, Vm->Component(0)); diff --git a/lib/inv_mr_quda.cpp b/lib/inv_mr_quda.cpp index 4f636bf279..01ac08376f 100644 --- a/lib/inv_mr_quda.cpp +++ b/lib/inv_mr_quda.cpp @@ -113,8 +113,8 @@ namespace quda if (param.global_reduction) { auto Ar4 = blas::cDotProductNormAB(Ar, r_sloppy); - Complex alpha = Complex(Ar4.x, Ar4.y) / Ar4.z; - r2 = Ar4.w; + auto alpha = complex_t(Ar4[0], Ar4[1]) / Ar4[2]; + r2 = Ar4[3]; PrintStats("MR (inner)", iter, r2, b2, 0.0); // x += omega*alpha*r, r -= omega*alpha*Ar, r2 = blas::norm2(r) diff --git a/lib/inv_mre.cpp b/lib/inv_mre.cpp index 10733a6aaa..4149369a55 100644 --- a/lib/inv_mre.cpp +++ b/lib/inv_mre.cpp @@ -12,11 +12,11 @@ namespace quda /* Solve the equation A p_k psi_k = b by minimizing the residual and using Eigen's SVD algorithm for numerical stability */ - void MinResExt::solve(std::vector &psi_, std::vector &p, std::vector &q, + void MinResExt::solve(std::vector &psi_, std::vector &p, std::vector &q, const ColorSpinorField &b, bool hermitian) { - typedef Matrix matrix; - typedef Matrix vector; + typedef Matrix matrix; + typedef Matrix vector; const int N = q.size(); vector phi(N), psi(N); @@ -25,7 +25,7 @@ namespace quda // form the a Nx(N+1) matrix using only a single reduction - this // presently requires forgoing the 
matrix symmetry, but the improvement is well worth it - std::vector A_(N * (N + 1)); + std::vector A_(N * (N + 1)); if (hermitian) { // linear system is Hermitian, solve directly @@ -93,7 +93,7 @@ namespace quda if (!apply_mat) blas::ax(1 / sqrt(p2), q[i]); if (i + 1 < N) { - std::vector alpha(N - (i + 1)); + std::vector alpha(N - (i + 1)); blas::cDotProduct(alpha, {p[i]}, {p.begin() + i + 1, p.end()}); for (auto &a : alpha) a = -a; blas::caxpy(alpha, {p[i]}, {p.begin() + i + 1, p.end()}); @@ -111,7 +111,7 @@ namespace quda for (int i = 0; i < N; i++) mat(q[i], p[i]); // Solution coefficient vectors - std::vector alpha(N); + std::vector alpha(N); if (b.Precision() != p[0].Precision()) { // need to make a sloppy copy of b ColorSpinorParam param(b); diff --git a/lib/inv_multi_cg_quda.cpp b/lib/inv_multi_cg_quda.cpp index b6757440f4..6d9a9683f2 100644 --- a/lib/inv_multi_cg_quda.cpp +++ b/lib/inv_multi_cg_quda.cpp @@ -298,8 +298,8 @@ namespace quda { r2_old_array[0] = r2_old; auto cg_norm = blas::axpyCGNorm(-alpha[j_low], Ap, r_sloppy); - r2[0] = cg_norm.x; - double zn = cg_norm.y; + r2[0] = cg_norm[0]; + double zn = cg_norm[1]; // reliable update conditions rNorm[0] = sqrt(r2[0]); @@ -370,7 +370,7 @@ namespace quda { // explicitly restore the orthogonality of the gradient vector for (int j=0; j::infinity(); param.true_res_hq_offset[i] = std::numeric_limits::infinity(); diff --git a/lib/inv_pcg_quda.cpp b/lib/inv_pcg_quda.cpp index 595b1653d6..3353ccd72f 100644 --- a/lib/inv_pcg_quda.cpp +++ b/lib/inv_pcg_quda.cpp @@ -235,7 +235,7 @@ namespace quda double stop = stopping(param.tol, b2, param.residual_type); // stopping condition of solver double heavy_quark_res = 0.0; // heavy quark residual - if (use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNorm(x, r).z); + if (use_heavy_quark_res) heavy_quark_res = sqrt(HeavyQuarkResidualNorm(x, r)[2]); double beta = 0.0; double pAp; @@ -282,9 +282,9 @@ namespace quda double sigma; // alternative reliable updates, 
if (alternative_reliable) { - double3 pAppp = blas::cDotProductNormA(x_update_batch.get_current_field(), Ap); - pAp = pAppp.x; - ru.update_ppnorm(pAppp.z); + auto pAppp = blas::cDotProductNormA(x_update_batch.get_current_field(), Ap); + pAp = pAppp[0]; + ru.update_ppnorm(pAppp[2]); } else { pAp = reDotProduct(x_update_batch.get_current_field(), Ap); } @@ -293,9 +293,9 @@ namespace quda auto cg_norm = axpyCGNorm(-x_update_batch.get_current_alpha(), Ap, r_sloppy); // r --> r - alpha*A*p r2_old = r2; - r2 = cg_norm.x; + r2 = cg_norm[0]; - sigma = cg_norm.y >= 0.0 ? cg_norm.y : r2; // use r2 if (r_k+1, r_k-1 - r_k) breaks + sigma = cg_norm[1] >= 0.0 ? cg_norm[1] : r2; // use r2 if (r_k+1, r_k-1 - r_k) breaks if (K) rMinvr_old = rMinvr; diff --git a/lib/inv_sd_quda.cpp b/lib/inv_sd_quda.cpp index cbb51bac01..524fa4c816 100644 --- a/lib/inv_sd_quda.cpp +++ b/lib/inv_sd_quda.cpp @@ -45,13 +45,12 @@ namespace quda { zero(*r), zero(x); double r2 = xmyNorm(b,*r); double alpha=0.; - double3 rAr; int k=0; while (k < param.maxiter - 1) { mat(*Ar, *r); - rAr = cDotProductNormA(*r, *Ar); - alpha = rAr.z/rAr.x; + auto rAr = cDotProductNormA(*r, *Ar); + alpha = rAr[2]/rAr[0]; axpy(alpha, *r, x); axpy(-alpha, *Ar, *r); @@ -63,8 +62,8 @@ namespace quda { ++k; } - rAr = cDotProductNormA(*r, *Ar); - alpha = rAr.z/rAr.x; + auto rAr = cDotProductNormA(*r, *Ar); + alpha = rAr[2] / rAr[0]; axpy(alpha, *r, x); if(getVerbosity() >= QUDA_VERBOSE){ axpy(-alpha, *Ar, *r); diff --git a/lib/madwf_ml.cpp b/lib/madwf_ml.cpp index 2c36294a94..dcae71b68e 100644 --- a/lib/madwf_ml.cpp +++ b/lib/madwf_ml.cpp @@ -219,7 +219,7 @@ namespace quda ref(lambda, tmp); - std::vector dot(9); + std::vector dot(9); blas::cDotProduct(dot, {chi, theta, lambda}, {chi, theta, lambda}); a[0] += dot[0].real(); diff --git a/lib/multi_blas_quda.cu b/lib/multi_blas_quda.cu index 5020f7b109..7f01e12745 100644 --- a/lib/multi_blas_quda.cu +++ b/lib/multi_blas_quda.cu @@ -308,7 +308,7 @@ namespace quda { } template <> - void 
axpy(const std::vector &a, cvector_ref &x, cvector_ref &y) + void axpy(const std::vector &a, cvector_ref &x, cvector_ref &y) { // Enter a recursion. // Pass a, x, y. (0,0) indexes the tiles. false specifies the matrix is unstructured. @@ -316,7 +316,7 @@ namespace quda { } template <> - void axpy_U(const std::vector &a, cvector_ref &x, cvector_ref &y) + void axpy_U(const std::vector &a, cvector_ref &x, cvector_ref &y) { // Enter a recursion. // Pass a, x, y. (0,0) indexes the tiles. 1 indicates the matrix is upper-triangular, @@ -330,7 +330,7 @@ namespace quda { } template <> - void axpy_L(const std::vector &a, cvector_ref &x, cvector_ref &y) + void axpy_L(const std::vector &a, cvector_ref &x, cvector_ref &y) { // Enter a recursion. // Pass a, x, y. (0,0) indexes the tiles. -1 indicates the matrix is lower-triangular @@ -344,20 +344,20 @@ namespace quda { } template <> - void axpy(const std::vector &a, cvector_ref &x, cvector_ref &y) + void axpy(const std::vector &a, cvector_ref &x, cvector_ref &y) { // Enter a recursion. // Pass a, x, y. (0,0) indexes the tiles. false specifies the matrix is unstructured. axpy_recurse(a, x, y, range(0,x.size()), range(0,y.size()), 0); } - void caxpy(const std::vector &a, cvector_ref &x, cvector_ref &y) + void caxpy(const std::vector &a, cvector_ref &x, cvector_ref &y) { axpy(a, std::move(x), std::move(y)); } template <> - void axpy_U(const std::vector &a, cvector_ref &x, cvector_ref &y) + void axpy_U(const std::vector &a, cvector_ref &x, cvector_ref &y) { // Enter a recursion. // Pass a, x, y. (0,0) indexes the tiles. 
1 indicates the matrix is upper-triangular, @@ -369,13 +369,13 @@ namespace quda { axpy_recurse(a, x, y, range(0,x.size()), range(0,y.size()), 1); } - void caxpy_U(const std::vector &a, cvector_ref &x, cvector_ref &y) + void caxpy_U(const std::vector &a, cvector_ref &x, cvector_ref &y) { axpy_U(a, std::move(x), std::move(y)); } template <> - void axpy_L(const std::vector &a, cvector_ref &x, cvector_ref &y) + void axpy_L(const std::vector &a, cvector_ref &x, cvector_ref &y) { // Enter a recursion. // Pass a, x, y. (0,0) indexes the tiles. -1 indicates the matrix is lower-triangular @@ -387,7 +387,7 @@ namespace quda { axpy_recurse(a, x, y, range(0,x.size()), range(0,y.size()), -1); } - void caxpy_L(const std::vector &a, cvector_ref &x, cvector_ref &y) + void caxpy_L(const std::vector &a, cvector_ref &x, cvector_ref &y) { axpy_L(a, std::move(x), std::move(y)); } @@ -448,12 +448,12 @@ namespace quda { } // end if (y.size() > max_YW_size()) } - void axpyz(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z) + void axpyz(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z) { axpyz_recurse(a, x, y, z, range(0, x.size()), range(0, y.size()), 0, 0); } - void axpyz_U(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z) + void axpyz_U(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z) { if (x.size() != y.size()) { errorQuda("An optimal block axpyz_U with non-square 'a' (%lu != %lu) has not yet been implemented. Use block axpyz instead", @@ -466,7 +466,7 @@ namespace quda { axpyz_recurse(a, x, y, z, range(0, x.size()), range(0, y.size()), 1, 1); } - void axpyz_L(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z) + void axpyz_L(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z) { if (x.size() != y.size()) { errorQuda("An optimal block axpyz_L with non-square 'a' (%lu != %lu) has not yet been implemented. 
Use block axpyz instead", @@ -479,12 +479,12 @@ namespace quda { axpyz_recurse(a, x, y, z, range(0, x.size()), range(0, y.size()), 1, -1); } - void caxpyz(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z) + void caxpyz(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z) { axpyz_recurse(a, x, y, z, range(0, x.size()), range(0, y.size()), 0, 0); } - void caxpyz_U(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z) + void caxpyz_U(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z) { if (x.size() != y.size()) { errorQuda("An optimal block caxpyz_U with non-square 'a' (%lu != %lu) has not yet been implemented. Use block caxpyz instead", @@ -497,7 +497,7 @@ namespace quda { axpyz_recurse(a, x, y, z, range(0, x.size()), range(0, y.size()), 1, 1); } - void axpyz_L(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z) + void axpyz_L(const std::vector &a, cvector_ref &x, cvector_ref &y, cvector_ref &z) { if (x.size() != y.size()) { errorQuda("An optimal block axpyz_U with non-square 'a' (%lu != %lu) has not yet been implemented. 
Use block axpyz instead", @@ -510,8 +510,8 @@ namespace quda { axpyz_recurse(a, x, y, z, range(0, x.size()), range(0, y.size()), 1, -1); } - void axpyBzpcx(const std::vector &a, cvector_ref &x_, cvector_ref &y_, - const std::vector &b, ColorSpinorField &z_, const std::vector &c) + void axpyBzpcx(const std::vector &a, cvector_ref &x_, cvector_ref &y_, + const std::vector &b, ColorSpinorField &z_, const std::vector &c) { if (y_.size() <= (size_t)max_N_multi_1d()) { // swizzle order since we are writing to x_ and y_, but the @@ -538,8 +538,8 @@ namespace quda { } } - void caxpyBxpz(const std::vector &a, cvector_ref &x_, ColorSpinorField &y_, - const std::vector &b, ColorSpinorField &z_) + void caxpyBxpz(const std::vector &a, cvector_ref &x_, ColorSpinorField &y_, + const std::vector &b, ColorSpinorField &z_) { if (x_.size() <= (size_t)max_N_multi_1d() && is_valid_NXZ(x_.size(), false, y_.Precision())) // only split if we have to. { @@ -554,7 +554,7 @@ namespace quda { auto &x = x_; constexpr bool mixed = true; - instantiate(a, b, std::vector(), x[0], y[0], x, y, x, w); + instantiate(a, b, std::vector(), x[0], y[0], x, y, x, w); } else { // split the problem in half and recurse auto a_ = bisect(a); @@ -567,10 +567,10 @@ namespace quda { } // temporary wrappers - void axpy(const double *a, std::vector &x, std::vector &y) + void axpy(const real_t *a, std::vector &x, std::vector &y) { - std::vector a_(x.size() * y.size()); - memcpy(a_.data(), a, x.size() * y.size() * sizeof(double)); + std::vector a_(x.size() * y.size()); + memcpy(a_.data(), a, x.size() * y.size() * sizeof(real_t)); vector_ref x_; for (auto &xi : x) x_.push_back(*xi); vector_ref y_; @@ -578,10 +578,10 @@ namespace quda { axpy(a_, x_, y_); } - void axpy_U(const double *a, std::vector &x, std::vector &y) + void axpy_U(const real_t *a, std::vector &x, std::vector &y) { - std::vector a_(x.size() * y.size()); - memcpy(a_.data(), a, x.size() * y.size() * sizeof(double)); + std::vector a_(x.size() * y.size()); + 
memcpy(a_.data(), a, x.size() * y.size() * sizeof(real_t)); vector_ref x_; for (auto &xi : x) x_.push_back(*xi); vector_ref y_; @@ -589,10 +589,10 @@ namespace quda { axpy_U(a_, x_, y_); } - void axpy_L(const double *a, std::vector &x, std::vector &y) + void axpy_L(const real_t *a, std::vector &x, std::vector &y) { - std::vector a_(x.size() * y.size()); - memcpy(a_.data(), a, x.size() * y.size() * sizeof(double)); + std::vector a_(x.size() * y.size()); + memcpy(a_.data(), a, x.size() * y.size() * sizeof(real_t)); vector_ref x_; for (auto &xi : x) x_.push_back(*xi); vector_ref y_; @@ -600,9 +600,9 @@ namespace quda { axpy_L(a_, x_, y_); } - void caxpy(const Complex *a, std::vector &x, std::vector &y) { - std::vector a_(x.size() * y.size()); - memcpy(a_.data(), a, x.size() * y.size() * sizeof(Complex)); + void caxpy(const complex_t *a, std::vector &x, std::vector &y) { + std::vector a_(x.size() * y.size()); + memcpy(a_.data(), a, x.size() * y.size() * sizeof(complex_t)); vector_ref x_; for (auto &xi : x) x_.push_back(*xi); vector_ref y_; @@ -610,9 +610,9 @@ namespace quda { caxpy(a_, x_, y_); } - void caxpy_U(const Complex *a, std::vector &x, std::vector &y) { - std::vector a_(x.size() * y.size()); - memcpy(a_.data(), a, x.size() * y.size() * sizeof(Complex)); + void caxpy_U(const complex_t *a, std::vector &x, std::vector &y) { + std::vector a_(x.size() * y.size()); + memcpy(a_.data(), a, x.size() * y.size() * sizeof(complex_t)); vector_ref x_; for (auto &xi : x) x_.push_back(*xi); vector_ref y_; @@ -620,9 +620,9 @@ namespace quda { caxpy_U(a_, x_, y_); } - void caxpy_L(const Complex *a, std::vector &x, std::vector &y) { - std::vector a_(x.size() * y.size()); - memcpy(a_.data(), a, x.size() * y.size() * sizeof(Complex)); + void caxpy_L(const complex_t *a, std::vector &x, std::vector &y) { + std::vector a_(x.size() * y.size()); + memcpy(a_.data(), a, x.size() * y.size() * sizeof(complex_t)); vector_ref x_; for (auto &xi : x) x_.push_back(*xi); vector_ref y_; @@ 
-630,11 +630,11 @@ namespace quda { caxpy_L(a_, x_, y_); } - void axpyz(const double *a, std::vector &x, std::vector &y, + void axpyz(const real_t *a, std::vector &x, std::vector &y, std::vector &z) { - std::vector a_(x.size() * y.size()); - memcpy(a_.data(), a, x.size() * y.size() * sizeof(double)); + std::vector a_(x.size() * y.size()); + memcpy(a_.data(), a, x.size() * y.size() * sizeof(real_t)); vector_ref x_; for (auto &xi : x) x_.push_back(*xi); vector_ref y_; @@ -644,11 +644,11 @@ namespace quda { axpyz(a_, x_, y_, z_); } - void caxpyz(const Complex *a, std::vector &x, std::vector &y, + void caxpyz(const complex_t *a, std::vector &x, std::vector &y, std::vector &z) { - std::vector a_(x.size() * y.size()); - memcpy(a_.data(), a, x.size() * y.size() * sizeof(Complex)); + std::vector a_(x.size() * y.size()); + memcpy(a_.data(), a, x.size() * y.size() * sizeof(complex_t)); vector_ref x_; for (auto &xi : x) x_.push_back(*xi); vector_ref y_; @@ -658,15 +658,15 @@ namespace quda { caxpyz(a_, x_, y_, z_); } - void axpyBzpcx(const double *a, std::vector &x, std::vector &y, - const double *b, ColorSpinorField &z, const double *c) + void axpyBzpcx(const real_t *a, std::vector &x, std::vector &y, + const real_t *b, ColorSpinorField &z, const real_t *c) { - std::vector a_(x.size()); - memcpy(a_.data(), a, x.size() * sizeof(double)); - std::vector b_(x.size()); - memcpy(b_.data(), b, x.size() * sizeof(double)); - std::vector c_(x.size()); - memcpy(c_.data(), c, x.size() * sizeof(double)); + std::vector a_(x.size()); + memcpy(a_.data(), a, x.size() * sizeof(real_t)); + std::vector b_(x.size()); + memcpy(b_.data(), b, x.size() * sizeof(real_t)); + std::vector c_(x.size()); + memcpy(c_.data(), c, x.size() * sizeof(real_t)); vector_ref x_; for (auto &xi : x) x_.push_back(*xi); @@ -675,13 +675,13 @@ namespace quda { axpyBzpcx(a_, x_, y_, b_, z, c_); } - void caxpyBxpz(const Complex *a, std::vector &x, ColorSpinorField &y, - const Complex *b, ColorSpinorField &z) + void 
caxpyBxpz(const complex_t *a, std::vector &x, ColorSpinorField &y, + const complex_t *b, ColorSpinorField &z) { - std::vector a_(x.size()); - memcpy(a_.data(), a, x.size() * sizeof(Complex)); - std::vector b_(x.size()); - memcpy(b_.data(), b, x.size() * sizeof(Complex)); + std::vector a_(x.size()); + memcpy(a_.data(), a, x.size() * sizeof(complex_t)); + std::vector b_(x.size()); + memcpy(b_.data(), b, x.size() * sizeof(complex_t)); vector_ref x_; for (auto &xi : x) x_.push_back(*xi); @@ -689,20 +689,20 @@ namespace quda { } // Composite field version - void caxpy(const Complex *a, ColorSpinorField &x, ColorSpinorField &y){ caxpy(a, x.Components(), y.Components()); } - void caxpy_U(const Complex *a, ColorSpinorField &x, ColorSpinorField &y) { caxpy_U(a, x.Components(), y.Components()); } - void caxpy_L(const Complex *a, ColorSpinorField &x, ColorSpinorField &y) { caxpy_L(a, x.Components(), y.Components()); } + void caxpy(const complex_t *a, ColorSpinorField &x, ColorSpinorField &y){ caxpy(a, x.Components(), y.Components()); } + void caxpy_U(const complex_t *a, ColorSpinorField &x, ColorSpinorField &y) { caxpy_U(a, x.Components(), y.Components()); } + void caxpy_L(const complex_t *a, ColorSpinorField &x, ColorSpinorField &y) { caxpy_L(a, x.Components(), y.Components()); } - void axpy(const double *a, ColorSpinorField &x, ColorSpinorField &y) { axpy(a, x.Components(), y.Components()); } - void axpy_U(const double *a, ColorSpinorField &x, ColorSpinorField &y) { axpy_U(a, x.Components(), y.Components()); } - void axpy_L(const double *a, ColorSpinorField &x, ColorSpinorField &y) { axpy_L(a, x.Components(), y.Components()); } + void axpy(const real_t *a, ColorSpinorField &x, ColorSpinorField &y) { axpy(a, x.Components(), y.Components()); } + void axpy_U(const real_t *a, ColorSpinorField &x, ColorSpinorField &y) { axpy_U(a, x.Components(), y.Components()); } + void axpy_L(const real_t *a, ColorSpinorField &x, ColorSpinorField &y) { axpy_L(a, x.Components(), 
y.Components()); } - void axpyz(const double *a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z) + void axpyz(const real_t *a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z) { axpyz(a, x.Components(), y.Components(), z.Components()); } - void caxpyz(const Complex *a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z) + void caxpyz(const complex_t *a, ColorSpinorField &x, ColorSpinorField &y, ColorSpinorField &z) { caxpyz(a, x.Components(), y.Components(), z.Components()); } diff --git a/lib/multi_reduce_quda.cu b/lib/multi_reduce_quda.cu index f93ab431e4..25a993750e 100644 --- a/lib/multi_reduce_quda.cu +++ b/lib/multi_reduce_quda.cu @@ -12,7 +12,7 @@ namespace quda { class MultiReduce : public TunableMultiReduction { using real = typename mapper::type; - using host_reduce_t = typename Reducer::reduce_t; + using host_reduce_t = typename Reducer::reduce_t; const int NXZ; const int NYW; Reducer r; @@ -596,14 +596,14 @@ namespace quda { void postTune() override {} // FIXME - use write to determine what needs to be saved }; - void reDotProduct(std::vector &result, cvector_ref &x, + void reDotProduct(std::vector &result, cvector_ref &x, cvector_ref &y) { auto &x0 = x[0]; auto &y0 = y[0]; if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0"); - std::vector result_tmp(x.size() * y.size(), 0.0); + std::vector result_tmp(x.size() * y.size(), 0.0); if (x.size() == 1) { auto NYW_max = y0.Precision() == QUDA_DOUBLE_PRECISION ? @@ -615,7 +615,7 @@ namespace quda { multiReduce_recurse(result_tmp, x, y, x, x, 0, 0, false, max_tile_size); } else if (y.size() == 1 && x0.Precision() == y0.Precision()) { - std::vector result_trans(x.size() * y.size()); + std::vector result_trans(x.size() * y.size()); // swap (x<->y and w<-z> when doing transpose calculation) auto NXZ_max = x0.Precision() == QUDA_DOUBLE_PRECISION ? 
@@ -633,9 +633,9 @@ namespace quda { for (unsigned int i = 0; i < ylen; i++) result_tmp[i * xlen + j] = result_trans[j * ylen + i]; } else if (x0.Precision() == y0.Precision()) { - TransposeTune(result_tmp, x, y, false); + TransposeTune(result_tmp, x, y, false); } else { - TileSizeTune(result_tmp, x, y, x, x, false); + TileSizeTune(result_tmp, x, y, x, x, false); } // do a single multi-node reduction only once we have computed all local dot products @@ -647,14 +647,14 @@ namespace quda { result = transpose(result_tmp, y.size(), x.size()); } - void cDotProduct(std::vector &result, cvector_ref &x, + void cDotProduct(std::vector &result, cvector_ref &x, cvector_ref &y) { auto &x0 = x[0]; auto &y0 = y[0]; if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0"); - std::vector result_tmp(x.size() * y.size(), 0.0); + std::vector result_tmp(x.size() * y.size(), 0.0); if (x.size() == 1) { auto NYW_max = y0.Precision() == QUDA_DOUBLE_PRECISION ? @@ -666,7 +666,7 @@ namespace quda { multiReduce_recurse(result_tmp, x, y, x, x, 0, 0, false, max_tile_size); } else if (y.size() == 1 && x0.Precision() == y0.Precision()) { - std::vector result_trans(x.size() * y.size()); + std::vector result_trans(x.size() * y.size()); // swap (x<->y and w<-z> when doing transpose calculation) auto NXZ_max = x0.Precision() == QUDA_DOUBLE_PRECISION ? 
@@ -684,9 +684,9 @@ namespace quda { for (unsigned int i = 0; i < ylen; i++) result_tmp[i * xlen + j] = conj(result_trans[j * ylen + i]); } } else if (x0.Precision() == y0.Precision()) { - TransposeTune(result_tmp, x, y, false); + TransposeTune(result_tmp, x, y, false); } else { - TileSizeTune(result_tmp, x, y, x, x, false); + TileSizeTune(result_tmp, x, y, x, x, false); } // do a single multi-node reduction only once we have computed all local dot products @@ -698,14 +698,14 @@ namespace quda { result = transpose(result_tmp, y.size(), x.size()); } - void hDotProduct(std::vector &result, cvector_ref &x, + void hDotProduct(std::vector &result, cvector_ref &x, cvector_ref &y) { if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0"); if (x.size() != y.size()) errorQuda("Cannot call Hermitian block dot product on non-square inputs"); - std::vector result_tmp(x.size() * y.size(), 0.0); - TileSizeTune(result_tmp, x, y, x, x, true, false); // last false is b/c L2 norm + std::vector result_tmp(x.size() * y.size(), 0.0); + TileSizeTune(result_tmp, x, y, x, x, true, false); // last false is b/c L2 norm // do a single multi-node reduction only once we have computed all local dot products comm_allreduce_sum(result_tmp); // FIXME - could optimize this for Hermiticity as well @@ -723,14 +723,14 @@ namespace quda { } // for (p, Ap) norms in CG which are Hermitian. 
- void hDotProduct_Anorm(std::vector &result, cvector_ref &x, + void hDotProduct_Anorm(std::vector &result, cvector_ref &x, cvector_ref &y) { if (x.size() == 0 || y.size() == 0) errorQuda("vector.size() == 0"); if (x.size() != y.size()) errorQuda("Cannot call Hermitian block A-norm dot product on non-square inputs"); - std::vector result_tmp(x.size() * y.size(), 0.0); - TileSizeTune(result_tmp, x, y, x, x, true, true); // last true is b/c A norm + std::vector result_tmp(x.size() * y.size(), 0.0); + TileSizeTune(result_tmp, x, y, x, x, true, true); // last true is b/c A norm // do a single multi-node reduction only once we have computed all local dot products comm_allreduce_sum(result_tmp); @@ -747,37 +747,37 @@ namespace quda { result[i * y.size() + j] = conj(result[j * x.size() + i]); } - void reDotProduct(double *result, std::vector &x, std::vector &y) + void reDotProduct(real_t *result, std::vector &x, std::vector &y) { - std::vector result_(x.size() * y.size()); + std::vector result_(x.size() * y.size()); vector_ref x_; for (auto &xi : x) x_.push_back(*xi); vector_ref y_; for (auto &yi : y) y_.push_back(*yi); reDotProduct(result_, std::move(x_), std::move(y_)); - memcpy(result, result_.data(), x.size() * y.size() * sizeof(double)); + memcpy(result, result_.data(), x.size() * y.size() * sizeof(real_t)); } - void cDotProduct(Complex *result, std::vector &x, std::vector &y) + void cDotProduct(complex_t *result, std::vector &x, std::vector &y) { - std::vector result_(x.size() * y.size()); + std::vector result_(x.size() * y.size()); vector_ref x_; for (auto &xi : x) x_.push_back(*xi); vector_ref y_; for (auto &yi : y) y_.push_back(*yi); cDotProduct(result_, std::move(x_), std::move(y_)); - memcpy(result, result_.data(), x.size() * y.size() * sizeof(Complex)); + memcpy(result, result_.data(), x.size() * y.size() * sizeof(complex_t)); } - void hDotProduct(Complex *result, std::vector &x, std::vector &y) + void hDotProduct(complex_t *result, std::vector &x, std::vector 
&y) { - std::vector result_(x.size() * y.size()); + std::vector result_(x.size() * y.size()); vector_ref x_; for (auto &xi : x) x_.push_back(*xi); vector_ref y_; for (auto &yi : y) y_.push_back(*yi); hDotProduct(result_, std::move(x_), std::move(y_)); - memcpy(result, result_.data(), x.size() * y.size() * sizeof(Complex)); + memcpy(result, result_.data(), x.size() * y.size() * sizeof(complex_t)); } } // namespace blas diff --git a/lib/multigrid.cpp b/lib/multigrid.cpp index 929849fdec..6878a05539 100644 --- a/lib/multigrid.cpp +++ b/lib/multigrid.cpp @@ -1123,7 +1123,7 @@ namespace quda } else { diracSmoother->MdagM(tmp2.Even(), tmp1.Odd()); } - Complex dot = cDotProduct(tmp2.Even(), tmp1.Odd()); + complex_t dot = cDotProduct(tmp2.Even(), tmp1.Odd()); double deviation = std::fabs(dot.imag()) / std::fabs(dot.real()); logQuda(QUDA_VERBOSE, "Smoother normal operator test (eta^dag M^dag M eta): real=%e imag=%e, relative imaginary deviation=%e\n", @@ -1140,7 +1140,7 @@ namespace quda // staggered preconditioned op. 
diracResidual->M(tmp2, tmp1); } - Complex dot = cDotProduct(tmp1, tmp2); + complex_t dot = cDotProduct(tmp1, tmp2); double deviation = std::fabs(dot.imag()) / std::fabs(dot.real()); logQuda(QUDA_VERBOSE, "Normal operator test (eta^dag M^dag M eta): real=%e imag=%e, relative imaginary deviation=%e\n", @@ -1501,7 +1501,7 @@ namespace quda if(param.mg_global.pre_orthonormalize) { for(int i=0; i<(int)B.size(); i++) { for (int j=0; j + complex_t alpha = cDotProduct(*B[j], *B[i]);// caxpy(-alpha, *B[j], *B[i]); // i-j } double nrm2 = norm2(*B[i]); @@ -1536,7 +1536,7 @@ namespace quda if (param.mg_global.post_orthonormalize) { for(int i=0; i<(int)B.size(); i++) { for (int j=0; j + complex_t alpha = cDotProduct(*B[j], *B[i]);// caxpy(-alpha, *B[j], *B[i]); // i-j } double nrm2 = norm2(*B[i]); @@ -1780,7 +1780,7 @@ namespace quda // This is the vector precision used by matResidual csParam.setPrecision(param.mg_global.invert_param->cuda_prec_sloppy, QUDA_INVALID_PRECISION, true); - std::vector evals(n_conv, 0.0); + std::vector evals(n_conv, 0.0); std::vector B_evecs(n_conv); for (auto &b : B_evecs) b = ColorSpinorField(csParam); diff --git a/lib/quda_arpack_interface.cpp b/lib/quda_arpack_interface.cpp index 5100cb0001..e542f78fcd 100644 --- a/lib/quda_arpack_interface.cpp +++ b/lib/quda_arpack_interface.cpp @@ -28,7 +28,7 @@ namespace quda void arpackErrorHelpNAUPD(); void arpackErrorHelpNEUPD(); - void arpack_solve(std::vector &h_evecs, std::vector &h_evals, const DiracMatrix &mat, + void arpack_solve(std::vector &h_evecs, std::vector &h_evals, const DiracMatrix &mat, QudaEigParam *eig_param, TimeProfile &profile) { // Create Eigensolver object for member function use @@ -105,23 +105,23 @@ namespace quda double tol_ = eig_param->tol; // ARPACK workspace - Complex I(0.0, 1.0); - std::vector resid_(ldv_); + complex_t I(0.0, 1.0); + std::vector resid_(ldv_); // Use initial guess? 
if (info_ > 0) { for (int a = 0; a < ldv_; a++) resid_[a] = drand48(); } - Complex sigma_ = 0.0; - std::vector w_workd_(3 * ldv_); - std::vector w_workl_(lworkl_); - std::vector w_workev_(2 * n_kr_); + complex_t sigma_ = 0.0; + std::vector w_workd_(3 * ldv_); + std::vector w_workl_(lworkl_); + std::vector w_workev_(2 * n_kr_); std::vector w_rwork_(n_kr_); std::vector select_(n_kr_); - std::vector h_evecs_(n_kr_ * ldv_); - std::vector h_evals_(n_ev_); + std::vector h_evecs_(n_kr_ * ldv_); + std::vector h_evals_(n_ev_); // create container wrapping the vectors returned from ARPACK ColorSpinorParam param(h_evecs[0]); @@ -350,7 +350,7 @@ namespace quda // Sort the eigenvalues. To do this we use the QUDA EigenSolver method, which // requires transferring data to std::vector arrays. - std::vector evals(nconv, 0.0); + std::vector evals(nconv, 0.0); std::vector arpack_index(nconv, 0.0); for (int i = 0; i < nconv; i++) { evals[i] = h_evals_[i]; @@ -411,7 +411,7 @@ namespace quda // d_v2 = M*v = lambda_measured * v mat(d_v2, d_v); // d_v = ||lambda_measured*v - lambda_arpack*v|| - blas::caxpby(Complex {1.0, 0.0}, d_v2, -evals[i], d_v); + blas::caxpby(complex_t {1.0, 0.0}, d_v2, -evals[i], d_v); double L2norm = blas::norm2(d_v); profile.TPSTOP(QUDA_PROFILE_COMPUTE); @@ -538,7 +538,7 @@ namespace quda #else - void arpack_solve(std::vector &, std::vector &, const DiracMatrix &, QudaEigParam *, + void arpack_solve(std::vector &, std::vector &, const DiracMatrix &, QudaEigParam *, TimeProfile &) { errorQuda("(P)ARPACK has not been enabled for this build"); diff --git a/lib/reduce_quda.cu b/lib/reduce_quda.cu index f6ab452f7a..b89bf3d795 100644 --- a/lib/reduce_quda.cu +++ b/lib/reduce_quda.cu @@ -12,7 +12,7 @@ namespace quda { class Reduce : public TunableReduction2D { using real = typename mapper::type; - using host_reduce_t = typename Reducer::reduce_t; + using host_reduce_t = typename Reducer::reduce_t; Reducer r; const int nParity; // for composite fields this includes the 
number of composites @@ -145,128 +145,128 @@ namespace quda { template