diff --git a/include/blas_helper.cuh b/include/blas_helper.cuh index e2617e879a..1c55c5c2e3 100644 --- a/include/blas_helper.cuh +++ b/include/blas_helper.cuh @@ -111,9 +111,7 @@ namespace quda {} data_t(const ColorSpinorField &x) : - spinor(static_cast(const_cast(x).V())), - stride(x.VolumeCB()), - cb_offset(x.Bytes() / (2 * sizeof(store_t) * N)) + spinor(x.data()), stride(x.VolumeCB()), cb_offset(x.Bytes() / (2 * sizeof(store_t) * N)) {} }; @@ -141,8 +139,8 @@ namespace quda {} data_t(const ColorSpinorField &x) : - spinor(static_cast(const_cast(x).V())), - norm(static_cast(const_cast(x).Norm())), + spinor(x.data()), + norm(static_cast(x.Norm())), stride(x.VolumeCB()), cb_offset(x.Bytes() / (2 * sizeof(store_t) * N)), cb_norm_offset(x.Bytes() / (2 * sizeof(norm_t))) diff --git a/include/blas_quda.h b/include/blas_quda.h index 3fc051d3ff..07b09f1209 100644 --- a/include/blas_quda.h +++ b/include/blas_quda.h @@ -23,9 +23,6 @@ namespace quda { void setParam(int kernel, int prec, int threads, int blocks); - extern unsigned long long flops; - extern unsigned long long bytes; - inline void zero(cvector_ref &x) { for (auto i = 0u; i < x.size(); i++) x[i].zero(); @@ -33,7 +30,7 @@ namespace quda { inline void copy(ColorSpinorField &dst, const ColorSpinorField &src) { - if (dst.V() == src.V()) { + if (dst.data() == src.data()) { // check the fields are equivalent else error if (ColorSpinorField::are_compatible(dst, src)) return; diff --git a/include/clover_field.h b/include/clover_field.h index fcbf7ffd7a..41920d0bc1 100644 --- a/include/clover_field.h +++ b/include/clover_field.h @@ -178,9 +178,10 @@ namespace quda { int nColor = 0; int nSpin = 0; - void *clover = nullptr; - void *cloverInv = nullptr; + quda_ptr clover = {}; + quda_ptr cloverInv = {}; + bool inverse = false; double diagonal = 0.0; array max = {}; @@ -213,12 +214,18 @@ namespace quda { public: CloverField(const CloverFieldParam ¶m); - virtual ~CloverField(); static CloverField *Create(const CloverFieldParam ¶m); - void* V(bool inverse=false) { return inverse ? cloverInv : clover; } - const void* V(bool inverse=false) const { return inverse ? cloverInv : clover; } + template auto data(bool inverse = false) const + { + return inverse ? reinterpret_cast(cloverInv.data()) : reinterpret_cast(clover.data()); + } + + /** + @return whether the inverse is explicitly been allocated + */ + bool Inverse() const { return inverse; } /** @return diagonal scaling factor applied to the identity @@ -406,10 +413,6 @@ namespace quda { */ void copy_from_buffer(void *buffer); - friend class DiracClover; - friend class DiracCloverPC; - friend class DiracTwistedClover; - friend class DiracTwistedCloverPC; }; /** diff --git a/include/clover_field_order.h b/include/clover_field_order.h index 1464a02629..65d5ef6cff 100644 --- a/include/clover_field_order.h +++ b/include/clover_field_order.h @@ -312,7 +312,7 @@ namespace quda { static constexpr int N = nColor * nSpin / 2; reconstruct_t recon; FloatNAccessor(const CloverField &A, bool inverse = false) : - a(static_cast(const_cast(A.V(inverse)))), + a(A.Bytes() ? A.data(inverse) : nullptr), stride(A.VolumeCB()), offset_cb(A.Bytes() / (2 * sizeof(Float))), compressed_block_size(A.compressed_block_size()), @@ -403,7 +403,7 @@ namespace quda { const int N = nSpin * nColor / 2; const complex zero; Accessor(const CloverField &A, bool inverse = false) : - a(static_cast(const_cast(A.V(inverse)))), + a(A.Bytes() ? A.data(inverse) : nullptr), offset_cb(A.Bytes() / (2 * sizeof(Float))), zero(complex(0.0, 0.0)) { @@ -639,7 +639,7 @@ namespace quda { if (clover.max_element(is_inverse) == 0.0 && isFixed::value) errorQuda("%p max_element(%d) appears unset", &clover, is_inverse); if (clover.Diagonal() == 0.0 && clover.Reconstruct()) errorQuda("%p diagonal appears unset", &clover); - this->clover = clover_ ? clover_ : (Float *)(clover.V(is_inverse)); + this->clover = clover_ ? clover_ : clover.data(is_inverse); } QudaTwistFlavorType TwistFlavor() const { return twist_flavor; } @@ -844,7 +844,7 @@ namespace quda { if (clover.Order() != QUDA_PACKED_CLOVER_ORDER) { errorQuda("Invalid clover order %d for this accessor", clover.Order()); } - this->clover = clover_ ? clover_ : (Float *)(clover.V(inverse)); + this->clover = clover_ ? clover_ : clover.data(inverse); } QudaTwistFlavorType TwistFlavor() const { return twist_flavor; } @@ -892,8 +892,8 @@ namespace quda { if (clover.Order() != QUDA_QDPJIT_CLOVER_ORDER) { errorQuda("Invalid clover order %d for this accessor", clover.Order()); } - offdiag = clover_ ? ((Float **)clover_)[0] : ((Float **)clover.V(inverse))[0]; - diag = clover_ ? ((Float **)clover_)[1] : ((Float **)clover.V(inverse))[1]; + offdiag = clover_ ? ((Float **)clover_)[0] : clover.data(inverse)[0]; + diag = clover_ ? ((Float **)clover_)[1] : clover.data(inverse)[1]; } QudaTwistFlavorType TwistFlavor() const { return twist_flavor; } @@ -970,7 +970,7 @@ namespace quda { if (clover.Order() != QUDA_BQCD_CLOVER_ORDER) { errorQuda("Invalid clover order %d for this accessor", clover.Order()); } - this->clover[0] = clover_ ? clover_ : (Float *)(clover.V(inverse)); + this->clover[0] = clover_ ? clover_ : clover.data(inverse); this->clover[1] = (Float *)((char *)this->clover[0] + clover.Bytes() / 2); } diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 76fa31b943..4801bdbf48 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -121,18 +121,13 @@ namespace quda } }; - class ColorSpinorParam : public LatticeFieldParam - { - - public: + struct ColorSpinorParam : public LatticeFieldParam { int nColor = 0; // Number of colors of the field int nSpin = 0; // =1 for staggered, =2 for coarse Dslash, =4 for 4d spinor int nVec = 1; // number of packed vectors (for multigrid transfer operator) QudaTwistFlavorType twistFlavor = QUDA_TWIST_INVALID; // used by twisted mass - QudaSiteOrder siteOrder = QUDA_INVALID_SITE_ORDER; // defined for full fields - QudaFieldOrder fieldOrder = QUDA_INVALID_FIELD_ORDER; // Float, Float2, Float4 etc. QudaGammaBasis gammaBasis = QUDA_INVALID_GAMMA_BASIS; QudaFieldCreate create = QUDA_INVALID_FIELD_CREATE; @@ -179,7 +174,6 @@ namespace quda ColorSpinorParam() = default; // used to create cpu params - ColorSpinorParam(void *V, QudaInvertParam &inv_param, const lat_dim_t &X, const bool pc_solution, QudaFieldLocation location = QUDA_CPU_FIELD_LOCATION) : LatticeFieldParam(4, X, 0, location, inv_param.cpu_prec), @@ -188,20 +182,12 @@ namespace quda || inv_param.dslash_type == QUDA_LAPLACE_DSLASH) ? 1 : 4), - nVec(1), twistFlavor(inv_param.twist_flavor), - siteOrder(QUDA_INVALID_SITE_ORDER), - fieldOrder(QUDA_INVALID_FIELD_ORDER), gammaBasis(inv_param.gamma_basis), create(QUDA_REFERENCE_FIELD_CREATE), pc_type(inv_param.dslash_type == QUDA_DOMAIN_WALL_DSLASH ? QUDA_5D_PC : QUDA_4D_PC), - v(V), - is_composite(false), - composite_dim(0), - is_component(false), - component_id(0) + v(V) { - if (nDim > QUDA_MAX_DIM) errorQuda("Number of dimensions too great"); for (int d = 0; d < nDim; d++) x[d] = X[d]; @@ -343,8 +329,7 @@ namespace quda size_t length = 0; // length including pads, but not norm zone - void *v = nullptr; // the field elements - void *v_h = nullptr; // the field elements + quda_ptr v = {}; // the field elements size_t norm_offset = 0; /** offset to the norm (if applicable) */ // multi-GPU parameters @@ -441,6 +426,12 @@ namespace quda */ ColorSpinorField &operator=(ColorSpinorField &&field); + /** + @brief Returns if the object is empty (not initialized) + @return true if the object has not been allocated, otherwise false + */ + bool empty() const { return !init; } + /** @brief Copy the source field contents into this @param[in] src Source from which we are copying @@ -477,37 +468,19 @@ namespace quda /** @brief Return pointer to the field allocation */ - void *V() - { - if (ghost_only) errorQuda("Not defined for ghost-only field"); - return v; - } - - /** - @brief Return pointer to the field allocation - */ - const void *V() const - { - if (ghost_only) errorQuda("Not defined for ghost-only field"); - return v; - } - - /** - @brief Return pointer to the norm base pointer in the field allocation - */ - void *Norm() + template auto data() const { if (ghost_only) errorQuda("Not defined for ghost-only field"); - return static_cast(v) + norm_offset; + return reinterpret_cast(v.data()); } /** @brief Return pointer to the norm base pointer in the field allocation */ - const void *Norm() const + void *Norm() const { if (ghost_only) errorQuda("Not defined for ghost-only field"); - return static_cast(v) + norm_offset; + return static_cast(v.data()) + norm_offset; } size_t NormOffset() const { return norm_offset; } @@ -938,7 +911,7 @@ namespace quda static void test_compatible_weak(const ColorSpinorField &a, const ColorSpinorField &b); friend std::ostream &operator<<(std::ostream &out, const ColorSpinorField &); - friend class ColorSpinorParam; + friend struct ColorSpinorParam; }; /** @@ -1022,28 +995,30 @@ namespace quda /** @brief Generate a random noise spinor. This variant allows the user to manage the RNG state. - @param src The colorspinorfield - @param randstates Random state - @param type The type of noise to create (QUDA_NOISE_GAUSSIAN or QUDA_NOISE_UNIFORM) + @param[out] src The colorspinorfield + @param[in,out] randstates Random state + @param[in] type The type of noise to create (QUDA_NOISE_GAUSSIAN or QUDA_NOISE_UNIFORM) */ void spinorNoise(ColorSpinorField &src, RNG &randstates, QudaNoiseType type); /** @brief Generate a random noise spinor. This variant just requires a seed and will create and destroy the random number state. - @param src The colorspinorfield - @param seed Seed - @param type The type of noise to create (QUDA_NOISE_GAUSSIAN or QUDA_NOISE_UNIFORM) + @param[out] src The colorspinorfield + @param[in] seed Seed + @param[in] type The type of noise to create (QUDA_NOISE_GAUSSIAN or QUDA_NOISE_UNIFORM) */ void spinorNoise(ColorSpinorField &src, unsigned long long seed, QudaNoiseType type); /** @brief Generate a set of diluted color spinors from a single source. - @param v Diluted vector set - @param src The input source - @param type The type of dilution to apply (QUDA_DILUTION_SPIN_COLOR, etc.) + @param[out] v Diluted vector set + @param[in] src The input source + @param[in] type The type of dilution to apply (QUDA_DILUTION_SPIN_COLOR, etc.) + @param[in] local_block The local block size to use when using QUDA_DILUTION_BLOCK dilution */ - void spinorDilute(std::vector &v, const ColorSpinorField &src, QudaDilutionType type); + void spinorDilute(std::vector &v, const ColorSpinorField &src, QudaDilutionType type, + const lat_dim_t &local_block = {}); /** @brief Helper function for determining if the preconditioning diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 67e2192122..022d2a2b69 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -877,14 +877,13 @@ namespace quda FieldOrderCB(const ColorSpinorField &field, int nFace = 1, void *const v_ = 0, void *const *ghost_ = 0) : GhostOrder(field, nFace, ghost_), volumeCB(field.VolumeCB()), accessor(field) { - v.v = v_ ? static_cast *>(const_cast(v_)) : - static_cast *>(const_cast(field.V())); + v.v = v_ ? static_cast *>(const_cast(v_)) : field.data *>(); resetScale(field.Scale()); if constexpr (fixed && block_float) { if constexpr (nColor == 3 && nSpin == 1 && nVec == 1 && order == 2) // special case where the norm is packed into the per site struct - v.norm = reinterpret_cast(const_cast(field.V())); + v.norm = field.data(); else v.norm = static_cast(const_cast(field.Norm())); v.norm_offset = field.Bytes() / (2 * sizeof(norm_t)); @@ -1075,33 +1074,37 @@ namespace quda using GhostVector = typename VectorType::type; using AllocInt = typename AllocType::type; using norm_type = float; - Float *field; - norm_type *norm; - const AllocInt offset; // offset can be 32-bit or 64-bit - const AllocInt norm_offset; - int volumeCB; - int faceVolumeCB[4]; - mutable Float *ghost[8]; - mutable norm_type *ghost_norm[8]; - int nParity; - void *backup_h; //! host memory for backing up the field when tuning - size_t bytes; + Float *field = nullptr; + norm_type *norm = nullptr; + AllocInt offset = 0; // offset can be 32-bit or 64-bit + AllocInt norm_offset = 0; + int volumeCB = 0; + array faceVolumeCB = {}; + mutable array ghost = {}; + mutable array ghost_norm = {}; + int nParity = 0; + void *backup_h = nullptr; //! host memory for backing up the field when tuning + size_t bytes = 0; + + FloatNOrder() = default; + FloatNOrder(const FloatNOrder &) = default; FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *buffer = 0, Float **ghost_ = 0) : - field(buffer ? buffer : (Float *)a.V()), + field(buffer ? buffer : a.data()), norm(buffer ? reinterpret_cast(reinterpret_cast(buffer) + a.NormOffset()) : const_cast(reinterpret_cast(a.Norm()))), offset(a.Bytes() / (2 * sizeof(Float) * N)), norm_offset(a.Bytes() / (2 * sizeof(norm_type))), volumeCB(a.VolumeCB()), nParity(a.SiteSubset()), - backup_h(nullptr), bytes(a.Bytes()) { for (int i = 0; i < 4; i++) { faceVolumeCB[i] = a.SurfaceCB(i) * nFace; } resetGhost(ghost_ ? (void **)ghost_ : a.Ghost()); } + FloatNOrder &operator=(const FloatNOrder &) = default; + void resetGhost(void *const *ghost_) const { for (int dim = 0; dim < 4; dim++) { @@ -1306,27 +1309,31 @@ namespace quda using GhostVector = int4; // 128-bit packed type using AllocInt = typename AllocType::type; using norm_type = float; - Float *field; - const AllocInt offset; // offset can be 32-bit or 64-bit - int volumeCB; - int faceVolumeCB[4]; - mutable Float *ghost[8]; - int nParity; - void *backup_h; //! host memory for backing up the field when tuning - size_t bytes; + Float *field = nullptr; + const AllocInt offset = 0; // offset can be 32-bit or 64-bit + int volumeCB = 0; + array faceVolumeCB = {}; + mutable array ghost = {}; + int nParity = 0; + void *backup_h = nullptr; //! host memory for backing up the field when tuning + size_t bytes = 0; + + FloatNOrder() = default; + FloatNOrder(const FloatNOrder &) = default; FloatNOrder(const ColorSpinorField &a, int nFace = 1, Float *buffer = 0, Float **ghost_ = 0) : - field(buffer ? buffer : (Float *)a.V()), + field(buffer ? buffer : a.data()), offset(a.Bytes() / (2 * sizeof(Vector))), volumeCB(a.VolumeCB()), nParity(a.SiteSubset()), - backup_h(nullptr), bytes(a.Bytes()) { for (int i = 0; i < 4; i++) { faceVolumeCB[i] = a.SurfaceCB(i) * nFace; } resetGhost(ghost_ ? (void **)ghost_ : a.Ghost()); } + FloatNOrder &operator=(const FloatNOrder &) = default; + void resetGhost(void *const *ghost_) const { for (int dim = 0; dim < 4; dim++) { @@ -1505,7 +1512,7 @@ namespace quda int faceVolumeCB[4]; int nParity; SpaceColorSpinorOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) : - field(field_ ? field_ : (Float *)a.V()), + field(field_ ? field_ : a.data()), offset(a.Bytes() / (2 * sizeof(Float))), volumeCB(a.VolumeCB()), nParity(a.SiteSubset()) @@ -1589,7 +1596,7 @@ namespace quda int faceVolumeCB[4]; int nParity; SpaceSpinorColorOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) : - field(field_ ? field_ : (Float *)a.V()), + field(field_ ? field_ : a.data()), offset(a.Bytes() / (2 * sizeof(Float))), volumeCB(a.VolumeCB()), nParity(a.SiteSubset()) @@ -1668,7 +1675,7 @@ namespace quda int exDim[4]; // full field dimensions PaddedSpaceSpinorColorOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) : - field(field_ ? field_ : (Float *)a.V()), + field(field_ ? field_ : a.data()), volumeCB(a.VolumeCB()), exVolumeCB(1), nParity(a.SiteSubset()), @@ -1763,7 +1770,7 @@ namespace quda int volumeCB; int nParity; QDPJITDiracOrder(const ColorSpinorField &a, int = 1, Float *field_ = 0, float * = 0) : - field(field_ ? field_ : (Float *)a.V()), volumeCB(a.VolumeCB()), nParity(a.SiteSubset()) + field(field_ ? field_ : a.data()), volumeCB(a.VolumeCB()), nParity(a.SiteSubset()) { } diff --git a/include/dirac_quda.h b/include/dirac_quda.h index 37573f68ee..1aa339139f 100644 --- a/include/dirac_quda.h +++ b/include/dirac_quda.h @@ -52,9 +52,9 @@ namespace quda { QudaMatPCType matpcType; QudaDagType dagger; - cudaGaugeField *gauge; - cudaGaugeField *fatGauge; // used by staggered only - cudaGaugeField *longGauge; // used by staggered only + GaugeField *gauge; + GaugeField *fatGauge; // used by staggered only + GaugeField *longGauge; // used by staggered only int laplace3D; CloverField *clover; GaugeField *xInvKD; // used for the Kahler-Dirac operator only @@ -168,13 +168,12 @@ namespace quda { friend class DiracG5M; protected: - cudaGaugeField *gauge; + GaugeField *gauge; double kappa; double mass; int laplace3D; QudaMatPCType matpcType; mutable QudaDagType dagger; // mutable to simplify implementation of Mdag - mutable unsigned long long flops; QudaDiracType type; mutable QudaPrecision halo_precision; // only does something for DiracCoarse at present @@ -404,16 +403,6 @@ namespace quda { */ virtual bool AllowTruncation() const { return false; } - /** - @brief returns and then zeroes flopcount - */ - unsigned long long Flops() const - { - unsigned long long rtn = flops; - flops = 0; - return rtn; - } - /** @brief returns preconditioning type */ @@ -450,7 +439,7 @@ namespace quda { @return Error for non-staggered operators */ - virtual cudaGaugeField *getStaggeredShortLinkField() const + virtual GaugeField *getStaggeredShortLinkField() const { errorQuda("Invalid dirac type %d", getDiracType()); return nullptr; @@ -461,7 +450,7 @@ namespace quda { @return Error for non-improved staggered operators */ - virtual cudaGaugeField *getStaggeredLongLinkField() const + virtual GaugeField *getStaggeredLongLinkField() const { errorQuda("Invalid dirac type %d", getDiracType()); return nullptr; @@ -476,10 +465,7 @@ namespace quda { * @param long_gauge_in Updated long links * @param clover_in Updated clover field */ - virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *) - { - gauge = gauge_in; - } + virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *) { gauge = gauge_in; } /** * @brief Create the coarse operator (virtual parent) @@ -623,7 +609,7 @@ namespace quda { * @param long_gauge_in Updated long links * @param clover_in Updated clover field */ - virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *clover_in) + virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *clover_in) { DiracWilson::updateFields(gauge_in, nullptr, nullptr, nullptr); clover = clover_in; @@ -979,7 +965,7 @@ namespace quda { class DiracMobiusPC : public DiracMobius { protected: - mutable cudaGaugeField *extended_gauge; + mutable GaugeField *extended_gauge; private: public: @@ -1227,7 +1213,7 @@ namespace quda { * @param long_gauge_in Updated long links * @param clover_in Updated clover field */ - virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *clover_in) + virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *clover_in) { DiracWilson::updateFields(gauge_in, nullptr, nullptr, nullptr); clover = clover_in; @@ -1365,7 +1351,7 @@ namespace quda { @return Gauge field */ - virtual cudaGaugeField *getStaggeredShortLinkField() const { return gauge; } + virtual GaugeField *getStaggeredShortLinkField() const { return gauge; } /** * @brief Create the coarse staggered operator. @@ -1500,7 +1486,7 @@ namespace quda { * @param long_gauge_in Updated long links * @param clover_in Updated clover field */ - virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in, + virtual void updateFields(GaugeField *gauge_in, GaugeField *fat_gauge_in, GaugeField *long_gauge_in, CloverField *clover_in); /** @@ -1541,8 +1527,8 @@ namespace quda { class DiracImprovedStaggered : public Dirac { protected: - cudaGaugeField *fatGauge; - cudaGaugeField *longGauge; + GaugeField *fatGauge; + GaugeField *longGauge; public: DiracImprovedStaggered(const DiracParam ¶m); @@ -1569,14 +1555,14 @@ namespace quda { @return fat link field */ - virtual cudaGaugeField *getStaggeredShortLinkField() const { return fatGauge; } + virtual GaugeField *getStaggeredShortLinkField() const { return fatGauge; } /** @brief return the long link field for staggered operators for MG setup @return long link field */ - virtual cudaGaugeField *getStaggeredLongLinkField() const { return longGauge; } + virtual GaugeField *getStaggeredLongLinkField() const { return longGauge; } /** * @brief Update the internal gauge, fat gauge, long gauge, clover field pointer as appropriate. @@ -1587,7 +1573,7 @@ namespace quda { * @param long_gauge_in Updated long links * @param clover_in Updated clover field */ - virtual void updateFields(cudaGaugeField *, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in, CloverField *) + virtual void updateFields(GaugeField *, GaugeField *fat_gauge_in, GaugeField *long_gauge_in, CloverField *) { Dirac::updateFields(fat_gauge_in, nullptr, nullptr, nullptr); fatGauge = fat_gauge_in; @@ -1736,7 +1722,7 @@ namespace quda { * @param long_gauge_in Updated long links * @param clover_in Updated clover field */ - virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in, + virtual void updateFields(GaugeField *gauge_in, GaugeField *fat_gauge_in, GaugeField *long_gauge_in, CloverField *clover_in); /** @@ -1791,19 +1777,19 @@ namespace quda { const bool dslash_use_mma; /** Whether to use tensor cores or not */ const bool need_aos_gauge_copy; // Whether or not we need an AoS copy of the gauge fields - mutable std::shared_ptr Y_h; /** CPU copy of the coarse link field */ - mutable std::shared_ptr X_h; /** CPU copy of the coarse clover term */ - mutable std::shared_ptr Xinv_h; /** CPU copy of the inverse coarse clover term */ - mutable std::shared_ptr Yhat_h; /** CPU copy of the preconditioned coarse link field */ + mutable std::shared_ptr Y_h; /** CPU copy of the coarse link field */ + mutable std::shared_ptr X_h; /** CPU copy of the coarse clover term */ + mutable std::shared_ptr Xinv_h; /** CPU copy of the inverse coarse clover term */ + mutable std::shared_ptr Yhat_h; /** CPU copy of the preconditioned coarse link field */ - mutable std::shared_ptr Y_d; /** GPU copy of the coarse link field */ - mutable std::shared_ptr X_d; /** GPU copy of the coarse clover term */ - mutable std::shared_ptr Y_aos_d; /** AoS GPU copy of the coarse link field */ - mutable std::shared_ptr X_aos_d; /** AoS GPU copy of the coarse clover term */ - mutable std::shared_ptr Xinv_d; /** GPU copy of inverse coarse clover term */ - mutable std::shared_ptr Yhat_d; /** GPU copy of the preconditioned coarse link field */ - mutable std::shared_ptr Xinv_aos_d; /** AoS GPU copy of inverse coarse clover term */ - mutable std::shared_ptr Yhat_aos_d; /** AoS GPU copy of the preconditioned coarse link field */ + mutable std::shared_ptr Y_d; /** GPU copy of the coarse link field */ + mutable std::shared_ptr X_d; /** GPU copy of the coarse clover term */ + mutable std::shared_ptr Y_aos_d; /** AoS GPU copy of the coarse link field */ + mutable std::shared_ptr X_aos_d; /** AoS GPU copy of the coarse clover term */ + mutable std::shared_ptr Xinv_d; /** GPU copy of inverse coarse clover term */ + mutable std::shared_ptr Yhat_d; /** GPU copy of the preconditioned coarse link field */ + mutable std::shared_ptr Xinv_aos_d; /** AoS GPU copy of inverse coarse clover term */ + mutable std::shared_ptr Yhat_aos_d; /** AoS GPU copy of the preconditioned coarse link field */ /** @brief Initialize the coarse gauge fields. Location is @@ -1862,10 +1848,10 @@ namespace quda { @param[in] Xinv_d GPU coarse inverse clover field @param[in] Yhat_d GPU coarse preconditioned link field */ - DiracCoarse(const DiracParam ¶m, std::shared_ptr Y_h, std::shared_ptr X_h, - std::shared_ptr Xinv_h, std::shared_ptr Yhat_h, - std::shared_ptr Y_d = nullptr, std::shared_ptr X_d = nullptr, - std::shared_ptr Xinv_d = nullptr, std::shared_ptr Yhat_d = nullptr); + DiracCoarse(const DiracParam ¶m, std::shared_ptr Y_h, std::shared_ptr X_h, + std::shared_ptr Xinv_h, std::shared_ptr Yhat_h, + std::shared_ptr Y_d = nullptr, std::shared_ptr X_d = nullptr, + std::shared_ptr Xinv_d = nullptr, std::shared_ptr Yhat_d = nullptr); /** @param[in] dirac Another operator instance to clone from (shallow copy) @@ -1955,7 +1941,7 @@ namespace quda { virtual QudaDiracType getDiracType() const { return QUDA_COARSE_DIRAC; } - virtual void updateFields(cudaGaugeField *gauge_in, cudaGaugeField *, cudaGaugeField *, CloverField *) + virtual void updateFields(GaugeField *gauge_in, GaugeField *, GaugeField *, CloverField *) { Dirac::updateFields(gauge_in, nullptr, nullptr, nullptr); warningQuda("Coarse gauge links cannot be trivially updated for DiracCoarse(PC). Perform an MG update instead."); @@ -2027,10 +2013,10 @@ namespace quda { @param[in] Xinv_d GPU coarse inverse clover field @param[in] Yhat_d GPU coarse preconditioned link field */ - DiracCoarsePC(const DiracParam ¶m, std::shared_ptr Y_h, std::shared_ptr X_h, - std::shared_ptr Xinv_h, std::shared_ptr Yhat_h, - std::shared_ptr Y_d = nullptr, std::shared_ptr X_d = nullptr, - std::shared_ptr Xinv_d = nullptr, std::shared_ptr Yhat_d = nullptr); + DiracCoarsePC(const DiracParam ¶m, std::shared_ptr Y_h, std::shared_ptr X_h, + std::shared_ptr Xinv_h, std::shared_ptr Yhat_h, + std::shared_ptr Y_d = nullptr, std::shared_ptr X_d = nullptr, + std::shared_ptr Xinv_d = nullptr, std::shared_ptr Yhat_d = nullptr); /** @param[in] dirac Another operator instance to clone from (shallow copy) @@ -2243,8 +2229,6 @@ namespace quda { */ virtual void operator()(cvector_ref &out, cvector_ref &in) const = 0; - unsigned long long flops() const { return dirac->Flops(); } - QudaMatPCType getMatPCType() const { return dirac->getMatPCType(); } virtual int getStencilSteps() const = 0; diff --git a/include/dslash_helper.cuh b/include/dslash_helper.cuh index 836b474cf0..e67582b682 100644 --- a/include/dslash_helper.cuh +++ b/include/dslash_helper.cuh @@ -305,8 +305,8 @@ namespace quda #endif // constructor needed for staggered to set xpay from derived class - DslashArg(const ColorSpinorField &in, const GaugeField &U, int parity, bool dagger, bool xpay, int nFace, - int spin_project, const int *comm_override, + DslashArg(const ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, const ColorSpinorField &x, + int parity, bool dagger, bool xpay, int nFace, int spin_project, const int *comm_override, #ifdef NVSHMEM_COMMS int shmem_ = 0) : #else @@ -348,8 +348,14 @@ namespace quda retcount_intra(dslash::get_shmem_retcount_intra()), retcount_inter(dslash::get_shmem_retcount_inter()) #endif - { + if (in.data() == out.data()) errorQuda("Aliasing pointers"); + checkOrder(out, in, x); // check all orders match + checkPrecision(out, in, x, U); // check all precisions match + checkLocation(out, in, x, U); // check all locations match + if (!in.isNative() || !U.isNative()) + errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder()); + for (int d = 0; d < 4; d++) { commDim[d] = (comm_override[d] == 0) ? 0 : comm_dim_partitioned(d); } diff --git a/include/enum_quda.h b/include/enum_quda.h index 0aa7966d55..c4cbb59901 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -10,8 +10,11 @@ typedef enum qudaError_t { QUDA_SUCCESS = 0, QUDA_ERROR = 1, QUDA_ERROR_UNINITIA typedef enum QudaMemoryType_s { QUDA_MEMORY_DEVICE, - QUDA_MEMORY_PINNED, + QUDA_MEMORY_DEVICE_PINNED, + QUDA_MEMORY_HOST, + QUDA_MEMORY_HOST_PINNED, QUDA_MEMORY_MAPPED, + QUDA_MEMORY_MANAGED, QUDA_MEMORY_INVALID = QUDA_INVALID_ENUM } QudaMemoryType; @@ -394,6 +397,7 @@ typedef enum QudaDilutionType_s { QUDA_DILUTION_COLOR, QUDA_DILUTION_SPIN_COLOR, QUDA_DILUTION_SPIN_COLOR_EVEN_ODD, + QUDA_DILUTION_BLOCK, QUDA_DILUTION_INVALID = QUDA_INVALID_ENUM } QudaDilutionType; @@ -532,7 +536,7 @@ typedef enum QudaGhostExchange_s { typedef enum QudaStaggeredPhase_s { QUDA_STAGGERED_PHASE_NO = 0, QUDA_STAGGERED_PHASE_MILC = 1, - QUDA_STAGGERED_PHASE_CPS = 2, + QUDA_STAGGERED_PHASE_CHROMA = 2, QUDA_STAGGERED_PHASE_TIFR = 3, QUDA_STAGGERED_PHASE_INVALID = QUDA_INVALID_ENUM } QudaStaggeredPhase; diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index c24dd3b869..6a8708948a 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -9,7 +9,7 @@ # gfortran). #*/ -#define QUDA_INVALID_ENUM (-Z'7fffffff' - 1) +#define QUDA_INVALID_ENUM -int(Z'7FFFFFFF') - 1 #define QudaLinkType integer(4) @@ -17,11 +17,6 @@ #define QUDA_ERROR 1 #define QUDA_ERROR_UNINITIALIZED 2 -#define QUDA_MEMORY_DEVICE 0 -#define QUDA_MEMORY_PINNED 1 -#define QUDA_MEMORY_MAPPED 2 -#define QUDA_MEMORY_INVALID QUDA_INVALID_ENUM - #define QUDA_SU3_LINKS 0 #define QUDA_GENERAL_LINKS 1 #define QUDA_THREE_LINKS 2 @@ -366,6 +361,7 @@ #define QUDA_DILUTION_COLOR 1 #define QUDA_DILUTION_SPIN_COLOR 2 #define QUDA_DILUTION_SPIN_COLOR_EVEN_ODD 3 +#define QUDA_DILUTION_BLOCK 4 #define QUDA_DILUTION_INVALID QUDA_INVALID_ENUM #define QudaProjectionType integer(4) @@ -474,10 +470,10 @@ #define QUDA_GHOST_EXCHANGE_INVALID QUDA_INVALID_ENUM #define QudaStaggeredPhase integer(4) -#define QUDA_STAGGERED_PHASE_NO 0 -#define QUDA_STAGGERED_PHASE_MILC 1 -#define QUDA_STAGGERED_PHASE_CPS 2 -#define QUDA_STAGGERED_PHASE_TIFR 3 +#define QUDA_STAGGERED_PHASE_NO 0 +#define QUDA_STAGGERED_PHASE_MILC 1 +#define QUDA_STAGGERED_PHASE_CHROMA 2 +#define QUDA_STAGGERED_PHASE_TIFR 3 #define QUDA_STAGGERED_PHASE_INVALID QUDA_INVALID_ENUM #define QudaContractType integer(4) diff --git a/include/gauge_field.h b/include/gauge_field.h index e4ec3ae09d..948960b36f 100644 --- a/include/gauge_field.h +++ b/include/gauge_field.h @@ -37,113 +37,76 @@ namespace quda { } // namespace gauge struct GaugeFieldParam : public LatticeFieldParam { + int nColor = 3; + int nFace = 0; - int nColor; - int nFace; + QudaGaugeFieldOrder order = QUDA_INVALID_GAUGE_ORDER; + QudaGaugeFixed fixed = QUDA_GAUGE_FIXED_NO; + QudaLinkType link_type = QUDA_WILSON_LINKS; + QudaTboundary t_boundary = QUDA_INVALID_T_BOUNDARY; + QudaReconstructType reconstruct = QUDA_RECONSTRUCT_NO; - QudaReconstructType reconstruct; - QudaGaugeFieldOrder order; - QudaGaugeFixed fixed; - QudaLinkType link_type; - QudaTboundary t_boundary; + double anisotropy = 1.0; + double tadpole = 1.0; + GaugeField *field = nullptr; // pointer to a pre-allocated field + void *gauge = nullptr; // used when we use a reference to an external field - double anisotropy; - double tadpole; - void *gauge; // used when we use a reference to an external field + QudaFieldCreate create = QUDA_REFERENCE_FIELD_CREATE; // used to determine the type of field created - QudaFieldCreate create; // used to determine the type of field created - - QudaFieldGeometry geometry; // whether the field is a scale, vector or tensor + QudaFieldGeometry geometry = QUDA_VECTOR_GEOMETRY; // whether the field is a scalar, vector or tensor // whether we need to compute the fat link maxima // FIXME temporary flag until we have a kernel that can do this, then we just do this in copy() // always set to false, requires external override - bool compute_fat_link_max; + bool compute_fat_link_max = false; /** The staggered phase convention to use */ - QudaStaggeredPhase staggeredPhaseType; + QudaStaggeredPhase staggeredPhaseType = QUDA_STAGGERED_PHASE_NO; /** Whether the staggered phase factor has been applied */ - bool staggeredPhaseApplied; + bool staggeredPhaseApplied = false; /** Imaginary chemical potential */ - double i_mu; + double i_mu = 0.0; /** Offset into MILC site struct to the desired matrix field (only if gauge_order=MILC_SITE_GAUGE_ORDER) */ - size_t site_offset; + size_t site_offset = 0; /** Size of MILC site struct (only if gauge_order=MILC_SITE_GAUGE_ORDER) */ - size_t site_size; + size_t site_size = 0; // Default constructor - GaugeFieldParam(void *const h_gauge = NULL) : - LatticeFieldParam(), - nColor(3), - nFace(0), - reconstruct(QUDA_RECONSTRUCT_NO), - order(QUDA_INVALID_GAUGE_ORDER), - fixed(QUDA_GAUGE_FIXED_NO), - link_type(QUDA_WILSON_LINKS), - t_boundary(QUDA_INVALID_T_BOUNDARY), - anisotropy(1.0), - tadpole(1.0), - gauge(h_gauge), - create(QUDA_REFERENCE_FIELD_CREATE), - geometry(QUDA_VECTOR_GEOMETRY), - compute_fat_link_max(false), - staggeredPhaseType(QUDA_STAGGERED_PHASE_NO), - staggeredPhaseApplied(false), - i_mu(0.0), - site_offset(0), - site_size(0) - { - } + GaugeFieldParam(void *const h_gauge = nullptr) : gauge(h_gauge) { } GaugeFieldParam(const GaugeField &u); GaugeFieldParam(const lat_dim_t &x, QudaPrecision precision, QudaReconstructType reconstruct, int pad, QudaFieldGeometry geometry, QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_PAD) : LatticeFieldParam(4, x, pad, QUDA_INVALID_FIELD_LOCATION, precision, ghostExchange), - nColor(3), - nFace(0), reconstruct(reconstruct), - order(QUDA_INVALID_GAUGE_ORDER), - fixed(QUDA_GAUGE_FIXED_NO), - link_type(QUDA_WILSON_LINKS), - t_boundary(QUDA_INVALID_T_BOUNDARY), - anisotropy(1.0), - tadpole(1.0), - gauge(0), create(QUDA_NULL_FIELD_CREATE), - geometry(geometry), - compute_fat_link_max(false), - staggeredPhaseType(QUDA_STAGGERED_PHASE_NO), - staggeredPhaseApplied(false), - i_mu(0.0), - site_offset(0), - site_size(0) + geometry(geometry) { } GaugeFieldParam(const QudaGaugeParam ¶m, void *h_gauge = nullptr, QudaLinkType link_type_ = QUDA_INVALID_LINKS) : LatticeFieldParam(param), - nColor(3), - nFace(0), - reconstruct(QUDA_RECONSTRUCT_NO), order(param.gauge_order), fixed(param.gauge_fix), link_type(link_type_ != QUDA_INVALID_LINKS ? link_type_ : param.type), - t_boundary(param.t_boundary), + t_boundary(link_type == QUDA_ASQTAD_MOM_LINKS ? QUDA_PERIODIC_T : param.t_boundary), + // if we have momentum field and not using TIFR field, then we always have recon-10 + reconstruct(link_type == QUDA_ASQTAD_MOM_LINKS && order != QUDA_TIFR_GAUGE_ORDER + && order != QUDA_TIFR_PADDED_GAUGE_ORDER ? + QUDA_RECONSTRUCT_10 : + QUDA_RECONSTRUCT_NO), anisotropy(param.anisotropy), tadpole(param.tadpole_coeff), gauge(h_gauge), - create(QUDA_REFERENCE_FIELD_CREATE), - geometry(QUDA_VECTOR_GEOMETRY), - compute_fat_link_max(false), staggeredPhaseType(param.staggered_phase_type), staggeredPhaseApplied(param.staggered_phase_applied), i_mu(param.i_mu), - site_offset(param.gauge_offset), + site_offset(link_type == QUDA_ASQTAD_MOM_LINKS ? param.mom_offset : param.gauge_offset), site_size(param.site_size) { switch (link_type) { @@ -183,90 +146,217 @@ namespace quda { }; std::ostream& operator<<(std::ostream& output, const GaugeFieldParam& param); + std::ostream &operator<<(std::ostream &output, const GaugeField ¶m); class GaugeField : public LatticeField { + friend std::ostream &operator<<(std::ostream &output, const GaugeField ¶m); + + private: + /** + @brief Create the field as specified by the param + @param[in] Parameter struct + */ + void create(const GaugeFieldParam ¶m); + + /** + @brief Move the contents of a field to this + @param[in,out] other Field we are moving from + */ + void move(GaugeField &&other); + + /** + @brief Fills the param with this field's meta data (used for + creating a cloned field) + @param[in] param The parameter we are filling + */ + void fill(GaugeFieldParam &) const; + protected: - size_t bytes; // bytes allocated per full field - size_t phase_offset; // offset in bytes to gauge phases - useful to keep track of texture alignment - size_t phase_bytes; // bytes needed to store the phases - size_t length; - size_t real_length; - int nColor; - int nFace; - QudaFieldGeometry geometry; // whether the field is a scale, vector or tensor - - QudaReconstructType reconstruct; - int nInternal; // number of degrees of freedom per link matrix - QudaGaugeFieldOrder order; - QudaGaugeFixed fixed; - QudaLinkType link_type; - QudaTboundary t_boundary; - - double anisotropy; - double tadpole; - double fat_link_max; - - QudaFieldCreate create; // used to determine the type of field created - - mutable void *ghost[2 * QUDA_MAX_DIM]; // stores the ghost zone of the gauge field (non-native fields only) - - mutable int ghostFace[QUDA_MAX_DIM]; // the size of each face - - /** - The staggered phase convention to use - */ - QudaStaggeredPhase staggeredPhaseType; - - /** - Whether the staggered phase factor has been applied - */ - bool staggeredPhaseApplied; - - /** - @brief Exchange the buffers across all dimensions in a given direction - @param[out] recv Receive buffer - @param[in] send Send buffer - @param[in] dir Direction in which we are sending (forwards OR backwards only) - */ - void exchange(void **recv, void **send, QudaDirection dir) const; - - /** - Imaginary chemical potential - */ - double i_mu; - - /** - Offset into MILC site struct to the desired matrix field (only if gauge_order=MILC_SITE_GAUGE_ORDER) - */ - size_t site_offset; - - /** - Size of MILC site struct (only if gauge_order=MILC_SITE_GAUGE_ORDER) - */ - size_t site_size; - - /** - Compute the required extended ghost zone sizes and offsets - @param[in] R Radius of the ghost zone - @param[in] no_comms_fill If true we create a full halo - regardless of partitioning - @param[in] bidir Is this a bi-directional exchange - if not - then we alias the fowards and backwards offsetss - */ - void createGhostZone(const lat_dim_t &R, bool no_comms_fill, bool bidir = true) const; - - /** - @brief Set the vol_string and aux_string for use in tuning - */ - void setTuningString(); + bool init = false; + quda_ptr gauge = {}; /** The gauge field allocation */ + array gauge_array = {}; /** Array of pointers to each subset (e.g., QDP or QDPJITorder) */ + size_t bytes = 0; // bytes allocated per full field + size_t phase_offset = 0; // offset in bytes to gauge phases - useful to keep track of texture alignment + size_t phase_bytes = 0; // bytes needed to store the phases + size_t length = 0; + size_t real_length = 0; + int nColor = 0; + int nFace = 0; + QudaFieldGeometry geometry = QUDA_INVALID_GEOMETRY; // whether the field is a scale, vector or tensor + int site_dim = 0; // the dimensionality of each site (number of matrices per lattice site) + + QudaReconstructType reconstruct = QUDA_RECONSTRUCT_INVALID; + int nInternal = 0; // number of degrees of freedom per link matrix + QudaGaugeFieldOrder order = QUDA_INVALID_GAUGE_ORDER; + QudaGaugeFixed fixed = QUDA_GAUGE_FIXED_INVALID; + QudaLinkType link_type = QUDA_INVALID_LINKS; + QudaTboundary t_boundary = QUDA_INVALID_T_BOUNDARY; + + double anisotropy = 0.0; + double tadpole = 0.0; + double fat_link_max = 0.0; + + mutable array ghost + = {}; // stores the ghost zone of the gauge field (non-native fields only) + + mutable array ghostFace = {}; // the size of each face + + /** + The staggered phase convention to use + */ + QudaStaggeredPhase staggeredPhaseType = QUDA_STAGGERED_PHASE_INVALID; + + /** + Whether the staggered phase factor has been applied + */ + bool staggeredPhaseApplied = false; + + /** + Imaginary chemical potential + */ + double i_mu = 0.0; + + /** + Offset into MILC site struct to the desired matrix field (only if gauge_order=MILC_SITE_GAUGE_ORDER) + */ + size_t site_offset = 0; + + /** + Size of MILC site struct (only if gauge_order=MILC_SITE_GAUGE_ORDER) + */ + size_t site_size = 0; + + /** + @brief Exchange the buffers across all dimensions in a given direction + @param[out] recv Receive buffer + @param[in] send Send buffer + @param[in] dir Direction in which we are sending (forwards OR backwards only) + */ + void exchange(void **recv, void **send, QudaDirection dir) const; + + /** + Compute the required extended ghost zone sizes and offsets + @param[in] R Radius of the ghost zone + @param[in] no_comms_fill If true we create a full halo + regardless of partitioning + @param[in] bidir Is this a bi-directional exchange - if not + then we alias the fowards and backwards offsetss + */ + void createGhostZone(const lat_dim_t &R, bool no_comms_fill, bool bidir = true) const; + + /** + @brief Set the vol_string and aux_string for use in tuning + */ + void setTuningString(); + + /** + @brief Initialize the padded region to 0 + */ + void zeroPad(); public: + /** + @brief Default constructor + */ + GaugeField() = default; + + /** + @brief Copy constructor for creating a GaugeField from another GaugeField + @param field Instance of GaugeField from which we are cloning + */ + GaugeField(const GaugeField &field) noexcept; + + /** + @brief Move constructor for creating a GaugeField from another GaugeField + @param field Instance of GaugeField from which we are moving + */ + GaugeField(GaugeField &&field) noexcept; + + /** + @brief Constructor for creating a GaugeField from a GaugeFieldParam + @param param Contains the metadata for creating the field + */ GaugeField(const GaugeFieldParam ¶m); - virtual ~GaugeField(); - virtual void exchangeGhost(QudaLinkDirection = QUDA_LINK_BACKWARDS) = 0; - virtual void injectGhost(QudaLinkDirection = QUDA_LINK_BACKWARDS) = 0; + /** + @brief Copy assignment operator + @param[in] field Instance from which we are copying + @return Reference to this field + */ + GaugeField &operator=(const GaugeField &field); + + /** + @brief Move assignment operator + @param[in] field Instance from which we are moving + @return Reference to this field + */ + GaugeField &operator=(GaugeField &&field); + + /** + @brief Returns if the object is empty (not initialized) + @return true if the object has not been allocated, otherwise false + */ + bool empty() const { return !init; } + + /** + @brief Create the communication handlers and buffers + @param[in] R The thickness of the extended region in each dimension + @param[in] no_comms_fill Do local exchange to fill out the extended + region in non-partitioned dimensions + @param[in] bidir Whether to allocate communication buffers to + allow for simultaneous bi-directional exchange. If false, then + the forwards and backwards buffers will alias (saving memory). + */ + void createComms(const lat_dim_t &R, bool no_comms_fill, bool bidir = true); + + /** + @brief Allocate the ghost buffers + @param[in] R The thickness of the extended region in each dimension + @param[in] no_comms_fill Do local exchange to fill out the extended + @param[in] bidir Is this a bi-directional exchange - if not + then we alias the fowards and backwards offsetss + region in non-partitioned dimensions + */ + void allocateGhostBuffer(const lat_dim_t &R, bool no_comms_fill, bool bidir = true) const; + + /** + @brief Start the receive communicators + @param[in] dim The communication dimension + @param[in] dir The communication direction (0=backwards, 1=forwards) + */ + void recvStart(int dim, int dir); + + /** + @brief Start the sending communicators + @param[in] dim The communication dimension + @param[in] dir The communication direction (0=backwards, 1=forwards) + @param[in] stream_p Pointer to CUDA stream to post the + communication in (if 0, then use null stream) + */ + void sendStart(int dim, int dir, const qudaStream_t &stream_p); + + /** + @brief Wait for communication to complete + @param[in] dim The communication dimension + @param[in] dir The communication direction (0=backwards, 1=forwards) + */ + void commsComplete(int dim, int dir); + + /** + @brief Exchange the ghost and store store in the padded region + @param[in] link_direction Which links are we exchanging: this + flag only applies to bi-directional coarse-link fields + */ + void exchangeGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS); + + /** + @brief The opposite of exchangeGhost: take the ghost zone on x, + send to node x-1, and inject back into the field + @param[in] link_direction Which links are we injecting: this + flag only applies to bi-directional coarse-link fields + */ + void injectGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS); size_t Length() const { return length; } int Ncolor() const { return nColor; } @@ -315,7 +405,7 @@ namespace quda { @param no_comms_fill Do local exchange to fill out the extended region in non-partitioned dimensions */ - virtual void exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill = false) = 0; + void exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill = false); /** @brief This routine will populate the border / halo region @@ -326,7 +416,7 @@ namespace quda { @param no_comms_fill Do local exchange to fill out the extended region in non-partitioned dimensions */ - virtual void exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill = false) = 0; + void exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill = false); void checkField(const LatticeField &) const; @@ -342,22 +432,82 @@ namespace quda { size_t TotalBytes() const { return bytes; } - virtual void* Gauge_p() { errorQuda("Not implemented"); return (void*)0;} - virtual void* Even_p() { errorQuda("Not implemented"); return (void*)0;} - virtual void* Odd_p() { errorQuda("Not implemented"); return (void*)0;} + /** + @brief Helper function that returns true if the gauge order is an array of pointers + @param[in] order The gauge order requested + @return If the order is an array of pointers + */ + constexpr bool is_pointer_array(QudaGaugeFieldOrder order) const + { + switch (order) { + case QUDA_QDP_GAUGE_ORDER: + case QUDA_QDPJIT_GAUGE_ORDER: return true; + default: return false; + } + } - virtual const void* Gauge_p() const { errorQuda("Not implemented"); return (void*)0;} - virtual const void* Even_p() const { errorQuda("Not implemented"); return (void*)0;} - virtual const void* Odd_p() const { errorQuda("Not implemented"); return (void*)0;} + /** + @brief Return base pointer to the gauge field allocation. + @tparam T Optional type to cast the pointer to (default is void*). + @return Base pointer to the gauge field allocation + */ + template + std::enable_if_t && !std::is_pointer_v::type>, T> data() const + { + if (is_pointer_array(order)) errorQuda("Non dim-array ordered field requested but order is %d", order); + return reinterpret_cast(gauge.data()); + } - virtual int full_dim(int d) const { return x[d]; } + /** + @brief Return base pointer to the gauge field allocation + specified by the array index. This is for geometry-array + ordered fields, e.g., QDP or QDPJIT. - const void** Ghost() const { - if ( isNative() ) errorQuda("No ghost zone pointer for quda-native gauge fields"); - return (const void**)ghost; + @tparam T Optional type to cast the pointer to (default is void*) + @param[in] d Dimension index when the allocation is an array type + @return Base pointer to the gauge field allocation + */ + template auto data(unsigned int d) const + { + static_assert(std::is_pointer_v && !std::is_pointer_v::type>, + "data() requires a pointer cast type"); + if (d >= (unsigned)geometry) errorQuda("Invalid array index %d for geometry %d field", d, geometry); + if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order); + return reinterpret_cast(gauge_array[d].data()); } - void** Ghost() { + void *raw_pointer() const + { + if (is_pointer_array(order)) { + static void *data_array[8]; + for (int i = 0; i < site_dim; i++) data_array[i] = gauge_array[i].data(); + return data_array; + } else { + return gauge.data(); + } + } + + /** + @brief Return array of pointers to the per dimension gauge field allocation(s). + @tparam T Optional type to cast the pointer to (default is + void*). this is for geometry-array ordered fields, e.g., QDP + or QDPJIT. + @return Array of pointers to the gauge field allocations + */ + template + std::enable_if_t && !std::is_pointer_v::type>, array> + data_array() const + { + if (!is_pointer_array(order)) errorQuda("Dim-array ordered field requested but order is %d", order); + array u = {}; + for (auto d = 0; d < geometry; d++) u[d] = static_cast(gauge_array[d]); + return u; + } + + virtual int full_dim(int d) const { return x[d]; } + + auto &Ghost() const + { if ( isNative() ) errorQuda("No ghost zone pointer for quda-native gauge fields"); return ghost; } @@ -375,15 +525,15 @@ namespace quda { size_t SiteSize() const { return site_size; } /** - Set all field elements to zero (virtual) + Set all field elements to zero */ - virtual void zero() = 0; + void zero(); /** * Generic gauge field copy * @param[in] src Source from which we are copying */ - virtual void copy(const GaugeField &src) = 0; + void copy(const GaugeField &src); /** @brief Compute the L1 norm of the field @@ -431,175 +581,15 @@ namespace quda { */ static GaugeField* Create(const GaugeFieldParam ¶m); - }; - - class cudaGaugeField : public GaugeField { - - private: - void *gauge; - void *gauge_h; // mapped-memory pointer when allocating on the host - void *even; - void *odd; - - /** - @brief Initialize the padded region to 0 - */ - void zeroPad(); - - public: - cudaGaugeField(const GaugeFieldParam &); - virtual ~cudaGaugeField(); - /** - @brief Exchange the ghost and store store in the padded region - @param[in] link_direction Which links are we exchanging: this - flag only applies to bi-directional coarse-link fields - */ - void exchangeGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS); - - /** - @brief The opposite of exchangeGhost: take the ghost zone on x, - send to node x-1, and inject back into the field - @param[in] link_direction Which links are we injecting: this - flag only applies to bi-directional coarse-link fields - */ - void injectGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS); - - /** - @brief Create the communication handlers and buffers - @param[in] R The thickness of the extended region in each dimension - @param[in] no_comms_fill Do local exchange to fill out the extended - region in non-partitioned dimensions - @param[in] bidir Whether to allocate communication buffers to - allow for simultaneous bi-directional exchange. If false, then - the forwards and backwards buffers will alias (saving memory). - */ - void createComms(const lat_dim_t &R, bool no_comms_fill, bool bidir = true); - - /** - @brief Allocate the ghost buffers - @param[in] R The thickness of the extended region in each dimension - @param[in] no_comms_fill Do local exchange to fill out the extended - @param[in] bidir Is this a bi-directional exchange - if not - then we alias the fowards and backwards offsetss - region in non-partitioned dimensions - */ - void allocateGhostBuffer(const lat_dim_t &R, bool no_comms_fill, bool bidir = true) const; - - /** - @brief Start the receive communicators - @param[in] dim The communication dimension - @param[in] dir The communication direction (0=backwards, 1=forwards) - */ - void recvStart(int dim, int dir); - - /** - @brief Start the sending communicators - @param[in] dim The communication dimension - @param[in] dir The communication direction (0=backwards, 1=forwards) - @param[in] stream_p Pointer to CUDA stream to post the - communication in (if 0, then use null stream) - */ - void sendStart(int dim, int dir, const qudaStream_t &stream_p); - - /** - @brief Wait for communication to complete - @param[in] dim The communication dimension - @param[in] dir The communication direction (0=backwards, 1=forwards) - */ - void commsComplete(int dim, int dir); - - /** - @brief This does routine will populate the border / halo region of a - gauge field that has been created using copyExtendedGauge. - @param R The thickness of the extended region in each dimension - @param no_comms_fill Do local exchange to fill out the extended - region in non-partitioned dimensions - */ - void exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill = false); - - /** - @brief This does routine will populate the border / halo region - of a gauge field that has been created using copyExtendedGauge. - Overloaded variant that will start and stop a comms profile. - @param R The thickness of the extended region in each dimension - @param profile TimeProfile intance which will record the time taken - @param no_comms_fill Do local exchange to fill out the extended - region in non-partitioned dimensions - */ - void exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill = false); - - /** - * Generic gauge field copy - * @param[in] src Source from which we are copying - */ - void copy(const GaugeField &src); - - /** - @brief Download into this field from a CPU field - @param[in] cpu The CPU field source - */ - void loadCPUField(const cpuGaugeField &cpu); - - /** - @brief Download into this field from a CPU field. Overloaded - variant that includes profiling - @param[in] cpu The CPU field source - @param[in] profile Time profile to record the transfer - */ - void loadCPUField(const cpuGaugeField &cpu, TimeProfile &profile); - - /** - @brief Upload from this field into a CPU field - @param[out] cpu The CPU field source - */ - void saveCPUField(cpuGaugeField &cpu) const; - - /** - @brief Upload from this field into a CPU field. Overloaded - variant that includes profiling. - @param[out] cpu The CPU field source - @param[in] profile Time profile to record the transfer - */ - void saveCPUField(cpuGaugeField &cpu, TimeProfile &profile) const; - - // (ab)use with care - void* Gauge_p() { return gauge; } - void* Even_p() { return even; } - void* Odd_p() { return odd; } - - const void* Gauge_p() const { return gauge; } - const void* Even_p() const { return even; } - const void *Odd_p() const { return odd; } - - /** - @brief Copy all contents of the field to a host buffer. - @param[in] the host buffer to copy to. + @brief Create a field that aliases this field's storage. The + alias field can use a different precision than this field, + though it cannot be greater. This functionality is useful for + the case where we have multiple temporaries in different + precisions, but do not need them simultaneously. Use this functionality with caution. + @param[in] param Parameters for the alias field */ - virtual void copy_to_buffer(void *buffer) const; - - /** - @brief Copy all contents of the field from a host buffer to this field. - @param[in] the host buffer to copy from. - */ - virtual void copy_from_buffer(void *buffer); - - void setGauge(void* _gauge); //only allowed when create== QUDA_REFERENCE_FIELD_CREATE - - /** - Set all field elements to zero - */ - void zero(); - - /** - @brief Backs up the cudaGaugeField to CPU memory - */ - void backup() const; - - /** - @brief Restores the cudaGaugeField to CUDA memory - */ - void restore() const; + GaugeField create_alias(const GaugeFieldParam ¶m = GaugeFieldParam()); /** @brief If managed memory and prefetch is enabled, prefetch @@ -608,101 +598,47 @@ namespace quda { @param[in] stream Which stream to run the prefetch in (default 0) */ void prefetch(QudaFieldLocation mem_space, qudaStream_t stream = device::get_default_stream()) const; - }; - - class cpuGaugeField : public GaugeField { - - friend void cudaGaugeField::copy(const GaugeField &cpu); - friend void cudaGaugeField::loadCPUField(const cpuGaugeField &cpu); - friend void cudaGaugeField::saveCPUField(cpuGaugeField &cpu) const; - - private: - void **gauge; // the actual gauge field - - public: - /** - @brief Constructor for cpuGaugeField from a GaugeFieldParam - @param[in,out] param Parameter struct - note that in the case - that we are wrapping host-side extended fields, this param is - modified for subsequent creation of fields that are not - extended. - */ - cpuGaugeField(const GaugeFieldParam ¶m); - virtual ~cpuGaugeField(); - - /** - @brief Exchange the ghost and store store in the padded region - @param[in] link_direction Which links are we extracting: this - flag only applies to bi-directional coarse-link fields - */ - void exchangeGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS); - - /** - @brief The opposite of exchangeGhost: take the ghost zone on x, - send to node x-1, and inject back into the field - @param[in] link_direction Which links are we injecting: this - flag only applies to bi-directional coarse-link fields - */ - void injectGhost(QudaLinkDirection link_direction = QUDA_LINK_BACKWARDS); /** - @brief This does routine will populate the border / halo region of a - gauge field that has been created using copyExtendedGauge. - - @param R The thickness of the extended region in each dimension - @param no_comms_fill Do local exchange to fill out the extended - region in non-partitioned dimenions + @brief Backs up the GaugeField */ - void exchangeExtendedGhost(const lat_dim_t &R, bool no_comms_fill = false); + void backup() const; /** - @brief This does routine will populate the border / halo region - of a gauge field that has been created using copyExtendedGauge. - Overloaded variant that will start and stop a comms profile. - @param R The thickness of the extended region in each dimension - @param profile TimeProfile intance which will record the time taken - @param no_comms_fill Do local exchange to fill out the extended - region in non-partitioned dimensions + @brief Restores the GaugeField */ - void exchangeExtendedGhost(const lat_dim_t &R, TimeProfile &profile, bool no_comms_fill = false); - - /** - * Generic gauge field copy - * @param[in] src Source from which we are copying - */ - void copy(const GaugeField &src); - - void* Gauge_p() { return gauge; } - const void* Gauge_p() const { return gauge; } + void restore() const; /** @brief Copy all contents of the field to a host buffer. @param[in] the host buffer to copy to. */ - virtual void copy_to_buffer(void *buffer) const; + void copy_to_buffer(void *buffer) const; /** @brief Copy all contents of the field from a host buffer to this field. @param[in] the host buffer to copy from. */ - virtual void copy_from_buffer(void *buffer); - - void setGauge(void** _gauge); //only allowed when create== QUDA_REFERENCE_FIELD_CREATE + void copy_from_buffer(void *buffer); /** - Set all field elements to zero - */ - void zero(); + @brief Check if two instances are compatible + @param[in] a Input field + @param[in] b Input field + @return Return true if two fields are compatible + */ + static bool are_compatible(const GaugeField &a, const GaugeField &b); /** - @brief Backs up the cpuGaugeField - */ - void backup() const; + @brief Check if two instances are weakly compatible (precision + and order can differ) + @param[in] a Input field + @param[in] b Input field + @return Return true if two fields are compatible + */ + static bool are_compatible_weak(const GaugeField &a, const GaugeField &b); - /** - @brief Restores the cpuGaugeField - */ - void restore() const; + friend struct GaugeFieldParam; }; /** @@ -775,8 +711,8 @@ namespace quda { @param recon The reconsturction type @return the pointer to the extended gauge field */ - cudaGaugeField *createExtendedGauge(cudaGaugeField &in, const lat_dim_t &R, TimeProfile &profile, - bool redundant_comms = false, QudaReconstructType recon = QUDA_RECONSTRUCT_INVALID); + GaugeField *createExtendedGauge(GaugeField &in, const lat_dim_t &R, TimeProfile &profile, + bool redundant_comms = false, QudaReconstructType recon = QUDA_RECONSTRUCT_INVALID); /** This function is used for creating an exteneded (cpu) gauge field from the input, @@ -785,7 +721,7 @@ namespace quda { @param R By how many do we want to extend the gauge field in each direction @return the pointer to the extended gauge field */ - cpuGaugeField *createExtendedGauge(void **gauge, QudaGaugeParam &gauge_param, const lat_dim_t &R); + GaugeField *createExtendedGauge(void **gauge, QudaGaugeParam &gauge_param, const lat_dim_t &R); /** This function is used for extracting the gauge ghost zone from a diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 899f187ad7..a2c0300a1d 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -375,10 +375,9 @@ namespace quda { scale(static_cast(1.0)), scale_inv(static_cast(1.0)) { - for (int d=0; d**>(gauge_)[d] : - static_cast**>(const_cast(U.Gauge_p()))[d]; - resetScale(U.Scale()); + for (int d = 0; d < U.Geometry(); d++) + u[d] = gauge_ ? static_cast **>(gauge_)[d] : U.data *>(d); + resetScale(U.Scale()); } void resetScale(Float max) @@ -437,27 +436,30 @@ namespace quda { template struct GhostAccessor { using wrapper = fieldorder_wrapper; - complex *ghost[8]; - unsigned int ghostOffset[8]; - Float scale; - Float scale_inv; + complex *ghost[8] = {}; + unsigned int ghostOffset[8] = {}; + Float scale = static_cast(1.0); + Float scale_inv = static_cast(1.0); static constexpr bool fixed = fixed_point(); - GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) : - scale(static_cast(1.0)), scale_inv(static_cast(1.0)) + GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) { for (int d=0; d<4; d++) { - ghost[d] = ghost_ ? static_cast*>(ghost_[d]) : - static_cast*>(const_cast(U.Ghost()[d])); - ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - - ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : - ghost_ ? static_cast*>(ghost_[d+4]) : - static_cast*>(const_cast(U.Ghost()[d+4])); - ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - } + ghost[d] = ghost_ ? static_cast *>(ghost_[d]) : + U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ? + static_cast *>(const_cast(U.Ghost()[d].data())) : + nullptr; + ghostOffset[d] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor(); + + ghost[d + 4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : + ghost_ ? static_cast *>(ghost_[d + 4]) : + U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ? + static_cast *>(const_cast(U.Ghost()[d + 4].data())) : + nullptr; + ghostOffset[d + 4] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor(); + } - resetScale(U.Scale()); + resetScale(U.Scale()); } void resetScale(Float max) @@ -486,8 +488,7 @@ namespace quda { static constexpr bool fixed = fixed_point(); Accessor(const GaugeField &U, void *gauge_ = nullptr, void ** = nullptr) : - u(gauge_ ? static_cast *>(gauge_) : - static_cast *>(const_cast(U.Gauge_p()))), + u(gauge_ ? static_cast *>(gauge_) : U.data *>()), volumeCB(U.VolumeCB()), geometry(U.Geometry()), scale(static_cast(1.0)), @@ -559,27 +560,30 @@ namespace quda { template struct GhostAccessor { using wrapper = fieldorder_wrapper; - complex *ghost[8]; - unsigned int ghostOffset[8]; - Float scale; - Float scale_inv; + complex *ghost[8] = {}; + unsigned int ghostOffset[8] = {}; + Float scale = static_cast(1.0); + Float scale_inv = static_cast(1.0); static constexpr bool fixed = fixed_point(); - GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) : - scale(static_cast(1.0)), scale_inv(static_cast(1.0)) + GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) { for (int d=0; d<4; d++) { - ghost[d] = ghost_ ? static_cast*>(ghost_[d]) : - static_cast*>(const_cast(U.Ghost()[d])); - ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - - ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : - ghost_ ? static_cast*>(ghost_[d+4]) : - static_cast*>(const_cast(U.Ghost()[d+4])); - ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - } + ghost[d] = ghost_ ? static_cast *>(ghost_[d]) : + U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ? + static_cast *>(const_cast(U.Ghost()[d].data())) : + nullptr; + ghostOffset[d] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor(); + + ghost[d + 4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : + ghost_ ? static_cast *>(ghost_[d + 4]) : + U.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ? + static_cast *>(const_cast(U.Ghost()[d + 4].data())) : + nullptr; + ghostOffset[d + 4] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor(); + } - resetScale(U.Scale()); + resetScale(U.Scale()); } void resetScale(Float max) @@ -624,8 +628,7 @@ namespace quda { static constexpr bool fixed = fixed_point(); Accessor(const GaugeField &U, void *gauge_ = nullptr, void ** = nullptr) : - u(gauge_ ? static_cast *>(gauge_) : - static_cast *>(const_cast(U.Gauge_p()))), + u(gauge_ ? static_cast *>(gauge_) : U.data *>()), offset_cb((U.Bytes() >> 1) / sizeof(complex)), volumeCB(U.VolumeCB()), stride(U.Stride()), @@ -691,26 +694,26 @@ namespace quda { template struct GhostAccessor { using wrapper = fieldorder_wrapper; - complex *ghost[8]; - const int volumeCB; - unsigned int ghostVolumeCB[8]; - Float scale; - Float scale_inv; + complex *ghost[8] = {}; + const unsigned int volumeCB; + unsigned int ghostVolumeCB[8] = {}; + Float scale = static_cast(1.0); + Float scale_inv = static_cast(1.0); static constexpr bool fixed = fixed_point(); Accessor accessor; GhostAccessor(const GaugeField &U, void *gauge_, void **ghost_ = 0) : volumeCB(U.VolumeCB()), - scale(static_cast(1.0)), - scale_inv(static_cast(1.0)), accessor(U, gauge_, ghost_) { if constexpr (!native_ghost) assert(ghost_ != nullptr); for (int d = 0; d < 4; d++) { ghost[d] = !native_ghost ? static_cast*>(ghost_[d]) : nullptr; - ghostVolumeCB[d] = U.Nface()*U.SurfaceCB(d); - ghost[d+4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY? static_cast*>(ghost_[d+4]) : nullptr; - ghostVolumeCB[d+4] = U.Nface()*U.SurfaceCB(d); + ghostVolumeCB[d] = U.Nface() * U.SurfaceCB(d); + ghost[d + 4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY ? + static_cast *>(ghost_[d + 4]) : + nullptr; + ghostVolumeCB[d + 4] = U.Nface() * U.SurfaceCB(d); } resetScale(U.Scale()); } @@ -755,7 +758,7 @@ namespace quda { using wrapper = fieldorder_wrapper; /** An internal reference to the actual field we are accessing */ - const int volumeCB; + const unsigned int volumeCB; const int nDim; const int_fastdiv geometry; const QudaFieldLocation location; @@ -874,13 +877,13 @@ namespace quda { __device__ __host__ inline int Ncolor() const { return nColor; } /** Returns the field volume */ - __device__ __host__ inline int Volume() const { return 2*volumeCB; } + __device__ __host__ inline auto Volume() const { return 2 * volumeCB; } - /** Returns the field volume */ - __device__ __host__ inline int VolumeCB() const { return volumeCB; } + /** Returns the field volume */ + __device__ __host__ inline auto VolumeCB() const { return volumeCB; } - /** Returns the field geometric dimension */ - __device__ __host__ inline int Ndim() const { return nDim; } + /** Returns the field geometric dimension */ + __device__ __host__ inline int Ndim() const { return nDim; } /** Returns the field geometry */ __device__ __host__ inline int Geometry() const { return geometry; } @@ -1501,7 +1504,7 @@ namespace quda { { switch (phase) { case QUDA_STAGGERED_PHASE_MILC: - case QUDA_STAGGERED_PHASE_CPS: + case QUDA_STAGGERED_PHASE_CHROMA: case QUDA_STAGGERED_PHASE_TIFR: return true; default: return false; } @@ -1530,7 +1533,7 @@ namespace quda { int coords[QUDA_MAX_DIM]; int_fastdiv X[QUDA_MAX_DIM]; int R[QUDA_MAX_DIM]; - const int volumeCB; + const unsigned int volumeCB; int faceVolumeCB[4]; const int stride; const int geometry; @@ -1539,7 +1542,7 @@ namespace quda { FloatNOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : reconstruct(u), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), + gauge(gauge_ ? gauge_ : u.data()), offset(u.Bytes() / (2 * sizeof(Float) * N)), ghostExchange(u.GhostExchange()), volumeCB(u.VolumeCB()), @@ -1767,7 +1770,7 @@ namespace quda { /** @brief The LegacyOrder defines the ghost zone storage and ordering for - all cpuGaugeFields, which use the same ghost zone storage. + all non-native fields, which use the same ghost zone storage. */ template struct LegacyOrder { static constexpr int length = length_; @@ -1775,9 +1778,9 @@ namespace quda { using store_t = Float; using real = typename mapper::type; using complex = complex; - Float *ghost[QUDA_MAX_DIM]; - int faceVolumeCB[QUDA_MAX_DIM]; - const int volumeCB; + Float *ghost[QUDA_MAX_DIM] = {}; + int faceVolumeCB[QUDA_MAX_DIM] = {}; + const unsigned int volumeCB; const int stride; const int geometry; const int hasPhase; @@ -1792,7 +1795,9 @@ namespace quda { errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone"); for (int i = 0; i < 4; i++) { - ghost[i] = (ghost_) ? ghost_[i] : (Float *)(u.Ghost()[i]); + ghost[i] = (ghost_) ? ghost_[i] : + u.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD ? (Float *)(u.Ghost()[i].data()) : + nullptr; faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth } } @@ -1849,10 +1854,12 @@ namespace quda { using real = typename mapper::type; using complex = complex; Float *gauge[QUDA_MAX_DIM]; - const int volumeCB; - QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) - : LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) - { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; } + const unsigned int volumeCB; + QDPOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : + LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) + { + for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data(i); + } __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const { @@ -1893,10 +1900,12 @@ namespace quda { using real = typename mapper::type; using complex = complex; Float *gauge[QUDA_MAX_DIM]; - const int volumeCB; - QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) - : LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) - { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; } + const unsigned int volumeCB; + QDPJITOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : + LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) + { + for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : u.data(i); + } __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const { @@ -1941,16 +1950,21 @@ namespace quda { using real = typename mapper::type; using complex = complex; Float *gauge; - const int volumeCB; + const unsigned int volumeCB; const int geometry; - MILCOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) : - LegacyOrder(u, ghost_), gauge(gauge_ ? gauge_ : (Float*)u.Gauge_p()), - volumeCB(u.VolumeCB()), geometry(u.Geometry()) { ; } + MILCOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : + LegacyOrder(u, ghost_), + gauge(gauge_ ? gauge_ : u.data()), + volumeCB(u.VolumeCB()), + geometry(u.Geometry()) + { + ; + } - __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const - { - auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; - block_load(v, reinterpret_cast(in)); + __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const + { + auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; + block_load(v, reinterpret_cast(in)); } __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const @@ -1997,13 +2011,13 @@ namespace quda { using real = typename mapper::type; using complex = complex; Float *gauge; - const int volumeCB; + const unsigned int volumeCB; const int geometry; const size_t offset; const size_t size; MILCSiteOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : LegacyOrder(u, ghost_), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), + gauge(gauge_ ? gauge_ : u.data()), volumeCB(u.VolumeCB()), geometry(u.Geometry()), offset(u.SiteOffset()), @@ -2056,14 +2070,14 @@ namespace quda { using real = typename mapper::type; using complex = complex; Float *gauge; - const int volumeCB; + const unsigned int volumeCB; const real anisotropy; const real anisotropy_inv; static constexpr int Nc = 3; const int geometry; CPSOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : LegacyOrder(u, ghost_), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), + gauge(gauge_ ? gauge_ : u.data()), volumeCB(u.VolumeCB()), anisotropy(u.Anisotropy()), anisotropy_inv(1.0 / anisotropy), @@ -2125,13 +2139,11 @@ namespace quda { using real = typename mapper::type; using complex = complex; Float *gauge; - const int volumeCB; - int exVolumeCB; // extended checkerboard volume + const unsigned int volumeCB; + unsigned int exVolumeCB; // extended checkerboard volume static constexpr int Nc = 3; BQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : - LegacyOrder(u, ghost_), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), - volumeCB(u.VolumeCB()) + LegacyOrder(u, ghost_), gauge(gauge_ ? gauge_ : u.data()), volumeCB(u.VolumeCB()) { if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); // compute volumeCB + halo region @@ -2189,13 +2201,13 @@ namespace quda { using real = typename mapper::type; using complex = complex; Float *gauge; - const int volumeCB; + const unsigned int volumeCB; static constexpr int Nc = 3; const real scale; const real scale_inv; TIFROrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : LegacyOrder(u, ghost_), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), + gauge(gauge_ ? gauge_ : u.data()), volumeCB(u.VolumeCB()), scale(u.Scale()), scale_inv(1.0 / scale) @@ -2253,7 +2265,7 @@ namespace quda { using real = typename mapper::type; using complex = complex; Float *gauge; - const int volumeCB; + const unsigned int volumeCB; int exVolumeCB; static constexpr int Nc = 3; const real scale; @@ -2262,7 +2274,7 @@ namespace quda { const int exDim[4]; TIFRPaddedOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : LegacyOrder(u, ghost_), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), + gauge(gauge_ ? gauge_ : u.data()), volumeCB(u.VolumeCB()), exVolumeCB(1), scale(u.Scale()), diff --git a/include/gauge_tools.h b/include/gauge_tools.h index 503c20bc9f..9b7d68db37 100644 --- a/include/gauge_tools.h +++ b/include/gauge_tools.h @@ -9,9 +9,8 @@ namespace quda * @param[in] Gauge field upon which we are measuring. * @param[in,out] param Parameter struct that defines which * observables we are making and the resulting observables. - * @param[in] profile TimeProfile instance used for profiling. */ - void gaugeObservables(GaugeField &u, QudaGaugeObservableParam ¶m, TimeProfile &profile); + void gaugeObservables(GaugeField &u, QudaGaugeObservableParam ¶m); /** * @brief Project the input gauge field onto the SU(3) group. This diff --git a/include/invert_quda.h b/include/invert_quda.h index 0a2ca1b335..11ac64708e 100644 --- a/include/invert_quda.h +++ b/include/invert_quda.h @@ -225,12 +225,6 @@ namespace quda { /** The type of accelerator type to use for preconditioner */ QudaAcceleratorType accelerator_type_precondition; - /**< The time taken by the solver */ - double secs; - - /**< The Gflops rate of the solver */ - double gflops; - // Incremental EigCG solver parameters /**< The precision of the Ritz vectors */ QudaPrecision precision_ritz;//also search space precision @@ -333,8 +327,6 @@ namespace quda { ca_lambda_max_precondition(param.ca_lambda_max_precondition), schwarz_type(param.schwarz_type), accelerator_type_precondition(param.accelerator_type_precondition), - secs(param.secs), - gflops(param.gflops), precision_ritz(param.cuda_prec_ritz), n_ev(param.n_ev), m(param.max_search_dim), @@ -422,8 +414,6 @@ namespace quda { ca_lambda_max_precondition(param.ca_lambda_max_precondition), schwarz_type(param.schwarz_type), accelerator_type_precondition(param.accelerator_type_precondition), - secs(param.secs), - gflops(param.gflops), precision_ritz(param.precision_ritz), n_ev(param.n_ev), m(param.m), @@ -466,9 +456,6 @@ namespace quda { param.true_res = true_res; param.true_res_hq = true_res_hq; param.iter += iter; - comm_allreduce_sum(gflops); - param.gflops += gflops; - param.secs += secs; if (offset >= 0) { param.true_res_offset[offset] = true_res_offset[offset]; param.iter_res_offset[offset] = iter_res_offset[offset]; @@ -786,12 +773,6 @@ namespace quda { static void computeCAKrylovSpace(const DiracMatrix &diracm, std::vector &Ap, std::vector &p, int n_krylov, QudaCABasis basis, double m_map, double b_map, Args &&...args); - - /** - * @brief Return flops - * @return flops expended by this operator - */ - virtual double flops() const { return 0; } }; /** @@ -1641,8 +1622,6 @@ namespace quda { bool apply_mat; //! Whether to compute q = Ap or assume it is provided bool hermitian; //! Whether A is hermitian or not - TimeProfile &profile; - /** @brief Solve the equation A p_k psi_k = q_k psi_k = b by minimizing the residual and using Eigen's SVD algorithm for numerical stability @@ -1661,7 +1640,7 @@ namespace quda { @param apply_mat Whether to apply the operator in place or assume q already contains this @profile Timing profile to use */ - MinResExt(const DiracMatrix &mat, bool orthogonal, bool apply_mat, bool hermitian, TimeProfile &profile = dummy); + MinResExt(const DiracMatrix &mat, bool orthogonal, bool apply_mat, bool hermitian); /** @param x The optimum for the solution vector. diff --git a/include/kernels/covDev.cuh b/include/kernels/covDev.cuh index b86e989bf7..28c52e9b38 100644 --- a/include/kernels/covDev.cuh +++ b/include/kernels/covDev.cuh @@ -37,19 +37,13 @@ namespace quda CovDevArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int mu, int parity, bool dagger, const int *comm_override) : - DslashArg(in, U, parity, dagger, false, 1, spin_project, comm_override), + DslashArg(out, in, U, in, parity, dagger, false, 1, spin_project, comm_override), out(out), in(in), in_pack(in), U(U), mu(mu) { - if (in.V() == out.V()) errorQuda("Aliasing pointers"); - checkOrder(out, in); // check all orders match - checkPrecision(out, in, U); // check all precisions match - checkLocation(out, in, U); // check all locations match - if (!out.isNative() || !in.isNative() || !U.isNative()) - errorQuda("Unsupported field order colorspinor(in)=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder()); } }; diff --git a/include/kernels/dslash_gamma_helper.cuh b/include/kernels/dslash_gamma_helper.cuh index 3b5e27492a..5261ea5b32 100644 --- a/include/kernels/dslash_gamma_helper.cuh +++ b/include/kernels/dslash_gamma_helper.cuh @@ -78,11 +78,11 @@ namespace quda { { ColorSpinor in = arg.in(x_cb, parity); switch(arg.d) { - case 0: arg.out(x_cb, parity) = in.gamma(0); - case 1: arg.out(x_cb, parity) = in.gamma(1); - case 2: arg.out(x_cb, parity) = in.gamma(2); - case 3: arg.out(x_cb, parity) = in.gamma(3); - case 4: arg.out(x_cb, parity) = in.gamma(4); + case 0: arg.out(x_cb, parity) = in.gamma(0); break; + case 1: arg.out(x_cb, parity) = in.gamma(1); break; + case 2: arg.out(x_cb, parity) = in.gamma(2); break; + case 3: arg.out(x_cb, parity) = in.gamma(3); break; + case 4: arg.out(x_cb, parity) = in.gamma(4); break; } } }; diff --git a/include/kernels/dslash_staggered.cuh b/include/kernels/dslash_staggered.cuh index 8f772165bf..deb38455f8 100644 --- a/include/kernels/dslash_staggered.cuh +++ b/include/kernels/dslash_staggered.cuh @@ -51,7 +51,7 @@ namespace quda StaggeredArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, const GaugeField &L, double a, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override) : - DslashArg(in, U, parity, dagger, a == 0.0 ? false : true, improved_ ? 3 : 1, spin_project, + DslashArg(out, in, U, x, parity, dagger, a == 0.0 ? false : true, improved_ ? 3 : 1, spin_project, comm_override), out(out), in(in, improved_ ? 3 : 1), @@ -65,12 +65,6 @@ namespace quda is_last_time_slice(comm_coord(3) == comm_dim(3) - 1 ? true : false), dagger_scale(dagger ? static_cast(-1.0) : static_cast(1.0)) { - if (in.V() == out.V()) errorQuda("Aliasing pointers"); - checkOrder(out, in, x); // check all orders match - checkPrecision(out, in, x, U); // check all precisions match - checkLocation(out, in, x, U); // check all locations match - if (!in.isNative() || !U.isNative()) - errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder()); } }; diff --git a/include/kernels/dslash_wilson.cuh b/include/kernels/dslash_wilson.cuh index cd7575974a..f87e8f9865 100644 --- a/include/kernels/dslash_wilson.cuh +++ b/include/kernels/dslash_wilson.cuh @@ -38,7 +38,7 @@ namespace quda WilsonArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, double a, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override) : - DslashArg(in, U, parity, dagger, a != 0.0 ? true : false, 1, spin_project, comm_override), + DslashArg(out, in, U, x, parity, dagger, a != 0.0 ? true : false, 1, spin_project, comm_override), out(out), in(in), in_pack(in), @@ -46,12 +46,6 @@ namespace quda U(U), a(a) { - if (in.V() == out.V()) errorQuda("Aliasing pointers"); - checkOrder(out, in, x); // check all orders match - checkPrecision(out, in, x, U); // check all precisions match - checkLocation(out, in, x, U); // check all locations match - if (!in.isNative() || !U.isNative()) - errorQuda("Unsupported field order colorspinor=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder()); } }; diff --git a/include/kernels/gauge_phase.cuh b/include/kernels/gauge_phase.cuh index ef57369cb8..73def9ab49 100644 --- a/include/kernels/gauge_phase.cuh +++ b/include/kernels/gauge_phase.cuh @@ -63,9 +63,10 @@ namespace quda { } else if (dim == 3) { // also apply boundary condition phase = (t == arg.X[3]-1) ? arg.tBoundary : 1.0; } - } else if (Arg::phase == QUDA_STAGGERED_PHASE_CPS) { + } else if (Arg::phase == QUDA_STAGGERED_PHASE_CHROMA) { + // Chroma follows CPS convention, but uses -Dslash instead of Dslash compared to QUDA if (dim==0) { - phase = 1.0; + phase = -1.0; } else if (dim == 1) { phase = (1.0 - 2.0 * ((1 + x) % 2) ); } else if (dim == 2) { diff --git a/include/kernels/laplace.cuh b/include/kernels/laplace.cuh index ac09ddc5ed..a029242210 100644 --- a/include/kernels/laplace.cuh +++ b/include/kernels/laplace.cuh @@ -40,8 +40,7 @@ namespace quda LaplaceArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int dir, double a, double b, const ColorSpinorField &x, int parity, bool dagger, const int *comm_override) : - - DslashArg(in, U, parity, dagger, a != 0.0 ? true : false, 1, false, comm_override), + DslashArg(out, in, U, x, parity, dagger, a != 0.0 ? true : false, 1, false, comm_override), out(out), in(in), in_pack(in), @@ -51,12 +50,6 @@ namespace quda b(b), dir(dir) { - if (in.V() == out.V()) errorQuda("Aliasing pointers"); - checkOrder(out, in, x); // check all orders match - checkPrecision(out, in, x, U); // check all precisions match - checkLocation(out, in, x, U); // check all locations match - if (!in.isNative() || !U.isNative()) - errorQuda("Unsupported field order colorspinor(in)=%d gauge=%d combination\n", in.FieldOrder(), U.FieldOrder()); if (dir < 3 || dir > 4) errorQuda("Unsupported laplace direction %d (must be 3 or 4)", dir); } }; diff --git a/include/kernels/spinor_dilute.cuh b/include/kernels/spinor_dilute.cuh index 538610ff44..956b559092 100644 --- a/include/kernels/spinor_dilute.cuh +++ b/include/kernels/spinor_dilute.cuh @@ -18,7 +18,7 @@ namespace quda { case QUDA_DILUTION_COLOR: return nColor; case QUDA_DILUTION_SPIN_COLOR: return nSpin * nColor; case QUDA_DILUTION_SPIN_COLOR_EVEN_ODD: return nSpin * nColor * 2; - default: return 1; + default: return 128; } } @@ -28,10 +28,15 @@ namespace quda { static constexpr int nSpin = nSpin_; static constexpr int nColor = nColor_; static constexpr QudaDilutionType type = type_; - static constexpr int dilution_size = get_size(type); + static constexpr int max_dilution_size = get_size(type); using V = typename colorspinor_mapper::type; - V v[dilution_size]; + int dilution_size; + V v[max_dilution_size]; V src; + int nParity; + lat_dim_t dims = {}; + lat_dim_t dilution_block_dims = {}; + lat_dim_t dilution_block_grid = {}; /** @brief Constructor for the dilution arg @@ -39,14 +44,36 @@ namespace quda { @param src The source vector we are diluting */ template - SpinorDiluteArg(std::vector &v, const ColorSpinorField &src, std::index_sequence) : + SpinorDiluteArg(std::vector &v, const ColorSpinorField &src, const lat_dim_t &dilution_block_dims, + std::index_sequence) : kernel_param(dim3(src.VolumeCB(), src.SiteSubset(), 1)), - v{v[S]...}, - src(src) + dilution_size(v.size()), + src(src), + nParity(src.SiteSubset()), + dims(static_cast(src).X()), + dilution_block_dims(dilution_block_dims) { + for (auto i = 0u; i < v.size(); i++) this->v[i] = V(v[i]); + if (nParity == 1) { // dimensions need to be full-field + this->dims[0] *= 2; + this->dilution_block_dims[0] *= 2; + } + for (auto i = 0; i < src.Ndim() && type == QUDA_DILUTION_BLOCK; i++) + dilution_block_grid[i] = (dims[i] * comms_dim[i]) / this->dilution_block_dims[i]; } }; + template + __device__ __host__ void getCoordsGlobal(coord_t &coords, int x_cb, int parity, const Arg &arg) + { + getCoords(coords, x_cb, arg.dims, parity); + + // first 4 dimensions are potentially distributed so include global offsets + for (int i = 0; i < 4; i++) { + coords[i] += arg.comms_coord[i] * arg.dims[i]; // global coordinate + } + } + /** Functor for diluting the src vector */ @@ -69,8 +96,8 @@ namespace quda { case QUDA_DILUTION_COLOR: return c == i; case QUDA_DILUTION_SPIN_COLOR: return (s * Arg::nColor + c) == i; case QUDA_DILUTION_SPIN_COLOR_EVEN_ODD: return ((parity * Arg::nSpin + s) * Arg::nColor + c) == i; + default: return 0; } - return 0; } __device__ __host__ void operator()(int x_cb, int parity) @@ -78,16 +105,30 @@ namespace quda { using vector = ColorSpinor; vector src = arg.src(x_cb, parity); - for (int i = 0; i < Arg::dilution_size; i++) { - vector v; + if (Arg::type == QUDA_DILUTION_BLOCK) { + lat_dim_t coords; + getCoordsGlobal(coords, x_cb, parity, arg); + + lat_dim_t block_coords; + for (int i = 0; i < coords.size(); i++) block_coords[i] = coords[i] / arg.dilution_block_dims[i]; + int block_idx = ((block_coords[3] * arg.dilution_block_grid[2] + block_coords[2]) * arg.dilution_block_grid[1] + + block_coords[1]) + * arg.dilution_block_grid[0] + + block_coords[0]; + + for (int i = 0; i < arg.dilution_size; i++) { arg.v[i](x_cb, parity) = i == block_idx ? src : vector(); } + } else { + for (int i = 0; i < Arg::max_dilution_size; i++) { // for these types max = actual size + vector v; - for (int s = 0; s < Arg::nSpin; s++) { - for (int c = 0; c < Arg::nColor; c++) { - v(s, c) = write_source(i, s, c, parity) ? src(s, c) : complex(0.0, 0.0); + for (int s = 0; s < Arg::nSpin; s++) { + for (int c = 0; c < Arg::nColor; c++) { + v(s, c) = write_source(i, s, c, parity) ? src(s, c) : complex(0.0, 0.0); + } } - } - arg.v[i](x_cb, parity) = v; + arg.v[i](x_cb, parity) = v; + } } } diff --git a/include/kernels/staggered_kd_apply_xinv_kernel.cuh b/include/kernels/staggered_kd_apply_xinv_kernel.cuh index bbe8b70166..f5b137486f 100644 --- a/include/kernels/staggered_kd_apply_xinv_kernel.cuh +++ b/include/kernels/staggered_kd_apply_xinv_kernel.cuh @@ -39,7 +39,7 @@ namespace quda { X0h(out.X()[0]/2), volumeCB(in.VolumeCB()) { - if (in.V() == out.V()) errorQuda("Aliasing pointers"); + if (in.data() == out.data()) errorQuda("Aliasing pointers"); checkOrder(out, in); // check all orders match checkPrecision(out, in, xInv); // check all precisions match checkLocation(out, in, xInv); diff --git a/include/kernels/staggered_quark_smearing.cuh b/include/kernels/staggered_quark_smearing.cuh index 2fdb42f17a..9f4db096e8 100644 --- a/include/kernels/staggered_quark_smearing.cuh +++ b/include/kernels/staggered_quark_smearing.cuh @@ -45,8 +45,7 @@ namespace quda StaggeredQSmearArg(ColorSpinorField &out, const ColorSpinorField &in, const GaugeField &U, int t0, bool is_t0_kernel, int parity, int dir, bool dagger, const int *comm_override) : - - DslashArg(in, U, parity, dagger, false, 3, false, comm_override), + DslashArg(out, in, U, in, parity, dagger, false, 3, false, comm_override), out(out, 3), in(in, 3), in_pack(in, 3), @@ -56,12 +55,6 @@ namespace quda is_t0_kernel(is_t0_kernel), t0_offset(is_t0_kernel ? in.VolumeCB() / in.X(3) : 0) { - if (in.V() == out.V()) errorQuda("Aliasing pointers"); - checkOrder(out, in); // check all orders match - checkPrecision(out, in, U); // check all precisions match - checkLocation(out, in, U); // check all locations match - if (!in.isNative() || !U.isNative()) - errorQuda("Unsupported field order colorspinor(in)=%d gauge=%d combination", in.FieldOrder(), U.FieldOrder()); if (dir < 3 || dir > 4) errorQuda("Unsupported laplace direction %d (must be 3 or 4)", dir); for (int i = 0; i < 4; i++) { diff --git a/include/lattice_field.h b/include/lattice_field.h index 04190db9f3..b92297eabc 100644 --- a/include/lattice_field.h +++ b/include/lattice_field.h @@ -34,9 +34,6 @@ namespace quda { class cudaEigVecSet; class GaugeField; - class cpuGaugeField; - class cudaGaugeField; - class CloverField; enum class QudaOffsetCopyMode { COLLECT, DISPERSE }; @@ -71,11 +68,14 @@ namespace quda { /** Array storing the length of dimension */ lat_dim_t x = {}; + /** Padding to be added to the checker-boarded volume (only for native field ordering) */ int pad = 0; + /** Whether the field is full or single parity */ QudaSiteSubset siteSubset = QUDA_INVALID_SITE_SUBSET; - QudaMemoryType mem_type = QUDA_MEMORY_DEVICE; + /** The type of memory allocation to use for the field */ + QudaMemoryType mem_type = QUDA_MEMORY_INVALID; /** The type of ghost exchange to be done with this field */ QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_PAD; @@ -108,7 +108,7 @@ namespace quda { nDim(nDim), pad(pad), siteSubset(QUDA_FULL_SITE_SUBSET), - mem_type(QUDA_MEMORY_DEVICE), + mem_type(location == QUDA_CUDA_FIELD_LOCATION ? QUDA_MEMORY_DEVICE : QUDA_MEMORY_HOST), ghostExchange(ghostExchange), scale(1.0) { @@ -126,14 +126,14 @@ namespace quda { @param[in] param Contains the metadata for filling out the LatticeFieldParam */ LatticeFieldParam(const QudaGaugeParam ¶m) : - location(QUDA_CPU_FIELD_LOCATION), + location(param.location), precision(param.cpu_prec), ghost_precision(param.cpu_prec), init(true), nDim(4), pad(0), siteSubset(QUDA_FULL_SITE_SUBSET), - mem_type(QUDA_MEMORY_DEVICE), + mem_type(QUDA_MEMORY_HOST), ghostExchange(QUDA_GHOST_EXCHANGE_NO), scale(param.scale) { @@ -144,15 +144,18 @@ namespace quda { } /** - @brief Contructor for creating LatticeFieldParam from a LatticeField + @brief Constructor for creating LatticeFieldParam from a LatticeField */ LatticeFieldParam(const LatticeField &field); }; std::ostream& operator<<(std::ostream& output, const LatticeFieldParam& param); + std::ostream &operator<<(std::ostream &output, const LatticeField &field); class LatticeField : public Object { + friend std::ostream &operator<<(std::ostream &output, const LatticeField ¶m); + /** @brief Create the field as specified by the param @param[in] Parameter struct @@ -178,9 +181,13 @@ namespace quda { /** Checkerboarded local volume */ size_t localVolumeCB = 0; + /** Stride used for native field ordering (stride = volumeCB + pad) */ size_t stride = 0; + + /** Padding to be added to the checker-boarded volume (only for native field ordering) */ int pad = 0; + /** Total size of the allocation */ size_t total_bytes = 0; /** Number of field dimensions */ @@ -463,9 +470,7 @@ namespace quda { } } - mutable char *backup_h = nullptr; - mutable char *backup_norm_h = nullptr; - mutable bool backed_up = false; + mutable std::vector backup_h = {}; public: /** diff --git a/include/llfat_quda.h b/include/llfat_quda.h index 696c67d3f8..0bf9f5b249 100644 --- a/include/llfat_quda.h +++ b/include/llfat_quda.h @@ -11,7 +11,7 @@ namespace quda { @param u[in] The input gauge field @param coeff[in] Array of path coefficients */ - void fatKSLink(GaugeField *fat, const GaugeField &u, const double *coeff); + void fatKSLink(GaugeField &fat, const GaugeField &u, const double *coeff); /** @brief Compute the long links for an improved staggered (Kogut-Susskind) fermions. @@ -19,6 +19,6 @@ namespace quda { @param u[in] The input gauge field @param coeff[in] Array of path coefficients */ - void longKSLink(GaugeField *lng, const GaugeField &u, const double *coeff); + void longKSLink(GaugeField &lng, const GaugeField &u, const double *coeff); } // namespace quda diff --git a/include/malloc_quda.h b/include/malloc_quda.h index 8df59bbf56..05a36fcd77 100644 --- a/include/malloc_quda.h +++ b/include/malloc_quda.h @@ -114,6 +114,9 @@ namespace quda { #define register_pinned(ptr, bytes) quda::register_pinned_(__func__, quda::file_name(__FILE__), __LINE__, ptr, bytes) #define unregister_pinned(size) quda::unregister_pinned_(__func__, quda::file_name(__FILE__), __LINE__, ptr) +#define quda_malloc(size) quda::quda_malloc_(__func__, quda::file_name(__FILE__), __LINE__, size) +#define quda_free(ptr) quda::quda_free_(__func__, quda::file_name(__FILE__), __LINE__, ptr) + namespace quda { namespace pool { diff --git a/include/multi_blas_helper.cuh b/include/multi_blas_helper.cuh index 6a470fe576..78aaa1ac4b 100644 --- a/include/multi_blas_helper.cuh +++ b/include/multi_blas_helper.cuh @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include diff --git a/include/multigrid.h b/include/multigrid.h index 2d50bcec97..e5981baac2 100644 --- a/include/multigrid.h +++ b/include/multigrid.h @@ -397,9 +397,9 @@ namespace quda { @brief This method only resets the KD operators with the updated fine links and rebuilds the KD inverse */ - void resetStaggeredKD(cudaGaugeField *gauge_in, cudaGaugeField *fat_gauge_in, cudaGaugeField *long_gauge_in, - cudaGaugeField *gauge_sloppy_in, cudaGaugeField *fat_gauge_sloppy_in, - cudaGaugeField *long_gauge_sloppy_in, double mass); + void resetStaggeredKD(GaugeField *gauge_in, GaugeField *fat_gauge_in, GaugeField *long_gauge_in, + GaugeField *gauge_sloppy_in, GaugeField *fat_gauge_sloppy_in, + GaugeField *long_gauge_sloppy_in, double mass); /** @brief Dump the null-space vectors to disk. Will recurse dumping all levels. @@ -486,11 +486,6 @@ namespace quda { */ void buildFreeVectors(std::vector &B); - /** - @brief Return the total flops done on this and all coarser levels. - */ - double flops() const; - /** @brief Return if we're on a fine grid right now */ @@ -634,13 +629,13 @@ namespace quda { operator we are constructing the coarse grid operator from. For staggered, should always be QUDA_MATPC_INVALID. */ - void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge, - const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation, + void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge, + const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation, QudaDiracType dirac, QudaMatPCType matpc); template - void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const cudaGaugeField &gauge, - const cudaGaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation, + void StaggeredCoarseOp(GaugeField &Y, GaugeField &X, const Transfer &T, const GaugeField &gauge, + const GaugeField &longGauge, const GaugeField &XinvKD, double mass, bool allow_truncation, QudaDiracType dirac, QudaMatPCType matpc); /** diff --git a/include/quda.h b/include/quda.h index 1d97e6c737..1f8e642b98 100644 --- a/include/quda.h +++ b/include/quda.h @@ -62,7 +62,7 @@ extern "C" { QudaGaugeFixed gauge_fix; /**< Whether the input gauge field is in the axial gauge or not */ - int ga_pad; /**< The pad size that the cudaGaugeField will use (default=0) */ + int ga_pad; /**< The pad size that native GaugeFields will use (default=0) */ int site_ga_pad; /**< Used by link fattening and the gauge and fermion forces */ @@ -1497,7 +1497,7 @@ extern "C" { void saveGaugeFieldQuda(void* outGauge, void* inGauge, QudaGaugeParam* param); /** - * Reinterpret gauge as a pointer to cudaGaugeField and call destructor. + * Reinterpret gauge as a pointer to a GaugeField and call destructor. * * @param gauge Gauge field to be freed */ @@ -1698,12 +1698,10 @@ extern "C" { * @param[in] reunit_interval, reunitarize gauge field when iteration count is a multiple of this * @param[in] stopWtheta, 0 for MILC criterion and 1 to use the theta value * @param[in] param The parameters of the external fields and the computation settings - * @param[out] timeinfo */ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double relax_boost, const double tolerance, - const unsigned int reunit_interval, const unsigned int stopWtheta, - QudaGaugeParam *param, double *timeinfo); + const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param); /** * @brief Gauge fixing with Steepest descent method with FFTs with support for single GPU only. @@ -1717,12 +1715,10 @@ extern "C" { * iteration reachs the maximum number of steps defined by Nsteps * @param[in] stopWtheta, 0 for MILC criterion and 1 to use the theta value * @param[in] param The parameters of the external fields and the computation settings - * @param[out] timeinfo */ int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, const unsigned int verbose_interval, const double alpha, const unsigned int autotune, - const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param, - double *timeinfo); + const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param); /** * @brief Strided Batched GEMM @@ -1779,9 +1775,11 @@ extern "C" { int delete_2link; /** Set if the input spinor is on a time slice **/ int t0; + /** Time taken for the smearing operations **/ + double secs; /** Flops count for the smearing operations **/ - int gflops; - + double gflops; + } QudaQuarkSmearParam; /** diff --git a/include/quda_api.h b/include/quda_api.h index 45c226ba19..e1ec69bbe1 100644 --- a/include/quda_api.h +++ b/include/quda_api.h @@ -3,6 +3,7 @@ #include #include #include +#include /** @file quda_api.h @@ -22,7 +23,7 @@ enum qudaMemcpyKind { namespace quda { - class TuneParam; + struct TuneParam; struct qudaStream_t { int idx; @@ -42,6 +43,16 @@ namespace quda void qudaMemcpy_(void *dst, const void *src, size_t count, qudaMemcpyKind kind, const char *func, const char *file, const char *line); + /** + @brief Wrapper around cudaMemcpy or driver API equivalent + @param[out] dst Destination pointer + @param[in] src Source pointer + @param[in] count Size of transfer + @param[in] kind Type of memory copy + */ + void qudaMemcpy_(const quda_ptr &dst, const quda_ptr &src, size_t count, qudaMemcpyKind kind, const char *func, + const char *file, const char *line); + /** @brief Wrapper around cudaMemcpyAsync or driver API equivalent @param[out] dst Destination pointer @@ -63,6 +74,14 @@ namespace quda void qudaMemcpyP2PAsync_(void *dst, const void *src, size_t count, const qudaStream_t &stream, const char *func, const char *file, const char *line); + /** + @brief Heterogenous memset function + @param[out] ptr Heterogeneous pointer + @param[in] value Value to set for each byte of specified memory + @param[in] count Size in bytes to set + */ + void qudaMemset_(quda_ptr &ptr, int value, size_t count, const char *func, const char *file, const char *line); + /** @brief Wrapper around cudaMemset or driver API equivalent @param[out] ptr Starting address pointer @@ -72,15 +91,14 @@ namespace quda void qudaMemset_(void *ptr, int value, size_t count, const char *func, const char *file, const char *line); /** - @brief Wrapper around cudaMemset2D or driver API equivalent + @brief Wrapper around cudaMemsetAsync or driver API equivalent @param[out] ptr Starting address pointer - @param[in] Pitch in bytes @param[in] value Value to set for each byte of specified memory - @param[in] width Width in bytes - @param[in] height Height in bytes + @param[in] count Size in bytes to set + @param[in] stream Stream to issue memset */ - void qudaMemset2D_(void *ptr, size_t pitch, int value, size_t width, size_t height, const char *func, - const char *file, const char *line); + void qudaMemsetAsync_(void *ptr, int value, size_t count, const qudaStream_t &stream, const char *func, + const char *file, const char *line); /** @brief Wrapper around cudaMemsetAsync or driver API equivalent @@ -89,20 +107,21 @@ namespace quda @param[in] count Size in bytes to set @param[in] stream Stream to issue memset */ - void qudaMemsetAsync_(void *ptr, int value, size_t count, const qudaStream_t &stream, const char *func, + void qudaMemsetAsync_(quda_ptr &ptr, int value, size_t count, const qudaStream_t &stream, const char *func, const char *file, const char *line); /** - @brief Wrapper around cudaMemsetAsync or driver API equivalent + @brief Asynchronous heterogenous memset2d function @param[out] ptr Starting address pointer + @param[in] Initial offset from pointer @param[in] Pitch in bytes @param[in] value Value to set for each byte of specified memory @param[in] width Width in bytes @param[in] height Height in bytes @param[in] stream Stream to issue memset */ - void qudaMemset2DAsync_(void *ptr, size_t pitch, int value, size_t width, size_t height, const qudaStream_t &stream, - const char *func, const char *file, const char *line); + void qudaMemset2DAsync_(quda_ptr &ptr, size_t offset, size_t pitch, int value, size_t width, size_t height, + const qudaStream_t &stream, const char *func, const char *file, const char *line); /** @brief Wrapper around cudaMemPrefetchAsync or driver API equivalent @@ -224,14 +243,11 @@ namespace quda #define qudaMemset(ptr, value, count) \ ::quda::qudaMemset_(ptr, value, count, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__)) -#define qudaMemset2D(ptr, pitch, value, width, height) \ - ::quda::qudaMemset2D_(ptr, pitch, value, width, height, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__)) - #define qudaMemsetAsync(ptr, value, count, stream) \ ::quda::qudaMemsetAsync_(ptr, value, count, stream, __func__, quda::file_name(__FILE__), __STRINGIFY__(__LINE__)) -#define qudaMemset2DAsync(ptr, pitch, value, width, height, stream) \ - ::quda::qudaMemset2DAsync_(ptr, pitch, value, width, height, stream, __func__, quda::file_name(__FILE__), \ +#define qudaMemset2DAsync(ptr, offset, pitch, value, width, height, stream) \ + ::quda::qudaMemset2DAsync_(ptr, offset, pitch, value, width, height, stream, __func__, quda::file_name(__FILE__), \ __STRINGIFY__(__LINE__)) #define qudaMemPrefetchAsync(ptr, count, mem_space, stream) \ diff --git a/include/quda_arch.h b/include/quda_arch.h index ed88fb0b8e..45a8ed34e4 100644 --- a/include/quda_arch.h +++ b/include/quda_arch.h @@ -14,5 +14,8 @@ #elif defined(QUDA_TARGET_SYCL) #include +#endif +#ifdef QUDA_OPENMP +#include #endif diff --git a/include/quda_internal.h b/include/quda_internal.h index 756d5822e0..dd8a6c8177 100644 --- a/include/quda_internal.h +++ b/include/quda_internal.h @@ -49,6 +49,7 @@ #include #include #include +#include "timer.h" namespace quda { diff --git a/include/quda_milc_interface.h b/include/quda_milc_interface.h index 23b8bccc87..c3207cfaa2 100644 --- a/include/quda_milc_interface.h +++ b/include/quda_milc_interface.h @@ -1026,7 +1026,7 @@ extern "C" { void* inGauge); /** - * Reinterpret gauge as a pointer to cudaGaugeField and call destructor. + * Reinterpret gauge as a pointer to a GaugeField and call destructor. * * @param gauge Gauge field to be freed */ diff --git a/include/quda_ptr.h b/include/quda_ptr.h new file mode 100644 index 0000000000..aab76f6b89 --- /dev/null +++ b/include/quda_ptr.h @@ -0,0 +1,106 @@ +#pragma once + +#include +#include "malloc_quda.h" + +namespace quda +{ + + /** + Object that stores a memory allocation with different views for + host or device. Depending on the nature of the underlying memory + type, both views may not be defined + + type defined views + QUDA_MEMORY_DEVICE device only + QUDA_MEMORY_DEVICE_PINNED device only + QUDA_MEMORY_HOST host only + QUDA_MEMORY_HOST_PINNED both + QUDA_MEMORY_MAPPED both (pinned to host) + QUDA_MEMORY_MANAGED both + */ + class quda_ptr + { + friend std::ostream &operator<<(std::ostream &output, const quda_ptr &ptr); + QudaMemoryType type = QUDA_MEMORY_INVALID; /** Memory type of the allocation */ + size_t size = 0; /** Size of the allocation */ + bool pool = false; /** Is the allocation is pooled */ + void *device = nullptr; /** Device-view of the allocation */ + void *host = nullptr; /** Host-view of the allocation */ + bool reference = false; /** Is this a reference to another allocation */ + + /** + @brief Internal deallocation routine + */ + void destroy(); + + public: + quda_ptr() = default; + quda_ptr(quda_ptr &&) = default; + quda_ptr &operator=(quda_ptr &&); + quda_ptr(const quda_ptr &) = delete; + quda_ptr &operator=(const quda_ptr &) = delete; + + /** + @brief Constructor for quda_ptr + @param[in] type The memory type of the allocation + @param[in] size The size of the allocation + @param[in] pool Whether the allocation should be in the memory pool (default is true) + */ + quda_ptr(QudaMemoryType type, size_t size, bool pool = true); + + /** + @brief Constructor for quda_ptr where we are wrapping a non-owned pointer + @param[in] ptr Raw base pointer + @param[in] type The memory type of the allocation + */ + quda_ptr(void *ptr, QudaMemoryType type); + + /** + @brief Destructor for the quda_ptr + */ + virtual ~quda_ptr(); + + /** + @brief Specialized exchange function to use in place of + std::exchange when exchanging quda_ptr objects: moves obj to + *this, and moves new_value to obj + @param[in,out] obj + @param[in] new_value New value for obj to take + */ + void exchange(quda_ptr &obj, quda_ptr &&new_value); + + /** + @return Returns true if allocation is visible to the device + */ + bool is_device() const; + + /** + @return Returns true if allocation is visible to the host + */ + bool is_host() const; + + /** + Return view of the pointer. For mapped memory we return the device view. + */ + void *data() const; + + /** + Return the device view of the pointer + */ + void *data_device() const; + + /** + Return the host view of the pointer + */ + void *data_host() const; + + /** + Return if the instance is a reference rather than an allocation + */ + bool is_reference() const; + }; + + std::ostream &operator<<(std::ostream &output, const quda_ptr &ptr); + +} // namespace quda diff --git a/include/reference_wrapper_helper.h b/include/reference_wrapper_helper.h index 2b85c497fd..3f73709ca6 100644 --- a/include/reference_wrapper_helper.h +++ b/include/reference_wrapper_helper.h @@ -1,6 +1,8 @@ #pragma once +#include #include +#include #include #include #include diff --git a/include/staggered_kd_build_xinv.h b/include/staggered_kd_build_xinv.h index fdf57eccf8..2bd1b4f600 100644 --- a/include/staggered_kd_build_xinv.h +++ b/include/staggered_kd_build_xinv.h @@ -14,7 +14,7 @@ namespace quda @param mass [in] Mass of the original staggered operator w/out factor of 2 convention @param dagger_approximation[in] Whether or not to use the dagger approximation, using the dagger of X instead of Xinv */ - void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const cudaGaugeField &gauge, const double mass, + void BuildStaggeredKahlerDiracInverse(GaugeField &Xinv, const GaugeField &gauge, const double mass, const bool dagger_approximation); /** @@ -34,7 +34,7 @@ namespace quda @param dagger_approximation[in] Whether or not to use the dagger approximation, using the dagger of X instead of Xinv @return constructed Xinv */ - std::shared_ptr AllocateAndBuildStaggeredKahlerDiracInverse(const cudaGaugeField &gauge, const double mass, + std::shared_ptr AllocateAndBuildStaggeredKahlerDiracInverse(const GaugeField &gauge, const double mass, const bool dagger_approximation); } // namespace quda diff --git a/include/targets/cuda/externals/jitify.hpp b/include/targets/cuda/externals/jitify.hpp index 46a51a97cd..110be5d22e 100644 --- a/include/targets/cuda/externals/jitify.hpp +++ b/include/targets/cuda/externals/jitify.hpp @@ -365,7 +365,7 @@ inline std::string path_base(std::string p) { // "foo/bar" -> "foo" // "foo/bar/" -> "foo/bar" #if defined _WIN32 || defined _WIN64 - char sep = '\\'; + const char* sep = "\\/"; #else char sep = '/'; #endif @@ -496,10 +496,13 @@ inline std::string comment_out_code_line(int line_num, std::string source) { inline void print_with_line_numbers(std::string const& source) { int linenum = 1; std::stringstream source_ss(source); + std::stringstream output_ss; + output_ss.imbue(std::locale::classic()); for (std::string line; std::getline(source_ss, line); ++linenum) { - std::cout << std::setfill(' ') << std::setw(3) << linenum << " " << line + output_ss << std::setfill(' ') << std::setw(3) << linenum << " " << line << std::endl; } + std::cout << output_ss.str(); } inline void print_compile_log(std::string program_name, @@ -554,7 +557,7 @@ inline bool load_source( std::string filename, std::map& sources, std::string current_dir = "", std::vector include_paths = std::vector(), - file_callback_type file_callback = 0, + file_callback_type file_callback = 0, std::string* program_name = nullptr, std::map* fullpaths = nullptr, bool search_current_dir = true) { std::istream* source_stream = 0; @@ -568,6 +571,9 @@ inline bool load_source( string_stream << source; source_stream = &string_stream; } + if (program_name) { + *program_name = filename; + } if (sources.count(filename)) { // Already got this one return true; @@ -672,6 +678,8 @@ inline bool load_source( // TODO: Handle block comments (currently they cause a compilation error). size_t comment_start = line_after_pragma.find("//"); std::string pragma_args = line_after_pragma.substr(0, comment_start); + // handle quote character used in #pragma expression + pragma_args = replace_token(pragma_args, "\"", "\\\""); std::string comment = comment_start != std::string::npos ? line_after_pragma.substr(comment_start) : ""; @@ -682,7 +690,7 @@ inline bool load_source( source += line + "\n"; } // HACK TESTING (WAR for cub) - // source = "#define cudaDeviceSynchronize() cudaSuccess\n" + source; + source = "#define cudaDeviceSynchronize() cudaSuccess\n" + source; ////source = "cudaError_t cudaDeviceSynchronize() { return cudaSuccess; }\n" + /// source; @@ -690,6 +698,7 @@ inline bool load_source( // of the same header from different paths. if (pragma_once) { std::stringstream ss; + ss.imbue(std::locale::classic()); ss << std::uppercase << std::hex << std::setw(8) << std::setfill('0') << hash; std::string include_guard_name = "_JITIFY_INCLUDE_GUARD_" + ss.str() + "\n"; @@ -1385,7 +1394,16 @@ static const char* jitsafe_header_preinclude_h = R"( // WAR to allow exceptions to be parsed #define try #define catch(...) -)"; +)" +#if defined(_WIN32) || defined(_WIN64) +// WAR for NVRTC <= 11.0 not defining _WIN64. +R"( +#ifndef _WIN64 +#define _WIN64 1 +#endif +)" +#endif +; static const char* jitsafe_header_float_h = R"( #pragma once @@ -1403,12 +1421,12 @@ static const char* jitsafe_header_float_h = R"( #define DBL_MAX_EXP 1024 #define FLT_MAX_10_EXP 38 #define DBL_MAX_10_EXP 308 -#define FLT_MAX 3.4028234e38f -#define DBL_MAX 1.7976931348623157e308 -#define FLT_EPSILON 1.19209289e-7f -#define DBL_EPSILON 2.220440492503130e-16 -#define FLT_MIN 1.1754943e-38f; -#define DBL_MIN 2.2250738585072013e-308 +#define FLT_MAX 3.4028234e38f +#define DBL_MAX 1.7976931348623157e308 +#define FLT_EPSILON 1.19209289e-7f +#define DBL_EPSILON 2.220440492503130e-16 +#define FLT_MIN 1.1754943e-38f +#define DBL_MIN 2.2250738585072013e-308 #define FLT_ROUNDS 1 #if defined __cplusplus && __cplusplus >= 201103L #define FLT_EVAL_METHOD 0 @@ -1596,14 +1614,28 @@ struct IntegerLimits { #endif // __cplusplus >= 201103L enum { is_specialized = true, - digits = (Digits == -1) ? (int)(sizeof(T)*8 - (Min != 0)) : Digits, - digits10 = (digits * 30103) / 100000, - is_signed = ((T)(-1)<0), - is_integer = true, - is_exact = true, - radix = 2, - is_bounded = true, - is_modulo = false + digits = (Digits == -1) ? (int)(sizeof(T)*8 - (Min != 0)) : Digits, + digits10 = (digits * 30103) / 100000, + is_signed = ((T)(-1)<0), + is_integer = true, + is_exact = true, + has_infinity = false, + has_quiet_NaN = false, + has_signaling_NaN = false, + has_denorm = 0, + has_denorm_loss = false, + round_style = 0, + is_iec559 = false, + is_bounded = true, + is_modulo = !(is_signed || Max == 1 /*is bool*/), + max_digits10 = 0, + radix = 2, + min_exponent = 0, + min_exponent10 = 0, + max_exponent = 0, + max_exponent10 = 0, + tinyness_before = false, + traps = false }; }; } // namespace __jitify_detail @@ -1910,6 +1942,46 @@ static const char* jitsafe_header_type_traits = R"( template struct aligned_storage { struct type { alignas(alignment) char data[len]; }; }; template struct alignment_of : std::integral_constant {}; + template struct make_unsigned; + template <> struct make_unsigned { typedef unsigned char type; }; + template <> struct make_unsigned { typedef unsigned short type; }; + template <> struct make_unsigned { typedef unsigned int type; }; + template <> struct make_unsigned { typedef unsigned long type; }; + template <> struct make_unsigned { typedef unsigned long long type; }; + template <> struct make_unsigned { typedef unsigned char type; }; + template <> struct make_unsigned { typedef unsigned short type; }; + template <> struct make_unsigned { typedef unsigned int type; }; + template <> struct make_unsigned { typedef unsigned long type; }; + template <> struct make_unsigned { typedef unsigned long long type; }; + template <> struct make_unsigned { typedef unsigned char type; }; + #if defined _WIN32 || defined _WIN64 + template <> struct make_unsigned { typedef unsigned short type; }; + #else + template <> struct make_unsigned { typedef unsigned int type; }; + #endif + + template struct make_signed; + template <> struct make_signed { typedef signed char type; }; + template <> struct make_signed { typedef signed short type; }; + template <> struct make_signed { typedef signed int type; }; + template <> struct make_signed { typedef signed long type; }; + template <> struct make_signed { typedef signed long long type; }; + template <> struct make_signed { typedef signed char type; }; + template <> struct make_signed { typedef signed short type; }; + template <> struct make_signed { typedef signed int type; }; + template <> struct make_signed { typedef signed long type; }; + template <> struct make_signed { typedef signed long long type; }; + template <> struct make_signed { typedef signed char type; }; + #if defined _WIN32 || defined _WIN64 + template <> struct make_signed { typedef signed short type; }; + #else + template <> struct make_signed { typedef signed int type; }; + #endif + + #if __cplusplus >= 201703L + template< typename... Ts > struct make_void { typedef void type; }; + template< typename... Ts > using void_t = typename make_void::type; + #endif // __cplusplus >= 201703L } // namespace std #endif // c++11 )"; @@ -1949,8 +2021,8 @@ static const char* jitsafe_header_stdint_h = "#define INT8_MIN SCHAR_MIN\n" "#define INT16_MIN SHRT_MIN\n" "#if defined _WIN32 || defined _WIN64\n" - "#define WCHAR_MIN SHRT_MIN\n" - "#define WCHAR_MAX SHRT_MAX\n" + "#define WCHAR_MIN 0\n" + "#define WCHAR_MAX USHRT_MAX\n" "typedef unsigned long long uintptr_t; //optional\n" "#else\n" "#define WCHAR_MIN INT_MIN\n" @@ -2083,24 +2155,33 @@ static const char* jitsafe_header_sstream = "#include \n" "#include \n"; -static const char* jitsafe_header_utility = - "#pragma once\n" - "namespace std {\n" - "template\n" - "struct pair {\n" - " T1 first;\n" - " T2 second;\n" - " inline pair() {}\n" - " inline pair(T1 const& first_, T2 const& second_)\n" - " : first(first_), second(second_) {}\n" - " // TODO: Standard includes many more constructors...\n" - " // TODO: Comparison operators\n" - "};\n" - "template\n" - "pair make_pair(T1 const& first, T2 const& second) {\n" - " return pair(first, second);\n" - "}\n" - "} // namespace std\n"; +static const char* jitsafe_header_utility = R"( + #pragma once + namespace std { + template + struct pair { + T1 first; + T2 second; + inline pair() {} + inline pair(T1 const& first_, T2 const& second_): first(first_), second(second_) {} + // TODO: Standard includes many more constructors... + // TODO: Comparison operators + }; + template + pair make_pair(T1 const& first, T2 const& second) { + return pair(first, second); + } + + template + constexpr bool always_false = false; + + template + typename std::add_rvalue_reference::type declval() noexcept + { + static_assert(always_false, "declval not allowed in an evaluated context"); + } + } // namespace std +)"; // TODO: incomplete static const char* jitsafe_header_vector = @@ -2340,14 +2421,81 @@ static const char* jitsafe_header_tuple = R"( #if __cplusplus >= 201103L namespace std { template class tuple; + + template< size_t I, class T > + struct tuple_element; + // recursive case + template< size_t I, class Head, class... Tail > + struct tuple_element> + : tuple_element> { }; + // base case + template< class Head, class... Tail > + struct tuple_element<0, tuple> { + using type = Head; + }; } // namespace std #endif )"; +static const char* jitsafe_header_functional = R"( + #pragma once + #if __cplusplus >= 201103L + namespace std { + template + class reference_wrapper + { + public: + // types + using type = T; + reference_wrapper(const reference_wrapper&) noexcept = default; + // assignment + reference_wrapper& operator=(const reference_wrapper& x) noexcept = default; + // access + constexpr operator T& () const noexcept { return *_ptr; } + constexpr T& get() const noexcept { return *_ptr; } + private: + T* _ptr; + }; + } // namespace std + #endif +)"; + +static const char* jitsafe_header_map = R"( + #pragma once + namespace std { + template class map {}; + } // namespace std +)"; + +static const char* jitsafe_header_stack = R"( + #pragma once + namespace std { + template class stack {}; + } // namespace std +)"; + +static const char* jitsafe_header_initializer_list = R"( + #pragma once +)"; + static const char* jitsafe_header_assert = R"( #pragma once )"; +static const char* jitsafe_header_sys_time = R"( + #pragma once + struct timeval { + unsigned long long tv_sec; + unsigned long long tv_usec; + }; + struct timeval it_interval; + struct timeval it_value; + int getitimer(int, struct itimerval *); + int gettimeofday(struct timeval *, void *); + int setitimer(int, const struct itimerval *, struct itimerval *); + int utimes(const char *, const struct timeval [2]); + )"; + // WAR: These need to be pre-included as a workaround for NVRTC implicitly using // /usr/include as an include path. The other built-in headers will be included // lazily as needed. @@ -2406,8 +2554,13 @@ static const std::map& get_jitsafe_headers_map() { {"time.h", jitsafe_header_time_h}, {"ctime", jitsafe_header_time_h}, {"tuple", jitsafe_header_tuple}, + {"functional", jitsafe_header_functional}, + {"map", jitsafe_header_map}, + {"stack", jitsafe_header_stack}, + {"initializer_list", jitsafe_header_initializer_list}, {"assert.h", jitsafe_header_assert}, - {"cassert", jitsafe_header_assert}}; + {"cassert", jitsafe_header_assert}, + {"sys/time.h", jitsafe_header_sys_time}}; return jitsafe_headers_map; } @@ -2673,6 +2826,17 @@ inline nvrtcResult compile_kernel(std::string program_name, &nvrtc_program, program_source.c_str(), program_name.c_str(), num_headers, header_sources_c.data(), header_names_c.data())); + // Ensure nvrtc_program gets destroyed. + struct ScopedNvrtcProgramDestroyer { + nvrtcProgram& nvrtc_program_; + ScopedNvrtcProgramDestroyer(nvrtcProgram& nvrtc_program) + : nvrtc_program_(nvrtc_program) {} + ~ScopedNvrtcProgramDestroyer() { nvrtcDestroyProgram(&nvrtc_program_); } + ScopedNvrtcProgramDestroyer(const ScopedNvrtcProgramDestroyer&) = delete; + ScopedNvrtcProgramDestroyer& operator=(const ScopedNvrtcProgramDestroyer&) = + delete; + } nvrtc_program_scope_guard{nvrtc_program}; + #if CUDA_VERSION >= 8000 if (!instantiation.empty()) { CHECK_NVRTC(nvrtcAddNameExpression(nvrtc_program, instantiation.c_str())); @@ -2720,7 +2884,6 @@ inline nvrtcResult compile_kernel(std::string program_name, #endif } - CHECK_NVRTC(nvrtcDestroyProgram(&nvrtc_program)); #undef CHECK_NVRTC return NVRTC_SUCCESS; } @@ -2746,10 +2909,9 @@ inline void load_program(std::string const& cuda_source, // Load program source if (!detail::load_source(cuda_source, *program_sources, "", *include_paths, - file_callback)) { + file_callback, program_name)) { throw std::runtime_error("Source not found: " + cuda_source); } - *program_name = program_sources->begin()->first; // Maps header include names to their full file paths. std::map header_fullpaths; @@ -2757,7 +2919,7 @@ inline void load_program(std::string const& cuda_source, // Load header sources for (std::string const& header : headers) { if (!detail::load_source(header, *program_sources, "", *include_paths, - file_callback, &header_fullpaths)) { + file_callback, nullptr, &header_fullpaths)) { // **TODO: Deal with source not found throw std::runtime_error("Source not found: " + header); } @@ -2816,8 +2978,8 @@ inline void load_program(std::string const& cuda_source, std::string include_parent_fullpath = header_fullpaths[include_parent]; std::string include_path = detail::path_base(include_parent_fullpath); if (detail::load_source(include_name, *program_sources, include_path, - *include_paths, file_callback, &header_fullpaths, - is_included_with_quotes)) { + *include_paths, file_callback, nullptr, + &header_fullpaths, is_included_with_quotes)) { #if JITIFY_PRINT_HEADER_PATHS std::cout << "Found #include " << include_name << " from " << include_parent << ":" << line_num << " [" @@ -3067,6 +3229,7 @@ class KernelLauncher { std::unique_ptr _impl; public: + KernelLauncher() = default; inline KernelLauncher(KernelInstantiation const& kernel_inst, dim3 grid, dim3 block, unsigned int smem = 0, cudaStream_t stream = 0); @@ -3135,6 +3298,7 @@ class KernelInstantiation { std::unique_ptr _impl; public: + KernelInstantiation() = default; inline KernelInstantiation(Kernel const& kernel, std::vector const& template_args); @@ -3282,6 +3446,7 @@ class Kernel { std::unique_ptr _impl; public: + Kernel() = default; Kernel(Program const& program, std::string name, jitify::detail::vector options = 0); @@ -3346,6 +3511,7 @@ class Program { std::unique_ptr _impl; public: + Program() = default; Program(JitCache& cache, std::string source, jitify::detail::vector headers = 0, jitify::detail::vector options = 0, diff --git a/include/targets/hip/FFT_Plans.h b/include/targets/hip/FFT_Plans.h index d43415cd99..d4de34d368 100644 --- a/include/targets/hip/FFT_Plans.h +++ b/include/targets/hip/FFT_Plans.h @@ -2,7 +2,7 @@ #include #include -#include +#include #define FFT_FORWARD HIPFFT_FORWARD #define FFT_INVERSE HIPFFT_BACKWARD diff --git a/include/timer.h b/include/timer.h index 4c1557b7ce..0b69d5d466 100644 --- a/include/timer.h +++ b/include/timer.h @@ -2,10 +2,6 @@ #include -#ifdef INTERFACE_NVTX -#include "nvtx3/nvToolsExt.h" -#endif - #include #include #include @@ -65,15 +61,16 @@ namespace quda { } } + int ref_count = 0; + /** @brief Start the timer */ - void start(const char *func = nullptr, const char *file = nullptr, int line = 0) + void start(const char * = nullptr, const char * = nullptr, int = 0) { - if (running) { - printfQuda("ERROR: Cannot start an already running timer (%s:%d in %s())", file ? file : "", line, - func ? func : ""); - errorQuda("Aborting"); + if (running) { // if the timer has already started, we increment the ref counter and return + ref_count++; + return; } if (!device) { gettimeofday(&host_start, NULL); @@ -110,6 +107,10 @@ namespace quda { */ void stop(const char *func = nullptr, const char *file = nullptr, int line = 0) { + if (ref_count > 0) { + ref_count--; + return; + } peek(func, file, line); time += last_interval; count++; @@ -186,70 +187,26 @@ namespace quda { QUDA_PROFILE_COUNT /**< The total number of timers we have. Must be last enum type. */ }; -#ifdef INTERFACE_NVTX - -#define PUSH_RANGE(name,cid) { \ - int color_id = cid; \ - color_id = color_id%nvtx_num_colors;\ - nvtxEventAttributes_t eventAttrib = {}; \ - eventAttrib.version = NVTX_VERSION; \ - eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ - eventAttrib.colorType = NVTX_COLOR_ARGB; \ - eventAttrib.color = nvtx_colors[color_id]; \ - eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ - eventAttrib.message.ascii = name; \ - eventAttrib.category = cid;\ - nvtxRangePushEx(&eventAttrib); \ -} -#define POP_RANGE nvtxRangePop(); -#else -#define PUSH_RANGE(name,cid) -#define POP_RANGE -#endif - class TimeProfile { std::string fname; /**< Which function are we profiling */ #ifdef INTERFACE_NVTX static const uint32_t nvtx_colors[];// = { 0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff }; static const int nvtx_num_colors;// = sizeof(nvtx_colors)/sizeof(uint32_t); #endif - host_timer_t profile[QUDA_PROFILE_COUNT]; + array profile; static std::string pname[]; bool switchOff; bool use_global; - // global timer - static host_timer_t global_profile[QUDA_PROFILE_COUNT]; - static bool global_switchOff[QUDA_PROFILE_COUNT]; - static int global_total_level[QUDA_PROFILE_COUNT]; // zero initialize - - static void StopGlobal(const char *func, const char *file, int line, QudaProfileType idx) { - - global_total_level[idx]--; - if (global_total_level[idx] == 0) global_profile[idx].stop(func, file, line); - - // switch off total timer if we need to - if (global_switchOff[idx]) { - global_total_level[idx]--; - if (global_total_level[idx] == 0) global_profile[idx].stop(func, file, line); - global_switchOff[idx] = false; - } - } - - static void StartGlobal(const char *func, const char *file, int line, QudaProfileType idx) { - // if total timer isn't running, then start it running - if (!global_profile[idx].running) { - global_profile[idx].start(func, file, line); - global_total_level[idx]++; - global_switchOff[idx] = true; - } - - if (global_total_level[idx] == 0) global_profile[idx].start(func, file, line); - global_total_level[idx]++; - } + static void StopGlobal(const char *func, const char *file, int line, QudaProfileType idx); + static void StartGlobal(const char *func, const char *file, int line, QudaProfileType idx); public: + TimeProfile() = default; + TimeProfile(const TimeProfile &) = default; + TimeProfile &operator=(const TimeProfile &) = default; + TimeProfile(std::string fname) : fname(fname), switchOff(false), use_global(true) { ; } TimeProfile(std::string fname, bool use_global) : fname(fname), switchOff(false), use_global(use_global) { ; } @@ -257,30 +214,8 @@ namespace quda { /**< Print out the profile information */ void Print(); - void Start_(const char *func, const char *file, int line, QudaProfileType idx) - { - // if total timer isn't running, then start it running - if (!profile[QUDA_PROFILE_TOTAL].running && idx != QUDA_PROFILE_TOTAL) { - profile[QUDA_PROFILE_TOTAL].start(func, file, line); - switchOff = true; - } - - profile[idx].start(func, file, line); - PUSH_RANGE(fname.c_str(),idx) - if (use_global) StartGlobal(func,file,line,idx); - } - - void Stop_(const char *func, const char *file, int line, QudaProfileType idx) { - profile[idx].stop(func, file, line); - POP_RANGE - - // switch off total timer if we need to - if (switchOff && idx != QUDA_PROFILE_TOTAL) { - profile[QUDA_PROFILE_TOTAL].stop(func, file, line); - switchOff = false; - } - if (use_global) StopGlobal(func,file,line,idx); - } + void Start_(const char *func, const char *file, int line, QudaProfileType idx); + void Stop_(const char *func, const char *file, int line, QudaProfileType idx); void Reset_(const char *func, const char *file, int line) { for (int idx = 0; idx < QUDA_PROFILE_COUNT; idx++) profile[idx].reset(func, file, line); @@ -294,12 +229,29 @@ namespace quda { }; - static TimeProfile dummy("dummy"); + /** + @brief Container that we use for pushing a profile onto the + profile stack. While this object is in scope it will exist on + the profile stack, and be popped when its destructor is called. + */ + struct pushProfile { + static inline double secs_dummy = 0; + static inline double gflops_dummy = 0; + TimeProfile &profile; + double &secs; + double &gflops; + uint64_t flops; + pushProfile(TimeProfile &profile, double &secs = secs_dummy, double &gflops = gflops_dummy); + virtual ~pushProfile(); + }; -} // namespace quda + /** + @brief Return a reference to the present profile at the top of + the stack + */ + TimeProfile &getProfile(); -#undef PUSH_RANGE -#undef POP_RANGE +} // namespace quda #define TPSTART(idx) Start_(__func__, __FILE__, __LINE__, idx) #define TPSTOP(idx) Stop_(__func__, __FILE__, __LINE__, idx) diff --git a/include/tune_quda.h b/include/tune_quda.h index 1511f6f881..9da6a82411 100644 --- a/include/tune_quda.h +++ b/include/tune_quda.h @@ -17,34 +17,26 @@ namespace quda { - class TuneParam { - - public: - dim3 block; + struct TuneParam { + dim3 block = {1, 1, 1}; dim3 grid; - unsigned int shared_bytes; - bool set_max_shared_bytes; // whether to opt in to max shared bytes per thread block - int4 aux; // free parameter that can be used as an arbitrary autotuning dimension outside of launch parameters + unsigned int shared_bytes = 0; + bool set_max_shared_bytes = false; // whether to opt in to max shared bytes per thread block + int4 aux = {1, 1, 1, 1}; // free parameter used as an arbitrary autotuning dimension std::string comment; - float time; - long long n_calls; + float time = FLT_MAX; + long long n_calls = 0; TuneParam(); TuneParam(const TuneParam &) = default; TuneParam(TuneParam &&) = default; TuneParam &operator=(const TuneParam &) = default; TuneParam &operator=(TuneParam &&) = default; - - friend std::ostream& operator<<(std::ostream& output, const TuneParam& param) { - output << "block=(" << param.block.x << "," << param.block.y << "," << param.block.z << "), "; - output << "grid=(" << param.grid.x << "," << param.grid.y << "," << param.grid.z << "), "; - output << "shared_bytes=" << param.shared_bytes; - output << ", aux=(" << param.aux.x << "," << param.aux.y << "," << param.aux.z << "," << param.aux.w << ")"; - return output; - } }; + std::ostream &operator<<(std::ostream &, const TuneParam &); + /** * @brief Returns a reference to the tunecache map * @return tunecache reference @@ -53,6 +45,10 @@ namespace quda { class Tunable { + friend TuneParam tuneLaunch(Tunable &, QudaTune, QudaVerbosity); + static inline uint64_t _flops_global = 0; + static inline uint64_t _bytes_global = 0; + protected: virtual long long flops() const { return 0; } virtual long long bytes() const { return 0; } @@ -68,20 +64,7 @@ namespace quda { virtual bool tuneGridDim() const { return true; } virtual bool tuneAuxDim() const { return false; } - virtual bool tuneSharedBytes() const - { - static bool tune_shared = true; - static bool init = false; - - if (!init) { - char *enable_shared_env = getenv("QUDA_ENABLE_TUNING_SHARED"); - if (enable_shared_env) { - if (strcmp(enable_shared_env, "0") == 0) { tune_shared = false; } - } - init = true; - } - return tune_shared; - } + virtual bool tuneSharedBytes() const; virtual bool advanceGridDim(TuneParam ¶m) const { @@ -239,16 +222,7 @@ namespace quda { @brief Whether the present instance has already been tuned or not @return True if tuned, false if not */ - bool tuned() - { - // not tuning is equivalent to already tuned - if (!getTuning()) return true; - - TuneKey key = tuneKey(); - if (use_managed_memory()) strcat(key.aux, ",managed"); - // if key is present in cache then already tuned - return getTuneCache().find(key) != getTuneCache().end(); - } + bool tuned() const; public: Tunable() : launch_error(QUDA_SUCCESS) { aux[0] = '\0'; } @@ -287,24 +261,9 @@ namespace quda { */ virtual float min_tune_time() const { return 1e-3; } - virtual std::string paramString(const TuneParam ¶m) const - { - std::stringstream ps; - ps << param; - return ps.str(); - } - - virtual std::string perfString(float time) const - { - float gflops = flops() / (1e9 * time); - float gbytes = bytes() / (1e9 * time); - std::stringstream ss; - ss << std::setiosflags(std::ios::fixed) << std::setprecision(2) << gflops << " Gflop/s, "; - ss << gbytes << " GB/s"; - return ss.str(); - } - - virtual std::string miscString(const TuneParam &) const { return std::string(); } + virtual std::string paramString(const TuneParam ¶m) const; + virtual std::string perfString(float time) const; + virtual std::string miscString(const TuneParam &) const; virtual void initTuneParam(TuneParam ¶m) const { @@ -385,6 +344,12 @@ namespace quda { qudaError_t launchError() const { return launch_error; } qudaError_t &launchError() { return launch_error; } + + static void flops_global(uint64_t value) { _flops_global = value; } + static uint64_t flops_global() { return _flops_global; } + + static void bytes_global(uint64_t value) { _bytes_global = value; } + static uint64_t bytes_global() { return _bytes_global; } }; /** diff --git a/include/util_quda.h b/include/util_quda.h index 533df01970..3d68fb5a2e 100644 --- a/include/util_quda.h +++ b/include/util_quda.h @@ -66,7 +66,7 @@ char *getPrintBuffer(); number of OMP threads for CPU functions recorded in the tune cache. @return Returns the string */ -char* getOmpThreadStr(); +const char *getOmpThreadStr(); void errorQuda_(const char *func, const char *file, int line, ...); diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 60c245b743..6c18ccae43 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -36,7 +36,7 @@ set (QUDA_OBJS field_cache.cpp gauge_covdev.cpp dirac.cpp clover_field.cpp lattice_field.cpp gauge_field.cpp - cpu_gauge_field.cpp cuda_gauge_field.cpp extract_gauge_ghost.cu + extract_gauge_ghost.cu gauge_norm.cu gauge_update_quda.cu max_clover.cu dirac_clover.cpp dirac_wilson.cpp dirac_staggered.cpp dirac_staggered_kd.cpp dirac_clover_hasenbusch_twist.cpp @@ -84,7 +84,7 @@ set (QUDA_OBJS clover_sigma_outer_product.cu momentum.cu gauge_qcharge.cu deflation.cpp checksum.cu transform_reduce.cu dslash5_mobius_eofa.cu - madwf_ml.cpp + madwf_ml.cpp quda_ptr.cpp instantiate.cpp version.cpp block_transpose.cu ) # cmake-format: on @@ -174,6 +174,7 @@ configure_file(extract_gauge_ghost.in.cu extract_gauge_ghost.cu @ONLY) configure_file(gauge_noise.in.cu gauge_noise.cu @ONLY) configure_file(gauge_norm.in.cu gauge_norm.cu @ONLY) configure_file(spinor_noise.in.cu spinor_noise.cu @ONLY) +configure_file(spinor_dilute.in.cu spinor_dilute.cu @ONLY) configure_file(copy_color_spinor_mg.in.hpp copy_color_spinor_mg.hpp @ONLY) configure_file(color_spinor_pack.in.cu color_spinor_pack.cu @ONLY) configure_file(color_spinor_util.in.cu color_spinor_util.cu @ONLY) @@ -469,6 +470,7 @@ endif() if(QUDA_OPENMP) target_link_libraries(quda PUBLIC OpenMP::OpenMP_CXX) + target_compile_definitions(quda PUBLIC QUDA_OPENMP) endif() # set which precisions to enable diff --git a/lib/blas_quda.cu b/lib/blas_quda.cu index 4c8719f309..f84f2eeb59 100644 --- a/lib/blas_quda.cu +++ b/lib/blas_quda.cu @@ -7,9 +7,6 @@ namespace quda { namespace blas { - unsigned long long flops; - unsigned long long bytes; - template